1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifndef _SIM_PYTRIE_H 39 #define _SIM_PYTRIE_H 40 41 #include <stdio.h> 42 43 #include "../portability.h" 44 45 #include <stdint.h> 46 #include <string> 47 #include <vector> 48 #include <map> 49 #include <set> 50 #include <list> 51 52 #define WORD_ID_WIDTH 18 53 54 class CPinyinTrie { 55 public: 56 static const unsigned char SYLLABLE_BREAKER = '\''; 57 friend class CPinyinTrieMaker; 58 59 class TTransUnit { 60 public: 61 #if !defined(WORDS_BIGENDIAN) 62 unsigned m_Char : 8; 63 unsigned m_Offset: 24; 64 #else 65 unsigned m_Offset: 24; 66 unsigned m_Char : 8; 67 #endif 68 }; 69 70 class TWordIdInfo { 71 public: 72 #if !defined(WORDS_BIGENDIAN) 73 unsigned m_id : WORD_ID_WIDTH; 74 unsigned m_bGBK : 1; 75 unsigned m_bGB18030 : 1; 76 unsigned m_len : 6; 77 unsigned m_cost : 5; 78 unsigned m_bSeen : 1; 79 #else 80 unsigned m_bSeen : 1; 81 unsigned m_cost : 5; 82 unsigned m_len : 6; 83 unsigned m_bGB18030 : 1; 84 unsigned m_bGBK : 1; 85 unsigned m_id : WORD_ID_WIDTH; 86 #endif 87 88 public: 89 TWordIdInfo() 90 { memset(this, 0, sizeof(TWordIdInfo)); } 91 92 TWordIdInfo(unsigned id, unsigned len=0, unsigned seen=0, unsigned cost = 0, unsigned gbk = 0, unsigned gb18030 = 0) 93 : m_id(id), m_bGBK(gbk), m_bGB18030(gb18030), m_len(len), m_cost(cost), m_bSeen(seen) { } 94 95 operator 96 unsigned int() const { return m_id; } 97 }; 98 99 class TNode { 100 public: 101 #if !defined(WORDS_BIGENDIAN) 102 unsigned m_nWordId : 16; 103 unsigned m_nTransfer : 8; 104 unsigned m_bGBK : 1; 105 unsigned m_bGB18030 : 1; 106 unsigned m_bFullSyllableTransfer: 1; 107 unsigned m_bOther : 5; 108 109 #else 110 unsigned m_bOther : 5; 111 unsigned m_bFullSyllableTransfer: 1; 112 unsigned m_bGB18030 : 1; 113 unsigned m_bGBK : 1; 114 unsigned m_nTransfer : 8; 115 unsigned m_nWordId : 16; 116 117 #endif 118 119 public: 120 static unsigned int 121 size_for(unsigned int nTransfer, unsigned int nWordId) 122 { return sizeof(TNode) + sizeof(TTransUnit)*nTransfer + 123 sizeof(TWordIdInfo)*nWordId; } 124 125 public: 126 TNode() { *((uint32_t*)this) = 0; } 127 128 bool 129 hasPinyinChild(void) const 130 { return (m_nTransfer > 1) || 131 (m_nTransfer ==1 && getTrans()->m_Char != SYLLABLE_BREAKER); } 132 133 const TTransUnit* 134 getTrans() const 135 { return (TTransUnit*)(this+1); } 136 137 const TWordIdInfo* 138 getWordIdPtr() const 139 { return (TWordIdInfo*)(((char*)(this+1))+sizeof(TTransUnit)*m_nTransfer); } 140 141 unsigned int 142 transfer(unsigned char c) const; 143 144 inline unsigned int 145 transfer(unsigned c) const 146 { return transfer((unsigned char)(c)); } 147 }; 148 149 public: 150 CPinyinTrie() : m_Size(0), m_mem(NULL), m_words(NULL) { } 151 152 ~CPinyinTrie() { free(); } 153 154 bool 155 load(const char* fileName); 156 157 bool 158 isValid(const TNode* pnode, bool allowNonComplete, bool allowGBK=true); 159 160 unsigned int 161 getRootOffset() const { return 3 * sizeof(unsigned int); } 162 163 const TNode* 164 getRootNode() const { return (TNode*)(m_mem+getRootOffset()); } 165 166 const TNode* 167 nodeFromOffset(unsigned int offset) const 168 { return (offset < getRootOffset())?NULL:((TNode*)(m_mem+offset)); } 169 170 //@{ 171 /** transfer on an char or a string from a specific node*/ 172 //inline const TNode* 173 //transfer(const TNode* pnode, char c) const 174 // { return transfer(pnode, (unsigned char)c); } 175 176 inline const TNode* 177 transfer(const TNode* pnode, unsigned char c) const 178 { return nodeFromOffset(pnode->transfer(c)); } 179 180 inline const TNode* 181 transfer(const TNode* pnode, TWCHAR wc) const 182 { return nodeFromOffset(pnode->transfer(unsigned(wc))); } 183 184 /* 185 inline const TNode* 186 transfer(const TNode* pnode, const char* str) const 187 { return transfer(pnode, (const unsigned char*)str); } 188 */ 189 190 const TNode* 191 transfer(const TNode* pnode, const unsigned char* str) const; 192 193 const TNode* 194 transfer(const TNode* pnode, const TWCHAR* wstr) const; 195 196 /* 197 inline const TNode* 198 transfer(const TNode* pnode, const char* str, int nlen) const 199 { return transfer(pnode, (const unsigned char*)str, nlen); } 200 */ 201 202 const TNode* 203 transfer(const TNode* pnode, const unsigned char* str, int nlen) const; 204 205 const TNode* 206 transfer(const TNode* pnode, const TWCHAR* wstr, int nlen) const; 207 //@} 208 209 //@{ 210 /** transfer on an char or a string from root node*/ 211 /* 212 inline const TNode* 213 transfer(const char* str) const 214 { return transfer(getRootNode(), str); } 215 */ 216 217 inline const TNode* 218 transfer(const unsigned char* str) const 219 { return transfer(getRootNode(), str); } 220 221 inline const TNode* 222 transfer(const TWCHAR* wstr) const 223 { return transfer(getRootNode(), wstr); } 224 225 /* 226 inline const TNode* 227 transfer(const char* str, int nlen) const 228 { return transfer(getRootNode(), str, nlen); } 229 */ 230 231 inline const TNode* 232 transfer(const unsigned char* str, int nlen) const 233 { return transfer(getRootNode(), str, nlen); } 234 235 inline const TNode* 236 transfer(const TWCHAR* wstr, int nlen) const 237 { return transfer(getRootNode(), wstr, nlen); } 238 //@} 239 240 unsigned int 241 getWordCount(void) const { return *(unsigned int*)m_mem; } 242 243 unsigned int 244 getNodeCount(void) const { return *(unsigned int*)(m_mem+sizeof(unsigned int)); } 245 246 unsigned int 247 getStringOffset(void) const { return *(unsigned int*)(m_mem+2*sizeof(unsigned int)); } 248 249 unsigned int 250 getSimbolId(const TWCHAR* wstr); 251 252 unsigned int 253 getSimbolId(const wstring & wstr); 254 255 const TWCHAR* 256 operator[](unsigned int idx) const { return m_words[idx]; } 257 258 int 259 lengthAt(unsigned int idx) const; 260 261 void 262 free(void); 263 264 void 265 print(FILE *fp) const; 266 267 protected: 268 unsigned int m_Size; 269 char *m_mem; 270 TWCHAR **m_words; 271 272 std::map<wstring, unsigned> m_SimbolMap; 273 274 protected: 275 void 276 print(const TNode* pRoot, std::string& prefix, FILE *fp) const; 277 }; 278 279 #endif 280