1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include <config.h> 40 #endif 41 42 #include <stdio.h> 43 #include <fcntl.h> 44 #include <unistd.h> 45 #include <deque> 46 47 #ifdef HAVE_SYS_MMAN_H 48 #include <sys/mman.h> 49 #endif 50 51 #include "pytrie.h" 52 53 const unsigned char CPinyinTrie::SYLLABLE_BREAKER; 54 55 const CPinyinTrie::TNode* 56 CPinyinTrie::transfer(const TNode* pnode, const unsigned char* str) const 57 { 58 if (str) { 59 for (; *str != 0 && pnode != NULL; ++str) 60 pnode = transfer(pnode, *str); 61 } 62 return pnode; 63 } 64 65 const CPinyinTrie::TNode* 66 CPinyinTrie::transfer(const TNode* pnode, const TWCHAR* wstr) const 67 { 68 if (wstr) { 69 for (; *wstr != WCH_NULL && pnode != NULL; ++wstr) 70 pnode = transfer(pnode, unsigned(*wstr)); 71 } 72 return pnode; 73 } 74 75 const CPinyinTrie::TNode* 76 CPinyinTrie::transfer(const TNode* pnode, const unsigned char* str, int nlen) const 77 { 78 if (str && nlen > 0) { 79 for (; nlen>0 && *str != 0 && pnode != NULL; --nlen) 80 pnode = transfer(pnode, *str++); 81 } 82 return pnode; 83 } 84 85 const CPinyinTrie::TNode* 86 CPinyinTrie::transfer(const TNode* pnode, const TWCHAR* wstr, int nlen) const 87 { 88 if (wstr && nlen > 0) { 89 for (; nlen>0 && *wstr != WCH_NULL && pnode != NULL; --nlen) 90 pnode = transfer(pnode, unsigned(*wstr++)); 91 } 92 return pnode; 93 } 94 95 bool 96 CPinyinTrie::isValid(const TNode* pnode, bool allowNonComplete, bool allowGBK) 97 { 98 if ((pnode != NULL) && (allowGBK || pnode->m_bGBK == 0) && (pnode->transfer(SYLLABLE_BREAKER) != 0)) 99 return (allowNonComplete || (pnode->m_bFullSyllableTransfer == 1)); 100 return false; 101 } 102 103 unsigned int 104 CPinyinTrie::TNode::transfer(unsigned char c) const 105 { 106 unsigned int s = 0, t = m_nTransfer; 107 const TTransUnit* ptrans = getTrans(); 108 while (s < t) { 109 int m = (s+t)/2; 110 if (ptrans[m].m_Char == c) 111 return ptrans[m].m_Offset; 112 if (ptrans[m].m_Char < c) 113 s = m + 1; 114 else 115 t = m; 116 } 117 return 0; 118 } 119 120 int 121 CPinyinTrie::lengthAt(unsigned int idx) const 122 { 123 if (idx < getWordCount() -1 ) { 124 return (m_words[idx+1] - m_words[idx]) - 1; 125 } else if (idx == getWordCount() - 1) { 126 return (((TWCHAR*)(m_mem+m_Size))-m_words[idx])-1; 127 } 128 return 0; 129 } 130 131 void 132 CPinyinTrie::print(FILE *fp) const 133 { 134 std::string prefix; 135 print(getRootNode(), prefix, fp); 136 } 137 138 void 139 CPinyinTrie::print(const TNode* pRoot, std::string& prefix, FILE *fp) const 140 { 141 static char buf[1024]; 142 if (pRoot->m_nWordId > 0) { 143 fprintf(fp, "%s", prefix.c_str()); 144 if (pRoot->m_bGBK) 145 fprintf(fp, "(PureGBK)"); 146 unsigned int sz = pRoot->m_nWordId; 147 const TWordIdInfo *pwids = pRoot->getWordIdPtr(); 148 for (unsigned int i = 0; i < sz; ++i) { 149 unsigned int id = pwids[i].m_id; 150 const TWCHAR *pw = operator[](id); 151 int len = WCSLEN(pw); 152 if (len != lengthAt(id)) { 153 printf(" (lengthAt %d error) ", id); 154 } 155 WCSTOMBS(buf, pw, 1024); 156 fprintf(fp, " %s", buf); 157 if (pwids[i].m_bSeen == 0) 158 fprintf(fp, "[x]"); 159 else 160 fprintf(fp, "[o]"); 161 } 162 fprintf(fp, "\n"); 163 } 164 unsigned int sz = pRoot->m_nTransfer; 165 const TTransUnit* ptrans = pRoot->getTrans(); 166 for (unsigned int i = 0; i < sz; ++i) { 167 unsigned char c = (unsigned char)(ptrans[i].m_Char); 168 const TNode *pch = transfer(pRoot, c); 169 prefix += c; 170 print(pch, prefix, fp); 171 prefix.resize(prefix.size()-1); 172 } 173 } 174 175 unsigned int 176 CPinyinTrie::getSimbolId(const TWCHAR* wstr) 177 { 178 std::map<wstring, unsigned>::const_iterator it; 179 180 it = m_SimbolMap.find(wstring(wstr)); 181 if (it != m_SimbolMap.end()) 182 return it->second; 183 return 0; 184 } 185 186 unsigned int 187 CPinyinTrie::getSimbolId(const wstring & wstr) 188 { 189 std::map<wstring, unsigned>::const_iterator it; 190 191 it = m_SimbolMap.find(wstr); 192 if (it != m_SimbolMap.end()) 193 return it->second; 194 return 0; 195 } 196 197 void 198 CPinyinTrie::free(void) 199 { 200 if (m_mem) { 201 #ifdef HAVE_SYS_MMAN_H 202 m_mem -= sizeof(unsigned int); 203 m_Size += sizeof(unsigned int); 204 munmap (m_mem, m_Size); 205 #else 206 delete []m_mem; 207 #endif 208 m_mem = NULL; 209 } 210 if (m_words) { 211 delete []m_words; 212 m_words = NULL; 213 } 214 m_SimbolMap.clear(); 215 } 216 217 bool 218 CPinyinTrie::load(const char *fname) 219 { 220 free(); 221 222 bool suc = false; 223 int fd = open(fname, O_RDONLY); 224 if (fd == -1) return false; 225 226 suc = read (fd, &m_Size, sizeof(unsigned int)) > 0; 227 #ifdef HAVE_SYS_MMAN_H 228 suc = suc && ((m_mem = (char *) mmap (NULL, m_Size+sizeof(unsigned int), PROT_READ, MAP_SHARED, fd, 0)) 229 != MAP_FAILED); 230 m_mem += sizeof(unsigned int); 231 #else 232 suc = suc && ((m_mem = new char [m_Size]) != NULL); 233 suc = suc && (read (fd, m_mem, m_Size) > 0); 234 #endif 235 close(fd); 236 237 suc = suc && ((m_words = new TWCHAR*[getWordCount()]) != NULL); 238 239 if (suc) { 240 TWCHAR *p = (TWCHAR *)(m_mem + getStringOffset()); 241 for (int i=0, sz=getWordCount(); i < sz; ++i) { 242 m_words[i] = p; 243 while (*p++) 244 ; 245 } 246 for (unsigned i=1; i < 100; ++i) { 247 if (*m_words[i] != WCH_NULL && *m_words[i] != WCH_LESSTHAN) 248 m_SimbolMap[wstring(m_words[i])] = i; 249 } 250 } 251 return suc; 252 } 253