1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include "config.h" 40 #endif 41 42 #ifdef HAVE_ASSERT_H 43 #include <assert.h> 44 #endif 45 46 #include <algorithm> 47 48 #ifdef HAVE_ICONV_H 49 #include <iconv.h> 50 #endif 51 52 #include "pytrie_gen.h" 53 54 static const char* 55 skipSpace(const char* p) 56 { 57 while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r') 58 ++p; 59 return p; 60 } 61 62 static const char* 63 skipNonSpace(const char* p) 64 { 65 while (*p != '\0' && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r') 66 ++p; 67 return p; 68 } 69 70 static void 71 insertWordId(CPinyinTrieMaker::CWordSet& idset, CPinyinTrieMaker::TWordId id) 72 { 73 CPinyinTrieMaker::CWordSet::iterator it = idset.find(id); 74 if (it == idset.end()) 75 idset.insert(id); 76 else { 77 const CPinyinTrieMaker::TWordId& a = *it; 78 if ((a.anony.m_bHide && !id.anony.m_bHide) || (a.anony.m_bHide == id.anony.m_bHide && a.anony.m_cost > id.anony.m_cost)) { 79 idset.erase(it); 80 idset.insert(id); 81 } 82 } 83 } 84 85 struct TSyllableInfo { 86 std::string m_py; 87 int m_cost; 88 89 TSyllableInfo(const char* py=NULL, int cost=0) : m_py(py), m_cost(cost) {} 90 bool operator< (const TSyllableInfo& b) const { return m_py < b.m_py; } 91 }; 92 93 #ifdef HAVE_ICONV_H 94 bool isCorrectConverted(const char* utf8, iconv_t ic, iconv_t ric) 95 { 96 static char gbstr[256]; 97 static char utstr[256]; 98 99 TIConvSrcPtr src = (TIConvSrcPtr)utf8; 100 size_t srclen = strlen((char*)src)+1; 101 char* dst = (char *)gbstr; 102 size_t dstlen = 256; 103 size_t res = iconv(ic, &src, &srclen, &dst, &dstlen); 104 105 if (res != size_t(-1) && srclen == 0) { 106 // do revert convertion and compare them 107 src = (TIConvSrcPtr)gbstr; 108 srclen = strlen((char*)src)+1; 109 dst = (char *)utstr; 110 dstlen = 256; 111 res = iconv(ric, &src, &srclen, &dst, &dstlen); 112 if (res != size_t(-1) && srclen == 0) 113 return (strcmp(utf8, utstr) == 0); 114 } 115 return false; 116 } 117 118 //return: bit 0x1: contains some gbk out of gb2312, bit 0x2: contains some gb18030 outof gbk 119 unsigned getPureGBEncoding(const char* utf8str) 120 { 121 static iconv_t ic_gb = iconv_open("GB2312", "UTF-8"); 122 static iconv_t ic_gbk = iconv_open("GBK", "UTF-8"); 123 static iconv_t ric_gb = iconv_open("UTF-8", "GB2312"); 124 static iconv_t ric_gbk = iconv_open("UTF-8", "GBK"); 125 126 unsigned ret = 0; 127 128 if (!isCorrectConverted(utf8str, ic_gb, ric_gb)) { 129 ret = 1; // at least it is contains some GBK char 130 if (!isCorrectConverted(utf8str, ic_gbk, ric_gbk)) 131 ret = 3; //contains some GB18030-only char 132 133 #ifdef DEBUG 134 fprintf(stderr, "==> GB category of (%s) is (0x%x)\n ", utf8str, ret); 135 fflush(stderr); 136 #endif 137 } 138 return ret; 139 } 140 #else // !HAVE_ICONV_H 141 unsigned getPureGBEncoding(const char* utf8str) 142 { 143 // FIXME 144 return 0x3; 145 } 146 #endif // HAVE_ICONV_H 147 148 bool 149 parseLine(char* buf, char* word_buf, int& id, std::set<TSyllableInfo>& pyset) 150 { 151 pyset.clear(); 152 153 /* ignore the empty lines and comment lines */ 154 if (*buf == '\n' || *buf == '#') 155 return 0; 156 157 char* word_start = word_buf; 158 char* p = (char*)skipSpace(buf); 159 char* t = (char*)skipNonSpace(p); 160 while(p < t) *word_buf++ = *p++; 161 *word_buf = 0; 162 163 p = (char*)skipSpace(p); 164 t = (char*)skipNonSpace(p); 165 if (*t) 166 *t++ = 0; 167 id = atoi(p); 168 p = (char*)skipSpace(t); 169 while (*p) { 170 const char* s = p; 171 t = (char*)skipNonSpace(p); 172 if (*t) 173 *t++ = 0; 174 while ((*p >= 'a' && *p <= 'z') || (*p == CPinyinTrie::SYLLABLE_BREAKER)) 175 ++p; 176 if ((p > s) && ((*p == 0) || (*p == ':'))) { 177 int cost = 0; 178 if (*p == ':') { 179 *p++ = 0; 180 cost = atoi(p); 181 } 182 pyset.insert(TSyllableInfo(s, cost)); 183 } 184 p = (char*)skipSpace(t); 185 } 186 return pyset.size() > 0; 187 } 188 189 190 CPinyinTrieMaker::CPinyinTrieMaker() 191 : m_RootNode(), m_FullSyllables(), m_StateMap(), m_AllNodes() 192 { 193 m_AllNodes.push_back(&m_RootNode); 194 m_RootNode.m_bExpanded = true; 195 m_RootNode.m_PrimitiveNodes.insert(&m_RootNode); 196 m_StateMap[&(m_RootNode.m_PrimitiveNodes)] = &m_RootNode; 197 } 198 /********************************************************** 199 lexicon 200 TAB(1) 201 word id 202 ' 203 4095; 204 **********************************************************/ 205 #define RARE_MULTI_PHONETIC_STARTING_ID 140000 /* FIXME */ 206 bool 207 CPinyinTrieMaker::constructFromLexicon(const char* fileName) 208 { 209 static int rmp_id = RARE_MULTI_PHONETIC_STARTING_ID; 210 static char buf[4096]; 211 static char word_buf[2048]; 212 213 int id; 214 bool suc = true; 215 std::set<TSyllableInfo> pyset; 216 FILE *fp = fopen(fileName, "r"); 217 printf("Adding pinyin and corresponding words..."); fflush(stdout); 218 while (fgets(buf, 4096, fp) != NULL) { 219 if (!parseLine(buf, word_buf, id, pyset)) { 220 if (word_buf[0] != L'<' && word_buf[0] != 0) { 221 if (m_Lexicon.size() < id+1) m_Lexicon.resize(id+1); 222 m_Lexicon[id] = std::string(word_buf); 223 } 224 continue; 225 } 226 unsigned gbcategory = getPureGBEncoding(word_buf); 227 228 std::set<TSyllableInfo>::iterator its = pyset.begin(); 229 std::set<TSyllableInfo>::iterator ite = pyset.end(); 230 for (; its != ite; ++its) { 231 const char *t = its->m_py.c_str(); 232 int cost = its->m_cost; 233 int myid = id; 234 235 if (cost < 0) { 236 cost = 30 / (-cost); 237 myid = rmp_id ++; 238 } 239 240 if (m_Lexicon.size() < myid+1) m_Lexicon.resize(myid+1); 241 m_Lexicon[myid] = std::string(word_buf); 242 243 CPinyinTrieMaker::TWordId wid(myid, cost, its->m_cost < 0, gbcategory & 0x1, gbcategory & 0x2); 244 suc = insertFullPinyinPair(t, wid) && suc; 245 246 while (*t) { 247 char *p = buf; 248 while (*t != 0 && *t != CPinyinTrie::SYLLABLE_BREAKER) 249 *p++ = *t++; 250 *p = 0; 251 registerFullSyllable(buf); 252 if (*t == CPinyinTrie::SYLLABLE_BREAKER) 253 ++t; 254 } 255 } 256 } 257 fclose(fp); 258 259 std::string pyPrefix = ""; 260 261 printf("\n %d primitive nodes", m_AllNodes.size()); fflush(stdout); 262 263 /* 264 printf("\n Printing it to stderr..."); 265 print(stderr, &m_RootNode, pyPrefix); 266 */ 267 268 printf("\nThreading non-complete pinyin..."); fflush(stdout); 269 suc = threadNonCompletePinyin() && suc; 270 printf("\n %d total nodes", m_AllNodes.size()); fflush(stdout); 271 272 /* 273 printf("\n Printing it to stderr..."); 274 fprintf(stderr, "\n\n\n\n-----------------------------\n\n\n\n"); 275 print(stderr, &m_RootNode, pyPrefix); 276 */ 277 278 printf("\n"); fflush(stdout); 279 280 return suc; 281 } 282 283 CPinyinTrieMaker::TNode::TNode() 284 : m_bFullSyllableTransfer(false), m_bExpanded(false), m_WordIdSet(), 285 m_Trans(), m_PrimitiveNodes(), m_SyllablePrefix() 286 { 287 } 288 289 bool 290 CPinyinTrieMaker::PNodeSet::operator< (const PNodeSet& another) const 291 { 292 CNodeSet::const_iterator t1 = m_pns->begin(); 293 CNodeSet::const_iterator t2 = m_pns->end(); 294 CNodeSet::const_iterator a1 = another.m_pns->begin(); 295 CNodeSet::const_iterator a2 = another.m_pns->end(); 296 for (; t1 != t2 && a1 != a2; ++t1, ++a1) { 297 if (*t1 < *a1) return true; 298 if (*t1 > *a1) return false; 299 } 300 return (a1 != a2); 301 } 302 303 bool 304 CPinyinTrieMaker::PNodeSet::operator==(const PNodeSet& another) const 305 { 306 CNodeSet::const_iterator t1 = m_pns->begin(); 307 CNodeSet::const_iterator t2 = m_pns->end(); 308 CNodeSet::const_iterator a1 = another.m_pns->begin(); 309 CNodeSet::const_iterator a2 = another.m_pns->end(); 310 for (; t1 != t2 && a1 != a2; ++t1, ++a1) { 311 if (*t1 != *a1) return false; 312 } 313 return (a1 == a2 && t1 != t2); 314 } 315 316 317 void 318 CPinyinTrieMaker::print(FILE* fp, TNode* root, std::string& pinyin) 319 { 320 if (root && root->m_WordIdSet.size() > 0) { 321 fprintf(fp, "%s", pinyin.c_str()); 322 CWordSet::iterator itId = root->m_WordIdSet.begin(); 323 CWordSet::iterator itIdLast = root->m_WordIdSet.end(); 324 for (; itId != itIdLast; ++itId) { 325 fprintf(fp, " %s", m_Lexicon[itId->anony.m_id].c_str()); 326 } 327 fprintf(fp, "\n"); 328 } 329 if (root) { 330 CTrans::iterator itTrans = root->m_Trans.begin(); 331 CTrans::iterator itTransLast = root->m_Trans.end(); 332 for (; itTrans != itTransLast; ++itTrans) { 333 pinyin += itTrans->first; 334 print(fp, itTrans->second, pinyin); 335 pinyin.resize(pinyin.size() -1); 336 } 337 } 338 } 339 340 341 /*********************************************************** 342 343 pinyin trie 344 ***********************************************************/ 345 bool 346 CPinyinTrieMaker::registerFullSyllable(const char* pinyin) 347 { 348 CPinyinString str(pinyin); 349 m_FullSyllables.insert(str); 350 return true; 351 } 352 353 354 CPinyinTrieMaker::TNode* 355 CPinyinTrieMaker::insertTransfer(TNode* pnode, unsigned char c) 356 { 357 CTrans::iterator itt = pnode->m_Trans.find(c); 358 CTrans::iterator ite = pnode->m_Trans.end(); 359 if (itt == ite) { 360 TNode *p = new TNode(); 361 m_AllNodes.push_back(p); 362 pnode->m_Trans[c] = p; 363 if (c != CPinyinTrie::SYLLABLE_BREAKER) { 364 p->m_SyllablePrefix = pnode->m_SyllablePrefix; 365 p->m_SyllablePrefix += c; 366 } 367 return p; 368 } 369 return itt->second; 370 } 371 372 /*********************************************************** 373 pinyin: != NULL, 26' 374 >0'' 375 wid : word id 376 ***********************************************************/ 377 bool 378 CPinyinTrieMaker::insertFullPinyinPair(const char* pinyin, TWordId wid) 379 { 380 const char* p = pinyin; 381 TNode *pnode = &m_RootNode; 382 for (; *p; ++p) { 383 if (*p == CPinyinTrie::SYLLABLE_BREAKER) 384 pnode->m_bFullSyllableTransfer = true; 385 pnode = insertTransfer(pnode, (unsigned char)*p); 386 pnode->m_bExpanded = true; 387 pnode->m_PrimitiveNodes.insert(pnode); 388 m_StateMap[&(pnode->m_PrimitiveNodes)] = pnode; 389 } 390 if (*p-1 != CPinyinTrie::SYLLABLE_BREAKER) { 391 pnode->m_bFullSyllableTransfer = true; 392 pnode = insertTransfer(pnode, CPinyinTrie::SYLLABLE_BREAKER); 393 pnode->m_bExpanded = true; 394 pnode->m_PrimitiveNodes.insert(pnode); 395 m_StateMap[&(pnode->m_PrimitiveNodes)] = pnode; 396 } 397 insertWordId(pnode->m_WordIdSet, wid); 398 return true; 399 } 400 401 bool 402 CPinyinTrieMaker::threadNonCompletePinyin(void) 403 { 404 CNodeList::iterator itNode = m_AllNodes.begin(); 405 for (; itNode != m_AllNodes.end(); ++itNode) { 406 TNode* pnode = *itNode; 407 if (!pnode->m_bExpanded) 408 expandNode(pnode); 409 410 if (pnode->m_SyllablePrefix.size() > 0 && 411 m_FullSyllables.find(pnode->m_SyllablePrefix) == m_FullSyllables.end() && 412 pnode->m_Trans.find(CPinyinTrie::SYLLABLE_BREAKER) == pnode->m_Trans.end()) { 413 addNonCompleteSyllableTransfer(pnode); 414 } 415 } 416 return true; 417 } 418 419 /** 420 * For those node which are added after fullComplete PINYIN string, 421 * give their succ nodes, for example, for "dian'ying dui'yu dian'hua", an 422 * new state "d'" would be expand to "d'y d'h" 423 */ 424 void 425 CPinyinTrieMaker::expandNode(TNode* pnode) 426 { 427 std::map<char, CNodeSet> combTrans; 428 429 CNodeSet::iterator itNode = pnode->m_PrimitiveNodes.begin(); 430 CNodeSet::iterator itNodeLast = pnode->m_PrimitiveNodes.end(); 431 for (; itNode != itNodeLast; ++itNode) { 432 CTrans::iterator itTrans = (*itNode)->m_Trans.begin(); 433 CTrans::iterator itTransLast = (*itNode)->m_Trans.end(); 434 for (; itTrans != itTransLast; ++itTrans) { 435 if (itTrans->first == CPinyinTrie::SYLLABLE_BREAKER && 436 !(*itNode)->m_bFullSyllableTransfer) { 437 continue; 438 } 439 combTrans[itTrans->first].insert(itTrans->second); 440 } 441 } 442 443 std::map<char, CNodeSet>::iterator itCombTrans = combTrans.begin(); 444 std::map<char, CNodeSet>::iterator itCombTransLast = combTrans.end(); 445 for (; itCombTrans != itCombTransLast; ++itCombTrans) { 446 //if a new state, ie new node set appear, create the new state --> ps 447 //esle let ps = the founded old state, let transfer(c) = state 448 unsigned char c = itCombTrans->first; 449 // if (c == CPinyinTrie::SYLLABLE_BREAKER && !pnode->m_bFullSyllableTransfer) { 450 // continue; 451 // } 452 TNode* pChildNode = NULL; 453 CStateMap::iterator itStateMap = m_StateMap.find(&itCombTrans->second); 454 if (itStateMap != m_StateMap.end()) { 455 pChildNode = itStateMap->second; 456 } else { 457 pChildNode = new TNode(); 458 m_AllNodes.push_back(pChildNode); 459 pChildNode->m_PrimitiveNodes = itCombTrans->second; 460 m_StateMap[&pChildNode->m_PrimitiveNodes] = pChildNode; 461 if (c != CPinyinTrie::SYLLABLE_BREAKER) { 462 pChildNode->m_SyllablePrefix = pnode->m_SyllablePrefix; 463 pChildNode->m_SyllablePrefix += c; 464 } else { 465 CNodeSet::iterator itps = pChildNode->m_PrimitiveNodes.begin(); 466 CNodeSet::iterator itpse = pChildNode->m_PrimitiveNodes.end(); 467 for (; itps != itpse; ++itps) { 468 CWordSet::iterator ita = (*itps)->m_WordIdSet.begin(); 469 CWordSet::iterator itb = (*itps)->m_WordIdSet.end(); 470 for (; ita != itb; ++ita) 471 insertWordId(pChildNode->m_WordIdSet, *ita); 472 //pChildNode->m_WordIdSet.insert((*itps)->m_WordIdSet.begin(), (*itps)->m_WordIdSet.end()); 473 } 474 pnode->m_bFullSyllableTransfer = false; 475 } 476 } 477 pnode->m_Trans[c] = pChildNode; 478 } 479 pnode->m_bExpanded = true; 480 } 481 482 void 483 CPinyinTrieMaker::addNonCompleteSyllableTransfer(TNode* pnode) 484 { 485 CNodeSet syChildren; 486 TNode* pChildNode = NULL; 487 488 findSyllableChildren(pnode, syChildren); 489 if (syChildren.size() == 0) //z, c, s with only zh, ch, sh children 490 return; 491 CStateMap::iterator itStateMap = m_StateMap.find(&syChildren); 492 if (itStateMap != m_StateMap.end()) { 493 pChildNode = itStateMap->second; 494 } else { 495 pChildNode = new TNode(); 496 m_AllNodes.push_back(pChildNode); 497 pChildNode->m_PrimitiveNodes = syChildren; 498 m_StateMap[&pChildNode->m_PrimitiveNodes] = pChildNode; 499 CNodeSet::iterator itps = pChildNode->m_PrimitiveNodes.begin(); 500 CNodeSet::iterator itpse = pChildNode->m_PrimitiveNodes.end(); 501 for (; itps != itpse; ++itps) { 502 CWordSet::iterator ita = (*itps)->m_WordIdSet.begin(); 503 CWordSet::iterator itb = (*itps)->m_WordIdSet.end(); 504 for (; ita != itb; ++ita) 505 insertWordId(pChildNode->m_WordIdSet, *ita); 506 //pChildNode->m_WordIdSet.insert((*itps)->m_WordIdSet.begin(), (*itps)->m_WordIdSet.end()); 507 } 508 } 509 pnode->m_Trans[CPinyinTrie::SYLLABLE_BREAKER] = pChildNode; 510 // this is default: pnode->m_bFullSyllableTransfer = false; 511 } 512 513 int 514 CPinyinTrieMaker::findSyllableChildren(const TNode *pn, CNodeSet& children) 515 { 516 CNodeSet::iterator itNode = pn->m_PrimitiveNodes.begin(); 517 CNodeSet::iterator itNodeLast = pn->m_PrimitiveNodes.end(); 518 for (children.clear(); itNode != itNodeLast; ++itNode) { 519 findPrimitiveSyllableChildren(*itNode, children); 520 } 521 return children.size(); 522 } 523 524 void 525 CPinyinTrieMaker::findPrimitiveSyllableChildren(const TNode *pn, CNodeSet& children) 526 { 527 CTrans::const_iterator it = pn->m_Trans.begin(); 528 CTrans::const_iterator ite= pn->m_Trans.end(); 529 for (; it != ite; ++it) { 530 if (it->first != CPinyinTrie::SYLLABLE_BREAKER) { 531 if (it->first == 'h' && 532 (pn->m_SyllablePrefix == "c" || pn->m_SyllablePrefix == "z" || 533 pn->m_SyllablePrefix == "s" ) ) { 534 continue; 535 } 536 findPrimitiveSyllableChildren(it->second, children); 537 } else { 538 if (pn->m_bFullSyllableTransfer) { 539 children.insert(it->second); 540 } 541 } 542 } 543 } 544 545 bool 546 CPinyinTrieMaker::write(const char* fileName, CWordEvaluator* psrt) 547 { 548 bool suc = false; 549 FILE* fp = fopen(fileName, "wb"); 550 if (fp != NULL) { 551 suc = write(fp, psrt); 552 fclose(fp); 553 } 554 return suc; 555 } 556 557 bool 558 CPinyinTrieMaker::write(FILE *fp, CWordEvaluator* psrt) 559 { 560 bool suc = true; 561 static TWCHAR wbuf[1024]; 562 563 std::map<TNode*, unsigned int> nodeOffsetMap; 564 565 /*the file started with m_nWord, the size itself do not included here*/ 566 unsigned int nWord = m_Lexicon.size(); 567 unsigned int nNode = m_AllNodes.size(); 568 unsigned int lexiconOffset; 569 unsigned int offset = sizeof(unsigned int) * 3; 570 571 CNodeList::const_iterator itNode = m_AllNodes.begin(); 572 CNodeList::const_iterator itNodeLast = m_AllNodes.end(); 573 for (; itNode != itNodeLast; ++itNode) { 574 nodeOffsetMap[*itNode] = offset; 575 offset += CPinyinTrie::TNode::size_for((*itNode)->m_Trans.size(), 576 (*itNode)->m_WordIdSet.size()); 577 } 578 lexiconOffset = offset; 579 CLexicon::const_iterator itWordStr = m_Lexicon.begin(); 580 CLexicon::const_iterator itWordStrLast = m_Lexicon.end(); 581 for (; itWordStr != itWordStrLast; ++itWordStr) { 582 MBSTOWCS(wbuf, itWordStr->c_str(), 1024); 583 int sz = WCSLEN(wbuf); 584 offset += (sz+1)*sizeof(TWCHAR); 585 } 586 587 suc = (fwrite(&offset, sizeof(unsigned int), 1, fp) == 1); 588 suc = (fwrite(&nWord, sizeof(unsigned int), 1, fp) == 1); 589 suc = (fwrite(&nNode, sizeof(unsigned int), 1, fp) == 1); 590 suc = (fwrite(&lexiconOffset, sizeof(unsigned int), 1, fp) == 1); 591 592 itNode = m_AllNodes.begin(); 593 itNodeLast = m_AllNodes.end(); 594 for (; itNode != itNodeLast && suc; ++itNode) { 595 CPinyinTrie::TNode outNode; 596 outNode.m_nTransfer = (*itNode)->m_Trans.size(); 597 outNode.m_nWordId = (*itNode)->m_WordIdSet.size(); 598 outNode.m_bFullSyllableTransfer = (*itNode)->m_bFullSyllableTransfer; 599 600 //determine this node's GB category, have some pure gb2312 words, or all GBK/GB18030 words 601 outNode.m_bGBK = 1; 602 outNode.m_bGB18030 = 1; 603 604 TNode* itequ = *itNode; 605 if (outNode.m_nWordId == 0) { 606 if ((*itNode)->m_Trans.find(CPinyinTrie::SYLLABLE_BREAKER) != (*itNode)->m_Trans.end()) { 607 itequ = (*itNode)->m_Trans[CPinyinTrie::SYLLABLE_BREAKER]; 608 if (itequ->m_WordIdSet.size() == 0) { 609 outNode.m_bGBK = 0; 610 outNode.m_bGB18030 = 0; 611 } 612 } else { 613 outNode.m_bGBK = 0; 614 outNode.m_bGB18030 = 0; 615 } 616 } 617 CWordSet::iterator itId = itequ->m_WordIdSet.begin(); 618 CWordSet::iterator itIdLast = itequ->m_WordIdSet.end(); 619 for (; itId != itIdLast && outNode.m_bGBK; ++itId) { 620 outNode.m_bGB18030 &= itId->anony.m_bGB18030; 621 outNode.m_bGBK &= itId->anony.m_bGBK; 622 } 623 #ifdef DEBUG 624 if (outNode.m_bGBK) { 625 CWordSet::iterator itId = (*itNode)->m_WordIdSet.begin(); 626 CWordSet::iterator itIdLast = (*itNode)->m_WordIdSet.end(); 627 fprintf(stderr, "========>("); 628 for (; itId != itIdLast; ++itId) { 629 fprintf(stderr, " %d-%1d", itId->anony.m_id, itId->anony.m_bGBK); 630 } 631 fprintf(stderr, " )\n\n"); 632 fflush(stderr); 633 } 634 #endif 635 suc = (fwrite(&outNode, sizeof(outNode), 1, fp) == 1); 636 637 CTrans::iterator itTrans = (*itNode)->m_Trans.begin(); 638 CTrans::iterator itTransLast = (*itNode)->m_Trans.end(); 639 for (; itTrans != itTransLast && suc; ++itTrans) { 640 CPinyinTrie::TTransUnit tru; 641 tru.m_Char = itTrans->first; 642 tru.m_Offset = nodeOffsetMap[itTrans->second]; 643 assert(tru.m_Offset != 0); 644 suc = (fwrite(&tru, sizeof(tru), 1, fp) == 1); 645 } 646 647 CWordVec vec; 648 itId = (*itNode)->m_WordIdSet.begin(); 649 itIdLast = (*itNode)->m_WordIdSet.end(); 650 for (; itId != itIdLast; ++itId) 651 vec.push_back(TWordInfo(*itId, psrt->getCost(*itId), psrt->isSeen(*itId))); 652 std::make_heap(vec.begin(), vec.end()); 653 std::sort_heap(vec.begin(), vec.end()); 654 655 CWordVec::iterator itv = vec.begin(); 656 CWordVec::iterator itve = vec.end(); 657 for (; itv != itve && suc; ++itv) { 658 CPinyinTrie::TWordIdInfo wi; 659 wi.m_id = itv->m_id.anony.m_id; 660 wi.m_bGBK = itv->m_id.anony.m_bGBK; 661 wi.m_bGB18030 = itv->m_id.anony.m_bGB18030; 662 wi.m_len = m_Lexicon[itv->m_id.anony.m_id].size(); 663 wi.m_bSeen = ((itv->m_bSeen)?(1):(0)); 664 wi.m_cost = itv->m_id.anony.m_cost; 665 suc = (fwrite(&wi, sizeof(wi), 1, fp) == 1); 666 } 667 } 668 itWordStr = m_Lexicon.begin(); 669 itWordStrLast = m_Lexicon.end(); 670 for (; itWordStr != itWordStrLast && suc; ++itWordStr) { 671 MBSTOWCS(wbuf, itWordStr->c_str(), 1024); 672 int sz = WCSLEN(wbuf); 673 suc = (fwrite(wbuf, (sz+1)*sizeof(TWCHAR), 1, fp) == 1); 674 } 675 return suc; 676 } 677