Home | History | Annotate | Download | only in lexicon
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifdef HAVE_CONFIG_H
     39 #include "config.h"
     40 #endif
     41 
     42 #ifdef HAVE_ASSERT_H
     43 #include <assert.h>
     44 #endif
     45 
     46 #include <algorithm>
     47 
     48 #ifdef HAVE_ICONV_H
     49 #include <iconv.h>
     50 #endif
     51 
     52 #include "pytrie_gen.h"
     53 
     54 static const char*
     55 skipSpace(const char* p)
     56 {
     57     while (*p == ' ' || *p == '\t' || *p == '\n' || *p == '\r')
     58         ++p;
     59     return p;
     60 }
     61 
     62 static const char*
     63 skipNonSpace(const char* p)
     64 {
     65     while (*p != '\0' && *p != ' ' && *p != '\t' && *p != '\n' && *p != '\r')
     66         ++p;
     67     return p;
     68 }
     69 
     70 static void
     71 insertWordId(CPinyinTrieMaker::CWordSet& idset, CPinyinTrieMaker::TWordId id)
     72 {
     73     CPinyinTrieMaker::CWordSet::iterator it = idset.find(id);
     74     if (it == idset.end())
     75         idset.insert(id);
     76     else {
     77         const CPinyinTrieMaker::TWordId& a = *it;
     78         if ((a.anony.m_bHide && !id.anony.m_bHide) || (a.anony.m_bHide == id.anony.m_bHide && a.anony.m_cost > id.anony.m_cost)) {
     79             idset.erase(it);
     80             idset.insert(id);
     81         }
     82     }
     83 }
     84 
     85 struct TSyllableInfo {
     86     std::string   m_py;
     87     int           m_cost;
     88 
     89     TSyllableInfo(const char* py=NULL, int cost=0) : m_py(py), m_cost(cost) {}
     90     bool operator< (const TSyllableInfo& b) const { return m_py < b.m_py; }
     91 };
     92 
     93 #ifdef HAVE_ICONV_H
     94 bool isCorrectConverted(const char* utf8, iconv_t ic, iconv_t ric)
     95 {
     96     static char gbstr[256];
     97     static char utstr[256];
     98 
     99     TIConvSrcPtr src = (TIConvSrcPtr)utf8;
    100     size_t srclen = strlen((char*)src)+1;
    101     char* dst = (char *)gbstr;
    102     size_t dstlen = 256;
    103     size_t res = iconv(ic, &src, &srclen, &dst, &dstlen);
    104 
    105     if (res != size_t(-1) && srclen == 0) {
    106         // do revert convertion and compare them
    107         src = (TIConvSrcPtr)gbstr;
    108         srclen = strlen((char*)src)+1;
    109         dst = (char *)utstr;
    110         dstlen = 256;
    111         res = iconv(ric, &src, &srclen, &dst, &dstlen);
    112         if (res != size_t(-1) && srclen == 0)
    113             return (strcmp(utf8, utstr) == 0);
    114     }
    115     return false;
    116 }
    117 
    118 //return: bit 0x1: contains some gbk out of gb2312, bit 0x2: contains some gb18030 outof gbk
    119 unsigned getPureGBEncoding(const char* utf8str)
    120 {
    121     static iconv_t ic_gb = iconv_open("GB2312", "UTF-8");
    122     static iconv_t ic_gbk = iconv_open("GBK", "UTF-8");
    123     static iconv_t ric_gb = iconv_open("UTF-8", "GB2312");
    124     static iconv_t ric_gbk = iconv_open("UTF-8", "GBK");
    125 
    126     unsigned ret = 0;
    127 
    128     if (!isCorrectConverted(utf8str, ic_gb, ric_gb)) {
    129         ret = 1; // at least it is contains some GBK char
    130         if (!isCorrectConverted(utf8str, ic_gbk, ric_gbk))
    131             ret = 3; //contains some GB18030-only char
    132 
    133         #ifdef DEBUG
    134             fprintf(stderr, "==> GB category of (%s) is (0x%x)\n ", utf8str, ret);
    135             fflush(stderr);
    136         #endif
    137     }
    138     return ret;
    139 }
    140 #else // !HAVE_ICONV_H
    141 unsigned getPureGBEncoding(const char* utf8str)
    142 {
    143     // FIXME
    144     return 0x3;
    145 }
    146 #endif // HAVE_ICONV_H
    147 
    148 bool
    149 parseLine(char* buf, char* word_buf, int& id, std::set<TSyllableInfo>& pyset)
    150 {
    151     pyset.clear();
    152 
    153     /* ignore the empty lines and comment lines */
    154     if (*buf == '\n' || *buf == '#')
    155         return 0;
    156 
    157     char* word_start = word_buf;
    158     char* p = (char*)skipSpace(buf);
    159     char* t = (char*)skipNonSpace(p);
    160     while(p < t) *word_buf++ = *p++;
    161     *word_buf = 0;
    162 
    163     p = (char*)skipSpace(p);
    164     t = (char*)skipNonSpace(p);
    165     if (*t)
    166         *t++ = 0;
    167     id = atoi(p);
    168     p = (char*)skipSpace(t);
    169     while (*p) {
    170         const char* s = p;
    171         t = (char*)skipNonSpace(p);
    172         if (*t)
    173             *t++ = 0;
    174         while ((*p >= 'a' && *p <= 'z') || (*p == CPinyinTrie::SYLLABLE_BREAKER))
    175             ++p;
    176         if ((p > s) && ((*p == 0) || (*p == ':'))) {
    177             int  cost = 0;
    178             if (*p == ':') {
    179                 *p++ = 0;
    180                 cost = atoi(p);
    181             }
    182             pyset.insert(TSyllableInfo(s, cost));
    183         }
    184         p = (char*)skipSpace(t);
    185     }
    186     return pyset.size() > 0;
    187 }
    188 
    189 
    190 CPinyinTrieMaker::CPinyinTrieMaker()
    191     : m_RootNode(), m_FullSyllables(), m_StateMap(), m_AllNodes()
    192 {
    193     m_AllNodes.push_back(&m_RootNode);
    194     m_RootNode.m_bExpanded = true;
    195     m_RootNode.m_PrimitiveNodes.insert(&m_RootNode);
    196     m_StateMap[&(m_RootNode.m_PrimitiveNodes)] = &m_RootNode;
    197 }
    198 /**********************************************************
    199     lexicon
    200         TAB(1)
    201          word id
    202         '
    203         4095;
    204 **********************************************************/
    205 #define RARE_MULTI_PHONETIC_STARTING_ID 140000 /* FIXME */
    206 bool
    207 CPinyinTrieMaker::constructFromLexicon(const char* fileName)
    208 {
    209     static int  rmp_id = RARE_MULTI_PHONETIC_STARTING_ID;
    210     static char buf[4096];
    211     static char word_buf[2048];
    212 
    213     int id;
    214     bool suc = true;
    215     std::set<TSyllableInfo> pyset;
    216     FILE *fp = fopen(fileName, "r");
    217     printf("Adding pinyin and corresponding words..."); fflush(stdout);
    218     while (fgets(buf, 4096, fp) != NULL) {
    219         if (!parseLine(buf, word_buf, id, pyset)) {
    220             if (word_buf[0] != L'<' && word_buf[0] != 0) {
    221                 if (m_Lexicon.size() < id+1) m_Lexicon.resize(id+1);
    222                 m_Lexicon[id] = std::string(word_buf);
    223             }
    224             continue;
    225         }
    226         unsigned gbcategory = getPureGBEncoding(word_buf);
    227 
    228         std::set<TSyllableInfo>::iterator its = pyset.begin();
    229         std::set<TSyllableInfo>::iterator ite = pyset.end();
    230         for (; its != ite; ++its) {
    231             const char *t = its->m_py.c_str();
    232             int cost = its->m_cost;
    233             int myid = id;
    234 
    235             if (cost < 0) {
    236                 cost = 30 / (-cost);
    237                 myid = rmp_id ++;
    238             }
    239 
    240             if (m_Lexicon.size() < myid+1) m_Lexicon.resize(myid+1);
    241             m_Lexicon[myid] = std::string(word_buf);
    242 
    243             CPinyinTrieMaker::TWordId wid(myid, cost, its->m_cost < 0, gbcategory & 0x1, gbcategory & 0x2);
    244             suc = insertFullPinyinPair(t, wid) && suc;
    245 
    246             while (*t) {
    247                 char *p = buf;
    248                 while (*t != 0 && *t != CPinyinTrie::SYLLABLE_BREAKER)
    249                     *p++ = *t++;
    250                 *p = 0;
    251                 registerFullSyllable(buf);
    252                 if (*t == CPinyinTrie::SYLLABLE_BREAKER)
    253                     ++t;
    254             }
    255         }
    256     }
    257     fclose(fp);
    258 
    259     std::string pyPrefix = "";
    260 
    261     printf("\n    %d primitive nodes", m_AllNodes.size());  fflush(stdout);
    262 
    263     /*
    264     printf("\n    Printing it to stderr...");
    265     print(stderr, &m_RootNode, pyPrefix);
    266     */
    267 
    268     printf("\nThreading non-complete pinyin...");  fflush(stdout);
    269     suc = threadNonCompletePinyin() && suc;
    270     printf("\n    %d total nodes", m_AllNodes.size());  fflush(stdout);
    271 
    272     /*
    273     printf("\n    Printing it to stderr...");
    274     fprintf(stderr, "\n\n\n\n-----------------------------\n\n\n\n");
    275     print(stderr, &m_RootNode, pyPrefix);
    276     */
    277 
    278     printf("\n");  fflush(stdout);
    279 
    280     return suc;
    281 }
    282 
    283 CPinyinTrieMaker::TNode::TNode()
    284     : m_bFullSyllableTransfer(false), m_bExpanded(false), m_WordIdSet(),
    285       m_Trans(), m_PrimitiveNodes(), m_SyllablePrefix()
    286 {
    287 }
    288 
    289 bool
    290 CPinyinTrieMaker::PNodeSet::operator< (const PNodeSet& another) const
    291 {
    292     CNodeSet::const_iterator t1 = m_pns->begin();
    293     CNodeSet::const_iterator t2 = m_pns->end();
    294     CNodeSet::const_iterator a1 = another.m_pns->begin();
    295     CNodeSet::const_iterator a2 = another.m_pns->end();
    296     for (; t1 != t2 && a1 != a2; ++t1, ++a1) {
    297         if (*t1 < *a1) return true;
    298         if (*t1 > *a1) return false;
    299     }
    300     return (a1 != a2);
    301 }
    302 
    303 bool
    304 CPinyinTrieMaker::PNodeSet::operator==(const PNodeSet& another) const
    305 {
    306     CNodeSet::const_iterator t1 = m_pns->begin();
    307     CNodeSet::const_iterator t2 = m_pns->end();
    308     CNodeSet::const_iterator a1 = another.m_pns->begin();
    309     CNodeSet::const_iterator a2 = another.m_pns->end();
    310     for (; t1 != t2 && a1 != a2; ++t1, ++a1) {
    311         if (*t1 != *a1) return false;
    312     }
    313     return (a1 == a2 && t1 != t2);
    314 }
    315 
    316 
    317 void
    318 CPinyinTrieMaker::print(FILE* fp, TNode* root, std::string& pinyin)
    319 {
    320     if (root && root->m_WordIdSet.size() > 0) {
    321         fprintf(fp, "%s", pinyin.c_str());
    322         CWordSet::iterator itId = root->m_WordIdSet.begin();
    323         CWordSet::iterator itIdLast = root->m_WordIdSet.end();
    324         for (; itId != itIdLast; ++itId) {
    325             fprintf(fp, " %s", m_Lexicon[itId->anony.m_id].c_str());
    326         }
    327         fprintf(fp, "\n");
    328     }
    329     if (root) {
    330         CTrans::iterator itTrans = root->m_Trans.begin();
    331         CTrans::iterator itTransLast = root->m_Trans.end();
    332         for (; itTrans != itTransLast; ++itTrans) {
    333             pinyin += itTrans->first;
    334             print(fp, itTrans->second, pinyin);
    335             pinyin.resize(pinyin.size() -1);
    336         }
    337     }
    338 }
    339 
    340 
    341 /***********************************************************
    342     
    343     pinyin trie
    344 ***********************************************************/
    345 bool
    346 CPinyinTrieMaker::registerFullSyllable(const char* pinyin)
    347 {
    348     CPinyinString str(pinyin);
    349     m_FullSyllables.insert(str);
    350     return true;
    351 }
    352 
    353 
    354 CPinyinTrieMaker::TNode*
    355 CPinyinTrieMaker::insertTransfer(TNode* pnode, unsigned char c)
    356 {
    357     CTrans::iterator itt = pnode->m_Trans.find(c);
    358     CTrans::iterator ite = pnode->m_Trans.end();
    359     if (itt == ite) {
    360         TNode *p = new TNode();
    361         m_AllNodes.push_back(p);
    362         pnode->m_Trans[c] = p;
    363         if (c != CPinyinTrie::SYLLABLE_BREAKER) {
    364             p->m_SyllablePrefix = pnode->m_SyllablePrefix;
    365             p->m_SyllablePrefix += c;
    366         }
    367         return p;
    368     }
    369     return itt->second;
    370 }
    371 
    372 /***********************************************************
    373     pinyin:   != NULL, 26'
    374                >0''
    375     wid   :   word id
    376 ***********************************************************/
    377 bool
    378 CPinyinTrieMaker::insertFullPinyinPair(const char* pinyin, TWordId wid)
    379 {
    380     const char* p = pinyin;
    381     TNode *pnode = &m_RootNode;
    382     for (; *p; ++p) {
    383         if (*p == CPinyinTrie::SYLLABLE_BREAKER)
    384             pnode->m_bFullSyllableTransfer = true;
    385         pnode = insertTransfer(pnode, (unsigned char)*p);
    386         pnode->m_bExpanded = true;
    387         pnode->m_PrimitiveNodes.insert(pnode);
    388         m_StateMap[&(pnode->m_PrimitiveNodes)] = pnode;
    389     }
    390     if (*p-1 != CPinyinTrie::SYLLABLE_BREAKER) {
    391         pnode->m_bFullSyllableTransfer = true;
    392         pnode = insertTransfer(pnode, CPinyinTrie::SYLLABLE_BREAKER);
    393         pnode->m_bExpanded = true;
    394         pnode->m_PrimitiveNodes.insert(pnode);
    395         m_StateMap[&(pnode->m_PrimitiveNodes)] = pnode;
    396     }
    397     insertWordId(pnode->m_WordIdSet, wid);
    398     return true;
    399 }
    400 
    401 bool
    402 CPinyinTrieMaker::threadNonCompletePinyin(void)
    403 {
    404     CNodeList::iterator itNode = m_AllNodes.begin();
    405     for (; itNode != m_AllNodes.end(); ++itNode) {
    406         TNode* pnode = *itNode;
    407         if (!pnode->m_bExpanded)
    408             expandNode(pnode);
    409 
    410         if (pnode->m_SyllablePrefix.size() > 0 &&
    411                 m_FullSyllables.find(pnode->m_SyllablePrefix) == m_FullSyllables.end() &&
    412                 pnode->m_Trans.find(CPinyinTrie::SYLLABLE_BREAKER) == pnode->m_Trans.end()) {
    413             addNonCompleteSyllableTransfer(pnode);
    414         }
    415     }
    416     return true;
    417 }
    418 
    419 /**
    420 * For those node which are added after fullComplete PINYIN string,
    421 * give their succ nodes, for example, for "dian'ying dui'yu dian'hua", an
    422 * new state "d'" would be expand to "d'y d'h"
    423 */
    424 void
    425 CPinyinTrieMaker::expandNode(TNode* pnode)
    426 {
    427     std::map<char, CNodeSet> combTrans;
    428 
    429     CNodeSet::iterator itNode = pnode->m_PrimitiveNodes.begin();
    430     CNodeSet::iterator itNodeLast = pnode->m_PrimitiveNodes.end();
    431     for (; itNode != itNodeLast; ++itNode) {
    432          CTrans::iterator itTrans = (*itNode)->m_Trans.begin();
    433          CTrans::iterator itTransLast = (*itNode)->m_Trans.end();
    434          for (; itTrans != itTransLast; ++itTrans) {
    435              if (itTrans->first == CPinyinTrie::SYLLABLE_BREAKER &&
    436                         !(*itNode)->m_bFullSyllableTransfer) {
    437                  continue;
    438              }
    439              combTrans[itTrans->first].insert(itTrans->second);
    440          }
    441     }
    442 
    443     std::map<char, CNodeSet>::iterator itCombTrans = combTrans.begin();
    444     std::map<char, CNodeSet>::iterator itCombTransLast = combTrans.end();
    445     for (; itCombTrans != itCombTransLast; ++itCombTrans) {
    446         //if a new state, ie new node set appear, create the new state --> ps
    447         //esle let ps = the founded old state, let transfer(c) = state
    448         unsigned char c = itCombTrans->first;
    449 //        if (c == CPinyinTrie::SYLLABLE_BREAKER && !pnode->m_bFullSyllableTransfer) {
    450 //            continue;
    451 //        }
    452         TNode* pChildNode = NULL;
    453         CStateMap::iterator itStateMap = m_StateMap.find(&itCombTrans->second);
    454         if (itStateMap != m_StateMap.end()) {
    455             pChildNode = itStateMap->second;
    456         } else {
    457             pChildNode = new TNode();
    458             m_AllNodes.push_back(pChildNode);
    459             pChildNode->m_PrimitiveNodes = itCombTrans->second;
    460             m_StateMap[&pChildNode->m_PrimitiveNodes] = pChildNode;
    461             if (c != CPinyinTrie::SYLLABLE_BREAKER) {
    462                 pChildNode->m_SyllablePrefix = pnode->m_SyllablePrefix;
    463                 pChildNode->m_SyllablePrefix += c;
    464             } else {
    465                 CNodeSet::iterator itps = pChildNode->m_PrimitiveNodes.begin();
    466                 CNodeSet::iterator itpse = pChildNode->m_PrimitiveNodes.end();
    467                 for (; itps != itpse; ++itps) {
    468                     CWordSet::iterator ita = (*itps)->m_WordIdSet.begin();
    469                     CWordSet::iterator itb = (*itps)->m_WordIdSet.end();
    470                     for (; ita != itb; ++ita)
    471                         insertWordId(pChildNode->m_WordIdSet, *ita);
    472                     //pChildNode->m_WordIdSet.insert((*itps)->m_WordIdSet.begin(), (*itps)->m_WordIdSet.end());
    473                 }
    474                 pnode->m_bFullSyllableTransfer = false;
    475             }
    476         }
    477         pnode->m_Trans[c] = pChildNode;
    478     }
    479     pnode->m_bExpanded = true;
    480 }
    481 
    482 void
    483 CPinyinTrieMaker::addNonCompleteSyllableTransfer(TNode* pnode)
    484 {
    485     CNodeSet syChildren;
    486     TNode* pChildNode = NULL;
    487 
    488     findSyllableChildren(pnode, syChildren);
    489     if (syChildren.size() == 0)  //z, c, s with only zh, ch, sh children
    490         return;
    491     CStateMap::iterator itStateMap = m_StateMap.find(&syChildren);
    492     if (itStateMap != m_StateMap.end()) {
    493         pChildNode = itStateMap->second;
    494     } else {
    495         pChildNode = new TNode();
    496         m_AllNodes.push_back(pChildNode);
    497         pChildNode->m_PrimitiveNodes = syChildren;
    498         m_StateMap[&pChildNode->m_PrimitiveNodes] = pChildNode;
    499         CNodeSet::iterator itps = pChildNode->m_PrimitiveNodes.begin();
    500         CNodeSet::iterator itpse = pChildNode->m_PrimitiveNodes.end();
    501         for (; itps != itpse; ++itps) {
    502             CWordSet::iterator ita = (*itps)->m_WordIdSet.begin();
    503             CWordSet::iterator itb = (*itps)->m_WordIdSet.end();
    504             for (; ita != itb; ++ita)
    505                 insertWordId(pChildNode->m_WordIdSet, *ita);
    506             //pChildNode->m_WordIdSet.insert((*itps)->m_WordIdSet.begin(), (*itps)->m_WordIdSet.end());
    507         }
    508     }
    509     pnode->m_Trans[CPinyinTrie::SYLLABLE_BREAKER] = pChildNode;
    510     // this is default: pnode->m_bFullSyllableTransfer = false;
    511 }
    512 
    513 int
    514 CPinyinTrieMaker::findSyllableChildren(const TNode *pn, CNodeSet& children)
    515 {
    516     CNodeSet::iterator itNode = pn->m_PrimitiveNodes.begin();
    517     CNodeSet::iterator itNodeLast = pn->m_PrimitiveNodes.end();
    518     for (children.clear(); itNode != itNodeLast; ++itNode) {
    519         findPrimitiveSyllableChildren(*itNode, children);
    520     }
    521     return children.size();
    522 }
    523 
    524 void
    525 CPinyinTrieMaker::findPrimitiveSyllableChildren(const TNode *pn, CNodeSet& children)
    526 {
    527     CTrans::const_iterator it = pn->m_Trans.begin();
    528     CTrans::const_iterator ite= pn->m_Trans.end();
    529     for (; it != ite; ++it) {
    530         if (it->first != CPinyinTrie::SYLLABLE_BREAKER) {
    531             if (it->first == 'h' &&
    532                 (pn->m_SyllablePrefix == "c" || pn->m_SyllablePrefix == "z" ||
    533                  pn->m_SyllablePrefix == "s" ) ) {
    534                 continue;
    535             }
    536             findPrimitiveSyllableChildren(it->second, children);
    537         } else {
    538             if (pn->m_bFullSyllableTransfer) {
    539                 children.insert(it->second);
    540             }
    541         }
    542     }
    543 }
    544 
    545 bool
    546 CPinyinTrieMaker::write(const char* fileName, CWordEvaluator* psrt)
    547 {
    548     bool suc = false;
    549     FILE* fp = fopen(fileName, "wb");
    550     if (fp != NULL) {
    551         suc = write(fp, psrt);
    552         fclose(fp);
    553     }
    554     return suc;
    555 }
    556 
    557 bool
    558 CPinyinTrieMaker::write(FILE *fp, CWordEvaluator* psrt)
    559 {
    560     bool suc = true;
    561     static TWCHAR wbuf[1024];
    562 
    563     std::map<TNode*, unsigned int> nodeOffsetMap;
    564 
    565     /*the file started with m_nWord, the size itself do not included here*/
    566     unsigned int nWord = m_Lexicon.size();
    567     unsigned int nNode = m_AllNodes.size();
    568     unsigned int lexiconOffset;
    569     unsigned int offset = sizeof(unsigned int) * 3;
    570 
    571     CNodeList::const_iterator itNode = m_AllNodes.begin();
    572     CNodeList::const_iterator itNodeLast = m_AllNodes.end();
    573     for (; itNode != itNodeLast; ++itNode) {
    574         nodeOffsetMap[*itNode] = offset;
    575         offset += CPinyinTrie::TNode::size_for((*itNode)->m_Trans.size(),
    576                                                (*itNode)->m_WordIdSet.size());
    577     }
    578     lexiconOffset = offset;
    579     CLexicon::const_iterator itWordStr = m_Lexicon.begin();
    580     CLexicon::const_iterator itWordStrLast = m_Lexicon.end();
    581     for (; itWordStr != itWordStrLast; ++itWordStr) {
    582         MBSTOWCS(wbuf, itWordStr->c_str(), 1024);
    583         int sz = WCSLEN(wbuf);
    584         offset += (sz+1)*sizeof(TWCHAR);
    585     }
    586 
    587     suc = (fwrite(&offset, sizeof(unsigned int), 1, fp) == 1);
    588     suc = (fwrite(&nWord, sizeof(unsigned int), 1, fp) == 1);
    589     suc = (fwrite(&nNode, sizeof(unsigned int), 1, fp) == 1);
    590     suc = (fwrite(&lexiconOffset, sizeof(unsigned int), 1, fp) == 1);
    591 
    592     itNode = m_AllNodes.begin();
    593     itNodeLast = m_AllNodes.end();
    594     for (; itNode != itNodeLast && suc; ++itNode) {
    595         CPinyinTrie::TNode outNode;
    596         outNode.m_nTransfer = (*itNode)->m_Trans.size();
    597         outNode.m_nWordId = (*itNode)->m_WordIdSet.size();
    598         outNode.m_bFullSyllableTransfer = (*itNode)->m_bFullSyllableTransfer;
    599 
    600         //determine this node's GB category, have some pure gb2312 words, or all GBK/GB18030 words
    601         outNode.m_bGBK = 1;
    602         outNode.m_bGB18030 = 1;
    603 
    604         TNode* itequ = *itNode;
    605         if (outNode.m_nWordId == 0) {
    606             if ((*itNode)->m_Trans.find(CPinyinTrie::SYLLABLE_BREAKER) != (*itNode)->m_Trans.end()) {
    607                 itequ = (*itNode)->m_Trans[CPinyinTrie::SYLLABLE_BREAKER];
    608                 if (itequ->m_WordIdSet.size() == 0) {
    609                     outNode.m_bGBK = 0;
    610                     outNode.m_bGB18030 = 0;
    611                 }
    612             } else {
    613                 outNode.m_bGBK = 0;
    614                 outNode.m_bGB18030 = 0;
    615             }
    616         }
    617         CWordSet::iterator itId = itequ->m_WordIdSet.begin();
    618         CWordSet::iterator itIdLast = itequ->m_WordIdSet.end();
    619         for (; itId != itIdLast && outNode.m_bGBK; ++itId) {
    620             outNode.m_bGB18030 &= itId->anony.m_bGB18030;
    621             outNode.m_bGBK &= itId->anony.m_bGBK;
    622         }
    623         #ifdef DEBUG
    624             if (outNode.m_bGBK) {
    625                 CWordSet::iterator itId = (*itNode)->m_WordIdSet.begin();
    626                 CWordSet::iterator itIdLast = (*itNode)->m_WordIdSet.end();
    627                 fprintf(stderr, "========>(");
    628                 for (; itId != itIdLast; ++itId) {
    629                     fprintf(stderr, " %d-%1d", itId->anony.m_id, itId->anony.m_bGBK);
    630                 }
    631                 fprintf(stderr, " )\n\n");
    632                 fflush(stderr);
    633             }
    634         #endif
    635         suc = (fwrite(&outNode, sizeof(outNode), 1, fp) == 1);
    636 
    637         CTrans::iterator itTrans = (*itNode)->m_Trans.begin();
    638         CTrans::iterator itTransLast = (*itNode)->m_Trans.end();
    639         for (; itTrans != itTransLast && suc; ++itTrans) {
    640             CPinyinTrie::TTransUnit tru;
    641             tru.m_Char = itTrans->first;
    642             tru.m_Offset = nodeOffsetMap[itTrans->second];
    643             assert(tru.m_Offset != 0);
    644             suc = (fwrite(&tru, sizeof(tru), 1, fp) == 1);
    645         }
    646 
    647         CWordVec vec;
    648         itId = (*itNode)->m_WordIdSet.begin();
    649         itIdLast = (*itNode)->m_WordIdSet.end();
    650         for (; itId != itIdLast; ++itId)
    651             vec.push_back(TWordInfo(*itId, psrt->getCost(*itId), psrt->isSeen(*itId)));
    652         std::make_heap(vec.begin(), vec.end());
    653         std::sort_heap(vec.begin(), vec.end());
    654 
    655         CWordVec::iterator itv = vec.begin();
    656         CWordVec::iterator itve = vec.end();
    657         for (; itv != itve && suc; ++itv) {
    658             CPinyinTrie::TWordIdInfo wi;
    659             wi.m_id = itv->m_id.anony.m_id;
    660             wi.m_bGBK = itv->m_id.anony.m_bGBK;
    661             wi.m_bGB18030 = itv->m_id.anony.m_bGB18030;
    662             wi.m_len = m_Lexicon[itv->m_id.anony.m_id].size();
    663             wi.m_bSeen = ((itv->m_bSeen)?(1):(0));
    664             wi.m_cost = itv->m_id.anony.m_cost;
    665             suc = (fwrite(&wi, sizeof(wi), 1, fp) == 1);
    666         }
    667     }
    668     itWordStr = m_Lexicon.begin();
    669     itWordStrLast = m_Lexicon.end();
    670     for (; itWordStr != itWordStrLast && suc; ++itWordStr) {
    671         MBSTOWCS(wbuf, itWordStr->c_str(), 1024);
    672         int sz = WCSLEN(wbuf);
    673         suc = (fwrite(wbuf, (sz+1)*sizeof(TWCHAR), 1, fp) == 1);
    674     }
    675     return suc;
    676 }
    677