Home | History | Annotate | Download | only in lexicon
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifdef HAVE_CONFIG_H
     39 #include <config.h>
     40 #endif
     41 
     42 #include <stdio.h>
     43 #include <fcntl.h>
     44 #include <unistd.h>
     45 #include <deque>
     46 
     47 #ifdef HAVE_SYS_MMAN_H
     48 #include <sys/mman.h>
     49 #endif
     50 
     51 #include "pytrie.h"
     52 
     53 const unsigned char CPinyinTrie::SYLLABLE_BREAKER; // out-of-class definition of the static constant; no initializer here, so its value must come from the in-class declaration (presumably in pytrie.h)
     54 
     55 const CPinyinTrie::TNode*
     56 CPinyinTrie::transfer(const TNode* pnode, const unsigned char* str) const
     57 {
     58     if (str) {
     59         for (; *str != 0 && pnode != NULL; ++str)
     60             pnode = transfer(pnode, *str);
     61     }
     62     return pnode;
     63 }
     64 
     65 const CPinyinTrie::TNode*
     66 CPinyinTrie::transfer(const TNode* pnode, const TWCHAR* wstr) const
     67 {
     68     if (wstr) {
     69         for (; *wstr != WCH_NULL && pnode != NULL; ++wstr)
     70             pnode = transfer(pnode, unsigned(*wstr));
     71     }
     72     return pnode;
     73 }
     74 
     75 const CPinyinTrie::TNode*
     76 CPinyinTrie::transfer(const TNode* pnode, const unsigned char* str, int nlen) const
     77 {
     78     if (str && nlen > 0) {
     79         for (; nlen>0 && *str != 0 && pnode != NULL; --nlen)
     80             pnode = transfer(pnode, *str++);
     81     }
     82     return pnode;
     83 }
     84 
     85 const CPinyinTrie::TNode*
     86 CPinyinTrie::transfer(const TNode* pnode, const TWCHAR* wstr, int nlen) const
     87 {
     88     if (wstr && nlen > 0) {
     89         for (; nlen>0 && *wstr != WCH_NULL && pnode != NULL; --nlen)
     90             pnode = transfer(pnode, unsigned(*wstr++));
     91     }
     92     return pnode;
     93 }
     94 
     95 bool
     96 CPinyinTrie::isValid(const TNode* pnode, bool allowNonComplete, bool allowGBK)
     97 {
     98     if ((pnode != NULL) && (allowGBK || pnode->m_bGBK == 0) && (pnode->transfer(SYLLABLE_BREAKER) != 0))
     99         return (allowNonComplete || (pnode->m_bFullSyllableTransfer == 1));
    100     return false;
    101 }
    102 
    103 unsigned int
    104 CPinyinTrie::TNode::transfer(unsigned char c) const
    105 {
    106     unsigned int s = 0, t = m_nTransfer;
    107     const TTransUnit* ptrans = getTrans();
    108     while (s < t) {
    109         int m = (s+t)/2;
    110         if (ptrans[m].m_Char == c)
    111             return ptrans[m].m_Offset;
    112         if (ptrans[m].m_Char < c)
    113             s = m + 1;
    114         else
    115             t = m;
    116     }
    117     return 0;
    118 }
    119 
    120 int
    121 CPinyinTrie::lengthAt(unsigned int idx) const
    122 {
    123     if (idx < getWordCount() -1 ) {
    124        return (m_words[idx+1] - m_words[idx]) - 1;
    125     } else if (idx == getWordCount() - 1) {
    126         return (((TWCHAR*)(m_mem+m_Size))-m_words[idx])-1;
    127     }
    128     return 0;
    129 }
    130 
    131 void
    132 CPinyinTrie::print(FILE *fp) const
    133 {
    134     std::string prefix;
    135     print(getRootNode(), prefix, fp);
    136 }
    137 
    138 void
    139 CPinyinTrie::print(const TNode* pRoot, std::string& prefix, FILE *fp) const
    140 {
    141     static char buf[1024];
    142     if (pRoot->m_nWordId > 0) {
    143         fprintf(fp, "%s", prefix.c_str());
    144         if (pRoot->m_bGBK)
    145             fprintf(fp, "(PureGBK)");
    146         unsigned int sz = pRoot->m_nWordId;
    147         const TWordIdInfo *pwids = pRoot->getWordIdPtr();
    148         for (unsigned int i = 0; i < sz; ++i) {
    149             unsigned int id = pwids[i].m_id;
    150             const TWCHAR *pw = operator[](id);
    151             int len = WCSLEN(pw);
    152             if (len != lengthAt(id)) {
    153                 printf(" (lengthAt %d error) ", id);
    154             }
    155             WCSTOMBS(buf, pw, 1024);
    156             fprintf(fp, " %s", buf);
    157             if (pwids[i].m_bSeen == 0)
    158                 fprintf(fp, "[x]");
    159             else
    160                 fprintf(fp, "[o]");
    161         }
    162         fprintf(fp, "\n");
    163     }
    164     unsigned int sz = pRoot->m_nTransfer;
    165     const TTransUnit* ptrans = pRoot->getTrans();
    166     for (unsigned int i = 0; i < sz; ++i) {
    167         unsigned char c = (unsigned char)(ptrans[i].m_Char);
    168         const TNode *pch = transfer(pRoot, c);
    169         prefix += c;
    170         print(pch, prefix, fp);
    171         prefix.resize(prefix.size()-1);
    172     }
    173 }
    174 
    175 unsigned int
    176 CPinyinTrie::getSimbolId(const TWCHAR* wstr)
    177 {
    178     std::map<wstring, unsigned>::const_iterator it;
    179 
    180     it = m_SimbolMap.find(wstring(wstr));
    181     if (it != m_SimbolMap.end())
    182         return it->second;
    183     return 0;
    184 }
    185 
    186 unsigned int
    187 CPinyinTrie::getSimbolId(const wstring & wstr)
    188 {
    189     std::map<wstring, unsigned>::const_iterator it;
    190 
    191     it = m_SimbolMap.find(wstr);
    192     if (it != m_SimbolMap.end())
    193         return it->second;
    194     return 0;
    195 }
    196 
    197 void
    198 CPinyinTrie::free(void)
    199 {
    200     if (m_mem) {
    201 #ifdef HAVE_SYS_MMAN_H
    202         m_mem -= sizeof(unsigned int);
    203         m_Size += sizeof(unsigned int);
    204         munmap (m_mem, m_Size);
    205 #else
    206         delete []m_mem;
    207 #endif
    208         m_mem = NULL;
    209     }
    210     if (m_words) {
    211         delete []m_words;
    212         m_words = NULL;
    213     }
    214     m_SimbolMap.clear();
    215 }
    216 
    217 bool
    218 CPinyinTrie::load(const char *fname)
    219 {
    220     free();
    221 
    222     bool suc = false;
    223     int fd = open(fname, O_RDONLY);
    224     if (fd == -1) return false;
    225 
    226     suc = read (fd, &m_Size, sizeof(unsigned int)) > 0;
    227 #ifdef HAVE_SYS_MMAN_H
    228     suc = suc && ((m_mem = (char *) mmap (NULL, m_Size+sizeof(unsigned int), PROT_READ, MAP_SHARED, fd, 0))
    229                   != MAP_FAILED);
    230     m_mem += sizeof(unsigned int);
    231 #else
    232     suc = suc && ((m_mem = new char [m_Size]) != NULL);
    233     suc = suc && (read (fd, m_mem, m_Size) > 0);
    234 #endif
    235     close(fd);
    236 
    237     suc = suc && ((m_words = new TWCHAR*[getWordCount()]) != NULL);
    238 
    239     if (suc) {
    240         TWCHAR *p = (TWCHAR *)(m_mem + getStringOffset());
    241         for (int i=0, sz=getWordCount(); i < sz; ++i) {
    242             m_words[i] = p;
    243             while (*p++)
    244                 ;
    245         }
    246         for (unsigned i=1; i < 100; ++i) {
    247             if (*m_words[i] != WCH_NULL && *m_words[i] != WCH_LESSTHAN)
    248                 m_SimbolMap[wstring(m_words[i])] = i;
    249         }
    250     }
    251     return suc;
    252 }
    253