OpenGrok

Cross Reference: slmseg.cpp
xref: /nv-g11n/inputmethod/sunpinyin/slm/src/slm/slmseg/slmseg.cpp
Home | History | Annotate | Line # | Download | only in slmseg
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifdef HAVE_CONFIG_H
     39 #include "config.h"
     40 #endif
     41 
     42 #ifdef HAVE_ASSERT_H
     43 #include <assert.h>
     44 #endif
     45 
     46 #ifdef HAVE_GETOPT_H
     47 #include <getopt.h>
     48 #endif
     49 
     50 #include <stdio.h>
     51 #include <unistd.h>
     52 #include <locale.h>
     53 
     54 #include <vector>
     55 #include <map>
     56 #include <algorithm>
     57 
     58 #include "../sim_dict.h"
     59 #include "../sim_sen.h"
     60 #include "../slm.h"
     61 
/*
 * Long-option table handed to getopt_long() in getParameters(). Each entry
 * returns the same letter as its short-option counterpart; only --show-id
 * takes no argument.
 */
static struct option long_options[] =
{
    { "dict",    required_argument, NULL, 'd' },
    { "format",  required_argument, NULL, 'f' },
    { "show-id", no_argument,       NULL, 'i' },
    { "s-tok",   required_argument, NULL, 's' },
    { "model",   required_argument, NULL, 'm' },
    { NULL,      0,                 NULL, 0   }   /* terminator */
};
     71 
     72 static char* s_strDictFile = NULL;
     73 static char* s_strSlmFile = NULL;
     74 static bool s_bTextOut = false;
     75 static bool s_bShowId = false;
     76 static TSIMWordId s_iSTOKID = 10;
     77 
     78 static CSIMDict *s_dict = NULL;
     79 static CThreadSlm *s_tslm = NULL;
     80 
/**
 * Print the command-line help to stderr and terminate the process.
 *
 * Never returns: the function always ends in exit(1000).
 * NOTE(review): on POSIX systems only the low 8 bits of the exit status are
 * visible to the parent, so callers will not observe the value 1000 itself.
 */
static void
ShowUsage()
{
    /*
     * The long form of -s is spelled "--s-tok" to match the entry registered
     * in long_options; the previous text advertised "--stok", which
     * getopt_long() does not know about.
     */
    static const char usage[] =
        "\nUsage:\n"
        "slmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-m lm_file]\n\n"
        "  -f --format:\n"
        "    Output Format, can be 'text' or 'bin'. default 'bin'\n"
        "    Normally, in text mode, word text are output, while in binary mode,\n"
        "    binary short integer of the word-ids are writed to stdout.\n"
        "  -s --s-tok:\n"
        "    Sentence token id. Default 10.\n"
        "    It will be write to output in binary mode after every sentence.\n"
        "  -i --show-id:\n"
        "    Show Id info. Under text output format mode, Attach id after known-words.\n"
        "                  Under binary mode, print id in text.\n"
        "  -m --model:\n"
        "    Language model file name"
        "\n"
        "Notes:\n"
        "  Under binary mode, consecutive id of 0 are merged into one 0.\n"
        "  Under text mode, no space are insert between unknown-words. \n"
        "\n"
        "\n";

    fputs(usage, stderr);
    exit(1000);
}
    106 
    107 static void
    108 getParameters(int argc, char* argv[])
    109 {
    110     int c;
    111     while ((c=getopt_long(argc, argv, "d:if:s:m:", long_options, NULL)) != -1)
    112     {
    113         switch (c) {
    114         case 'd':
    115             s_strDictFile = strdup(optarg);
    116             break;
    117         case 'i':
    118             s_bShowId = true;
    119             break;
    120         case 'f':
    121             s_bTextOut = (strcmp(optarg, "text") == 0);
    122             break;
    123         case 's':
    124             s_iSTOKID = atoi(optarg);
    125             break;
    126         case 'm':
    127             s_strSlmFile  = strdup(optarg);
    128             break;
    129         default:
    130             ShowUsage();
    131             break;
    132         }
    133     }
    134     if (s_strDictFile == NULL)
    135         ShowUsage();
    136 }
    137 
    138 static void
    139 output_stok(int& nWords)
    140 {
    141     if (s_bShowId) {
    142         if (nWords > 0)
    143             printf(" ");
    144         printf("%d", unsigned(s_iSTOKID));
    145     } else {
    146         fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout);
    147     }
    148     ++nWords;
    149 }
    150 
    151 static void
    152 output(int len, const TWCHAR* p, TSIMWordId idprev, TSIMWordId idcur, int& nWords)
    153 {
    154     static char mbword[1024];
    155     static TWCHAR wcword[1024];
    156 
    157     bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD);
    158     if (s_bTextOut) {
    159         for (int i=0; i < len; ++i, ++p)
    160             wcword[i] = *p;
    161         wcword[len] = 0;
    162         WCSTOMBS(mbword, wcword, sizeof(mbword));
    163         if (bRealGap && idprev == SIM_ID_NOT_WORD)
    164             printf("(%d)", unsigned(idprev));
    165         if (bRealGap && (nWords > 0))
    166             printf(" ");
    167         printf("%s", mbword);
    168         if (s_bShowId && idcur != SIM_ID_NOT_WORD)
    169             printf("(%d)", unsigned(idcur));
    170     } else {
    171         if (bRealGap) {
    172             if (s_bShowId) {
    173                 if (nWords > 0)
    174                     printf(" ");
    175                 printf("%d", unsigned(idcur));
    176             } else
    177                 fwrite(&idcur, sizeof(TSIMWordId), 1, stdout);
    178         }
    179     }
    180     if (bRealGap)
    181         ++nWords;
    182 }
    183 
/**
 * A candidate word placed on the lattice.
 *
 * m_left and m_right are lattice column indices: the word is stored in column
 * m_left and leads to column m_right, so it covers m_right - m_left
 * characters of the sentence. m_wordId is the id handed to the language
 * model for this word.
 */
struct TLatticeWord {
    int m_left;
    int m_right;
    int m_wordId;

    TLatticeWord(int l = 0, int r = 0, int id = 0)
        : m_left(l),
          m_right(r),
          m_wordId(id)
    {
    }
};

/** The words that start at one lattice column. */
typedef std::vector<TLatticeWord> TLatticeWordVec;
    194 
    195 struct TLatticeStateValue {
    196   double                m_pr;
    197   TLatticeWord*         mp_btword;
    198   CThreadSlm::TState    m_btstate;
    199 
    200   TLatticeStateValue(double pr=0.0, TLatticeWord* btword=NULL, CThreadSlm::TState btstate = CThreadSlm::TState())
    201       : m_pr(pr), mp_btword(btword), m_btstate(btstate) { }
    202 };
    203 
    204 typedef std::map<CThreadSlm::TState, TLatticeStateValue> TLatticeColumnStates;
    205 
    206 struct TLatticeColumn {
    207   TLatticeWordVec          m_wordstarting;
    208   TLatticeColumnStates     m_states;
    209 };
    210 
    211 typedef std::vector<TLatticeColumn> CLattice;
    212 
    213 inline void insertLatticeWord(CLattice& lattice, TLatticeWord word)
    214 {
    215     lattice[word.m_left].m_wordstarting.push_back(word);
    216 }
    217 
    218 int
    219 getAmbiLen(const TWCHAR* p, int word_len)
    220 {
    221     const CSIMDict::TState* pstate;
    222 
    223     for (int i=1; (i<word_len) && *(p+i) != WCH_NULL; ++i) {
    224         int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p+i);
    225         if (word_len < i+len)
    226             word_len = i+len;
    227     }
    228 
    229     return word_len;
    230 }
    231 
    232 void fullSegBuildLattice(wstring& sntnc, int left, int len, CLattice& lattice)
    233 {
    234     for (int right=left+len; left < right; ++left) {
    235         bool found = false;
    236 
    237         const TWCHAR* p = sntnc.c_str()+left;
    238         const CSIMDict::TState* pds = s_dict->getRoot();
    239         for (len = 0; left+len < right; ++len) {
    240             if ((pds = s_dict->step(pds, *p++)) == NULL)
    241                 break;
    242             if (pds->word_id != SIM_ID_NOT_WORD) {
    243                 found = true;
    244                 insertLatticeWord(lattice, TLatticeWord(left, left+len+1, pds->word_id));
    245             }
    246         }
    247         if (!found)
    248             insertLatticeWord(lattice, TLatticeWord(left, left+1, SIM_ID_NOT_WORD));
    249     }
    250 }
    251 
    252 /**
    253 * Lattice head should have one state, with its TState using slm's root. its
    254 * pr = 0 and its mp_btword == NULL;
    255 * Lattice tail must contain no word, and it previous node contain only one word
    256 * with its right = left+1, right == tail.
    257 * The lattice should ensure the lattice path existing
    258 */
    259 void buildLattice(wstring &sntnc, CLattice& lattice)
    260 {
    261     lattice.clear();
    262     lattice.resize(sntnc.size()+2);
    263 
    264     unsigned int idcur = SIM_ID_NOT_WORD;
    265     lattice[0].m_states[CThreadSlm::TState()] = TLatticeStateValue(0.0, NULL, CThreadSlm::TState());
    266 
    267     for (int i=0, sz=sntnc.size(); i < sz; ) {
    268         const CSIMDict::TState* pstate;
    269         const TWCHAR* p = sntnc.c_str()+i;
    270         int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p);
    271         if (len <= 0) {
    272             idcur = SIM_ID_NOT_WORD;
    273             len = 1;
    274         } else {
    275             idcur = pstate->word_id;
    276         }
    277         int ambilen = getAmbiLen(p, len);
    278 
    279         if (ambilen <= len) {
    280             insertLatticeWord(lattice, TLatticeWord(i, i+len, idcur));
    281             i += len;
    282         } else {
    283             fullSegBuildLattice(sntnc, i, ambilen, lattice);
    284             i += ambilen;
    285         }
    286     }
    287     lattice[sntnc.size()].m_wordstarting.push_back(TLatticeWord(sntnc.size(), sntnc.size()+1, s_iSTOKID));
    288 }
    289 
    290 void searchBest(CLattice& lattice)
    291 {
    292     for (int i=0, sz=lattice.size(); i < sz; ++i) {
    293         TLatticeColumnStates & states = lattice[i].m_states;
    294         TLatticeColumnStates::iterator itss = states.begin();
    295         TLatticeColumnStates::iterator itse = states.end();
    296         for (; itss != itse; ++itss) {
    297             TLatticeWordVec::iterator itws = lattice[i].m_wordstarting.begin();
    298             TLatticeWordVec::iterator itwe = lattice[i].m_wordstarting.end();
    299             for (; itws != itwe; ++itws) {
    300                 CThreadSlm::TState his = itss->first;
    301                 double pr = itss->second.m_pr;
    302                 pr += s_tslm->transferNegLog(his, itws->m_wordId, his);
    303                 TLatticeColumnStates & rss = lattice[itws->m_right].m_states;
    304                 s_tslm->historify(his);
    305                 TLatticeColumnStates::iterator itn = rss.find(his);
    306                 if (itn == rss.end()) {
    307                     rss[his] = TLatticeStateValue(pr, &(*itws), itss->first);
    308                 } else {
    309                     if (itn->second.m_pr > pr) {
    310                         rss[his] = TLatticeStateValue(pr, &(*itws), itss->first);
    311                     }
    312                 }
    313             }
    314         }
    315     }
    316 }
    317 
    318 void getBestPath(CLattice& lattice, TLatticeWordVec& segResult)
    319 {
    320     TLatticeColumnStates & states = lattice.back().m_states;
    321     TLatticeColumnStates::iterator its = states.begin();
    322 
    323     TLatticeWord* pbtword = its->second.mp_btword;
    324     CThreadSlm::TState btstate = its->second.m_btstate;
    325     its = lattice[pbtword->m_left].m_states.find(btstate);
    326     assert(its != lattice[pbtword->m_left].m_states.end());
    327 
    328     segResult.clear();
    329     while (true) {
    330         pbtword = its->second.mp_btword;
    331         if (pbtword != NULL) {
    332 #ifndef HOST_OS_GNUC_2
    333             segResult.push_back(*pbtword);
    334 #else // HOST_OS_GNUC_2
    335             segResult.insert(segResult.begin(), *pbtword);
    336 #endif // !HOST_OS_GNUC_2
    337             btstate = its->second.m_btstate;
    338             its = lattice[pbtword->m_left].m_states.find(btstate);
    339             assert(its != lattice[pbtword->m_left].m_states.end());
    340         } else {
    341             break;
    342         }
    343     }
    344 #ifndef HOST_OS_GNUC_2
    345     std::reverse(segResult.begin(), segResult.end());
    346 #endif // HOST_OS_GNUC_2
    347 }
    348 
    349 static bool
    350 processSingleFile(FILE* fp, int &nWords, int &nAmbis)
    351 {
    352     nWords = 0;
    353     nAmbis = 0;
    354 
    355     wstring sntnc;
    356     CSIMCharReader *pReader = new CSIMCharReader(fp);
    357     CSIMCharReader::iterator iter = pReader->begin();
    358     TSIMWordId idcur, idprev = s_iSTOKID;
    359 
    360     if (!s_bTextOut)
    361         output_stok(nWords);
    362 
    363     while (true){
    364         if (ReadSentence(sntnc, iter, false) == false)
    365             break;
    366 
    367         CLattice lattice;
    368         buildLattice(sntnc, lattice);
    369         searchBest(lattice);
    370 
    371         TLatticeWordVec segResult;
    372         getBestPath(lattice, segResult);
    373 
    374         for (int i=0, sz=segResult.size(); i < sz; ++i) {
    375             const TWCHAR *p = sntnc.c_str()+segResult[i].m_left;
    376             int len = segResult[i].m_right - segResult[i].m_left;
    377             idcur = segResult[i].m_wordId;
    378 
    379             output(len, p, idprev, idcur, nWords);
    380             idprev = idcur;
    381         }
    382 
    383         if (!s_bTextOut) {
    384             output_stok(nWords);
    385             idprev = s_iSTOKID;
    386         }
    387     }
    388 
    389     fflush(stdout);
    390     return true;
    391 }
    392 
    393 int
    394 main(int argc, char *argv[])
    395 {
    396     int nWords, nAmbis;
    397 
    398     setlocale(LC_ALL, "");
    399     getParameters(argc, argv);
    400     argc -= optind;
    401     argv += optind;
    402 
    403     fprintf(stderr, "Loading lexicon...");
    404     fflush(stderr);
    405     s_dict = new CSIMDict();
    406     s_tslm = new CThreadSlm();
    407     if (!s_dict->parseText(s_strDictFile)) {
    408         fprintf(stderr, "fail to open Lexicon file!\n");
    409         fflush(stderr);
    410         exit(11);
    411     }
    412     if (!s_tslm->load(s_strSlmFile, true)) {
    413         fprintf(stderr, "fail to open slm file!\n");
    414         fflush(stderr);
    415         exit(12);
    416     }
    417     fprintf(stderr, "done");
    418     fflush(stderr);
    419 
    420     if (argc == 0) {
    421         fprintf(stderr, "\nProcessing from stdin...");
    422         fflush(stderr);
    423         processSingleFile(stdin, nWords, nAmbis);
    424         fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis);
    425         fflush(stderr);
    426     } else {
    427         for (int i=0; i < argc; ++i) {
    428             fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr);
    429             FILE *fp = fopen(argv[i], "r");
    430             if (fp != NULL) {
    431                 processSingleFile(fp, nWords, nAmbis);
    432                 fprintf(stderr, "@Offset %u, %d words, %d ambiguious. Done!\n",
    433                                 ftell(fp), nWords, nAmbis);
    434                 fflush(stderr);
    435             } else {
    436                 fprintf(stderr, "Can not Open!!!!!!!\n");
    437                 fflush(stderr);
    438             }
    439             fclose(fp);
    440         }
    441     }
    442 
    443     s_tslm->free();
    444     delete s_tslm;
    445     s_tslm = NULL;
    446     s_dict->close();
    447     delete s_dict;
    448     s_dict = NULL;
    449     return 0;
    450 }
    451