Home | History | Annotate | Download | only in slm
      1    0   yongsun /*
      2   82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3   82   yongsun  *
      4   82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5   82   yongsun  *
      6   82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7   82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8   82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9   82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10   82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11   82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12   82   yongsun  * specific language governing permissions and limitations under the License. When
     13   82   yongsun  * distributing the software, include this License Header Notice in each file and
     14   82   yongsun  * include the full text of the License in the License file as well as the
     15   82   yongsun  * following notice:
     16   82   yongsun  *
     17   82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18   82   yongsun  * (CDDL)
     19   82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20   82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21   82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22   82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23   82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24   82   yongsun  *
     25   82   yongsun  * Contributor(s):
     26   82   yongsun  *
     27   82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28   82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29   82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30   82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31   82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32   82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33   82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34   82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35   82   yongsun  * to such option by the copyright holder.
     36    0   yongsun  */
     37   82   yongsun 
     38    0   yongsun #ifdef HAVE_CONFIG_H
     39    0   yongsun #include "config.h"
     40    0   yongsun #endif
     41    0   yongsun 
     42    0   yongsun #ifdef HAVE_ASSERT_H
     43    0   yongsun #include <assert.h>
     44    0   yongsun #endif
     45    0   yongsun 
     46    0   yongsun #include <stdlib.h>
     47    0   yongsun #include <math.h>
     48    0   yongsun #include <vector>
     49    0   yongsun #include <algorithm>
     50    0   yongsun 
     51    0   yongsun #include "sim_slmbuilder.h"
     52    0   yongsun 
     53    0   yongsun void CSlmGTDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
     54    0   yongsun {
     55    0   yongsun     if (dis != NULL)
     56    0   yongsun         delete [] dis;
     57    0   yongsun     dis = new double[--n];
     58    0   yongsun     if (thres > n) thres = n;
     59    0   yongsun     for (int freq=1; freq < n; ++freq) {
     60    0   yongsun         if (nr[freq] == 0 || nr[freq+1] == 0)
     61    0   yongsun             dis[freq] = 1.0;
     62    0   yongsun         else
     63    0   yongsun             dis[freq] = double(nr[freq+1])/nr[freq];
     64    0   yongsun         printf("%lf ", dis[freq]); fflush(stdout);
     65    0   yongsun     }
     66    0   yongsun }
     67    0   yongsun 
     68    0   yongsun double CSlmGTDiscounter::discount(int freq)
     69    0   yongsun {
     70    0   yongsun     double newfreq = freq * ((freq < thres)?dis[freq]:hd);
     71    0   yongsun     if (newfreq >= double(freq))
     72    0   yongsun         newfreq = freq * hd;
     73    0   yongsun     return newfreq;
     74    0   yongsun }
     75    0   yongsun 
     76    0   yongsun void CSlmAbsoluteDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
     77    0   yongsun {
     78    0   yongsun     // normally, c should not greater than 1.0, yet when cut-off is used, it could be so.
     79    0   yongsun     if (c <= 0.0) {
     80    0   yongsun         c = double(nr[1]) / (nr[1] + 2.0*nr[2]);
     81    0   yongsun         printf("parameter c=%lf", c); fflush(stdout);
     82    0   yongsun     } else {
     83    0   yongsun         printf("Using given parameter c=%lf", c); fflush(stdout);
     84    0   yongsun     }
     85    0   yongsun }
     86    0   yongsun 
     87    0   yongsun double CSlmAbsoluteDiscounter::discount(int freq)
     88    0   yongsun {
     89    0   yongsun     return (freq > 0)?(freq - c):(0.0);
     90    0   yongsun }
     91    0   yongsun 
     92    0   yongsun void CSlmLinearDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr)
     93    0   yongsun {
     94    0   yongsun     if (dis <= 0.0 || dis >= 1.0) {
     95    0   yongsun         dis = 1.0 - double(nr[1])/nr[0];
     96    0   yongsun         printf("parameter d=%lf", dis); fflush(stdout);
     97    0   yongsun     } else {
     98    0   yongsun         printf("Using given parameter d=%lf", dis); fflush(stdout);
     99    0   yongsun     }
    100    0   yongsun }
    101    0   yongsun 
    102    0   yongsun double CSlmLinearDiscounter::discount(int freq)
    103    0   yongsun {
    104    0   yongsun     return freq * dis;
    105    0   yongsun }
    106    0   yongsun 
    107    0   yongsun // n=1 for unigram, n=2 for bigram;
    108    0   yongsun // level[0] is for psuedo 0 gram, ...
    109    0   yongsun void CSlmBuilder::Create(int n)
    110    0   yongsun {
    111    0   yongsun     assert(n != 0);
    112    0   yongsun     nlevel = n;
    113    0   yongsun     level = new void * [n+1];
    114    0   yongsun     for (int i=0; i<n; ++i) {
    115    0   yongsun         level[i] = new std::vector<TNode>;
    116    0   yongsun         if (i) ((TNodeLevel*)level[i])->reserve(1024);
    117    0   yongsun     }
    118    0   yongsun     //Add leaf level
    119    0   yongsun     level[n] = new std::vector<TLeaf>;
    120    0   yongsun     ((TLeafLevel*)level[n])->reserve(1024);
    121    0   yongsun 
    122    0   yongsun     //Add psuedo root node
    123    0   yongsun     ((TNodeLevel*)level[0])->push_back(TNode(0, 0, 0));
    124   87   yongsun 
    125   87   yongsun     //Initialize the nr[n+1][SLM_MAX_R] 2-D array
    126    0   yongsun     nr = new FREQ_TYPE[n+1][SLM_MAX_R];
    127   87   yongsun     for (int lvl=0; lvl<n+1; ++lvl)
    128   87   yongsun         for (int r=0; r<SLM_MAX_R; ++r)
    129   87   yongsun             nr[lvl][r] = 0;
    130    0   yongsun }
    131    0   yongsun 
    132    0   yongsun void CSlmBuilder::SetCut(FREQ_TYPE threshold[])
    133    0   yongsun {
    134    0   yongsun     if (cut != NULL)
    135    0   yongsun         delete []cut;
    136    0   yongsun     cut = new FREQ_TYPE[nlevel+1];
    137    0   yongsun     for (int i=0; i<nlevel; ++i)
    138    0   yongsun         cut[i+1] = threshold[i];
    139    0   yongsun }
    140    0   yongsun 
    141    0   yongsun void CSlmBuilder::SetDiscounter(CSlmDiscounter* dis[])
    142    0   yongsun {
    143    0   yongsun     if (discounter != NULL)
    144    0   yongsun         delete []discounter;
    145    0   yongsun     discounter = new CSlmDiscounter* [nlevel+1];
    146    0   yongsun     for (int i=0; i < nlevel; ++i)
    147    0   yongsun         discounter[i+1] = dis[i];
    148    0   yongsun }
    149    0   yongsun 
    150    0   yongsun void CSlmBuilder::SetBreakerIds(int nId, TSIMWordId brks[])
    151    0   yongsun {
    152    0   yongsun     breaker.clear();
    153    0   yongsun     for (int i=0; i < nId; ++i)
    154    0   yongsun         breaker.push_back(brks[i]);
    155    0   yongsun     std::make_heap(breaker.begin(), breaker.end());
    156    0   yongsun     std::sort_heap(breaker.begin(), breaker.end());
    157    0   yongsun }
    158    0   yongsun 
    159    0   yongsun void CSlmBuilder::SetExcludeIds(int nId, TSIMWordId excludes[])
    160    0   yongsun {
    161    0   yongsun     m_excludes.clear();
    162    0   yongsun     for (int i=0; i < nId; ++i)
    163    0   yongsun         m_excludes.push_back(excludes[i]);
    164    0   yongsun     std::make_heap(m_excludes.begin(), m_excludes.end());
    165    0   yongsun     std::sort_heap(m_excludes.begin(), m_excludes.end());
    166    0   yongsun }
    167    0   yongsun 
    168    0   yongsun bool CSlmBuilder::isBreakId(TSIMWordId id)
    169    0   yongsun {
    170    0   yongsun     return std::binary_search(breaker.begin(), breaker.end(), id);
    171    0   yongsun }
    172    0   yongsun 
    173    0   yongsun bool CSlmBuilder::isExcludeId(TSIMWordId id)
    174    0   yongsun {
    175    0   yongsun     return std::binary_search(m_excludes.begin(), m_excludes.end(), id);
    176    0   yongsun }
    177    0   yongsun 
    178    0   yongsun void CSlmBuilder::AddNGram(TSIMWordId* ngram, FREQ_TYPE fr)
    179    0   yongsun {
    180    0   yongsun     int ch;
    181    0   yongsun     bool brk = isExcludeId(*ngram);
    182    0   yongsun 
    183    0   yongsun     for (int i=1; i < nlevel; ++i) {
    184    0   yongsun         TNodeLevel* pnl = (TNodeLevel*)(level[i]);
    185    0   yongsun         if (pnl->capacity() == pnl->size()) {
    186    0   yongsun             size_t newsz = 2*pnl->capacity();
    187    0   yongsun             if (pnl->capacity() > 1024*1024)
    188    0   yongsun                 newsz = pnl->capacity() + 1024*1024;
    189    0   yongsun             pnl->reserve(newsz);
    190    0   yongsun         }
    191    0   yongsun     }
    192    0   yongsun     TLeafLevel* pll = (TLeafLevel*)(level[nlevel]);
    193    0   yongsun     if (pll->capacity() == pll->size()) {
    194    0   yongsun         size_t newsz = 2*pll->capacity();
    195    0   yongsun         if (pll->capacity() > 1024*1024)
    196    0   yongsun             newsz = pll->capacity() + 1024*1024;
    197    0   yongsun         pll->reserve(newsz);
    198    0   yongsun     }
    199    0   yongsun 
    200    0   yongsun     if (!brk)
    201    0   yongsun         (*(TNodeLevel*)(level[0]))[0].freq += fr;
    202    0   yongsun 
    203    0   yongsun     bool branch = false;
    204    0   yongsun     for (int i=1; (!brk && i < nlevel); ++i) {
    205    0   yongsun         std::vector<TNode> & pv = *(TNodeLevel*)(level[i-1]);
    206    0   yongsun         std::vector<TNode> & v  = *(TNodeLevel*)(level[i]);
    207    0   yongsun         branch = branch || (pv.back().child >= v.size()) || (v.back().id != ngram[i-1]) ;
    208    0   yongsun         if (branch) {
    209    0   yongsun             if (i == nlevel-1)
    210    0   yongsun                 ch = ((TLeafLevel*)(level[i+1]))->size();
    211    0   yongsun             else
    212    0   yongsun                 ch = ((TNodeLevel*)(level[i+1]))->size();
    213    0   yongsun             v.push_back(TNode(ngram[i-1], ch, fr));
    214    0   yongsun         } else {
    215    0   yongsun             v.back().freq += fr;
    216    0   yongsun         }
    217    0   yongsun         brk = (i > 1 && isBreakId(ngram[i-1])) || isExcludeId(ngram[i]);
    218    0   yongsun     }
    219    0   yongsun 
    220    0   yongsun     // Insert to the leaf level
    221   87   yongsun     if (!brk) {
    222   87   yongsun         if (fr > cut[nlevel]) {
    223   87   yongsun             TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
    224   87   yongsun             v.push_back(TLeaf(ngram[nlevel-1], fr));
    225   87   yongsun         } else {
    226   87   yongsun             nr[nlevel][0] += fr;
    227   87   yongsun             nr[nlevel][fr] += fr;
    228   87   yongsun         }
    229    0   yongsun     }
    230    0   yongsun }
    231    0   yongsun 
    232    0   yongsun void CSlmBuilder::CountNr()
    233    0   yongsun {
    234    0   yongsun     printf("\nCounting Nr..."); fflush(stdout);
    235    0   yongsun     for (int lvl=1; lvl<nlevel; ++lvl) {
    236    0   yongsun         TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
    237    0   yongsun         for (TNodeIterator it=v.begin(), ite=v.end(); it != ite; ++it) {
    238    0   yongsun             FREQ_TYPE freq = it->freq;
    239    0   yongsun             nr[lvl][0] += freq;
    240    0   yongsun             if (freq < SLM_MAX_R && freq > 0)
    241    0   yongsun                 nr[lvl][freq] += freq;
    242    0   yongsun         }
    243    0   yongsun     }
    244    0   yongsun     TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
    245    0   yongsun     for (TLeafIterator it=v.begin(), ite=v.end(); it != ite; ++it) {
    246    0   yongsun         FREQ_TYPE freq = it->freq;
    247    0   yongsun         nr[nlevel][0] += freq;
    248    0   yongsun         if (freq < SLM_MAX_R && freq > 0)
    249    0   yongsun             nr[nlevel][freq] += freq;
    250    0   yongsun     }
    251    0   yongsun     printf("\n"); fflush(stdout);
    252    0   yongsun }
    253    0   yongsun 
    254    0   yongsun int
    255    0   yongsun CSlmBuilder::CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast,
    256    0   yongsun                           TLeafIterator chfirst, TLeafIterator chlast, int thred)
    257    0   yongsun {
    258    0   yongsun     int idxfirst, idxchk;
    259    0   yongsun     TLeafIterator chchk = chfirst;
    260    0   yongsun     for (idxfirst=idxchk=0; chchk != chlast; ++chchk, ++idxchk) {
    261    0   yongsun         //do not cut item whoese 1. freq > thred; 2. psuedo tail
    262    0   yongsun         if (chchk->freq > thred || (chchk+1) == chlast) {
    263    0   yongsun             if (idxfirst < idxchk)
    264    0   yongsun                 *chfirst = *chchk;
    265    0   yongsun             for (;pfirst != plast && pfirst->child <= idxchk; ++pfirst)
    266    0   yongsun                 pfirst->child = idxfirst;
    267    0   yongsun             ++idxfirst;
    268    0   yongsun             ++chfirst;
    269    0   yongsun         }
    270    0   yongsun     }
    271    0   yongsun     assert(pfirst == plast);
    272    0   yongsun     return idxfirst;
    273    0   yongsun }
    274    0   yongsun 
    275    0   yongsun int
    276    0   yongsun CSlmBuilder::CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast,
    277    0   yongsun                           TNodeIterator chfirst, TNodeIterator chlast, int thred)
    278    0   yongsun {
    279    0   yongsun     int idxfirst, idxchk;
    280    0   yongsun     TNodeIterator chchk = chfirst;
    281    0   yongsun     for (idxfirst=idxchk=0; chchk != chlast; ++chchk, ++idxchk) {
    282    0   yongsun         //do not cut item whoese 1. freq > thred; 2. psuedo tail; 3. leading children
    283    0   yongsun         TNodeIterator chnext = chchk + 1;
    284    0   yongsun         if (chchk->freq > thred || chnext == chlast || (chnext->child != chchk->child)) {
    285    0   yongsun             if (idxfirst < idxchk)
    286    0   yongsun                 *chfirst = *chchk;
    287    0   yongsun             for (;pfirst != plast && pfirst->child <= idxchk; ++pfirst)
    288    0   yongsun                 pfirst->child = idxfirst;
    289    0   yongsun             ++idxfirst;
    290    0   yongsun             ++chfirst;
    291    0   yongsun         }
    292    0   yongsun     }
    293    0   yongsun     assert(pfirst == plast);
    294    0   yongsun     return idxfirst;
    295    0   yongsun }
    296    0   yongsun 
    297    0   yongsun void CSlmBuilder::Cut()
    298    0   yongsun {
    299    0   yongsun     printf("\nCuting according freq..."); fflush(stdout);
    300    0   yongsun     for (int lvl=nlevel; lvl>0; --lvl) {
    301    0   yongsun         printf("\n    Cut level %d with threshold %d...", lvl, cut[lvl]); fflush(stdout);
    302    0   yongsun         TNodeLevel& parent = *(TNodeLevel*)(level[lvl-1]);
    303    0   yongsun         if (lvl == nlevel) {
    304    0   yongsun             if (cut[lvl] > 0) {
    305    0   yongsun                 TLeafLevel& v = *(TLeafLevel*)(level[lvl]);
    306    0   yongsun                 int newsize = CutLeafLevel(parent.begin(), parent.end(), v.begin(), v.end(), cut[lvl]);
    307    0   yongsun                 v.erase(v.begin()+newsize, v.end());
    308    0   yongsun             }
    309    0   yongsun         } else {
    310    0   yongsun             if (cut[lvl] > 0) {
    311    0   yongsun                 TNodeLevel& v= *(TNodeLevel*)(level[lvl]);
    312    0   yongsun                 int newsize = CutNodeLevel(parent.begin(), parent.end(), v.begin(), v.end(), cut[lvl]);
    313    0   yongsun                 v.erase(v.begin()+newsize, v.end());
    314    0   yongsun             }
    315    0   yongsun         }
    316    0   yongsun     }
    317    0   yongsun     printf("\n"); fflush(stdout);
    318    0   yongsun }
    319    0   yongsun 
    320    0   yongsun void CSlmBuilder::AppendTails()
    321    0   yongsun {
    322    0   yongsun     printf("\nAppending psuedo tail node for each level..."); fflush(stdout);
    323    0   yongsun     for (int lvl=0; lvl<nlevel; ++lvl) {
    324    0   yongsun         int child_size = 0;
    325    0   yongsun         if (lvl == nlevel - 1){
    326    0   yongsun             child_size = ((TLeafLevel*)(level[lvl+1]))->size();
    327    0   yongsun         } else {
    328    0   yongsun             child_size = ((TNodeLevel*)(level[lvl+1]))->size();
    329    0   yongsun         }
    330    0   yongsun         TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
    331    0   yongsun         v.push_back(TNode(0x00FFFFFF, child_size, 1));
    332    0   yongsun     }
    333    0   yongsun     //also make a psuedo tail node for the leaf level
    334    0   yongsun     ((TLeafLevel*)(level[nlevel]))->push_back(TLeaf(0, 1));
    335    0   yongsun     printf("\n"); fflush(stdout);
    336    0   yongsun }
    337    0   yongsun 
    338    0   yongsun template<class TChildLevel>
    339    0   yongsun void DiscountOneLevel(CSlmBuilder::TNodeLevel& v, TChildLevel& ch, CSlmDiscounter* disc, int bUseLogPr)
    340    0   yongsun {
    341    0   yongsun     CSlmBuilder::TNodeIterator it = v.begin();
    342    0   yongsun     CSlmBuilder::TNodeIterator ite= v.begin() + (v.size()-1);
    343    0   yongsun     for (; it != ite; ++it) { //do not calc the psuedo tail item
    344    0   yongsun         CSlmBuilder::TNodeIterator itnext = it + 1;
    345    0   yongsun         double root_freq = it->freq;
    346    0   yongsun         for (int h = it->child, t = itnext->child; h < t; ++h) {
    347    0   yongsun             double pr = disc->discount(ch[h].freq) / root_freq;
    348    0   yongsun             assert(pr > 0.0 && pr < 1.0);
    349    0   yongsun             if (bUseLogPr) {
    350    0   yongsun                 ch[h].pr = CSlmBuilder::PR_TYPE( -log(pr) );
    351    0   yongsun             } else {
    352    0   yongsun                 ch[h].pr = CSlmBuilder::PR_TYPE( pr );
    353    0   yongsun             }
    354    0   yongsun         }
    355    0   yongsun     }
    356    0   yongsun }
    357    0   yongsun 
    358    0   yongsun void CSlmBuilder::Discount()
    359    0   yongsun {
    360    0   yongsun     printf("\nDiscounting...");
    361    0   yongsun     for (int lvl=nlevel; lvl>0; --lvl){
    362    0   yongsun         printf("\n    Initializing level %d's %s discount method: ", lvl, discounter[lvl]->getName());
    363    0   yongsun         discounter[lvl]->init(SLM_MAX_R, nr[lvl]);
    364    0   yongsun     }
    365    0   yongsun     printf("\n");
    366    0   yongsun     for (int lvl = nlevel-1; lvl >= 0; --lvl) {
    367    0   yongsun         printf("\n    Discounting level %d ...", lvl+1); fflush(stdout);
    368    0   yongsun         TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
    369    0   yongsun         if (lvl == nlevel - 1) { //its child is leaf
    370    0   yongsun             TLeafLevel& ch = *(TLeafLevel*)(level[lvl+1]);
    371    0   yongsun             DiscountOneLevel(v, ch, discounter[lvl+1], bUseLogPr);
    372    0   yongsun         } else {
    373    0   yongsun             TNodeLevel& ch = *(TNodeLevel*)(level[lvl+1]);
    374    0   yongsun             DiscountOneLevel(v, ch, discounter[lvl+1], bUseLogPr);
    375    0   yongsun         }
    376    0   yongsun     }
    377    0   yongsun     printf("\n    Giving psuedo root level 0 a distribution...");
    378    0   yongsun     //make the psuedo 0-gram a equal distribution
    379    0   yongsun     TNodeLevel& v0 = *(TNodeLevel*)(level[0]);
    380    0   yongsun     if (bUseLogPr) {
    381    0   yongsun         v0[0].pr = PR_TYPE(-log(double(1.0)/m_nWord));
    382    0   yongsun     } else {
    383    0   yongsun         v0[0].pr = PR_TYPE(double(1.0)/m_nWord);
    384    0   yongsun     }
    385    0   yongsun     printf("\n"); fflush(stdout);
    386    0   yongsun }
    387    0   yongsun 
    388    0   yongsun template<class chIterator>
    389    0   yongsun double CalcNodeBow(CSlmBuilder* builder, int lvl, TSIMWordId words[], chIterator chh, chIterator cht, int bUseLogPr)
    390    0   yongsun {
    391    0   yongsun     if (chh == cht) return 1.0;
    392    0   yongsun     double sumnext = 0.0, sum=0.0;
    393    0   yongsun     for (; chh < cht; ++chh) {
    394    0   yongsun         if (bUseLogPr) {
    395    0   yongsun             sumnext += exp(-(chh->pr));
    396    0   yongsun         } else {
    397    0   yongsun             sumnext += double(chh->pr);
    398    0   yongsun         }
    399    0   yongsun         words[lvl+1] = chh->id;
    400    0   yongsun         sum += builder->getPr(lvl, words+2);
    401    0   yongsun     }
    402    0   yongsun     assert(sumnext > 0.0 && sumnext < 1.05);
    403    0   yongsun     assert(sum < 1.05 && sum > 0.0);
    404    0   yongsun     //
    405    0   yongsun     if (sumnext >= 1.0 || sum >= 1.0) {
    406    0   yongsun         double bow = ((sumnext > sum)?sumnext:sum)+0.0001;
    407    0   yongsun         bow = (bow - sumnext) / (bow - sum);
    408    0   yongsun         printf("\n (sigma(p(w|h)=%lf, sigma(p(w|h')=%lf) bow ==> %lf due to Calculation precision for %d-gram:", sumnext, sum, bow, lvl);
    409    0   yongsun         for (int i=1; i <= lvl; ++i)
    410    0   yongsun             printf("%d ", words[i]);
    411    0   yongsun         return bow;
    412    0   yongsun     }
    413    0   yongsun     return (1.0-sumnext)/(1.0-sum);
    414    0   yongsun }
    415    0   yongsun 
    416    0   yongsun void CSlmBuilder::CalcBOW()
    417    0   yongsun {
    418    0   yongsun     printf("\nCalculating Back-Off Weight...");
    419    0   yongsun     for (int lvl=0; lvl < nlevel; ++lvl) {
    420    0   yongsun         printf("\n    Processing level %d ", lvl); fflush(stdout);
    421  241  tchaikov         TNode* base[16]; //it should be lvl+1, yet some compiler does not support it
    422  241  tchaikov         int idx[16];     //it should be lvl+1, yet some compiler does not support it
    423    0   yongsun         for (int i=0; i <= lvl; ++i) {
    424    0   yongsun             base[i] = &((*(TNodeLevel*)level[i])[0]);
    425    0   yongsun             idx[i] = 0;
    426    0   yongsun         }
    427    0   yongsun         TSIMWordId words[17];  //it should be lvl+2, yet some compiler do not support it
    428    0   yongsun         int sz = ((TNodeLevel*)(level[lvl]))->size()-1;
    429    0   yongsun         printf("(%d items)...", sz+1); fflush(stdout);
    430    0   yongsun         for (; idx[lvl] < sz; ++idx[lvl]) {
    431    0   yongsun             words[lvl] = base[lvl][idx[lvl]].id;
    432    0   yongsun             for (int k=lvl-1; k >= 0; --k) {
    433    0   yongsun                 while (base[k][idx[k]+1].child <= idx[k+1])
    434    0   yongsun                     ++idx[k];
    435    0   yongsun                 words[k] = base[k][idx[k]].id;
    436    0   yongsun             }
    437    0   yongsun             TNode & node = base[lvl][idx[lvl]];
    438    0   yongsun             TNode & nodenext = *((&node)+1);
    439    0   yongsun             double bow;
    440    0   yongsun             if (lvl == nlevel-1) {
    441    0   yongsun                 TLeaf * ch = &((*(TLeafLevel*)level[lvl+1])[0]);
    442    0   yongsun                 bow = CalcNodeBow(this, lvl, words, ch+node.child, ch+nodenext.child, bUseLogPr);
    443    0   yongsun             } else {
    444    0   yongsun                 TNode * ch = &((*(TNodeLevel*)level[lvl+1])[0]);
    445    0   yongsun                 bow = CalcNodeBow(this, lvl, words, ch+node.child, ch+nodenext.child, bUseLogPr);
    446    0   yongsun             }
    447    0   yongsun             if (bUseLogPr) {
    448    0   yongsun                 node.bow = PR_TYPE(-log(bow));
    449    0   yongsun             } else {
    450    0   yongsun                 node.bow = PR_TYPE(bow);
    451    0   yongsun             }
    452    0   yongsun         }
    453    0   yongsun     }
    454    0   yongsun     printf("\n"); fflush(stdout);
    455    0   yongsun }
    456    0   yongsun 
    457    0   yongsun double CSlmBuilder::getPr(int n, TSIMWordId *words)
    458    0   yongsun {
    459    0   yongsun     int lvl;
    460    0   yongsun     double bow = 1.0;
    461    0   yongsun     void* pnode = &((*(TNodeLevel*)level[0])[0]);
    462    0   yongsun 
    463    0   yongsun     assert(n <= nlevel);
    464    0   yongsun 
    465    0   yongsun     if (n==0) {
    466    0   yongsun         if (bUseLogPr) {
    467    0   yongsun             return exp(-((TNode*)pnode)->pr);
    468    0   yongsun         } else {
    469    0   yongsun             return ((TNode*)pnode)->pr;
    470    0   yongsun         }
    471    0   yongsun     }
    472    0   yongsun 
    473    0   yongsun     for (lvl = 0; pnode != NULL && lvl < n; ++lvl) {
    474    0   yongsun         if (bUseLogPr) {
    475    0   yongsun             bow = exp(-((TNode*)pnode)->bow);
    476    0   yongsun         } else {
    477    0   yongsun             bow = ((TNode*)pnode)->bow;
    478    0   yongsun         }
    479    0   yongsun         pnode = FindChild(lvl, (TNode*)pnode, words[lvl]);
    480    0   yongsun     }
    481    0   yongsun 
    482    0   yongsun     if (pnode != NULL) { // find the whole string
    483    0   yongsun         if (bUseLogPr) {
    484    0   yongsun             return exp(-((TLeaf*)pnode)->pr);
    485    0   yongsun         } else {
    486    0   yongsun             return ((TLeaf*)pnode)->pr;
    487    0   yongsun         }
    488    0   yongsun     } else if (lvl == n-1) { // only find the history
    489    0   yongsun         return bow*getPr(n-1, words+1);
    490    0   yongsun     } else { //even not find the history
    491    0   yongsun         return getPr(n-1, words+1);
    492    0   yongsun     }
    493    0   yongsun }
    494    0   yongsun 
    495    0   yongsun void* CSlmBuilder::FindChild(int lvl, TNode* root, TSIMWordId id)
    496    0   yongsun {
    497    0   yongsun     int chh = root->child, cht = (root+1)->child;
    498    0   yongsun     if (lvl == nlevel-1) {
    499    0   yongsun         TLeaf* pleaf = &((*(TLeafLevel*)level[lvl+1])[0]);
    500    0   yongsun         return (void*)binary_find(pleaf, chh, cht, TLeaf(id));
    501    0   yongsun     } else {
    502    0   yongsun         TNode* pnode = &((*(TNodeLevel*)level[lvl+1])[0]);
    503    0   yongsun         return (void*)binary_find(pnode, chh, cht, TNode(id));
    504    0   yongsun     }
    505    0   yongsun }
    506    0   yongsun 
    507    0   yongsun void CSlmBuilder::Build()
    508    0   yongsun {
    509    0   yongsun     CountNr();
    510    0   yongsun     AppendTails();
    511    0   yongsun     Cut();
    512    0   yongsun     Discount();
    513    0   yongsun     CalcBOW();
    514    0   yongsun }
    515    0   yongsun 
    516    0   yongsun void CSlmBuilder::Write(FILE *out)
    517    0   yongsun {
    518    0   yongsun     fwrite(&nlevel, sizeof(nlevel), 1, out);
    519    0   yongsun     fwrite(&bUseLogPr, sizeof(bUseLogPr), 1, out);
    520    0   yongsun     for (int lvl=0; lvl <= nlevel; ++lvl) {
    521    0   yongsun         int sz = 0;
    522    0   yongsun         if (lvl == nlevel)
    523    0   yongsun             sz = ((TLeafLevel*)(level[lvl]))->size();
    524    0   yongsun         else
    525    0   yongsun             sz = ((TNodeLevel*)(level[lvl]))->size();
    526    0   yongsun         fwrite(&sz, sizeof(sz), 1, out);
    527    0   yongsun     }
    528    0   yongsun     for (int lvl=0; lvl < nlevel; ++lvl) {
    529    0   yongsun         TNodeLevel& v = *(TNodeLevel*)(level[lvl]);
    530    0   yongsun         for (TNodeIterator it=v.begin(), ite=v.end(); it != ite; ++it)
    531    0   yongsun             fwrite(&(*it), sizeof(TNode), 1, out);
    532    0   yongsun     }
    533    0   yongsun     TLeafLevel& v = *(TLeafLevel*)(level[nlevel]);
    534    0   yongsun     for (TLeafIterator it=v.begin(), ite=v.end(); it != ite; ++it)
    535    0   yongsun         fwrite(&(*it), sizeof(TLeaf), 1, out);
    536    0   yongsun }
    537    0   yongsun 
    538    0   yongsun void CSlmBuilder::Close(void)
    539    0   yongsun {
    540    0   yongsun     if (level != NULL) {
    541    0   yongsun         for (int lvl=0; lvl <= nlevel; ++lvl) {
    542    0   yongsun             if (lvl == nlevel)
    543    0   yongsun                 delete (TLeafLevel*)(level[lvl]);
    544    0   yongsun             else
    545    0   yongsun                 delete (TNodeLevel*)(level[lvl]);
    546    0   yongsun         }
    547    0   yongsun         delete [] level;
    548    0   yongsun         level = NULL;
    549    0   yongsun     }
    550    0   yongsun     if (cut != NULL) {
    551    0   yongsun         delete [] cut;
    552    0   yongsun         cut = NULL;
    553    0   yongsun     }
    554    0   yongsun     if (discounter != NULL) {
    555    0   yongsun         for (int lvl = 1; lvl <= nlevel; ++lvl) {
    556    0   yongsun             delete discounter[lvl];
    557    0   yongsun         }
    558    0   yongsun         delete [] discounter;
    559    0   yongsun         discounter = NULL;
    560    0   yongsun     }
    561    0   yongsun     if (nr != NULL) {
    562    0   yongsun         delete [] nr;
    563    0   yongsun         nr = NULL;
    564    0   yongsun     }
    565    0   yongsun     breaker.clear();
    566    0   yongsun     m_nWord = 0;
    567    0   yongsun     nlevel = 0;
    568    0   yongsun }
    569    0   yongsun 
    570