1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include "config.h" 40 #endif 41 42 #ifdef HAVE_ASSERT_H 43 #include <assert.h> 44 #endif 45 46 #include <stdlib.h> 47 #include <math.h> 48 #include <vector> 49 #include <algorithm> 50 51 #include "sim_slmbuilder.h" 52 53 void CSlmGTDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) 54 { 55 if (dis != NULL) 56 delete [] dis; 57 dis = new double[--n]; 58 if (thres > n) thres = n; 59 for (int freq=1; freq < n; ++freq) { 60 if (nr[freq] == 0 || nr[freq+1] == 0) 61 dis[freq] = 1.0; 62 else 63 dis[freq] = double(nr[freq+1])/nr[freq]; 64 printf("%lf ", dis[freq]); fflush(stdout); 65 } 66 } 67 68 double CSlmGTDiscounter::discount(int freq) 69 { 70 double newfreq = freq * ((freq < thres)?dis[freq]:hd); 71 if (newfreq >= double(freq)) 72 newfreq = freq * hd; 73 return newfreq; 74 } 75 76 void CSlmAbsoluteDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) 77 { 78 // normally, c should not greater than 1.0, yet when cut-off is used, it could be so. 79 if (c <= 0.0) { 80 c = double(nr[1]) / (nr[1] + 2.0*nr[2]); 81 printf("parameter c=%lf", c); fflush(stdout); 82 } else { 83 printf("Using given parameter c=%lf", c); fflush(stdout); 84 } 85 } 86 87 double CSlmAbsoluteDiscounter::discount(int freq) 88 { 89 return (freq > 0)?(freq - c):(0.0); 90 } 91 92 void CSlmLinearDiscounter::init(int n, CSlmBuilder::FREQ_TYPE *nr) 93 { 94 if (dis <= 0.0 || dis >= 1.0) { 95 dis = 1.0 - double(nr[1])/nr[0]; 96 printf("parameter d=%lf", dis); fflush(stdout); 97 } else { 98 printf("Using given parameter d=%lf", dis); fflush(stdout); 99 } 100 } 101 102 double CSlmLinearDiscounter::discount(int freq) 103 { 104 return freq * dis; 105 } 106 107 // n=1 for unigram, n=2 for bigram; 108 // level[0] is for psuedo 0 gram, ... 109 void CSlmBuilder::Create(int n) 110 { 111 assert(n != 0); 112 nlevel = n; 113 level = new void * [n+1]; 114 for (int i=0; i<n; ++i) { 115 level[i] = new std::vector<TNode>; 116 if (i) ((TNodeLevel*)level[i])->reserve(1024); 117 } 118 //Add leaf level 119 level[n] = new std::vector<TLeaf>; 120 ((TLeafLevel*)level[n])->reserve(1024); 121 122 //Add psuedo root node 123 ((TNodeLevel*)level[0])->push_back(TNode(0, 0, 0)); 124 125 //Initialize the nr[n+1][SLM_MAX_R] 2-D array 126 nr = new FREQ_TYPE[n+1][SLM_MAX_R]; 127 for (int lvl=0; lvl<n+1; ++lvl) 128 for (int r=0; r<SLM_MAX_R; ++r) 129 nr[lvl][r] = 0; 130 } 131 132 void CSlmBuilder::SetCut(FREQ_TYPE threshold[]) 133 { 134 if (cut != NULL) 135 delete []cut; 136 cut = new FREQ_TYPE[nlevel+1]; 137 for (int i=0; i<nlevel; ++i) 138 cut[i+1] = threshold[i]; 139 } 140 141 void CSlmBuilder::SetDiscounter(CSlmDiscounter* dis[]) 142 { 143 if (discounter != NULL) 144 delete []discounter; 145 discounter = new CSlmDiscounter* [nlevel+1]; 146 for (int i=0; i < nlevel; ++i) 147 discounter[i+1] = dis[i]; 148 } 149 150 void CSlmBuilder::SetBreakerIds(int nId, TSIMWordId brks[]) 151 { 152 breaker.clear(); 153 for (int i=0; i < nId; ++i) 154 breaker.push_back(brks[i]); 155 std::make_heap(breaker.begin(), breaker.end()); 156 std::sort_heap(breaker.begin(), breaker.end()); 157 } 158 159 void CSlmBuilder::SetExcludeIds(int nId, TSIMWordId excludes[]) 160 { 161 m_excludes.clear(); 162 for (int i=0; i < nId; ++i) 163 m_excludes.push_back(excludes[i]); 164 std::make_heap(m_excludes.begin(), m_excludes.end()); 165 std::sort_heap(m_excludes.begin(), m_excludes.end()); 166 } 167 168 bool CSlmBuilder::isBreakId(TSIMWordId id) 169 { 170 return std::binary_search(breaker.begin(), breaker.end(), id); 171 } 172 173 bool CSlmBuilder::isExcludeId(TSIMWordId id) 174 { 175 return std::binary_search(m_excludes.begin(), m_excludes.end(), id); 176 } 177 178 void CSlmBuilder::AddNGram(TSIMWordId* ngram, FREQ_TYPE fr) 179 { 180 int ch; 181 bool brk = isExcludeId(*ngram); 182 183 for (int i=1; i < nlevel; ++i) { 184 TNodeLevel* pnl = (TNodeLevel*)(level[i]); 185 if (pnl->capacity() == pnl->size()) { 186 size_t newsz = 2*pnl->capacity(); 187 if (pnl->capacity() > 1024*1024) 188 newsz = pnl->capacity() + 1024*1024; 189 pnl->reserve(newsz); 190 } 191 } 192 TLeafLevel* pll = (TLeafLevel*)(level[nlevel]); 193 if (pll->capacity() == pll->size()) { 194 size_t newsz = 2*pll->capacity(); 195 if (pll->capacity() > 1024*1024) 196 newsz = pll->capacity() + 1024*1024; 197 pll->reserve(newsz); 198 } 199 200 if (!brk) 201 (*(TNodeLevel*)(level[0]))[0].freq += fr; 202 203 bool branch = false; 204 for (int i=1; (!brk && i < nlevel); ++i) { 205 std::vector<TNode> & pv = *(TNodeLevel*)(level[i-1]); 206 std::vector<TNode> & v = *(TNodeLevel*)(level[i]); 207 branch = branch || (pv.back().child >= v.size()) || (v.back().id != ngram[i-1]) ; 208 if (branch) { 209 if (i == nlevel-1) 210 ch = ((TLeafLevel*)(level[i+1]))->size(); 211 else 212 ch = ((TNodeLevel*)(level[i+1]))->size(); 213 v.push_back(TNode(ngram[i-1], ch, fr)); 214 } else { 215 v.back().freq += fr; 216 } 217 brk = (i > 1 && isBreakId(ngram[i-1])) || isExcludeId(ngram[i]); 218 } 219 220 // Insert to the leaf level 221 if (!brk) { 222 if (fr > cut[nlevel]) { 223 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); 224 v.push_back(TLeaf(ngram[nlevel-1], fr)); 225 } else { 226 nr[nlevel][0] += fr; 227 nr[nlevel][fr] += fr; 228 } 229 } 230 } 231 232 void CSlmBuilder::CountNr() 233 { 234 printf("\nCounting Nr..."); fflush(stdout); 235 for (int lvl=1; lvl<nlevel; ++lvl) { 236 TNodeLevel& v = *(TNodeLevel*)(level[lvl]); 237 for (TNodeIterator it=v.begin(), ite=v.end(); it != ite; ++it) { 238 FREQ_TYPE freq = it->freq; 239 nr[lvl][0] += freq; 240 if (freq < SLM_MAX_R && freq > 0) 241 nr[lvl][freq] += freq; 242 } 243 } 244 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); 245 for (TLeafIterator it=v.begin(), ite=v.end(); it != ite; ++it) { 246 FREQ_TYPE freq = it->freq; 247 nr[nlevel][0] += freq; 248 if (freq < SLM_MAX_R && freq > 0) 249 nr[nlevel][freq] += freq; 250 } 251 printf("\n"); fflush(stdout); 252 } 253 254 int 255 CSlmBuilder::CutLeafLevel(TNodeIterator pfirst, TNodeIterator plast, 256 TLeafIterator chfirst, TLeafIterator chlast, int thred) 257 { 258 int idxfirst, idxchk; 259 TLeafIterator chchk = chfirst; 260 for (idxfirst=idxchk=0; chchk != chlast; ++chchk, ++idxchk) { 261 //do not cut item whoese 1. freq > thred; 2. psuedo tail 262 if (chchk->freq > thred || (chchk+1) == chlast) { 263 if (idxfirst < idxchk) 264 *chfirst = *chchk; 265 for (;pfirst != plast && pfirst->child <= idxchk; ++pfirst) 266 pfirst->child = idxfirst; 267 ++idxfirst; 268 ++chfirst; 269 } 270 } 271 assert(pfirst == plast); 272 return idxfirst; 273 } 274 275 int 276 CSlmBuilder::CutNodeLevel(TNodeIterator pfirst, TNodeIterator plast, 277 TNodeIterator chfirst, TNodeIterator chlast, int thred) 278 { 279 int idxfirst, idxchk; 280 TNodeIterator chchk = chfirst; 281 for (idxfirst=idxchk=0; chchk != chlast; ++chchk, ++idxchk) { 282 //do not cut item whoese 1. freq > thred; 2. psuedo tail; 3. leading children 283 TNodeIterator chnext = chchk + 1; 284 if (chchk->freq > thred || chnext == chlast || (chnext->child != chchk->child)) { 285 if (idxfirst < idxchk) 286 *chfirst = *chchk; 287 for (;pfirst != plast && pfirst->child <= idxchk; ++pfirst) 288 pfirst->child = idxfirst; 289 ++idxfirst; 290 ++chfirst; 291 } 292 } 293 assert(pfirst == plast); 294 return idxfirst; 295 } 296 297 void CSlmBuilder::Cut() 298 { 299 printf("\nCuting according freq..."); fflush(stdout); 300 for (int lvl=nlevel; lvl>0; --lvl) { 301 printf("\n Cut level %d with threshold %d...", lvl, cut[lvl]); fflush(stdout); 302 TNodeLevel& parent = *(TNodeLevel*)(level[lvl-1]); 303 if (lvl == nlevel) { 304 if (cut[lvl] > 0) { 305 TLeafLevel& v = *(TLeafLevel*)(level[lvl]); 306 int newsize = CutLeafLevel(parent.begin(), parent.end(), v.begin(), v.end(), cut[lvl]); 307 v.erase(v.begin()+newsize, v.end()); 308 } 309 } else { 310 if (cut[lvl] > 0) { 311 TNodeLevel& v= *(TNodeLevel*)(level[lvl]); 312 int newsize = CutNodeLevel(parent.begin(), parent.end(), v.begin(), v.end(), cut[lvl]); 313 v.erase(v.begin()+newsize, v.end()); 314 } 315 } 316 } 317 printf("\n"); fflush(stdout); 318 } 319 320 void CSlmBuilder::AppendTails() 321 { 322 printf("\nAppending psuedo tail node for each level..."); fflush(stdout); 323 for (int lvl=0; lvl<nlevel; ++lvl) { 324 int child_size = 0; 325 if (lvl == nlevel - 1){ 326 child_size = ((TLeafLevel*)(level[lvl+1]))->size(); 327 } else { 328 child_size = ((TNodeLevel*)(level[lvl+1]))->size(); 329 } 330 TNodeLevel& v = *(TNodeLevel*)(level[lvl]); 331 v.push_back(TNode(0x00FFFFFF, child_size, 1)); 332 } 333 //also make a psuedo tail node for the leaf level 334 ((TLeafLevel*)(level[nlevel]))->push_back(TLeaf(0, 1)); 335 printf("\n"); fflush(stdout); 336 } 337 338 template<class TChildLevel> 339 void DiscountOneLevel(CSlmBuilder::TNodeLevel& v, TChildLevel& ch, CSlmDiscounter* disc, int bUseLogPr) 340 { 341 CSlmBuilder::TNodeIterator it = v.begin(); 342 CSlmBuilder::TNodeIterator ite= v.begin() + (v.size()-1); 343 for (; it != ite; ++it) { //do not calc the psuedo tail item 344 CSlmBuilder::TNodeIterator itnext = it + 1; 345 double root_freq = it->freq; 346 for (int h = it->child, t = itnext->child; h < t; ++h) { 347 double pr = disc->discount(ch[h].freq) / root_freq; 348 assert(pr > 0.0 && pr < 1.0); 349 if (bUseLogPr) { 350 ch[h].pr = CSlmBuilder::PR_TYPE( -log(pr) ); 351 } else { 352 ch[h].pr = CSlmBuilder::PR_TYPE( pr ); 353 } 354 } 355 } 356 } 357 358 void CSlmBuilder::Discount() 359 { 360 printf("\nDiscounting..."); 361 for (int lvl=nlevel; lvl>0; --lvl){ 362 printf("\n Initializing level %d's %s discount method: ", lvl, discounter[lvl]->getName()); 363 discounter[lvl]->init(SLM_MAX_R, nr[lvl]); 364 } 365 printf("\n"); 366 for (int lvl = nlevel-1; lvl >= 0; --lvl) { 367 printf("\n Discounting level %d ...", lvl+1); fflush(stdout); 368 TNodeLevel& v = *(TNodeLevel*)(level[lvl]); 369 if (lvl == nlevel - 1) { //its child is leaf 370 TLeafLevel& ch = *(TLeafLevel*)(level[lvl+1]); 371 DiscountOneLevel(v, ch, discounter[lvl+1], bUseLogPr); 372 } else { 373 TNodeLevel& ch = *(TNodeLevel*)(level[lvl+1]); 374 DiscountOneLevel(v, ch, discounter[lvl+1], bUseLogPr); 375 } 376 } 377 printf("\n Giving psuedo root level 0 a distribution..."); 378 //make the psuedo 0-gram a equal distribution 379 TNodeLevel& v0 = *(TNodeLevel*)(level[0]); 380 if (bUseLogPr) { 381 v0[0].pr = PR_TYPE(-log(double(1.0)/m_nWord)); 382 } else { 383 v0[0].pr = PR_TYPE(double(1.0)/m_nWord); 384 } 385 printf("\n"); fflush(stdout); 386 } 387 388 template<class chIterator> 389 double CalcNodeBow(CSlmBuilder* builder, int lvl, TSIMWordId words[], chIterator chh, chIterator cht, int bUseLogPr) 390 { 391 if (chh == cht) return 1.0; 392 double sumnext = 0.0, sum=0.0; 393 for (; chh < cht; ++chh) { 394 if (bUseLogPr) { 395 sumnext += exp(-(chh->pr)); 396 } else { 397 sumnext += double(chh->pr); 398 } 399 words[lvl+1] = chh->id; 400 sum += builder->getPr(lvl, words+2); 401 } 402 assert(sumnext > 0.0 && sumnext < 1.05); 403 assert(sum < 1.05 && sum > 0.0); 404 // 405 if (sumnext >= 1.0 || sum >= 1.0) { 406 double bow = ((sumnext > sum)?sumnext:sum)+0.0001; 407 bow = (bow - sumnext) / (bow - sum); 408 printf("\n (sigma(p(w|h)=%lf, sigma(p(w|h')=%lf) bow ==> %lf due to Calculation precision for %d-gram:", sumnext, sum, bow, lvl); 409 for (int i=1; i <= lvl; ++i) 410 printf("%d ", words[i]); 411 return bow; 412 } 413 return (1.0-sumnext)/(1.0-sum); 414 } 415 416 void CSlmBuilder::CalcBOW() 417 { 418 printf("\nCalculating Back-Off Weight..."); 419 for (int lvl=0; lvl < nlevel; ++lvl) { 420 printf("\n Processing level %d ", lvl); fflush(stdout); 421 TNode* base[16]; //it should be lvl+1, yet some compiler does not support it 422 int idx[16]; //it should be lvl+1, yet some compiler does not support it 423 for (int i=0; i <= lvl; ++i) { 424 base[i] = &((*(TNodeLevel*)level[i])[0]); 425 idx[i] = 0; 426 } 427 TSIMWordId words[17]; //it should be lvl+2, yet some compiler do not support it 428 int sz = ((TNodeLevel*)(level[lvl]))->size()-1; 429 printf("(%d items)...", sz+1); fflush(stdout); 430 for (; idx[lvl] < sz; ++idx[lvl]) { 431 words[lvl] = base[lvl][idx[lvl]].id; 432 for (int k=lvl-1; k >= 0; --k) { 433 while (base[k][idx[k]+1].child <= idx[k+1]) 434 ++idx[k]; 435 words[k] = base[k][idx[k]].id; 436 } 437 TNode & node = base[lvl][idx[lvl]]; 438 TNode & nodenext = *((&node)+1); 439 double bow; 440 if (lvl == nlevel-1) { 441 TLeaf * ch = &((*(TLeafLevel*)level[lvl+1])[0]); 442 bow = CalcNodeBow(this, lvl, words, ch+node.child, ch+nodenext.child, bUseLogPr); 443 } else { 444 TNode * ch = &((*(TNodeLevel*)level[lvl+1])[0]); 445 bow = CalcNodeBow(this, lvl, words, ch+node.child, ch+nodenext.child, bUseLogPr); 446 } 447 if (bUseLogPr) { 448 node.bow = PR_TYPE(-log(bow)); 449 } else { 450 node.bow = PR_TYPE(bow); 451 } 452 } 453 } 454 printf("\n"); fflush(stdout); 455 } 456 457 double CSlmBuilder::getPr(int n, TSIMWordId *words) 458 { 459 int lvl; 460 double bow = 1.0; 461 void* pnode = &((*(TNodeLevel*)level[0])[0]); 462 463 assert(n <= nlevel); 464 465 if (n==0) { 466 if (bUseLogPr) { 467 return exp(-((TNode*)pnode)->pr); 468 } else { 469 return ((TNode*)pnode)->pr; 470 } 471 } 472 473 for (lvl = 0; pnode != NULL && lvl < n; ++lvl) { 474 if (bUseLogPr) { 475 bow = exp(-((TNode*)pnode)->bow); 476 } else { 477 bow = ((TNode*)pnode)->bow; 478 } 479 pnode = FindChild(lvl, (TNode*)pnode, words[lvl]); 480 } 481 482 if (pnode != NULL) { // find the whole string 483 if (bUseLogPr) { 484 return exp(-((TLeaf*)pnode)->pr); 485 } else { 486 return ((TLeaf*)pnode)->pr; 487 } 488 } else if (lvl == n-1) { // only find the history 489 return bow*getPr(n-1, words+1); 490 } else { //even not find the history 491 return getPr(n-1, words+1); 492 } 493 } 494 495 void* CSlmBuilder::FindChild(int lvl, TNode* root, TSIMWordId id) 496 { 497 int chh = root->child, cht = (root+1)->child; 498 if (lvl == nlevel-1) { 499 TLeaf* pleaf = &((*(TLeafLevel*)level[lvl+1])[0]); 500 return (void*)binary_find(pleaf, chh, cht, TLeaf(id)); 501 } else { 502 TNode* pnode = &((*(TNodeLevel*)level[lvl+1])[0]); 503 return (void*)binary_find(pnode, chh, cht, TNode(id)); 504 } 505 } 506 507 void CSlmBuilder::Build() 508 { 509 CountNr(); 510 AppendTails(); 511 Cut(); 512 Discount(); 513 CalcBOW(); 514 } 515 516 void CSlmBuilder::Write(FILE *out) 517 { 518 fwrite(&nlevel, sizeof(nlevel), 1, out); 519 fwrite(&bUseLogPr, sizeof(bUseLogPr), 1, out); 520 for (int lvl=0; lvl <= nlevel; ++lvl) { 521 int sz = 0; 522 if (lvl == nlevel) 523 sz = ((TLeafLevel*)(level[lvl]))->size(); 524 else 525 sz = ((TNodeLevel*)(level[lvl]))->size(); 526 fwrite(&sz, sizeof(sz), 1, out); 527 } 528 for (int lvl=0; lvl < nlevel; ++lvl) { 529 TNodeLevel& v = *(TNodeLevel*)(level[lvl]); 530 for (TNodeIterator it=v.begin(), ite=v.end(); it != ite; ++it) 531 fwrite(&(*it), sizeof(TNode), 1, out); 532 } 533 TLeafLevel& v = *(TLeafLevel*)(level[nlevel]); 534 for (TLeafIterator it=v.begin(), ite=v.end(); it != ite; ++it) 535 fwrite(&(*it), sizeof(TLeaf), 1, out); 536 } 537 538 void CSlmBuilder::Close(void) 539 { 540 if (level != NULL) { 541 for (int lvl=0; lvl <= nlevel; ++lvl) { 542 if (lvl == nlevel) 543 delete (TLeafLevel*)(level[lvl]); 544 else 545 delete (TNodeLevel*)(level[lvl]); 546 } 547 delete [] level; 548 level = NULL; 549 } 550 if (cut != NULL) { 551 delete [] cut; 552 cut = NULL; 553 } 554 if (discounter != NULL) { 555 for (int lvl = 1; lvl <= nlevel; ++lvl) { 556 delete discounter[lvl]; 557 } 558 delete [] discounter; 559 discounter = NULL; 560 } 561 if (nr != NULL) { 562 delete [] nr; 563 nr = NULL; 564 } 565 breaker.clear(); 566 m_nWord = 0; 567 nlevel = 0; 568 } 569 570