1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include "config.h" 40 #endif 41 42 #ifdef HAVE_ASSERT_H 43 #include <assert.h> 44 #endif 45 46 #include <stdio.h> 47 #include <unistd.h> 48 #include <stdlib.h> 49 50 #include <vector> 51 #include <map> 52 #include <math.h> 53 54 #include "../sim_slm.h" 55 #include "../slm.h" 56 57 #include "ValueCompress.h" 58 59 class CSIMSlmWithIteration : public CSIMSlm{ 60 public: 61 struct TLevelIterator { 62 std::vector<int> m_history; 63 }; 64 65 public: 66 int 67 getLevelSize(int lvl) { return sz[lvl]; } 68 69 int 70 getN() { return N; } 71 72 void 73 getIdString(TLevelIterator& it, std::vector<TSIMWordId>& history); 74 75 void 76 beginLevelIteration(int lvl, TLevelIterator& it); 77 78 void 79 next(TLevelIterator& it); 80 81 bool 82 isEnd(TLevelIterator& it); 83 84 TLeaf* 85 getNodePtr(TLevelIterator& it); 86 87 int 88 findState(int n, TSIMWordId*hw); 89 90 void 91 findBackOffState(int n, TSIMWordId*hw, unsigned & bol, unsigned& bon); 92 93 94 protected: 95 void 96 adjustIterator(TLevelIterator& it); 97 }; 98 99 int 100 CSIMSlmWithIteration::findState(int n, TSIMWordId*hw) 101 { 102 if (n == 0) return 0; 103 104 int m = -1; 105 while (n > N) { --n; ++hw; } 106 107 void* pstate = ((TNode*)level[0]); 108 for (int lvl=0; lvl < n && pstate != NULL; ++lvl) { 109 int h = ((TNode*)pstate)->child; 110 int t = (((TNode*)pstate)+1)->child; 111 if (lvl == N-1) { 112 TLeaf* p = (TLeaf*)level[lvl+1]; 113 pstate = (void*)binary_find_id(p+h, p+t, hw[lvl]); 114 m = (pstate != NULL)?(((TLeaf*)pstate) - p):(-1); 115 } else { 116 TNode* p = (TNode*)level[lvl+1]; 117 pstate = (void*)binary_find_id(p+h, p+t, hw[lvl]); 118 m = (pstate != NULL)?(((TNode*)pstate) - p):(-1); 119 } 120 } 121 return m; 122 } 123 124 void 125 CSIMSlmWithIteration::findBackOffState(int n, TSIMWordId*hw, unsigned & bol, unsigned& bon) 126 { 127 while (n > 1) { 128 --n; ++hw; 129 int idx = findState(n, hw); 130 if (idx >= 0 && ((TNode*)(level[n]))[idx].child < ((TNode*)(level[n]))[idx+1].child) { 131 bol = n; bon = idx; return; 132 } 133 } 134 bol = bon = 0; 135 return; 136 } 137 138 void 139 CSIMSlmWithIteration::getIdString(TLevelIterator& it, std::vector<TSIMWordId>& history) 140 { 141 history.clear(); 142 for (int i=1, tmp_sz=it.m_history.size(); i < tmp_sz; ++i) { 143 int idx = it.m_history[i]; 144 if (i == N) 145 history.push_back(((TLeaf*)(level[i]))[idx].id); 146 else 147 history.push_back(((TNode*)(level[i]))[idx].id); 148 } 149 } 150 151 void 152 CSIMSlmWithIteration::beginLevelIteration(int lvl, TLevelIterator& it) 153 { 154 it.m_history.clear(); 155 for (int i=0, tmp_sz=lvl; i <= tmp_sz; ++i) 156 it.m_history.push_back(0); 157 adjustIterator(it); 158 } 159 160 void 161 CSIMSlmWithIteration::next(TLevelIterator& it) 162 { 163 ++(it.m_history.back()); 164 adjustIterator(it); 165 } 166 167 bool 168 CSIMSlmWithIteration::isEnd(TLevelIterator& it) 169 { 170 return ((it.m_history.back()+1 >= sz[it.m_history.size()-1])); 171 } 172 173 void 174 CSIMSlmWithIteration::adjustIterator(TLevelIterator& it) 175 { 176 int ch = it.m_history.back(); 177 for (int i= it.m_history.size()-2; i >= 0; --i) { 178 int len = sz[i]; 179 int& parent = it.m_history[i]; 180 TNode* pn = (TNode*)(level[i]); 181 while (parent < len && pn[parent+1].child <= ch) 182 ++parent; 183 ch = parent; 184 } 185 } 186 187 CSIMSlm::TLeaf* 188 CSIMSlmWithIteration::getNodePtr(TLevelIterator& it) 189 { 190 int lvl = it.m_history.size()-1; 191 int idx = it.m_history.back(); 192 if (lvl == N) 193 return (((TLeaf*)(level[lvl]))+idx); 194 else 195 return (((TNode*)(level[lvl]))+idx); 196 } 197 198 199 200 void ShowUsage() 201 { 202 printf("Usage:\n"); 203 printf(" slmthread primitive_slm threaded_slm\n"); 204 printf("\nDescription:\n"); 205 printf(" slmthread add back-off-state for each slm node in the primitive_slm. "); 206 printf("Also it compresses 32-bit float into 16 bit representation.\n\n"); 207 exit(100); 208 } 209 210 FILE* fp = NULL; 211 CThreadSlm::TNode* levels[16]; 212 CThreadSlm::TLeaf* lastLevel; 213 214 int main(int argc, char* argv[]) 215 { 216 CValueCompressor vc; 217 unsigned int bol, bon; 218 CSIMSlmWithIteration slm; 219 std::vector<TSIMWordId> history; 220 float real_pr, eff_pr, real_bow, eff_bow; 221 222 std::map<float, float> pr_eff, bow_eff; // effval --> val 223 std::map<float, int> pr_values, bow_values; // effval --> freq 224 std::map<float, int> pr_map, bow_map; // result: val --> int 225 std::vector<float> pr_table, bow_table; // result: val vector 226 std::vector<float>::iterator itt, itte; 227 228 if (argc != 3) 229 ShowUsage(); 230 231 printf("Loading original slm..."); fflush(stdout); 232 if (slm.Load(argv[1]) == false) 233 ShowUsage(); 234 235 bool usingLogPr = slm.isUseLogPr(); 236 237 #define EffectivePr(a) (float((usingLogPr)?((a)/log(2.0)):(-log2((a))))) 238 #define OriginalPr(b) (float((usingLogPr)?((b)*log(2.0)):(exp2(-(b))))) 239 #define EffectiveBow(a) (float((usingLogPr)?(exp(-(a))):((a)))) 240 #define OriginalBow(b) (float((usingLogPr)?(-log((b))):((b)))) 241 242 printf("\nfirst pass..."); fflush(stdout); 243 for (int lvl=0; lvl <= slm.getN(); ++lvl) { 244 CSIMSlmWithIteration::TLevelIterator it; 245 slm.beginLevelIteration(lvl, it); 246 for (; !slm.isEnd(it); slm.next(it)) { 247 CSIMSlm::TLeaf* pl = slm.getNodePtr(it); 248 real_pr = pl->pr; 249 eff_pr = EffectivePr(real_pr); 250 if (pr_eff.find(eff_pr) == pr_eff.end()) { 251 pr_eff[eff_pr] = real_pr; 252 } else { // precision error cause non 1:1 mapping 253 pr_eff[eff_pr] = OriginalPr(eff_pr); 254 } 255 ++(pr_values[eff_pr]); 256 if (lvl < slm.getN()) { 257 real_bow = ((CSIMSlm::TNode*)pl)->bow; 258 eff_bow = EffectiveBow(real_bow); 259 if (bow_eff.find(eff_bow) == bow_eff.end()) { 260 bow_eff[eff_bow] = real_bow; 261 } else { // two values map to same distance value due to precision error 262 bow_eff[eff_bow] = OriginalBow(eff_bow); 263 } 264 ++(bow_values[eff_bow]); 265 } 266 } 267 } 268 269 // Following pr value should not be grouped, or as milestone values. 270 static float msprs[] = { 271 0.9, 0.8, 0.7, 0.6, 272 1.0/2, 1.0/4, 1.0/8, 1.0/16, 1.0/32, 1.0/64, 1.0/128, 273 1.0/256, 1.0/512, 1.0/1024, 1.0/2048, 1.0/4096, 1.0/8192, 274 1.0/16384, 1.0/32768, 1.0/65536 275 }; 276 277 for (unsigned i=0, sz=sizeof(msprs)/sizeof(float); i < sz; ++i) { 278 float real_pr = (usingLogPr)?(-log(msprs[i])):(msprs[i]); 279 float eff_pr = EffectivePr(real_pr); 280 if (pr_eff.find(eff_pr) == pr_eff.end()) { 281 pr_eff[eff_pr] = real_pr; 282 } else { // precision error cause non 1:1 mapping 283 pr_eff[eff_pr] = OriginalPr(eff_pr); 284 } 285 pr_values[eff_pr] = 0; 286 } 287 288 // Following bow value should not be grouped, or as milestone values. 289 static float msbows[] = { 290 1.0, 0.9, 0.8, 0.7, 0.6, 0.5, 0.4, 0.3, 0.2, 291 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001, 292 0.00005, 0.00001, 0.000005, 0.000001, 0.0000005, 0.0000001 293 }; 294 295 for (unsigned i=0, sz=sizeof(msbows)/sizeof(float); i < sz; ++i) { 296 float real_bow = (usingLogPr)?(-log(msbows[i])):(msbows[i]); 297 float eff_bow = EffectiveBow(real_bow); 298 if (bow_eff.find(eff_bow) == bow_eff.end()) { 299 bow_eff[eff_bow] = real_bow; 300 } else { // two values map to same distance value due to precision error 301 bow_eff[eff_bow] = OriginalBow(eff_bow); 302 } 303 bow_values[eff_bow] = 0; 304 } 305 306 printf("\nCompressing pr values..."); fflush(stdout); 307 vc(pr_eff, pr_values, pr_map, pr_table, (1 << CThreadSlm::BITS_PR)); 308 pr_values.clear(); 309 itte = pr_table.end(); 310 for (itt = pr_table.begin(); itt != itte; ++itt) { 311 *itt = OriginalPr(*itt); 312 assert(usingLogPr || (*itt > 0.0 && *itt < 1.0)); 313 assert(!usingLogPr || *itt > 0.0); 314 } 315 printf("%d float values ==> %d values", pr_eff.size(), pr_table.size()); 316 317 printf("\nCompressing bow values..."); fflush(stdout); 318 vc(bow_eff, bow_values, bow_map, bow_table, (1 << CThreadSlm::BITS_BOW)); 319 bow_values.clear(); 320 itte = bow_table.end(); 321 for (itt = bow_table.begin(); itt != itte; ++itt) 322 *itt = OriginalBow(*itt); 323 printf("%d float values ==> %d values", bow_eff.size(), bow_table.size()); 324 325 326 printf("\nThreading the new model..."); fflush(stdout); 327 for (int lvl=0; lvl < slm.getN(); ++lvl) { 328 levels[lvl] = new CThreadSlm::TNode[slm.getLevelSize(lvl)]; 329 330 CSIMSlmWithIteration::TLevelIterator it; 331 slm.beginLevelIteration(lvl, it); 332 for (; !slm.isEnd(it); slm.next(it)) { 333 slm.getIdString(it, history); 334 if (history.size() == 0) { 335 slm.findBackOffState(lvl, NULL, bol, bon); 336 } else { 337 slm.findBackOffState(lvl, &history[0], bol, bon); 338 } 339 340 CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it); 341 CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()]; 342 343 std::map<float, int>::iterator prit = pr_map.find(pn->pr); 344 if (prit == pr_map.end()) { // This would be cause by precision error 345 double val = EffectivePr(pn->pr); 346 val = OriginalPr(val); 347 prit = pr_map.find(val); 348 assert(prit != pr_map.end()); 349 } 350 int idx_pr = prit->second; 351 nn.set_pr(idx_pr); 352 353 nn.set_wid(pn->id); 354 nn.set_bon(bon); 355 nn.set_bol(bol); 356 357 std::map<float, int>::iterator bowit = bow_map.find(pn->bow); 358 if (bowit == bow_map.end()) { // precision error 359 double val = EffectiveBow(pn->bow); 360 val = OriginalBow(val); 361 bowit = bow_map.find(val); 362 assert(bowit != bow_map.end()); 363 } 364 int idx_bow = bowit->second; 365 nn.set_bow(idx_bow); 366 367 nn.set_ch(pn->child); 368 369 assert(usingLogPr || (pr_table[idx_pr] > 0.0 && pr_table[idx_pr] < 1.0)); 370 assert(!usingLogPr || pr_table[idx_pr] > 0.0); 371 } 372 CSIMSlm::TNode* pn = (CSIMSlm::TNode*)slm.getNodePtr(it); 373 CThreadSlm::TNode& nn = levels[lvl][it.m_history.back()]; 374 nn.set_ch(pn->child); 375 }; 376 377 378 lastLevel = new CThreadSlm::TLeaf [slm.getLevelSize(slm.getN())]; 379 CSIMSlmWithIteration::TLevelIterator it; 380 slm.beginLevelIteration(slm.getN(), it); 381 for (int lvl=slm.getN(); !slm.isEnd(it); slm.next(it)) { 382 CSIMSlm::TLeaf* pn = slm.getNodePtr(it); 383 slm.getIdString(it, history); 384 slm.findBackOffState(lvl, &history[0], bol, bon); 385 386 CThreadSlm::TLeaf& nn = lastLevel[it.m_history.back()]; 387 388 std::map<float, int>::iterator prit = pr_map.find(pn->pr); 389 if (prit == pr_map.end()) { // This would be cause by precision error 390 double val = EffectivePr(pn->pr); 391 val = OriginalPr(val); 392 prit = pr_map.find(val); 393 assert(prit != pr_map.end()); 394 } 395 int idx_pr = prit->second; 396 nn.set_pr(idx_pr); 397 398 nn.set_wid(pn->id); 399 nn.set_bon(bon); 400 nn.set_bol(bol); 401 } 402 403 404 printf("\nWriting out..."); fflush(stdout); 405 406 float dummy = 0.0; 407 fp = fopen(argv[2], "wb"); 408 int N = slm.getN(); 409 fwrite(&N, sizeof(int), 1, fp); 410 unsigned ulp = slm.isUseLogPr(); 411 fwrite(&ulp, sizeof(unsigned), 1, fp); 412 413 for (int lvl = 0; lvl <= N; ++lvl) { 414 int len = slm.getLevelSize(lvl); 415 fwrite(&len, sizeof(int), 1, fp); 416 } 417 fwrite(&pr_table[0], sizeof(float), pr_table.size(), fp); 418 for (int i = pr_table.size(), sz=(1 << CThreadSlm::BITS_PR); i < sz; ++i) 419 fwrite(&dummy, sizeof(float), 1, fp); 420 421 fwrite(&bow_table[0], sizeof(float), bow_table.size(), fp); 422 for (int i = bow_table.size(), sz=(1 << CThreadSlm::BITS_BOW); i < sz; ++i) 423 fwrite(&dummy, sizeof(float), 1, fp); 424 425 for (int lvl=0; lvl < N; ++lvl) 426 fwrite(levels[lvl], sizeof(CThreadSlm::TNode), slm.getLevelSize(lvl), fp); 427 fwrite(lastLevel, sizeof(CThreadSlm::TLeaf), slm.getLevelSize(N), fp); 428 fclose(fp); 429 430 printf("done!\n"); fflush(stdout); 431 432 delete [] lastLevel; 433 for (int lvl=0; lvl < N; ++lvl) 434 delete []levels[lvl]; 435 436 bow_values.clear(); 437 bow_map.clear(); 438 bow_table.clear(); 439 440 pr_values.clear(); 441 pr_map.clear(); 442 pr_table.clear(); 443 444 slm.Free(); 445 446 447 return 0; 448 } 449