1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include "config.h" 40 #endif 41 42 #ifdef HAVE_ASSERT_H 43 #include <assert.h> 44 #endif 45 46 #ifdef HAVE_GETOPT_H 47 #include <getopt.h> 48 #endif 49 50 #include <stdio.h> 51 #include <unistd.h> 52 #include <locale.h> 53 54 #include <vector> 55 #include <map> 56 #include <algorithm> 57 58 #include "../sim_dict.h" 59 #include "../sim_sen.h" 60 #include "../slm.h" 61 62 static struct option long_options[] = 63 { 64 {"dict", 1, 0, 'd'}, 65 {"format", 1, 0, 'f'}, 66 {"show-id", 0, 0, 'i'}, 67 {"s-tok", 1, 0, 's'}, 68 {"model", 1, 0, 'm'}, 69 {0, 0, 0, 0} 70 }; 71 72 static char* s_strDictFile = NULL; 73 static char* s_strSlmFile = NULL; 74 static bool s_bTextOut = false; 75 static bool s_bShowId = false; 76 static TSIMWordId s_iSTOKID = 10; 77 78 static CSIMDict *s_dict = NULL; 79 static CThreadSlm *s_tslm = NULL; 80 81 static void 82 ShowUsage() 83 { 84 fprintf(stderr, "\nUsage:\n"); 85 fprintf(stderr, "slmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-m lm_file]\n\n"); 86 fprintf(stderr, " -f --format:\n"); 87 fprintf(stderr, " Output Format, can be 'text' or 'bin'. default 'bin'\n"); 88 fprintf(stderr, " Normally, in text mode, word text are output, while in binary mode,\n"); 89 fprintf(stderr, " binary short integer of the word-ids are writed to stdout.\n"); 90 fprintf(stderr, " -s --stok:\n"); 91 fprintf(stderr, " Sentence token id. Default 10.\n"); 92 fprintf(stderr, " It will be write to output in binary mode after every sentence.\n"); 93 fprintf(stderr, " -i --show-id:\n"); 94 fprintf(stderr, " Show Id info. Under text output format mode, Attach id after known-words.\n"); 95 fprintf(stderr, " Under binary mode, print id in text.\n"); 96 fprintf(stderr, " -m --model:\n"); 97 fprintf(stderr, " Language model file name"); 98 fprintf(stderr, "\n"); 99 fprintf(stderr, "Notes:\n"); 100 fprintf(stderr, " Under binary mode, consecutive id of 0 are merged into one 0.\n"); 101 fprintf(stderr, " Under text mode, no space are insert between unknown-words. \n"); 102 fprintf(stderr, "\n"); 103 fprintf(stderr, "\n"); 104 exit(1000); 105 } 106 107 static void 108 getParameters(int argc, char* argv[]) 109 { 110 int c; 111 while ((c=getopt_long(argc, argv, "d:if:s:m:", long_options, NULL)) != -1) 112 { 113 switch (c) { 114 case 'd': 115 s_strDictFile = strdup(optarg); 116 break; 117 case 'i': 118 s_bShowId = true; 119 break; 120 case 'f': 121 s_bTextOut = (strcmp(optarg, "text") == 0); 122 break; 123 case 's': 124 s_iSTOKID = atoi(optarg); 125 break; 126 case 'm': 127 s_strSlmFile = strdup(optarg); 128 break; 129 default: 130 ShowUsage(); 131 break; 132 } 133 } 134 if (s_strDictFile == NULL) 135 ShowUsage(); 136 } 137 138 static void 139 output_stok(int& nWords) 140 { 141 if (s_bShowId) { 142 if (nWords > 0) 143 printf(" "); 144 printf("%d", unsigned(s_iSTOKID)); 145 } else { 146 fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout); 147 } 148 ++nWords; 149 } 150 151 static void 152 output(int len, const TWCHAR* p, TSIMWordId idprev, TSIMWordId idcur, int& nWords) 153 { 154 static char mbword[1024]; 155 static TWCHAR wcword[1024]; 156 157 bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD); 158 if (s_bTextOut) { 159 for (int i=0; i < len; ++i, ++p) 160 wcword[i] = *p; 161 wcword[len] = 0; 162 WCSTOMBS(mbword, wcword, sizeof(mbword)); 163 if (bRealGap && idprev == SIM_ID_NOT_WORD) 164 printf("(%d)", unsigned(idprev)); 165 if (bRealGap && (nWords > 0)) 166 printf(" "); 167 printf("%s", mbword); 168 if (s_bShowId && idcur != SIM_ID_NOT_WORD) 169 printf("(%d)", unsigned(idcur)); 170 } else { 171 if (bRealGap) { 172 if (s_bShowId) { 173 if (nWords > 0) 174 printf(" "); 175 printf("%d", unsigned(idcur)); 176 } else 177 fwrite(&idcur, sizeof(TSIMWordId), 1, stdout); 178 } 179 } 180 if (bRealGap) 181 ++nWords; 182 } 183 184 struct TLatticeWord { 185 int m_left; 186 int m_right; 187 int m_wordId; 188 189 TLatticeWord(int left=0, int right=0, int wid=0) 190 : m_left(left), m_right(right), m_wordId(wid) { } 191 }; 192 193 typedef std::vector<TLatticeWord> TLatticeWordVec; 194 195 struct TLatticeStateValue { 196 double m_pr; 197 TLatticeWord* mp_btword; 198 CThreadSlm::TState m_btstate; 199 200 TLatticeStateValue(double pr=0.0, TLatticeWord* btword=NULL, CThreadSlm::TState btstate = CThreadSlm::TState()) 201 : m_pr(pr), mp_btword(btword), m_btstate(btstate) { } 202 }; 203 204 typedef std::map<CThreadSlm::TState, TLatticeStateValue> TLatticeColumnStates; 205 206 struct TLatticeColumn { 207 TLatticeWordVec m_wordstarting; 208 TLatticeColumnStates m_states; 209 }; 210 211 typedef std::vector<TLatticeColumn> CLattice; 212 213 inline void insertLatticeWord(CLattice& lattice, TLatticeWord word) 214 { 215 lattice[word.m_left].m_wordstarting.push_back(word); 216 } 217 218 int 219 getAmbiLen(const TWCHAR* p, int word_len) 220 { 221 const CSIMDict::TState* pstate; 222 223 for (int i=1; (i<word_len) && *(p+i) != WCH_NULL; ++i) { 224 int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p+i); 225 if (word_len < i+len) 226 word_len = i+len; 227 } 228 229 return word_len; 230 } 231 232 void fullSegBuildLattice(wstring& sntnc, int left, int len, CLattice& lattice) 233 { 234 for (int right=left+len; left < right; ++left) { 235 bool found = false; 236 237 const TWCHAR* p = sntnc.c_str()+left; 238 const CSIMDict::TState* pds = s_dict->getRoot(); 239 for (len = 0; left+len < right; ++len) { 240 if ((pds = s_dict->step(pds, *p++)) == NULL) 241 break; 242 if (pds->word_id != SIM_ID_NOT_WORD) { 243 found = true; 244 insertLatticeWord(lattice, TLatticeWord(left, left+len+1, pds->word_id)); 245 } 246 } 247 if (!found) 248 insertLatticeWord(lattice, TLatticeWord(left, left+1, SIM_ID_NOT_WORD)); 249 } 250 } 251 252 /** 253 * Lattice head should have one state, with its TState using slm's root. its 254 * pr = 0 and its mp_btword == NULL; 255 * Lattice tail must contain no word, and it previous node contain only one word 256 * with its right = left+1, right == tail. 257 * The lattice should ensure the lattice path existing 258 */ 259 void buildLattice(wstring &sntnc, CLattice& lattice) 260 { 261 lattice.clear(); 262 lattice.resize(sntnc.size()+2); 263 264 unsigned int idcur = SIM_ID_NOT_WORD; 265 lattice[0].m_states[CThreadSlm::TState()] = TLatticeStateValue(0.0, NULL, CThreadSlm::TState()); 266 267 for (int i=0, sz=sntnc.size(); i < sz; ) { 268 const CSIMDict::TState* pstate; 269 const TWCHAR* p = sntnc.c_str()+i; 270 int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p); 271 if (len <= 0) { 272 idcur = SIM_ID_NOT_WORD; 273 len = 1; 274 } else { 275 idcur = pstate->word_id; 276 } 277 int ambilen = getAmbiLen(p, len); 278 279 if (ambilen <= len) { 280 insertLatticeWord(lattice, TLatticeWord(i, i+len, idcur)); 281 i += len; 282 } else { 283 fullSegBuildLattice(sntnc, i, ambilen, lattice); 284 i += ambilen; 285 } 286 } 287 lattice[sntnc.size()].m_wordstarting.push_back(TLatticeWord(sntnc.size(), sntnc.size()+1, s_iSTOKID)); 288 } 289 290 void searchBest(CLattice& lattice) 291 { 292 for (int i=0, sz=lattice.size(); i < sz; ++i) { 293 TLatticeColumnStates & states = lattice[i].m_states; 294 TLatticeColumnStates::iterator itss = states.begin(); 295 TLatticeColumnStates::iterator itse = states.end(); 296 for (; itss != itse; ++itss) { 297 TLatticeWordVec::iterator itws = lattice[i].m_wordstarting.begin(); 298 TLatticeWordVec::iterator itwe = lattice[i].m_wordstarting.end(); 299 for (; itws != itwe; ++itws) { 300 CThreadSlm::TState his = itss->first; 301 double pr = itss->second.m_pr; 302 pr += s_tslm->transferNegLog(his, itws->m_wordId, his); 303 TLatticeColumnStates & rss = lattice[itws->m_right].m_states; 304 s_tslm->historify(his); 305 TLatticeColumnStates::iterator itn = rss.find(his); 306 if (itn == rss.end()) { 307 rss[his] = TLatticeStateValue(pr, &(*itws), itss->first); 308 } else { 309 if (itn->second.m_pr > pr) { 310 rss[his] = TLatticeStateValue(pr, &(*itws), itss->first); 311 } 312 } 313 } 314 } 315 } 316 } 317 318 void getBestPath(CLattice& lattice, TLatticeWordVec& segResult) 319 { 320 TLatticeColumnStates & states = lattice.back().m_states; 321 TLatticeColumnStates::iterator its = states.begin(); 322 323 TLatticeWord* pbtword = its->second.mp_btword; 324 CThreadSlm::TState btstate = its->second.m_btstate; 325 its = lattice[pbtword->m_left].m_states.find(btstate); 326 assert(its != lattice[pbtword->m_left].m_states.end()); 327 328 segResult.clear(); 329 while (true) { 330 pbtword = its->second.mp_btword; 331 if (pbtword != NULL) { 332 #ifndef HOST_OS_GNUC_2 333 segResult.push_back(*pbtword); 334 #else // HOST_OS_GNUC_2 335 segResult.insert(segResult.begin(), *pbtword); 336 #endif // !HOST_OS_GNUC_2 337 btstate = its->second.m_btstate; 338 its = lattice[pbtword->m_left].m_states.find(btstate); 339 assert(its != lattice[pbtword->m_left].m_states.end()); 340 } else { 341 break; 342 } 343 } 344 #ifndef HOST_OS_GNUC_2 345 std::reverse(segResult.begin(), segResult.end()); 346 #endif // HOST_OS_GNUC_2 347 } 348 349 static bool 350 processSingleFile(FILE* fp, int &nWords, int &nAmbis) 351 { 352 nWords = 0; 353 nAmbis = 0; 354 355 wstring sntnc; 356 CSIMCharReader *pReader = new CSIMCharReader(fp); 357 CSIMCharReader::iterator iter = pReader->begin(); 358 TSIMWordId idcur, idprev = s_iSTOKID; 359 360 if (!s_bTextOut) 361 output_stok(nWords); 362 363 while (true){ 364 if (ReadSentence(sntnc, iter, false) == false) 365 break; 366 367 CLattice lattice; 368 buildLattice(sntnc, lattice); 369 searchBest(lattice); 370 371 TLatticeWordVec segResult; 372 getBestPath(lattice, segResult); 373 374 for (int i=0, sz=segResult.size(); i < sz; ++i) { 375 const TWCHAR *p = sntnc.c_str()+segResult[i].m_left; 376 int len = segResult[i].m_right - segResult[i].m_left; 377 idcur = segResult[i].m_wordId; 378 379 output(len, p, idprev, idcur, nWords); 380 idprev = idcur; 381 } 382 383 if (!s_bTextOut) { 384 output_stok(nWords); 385 idprev = s_iSTOKID; 386 } 387 } 388 389 fflush(stdout); 390 return true; 391 } 392 393 int 394 main(int argc, char *argv[]) 395 { 396 int nWords, nAmbis; 397 398 setlocale(LC_ALL, ""); 399 getParameters(argc, argv); 400 argc -= optind; 401 argv += optind; 402 403 fprintf(stderr, "Loading lexicon..."); 404 fflush(stderr); 405 s_dict = new CSIMDict(); 406 s_tslm = new CThreadSlm(); 407 if (!s_dict->parseText(s_strDictFile)) { 408 fprintf(stderr, "fail to open Lexicon file!\n"); 409 fflush(stderr); 410 exit(11); 411 } 412 if (!s_tslm->load(s_strSlmFile, true)) { 413 fprintf(stderr, "fail to open slm file!\n"); 414 fflush(stderr); 415 exit(12); 416 } 417 fprintf(stderr, "done"); 418 fflush(stderr); 419 420 if (argc == 0) { 421 fprintf(stderr, "\nProcessing from stdin..."); 422 fflush(stderr); 423 processSingleFile(stdin, nWords, nAmbis); 424 fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis); 425 fflush(stderr); 426 } else { 427 for (int i=0; i < argc; ++i) { 428 fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr); 429 FILE *fp = fopen(argv[i], "r"); 430 if (fp != NULL) { 431 processSingleFile(fp, nWords, nAmbis); 432 fprintf(stderr, "@Offset %u, %d words, %d ambiguious. Done!\n", 433 ftell(fp), nWords, nAmbis); 434 fflush(stderr); 435 } else { 436 fprintf(stderr, "Can not Open!!!!!!!\n"); 437 fflush(stderr); 438 } 439 fclose(fp); 440 } 441 } 442 443 s_tslm->free(); 444 delete s_tslm; 445 s_tslm = NULL; 446 s_dict->close(); 447 delete s_dict; 448 s_dict = NULL; 449 return 0; 450 } 451
