1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifdef HAVE_CONFIG_H 39 0 yongsun #include "config.h" 40 0 yongsun #endif 41 0 yongsun 42 0 yongsun #ifdef HAVE_ASSERT_H 43 0 yongsun #include <assert.h> 44 0 yongsun #endif 45 0 yongsun 46 0 yongsun #ifdef HAVE_GETOPT_H 47 0 yongsun #include <getopt.h> 48 0 yongsun #endif 49 0 yongsun 50 0 yongsun #include <stdio.h> 51 0 yongsun #include <unistd.h> 52 0 yongsun #include <locale.h> 53 0 yongsun 54 0 yongsun #include "../sim_dict.h" 55 0 yongsun #include "../sim_sen.h" 56 0 yongsun 57 0 yongsun static struct option long_options[] = 58 0 yongsun { 59 0 yongsun {"dict", 1, 0, 'd'}, 60 0 yongsun {"format", 1, 0, 'f'}, 61 0 yongsun {"show-id", 0, 0, 'i'}, 62 0 yongsun {"s-tok", 1, 0, 's'}, 63 0 yongsun {"ambiguious-id", 1, 0, 'a'}, 64 0 yongsun {0, 0, 0, 0} 65 0 yongsun }; 66 0 yongsun 67 0 yongsun static char* s_strDictFile = NULL; 68 0 yongsun static bool s_bTextOut = false; 69 0 yongsun static bool s_bShowId = false; 70 0 yongsun static TSIMWordId s_iSTOKID = 10; 71 0 yongsun static TSIMWordId s_iAmbiID = 0; 72 0 yongsun 73 0 yongsun static CSIMDict *s_dict = NULL; 74 0 yongsun 75 0 yongsun static void 76 0 yongsun ShowUsage() 77 0 yongsun { 78 0 yongsun fprintf(stderr, "\nUsage:\n"); 79 0 yongsun fprintf(stderr, "mmseg -d dict_file [-f (text|bin)] [-i] [-s STOK_ID] [-a AMBI_ID]\n\n"); 80 0 yongsun fprintf(stderr, " -f --format:\n"); 81 0 yongsun fprintf(stderr, " Output Format, can be 'text' or 'bin'. default 'bin'\n"); 82 0 yongsun fprintf(stderr, " Normally, in text mode, word text are output, while in binary mode,\n"); 83 209 tchaikov fprintf(stderr, " binary short integer of the word-ids are written to stdout.\n"); 84 0 yongsun fprintf(stderr, " -s --stok:\n"); 85 0 yongsun fprintf(stderr, " Sentence token id. Default 10.\n"); 86 209 tchaikov fprintf(stderr, " It will be written to output in binary mode after every sentence.\n"); 87 0 yongsun fprintf(stderr, " -i --show-id:\n"); 88 11 yongsun fprintf(stderr, " Show Id info. Under text output format mode, attach id after known.\n"); 89 11 yongsun fprintf(stderr, " words. If under binary mode, print id(s) in text.\n"); 90 0 yongsun fprintf(stderr, " -a --ambiguious-id:\n"); 91 11 yongsun fprintf(stderr, " Ambiguious means ABC => A BC or AB C. If specified (AMBI-ID != 0), \n"); 92 11 yongsun fprintf(stderr, " The sequence ABC will not be segmented, in binary mode, the AMBI-ID \n"); 93 11 yongsun fprintf(stderr, " is written out; in text mode, <ambi>ABC</ambi> will be output. Default \n"); 94 11 yongsun fprintf(stderr, " is 0.\n"); 95 0 yongsun fprintf(stderr, "\n"); 96 0 yongsun fprintf(stderr, "Notes:\n"); 97 0 yongsun fprintf(stderr, " Under binary mode, consecutive id of 0 are merged into one 0.\n"); 98 209 tchaikov fprintf(stderr, " Under text mode, no space are inserted between unknown-words. \n"); 99 0 yongsun fprintf(stderr, "\n"); 100 0 yongsun fprintf(stderr, "\n"); 101 0 yongsun exit(1000); 102 0 yongsun } 103 0 yongsun 104 0 yongsun static void 105 0 yongsun getParameters(int argc, char* argv[]) 106 0 yongsun { 107 0 yongsun int c; 108 0 yongsun while ((c=getopt_long(argc, argv, "d:if:s:a:", long_options, NULL)) != -1) 109 0 yongsun { 110 0 yongsun switch (c) { 111 0 yongsun case 'd': 112 0 yongsun s_strDictFile = strdup(optarg); 113 0 yongsun break; 114 0 yongsun case 'i': 115 0 yongsun s_bShowId = true; 116 0 yongsun break; 117 0 yongsun case 'f': 118 0 yongsun s_bTextOut = (strcmp(optarg, "text") == 0); 119 0 yongsun break; 120 0 yongsun case 's': 121 0 yongsun s_iSTOKID = atoi(optarg); 122 0 yongsun break; 123 0 yongsun case 'a': 124 0 yongsun s_iAmbiID = atoi(optarg); 125 0 yongsun break; 126 0 yongsun default: 127 0 yongsun ShowUsage(); 128 0 yongsun break; 129 0 yongsun } 130 0 yongsun } 131 0 yongsun if (s_strDictFile == NULL) 132 0 yongsun ShowUsage(); 133 0 yongsun } 134 0 yongsun 135 0 yongsun static void 136 0 yongsun output_stok(int& nWords) 137 0 yongsun { 138 0 yongsun if (s_bShowId) { 139 0 yongsun if (nWords > 0) 140 0 yongsun printf(" "); 141 0 yongsun printf("%d", unsigned(s_iSTOKID)); 142 0 yongsun } else { 143 0 yongsun fwrite(&s_iSTOKID, sizeof(TSIMWordId), 1, stdout); 144 0 yongsun } 145 0 yongsun ++nWords; 146 0 yongsun } 147 0 yongsun 148 0 yongsun static void 149 0 yongsun output(int len, const TWCHAR* p, TSIMWordId idprev, TSIMWordId idcur, int& nWords) 150 0 yongsun { 151 0 yongsun static char mbword[1024]; 152 0 yongsun static TWCHAR wcword[1024]; 153 0 yongsun 154 0 yongsun bool bRealGap = (idcur != SIM_ID_NOT_WORD || idprev != SIM_ID_NOT_WORD); 155 0 yongsun if (s_bTextOut) { 156 0 yongsun for (int i=0; i < len; ++i, ++p) 157 0 yongsun wcword[i] = *p; 158 0 yongsun wcword[len] = 0; 159 0 yongsun WCSTOMBS(mbword, wcword, sizeof(mbword)); 160 0 yongsun if (bRealGap && idprev == SIM_ID_NOT_WORD) 161 0 yongsun printf("(%d)", unsigned(idprev)); 162 0 yongsun if (bRealGap && (nWords > 0)) 163 0 yongsun printf(" "); 164 11 yongsun (s_iAmbiID && idcur == s_iAmbiID)? printf ("<ambi>%s</ambi>", mbword): 165 11 yongsun printf("%s", mbword); 166 0 yongsun if (s_bShowId && idcur != SIM_ID_NOT_WORD) 167 0 yongsun printf("(%d)", unsigned(idcur)); 168 0 yongsun } else { 169 0 yongsun if (bRealGap) { 170 0 yongsun if (s_bShowId) { 171 0 yongsun if (nWords > 0) 172 0 yongsun printf(" "); 173 0 yongsun printf("%d", unsigned(idcur)); 174 0 yongsun } else 175 0 yongsun fwrite(&idcur, sizeof(TSIMWordId), 1, stdout); 176 0 yongsun } 177 0 yongsun } 178 0 yongsun if (bRealGap) 179 0 yongsun ++nWords; 180 0 yongsun } 181 0 yongsun 182 0 yongsun /** 183 0 yongsun * Return . For example, ABCDEF if ABC CD DEF are words. 184 0 yongsun * if return len > word_len, then ambiguious exists at word [p p+len)... 185 0 yongsun */ 186 0 yongsun int 187 0 yongsun getAmbiLen(const TWCHAR* p, int word_len) 188 0 yongsun { 189 0 yongsun const CSIMDict::TState* pstate; 190 0 yongsun 191 0 yongsun for (int i=1; i<word_len && *(p+i) != WCH_NULL; ++i) { 192 0 yongsun int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p+i); 193 0 yongsun if (word_len < i+len) 194 0 yongsun word_len = i+len; 195 0 yongsun } 196 0 yongsun 197 0 yongsun return word_len; 198 0 yongsun } 199 0 yongsun 200 0 yongsun static bool 201 0 yongsun processSingleFile(FILE* fp, int &nWords, int &nAmbis) 202 0 yongsun { 203 0 yongsun nWords = 0; 204 0 yongsun nAmbis = 0; 205 0 yongsun 206 0 yongsun wstring sntnc; 207 0 yongsun CSIMCharReader *pReader = new CSIMCharReader(fp); 208 0 yongsun CSIMCharReader::iterator iter = pReader->begin(); 209 0 yongsun TSIMWordId idcur, idprev = s_iSTOKID; 210 0 yongsun 211 0 yongsun if (!s_bTextOut) 212 0 yongsun output_stok(nWords); 213 0 yongsun 214 0 yongsun while (true){ 215 0 yongsun if (ReadSentence(sntnc, iter, false) == false) 216 0 yongsun break; 217 0 yongsun 218 0 yongsun for (const TWCHAR *p = sntnc.c_str(); (*p); ) { 219 0 yongsun 220 0 yongsun const CSIMDict::TState* pstate; 221 0 yongsun int len = s_dict->matchLongest(s_dict->getRoot(), pstate, p); 222 0 yongsun if (len <= 0) { 223 0 yongsun idcur = SIM_ID_NOT_WORD; 224 0 yongsun len = 1; 225 0 yongsun } else 226 0 yongsun idcur = pstate->word_id; 227 0 yongsun 228 11 yongsun if (s_iAmbiID != WCH_NULL) { 229 0 yongsun int ambiLen=getAmbiLen(p, len); 230 0 yongsun if (ambiLen > len) { 231 0 yongsun len = ambiLen; 232 0 yongsun idcur = s_iAmbiID; 233 0 yongsun ++nAmbis; 234 0 yongsun } 235 0 yongsun } 236 0 yongsun 237 0 yongsun output(len, p, idprev, idcur, nWords); 238 0 yongsun 239 0 yongsun idprev = idcur; 240 0 yongsun p += len; 241 0 yongsun } 242 0 yongsun 243 0 yongsun if (!s_bTextOut) { 244 0 yongsun output_stok(nWords); 245 0 yongsun idprev = s_iSTOKID; 246 0 yongsun } 247 0 yongsun } 248 0 yongsun 249 0 yongsun fflush(stdout); 250 0 yongsun return true; 251 0 yongsun } 252 0 yongsun 253 0 yongsun int 254 0 yongsun main(int argc, char *argv[]) 255 0 yongsun { 256 0 yongsun int nWords, nAmbis; 257 0 yongsun 258 0 yongsun setlocale(LC_ALL, ""); 259 0 yongsun getParameters(argc, argv); 260 0 yongsun argc -= optind; 261 0 yongsun argv += optind; 262 0 yongsun 263 0 yongsun fprintf(stderr, "Loading lexicon..."); fflush(stderr); 264 0 yongsun s_dict = new CSIMDict(); 265 0 yongsun if (!s_dict->parseText(s_strDictFile)) { 266 0 yongsun fprintf(stderr, "fail\n"); fflush(stderr); 267 0 yongsun exit(1001); 268 0 yongsun } 269 0 yongsun fprintf(stderr, "done"); fflush(stderr); 270 0 yongsun 271 0 yongsun if (argc == 0) { 272 0 yongsun fprintf(stderr, "\nProcessing from stdin..."); fflush(stderr); 273 0 yongsun processSingleFile(stdin, nWords, nAmbis); 274 0 yongsun fprintf(stderr, "%d words, %d ambiguious. Done!\n", nWords, nAmbis); fflush(stderr); 275 0 yongsun } else { 276 0 yongsun for (int i=0; i < argc; ++i) { 277 0 yongsun fprintf(stderr, "\nProcessing %s...", argv[i]); fflush(stderr); 278 0 yongsun FILE *fp = fopen(argv[i], "r"); 279 0 yongsun if (fp != NULL) { 280 0 yongsun processSingleFile(fp, nWords, nAmbis); 281 0 yongsun fprintf(stderr, "@Offset %u, %d words, %d ambiguious. Done!\n", ftell(fp), nWords, nAmbis); fflush(stderr); 282 0 yongsun } else { 283 0 yongsun fprintf(stderr, "Can not Open!!!!!!!\n"); fflush(stderr); 284 0 yongsun } 285 0 yongsun fclose(fp); 286 0 yongsun } 287 0 yongsun } 288 0 yongsun 289 0 yongsun s_dict->close(); 290 0 yongsun delete s_dict; 291 0 yongsun s_dict = NULL; 292 0 yongsun return 0; 293 0 yongsun } 294