1 /* 2 * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 * 4 * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 * 6 * The contents of this file are subject to the terms of either the GNU Lesser 7 * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 * file except in compliance with the License. You can obtain a copy of the CDDL at 10 * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 * specific language governing permissions and limitations under the License. When 13 * distributing the software, include this License Header Notice in each file and 14 * include the full text of the License in the License file as well as the 15 * following notice: 16 * 17 * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 * (CDDL) 19 * For Covered Software in this distribution, this License shall be governed by the 20 * laws of the State of California (excluding conflict-of-law provisions). 21 * Any litigation relating to this License shall be subject to the jurisdiction of 22 * the Federal Courts of the Northern District of California and the state courts 23 * of the State of California, with venue lying in Santa Clara County, California. 24 * 25 * Contributor(s): 26 * 27 * If you wish your version of this file to be governed by only the CDDL or only 28 * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 * license." If you don't indicate a single choice of license, a recipient has the 31 * option to distribute your version of this file under either the CDDL or the LGPL 32 * Version 2.1, or to extend the choice of license to its licensees as provided 33 * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 * Version 2 license, then the option applies only if the new code is made subject 35 * to such option by the copyright holder. 36 */ 37 38 #ifdef HAVE_CONFIG_H 39 #include "config.h" 40 #endif 41 42 #ifdef HAVE_ASSERT_H 43 #include <assert.h> 44 #endif 45 46 #ifdef HAVE_GETOPT_H 47 #include <getopt.h> 48 #endif 49 50 #include <stdio.h> 51 #include <map> 52 #include <vector> 53 #include <algorithm> 54 55 #include "../sim_fmerge.h" 56 #include "idngram.h" 57 #include "idngram_merge.h" 58 59 template<int N> 60 void WriteOut(FILE* out, std::map<CSIM_Idngram<N>, unsigned int> & map) 61 { 62 typedef typename std::map<CSIM_Idngram<N>,unsigned int>::iterator TMapIterator; 63 TMapIterator its=map.begin(), ite=map.end(); 64 for (; its != ite; ++its) { 65 fwrite(its->first.ids, sizeof(TSIMWordId), N, out); 66 fwrite(&(its->second), sizeof(unsigned int), 1, out); 67 } 68 map.clear(); 69 } 70 71 template<int N> 72 void ProcessingRead(FILE *fp, FILE* swap, std::vector<long>& para_offsets, size_t paraMax) 73 { 74 typedef CSIM_Idngram<N> TNgram; 75 typedef typename std::map<CSIM_Idngram<N>, unsigned int> TMap; 76 77 TMap map; 78 TNgram ngram; 79 80 TSIMWordId* ids = ngram.ids; 81 fread(ids, sizeof(TSIMWordId), N-1, fp); 82 while (fread(ids+N-1, sizeof(TSIMWordId), 1, fp) == 1) { 83 ++map[ngram]; 84 if (map.size() >= paraMax) 85 { 86 printf("."); fflush(stdout); 87 WriteOut(swap, map); 88 para_offsets.push_back(ftell(swap)); 89 } 90 for (int i=0; i<N-1; ++i) ids[i] = ids[i+1]; 91 } 92 if (map.size() > 0) { 93 printf("."); fflush(stdout); 94 WriteOut(swap, map); 95 para_offsets.push_back(ftell(swap)); 96 } 97 } 98 99 static struct option long_options[] = 100 { 101 {"NMax", 1, 0, 'n'}, 102 {"out", 1, 0, 'o'}, 103 {"swap", 1, 0, 's'}, 104 {"para", 1, 0, 'p'}, 105 {0, 0, 0, 0} 106 }; 107 108 static int N=0; 109 static int paraMax=0; 110 static char* output=NULL; 111 static char* swapfile=NULL; 112 113 void ShowUsage() 114 { 115 printf("Usage:\n\tids2ngram options idsfile[ idsfile...]\n"); 116 printf("\nDescription\n"); 117 printf(" This program generate idngram file, which is a sorted [id1,..idN,freq] array, from binary id stream files.\n"); 118 printf("\nInput:\n"); 119 printf("\tBinary id stream files looks like [id0,...,idX]\n"); 120 printf("\nOptions:\n"); 121 printf("\t -n N # N-gram\n"); 122 printf("\t -s swapfile # intermedia temporary file\n"); 123 printf("\t -o outputfile # result idngram file [id1, ... idN, freq]*\n"); 124 printf("\t -p para_size # maxium ngram-items per para\n"); 125 printf("\nExample:\n"); 126 printf(" Following example will use three input idstream file idsfile[1,2,3] to generate the idngram file all.id3gram. Each para (internal map size or hash size) would be 1024000, using swap file for temp result. All temp para result would final be merged to got the final result.\n"); 127 printf("\tids2idngram -n 3 -s /tmp/swap -o all.id3gram -p 1024000 idsfile1 idsfile2 idsfile3\n\n"); 128 exit(100); 129 } 130 131 static void getParameters(int argc, char* const argv[]) 132 { 133 int option_index = 0; 134 int c; 135 while ((c=getopt_long(argc, argv, "p:n:s:o:", long_options, &option_index)) != -1) 136 { 137 switch (c) { 138 case 'n': 139 N = atoi(strdup(optarg)); 140 break; 141 case 'p': 142 paraMax = atoi(strdup(optarg)); 143 break; 144 case 'o': 145 output = strdup(optarg); 146 break; 147 case 's': 148 swapfile = strdup(optarg); 149 break; 150 default: 151 ShowUsage(); 152 } 153 } 154 if (N < 1 || N > 3 || paraMax < 1024 || output == NULL || swapfile == NULL) 155 ShowUsage(); 156 } 157 158 static std::vector<long> para_offsets; 159 160 int main(int argc, char* argv[]) 161 { 162 getParameters(argc, argv); 163 FILE *swap = fopen(swapfile, "wb+"); 164 FILE *out = fopen(output, "wb+"); 165 if (optind >= argc) ShowUsage(); 166 while (optind < argc) { 167 printf("Processing %s:", argv[optind]); fflush(stdout); 168 FILE *fp = fopen(argv[optind], "rb"); 169 switch (N) { 170 case 1: 171 ProcessingRead<1>(fp, swap, para_offsets, paraMax); 172 break; 173 case 2: 174 ProcessingRead<2>(fp, swap, para_offsets, paraMax); 175 break; 176 case 3: 177 ProcessingRead<3>(fp, swap, para_offsets, paraMax); 178 break; 179 } 180 fclose(fp); 181 printf ("\n"); fflush(stdout); 182 ++optind; 183 } 184 printf("Merging..."); fflush(stdout); 185 switch (N) { 186 case 1: 187 ProcessingIdngramMerge<1>(swap, out, para_offsets); 188 break; 189 case 2: 190 ProcessingIdngramMerge<2>(swap, out, para_offsets); 191 break; 192 case 3: 193 ProcessingIdngramMerge<3>(swap, out, para_offsets); 194 break; 195 } 196 printf ("Done\n"); fflush(stdout); 197 fclose(out); 198 fclose(swap); 199 return 0; 200 } 201 202