Home | History | Annotate | Download | only in ids2ngram
      1 /*
      2  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3  *
      4  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5  *
      6  * The contents of this file are subject to the terms of either the GNU Lesser
      7  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12  * specific language governing permissions and limitations under the License. When
     13  * distributing the software, include this License Header Notice in each file and
     14  * include the full text of the License in the License file as well as the
     15  * following notice:
     16  *
     17  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18  * (CDDL)
     19  * For Covered Software in this distribution, this License shall be governed by the
     20  * laws of the State of California (excluding conflict-of-law provisions).
     21  * Any litigation relating to this License shall be subject to the jurisdiction of
     22  * the Federal Courts of the Northern District of California and the state courts
     23  * of the State of California, with venue lying in Santa Clara County, California.
     24  *
     25  * Contributor(s):
     26  *
     27  * If you wish your version of this file to be governed by only the CDDL or only
     28  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30  * license." If you don't indicate a single choice of license, a recipient has the
     31  * option to distribute your version of this file under either the CDDL or the LGPL
     32  * Version 2.1, or to extend the choice of license to its licensees as provided
     33  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34  * Version 2 license, then the option applies only if the new code is made subject
     35  * to such option by the copyright holder.
     36  */
     37 
     38 #ifndef _SIM_IDNGRAM_MERGE_H
     39 #define _SIM_IDNGRAM_MERGE_H
     40 
     41 #include <stdio.h>
     42 #include <map>
     43 #include <vector>
     44 #include <algorithm>
     45 
     46 #include "../sim_fmerge.h"
     47 #include "idngram.h"
     48 
     49 template<int N>
     50 void DoIdngramMerge(FILE*out, CMultiWayFileMerger<CSIM_IdngramFreq<N> > &merger)
     51 {
     52 	merger.start();
     53 	CSIM_IdngramFreq<N> prevItem;
     54 	while (true) {
     55 		file_para<CSIM_IdngramFreq<N> >	* ppara = merger.getBest();
     56 		TUnitAndParaInfo<CSIM_IdngramFreq<N> > & upi = *(*ppara);
     57 		if (upi.runOut) {
     58 			if (prevItem.freq != 0) {
     59 				fwrite(prevItem.ids, sizeof(TSIMWordId), N, out);
     60 				fwrite(&(prevItem.freq), sizeof(unsigned int), 1, out);
     61 			}
     62 			break;
     63 		}
     64 		CSIM_IdngramFreq<N>& ng = upi.unit;
     65 		if (!(prevItem == ng)) {
     66 			if (prevItem.freq != 0) {
     67 				fwrite(prevItem.ids, sizeof(TSIMWordId), N, out);
     68 				fwrite(&(prevItem.freq), sizeof(unsigned int), 1, out);
     69 			}
     70 			prevItem = ng;
     71 		} else {
     72 			prevItem.freq += ng.freq;
     73 		}
     74 		merger.next();
     75 	}
     76 }
     77 
     78 template<int N>
     79 void ProcessingIdngramMerge(FILE *swap, FILE* out, std::vector<long>& para_offsets)
     80 {
     81 	CMultiWayFileMerger<CSIM_IdngramFreq<N> > merger;
     82 	long s = 0;
     83 	for (int i=0; i < para_offsets.size(); ++i) {
     84 		merger.addPara(swap, s, para_offsets[i]);
     85 		s = para_offsets[i];
     86 	}
     87 	DoIdngramMerge<N>(out, merger);
     88 }
     89 
     90 template<int N>
     91 void ProcessingIdngramMerge(FILE* out, std::vector<FILE* >& file_list)
     92 {
     93 	CMultiWayFileMerger<CSIM_IdngramFreq<N> > merger;
     94 	for (int i=0; i < file_list.size(); ++i) {
     95 		fseek(file_list[i], 0, SEEK_END);
     96 		merger.addPara(file_list[i], 0, ftell(file_list[i]));
     97 	}
     98 	DoIdngramMerge<N>(out, merger);
     99 }
    100 
    101 #endif
    102 
    103