1 0 yongsun /* 2 82 yongsun * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER. 3 82 yongsun * 4 82 yongsun * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved. 5 82 yongsun * 6 82 yongsun * The contents of this file are subject to the terms of either the GNU Lesser 7 82 yongsun * General Public License Version 2.1 only ("LGPL") or the Common Development and 8 82 yongsun * Distribution License ("CDDL")(collectively, the "License"). You may not use this 9 82 yongsun * file except in compliance with the License. You can obtain a copy of the CDDL at 10 82 yongsun * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at 11 82 yongsun * http://www.opensource.org/licenses/lgpl-license.php. See the License for the 12 82 yongsun * specific language governing permissions and limitations under the License. When 13 82 yongsun * distributing the software, include this License Header Notice in each file and 14 82 yongsun * include the full text of the License in the License file as well as the 15 82 yongsun * following notice: 16 82 yongsun * 17 82 yongsun * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE 18 82 yongsun * (CDDL) 19 82 yongsun * For Covered Software in this distribution, this License shall be governed by the 20 82 yongsun * laws of the State of California (excluding conflict-of-law provisions). 21 82 yongsun * Any litigation relating to this License shall be subject to the jurisdiction of 22 82 yongsun * the Federal Courts of the Northern District of California and the state courts 23 82 yongsun * of the State of California, with venue lying in Santa Clara County, California. 24 82 yongsun * 25 82 yongsun * Contributor(s): 26 82 yongsun * 27 82 yongsun * If you wish your version of this file to be governed by only the CDDL or only 28 82 yongsun * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to 29 82 yongsun * include this software in this distribution under the [CDDL or LGPL Version 2.1] 30 82 yongsun * license." If you don't indicate a single choice of license, a recipient has the 31 82 yongsun * option to distribute your version of this file under either the CDDL or the LGPL 32 82 yongsun * Version 2.1, or to extend the choice of license to its licensees as provided 33 82 yongsun * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL 34 82 yongsun * Version 2 license, then the option applies only if the new code is made subject 35 82 yongsun * to such option by the copyright holder. 36 0 yongsun */ 37 82 yongsun 38 0 yongsun #ifndef SUNPY_LATTICE_STATES_H 39 0 yongsun #define SUNPY_LATTICE_STATES_H 40 0 yongsun 41 0 yongsun #include "portability.h" 42 0 yongsun 43 0 yongsun #include "imi_context.h" 44 0 yongsun 45 0 yongsun /** 46 0 yongsun * CStateKey represent the history. In real implementation, it's a 47 0 yongsun * node pointer to a state in the language model. But to save the 48 0 yongsun * language model size, the state node in language model do not 49 0 yongsun * thread the back-off pointer. Now, we just use the Word Id for 50 0 yongsun * the node in the language model. Later we should abstract the 51 0 yongsun * StateNode from language model implemetation to replace this 52 0 yongsun * definition. 53 0 yongsun */ 54 0 yongsun typedef CThreadSlm::TState CStateKey; 55 0 yongsun 56 0 yongsun /** 57 0 yongsun * A WordKey could represent a word. Define this use the unsigned int 58 0 yongsun * directly. Because in the future, we may adopt word class, such as 59 0 yongsun * Digital Word Class. 60 0 yongsun */ 61 0 yongsun typedef unsigned CWordKey; 62 0 yongsun 63 0 yongsun /** 64 0 yongsun * This class is used to record lexicon state (pinyin trie nodes) 65 0 yongsun * just before a bone. From the bone, it could see when arriving 66 0 yongsun * it, how many valid Pinyin Trie Node still could be used to search 67 0 yongsun * more words further, and what bone is its starting bone. 68 0 yongsun */ 69 0 yongsun struct CLexiconState { 70 0 yongsun public: 71 0 yongsun /** 72 0 yongsun * The bone where m_pPYNode starts 73 0 yongsun */ 74 0 yongsun CSkeletonIter m_BoneStart; 75 0 yongsun 76 0 yongsun /** 77 0 yongsun * PinYin trie node, from which we could get its corresponding 78 0 yongsun * words (if it contains some), and search further for longer 79 0 yongsun * words. 80 0 yongsun */ 81 0 yongsun const CPinyinTrie::TNode *m_pPYNode; 82 0 yongsun 83 0 yongsun /** 84 0 yongsun * If is not a PINYIN state, we should use the m_WordId directly. 85 0 yongsun */ 86 0 yongsun bool m_bPinyin; 87 0 yongsun 88 0 yongsun /** 89 0 yongsun * the word id for normal string, like english string, punc, etc. 90 0 yongsun */ 91 0 yongsun unsigned int m_WordId; 92 0 yongsun 93 0 yongsun public: 94 0 yongsun CLexiconState(const CLexiconState& b) 95 0 yongsun : m_BoneStart(b.m_BoneStart), m_pPYNode(b.m_pPYNode), 96 0 yongsun m_bPinyin(b.m_bPinyin), m_WordId(b.m_WordId) { } 97 0 yongsun 98 0 yongsun CLexiconState(CSkeletonIter boneStart = CSkeletonIter(), 99 0 yongsun const CPinyinTrie::TNode *pynptr = NULL) 100 0 yongsun : m_BoneStart(boneStart), m_pPYNode(pynptr), 101 0 yongsun m_bPinyin(true), m_WordId(0) { } 102 0 yongsun 103 0 yongsun CLexiconState(CSkeletonIter boneStart, unsigned int wid) 104 0 yongsun : m_BoneStart(boneStart), m_pPYNode(NULL), 105 0 yongsun m_bPinyin(false), m_WordId(wid) { } 106 0 yongsun }; 107 0 yongsun 108 0 yongsun /** 109 0 yongsun * A list of Lexicon State. Every state may from different 110 0 yongsun * starting position. Later, when Fuzzy PinYin are added, 111 0 yongsun * more than one state may comes from one starting bone. 112 0 yongsun */ 113 0 yongsun typedef std::vector<CLexiconState> CLexiconStates; 114 0 yongsun 115 0 yongsun 116 0 yongsun /** 117 0 yongsun * The basic static unit used in the lattice searching 118 0 yongsun */ 119 0 yongsun struct TLatticeState { 120 0 yongsun public: 121 0 yongsun 122 0 yongsun /** 123 0 yongsun * The score from the beginning of the sentence to here 124 0 yongsun * pr(w1)*pr(w2|w1)*... or its log format 125 0 yongsun */ 126 0 yongsun TSentenceScore m_Score; 127 0 yongsun 128 0 yongsun /** 129 0 yongsun * The bone right after this lattice node. 130 0 yongsun */ 131 0 yongsun CSkeletonIter m_BoneAfter; 132 0 yongsun 133 0 yongsun /** 134 0 yongsun * The history key 135 0 yongsun */ 136 0 yongsun CStateKey m_State; 137 0 yongsun 138 0 yongsun /** 139 0 yongsun * prevState --- word ---> thisState 140 0 yongsun * BackTraceNode record the best prevState that make thisState 141 0 yongsun * best score. 142 0 yongsun */ 143 0 yongsun TLatticeState *m_pBackTraceNode; 144 0 yongsun 145 0 yongsun /** 146 0 yongsun * BackTraceWordId record the best word's id that make thisState 147 0 yongsun * best score 148 0 yongsun */ 149 0 yongsun CWordKey m_BackTraceWordId; 150 0 yongsun 151 0 yongsun /** for debug printing... */ 152 0 yongsun void 153 0 yongsun print(std::string& prefix); 154 0 yongsun 155 0 yongsun public: 156 0 yongsun /** 157 0 yongsun * Default constructor 158 0 yongsun */ 159 0 yongsun #ifdef _USE_RAW_PROBABILITY 160 0 yongsun TLatticeState(double score = -1.0, 161 0 yongsun #else 162 0 yongsun TLatticeState(double score = 0.0, 163 0 yongsun #endif 164 0 yongsun CSkeletonIter bone=CSkeletonIter(), 165 0 yongsun TLatticeState* btNodePtr = NULL, 166 0 yongsun CStateKey sk= CStateKey(), 167 0 yongsun CWordKey wk = CWordKey()) 168 0 yongsun : m_Score(score), m_BoneAfter(bone), m_State(sk), 169 0 yongsun m_pBackTraceNode(btNodePtr), m_BackTraceWordId(wk) { } 170 0 yongsun 171 0 yongsun }; 172 0 yongsun 173 0 yongsun typedef std::vector<TLatticeState> CLatticeStateVec; 174 0 yongsun 175 0 yongsun 176 0 yongsun /** 177 0 yongsun * All lattice node before a single bone. This class provide beam pruning 178 0 yongsun * while push_back, which means at most the best MAX states are reserved, 179 0 yongsun * ie, weak state will may be discard while new better state are inserted, 180 0 yongsun * and the number MAX is arrived. 181 0 yongsun */ 182 0 yongsun class CLatticeStates { 183 0 yongsun private: 184 186 tchaikov static const unsigned beam_width = 32; 185 0 yongsun 186 0 yongsun public: 187 0 yongsun /** just use the CLatticeStateVec's iterator */ 188 0 yongsun typedef CLatticeStateVec::iterator iterator; 189 0 yongsun 190 0 yongsun /** just use the CLatticeStateVec's iterator */ 191 0 yongsun typedef CLatticeStateVec::const_iterator const_iterator; 192 0 yongsun 193 0 yongsun public: 194 0 yongsun void 195 0 yongsun clear(); 196 0 yongsun 197 0 yongsun void 198 0 yongsun push_back(const TLatticeState& node); 199 0 yongsun 200 0 yongsun //@{ 201 0 yongsun /** return the first iterator of m_Vec. */ 202 0 yongsun size_t 203 0 yongsun size() 204 0 yongsun { return m_Vec.size(); } 205 0 yongsun 206 0 yongsun iterator 207 0 yongsun begin() 208 0 yongsun { return m_Vec.begin(); } 209 0 yongsun 210 0 yongsun /** return the first iterator of m_Vec. */ 211 0 yongsun const_iterator 212 0 yongsun begin() const 213 0 yongsun { return m_Vec.begin(); } 214 0 yongsun //@} 215 0 yongsun 216 0 yongsun 217 0 yongsun //@{ 218 0 yongsun /** return the last iterator of m_Vec. */ 219 0 yongsun iterator 220 0 yongsun end() 221 0 yongsun { return m_Vec.end(); } 222 0 yongsun 223 0 yongsun /** return the last iterator of m_Vec. */ 224 0 yongsun const_iterator 225 0 yongsun end() const 226 0 yongsun { return m_Vec.end(); } 227 0 yongsun //@} 228 0 yongsun 229 0 yongsun 230 0 yongsun protected: 231 0 yongsun void 232 0 yongsun bubbleUp(int idxInHeap); 233 0 yongsun 234 0 yongsun void 235 0 yongsun ironDown(int idxInHeap); 236 0 yongsun 237 0 yongsun protected: 238 0 yongsun std::vector<TLatticeState> m_Vec; 239 0 yongsun std::vector<int> m_VecIdxInHeap; 240 0 yongsun std::map<CStateKey, int> m_Map; 241 0 yongsun std::vector<int> m_Heap; 242 0 yongsun }; 243 0 yongsun 244 0 yongsun 245 0 yongsun /** 246 0 yongsun * Bone Inner Data is used to construct the search lattice. Logically, 247 0 yongsun * each bone just like the time fram as in the speech recoginition. 248 0 yongsun * Many information when searching arrived right before the bone are needed. 249 0 yongsun * - The Pinyin Trie States start from previous bones, and not terminated 250 0 yongsun * yet. From these states, we could look further to find longer words. 251 0 yongsun * - All Lattice State (which is already after beam-pruning). They are 252 0 yongsun * basic units of the Lattice. 253 0 yongsun * Any other information also needed to help the interaction with user: 254 0 yongsun * - Best word start righ from the bone. A best word in the best 255 0 yongsun * sentence is an edge of the best path. 256 0 yongsun * - Best word type. 257 0 yongsun * -# Whether there is a best word start right before the bone? Best 258 0 yongsun * words are not overlaped and many bones may contains no best word. 259 0 yongsun * -# Whether the best word is user-selected. Some best word is user 260 0 yongsun * selected, and may be really diffirent with what we found according 261 0 yongsun * to Language Model. 262 0 yongsun */ 263 0 yongsun class CBoneInnerData { 264 0 yongsun public: 265 0 yongsun /** values used by m_BWType to reflect the m_BestWord status */ 266 0 yongsun enum { 267 0 yongsun NoBestWordStartHere, /**< best word not start here. m_bestWod invalid.*/ 268 0 yongsun BestWordStartHere, /**< m_BestWord is valid, but not user selected.*/ 269 0 yongsun UserSelectedBestWord /**< m_BestWord is the user selected best word.*/ 270 0 yongsun }; 271 0 yongsun 272 0 yongsun /** 273 0 yongsun * The best word start from here. A best word in the best sentence is 274 0 yongsun * an edge of the best path. 275 0 yongsun */ 276 0 yongsun CCandidate m_BestWord; 277 0 yongsun 278 0 yongsun /** 279 0 yongsun * m_BWType reflects the m_BestWord status: 280 0 yongsun * -# Is there a best word start here? 281 0 yongsun * -# Whether or not it is user selected? 282 0 yongsun */ 283 0 yongsun int m_BWType; 284 0 yongsun 285 0 yongsun /** 286 0 yongsun * Lexicon status list just before this bone. 287 0 yongsun */ 288 0 yongsun CLexiconStates m_LexiconStates; 289 0 yongsun 290 0 yongsun /** 291 0 yongsun * Lattice status list just before this bone. 292 0 yongsun */ 293 0 yongsun CLatticeStates m_LatticeNodes; 294 0 yongsun 295 0 yongsun public: 296 0 yongsun CBoneInnerData() 297 0 yongsun : m_BestWord(), m_BWType(NoBestWordStartHere), 298 0 yongsun m_LexiconStates(), m_LatticeNodes() { } 299 0 yongsun /** 300 0 yongsun * clear LatticeNodes and LexiconStates, 301 0 yongsun * set the BWType to NoBestWordStartHere. 302 0 yongsun */ 303 0 yongsun void 304 0 yongsun clear() 305 0 yongsun { 306 0 yongsun m_BWType = NoBestWordStartHere; 307 0 yongsun m_LatticeNodes.clear(); 308 0 yongsun m_LexiconStates.clear(); 309 0 yongsun } 310 0 yongsun 311 0 yongsun void 312 0 yongsun print(std::string& prefix); 313 0 yongsun }; 314 0 yongsun 315 0 yongsun 316 0 yongsun #endif 317