Home | History | Annotate | Download | only in src
      1    0   yongsun /*
      2   82   yongsun  * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS HEADER.
      3   82   yongsun  *
      4   82   yongsun  * Copyright (c) 2007 Sun Microsystems, Inc. All Rights Reserved.
      5   82   yongsun  *
      6   82   yongsun  * The contents of this file are subject to the terms of either the GNU Lesser
      7   82   yongsun  * General Public License Version 2.1 only ("LGPL") or the Common Development and
      8   82   yongsun  * Distribution License ("CDDL")(collectively, the "License"). You may not use this
      9   82   yongsun  * file except in compliance with the License. You can obtain a copy of the CDDL at
     10   82   yongsun  * http://www.opensource.org/licenses/cddl1.php and a copy of the LGPLv2.1 at
     11   82   yongsun  * http://www.opensource.org/licenses/lgpl-license.php. See the License for the
     12   82   yongsun  * specific language governing permissions and limitations under the License. When
     13   82   yongsun  * distributing the software, include this License Header Notice in each file and
     14   82   yongsun  * include the full text of the License in the License file as well as the
     15   82   yongsun  * following notice:
     16   82   yongsun  *
     17   82   yongsun  * NOTICE PURSUANT TO SECTION 9 OF THE COMMON DEVELOPMENT AND DISTRIBUTION LICENSE
     18   82   yongsun  * (CDDL)
     19   82   yongsun  * For Covered Software in this distribution, this License shall be governed by the
     20   82   yongsun  * laws of the State of California (excluding conflict-of-law provisions).
     21   82   yongsun  * Any litigation relating to this License shall be subject to the jurisdiction of
     22   82   yongsun  * the Federal Courts of the Northern District of California and the state courts
     23   82   yongsun  * of the State of California, with venue lying in Santa Clara County, California.
     24   82   yongsun  *
     25   82   yongsun  * Contributor(s):
     26   82   yongsun  *
     27   82   yongsun  * If you wish your version of this file to be governed by only the CDDL or only
     28   82   yongsun  * the LGPL Version 2.1, indicate your decision by adding "[Contributor]" elects to
     29   82   yongsun  * include this software in this distribution under the [CDDL or LGPL Version 2.1]
     30   82   yongsun  * license." If you don't indicate a single choice of license, a recipient has the
     31   82   yongsun  * option to distribute your version of this file under either the CDDL or the LGPL
     32   82   yongsun  * Version 2.1, or to extend the choice of license to its licensees as provided
     33   82   yongsun  * above. However, if you add LGPL Version 2.1 code and therefore, elected the LGPL
     34   82   yongsun  * Version 2 license, then the option applies only if the new code is made subject
     35   82   yongsun  * to such option by the copyright holder.
     36    0   yongsun  */
     37   82   yongsun 
     38    0   yongsun #ifndef SUNPY_LATTICE_STATES_H
     39    0   yongsun #define SUNPY_LATTICE_STATES_H
     40    0   yongsun 
     41    0   yongsun #include "portability.h"
     42    0   yongsun 
     43    0   yongsun #include "imi_context.h"
     44    0   yongsun 
     45    0   yongsun /**
     46    0   yongsun  * CStateKey represent the history. In real implementation, it's a
     47    0   yongsun  * node pointer to a state in the language model. But to save the
     48    0   yongsun  * language model size, the state node in language model do not
     49    0   yongsun  * thread the back-off pointer. Now, we just use the Word Id for
     50    0   yongsun  * the node in the language model. Later we should abstract the
     51    0   yongsun  * StateNode from language model implemetation to replace this
     52    0   yongsun  * definition.
     53    0   yongsun  */
     54    0   yongsun typedef CThreadSlm::TState          CStateKey;
     55    0   yongsun 
     56    0   yongsun /**
     57    0   yongsun  * A WordKey could represent a word. Define this use the unsigned int
     58    0   yongsun  * directly. Because in the future, we may adopt word class, such as
     59    0   yongsun  * Digital Word Class.
     60    0   yongsun  */
     61    0   yongsun typedef unsigned                    CWordKey;
     62    0   yongsun 
     63    0   yongsun /**
     64    0   yongsun  * This class is used to record lexicon state (pinyin trie nodes)
     65    0   yongsun  * just before a bone. From the bone, it could see when arriving
     66    0   yongsun  * it, how many valid Pinyin Trie Node still could be used to search
     67    0   yongsun  * more words further, and what bone is its starting bone.
     68    0   yongsun  */
     69    0   yongsun struct CLexiconState {
     70    0   yongsun public:
     71    0   yongsun     /**
     72    0   yongsun      * The bone where m_pPYNode starts
     73    0   yongsun      */
     74    0   yongsun     CSkeletonIter                   m_BoneStart;
     75    0   yongsun 
     76    0   yongsun     /**
     77    0   yongsun      * PinYin trie node, from which we could get its corresponding
     78    0   yongsun      * words (if it contains some), and search further for longer
     79    0   yongsun      * words.
     80    0   yongsun      */
     81    0   yongsun     const CPinyinTrie::TNode       *m_pPYNode;
     82    0   yongsun 
     83    0   yongsun     /**
     84    0   yongsun      * If is not a PINYIN state, we should use the m_WordId directly.
     85    0   yongsun      */
     86    0   yongsun     bool                            m_bPinyin;
     87    0   yongsun 
     88    0   yongsun     /**
     89    0   yongsun      * the word id for normal string, like english string, punc, etc.
     90    0   yongsun      */
     91    0   yongsun     unsigned int                    m_WordId;
     92    0   yongsun 
     93    0   yongsun public:
     94    0   yongsun     CLexiconState(const CLexiconState& b)
     95    0   yongsun         : m_BoneStart(b.m_BoneStart), m_pPYNode(b.m_pPYNode),
     96    0   yongsun           m_bPinyin(b.m_bPinyin), m_WordId(b.m_WordId) { }
     97    0   yongsun 
     98    0   yongsun     CLexiconState(CSkeletonIter boneStart = CSkeletonIter(),
     99    0   yongsun                   const CPinyinTrie::TNode *pynptr = NULL)
    100    0   yongsun         : m_BoneStart(boneStart), m_pPYNode(pynptr),
    101    0   yongsun           m_bPinyin(true), m_WordId(0) { }
    102    0   yongsun 
    103    0   yongsun     CLexiconState(CSkeletonIter boneStart, unsigned int wid)
    104    0   yongsun         : m_BoneStart(boneStart), m_pPYNode(NULL),
    105    0   yongsun           m_bPinyin(false), m_WordId(wid) { }
    106    0   yongsun };
    107    0   yongsun 
    108    0   yongsun /**
    109    0   yongsun  * A list of Lexicon State. Every state may from different
    110    0   yongsun  * starting position. Later, when Fuzzy PinYin are added,
    111    0   yongsun  * more than one state may comes from one starting bone.
    112    0   yongsun  */
    113    0   yongsun typedef std::vector<CLexiconState>    CLexiconStates;
    114    0   yongsun 
    115    0   yongsun 
    116    0   yongsun /**
    117    0   yongsun  * The basic static unit used in the lattice searching
    118    0   yongsun  */
    119    0   yongsun struct TLatticeState {
    120    0   yongsun public:
    121    0   yongsun 
    122    0   yongsun     /**
    123    0   yongsun      * The score from the beginning of the sentence to here
    124    0   yongsun      * pr(w1)*pr(w2|w1)*... or its log format
    125    0   yongsun      */
    126    0   yongsun     TSentenceScore                  m_Score;
    127    0   yongsun 
    128    0   yongsun     /**
    129    0   yongsun      * The bone right after this lattice node.
    130    0   yongsun      */
    131    0   yongsun     CSkeletonIter                   m_BoneAfter;
    132    0   yongsun 
    133    0   yongsun     /**
    134    0   yongsun      * The history key
    135    0   yongsun      */
    136    0   yongsun     CStateKey                       m_State;
    137    0   yongsun 
    138    0   yongsun     /**
    139    0   yongsun      * prevState --- word ---> thisState
    140    0   yongsun      * BackTraceNode record the best prevState that make thisState
    141    0   yongsun      * best score.
    142    0   yongsun      */
    143    0   yongsun     TLatticeState                  *m_pBackTraceNode;
    144    0   yongsun 
    145    0   yongsun     /**
    146    0   yongsun      * BackTraceWordId record the best word's id that make thisState
    147    0   yongsun      * best score
    148    0   yongsun      */
    149    0   yongsun     CWordKey                        m_BackTraceWordId;
    150    0   yongsun 
    151    0   yongsun     /** for debug printing... */
    152    0   yongsun     void
    153    0   yongsun     print(std::string& prefix);
    154    0   yongsun 
    155    0   yongsun public:
    156    0   yongsun     /**
    157    0   yongsun      * Default constructor
    158    0   yongsun      */
    159    0   yongsun     #ifdef _USE_RAW_PROBABILITY
    160    0   yongsun     TLatticeState(double score = -1.0,
    161    0   yongsun     #else
    162    0   yongsun     TLatticeState(double score = 0.0,
    163    0   yongsun     #endif
    164    0   yongsun                   CSkeletonIter bone=CSkeletonIter(),
    165    0   yongsun                   TLatticeState* btNodePtr = NULL,
    166    0   yongsun                   CStateKey sk= CStateKey(),
    167    0   yongsun                   CWordKey wk = CWordKey())
    168    0   yongsun         : m_Score(score), m_BoneAfter(bone), m_State(sk),
    169    0   yongsun           m_pBackTraceNode(btNodePtr), m_BackTraceWordId(wk) { }
    170    0   yongsun 
    171    0   yongsun };
    172    0   yongsun 
    173    0   yongsun typedef std::vector<TLatticeState>  CLatticeStateVec;
    174    0   yongsun 
    175    0   yongsun 
    176    0   yongsun /**
    177    0   yongsun  * All lattice node before a single bone. This class provide beam pruning
    178    0   yongsun  * while push_back, which means at most the best MAX states are reserved,
    179    0   yongsun  * ie, weak state will may be discard while new better state are inserted,
    180    0   yongsun  * and the number MAX is arrived.
    181    0   yongsun  */
    182    0   yongsun class CLatticeStates {
    183    0   yongsun private:
    184  186  tchaikov     static const unsigned beam_width = 32;
    185    0   yongsun 
    186    0   yongsun public:
    187    0   yongsun     /** just use the CLatticeStateVec's iterator */
    188    0   yongsun     typedef CLatticeStateVec::iterator        iterator;
    189    0   yongsun 
    190    0   yongsun     /** just use the CLatticeStateVec's iterator */
    191    0   yongsun     typedef CLatticeStateVec::const_iterator  const_iterator;
    192    0   yongsun 
    193    0   yongsun public:
    194    0   yongsun     void
    195    0   yongsun     clear();
    196    0   yongsun 
    197    0   yongsun     void
    198    0   yongsun     push_back(const TLatticeState& node);
    199    0   yongsun 
    200    0   yongsun     //@{
    201    0   yongsun     /** return the first iterator of m_Vec. */
    202    0   yongsun     size_t
    203    0   yongsun     size()
    204    0   yongsun         { return m_Vec.size(); }
    205    0   yongsun 
    206    0   yongsun     iterator
    207    0   yongsun     begin()
    208    0   yongsun         { return m_Vec.begin(); }
    209    0   yongsun 
    210    0   yongsun     /** return the first iterator of m_Vec. */
    211    0   yongsun     const_iterator
    212    0   yongsun     begin() const
    213    0   yongsun         { return m_Vec.begin(); }
    214    0   yongsun     //@}
    215    0   yongsun 
    216    0   yongsun 
    217    0   yongsun     //@{
    218    0   yongsun     /** return the last iterator of m_Vec. */
    219    0   yongsun     iterator
    220    0   yongsun     end()
    221    0   yongsun         { return m_Vec.end(); }
    222    0   yongsun 
    223    0   yongsun     /** return the last iterator of m_Vec. */
    224    0   yongsun     const_iterator
    225    0   yongsun     end() const
    226    0   yongsun         { return m_Vec.end(); }
    227    0   yongsun     //@}
    228    0   yongsun 
    229    0   yongsun 
    230    0   yongsun protected:
    231    0   yongsun     void
    232    0   yongsun     bubbleUp(int idxInHeap);
    233    0   yongsun 
    234    0   yongsun     void
    235    0   yongsun     ironDown(int idxInHeap);
    236    0   yongsun 
    237    0   yongsun protected:
    238    0   yongsun     std::vector<TLatticeState>      m_Vec;
    239    0   yongsun     std::vector<int>                m_VecIdxInHeap;
    240    0   yongsun     std::map<CStateKey, int>        m_Map;
    241    0   yongsun     std::vector<int>                m_Heap;
    242    0   yongsun };
    243    0   yongsun 
    244    0   yongsun 
    245    0   yongsun /**
    246    0   yongsun  * Bone Inner Data is used to construct the search lattice. Logically,
    247    0   yongsun  * each bone just like the time fram as in the speech recoginition.
    248    0   yongsun  * Many information when searching arrived right before the bone are needed.
    249    0   yongsun  *    - The Pinyin Trie States start from previous bones, and not terminated
    250    0   yongsun  *      yet. From these states, we could look further to find longer words.
    251    0   yongsun  *    - All Lattice State (which is already after beam-pruning). They are
    252    0   yongsun  *      basic units of the Lattice.
    253    0   yongsun  * Any other information also needed to help the interaction with user:
    254    0   yongsun  *    - Best word start righ from the bone. A best word in the best
    255    0   yongsun  *      sentence is an edge of the best path.
    256    0   yongsun  *    - Best word type.
    257    0   yongsun  *       -# Whether there is a best word start right before the bone? Best
    258    0   yongsun  *          words are not overlaped and many bones may contains no best word.
    259    0   yongsun  *       -# Whether the best word is user-selected. Some best word is user
    260    0   yongsun  *          selected, and may be really diffirent with what we found according
    261    0   yongsun  *          to Language Model.
    262    0   yongsun  */
    263    0   yongsun class CBoneInnerData {
    264    0   yongsun public:
    265    0   yongsun     /** values used by m_BWType to reflect the m_BestWord status */
    266    0   yongsun     enum {
    267    0   yongsun         NoBestWordStartHere, /**< best word not start here. m_bestWod invalid.*/
    268    0   yongsun         BestWordStartHere, /**< m_BestWord is valid, but not user selected.*/
    269    0   yongsun         UserSelectedBestWord /**< m_BestWord is the user selected best word.*/
    270    0   yongsun     };
    271    0   yongsun 
    272    0   yongsun     /**
    273    0   yongsun      * The best word start from here. A best word in the best sentence is
    274    0   yongsun      * an edge of the best path.
    275    0   yongsun      */
    276    0   yongsun     CCandidate                      m_BestWord;
    277    0   yongsun 
    278    0   yongsun     /**
    279    0   yongsun      * m_BWType reflects the m_BestWord status:
    280    0   yongsun      *    -# Is there a best word start here?
    281    0   yongsun      *    -# Whether or not it is user selected?
    282    0   yongsun      */
    283    0   yongsun     int                             m_BWType;
    284    0   yongsun 
    285    0   yongsun     /**
    286    0   yongsun      * Lexicon status list just before this bone.
    287    0   yongsun      */
    288    0   yongsun     CLexiconStates                  m_LexiconStates;
    289    0   yongsun 
    290    0   yongsun     /**
    291    0   yongsun      * Lattice status list just before this bone.
    292    0   yongsun      */
    293    0   yongsun     CLatticeStates                  m_LatticeNodes;
    294    0   yongsun 
    295    0   yongsun public:
    296    0   yongsun     CBoneInnerData()
    297    0   yongsun         : m_BestWord(), m_BWType(NoBestWordStartHere),
    298    0   yongsun           m_LexiconStates(), m_LatticeNodes() { }
    299    0   yongsun     /**
    300    0   yongsun      * clear LatticeNodes and LexiconStates,
    301    0   yongsun      * set the BWType to NoBestWordStartHere.
    302    0   yongsun      */
    303    0   yongsun     void
    304    0   yongsun     clear()
    305    0   yongsun         {
    306    0   yongsun             m_BWType = NoBestWordStartHere;
    307    0   yongsun             m_LatticeNodes.clear();
    308    0   yongsun             m_LexiconStates.clear();
    309    0   yongsun         }
    310    0   yongsun 
    311    0   yongsun     void
    312    0   yongsun     print(std::string& prefix);
    313    0   yongsun };
    314    0   yongsun 
    315    0   yongsun 
    316    0   yongsun #endif
    317