PocketSphinx 5prealpha
ngram_search.h
Go to the documentation of this file.
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
42#ifndef __NGRAM_SEARCH_H__
43#define __NGRAM_SEARCH_H__
44
45/* SphinxBase headers. */
46#include <sphinxbase/cmd_ln.h>
47#include <sphinxbase/logmath.h>
48#include <sphinxbase/ngram_model.h>
49#include <sphinxbase/listelem_alloc.h>
50#include <sphinxbase/err.h>
51
52/* Local headers. */
54#include "hmm.h"
55
64typedef struct chan_s {
68 struct chan_s *next;
71 struct chan_s *alt;
73 int32 ciphone;
74 union {
79 int32 rc_id;
80 } info;
82
90typedef struct root_chan_s {
96 int32 penult_phn_wid;
100 int16 ciphone;
102 int16 ci2phone;
105
109typedef struct bptbl_s {
111 uint8 valid;
112 uint8 refcnt;
113 int32 wid;
114 int32 bp;
115 int32 score;
116 int32 s_idx;
117 int32 real_wid;
122
126typedef struct bptbl_seg_s {
128 int32 *bpidx;
129 int16 n_bpidx;
130 int16 cur;
132
133/*
134 * Candidates words for entering their last phones. Cleared and rebuilt in each
135 * frame.
136 * NOTE: candidates can only be multi-phone, real dictionary words.
137 */
138typedef struct lastphn_cand_s {
139 int32 wid;
140 int32 score;
141 int32 bp;
142 int32 next; /* next candidate starting at the same frame */
144
145/*
146 * Since the same instance of a word (i.e., <word,start-frame>) reaches its last
147 * phone several times, we can compute its best BP and LM transition score info
148 * just the first time and cache it for future occurrences. Structure for such
149 * a cache.
150 */
151typedef struct {
152 int32 sf; /* Start frame */
153 int32 dscr; /* Delta-score upon entering last phone */
154 int32 bp; /* Best BP */
156
157#define CAND_SF_ALLOCSIZE 32
158typedef struct {
159 int32 bp_ef;
160 int32 cand;
161} cand_sf_t;
162
163/*
164 * Structure for reorganizing the BP table entries in the current frame according
165 * to distinct right context ci-phones. Each entry contains the best BP entry for
166 * a given right context. Each successor word will pick up the correct entry based
167 * on its first ci-phone.
168 */
169typedef struct bestbp_rc_s {
170 int32 score;
171 int32 path; /* BP table index corresponding to this entry */
172 int32 lc; /* right most ci-phone of above BP entry word */
174
175#define NO_BP -1
176
180typedef struct ngram_search_stats_s {
181 int32 n_phone_eval;
182 int32 n_root_chan_eval;
183 int32 n_nonroot_chan_eval;
184 int32 n_last_chan_eval;
185 int32 n_word_lastchan_eval;
186 int32 n_lastphn_cand_utt;
187 int32 n_fwdflat_chan;
188 int32 n_fwdflat_words;
189 int32 n_fwdflat_word_transition;
190 int32 n_senone_active_utt;
192
193
198 ps_search_t base;
199 ngram_model_t *lmset;
202 /* Flags to quickly indicate which passes are enabled. */
203 uint8 fwdtree;
204 uint8 fwdflat;
205 uint8 bestpath;
206
207 /* State of procesing. */
208 uint8 done;
209
210 /* Allocators */
211 listelem_alloc_t *chan_alloc;
212 listelem_alloc_t *root_chan_alloc;
213 listelem_alloc_t *latnode_alloc;
247 bitvec_t *word_active;
276 int32 n_active_chan[2];
288 int32 n_active_word[2];
290 /*
291 * FIXME: Document all of these bits.
292 */
293 lastphn_cand_t *lastphn_cand;
294 int32 n_lastphn_cand;
295 last_ltrans_t *last_ltrans; /* one per word */
296 int32 cand_sf_alloc;
297 cand_sf_t *cand_sf;
298 bestbp_rc_t *bestbp_rc;
299
300 bptbl_t *bp_table; /* Forward pass lattice */
301 int32 bpidx; /* First free BPTable entry */
302 int32 bp_table_size;
303 int32 *bscore_stack; /* Score stack for all possible right contexts */
304 int32 bss_head; /* First free BScoreStack entry */
305 int32 bscore_stack_size;
306
308 int32 n_frame;
309 int32 *bp_table_idx; /* First BPTable entry for each frame */
310 int32 *word_lat_idx; /* BPTable index for any word in current frame;
311 cleared before each frame */
312
313 /*
314 * Flat lexicon (2nd pass) search stuff.
315 */
318 bitvec_t *expand_word_flag;
319 int32 *expand_word_list;
320 int32 n_expand_words;
321 int32 min_ef_width;
322 int32 max_sf_win;
323 float32 fwdflat_fwdtree_lw_ratio;
324
327 int32 renormalized;
328
329 /*
330 * DAG (3rd pass) search stuff.
331 */
332 float32 bestpath_fwdtree_lw_ratio;
333 float32 ascale;
336 ptmr_t fwdtree_perf;
337 ptmr_t fwdflat_perf;
338 ptmr_t bestpath_perf;
339 int32 n_tot_frame;
340
341 /* A collection of beam widths. */
342 int32 beam;
343 int32 dynamic_beam;
344 int32 pbeam;
345 int32 wbeam;
346 int32 lpbeam;
347 int32 lponlybeam;
348 int32 fwdflatbeam;
349 int32 fwdflatwbeam;
350 int32 fillpen;
351 int32 silpen;
352 int32 wip;
353 int32 nwpen;
354 int32 pip;
355 int32 maxwpf;
356 int32 maxhmmpf;
357};
358typedef struct ngram_search_s ngram_search_t;
359
363ps_search_t *ngram_search_init(const char *name,
364 ngram_model_t *lm,
365 cmd_ln_t *config,
366 acmod_t *acmod,
367 dict_t *dict,
368 dict2pid_t *d2p);
369
374
380int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx);
381
385void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w,
386 int32 score, int32 path, int32 rc);
387
391void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w);
392
396void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w);
397
403int ngram_search_find_exit(ngram_search_t *ngs, int frame_idx, int32 *out_best_score);
404
410char const *ngram_search_bp_hyp(ngram_search_t *ngs, int bpidx);
411
416
421
425int32 ngram_search_exit_score(ngram_search_t *ngs, bptbl_t *pbe, int rcphone);
426
432void ngram_search_set_lm(ngram_model_t *lm);
433
434#endif /* __NGRAM_SEARCH_H__ */
Implementation of HMM base structure.
int32 frame_idx_t
Type for frame index values.
Definition hmm.h:64
struct ngram_search_stats_s ngram_search_stats_t
Various statistics for profiling.
void ngram_search_set_lm(ngram_model_t *lm)
Sets the global language model.
struct bptbl_seg_s bptbl_seg_t
Segmentation "iterator" for backpointer table results.
void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
int32 ngram_search_exit_score(ngram_search_t *ngs, bptbl_t *pbe, int rcphone)
Get the exit score for a backpointer entry with a given right context.
void ngram_compute_seg_scores(ngram_search_t *ngs, float32 lwf)
Compute language and acoustic scores for backpointer table entries.
ps_search_t * ngram_search_init(const char *name, ngram_model_t *lm, cmd_ln_t *config, acmod_t *acmod, dict_t *dict, dict2pid_t *d2p)
Initialize the N-Gram search module.
int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx)
Record the current frame's index in the backpointer table.
ps_lattice_t * ngram_search_lattice(ps_search_t *search)
Construct a word lattice from the current hypothesis.
struct root_chan_s root_chan_t
Lexical tree node data type for the first phone (root) of each dynamic HMM tree structure.
void ngram_search_free(ps_search_t *ngs)
Finalize the N-Gram search module.
int ngram_search_find_exit(ngram_search_t *ngs, int frame_idx, int32 *out_best_score)
Find the best word exit for the current frame in the backpointer table.
struct bptbl_s bptbl_t
Back pointer table (forward pass lattice; actually a tree)
char const * ngram_search_bp_hyp(ngram_search_t *ngs, int bpidx)
Backtrace from a given backpointer index to obtain a word hypothesis.
void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, int32 score, int32 path, int32 rc)
Enter a word in the backpointer table.
struct chan_s chan_t
Lexical tree node data type.
Internal implementation of PocketSphinx decoder.
Acoustic model structure.
Definition acmod.h:148
Back pointer table (forward pass lattice; actually a tree)
int32 wid
Word index.
int16 last2_phone
next-to-last phone of this word
uint8 valid
For absolute pruning.
int32 bp
Back Pointer.
int32 prev_real_wid
wid of second-last real word
int32 real_wid
wid of this or latest predecessor real word
int32 score
Score (best among all right contexts)
int16 last_phone
last phone of this word
int32 s_idx
Start of BScoreStack for various right contexts.
uint8 refcnt
Reference count (number of successors)
frame_idx_t frame
start or end frame
Segmentation "iterator" for backpointer table results.
int16 cur
Current position in bpidx.
int32 * bpidx
Sequence of backpointer IDs.
int16 n_bpidx
Number of backpointer IDs.
ps_seg_t base
Base structure.
Lexical tree node data type.
int32 penult_phn_wid
list of words whose last phone follows this one; this field indicates the first of the list; the rest...
struct chan_s * next
first descendant of this channel; or, in the case of the last phone of a word, the next alternative r...
int32 ciphone
ciphone for this node
struct chan_s * alt
sibling; i.e., next descendant of parent HMM
hmm_t hmm
Basic HMM structure.
int32 rc_id
right-context id for last phone of words
Building composite triphone (as well as word internal triphones) with the dictionary.
Definition dict2pid.h:84
a structure for a dictionary.
Definition dict.h:76
Shared information between a set of HMMs.
An individual HMM among the HMM search space.
N-Gram search module structure.
int32 n_nonroot_chan
Number of valid non-root channels.
int32 * single_phone_wid
list of single-phone word ids
int32 best_score
Best Viterbi path score.
float32 ascale
Acoustic score scale for posterior probabilities.
root_chan_t * rhmm_1ph
Root HMMs for single-phone words.
listelem_alloc_t * latnode_alloc
For latnode_t.
int32 n_root_chan
Number of valid root_chan.
int32 n_frame_alloc
Number of frames allocated in bp_table_idx and friends.
int32 max_nonroot_chan
Maximum possible number of non-root channels.
int32 ** active_word_list
Array of active multi-phone words for current and next frame.
int32 n_frame
Number of frames actually present.
ngram_search_stats_t st
Various statistics for profiling.
listelem_alloc_t * root_chan_alloc
For root_chan_t.
int32 n_active_word[2]
Number entries in active_word_list.
ngram_model_t * lmset
Set of language models.
int32 * fwdflat_wordlist
List of active word IDs for utterance.
chan_t ** word_chan
Channels associated with a given word (only used for right contexts, single-phone words in fwdtree se...
int32 last_phone_best_score
Best Viterbi path score for last phone.
chan_t *** active_chan_list
Array of active channels for current and next frame.
int32 n_1ph_words
Number single phone words in dict (total)
int32 n_1ph_LMwords
Number single phone dict words also in LM; these come first in single_phone_wid.
ps_latnode_t ** frm_wordlist
List of active words in each frame.
int32 * homophone_set
Each node in the HMM tree structure may point to a set of words whose last phone would follow that no...
int32 n_root_chan_alloc
Number of root_chan allocated.
listelem_alloc_t * chan_alloc
For chan_t.
int32 n_active_chan[2]
Number entries in active_chan_list.
hmm_context_t * hmmctx
HMM context.
root_chan_t * root_chan
Search structure of HMM instances.
bitvec_t * word_active
array of active flags for all words.
Various statistics for profiling.
Word graph structure used in bestpath/nbest search.
Base structure for search module.
Base structure for hypothesis segmentation iterator.
Lexical tree node data type for the first phone (root) of each dynamic HMM tree structure.
int16 ci2phone
second ciphone of this node; one root HMM for each unique right context
hmm_t hmm
Basic HMM structure.
int16 ciphone
first ciphone of this node; all words rooted at this node begin with this ciphone
chan_t * next
first descendant of this channel
int32 this_phn_wid
list of words consisting of this single phone; actually the first of the list, like penult_phn_wid; -...