PocketSphinx 5prealpha
ngram_search_fwdflat.c
Go to the documentation of this file.
1/* -*- c-basic-offset: 4; indent-tabs-mode: nil -*- */
2/* ====================================================================
3 * Copyright (c) 2008 Carnegie Mellon University. All rights
4 * reserved.
5 *
6 * Redistribution and use in source and binary forms, with or without
7 * modification, are permitted provided that the following conditions
8 * are met:
9 *
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 *
13 * 2. Redistributions in binary form must reproduce the above copyright
14 * notice, this list of conditions and the following disclaimer in
15 * the documentation and/or other materials provided with the
16 * distribution.
17 *
18 * This work was supported in part by funding from the Defense Advanced
19 * Research Projects Agency and the National Science Foundation of the
20 * United States of America, and the CMU Sphinx Speech Consortium.
21 *
22 * THIS SOFTWARE IS PROVIDED BY CARNEGIE MELLON UNIVERSITY ``AS IS'' AND
23 * ANY EXPRESSED OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
24 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
25 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL CARNEGIE MELLON UNIVERSITY
26 * NOR ITS EMPLOYEES BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
27 * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
28 * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
29 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
30 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
31 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
32 * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 *
34 * ====================================================================
35 *
36 */
37
42/* System headers. */
43#include <string.h>
44#include <assert.h>
45
46/* SphinxBase headers. */
47#include <sphinxbase/ckd_alloc.h>
48#include <sphinxbase/listelem_alloc.h>
49#include <sphinxbase/err.h>
50
51/* Local headers. */
52#include "ngram_search.h"
53#include "ps_lattice_internal.h"
54
55/* Turn this on to dump channels for debugging */
56#define __CHAN_DUMP__ 0
57#if __CHAN_DUMP__
58#define chan_v_eval(chan) hmm_dump_vit_eval(&(chan)->hmm, stderr)
59#else
60#define chan_v_eval(chan) hmm_vit_eval(&(chan)->hmm)
61#endif
62
63static void
64ngram_fwdflat_expand_all(ngram_search_t *ngs)
65{
66 int n_words, i;
67
68 /* For all "real words" (not fillers or <s>/</s>) in the dictionary,
69 *
70 * 1) Add the ones which are in the LM to the fwdflat wordlist
71 * 2) And to the expansion list (since we are expanding all)
72 */
73 ngs->n_expand_words = 0;
74 n_words = ps_search_n_words(ngs);
75 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
76 for (i = 0; i < n_words; ++i) {
77 if (!ngram_model_set_known_wid(ngs->lmset,
78 dict_basewid(ps_search_dict(ngs),i)))
79 continue;
80 ngs->fwdflat_wordlist[ngs->n_expand_words] = i;
81 ngs->expand_word_list[ngs->n_expand_words] = i;
82 bitvec_set(ngs->expand_word_flag, i);
83 ngs->n_expand_words++;
84 }
85 E_INFO("Utterance vocabulary contains %d words\n", ngs->n_expand_words);
86 ngs->expand_word_list[ngs->n_expand_words] = -1;
87 ngs->fwdflat_wordlist[ngs->n_expand_words] = -1;
88}
89
90static void
91ngram_fwdflat_allocate_1ph(ngram_search_t *ngs)
92{
93 dict_t *dict = ps_search_dict(ngs);
94 int n_words = ps_search_n_words(ngs);
95 int i, w;
96
97 /* Allocate single-phone words, since they won't have
98 * been allocated for us by fwdtree initialization. */
99 ngs->n_1ph_words = 0;
100 for (w = 0; w < n_words; w++) {
101 if (dict_is_single_phone(dict, w))
102 ++ngs->n_1ph_words;
103 }
104 ngs->single_phone_wid = ckd_calloc(ngs->n_1ph_words,
105 sizeof(*ngs->single_phone_wid));
106 ngs->rhmm_1ph = ckd_calloc(ngs->n_1ph_words, sizeof(*ngs->rhmm_1ph));
107 i = 0;
108 for (w = 0; w < n_words; w++) {
109 if (!dict_is_single_phone(dict, w))
110 continue;
111
112 /* DICT2PID location */
113 ngs->rhmm_1ph[i].ciphone = dict_first_phone(dict, w);
114 ngs->rhmm_1ph[i].ci2phone = bin_mdef_silphone(ps_search_acmod(ngs)->mdef);
115 hmm_init(ngs->hmmctx, &ngs->rhmm_1ph[i].hmm, TRUE,
116 /* ssid */ bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef,
117 ngs->rhmm_1ph[i].ciphone),
118 /* tmatid */ bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef,
119 ngs->rhmm_1ph[i].ciphone));
120 ngs->rhmm_1ph[i].next = NULL;
121 ngs->word_chan[w] = (chan_t *) &(ngs->rhmm_1ph[i]);
122 ngs->single_phone_wid[i] = w;
123 i++;
124 }
125}
126
127static void
128ngram_fwdflat_free_1ph(ngram_search_t *ngs)
129{
130 int i, w;
131 int n_words = ps_search_n_words(ngs);
132
133 for (i = w = 0; w < n_words; ++w) {
134 if (!dict_is_single_phone(ps_search_dict(ngs), w))
135 continue;
136 hmm_deinit(&ngs->rhmm_1ph[i].hmm);
137 ++i;
138 }
139 ckd_free(ngs->rhmm_1ph);
140 ngs->rhmm_1ph = NULL;
141 ckd_free(ngs->single_phone_wid);
142}
143
144void
146{
147 int n_words;
148
149 n_words = ps_search_n_words(ngs);
150 ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist));
151 ngs->expand_word_flag = bitvec_alloc(n_words);
152 ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list));
153 ngs->frm_wordlist = ckd_calloc(ngs->n_frame_alloc, sizeof(*ngs->frm_wordlist));
154 ngs->min_ef_width = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatefwid");
155 ngs->max_sf_win = cmd_ln_int32_r(ps_search_config(ngs), "-fwdflatsfwin");
156 E_INFO("fwdflat: min_ef_width = %d, max_sf_win = %d\n",
157 ngs->min_ef_width, ngs->max_sf_win);
158
159 /* No tree-search; pre-build the expansion list, including all LM words. */
160 if (!ngs->fwdtree) {
161 /* Build full expansion list from LM words. */
162 ngram_fwdflat_expand_all(ngs);
163 /* Allocate single phone words. */
164 ngram_fwdflat_allocate_1ph(ngs);
165 }
166}
167
168void
170{
171 double n_speech = (double)ngs->n_tot_frame
172 / cmd_ln_int32_r(ps_search_config(ngs), "-frate");
173
174 E_INFO("TOTAL fwdflat %.2f CPU %.3f xRT\n",
175 ngs->fwdflat_perf.t_tot_cpu,
176 ngs->fwdflat_perf.t_tot_cpu / n_speech);
177 E_INFO("TOTAL fwdflat %.2f wall %.3f xRT\n",
178 ngs->fwdflat_perf.t_tot_elapsed,
179 ngs->fwdflat_perf.t_tot_elapsed / n_speech);
180
181 /* Free single-phone words if we allocated them. */
182 if (!ngs->fwdtree) {
183 ngram_fwdflat_free_1ph(ngs);
184 }
185 ckd_free(ngs->fwdflat_wordlist);
186 bitvec_free(ngs->expand_word_flag);
187 ckd_free(ngs->expand_word_list);
188 ckd_free(ngs->frm_wordlist);
189}
190
191int
193{
194 /* Reallocate things that depend on the number of words. */
195 int n_words;
196
197 ckd_free(ngs->fwdflat_wordlist);
198 ckd_free(ngs->expand_word_list);
199 bitvec_free(ngs->expand_word_flag);
200 n_words = ps_search_n_words(ngs);
201 ngs->fwdflat_wordlist = ckd_calloc(n_words + 1, sizeof(*ngs->fwdflat_wordlist));
202 ngs->expand_word_flag = bitvec_alloc(n_words);
203 ngs->expand_word_list = ckd_calloc(n_words + 1, sizeof(*ngs->expand_word_list));
204
205 /* No tree-search; take care of the expansion list and single phone words. */
206 if (!ngs->fwdtree) {
207 /* Free single-phone words. */
208 ngram_fwdflat_free_1ph(ngs);
209 /* Reallocate word_chan. */
210 ckd_free(ngs->word_chan);
211 ngs->word_chan = ckd_calloc(dict_size(ps_search_dict(ngs)),
212 sizeof(*ngs->word_chan));
213 /* Rebuild full expansion list from LM words. */
214 ngram_fwdflat_expand_all(ngs);
215 /* Allocate single phone words. */
216 ngram_fwdflat_allocate_1ph(ngs);
217 }
218 /* Otherwise there is nothing to do since the wordlist is
219 * generated anew every utterance. */
220 return 0;
221}
222
226static void
227build_fwdflat_wordlist(ngram_search_t *ngs)
228{
229 int32 i, f, sf, ef, wid, nwd;
230 bptbl_t *bp;
231 ps_latnode_t *node, *prevnode, *nextnode;
232
233 /* No tree-search, use statically allocated wordlist. */
234 if (!ngs->fwdtree)
235 return;
236
237 memset(ngs->frm_wordlist, 0, ngs->n_frame_alloc * sizeof(*ngs->frm_wordlist));
238
239 /* Scan the backpointer table for all active words and record
240 * their exit frames. */
241 for (i = 0, bp = ngs->bp_table; i < ngs->bpidx; i++, bp++) {
242 sf = (bp->bp < 0) ? 0 : ngs->bp_table[bp->bp].frame + 1;
243 ef = bp->frame;
244 wid = bp->wid;
245
246 /* Anything that can be transitioned to in the LM can go in
247 * the word list. */
248 if (!ngram_model_set_known_wid(ngs->lmset,
249 dict_basewid(ps_search_dict(ngs), wid)))
250 continue;
251
252 /* Look for it in the wordlist. */
253 for (node = ngs->frm_wordlist[sf]; node && (node->wid != wid);
254 node = node->next);
255
256 /* Update last end frame. */
257 if (node)
258 node->lef = ef;
259 else {
260 /* New node; link to head of list */
261 node = listelem_malloc(ngs->latnode_alloc);
262 node->wid = wid;
263 node->fef = node->lef = ef;
264
265 node->next = ngs->frm_wordlist[sf];
266 ngs->frm_wordlist[sf] = node;
267 }
268 }
269
270 /* Eliminate "unlikely" words, for which there are too few end points */
271 for (f = 0; f < ngs->n_frame; f++) {
272 prevnode = NULL;
273 for (node = ngs->frm_wordlist[f]; node; node = nextnode) {
274 nextnode = node->next;
275 /* Word has too few endpoints */
276 if ((node->lef - node->fef < ngs->min_ef_width) ||
277 /* Word is </s> and doesn't actually end in last frame */
278 ((node->wid == ps_search_finish_wid(ngs)) && (node->lef < ngs->n_frame - 1))) {
279 if (!prevnode)
280 ngs->frm_wordlist[f] = nextnode;
281 else
282 prevnode->next = nextnode;
283 listelem_free(ngs->latnode_alloc, node);
284 }
285 else
286 prevnode = node;
287 }
288 }
289
290 /* Form overall wordlist for 2nd pass */
291 nwd = 0;
292 bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs));
293 for (f = 0; f < ngs->n_frame; f++) {
294 for (node = ngs->frm_wordlist[f]; node; node = node->next) {
295 if (!bitvec_is_set(ngs->word_active, node->wid)) {
296 bitvec_set(ngs->word_active, node->wid);
297 ngs->fwdflat_wordlist[nwd++] = node->wid;
298 }
299 }
300 }
301 ngs->fwdflat_wordlist[nwd] = -1;
302 E_INFO("Utterance vocabulary contains %d words\n", nwd);
303}
304
308static void
309build_fwdflat_chan(ngram_search_t *ngs)
310{
311 int32 i, wid, p;
312 root_chan_t *rhmm;
313 chan_t *hmm, *prevhmm;
314 dict_t *dict;
315 dict2pid_t *d2p;
316
317 dict = ps_search_dict(ngs);
318 d2p = ps_search_dict2pid(ngs);
319
320 /* Build word HMMs for each word in the lattice. */
321 for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) {
322 wid = ngs->fwdflat_wordlist[i];
323
324 /* Single-phone words are permanently allocated */
325 if (dict_is_single_phone(dict, wid))
326 continue;
327
328 assert(ngs->word_chan[wid] == NULL);
329
330 /* Multiplex root HMM for first phone (one root per word, flat
331 * lexicon). diphone is irrelevant here, for the time being,
332 * at least. */
333 rhmm = listelem_malloc(ngs->root_chan_alloc);
334 rhmm->ci2phone = dict_second_phone(dict, wid);
335 rhmm->ciphone = dict_first_phone(dict, wid);
336 rhmm->next = NULL;
337 hmm_init(ngs->hmmctx, &rhmm->hmm, TRUE,
338 bin_mdef_pid2ssid(ps_search_acmod(ngs)->mdef, rhmm->ciphone),
339 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, rhmm->ciphone));
340
341 /* HMMs for word-internal phones */
342 prevhmm = NULL;
343 for (p = 1; p < dict_pronlen(dict, wid) - 1; p++) {
344 hmm = listelem_malloc(ngs->chan_alloc);
345 hmm->ciphone = dict_pron(dict, wid, p);
346 hmm->info.rc_id = (p == dict_pronlen(dict, wid) - 1) ? 0 : -1;
347 hmm->next = NULL;
348 hmm_init(ngs->hmmctx, &hmm->hmm, FALSE,
349 dict2pid_internal(d2p,wid,p),
350 bin_mdef_pid2tmatid(ps_search_acmod(ngs)->mdef, hmm->ciphone));
351
352 if (prevhmm)
353 prevhmm->next = hmm;
354 else
355 rhmm->next = hmm;
356
357 prevhmm = hmm;
358 }
359
360 /* Right-context phones */
362
363 /* Link in just allocated right-context phones */
364 if (prevhmm)
365 prevhmm->next = ngs->word_chan[wid];
366 else
367 rhmm->next = ngs->word_chan[wid];
368 ngs->word_chan[wid] = (chan_t *) rhmm;
369 }
370
371}
372
373void
375{
376 root_chan_t *rhmm;
377 int i;
378
379 ptmr_reset(&ngs->fwdflat_perf);
380 ptmr_start(&ngs->fwdflat_perf);
381 build_fwdflat_wordlist(ngs);
382 build_fwdflat_chan(ngs);
383
384 ngs->bpidx = 0;
385 ngs->bss_head = 0;
386
387 for (i = 0; i < ps_search_n_words(ngs); i++)
388 ngs->word_lat_idx[i] = NO_BP;
389
390 /* Reset the permanently allocated single-phone words, since they
391 * may have junk left over in them from previous searches. */
392 for (i = 0; i < ngs->n_1ph_words; i++) {
393 int32 w = ngs->single_phone_wid[i];
394 rhmm = (root_chan_t *) ngs->word_chan[w];
395 hmm_clear(&rhmm->hmm);
396 }
397
398 /* Start search with <s>; word_chan[<s>] is permanently allocated */
399 rhmm = (root_chan_t *) ngs->word_chan[ps_search_start_wid(ngs)];
400 hmm_enter(&rhmm->hmm, 0, NO_BP, 0);
401 ngs->active_word_list[0][0] = ps_search_start_wid(ngs);
402 ngs->n_active_word[0] = 1;
403
404 ngs->best_score = 0;
405 ngs->renormalized = FALSE;
406
407 for (i = 0; i < ps_search_n_words(ngs); i++)
408 ngs->last_ltrans[i].sf = -1;
409
410 if (!ngs->fwdtree)
411 ngs->n_frame = 0;
412
413 ngs->st.n_fwdflat_chan = 0;
414 ngs->st.n_fwdflat_words = 0;
415 ngs->st.n_fwdflat_word_transition = 0;
416 ngs->st.n_senone_active_utt = 0;
417}
418
419static void
420compute_fwdflat_sen_active(ngram_search_t *ngs, int frame_idx)
421{
422 int32 i, nw, w;
423 int32 *awl;
424 root_chan_t *rhmm;
425 chan_t *hmm;
426
427 acmod_clear_active(ps_search_acmod(ngs));
428
429 nw = ngs->n_active_word[frame_idx & 0x1];
430 awl = ngs->active_word_list[frame_idx & 0x1];
431
432 for (i = 0; i < nw; i++) {
433 w = *(awl++);
434 rhmm = (root_chan_t *)ngs->word_chan[w];
435 if (hmm_frame(&rhmm->hmm) == frame_idx) {
436 acmod_activate_hmm(ps_search_acmod(ngs), &rhmm->hmm);
437 }
438
439 for (hmm = rhmm->next; hmm; hmm = hmm->next) {
440 if (hmm_frame(&hmm->hmm) == frame_idx) {
441 acmod_activate_hmm(ps_search_acmod(ngs), &hmm->hmm);
442 }
443 }
444 }
445}
446
447static void
448fwdflat_eval_chan(ngram_search_t *ngs, int frame_idx)
449{
450 int32 i, w, nw, bestscore;
451 int32 *awl;
452 root_chan_t *rhmm;
453 chan_t *hmm;
454
455 nw = ngs->n_active_word[frame_idx & 0x1];
456 awl = ngs->active_word_list[frame_idx & 0x1];
457 bestscore = WORST_SCORE;
458
459 ngs->st.n_fwdflat_words += nw;
460
461 /* Scan all active words. */
462 for (i = 0; i < nw; i++) {
463 w = *(awl++);
464 rhmm = (root_chan_t *) ngs->word_chan[w];
465 if (hmm_frame(&rhmm->hmm) == frame_idx) {
466 int32 score = chan_v_eval(rhmm);
467 if ((score BETTER_THAN bestscore) && (w != ps_search_finish_wid(ngs)))
468 bestscore = score;
469 ngs->st.n_fwdflat_chan++;
470 }
471
472 for (hmm = rhmm->next; hmm; hmm = hmm->next) {
473 if (hmm_frame(&hmm->hmm) == frame_idx) {
474 int32 score = chan_v_eval(hmm);
475 if (score BETTER_THAN bestscore)
476 bestscore = score;
477 ngs->st.n_fwdflat_chan++;
478 }
479 }
480 }
481
482 ngs->best_score = bestscore;
483}
484
485static void
486fwdflat_prune_chan(ngram_search_t *ngs, int frame_idx)
487{
488 int32 i, nw, cf, nf, w, pip, newscore, thresh, wordthresh;
489 int32 *awl;
490 root_chan_t *rhmm;
491 chan_t *hmm, *nexthmm;
492
493 cf = frame_idx;
494 nf = cf + 1;
495 nw = ngs->n_active_word[cf & 0x1];
496 awl = ngs->active_word_list[cf & 0x1];
497 bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs));
498
499 thresh = ngs->best_score + ngs->fwdflatbeam;
500 wordthresh = ngs->best_score + ngs->fwdflatwbeam;
501 pip = ngs->pip;
502 E_DEBUG(3,("frame %d thresh %d wordthresh %d\n", frame_idx, thresh, wordthresh));
503
504 /* Scan all active words. */
505 for (i = 0; i < nw; i++) {
506 w = *(awl++);
507 rhmm = (root_chan_t *) ngs->word_chan[w];
508 /* Propagate active root channels */
509 if (hmm_frame(&rhmm->hmm) == cf
510 && hmm_bestscore(&rhmm->hmm) BETTER_THAN thresh) {
511 hmm_frame(&rhmm->hmm) = nf;
512 bitvec_set(ngs->word_active, w);
513
514 /* Transitions out of root channel */
515 newscore = hmm_out_score(&rhmm->hmm);
516 if (rhmm->next) {
517 assert(!dict_is_single_phone(ps_search_dict(ngs), w));
518
519 newscore += pip;
520 if (newscore BETTER_THAN thresh) {
521 hmm = rhmm->next;
522 /* Enter all right context phones */
523 if (hmm->info.rc_id >= 0) {
524 for (; hmm; hmm = hmm->next) {
525 if ((hmm_frame(&hmm->hmm) < cf)
526 || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) {
527 hmm_enter(&hmm->hmm, newscore,
528 hmm_out_history(&rhmm->hmm), nf);
529 }
530 }
531 }
532 /* Just a normal word internal phone */
533 else {
534 if ((hmm_frame(&hmm->hmm) < cf)
535 || (newscore BETTER_THAN hmm_in_score(&hmm->hmm))) {
536 hmm_enter(&hmm->hmm, newscore,
537 hmm_out_history(&rhmm->hmm), nf);
538 }
539 }
540 }
541 }
542 else {
543 assert(dict_is_single_phone(ps_search_dict(ngs), w));
544
545 /* Word exit for single-phone words (where did their
546 * whmms come from?) (either from
547 * ngram_search_fwdtree, or from
548 * ngram_fwdflat_allocate_1ph(), that's where) */
549 if (newscore BETTER_THAN wordthresh) {
550 ngram_search_save_bp(ngs, cf, w, newscore,
551 hmm_out_history(&rhmm->hmm), 0);
552 }
553 }
554 }
555
556 /* Transitions out of non-root channels. */
557 for (hmm = rhmm->next; hmm; hmm = hmm->next) {
558 if (hmm_frame(&hmm->hmm) >= cf) {
559 /* Propagate forward HMMs inside the beam. */
560 if (hmm_bestscore(&hmm->hmm) BETTER_THAN thresh) {
561 hmm_frame(&hmm->hmm) = nf;
562 bitvec_set(ngs->word_active, w);
563
564 newscore = hmm_out_score(&hmm->hmm);
565 /* Word-internal phones */
566 if (hmm->info.rc_id < 0) {
567 newscore += pip;
568 if (newscore BETTER_THAN thresh) {
569 nexthmm = hmm->next;
570 /* Enter all right-context phones. */
571 if (nexthmm->info.rc_id >= 0) {
572 for (; nexthmm; nexthmm = nexthmm->next) {
573 if ((hmm_frame(&nexthmm->hmm) < cf)
574 || (newscore BETTER_THAN
575 hmm_in_score(&nexthmm->hmm))) {
576 hmm_enter(&nexthmm->hmm,
577 newscore,
578 hmm_out_history(&hmm->hmm),
579 nf);
580 }
581 }
582 }
583 /* Enter single word-internal phone. */
584 else {
585 if ((hmm_frame(&nexthmm->hmm) < cf)
586 || (newscore BETTER_THAN
587 hmm_in_score(&nexthmm->hmm))) {
588 hmm_enter(&nexthmm->hmm, newscore,
589 hmm_out_history(&hmm->hmm), nf);
590 }
591 }
592 }
593 }
594 /* Right-context phones - apply word beam and exit. */
595 else {
596 if (newscore BETTER_THAN wordthresh) {
597 ngram_search_save_bp(ngs, cf, w, newscore,
598 hmm_out_history(&hmm->hmm),
599 hmm->info.rc_id);
600 }
601 }
602 }
603 /* Zero out inactive HMMs. */
604 else if (hmm_frame(&hmm->hmm) != nf) {
605 hmm_clear_scores(&hmm->hmm);
606 }
607 }
608 }
609 }
610}
611
612static void
613get_expand_wordlist(ngram_search_t *ngs, int32 frm, int32 win)
614{
615 int32 f, sf, ef;
616 ps_latnode_t *node;
617
618 if (!ngs->fwdtree) {
619 ngs->st.n_fwdflat_word_transition += ngs->n_expand_words;
620 return;
621 }
622
623 sf = frm - win;
624 if (sf < 0)
625 sf = 0;
626 ef = frm + win;
627 if (ef > ngs->n_frame)
628 ef = ngs->n_frame;
629
630 bitvec_clear_all(ngs->expand_word_flag, ps_search_n_words(ngs));
631 ngs->n_expand_words = 0;
632
633 for (f = sf; f < ef; f++) {
634 for (node = ngs->frm_wordlist[f]; node; node = node->next) {
635 if (!bitvec_is_set(ngs->expand_word_flag, node->wid)) {
636 ngs->expand_word_list[ngs->n_expand_words++] = node->wid;
637 bitvec_set(ngs->expand_word_flag, node->wid);
638 }
639 }
640 }
641 ngs->expand_word_list[ngs->n_expand_words] = -1;
642 ngs->st.n_fwdflat_word_transition += ngs->n_expand_words;
643}
644
645static void
646fwdflat_word_transition(ngram_search_t *ngs, int frame_idx)
647{
648 int32 cf, nf, b, thresh, pip, i, nw, w, newscore;
649 int32 best_silrc_score = 0, best_silrc_bp = 0; /* FIXME: good defaults? */
650 bptbl_t *bp;
651 int32 *rcss;
652 root_chan_t *rhmm;
653 int32 *awl;
654 float32 lwf;
655 dict_t *dict = ps_search_dict(ngs);
656 dict2pid_t *d2p = ps_search_dict2pid(ngs);
657
658 cf = frame_idx;
659 nf = cf + 1;
660 thresh = ngs->best_score + ngs->fwdflatbeam;
661 pip = ngs->pip;
662 best_silrc_score = WORST_SCORE;
663 lwf = ngs->fwdflat_fwdtree_lw_ratio;
664
665 /* Search for all words starting within a window of this frame.
666 * These are the successors for words exiting now. */
667 get_expand_wordlist(ngs, cf, ngs->max_sf_win);
668
669 /* Scan words exited in current frame */
670 for (b = ngs->bp_table_idx[cf]; b < ngs->bpidx; b++) {
671 xwdssid_t *rssid;
672 int32 silscore;
673
674 bp = ngs->bp_table + b;
675 ngs->word_lat_idx[bp->wid] = NO_BP;
676
677 if (bp->wid == ps_search_finish_wid(ngs))
678 continue;
679
680 /* DICT2PID location */
681 /* Get the mapping from right context phone ID to index in the
682 * right context table and the bscore_stack. */
683 rcss = ngs->bscore_stack + bp->s_idx;
684 if (bp->last2_phone == -1)
685 rssid = NULL;
686 else
687 rssid = dict2pid_rssid(d2p, bp->last_phone, bp->last2_phone);
688
689 /* Transition to all successor words. */
690 for (i = 0; ngs->expand_word_list[i] >= 0; i++) {
691 int32 n_used;
692
693 w = ngs->expand_word_list[i];
694
695 /* Get the exit score we recorded in save_bwd_ptr(), or
696 * something approximating it. */
697 if (rssid)
698 newscore = rcss[rssid->cimap[dict_first_phone(dict, w)]];
699 else
700 newscore = bp->score;
701 if (newscore == WORST_SCORE)
702 continue;
703 /* FIXME: Floating point... */
704 newscore += lwf
705 * (ngram_tg_score(ngs->lmset,
706 dict_basewid(dict, w),
707 bp->real_wid,
708 bp->prev_real_wid,
709 &n_used) >> SENSCR_SHIFT);
710 newscore += pip;
711
712 /* Enter the next word */
713 if (newscore BETTER_THAN thresh) {
714 rhmm = (root_chan_t *) ngs->word_chan[w];
715 if ((hmm_frame(&rhmm->hmm) < cf)
716 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) {
717 hmm_enter(&rhmm->hmm, newscore, b, nf);
718 /* DICT2PID: This is where mpx ssids get introduced. */
719 /* Look up the ssid to use when entering this mpx triphone. */
720 hmm_mpx_ssid(&rhmm->hmm, 0) =
721 dict2pid_ldiph_lc(d2p, rhmm->ciphone, rhmm->ci2phone,
722 dict_last_phone(dict, bp->wid));
723 assert(IS_S3SSID(hmm_mpx_ssid(&rhmm->hmm, 0)));
724 E_DEBUG(6,("ssid %d(%d,%d) = %d\n",
725 rhmm->ciphone, dict_last_phone(dict, bp->wid), rhmm->ci2phone,
726 hmm_mpx_ssid(&rhmm->hmm, 0)));
727 bitvec_set(ngs->word_active, w);
728 }
729 }
730 }
731
732 /* Get the best exit into silence. */
733 if (rssid)
734 silscore = rcss[rssid->cimap[ps_search_acmod(ngs)->mdef->sil]];
735 else
736 silscore = bp->score;
737 if (silscore BETTER_THAN best_silrc_score) {
738 best_silrc_score = silscore;
739 best_silrc_bp = b;
740 }
741 }
742
743 /* Transition to <sil> */
744 newscore = best_silrc_score + ngs->silpen + pip;
745 if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) {
746 w = ps_search_silence_wid(ngs);
747 rhmm = (root_chan_t *) ngs->word_chan[w];
748 if ((hmm_frame(&rhmm->hmm) < cf)
749 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) {
750 hmm_enter(&rhmm->hmm, newscore,
751 best_silrc_bp, nf);
752 bitvec_set(ngs->word_active, w);
753 }
754 }
755 /* Transition to noise words */
756 newscore = best_silrc_score + ngs->fillpen + pip;
757 if ((newscore BETTER_THAN thresh) && (newscore BETTER_THAN WORST_SCORE)) {
758 for (w = dict_filler_start(dict); w <= dict_filler_end(dict); w++) {
759 if (w == ps_search_silence_wid(ngs))
760 continue;
761
762 rhmm = (root_chan_t *) ngs->word_chan[w];
763 /* Noise words that aren't a single phone will have NULL here. */
764 if (rhmm == NULL)
765 continue;
766 if ((hmm_frame(&rhmm->hmm) < cf)
767 || (newscore BETTER_THAN hmm_in_score(&rhmm->hmm))) {
768 hmm_enter(&rhmm->hmm, newscore,
769 best_silrc_bp, nf);
770 bitvec_set(ngs->word_active, w);
771 }
772 }
773 }
774
775 /* Reset initial channels of words that have become inactive even after word trans. */
776 nw = ngs->n_active_word[cf & 0x1];
777 awl = ngs->active_word_list[cf & 0x1];
778 for (i = 0; i < nw; i++) {
779 w = *(awl++);
780 rhmm = (root_chan_t *) ngs->word_chan[w];
781 if (hmm_frame(&rhmm->hmm) == cf) {
782 hmm_clear_scores(&rhmm->hmm);
783 }
784 }
785}
786
787static void
788fwdflat_renormalize_scores(ngram_search_t *ngs, int frame_idx, int32 norm)
789{
790 root_chan_t *rhmm;
791 chan_t *hmm;
792 int32 i, nw, cf, w, *awl;
793
794 cf = frame_idx;
795
796 /* Renormalize individual word channels */
797 nw = ngs->n_active_word[cf & 0x1];
798 awl = ngs->active_word_list[cf & 0x1];
799 for (i = 0; i < nw; i++) {
800 w = *(awl++);
801 rhmm = (root_chan_t *) ngs->word_chan[w];
802 if (hmm_frame(&rhmm->hmm) == cf) {
803 hmm_normalize(&rhmm->hmm, norm);
804 }
805 for (hmm = rhmm->next; hmm; hmm = hmm->next) {
806 if (hmm_frame(&hmm->hmm) == cf) {
807 hmm_normalize(&hmm->hmm, norm);
808 }
809 }
810 }
811
812 ngs->renormalized = TRUE;
813}
814
815int
817{
818 int16 const *senscr;
819 int32 nf, i, j;
820 int32 *nawl;
821
822 /* Activate our HMMs for the current frame if need be. */
823 if (!ps_search_acmod(ngs)->compallsen)
824 compute_fwdflat_sen_active(ngs, frame_idx);
825
826 /* Compute GMM scores for the current frame. */
827 senscr = acmod_score(ps_search_acmod(ngs), &frame_idx);
828 ngs->st.n_senone_active_utt += ps_search_acmod(ngs)->n_senone_active;
829
830 /* Mark backpointer table for current frame. */
831 ngram_search_mark_bptable(ngs, frame_idx);
832
833 /* If the best score is equal to or worse than WORST_SCORE,
834 * recognition has failed, don't bother to keep trying. */
836 return 0;
837 /* Renormalize if necessary */
838 if (ngs->best_score + (2 * ngs->beam) WORSE_THAN WORST_SCORE) {
839 E_INFO("Renormalizing Scores at frame %d, best score %d\n",
840 frame_idx, ngs->best_score);
841 fwdflat_renormalize_scores(ngs, frame_idx, ngs->best_score);
842 }
843
844 ngs->best_score = WORST_SCORE;
845 hmm_context_set_senscore(ngs->hmmctx, senscr);
846
847 /* Evaluate HMMs */
848 fwdflat_eval_chan(ngs, frame_idx);
849 /* Prune HMMs and do phone transitions. */
850 fwdflat_prune_chan(ngs, frame_idx);
851 /* Do word transitions. */
852 fwdflat_word_transition(ngs, frame_idx);
853
854 /* Create next active word list, skip fillers */
855 nf = frame_idx + 1;
856 nawl = ngs->active_word_list[nf & 0x1];
857 for (i = 0, j = 0; ngs->fwdflat_wordlist[i] >= 0; i++) {
858 int32 wid = ngs->fwdflat_wordlist[i];
859 if (bitvec_is_set(ngs->word_active, wid) && wid < ps_search_start_wid(ngs)) {
860 *(nawl++) = wid;
861 j++;
862 }
863 }
864 /* Add fillers */
865 for (i = ps_search_start_wid(ngs); i < ps_search_n_words(ngs); i++) {
866 if (bitvec_is_set(ngs->word_active, i)) {
867 *(nawl++) = i;
868 j++;
869 }
870 }
871 if (!ngs->fwdtree)
872 ++ngs->n_frame;
873 ngs->n_active_word[nf & 0x1] = j;
874
875 /* Return the number of frames processed. */
876 return 1;
877}
878
882static void
883destroy_fwdflat_wordlist(ngram_search_t *ngs)
884{
885 ps_latnode_t *node, *tnode;
886 int32 f;
887
888 if (!ngs->fwdtree)
889 return;
890
891 for (f = 0; f < ngs->n_frame; f++) {
892 for (node = ngs->frm_wordlist[f]; node; node = tnode) {
893 tnode = node->next;
894 listelem_free(ngs->latnode_alloc, node);
895 }
896 }
897}
898
902static void
903destroy_fwdflat_chan(ngram_search_t *ngs)
904{
905 int32 i, wid;
906
907 for (i = 0; ngs->fwdflat_wordlist[i] >= 0; i++) {
908 root_chan_t *rhmm;
909 chan_t *thmm;
910 wid = ngs->fwdflat_wordlist[i];
911 if (dict_is_single_phone(ps_search_dict(ngs),wid))
912 continue;
913 assert(ngs->word_chan[wid] != NULL);
914
915 /* The first HMM in ngs->word_chan[wid] was allocated with
916 * ngs->root_chan_alloc, but this will attempt to free it
917 * using ngs->chan_alloc, which will not work. Therefore we
918 * free it manually and move the list forward before handing
919 * it off. */
920 rhmm = (root_chan_t *)ngs->word_chan[wid];
921 thmm = rhmm->next;
922 listelem_free(ngs->root_chan_alloc, rhmm);
923 ngs->word_chan[wid] = thmm;
924 ngram_search_free_all_rc(ngs, wid);
925 }
926}
927
928void
930{
931 int32 cf;
932
933 destroy_fwdflat_chan(ngs);
934 destroy_fwdflat_wordlist(ngs);
935 bitvec_clear_all(ngs->word_active, ps_search_n_words(ngs));
936
937 /* This is the number of frames processed. */
938 cf = ps_search_acmod(ngs)->output_frame;
939 /* Add a mark in the backpointer table for one past the final frame. */
941
942 ptmr_stop(&ngs->fwdflat_perf);
943 /* Print out some statistics. */
944 if (cf > 0) {
945 double n_speech = (double)(cf + 1)
946 / cmd_ln_int32_r(ps_search_config(ngs), "-frate");
947 E_INFO("%8d words recognized (%d/fr)\n",
948 ngs->bpidx, (ngs->bpidx + (cf >> 1)) / (cf + 1));
949 E_INFO("%8d senones evaluated (%d/fr)\n", ngs->st.n_senone_active_utt,
950 (ngs->st.n_senone_active_utt + (cf >> 1)) / (cf + 1));
951 E_INFO("%8d channels searched (%d/fr)\n",
952 ngs->st.n_fwdflat_chan, ngs->st.n_fwdflat_chan / (cf + 1));
953 E_INFO("%8d words searched (%d/fr)\n",
954 ngs->st.n_fwdflat_words, ngs->st.n_fwdflat_words / (cf + 1));
955 E_INFO("%8d word transitions (%d/fr)\n",
956 ngs->st.n_fwdflat_word_transition,
957 ngs->st.n_fwdflat_word_transition / (cf + 1));
958 E_INFO("fwdflat %.2f CPU %.3f xRT\n",
959 ngs->fwdflat_perf.t_cpu,
960 ngs->fwdflat_perf.t_cpu / n_speech);
961 E_INFO("fwdflat %.2f wall %.3f xRT\n",
962 ngs->fwdflat_perf.t_elapsed,
963 ngs->fwdflat_perf.t_elapsed / n_speech);
964 }
965}
void acmod_activate_hmm(acmod_t *acmod, hmm_t *hmm)
Activate senones associated with an HMM.
Definition acmod.c:1213
int16 const * acmod_score(acmod_t *acmod, int *inout_frame_idx)
Score one frame of data.
Definition acmod.c:1106
void acmod_clear_active(acmod_t *acmod)
Clear set of active senones.
Definition acmod.c:1197
s3ssid_t dict2pid_internal(dict2pid_t *d2p, int32 wid, int pos)
Return the senone sequence ID for the given word position.
Definition dict2pid.c:367
#define dict2pid_rssid(d, ci, lc)
Access macros; not designed for arbitrary use.
Definition dict2pid.h:115
#define dict_size(d)
Packaged macro access to dictionary members.
Definition dict.h:151
#define dict_pron(d, w, p)
The CI phones of the word w at position p.
Definition dict.h:165
#define BETTER_THAN
Is one score better than another?
Definition hmm.h:95
#define hmm_context_set_senscore(ctx, senscr)
Change the senone score array for a context.
Definition hmm.h:227
#define WORST_SCORE
Large "bad" score.
Definition hmm.h:84
#define WORSE_THAN
Is one score worse than another?
Definition hmm.h:100
#define SENSCR_SHIFT
Shift count for senone scores.
Definition hmm.h:73
void ngram_search_free_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
void ngram_search_alloc_all_rc(ngram_search_t *ngs, int32 w)
Allocate last phone channels for all possible right contexts for word w.
int ngram_search_mark_bptable(ngram_search_t *ngs, int frame_idx)
Record the current frame's index in the backpointer table.
void ngram_search_save_bp(ngram_search_t *ngs, int frame_idx, int32 w, int32 score, int32 path, int32 rc)
Enter a word in the backpointer table.
N-Gram based multi-pass search ("FBS")
void ngram_fwdflat_start(ngram_search_t *ngs)
Start fwdflat decoding for an utterance.
void ngram_fwdflat_deinit(ngram_search_t *ngs)
Release memory associated with fwdflat decoding.
int ngram_fwdflat_reinit(ngram_search_t *ngs)
Rebuild search structures for updated language models.
void ngram_fwdflat_finish(ngram_search_t *ngs)
Finish fwdflat decoding for an utterance.
void ngram_fwdflat_init(ngram_search_t *ngs)
Initialize N-Gram search for fwdflat decoding.
int ngram_fwdflat_search(ngram_search_t *ngs, int frame_idx)
Search one frame forward in an utterance.
Word graph search implementation.
Back pointer table (forward pass lattice; actually a tree)
int32 wid
Word index.
int16 last2_phone
next-to-last phone of this word
int32 bp
Back Pointer.
int32 prev_real_wid
wid of second-last real word
int32 real_wid
wid of this or latest predecessor real word
int32 score
Score (best among all right contexts)
int16 last_phone
last phone of this word
int32 s_idx
Start of BScoreStack for various right contexts.
frame_idx_t frame
start or end frame
Lexical tree node data type.
struct chan_s * next
first descendant of this channel; or, in the case of the last phone of a word, the next alternative r...
int32 ciphone
ciphone for this node
hmm_t hmm
Basic HMM structure.
int32 rc_id
right-context id for last phone of words
Building composite triphone (as well as word internal triphones) with the dictionary.
Definition dict2pid.h:84
a structure for a dictionary.
Definition dict.h:76
N-Gram search module structure.
int32 * single_phone_wid
list of single-phone word ids
int32 best_score
Best Viterbi path score.
root_chan_t * rhmm_1ph
Root HMMs for single-phone words.
listelem_alloc_t * latnode_alloc
For latnode_t.
int32 n_frame_alloc
Number of frames allocated in bp_table_idx and friends.
int32 ** active_word_list
Array of active multi-phone words for current and next frame.
int32 n_frame
Number of frames actually present.
ngram_search_stats_t st
Various statistics for profiling.
listelem_alloc_t * root_chan_alloc
For root_chan_t.
int32 n_active_word[2]
Number entries in active_word_list.
ngram_model_t * lmset
Set of language models.
int32 * fwdflat_wordlist
List of active word IDs for utterance.
chan_t ** word_chan
Channels associated with a given word (only used for right contexts, single-phone words in fwdtree se...
int32 n_1ph_words
Number single phone words in dict (total)
ps_latnode_t ** frm_wordlist
List of active words in each frame.
listelem_alloc_t * chan_alloc
For chan_t.
hmm_context_t * hmmctx
HMM context.
bitvec_t * word_active
array of active flags for all words.
struct ps_latnode_s * next
Next node in DAG (no ordering implied)
int32 wid
Dictionary word id.
Lexical tree node data type for the first phone (root) of each dynamic HMM tree structure.
int16 ci2phone
second ciphone of this node; one root HMM for each unique right context
hmm_t hmm
Basic HMM structure.
int16 ciphone
first ciphone of this node; all words rooted at this node begin with this ciphone
chan_t * next
first descendant of this channel
cross word triphone model structure
Definition dict2pid.h:73
s3cipid_t * cimap
Index into ssid[] above for each ci phone.
Definition dict2pid.h:75