diff --git a/.editorconfig b/.editorconfig index 0aea39b4..1164c0a5 100644 --- a/.editorconfig +++ b/.editorconfig @@ -4,5 +4,5 @@ root = true indent_style = tab indent_size = 2 tab_width = 8 -trim_trailing_whitespace = true +trim_trailing_whitespace = false insert_final_newline = true diff --git a/Sample.jconf b/Sample.jconf index b419da1c..e7a6f523 100644 --- a/Sample.jconf +++ b/Sample.jconf @@ -96,14 +96,16 @@ #-rejectlong -1 # reject longer input (msec) -1 to disable #### -#### Speech detection by libfvad +#### Speech detection by WebRTC VAD (libfvad) #### -#-fvad -1 # disable libfvad -#-fvad 0 # enable on mode 0 (least aggressive to filtering out non-speech) -#-fvad 1 # enable on mode 1 (moderately aggressive to filtering out non-speech) -#-fvad 2 # enable on mode 2 (aggressive to filtering out non-speech) -#-fvad 3 # enable on mode 3 (very aggressive to filtering out non-speech) -#-fvad_param 5 0.5 # optinal parameter: smoothing frames, trigger threshold +#-fvad -1 # disable WebRTC VAD +#-fvad 0 # enable WebRTC VAD on mode 0 (least aggressive to filtering out non-speech) +#-fvad 1 # enable WebRTC VAD on mode 1 (moderately aggressive to filtering out non-speech) +#-fvad 2 # enable WebRTC VAD on mode 2 (aggressive to filtering out non-speech) +#-fvad 3 # enable WebRTC VAD on mode 3 (very aggressive to filtering out non-speech) +#-fvad_param 5 0.5 # optional parameter: smoothing frames, trigger threshold +#-agc # enable auto gain control. Should be specified with -fvad. +#-noagc # disable auto gain control. #### #### Input rejection by average power (EXPERIMENTAL) @@ -117,7 +119,7 @@ #### #### Gaussian Mixture Model #### -#### GMM will be used for input rejection by accumurated score, or +#### GMM will be used for input rejection by accumulated score, or #### for GMM-based frontend VAD when "--enable-gmm-vad" specified. 
#### #### NOTE: If you use MFCC for the GMM which is different from AM, you @@ -188,13 +190,13 @@ ## Create a new AM configuration set, and switch current to it. ## You should give a unique name. -#-AM name +#-AM name ## Create a new LM configuration set, and switch current to it. ## You should give a unique name. -#-LM name +#-LM name -## Create a new Search configuration set with AM and LM, and switch +## Create a new Search configuration set with AM and LM, and switch ## current to it. AM and LM name can be either name or ID number. #-SR name am_name_or_id lm_name_or_id @@ -208,7 +210,7 @@ ## This option is only a switcher and can be used anywhere anytime. # -GLOBAL -## This option disables the strict section checkings and back to 4.0 +## This option disables the strict section checks and back to 4.0 # -nosectioncheck ###################################################################### @@ -231,7 +233,7 @@ #-mapunk "" # word to which unknown words should be mapped #-iwspword # add a pause word to the dictionary #-iwspentry " [sp] sp sp" # word that will be added by "-iwspword" -#-sepnum 150 # num of high freq words to linearize +#-sepnum 150 # num of high freq words to linearize #-adddict dictfile # append additional word dictionary #-addword entry # append additional word entry @@ -271,7 +273,7 @@ #### the AM defines the required parameter. You can use different MFCC #### type for each AM. #### For GMM, the same parameter should be specified after "-AM_GMM" -#### +#### #### When using multiple AM, the values of "-smpPeriod", "-smpFreq", #### "-fsize" and "-fshift" should be the same among all AM. 
#### @@ -332,7 +334,7 @@ #-dnnconf file # DNN configuration file ## Others -#-htkconf configfile # load analysis settings from HTK Config file +#-htkconf configfile # load analysis settings from HTK Config file ###################################################################### #### RECOGNIZER (-SR) @@ -341,7 +343,7 @@ #### Default values for beam width and LM weights will change #### according to compile-time setup of JuliusLib and model specification. #### Please see the startup log for the actual values. -#### +#### #### #### parameter (common) @@ -387,34 +389,34 @@ #-spdur 10 # # of frames to detect a short pause #-pausemodels string # comma-separated pause model names #### for decoder-VAD -#-spmargin 40 # backstep margin at trigger up (frame) +#-spmargin 40 # back-step margin at trigger up (frame) #-spdelay 4 # decision delay at trigger up (frame) -#### +#### #### lattice output -#### +#### #-lattice # output result in word graph (aka -graphout) #-graphrange 0 # merge same words nearby, -1 to disable merge #-graphcut 80 # graph depth cut threshold (in depth) -#-graphboundloop 20 # max itertations for boundary adjustment loop -#-graphsearchdelay # activate an alternate generation algorithm +#-graphboundloop 20 # max iterations for boundary adjustment loop +#-graphsearchdelay # activate an alternate generation algorithm #-nographsearchdelay # disable "-graphsearchdelay" -#### +#### #### confusion network output -#### +#### #-confnet # enable confusion network output #-noconfnet # disable confusion network output -#### +#### #### multi-grammar output (for grammar and isolated word) -#### +#### #-multigramout # output max hypo for each grammar #-nomultigramout # disable "-multigramout" -#### +#### #### forced alignment -#### +#### #-walign # enable alignment for result at word level #-palign # enable alignment for result at phoneme level #-salign # enable alignment for result at state level diff --git a/adinrec/adinrec.c b/adinrec/adinrec.c index 
3a291743..f4f452ca 100644 --- a/adinrec/adinrec.c +++ b/adinrec/adinrec.c @@ -1,19 +1,19 @@ /** * @file adinrec.c - * + * * * @brief マイクから一発話をファイルへ記録する * - * + * * * @brief Record a speech segment from microphone to a file * - * + * * @author Akinobu LEE * @date Wed Mar 23 20:33:01 2005 * * $Revision: 1.13 $ - * + * */ /* * Copyright (c) 1991-2013 Kawahara Lab., Kyoto University @@ -35,7 +35,7 @@ static char *filename = NULL; ///< Output file name static boolean stout = FALSE; ///< True if output to stdout static boolean use_raw = FALSE; ///< Output in RAW format if TRUE -/** +/** * ヘルプを表示して終了する * Print help and exit */ @@ -53,8 +53,10 @@ opt_help(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " [-tailmargin msec] tail margin length (%d)\n", jconf->detect.tail_margin_msec); fprintf(stderr, " [-chunksize sample] chunk size for processing (%d)\n", jconf->detect.chunk_size); #ifdef HAVE_LIBFVAD - fprintf(stderr, " [-fvad] FVAD sw (-1=off, 0 - 3) (%d)\n", jconf->detect.fvad_mode); - fprintf(stderr, " [-fvad_param i f] FVAD parameter (dur/thres) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(stderr, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively) (%d)\n", jconf->detect.fvad_mode); + fprintf(stderr, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(stderr, " [-agc][-noagc] enable/disable additional AGC on WebRTC VAD\n"); + fprintf(stderr, " [-agc_param p1 ... 
p7] AGC parameters (%d %.2f %.2f %.2f %.2f %.2f %.2f)\n", jconf->detect.agc.overflow_thres , jconf->detect.agc.scale_max, jconf->detect.agc.scale_max_relative_first, jconf->detect.agc.level_factor_first, jconf->detect.agc.scale_up_rate, jconf->detect.agc.scale_down_rate, jconf->detect.agc.scale_down_overflow_rate); #endif /* HAVE_LIBFVAD */ fprintf(stderr, " [-nostrip] not strip off zero samples\n"); fprintf(stderr, " [-zmean] remove DC by zero mean\n"); @@ -83,21 +85,21 @@ opt_freq(Jconf *jconf, char *arg[], int argnum) return TRUE; } -/** +/** * * 録音されたサンプル列を処理するコールバック関数 - * + * * @param now [in] 録音されたサンプル列 * @param len [in] 長さ(サンプル数) - * + * * @return エラー時 -1,処理成功時 0,処理成功+区間終端検出時 1 を返す. * * * Callback handler of recorded sample fragments - * + * * @param now [in] recorded fragments of speech sample * @param len [in] length of above in samples - * + * * @return -1 on device error (require caller to exit and terminate input), * 0 on success (allow caller to continue), * 1 on succeeded but segmentation detected (require caller to exit but @@ -155,11 +157,11 @@ adin_callback_file(SP16 *now, int len, Recog *recog) return -1; } } - + speechlen += len; - + /* progress bar in dots */ - fprintf(stderr, "."); + fprintf(stderr, "."); return(0); } @@ -182,7 +184,7 @@ close_file() } } fprintf(stderr, "\n%d samples (%d bytes, %.2f sec.) recorded\n", speechlen, size, (float)speechlen / (float)sfreq); -} +} /* Interrupt signal handling */ static void @@ -196,21 +198,21 @@ interrupt_record(int signum) } -/** +/** * * メイン関数 - * + * * @param argc [in] 引数列の長さ * @param argv [in] 引数列 - * - * @return + * + * @return * エラー時 1,通常終了時 0 を返す. * * Main function. - * + * * @param argc [in] number of argument. * @param argv [in] array of arguments. - * + * * @return 1 on error, 0 on success. 
* */ @@ -266,7 +268,7 @@ main(int argc, char *argv[]) /* set Julius default parameters for unspecified acoustic parameters */ apply_para(&(jconf->am_root->analysis.para), &(jconf->am_root->analysis.para_default)); - + /* set some values */ jconf->input.sfreq = jconf->am_root->analysis.para.smp_freq; jconf->input.period = jconf->am_root->analysis.para.smp_period; diff --git a/adintool/README.md b/adintool/README.md index 94778509..544df242 100644 --- a/adintool/README.md +++ b/adintool/README.md @@ -22,7 +22,7 @@ GUI version: ## Description `adintool` analyzes speech input, detects speech segments skipping silence, and -records the detected segments in various ways. +records the detected segments in various ways. It accepts all Julius options. Input waveform: @@ -47,7 +47,7 @@ Output waveform / feature vector: - none This tool uses Julius's internal VAD module for speech detection. The detection -algorithm and parameters are the same as Julius. +algorithm and parameters are the same as Julius. It also accepts all Julius options. The default audio format is 16 bit, 1 channel in Microsoft WAV format. @@ -80,6 +80,12 @@ Record utterances one by one, into file "test0001.wav", "test0002.wav", ... % adintool -in mic -out file -filename test ``` +Use WebRTC-based VAD and experimental AGC. 
+ +```shell +% adintool -in mic -out file -filename test -fvad 3 -agc +``` + Record only one utterance into "test.wav" ```shell diff --git a/adintool/adintool.h b/adintool/adintool.h index 2ce31d73..8cf67d6c 100644 --- a/adintool/adintool.h +++ b/adintool/adintool.h @@ -51,6 +51,11 @@ enum{SPOUT_NONE, SPOUT_FILE, SPOUT_STDOUT, SPOUT_ADINNET, SPOUT_VECTORNET}; #define WAVE_TICK_FLAG_PROCESSED 0x01 // audio tick flag: set to indicate that an input segment was triggered down #define WAVE_TICK_FLAG_TRIGGER 0x02 +#ifdef HAVE_LIBFVAD +// audio tick flag: set to indicate that an input segment was determined as voice by fvad +#define WAVE_TICK_FLAG_FVAD_VOICED 0x04 +#endif /* HAVE_LIBFVAD */ + #ifdef AUTO_ADJUST_THRESHOLD // mean / var computing window length in seconds diff --git a/adintool/mainloop.c b/adintool/mainloop.c index 34ec6941..ba029d5f 100644 --- a/adintool/mainloop.c +++ b/adintool/mainloop.c @@ -954,6 +954,11 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) if (recog->adin->is_valid_data == TRUE) { s->flag[s->bp] |= WAVE_TICK_FLAG_PROCESSED; } +#ifdef HAVE_LIBFVAD + if (recog->adin->fvad_last_voice == TRUE) { + s->flag[s->bp] |= WAVE_TICK_FLAG_FVAD_VOICED; + } +#endif /* HAVE_LIBFVAD */ s->is_valid_flag = (recog->adin->is_valid_data == TRUE) ? 
1 : 0; #ifdef AUTO_ADJUST_THRESHOLD @@ -1098,7 +1103,7 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) #endif /* AUTO_ADJUST_THRESHOLD */ // clear screen - if (recog->jconf->preprocess.level_coef == 1.0f) { + if (recog->jconf->preprocess.level_coef != 0.0f) { // fill black SDL_SetRenderDrawColor(s->renderer, 0, 0, 0, 0xFF); } else { @@ -1163,19 +1168,32 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) j++; if (j >= s->items) j -= s->items; } - m = s->rectflags[0] & WAVE_TICK_FLAG_PROCESSED; + + short process_flag = WAVE_TICK_FLAG_PROCESSED; +#ifdef HAVE_LIBFVAD + process_flag |= WAVE_TICK_FLAG_FVAD_VOICED; +#endif + m = s->rectflags[0] & process_flag; k = 0; miny = viewport.h; startx = 0; for(i = 0; i < s->items; i++) { - if ((s->rectflags[i] & WAVE_TICK_FLAG_PROCESSED) != m) { - SDL_SetRenderDrawColor(s->renderer, 255 * m, 128, 255 - 128 * m, 255); + if ((s->rectflags[i] & process_flag) != m) { + if (m & WAVE_TICK_FLAG_PROCESSED) { + SDL_SetRenderDrawColor(s->renderer, 255, 128, 128, 255); +#ifdef HAVE_LIBFVAD + } else if (m & WAVE_TICK_FLAG_FVAD_VOICED) { + SDL_SetRenderDrawColor(s->renderer, 128, 255, 128, 255); +#endif + } else { + SDL_SetRenderDrawColor(s->renderer, 0, 128, 255, 255); + } SDL_RenderFillRects(s->renderer, &(s->rects[k]), i - k); - m = s->rectflags[i]; - if ((s->rectflags[i] & WAVE_TICK_FLAG_PROCESSED) != 0) { + if ((m & WAVE_TICK_FLAG_PROCESSED) == 0 && (s->rectflags[i] & WAVE_TICK_FLAG_PROCESSED) != 0) { startx = i; miny = viewport.h; } + m = s->rectflags[i] & process_flag; k = i; } if ((s->rectflags[i] & WAVE_TICK_FLAG_TRIGGER) != 0) { @@ -1193,9 +1211,25 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) if (miny > viewport.h - (s->rects[i].y + s->rects[i].h)) miny = viewport.h - (s->rects[i].y + s->rects[i].h); } - SDL_SetRenderDrawColor(s->renderer, 255 * m, 128, 255 - 128 * m, 255); + if (m & WAVE_TICK_FLAG_PROCESSED) { + SDL_SetRenderDrawColor(s->renderer, 255, 128, 128, 
255); + } else if (m & WAVE_TICK_FLAG_FVAD_VOICED) { + SDL_SetRenderDrawColor(s->renderer, 128, 255, 128, 255); + } else { + SDL_SetRenderDrawColor(s->renderer, 0, 128, 255, 255); + } SDL_RenderFillRects(s->renderer, &(s->rects[k]), s->items - k); +#ifdef HAVE_LIBFVAD + /* draw current scale at last */ + r.w = WAVE_TICK_WIDTH; + r.h = recog->adin->level_coef * viewport.h * 0.025; + r.x = viewport.w - r.w; + r.y = viewport.h * 0.5 - r.h; + SDL_SetRenderDrawColor(s->renderer, 255, 0, 0, 255); + SDL_RenderDrawRect(s->renderer, &r); +#endif /* HAVE_LIBFVAD */ + #ifdef AUTO_ADJUST_THRESHOLD /* draw last mean/var box */ @@ -1224,6 +1258,10 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) } + +// keep audio scale +static float stored_scale; + // check events on SDL static int sdl_check_command() @@ -1267,10 +1305,11 @@ sdl_check_command() case SDLK_m: // 'm' -> input mute if (event.key.state != SDL_PRESSED || event.key.repeat != 0) break; - if (recog->jconf->preprocess.level_coef == 1.0f) { - recog->jconf->preprocess.level_coef = recog->adin->level_coef = 0.00f; + if (recog->jconf->preprocess.level_coef != 0.0f) { + stored_scale = recog->jconf->preprocess.level_coef; + recog->jconf->preprocess.level_coef = recog->adin->level_coef = 0.0f; } else { - recog->jconf->preprocess.level_coef = recog->adin->level_coef = 1.0f; + recog->jconf->preprocess.level_coef = recog->adin->level_coef = stored_scale; } break; case SDLK_c: diff --git a/adintool/options.c b/adintool/options.c index 22c17058..3d96dbe5 100644 --- a/adintool/options.c +++ b/adintool/options.c @@ -33,7 +33,7 @@ show_help_and_exit(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " vecnet to vecnet server as feature vector (I'm client)\n"); fprintf(stderr, " stdout standard tty output\n"); fprintf(stderr, " none output nothing\n"); - + fprintf(stderr, "I/O options:\n"); #ifdef USE_NETAUDIO fprintf(stderr, " -NA (netaudio) NetAudio server host:unit\n"); @@ -47,7 +47,7 @@ 
show_help_and_exit(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, "Feature extraction options (other than in jconf):\n"); fprintf(stderr, " -paramtype desc parameter type in HTK format\n"); fprintf(stderr, " -veclen num total vector length\n"); - + fprintf(stderr, "Recording and Pause segmentation options:\n"); fprintf(stderr, " (input segmentation: on for file/mic/stdin, off for adinnet)\n"); @@ -63,10 +63,12 @@ show_help_and_exit(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " [-tailmargin msec] tail margin length (%d)\n", jconf->detect.tail_margin_msec); fprintf(stderr, " [-chunksize sample] chunk size for processing (%d)\n", jconf->detect.chunk_size); #ifdef HAVE_LIBFVAD - fprintf(stderr, " [-fvad] FVAD sw (-1=off, 0 - 3) (%d)\n", jconf->detect.fvad_mode); - fprintf(stderr, " [-fvad_param i f] FVAD parameter (dur/thres) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(stderr, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively, -1 to disable) (%d)\n", jconf->detect.fvad_mode); + fprintf(stderr, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(stderr, " [-agc][-noagc] enable/disable additional AGC on WebRTC VAD\n"); + fprintf(stderr, " [-agc_param p1 ... 
p7] AGC parameters (%d %.2f %.2f %.2f %.2f %.2f %.2f)\n", jconf->detect.agc.overflow_thres , jconf->detect.agc.scale_max, jconf->detect.agc.scale_max_relative_first, jconf->detect.agc.level_factor_first, jconf->detect.agc.scale_up_rate, jconf->detect.agc.scale_down_rate, jconf->detect.agc.scale_down_overflow_rate); #endif /* HAVE_LIBFVAD */ - + fprintf(stderr, " [-nostrip] do not strip zero samples\n"); fprintf(stderr, " [-zmean] remove DC by zero mean\n"); fprintf(stderr, " [-raw] output in RAW format\n"); @@ -74,7 +76,7 @@ show_help_and_exit(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " [-loosesync] loose sync of resume among servers\n"); fprintf(stderr, " [-rewind msec] rewind input if spoken while pause at resume\n"); fprintf(stderr, " [-C jconffile] load jconf to set parameters (ignore other options\n"); - + fprintf(stderr, "\nLibrary configuration: "); confout_version(stderr); confout_audio(stderr); @@ -346,7 +348,7 @@ void register_options_to_julius() j_add_option("-h", 0, 0, "display this help", show_help_and_exit); j_add_option("-help", 0, 0, "display this help", show_help_and_exit); j_add_option("--help", 0, 0, "display this help", show_help_and_exit); - + } /* end of options.c */ diff --git a/doc/Options.md b/doc/Options.md index ff051695..160726a0 100644 --- a/doc/Options.md +++ b/doc/Options.md @@ -329,11 +329,19 @@ small. ### -fvad mode -Set libfvad-based VAD mode. `mode` is an integer value from -1 to 3, specify -1 to disable, 0 for moderate detection, 3 for most aggressive detection (more likely to drop speech-like noises). Default value is -1 (disabled) +Enable WebRTC VAD (libfvad-based VAD) mode. Setting `mode` to 0, 1, 2 or 3 enables WebRTC based VAD. `mode` is an integer value from -1 to 3, specify -1 to disable, 0 for weakest noise rejection (accepts all speech, but often wrongly accept noises), 3 for most aggressive noise rejection. Default value is -1 (disabled) ### -fvad_param nFrame threshold -Set libfvad detailed parameter. 
`nFrame` is the number of smoothing frame. `threshold` is the threshold to detect speech trigger [0.0-1.0]. Default values are 5 and 0.5 respectively. +Set WebRTC VAD's detailed parameters. `nFrame` is the number of frames used for smoothing. `threshold` is the threshold to detect speech trigger [0.0-1.0]. Default values are 5 and 0.5, respectively. + +### -agc, -noagc + +Enable / disable supplemental auto gain control (AGC). This feature scales up captured audio automatically by looking at the input level and results of WebRTC VAD. This is soft AGC, applying no change to the hardware volume of the capture device. Requires WebRTC VAD to be enabled together, so use with `-fvad 2` or `-fvad 3`. This feature is highly experimental and not tested well. Default is disabled. (Added 2019/4/20) + +### -agc_param i1 p1 p2 p3 p4 p5 p6 + +Set AGC parameters. `i1` is a level threshold value to detect signal overflow, `p1` is the maximum allowed scale factor, `p2` is the maximum allowed scale, relative to first-utterance scale. `p3` is target level magnifying factor for first-utterance scaling, `p4` is the magnifying rate when moving the scale up, `p5` is the magnifying rate when moving scale down, `p6` is the special scaling down factor applied when detecting signal overflow. ## Input rejection options (category `GLOBAL`) diff --git a/doc/VAD.md b/doc/VAD.md index 05575f14..d1abd467 100644 --- a/doc/VAD.md +++ b/doc/VAD.md @@ -23,9 +23,10 @@ The block diagram of detection modules are as follows: ![Block diagram of VAD modules](image/vad-module.png) -All VAD detector is disabled by default for buffered processing. For stream -processing, the level and zero cross threshold detector is enabled by default. -Other detectors should be set up and enabled by options. +For stream processing, the level and zero cross threshold detector is enabled by default. +Other detectors like WebRTC detector can be enabled by options. + +For buffered processing, all VAD detectors are disabled by default. 
You can enable the detectors for buffered input by specifying option [-cutsilence](https://github.com/julius-speech/julius/blob/master/doc/Options.md#-cutsilence--nocutsilence). ## Level and zero cross threshold detector @@ -80,6 +81,8 @@ and zero cross threshold detector and run WebRTC detector only, leave it enabled i.e. "`-lv 1`" to enforce the threshold detector to always pass through the input. +After 2019/4/20, you can also test an experimental AGC (auto gain control) feature. When `-agc` is specified together with the WebRTC VAD detector, an additional auto gain control will be activated together with the WebRTC VAD. This is soft AGC that scales audio after capturing, so no change will be made to the hardware volume. This feature is highly experimental and not tested well. You can tune the parameters with the `-agc_param` option. + ## Static GMM based detector Gaussian mixture model (GMM) based speech detector. Requires the voice / noise diff --git a/libjulius/include/julius/jconf.h b/libjulius/include/julius/jconf.h index 5fbeefe4..d7af0d92 100644 --- a/libjulius/include/julius/jconf.h +++ b/libjulius/include/julius/jconf.h @@ -995,22 +995,38 @@ typedef struct __Jconf__ { * aggressive, put focus on picking up all possible speech, likely * to accepting speech-like noise part. Larger value indicates * very aggressive detection, putting focus on accepting truly - * speech only part, aggressively dropping amiguous part. + * speech only part, aggressively dropping ambiguous part. */ int fvad_mode; /** * (LIBFVAD) number of frames for smoothing. Last N frames (where 1 - * frame is fixed to 10ms) value are averaged to get stational VAD. + * frame is fixed to 10ms) value are averaged to get stable VAD. */ int fvad_smoothnum; /** - * (LIBFVAD) speech likelihood threshold value to finaly detect + * (LIBFVAD) speech likelihood threshold value to finally detect * speech trigger. Value should be between 0.0 and 1.0. Typical * value is 0.5. 
*/ float fvad_thres; + + /** + * (LIBFVAD) switch AGC + */ + boolean auto_gain_control_flag; + + struct { + int overflow_thres; ///< signal overflow threshold + float scale_max; ///< maximum allowed scale (absolute) + float scale_max_relative_first; ///< maximum allowed scale (relative to first-utterance scale) + float level_factor_first; ///< target level threshold factor for first-utterance scaling + float scale_up_rate; ///< rate when moving scale up + float scale_down_rate; ///< rate when moving scale down + float scale_down_overflow_rate; ///< rate when moving scale down by overflow + } agc; + #endif /* HAVE_LIBFVAD */ } detect; diff --git a/libjulius/include/julius/recog.h b/libjulius/include/julius/recog.h index c1c54546..7e0603c2 100644 --- a/libjulius/include/julius/recog.h +++ b/libjulius/include/julius/recog.h @@ -454,6 +454,7 @@ typedef struct __adin__ { int *fvad_lastresult; ///< working buffer to hold last N results int fvad_lastp; ///< current pointer fot lastresult buffer float fvad_thres; ///< threshold to detect speech + boolean fvad_last_voice; ///< TRUE if last result was voice #endif /* HAVE_LIBFVAD */ } ADIn; diff --git a/libjulius/src/adin-cut.c b/libjulius/src/adin-cut.c index ae7f6c3e..1ff7b7d3 100644 --- a/libjulius/src/adin-cut.c +++ b/libjulius/src/adin-cut.c @@ -5,42 +5,42 @@ * @brief 音声キャプチャおよび有音区間検出 * * 音声入力デバイスからの音声データの取り込み,および - * 音の存在する区間の検出を行ないます. + * 音の存在する区間の検出を行ないます. * - * 有音区間の検出は,振幅レベルと零交差数を用いて行ないます. + * 有音区間の検出は,振幅レベルと零交差数を用いて行ないます. * 入力断片ごとに,レベルしきい値を越える振幅について零交差数をカウントし, * それが指定した数以上になれば,音の区間開始検出として * 取り込みを開始します. 取り込み中に零交差数が指定数以下になれば, * 取り込みを停止します. 実際には頑健に切り出しを行なうため,開始部と - * 停止部の前後にマージンを持たせて切り出します. - * - * また,オプション指定 (-zmean)により DC offset の除去をここで行ないます. - * offset は最初の @a ZMEANSAMPLES 個のサンプルの平均から計算されます. + * 停止部の前後にマージンを持たせて切り出します. + * + * また,オプション指定 (-zmean)により DC offset の除去をここで行ないます. + * offset は最初の @a ZMEANSAMPLES 個のサンプルの平均から計算されます. * * 音声データの取り込みと並行して入力音声の処理を行ないます. 
このため, * 取り込んだ音声データはその取り込み単位(live入力では一定時間,音声ファイル - * ではバッファサイズ)ごとに,それらを引数としてコールバック関数が呼ばれます. + * ではバッファサイズ)ごとに,それらを引数としてコールバック関数が呼ばれます. * このコールバック関数としてデータの保存や特徴量抽出, - * (フレーム同期の)認識処理を進める関数を指定します. + * (フレーム同期の)認識処理を進める関数を指定します. * * マイク入力や NetAudio 入力などの Live 入力では, * コールバック内の処理が重く処理が入力の速度に追い付かないと, - * デバイスのバッファが溢れ,入力断片がロストする場合があります. + * デバイスのバッファが溢れ,入力断片がロストする場合があります. * このエラーを防ぐため,実行環境で pthread が使用可能である場合, - * 音声取り込み・区間検出部は本体と独立したスレッドで動作します. + * 音声取り込み・区間検出部は本体と独立したスレッドで動作します. * この場合,このスレッドは本スレッドとバッファ @a speech を介して - * 以下のように協調動作します. - * + * 以下のように協調動作します. + * * - Thread 1: 音声取り込み・音区間検出スレッド - * - デバイスから音声データを読み込みながら音区間検出を行なう. + * - デバイスから音声データを読み込みながら音区間検出を行なう. * 検出した音区間のサンプルはバッファ @a speech の末尾に逐次 - * 追加される. + * 追加される. * - このスレッドは起動時から本スレッドから独立して動作し, - * 上記の動作を行ない続ける. + * 上記の動作を行ない続ける. * - Thread 2: 音声処理・認識処理を行なう本スレッド * - バッファ @a speech を一定時間ごとに監視し,新たなサンプルが * Thread 1 によって追加されたらそれらを処理し,処理が終了した - * 分バッファを詰める. + * 分バッファを詰める. * * * @@ -96,7 +96,7 @@ * @date Sat Feb 12 13:20:53 2005 * * $Revision: 1.22 $ - * + * */ /* * Copyright (c) 1991-2013 Kawahara Lab., Kyoto University @@ -117,20 +117,20 @@ /// Define this if you want to output a debug message for threading #undef THREAD_DEBUG /// Enable some fixes relating adinnet+module -#define TMP_FIX_200602 +#define TMP_FIX_200602 -/** +/** * * @brief Set up parameters for A/D-in and input detection. * * Set variables in work area according to the configuration values. - * + * * * * @brief 音声切り出し用各種パラメータをセット * - * 設定を元に切り出し用のパラメータを計算し,ワークエリアにセットします. - * + * 設定を元に切り出し用のパラメータを計算し,ワークエリアにセットします. 
+ * * * @param adin [in] AD-in work area * @param jconf [in] configuration data @@ -157,6 +157,7 @@ adin_setup_param(ADIn *adin, Jconf *jconf) adin->adin_cut_on = adin->silence_cut_default; } adin->strip_flag = jconf->preprocess.strip_zero_sample; + if (verbose_flag == FALSE) set_strip_zero_warning(FALSE); adin->thres = jconf->detect.level_thres; #ifdef HAVE_PTHREAD if (adin->enable_thread && jconf->decodeopt.segment) { @@ -207,7 +208,7 @@ adin_setup_param(ADIn *adin, Jconf *jconf) if (adin->adin_cut_on) { init_count_zc_e(&(adin->zc), adin->c_length); } - + adin->need_init = TRUE; adin->rehash = FALSE; @@ -230,6 +231,7 @@ adin_setup_param(ADIn *adin, Jconf *jconf) adin->fvad_lastresult = (int *)mymalloc(sizeof(int) * adin->fvad_lastresultnum); for (i = 0; i < adin->fvad_lastresultnum; i++) adin->fvad_lastresult[i] = 0; adin->fvad_lastp = 0; + adin->fvad_last_voice = FALSE; } #endif /* HAVE_LIBFVAD */ @@ -237,17 +239,17 @@ adin_setup_param(ADIn *adin, Jconf *jconf) } -/** +/** * * Purge samples already processed in the temporary buffer. * * * テンポラリバッファにある処理されたサンプルをパージする. * - * + * * @param a [in] AD-in work area * @param from [in] Purge samples in range [0..from-1]. 
- * + * */ static void adin_purge(ADIn *a, int from) @@ -260,14 +262,14 @@ adin_purge(ADIn *a, int from) #ifdef HAVE_LIBFVAD /* proceed libfvad detection: return 1 for speech part, 0 for non-speech part */ -static int +static boolean fvad_proceed(ADIn *a, SP16 *speech, int samplenum) { int i, j, k; int ret, result; float sum; - if (a->fvad == NULL) return 1; + if (a->fvad == NULL) return TRUE; if (a->fvad_speechlen + samplenum > MAXSPEECHLEN) { /* buffer overflow */ @@ -283,7 +285,7 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) ret = fvad_process(a->fvad, &(a->fvad_speech[i]), a->fvad_framesize); if (ret < 0) { /* error */ - jlog("ERROR: fvad_proceed: internal error occured at fvad_process()\n"); + jlog("ERROR: fvad_proceed: internal error occurred at fvad_process()\n"); break; } a->fvad_lastresult[a->fvad_lastp] = ret; @@ -295,10 +297,10 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) sum /= (float)a->fvad_lastresultnum; /* judge */ if (sum >= a->fvad_thres) - result = 1; + a->fvad_last_voice = TRUE; else - result = 0; - + a->fvad_last_voice = FALSE; + /* flush processed samples */ k = 0; for (j = i; j < a->fvad_speechlen; j++) { @@ -306,12 +308,48 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) k++; } a->fvad_speechlen = k; - - return result; + + return a->fvad_last_voice; } #endif /* HAVE_LIBFVAD */ -/** +#ifdef HAVE_LIBFVAD +/* work area for auto gain control */ +static int fvad_cont_count = 0; /* continuous count of status keep */ +static boolean fvad_last_result = FALSE; /* keeps last fvad result */ +static int fvad_level_max = 0; /* maximum input level in cycle buffer */ +static int fvad_first_time = 0; /* flag to detect the first speech */ +static float fvad_first_rate; + +/* change scale and update cycle buffer */ +static int +update_audio_scale(Recog *recog, float scale, int totallen) { + ADIn *a = recog->adin; + int i, len; + int zc; + float totalsec; + int hour, minutes; + float second; + + totalsec = (float)totallen / 
(float)recog->jconf->input.sfreq; + hour = (int)totalsec / 3600; + minutes = (int)((totalsec - hour * 3600) / 60); + second = totalsec - hour * 3600 - minutes * 60; + + zc_copy_buffer(&(a->zc), a->cbuf, &len); + for(i = 0; i < len; i++) a->cbuf[i] = a->cbuf[i] * scale / a->level_coef; + reset_count_zc_e(&(a->zc), a->thres, a->c_length, a->c_offset); + zc = count_zc_e(&(a->zc), a->cbuf, len); + if (verbose_flag) jlog("STAT: AGC: %.2f to %.2f at %02d:%02d:%02.2f\n", recog->adin->level_coef, scale, hour, minutes, second); + recog->adin->level_coef = scale; + recog->jconf->preprocess.level_coef = scale; + + return zc; +} + +#endif /* HAVE_LIBFVAD */ + +/** * * @brief Main A/D-in and sound detection function * @@ -340,40 +378,40 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) * * When the argument "ad_check()" specified, it will be called periodically. * When it returns less than 0, this function will be terminated. - * + * * * * @brief 音声入力と音検出を行うメイン関数 * - * ここでは音声入力の取り込み,音区間の開始・終了の検出を行います. + * ここでは音声入力の取り込み,音区間の開始・終了の検出を行います. * - * スレッドモード時,この関数は独立したAD-inスレッドとしてデタッチされます. + * スレッドモード時,この関数は独立したAD-inスレッドとしてデタッチされます. * (adin_thread_create()), 音入力を検知するとこの関数はワークエリア内の * speech[] にトリガしたサンプルを記録し,かつ transfer_online を TRUE に * セットします. Julius のメイン処理スレッド (adin_go()) は * adin_thread_process() に移行し,そこで transfer_online 時に speech[] を - * 参照しながら認識処理を行います. + * 参照しながら認識処理を行います. * * 非スレッドモード時は,メイン処理関数 adin_go() は直接この関数を呼び, - * 認識処理はこの内部で直接行われます. + * 認識処理はこの内部で直接行われます. * * スレッドモードはマイク入力など,入力が無限で処理の遅延がデータの * 取りこぼしを招くような live input で用いられます. 一方,ファイル入力 - * やadinnet 入力のような buffered input では非スレッドモードが用いられます. + * やadinnet 入力のような buffered input では非スレッドモードが用いられます. * * 引数の ad_process は,取り込んだサンプルに対して処理を行う関数を * 指定します. リアルタイム認識を行う場合は,ここに第1パスの認識処理を - * 行う関数が指定されます. 返り値が 1 であれば,入力をここで区切ります. - * -1 であればエラー終了します. - * + * 行う関数が指定されます. 返り値が 1 であれば,入力をここで区切ります. + * -1 であればエラー終了します. + * * 引数の ad_check は一定処理ごとに繰り返し呼ばれる関数を指定します. この - * 関数の返り値が 0 以下だった場合,入力を即時中断して関数を終了します. 
+ * 関数の返り値が 0 以下だった場合,入力を即時中断して関数を終了します. * * * @param ad_process [in] function to process triggerted input. * @param ad_check [in] function to be called periodically. * @param recog [in] engine instance - * + * * @return 2 when input termination requested by ad_process(), 1 when * if detect end of an input segment (down trigger detected after up * trigger), 0 when reached end of input device, -1 on error, -2 when @@ -394,7 +432,9 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco int end_status = 0; /* return value */ boolean transfer_online_local; /* local repository of transfer_online */ int zc; /* count of zero cross */ - +#ifdef HAVE_LIBFVAD + boolean fv; +#endif /* HAVE_LIBFVAD */ a = recog->adin; /* @@ -404,7 +444,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco * swap buffer for re-starting after short tail silence * * Each samples are first read to buffer[], then passed to count_zc_e() - * to find trigger. Samples between trigger and end of speech are + * to find trigger. Samples between trigger and end of speech are * passed to (*ad_process) with pointer to the first sample and its length. * */ @@ -460,7 +500,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco mic input - samples exist in a device buffer tcpip input - samples exist in a socket file input - samples in a file - + Return value is the number of read samples. If no data exists in the device (in case of mic input), ad_read() will return 0. 
If reached end of stream (in case end of file or @@ -488,11 +528,11 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco a->input_side_segment = TRUE; end_status = 0; } - /* now the input has been ended, - we should not get further speech input in the next loop, + /* now the input has been ended, + we should not get further speech input in the next loop, instead just process the samples in the temporary buffer until the entire data is processed. */ - a->end_of_stream = TRUE; + a->end_of_stream = TRUE; cnt = 0; /* no new input */ /* in case the first trial of ad_read() fails, exit this loop */ if (a->bp == 0) break; @@ -543,7 +583,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco sub_zmean(&(a->buffer[a->bp]), cnt); } } - + /* current len = current samples in buffer */ a->current_len = a->bp + cnt; } @@ -591,7 +631,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco /* When not adin_cut mode, all incoming data is valid. So is_valid_data should be set to TRUE when some input first comes till this input ends. So, if some data comes, set is_valid_data to - TRUE here. */ + TRUE here. */ if (!a->adin_cut_on && a->is_valid_data == FALSE && a->current_len > 0) { a->is_valid_data = TRUE; callback_exec(CALLBACK_EVENT_SPEECH_START, recog); @@ -600,18 +640,18 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco /******************************************************/ /* prepare for processing samples in temporary buffer */ /******************************************************/ - + wstep = a->chunk_size; /* process unit (should be smaller than cycle buffer) */ /* imax: total length that should be processed at one ad_read() call */ - /* if in real-time mode and not threaded, recognition process + /* if in real-time mode and not threaded, recognition process will be called and executed as the ad_process() callback within this function. 
If the recognition speed is over the real time, processing all the input samples at the loop below may result in the significant delay of getting next input, that may result in the buffer overflow of the device (namely a microphone device will suffer from this). So, in non-threaded mode, in order to avoid buffer overflow and - input frame dropping, we will leave here by processing + input frame dropping, we will leave here by processing only one segment [0..wstep], and leave the rest in the temporary buffer. */ #ifdef HAVE_PTHREAD @@ -620,7 +660,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco #else imax = (a->current_len < wstep) ? a->current_len : wstep; /* one step */ #endif - + /* wstep: unit length for the loop below */ if (wstep > a->current_len) wstep = a->current_len; @@ -651,19 +691,99 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco /* the cycle buffer in count_zc_e() holds the last samples of (head_margin) miliseconds, and the zerocross over the threshold level are counted within the cycle buffer */ - + /* store the new data to cycle buffer and update the count */ /* return zero-cross num in the cycle buffer */ zc = count_zc_e(&(a->zc), &(a->buffer[i]), wstep); - + +#ifdef HAVE_LIBFVAD + /*********************/ + /* auto gain control */ + /*********************/ + + /* get voice/noise status from fvad */ + fv = fvad_proceed(a, &(a->buffer[i]), wstep); + if (a->fvad && recog->jconf->detect.auto_gain_control_flag) { + float scale; + int total_processed_len = a->total_captured_len - a->current_len + i + wstep - a->zc.valid_len; + /* check if voice/noise status has been kept for the entire cycle buffer */ + if (fvad_last_result == fv) { + fvad_cont_count += wstep; + /* also keep maximum level for the entire cycle buffer */ + if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; + } else { + fvad_cont_count = wstep; + fvad_level_max = a->zc.level; + } + fvad_last_result = fv; + 
+ if (a->zc.level > recog->jconf->detect.agc.overflow_thres && fvad_cont_count > a->c_length) { + /* detect input overflow at last chunk, immediately reduce the scale under the cap */ + if (verbose_flag) jlog("STAT: AGC: too loud (>%d)\n", recog->jconf->detect.agc.overflow_thres); + zc = update_audio_scale(recog, (float)recog->adin->level_coef * recog->jconf->detect.agc.scale_down_overflow_rate, total_processed_len); + /* update max after scaling */ + fvad_level_max *= recog->jconf->detect.agc.scale_down_overflow_rate; + /* does not reset detection, continues */ + } + if (fv == TRUE && fvad_cont_count > a->c_length) { + /* voice segment of a certain length found */ + if (fvad_first_time == 0) { + fvad_first_time = 1; + /* this is first time: if amplitude is below level threshold, immediately raise the scale to go over the threshold */ + scale = recog->jconf->detect.agc.level_factor_first * a->thres / fvad_level_max; + if (scale > 1.0f) { + /* set new scale */ + if (verbose_flag) jlog("STAT: AGC: first speech segment, force adjustment\n"); + if (scale > recog->jconf->detect.agc.scale_max) scale = recog->jconf->detect.agc.scale_max; + fvad_first_rate = scale; + zc = update_audio_scale(recog, scale, total_processed_len); + /* update max after scaling */ + if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; + } + } else if (fvad_level_max < a->thres) { + /* too low amplitude of the voice part, increase scale gradually */ + if (fvad_first_time == 1 && recog->adin->level_coef >= fvad_first_rate * recog->jconf->detect.agc.scale_max_relative_first) { + fvad_cont_count = 0; + } else { + scale = recog->adin->level_coef * recog->jconf->detect.agc.scale_up_rate; + if (scale > recog->jconf->detect.agc.scale_max) scale = recog->jconf->detect.agc.scale_max; + if (fvad_first_time == 1 && scale > fvad_first_rate * recog->jconf->detect.agc.scale_max_relative_first) { + scale = fvad_first_rate * recog->jconf->detect.agc.scale_max_relative_first; + } + zc = 
update_audio_scale(recog, scale, total_processed_len); + /* update max after scaling */ + if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; + } + } + /* reset detection */ + fvad_cont_count = 0; + } + if (fv == FALSE && fvad_cont_count > a->c_length) { + /* noise segment of a certain length found */ + if (fvad_level_max > a->thres) { + /* mis-detecting long noise as speech, decrease scale gradually */ + scale = recog->adin->level_coef * recog->jconf->detect.agc.scale_down_rate; + if (scale <= 0.0) { + if (verbose_flag) jlog("STAT: AGC: too small scale %f, ignored\n", scale); + } else { + zc = update_audio_scale(recog, scale, total_processed_len); + /* update max after scaling */ + fvad_level_max *= recog->jconf->detect.agc.scale_down_rate; + } + } + /* reset detection */ + fvad_cont_count = 0; + } + } +#endif /* HAVE_LIBFVAD */ if ( #ifdef HAVE_LIBFVAD /* trigger when both libfvad and julius VAD are triggered */ /* process input in libfvad and get VAD result */ - fvad_proceed(a, &(a->buffer[i]), wstep) == 1 && + fv == TRUE && #endif /* HAVE_LIBFVAD */ zc > a->noise_zerocross) { /* now triggering */ - + if (a->is_valid_data == FALSE) { /*****************************************************/ /* process off, trigger on: detect speech triggering */ @@ -738,18 +858,18 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco } } } - + } else { /* is_valid_data == TRUE */ /******************************************************/ /* process on, trigger on: we are in a speech segment */ /******************************************************/ - + if (a->nc > 0) { - + /*************************************/ /* re-triggering in trailing silence */ /*************************************/ - + #ifdef THREAD_DEBUG jlog("DEBUG: re-triggered\n"); #endif @@ -767,7 +887,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco #endif ) { #endif - + /*************************************************/ /* process swap 
buffer stored while tail silence */ /*************************************************/ @@ -819,13 +939,13 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco } #endif } - } + } } else if (a->is_valid_data == TRUE) { - + /*******************************************************/ /* process on, trigger off: processing tailing silence */ /*******************************************************/ - + #ifdef THREAD_DEBUG jlog("DEBUG: TRAILING SILENCE\n"); #endif @@ -842,21 +962,21 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco a->nc++; } } /* end of triggering handlers */ - - + + /********************************************************************/ /* process the current segment buffer[i...i+wstep] if process == on */ /********************************************************************/ - + if (a->adin_cut_on && a->is_valid_data && a->nc > 0 && a->rest_tail == 0) { - + /* The current trailing silence is now longer than the user- specified tail margin length, so the current samples should not be processed now. But if 're-triggering' occurs in the trailing silence later, they should be processed then. 
So we just store the overed samples in swapbuf[] and not process them now */ - + #ifdef THREAD_DEBUG jlog("DEBUG: tail silence over, store to swap buffer (nc=%d, rest_tail=%d, sblen=%d-%d)\n", a->nc, a->rest_tail, a->sblen, a->sblen+wstep); #endif @@ -865,7 +985,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco } memcpy(&(a->swapbuf[a->sblen]), &(a->buffer[i]), wstep * sizeof(SP16)); a->sblen += wstep; - + } else { /* we are in a normal speech segment (nc == 0), or @@ -873,7 +993,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco The current trailing silence is shorter than the user- specified tail margin length, so the current samples should be processed now as same as the normal speech segment */ - + #ifdef TMP_FIX_200602 if (!a->adin_cut_on || a->is_valid_data == TRUE) { #else @@ -944,7 +1064,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco } } /* end of current segment processing */ - + if (a->adin_cut_on && a->is_valid_data && a->nc >= a->nc_max) { /*************************************/ /* process on, trailing silence over */ @@ -979,7 +1099,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco /*********************************************************/ i += wstep; /* increment to next wstep samples */ } - + /* purge processed samples and update queue */ adin_purge(a, i); @@ -1001,7 +1121,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco end_status = (a->bp) ? 1 : 0; } } - + return(end_status); } @@ -1021,11 +1141,11 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco * * A/D-in スレッドにてトリガした入力サンプルを保存するコールバック. 
* - * + * * @param now [in] triggered fragment * @param len [in] length of above * @param recog [in] engine instance - * + * * @return always 0, to tell caller to just continue the input */ static int @@ -1059,7 +1179,7 @@ adin_store_buffer(SP16 *now, int len, Recog *recog) * * A/D-in スレッドのメイン関数. * - * + * * @param dummy [in] a dummy data, not used. */ static void @@ -1089,7 +1209,7 @@ adin_thread_input_main(void *dummy) * Start new A/D-in thread, and initialize buffer. * * - * バッファを初期化して A/D-in スレッドを開始する. + * バッファを初期化して A/D-in スレッドを開始する. * * @param recog [in] engine instance * @@ -1200,13 +1320,13 @@ adin_thread_cancel(Recog *recog) * * この関数は A/D-in スレッドによってサンプルが保存されるのを待ち, * 保存されたサンプルを順次処理していきます. 引数や返り値は adin_cut() と - * 同一です. + * 同一です. * - * + * * @param ad_process [in] function to process triggerted input. * @param ad_check [in] function to be called periodically. * @param recog [in] engine instance - * + * * @return 2 when input termination requested by ad_process(), 1 when * if detect end of an input segment (down trigger detected after up * trigger), 0 when reached end of input device, -1 on error, -2 when @@ -1330,7 +1450,7 @@ adin_thread_process(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Rec pthread_mutex_unlock(&(a->mutex)); break; } - usleep(50000); /* wait = 0.05sec*/ + usleep(50000); /* wait = 0.05sec*/ } } @@ -1360,13 +1480,13 @@ adin_thread_process(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Rec * * スレッドモードでは,この関数は adin_thead_process() を呼び出し, * 非スレッドモードでは adin_cut() を直接呼び出す. 引数や返り値は - * adin_cut() と同一である. + * adin_cut() と同一である. * - * + * * @param ad_process [in] function to process triggerted input. * @param ad_check [in] function to be called periodically. 
* @param recog [in] engine instance - * + * * @return 2 when input termination requested by ad_process(), 1 when * if detect end of an input segment (down trigger detected after up * trigger), 0 when reached end of input device, -1 on error, -2 when @@ -1374,7 +1494,7 @@ adin_thread_process(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Rec * * @callergraph * @callgraph - * + * */ int adin_go(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Recog *recog) @@ -1389,23 +1509,23 @@ adin_go(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Recog return(adin_cut(ad_process, ad_check, recog)); } -/** +/** * * Call device-specific initialization. * * - * デバイス依存の初期化関数を呼び出す. + * デバイス依存の初期化関数を呼び出す. * - * + * * @param a [in] A/D-in work area * @param freq [in] sampling frequency * @param arg [in] device-dependent argument - * + * * @return TRUE if succeeded, FALSE if failed. - * + * * @callergraph * @callgraph - * + * */ boolean adin_standby(ADIn *a, int freq, void *arg) @@ -1414,22 +1534,22 @@ adin_standby(ADIn *a, int freq, void *arg) if (a->ad_standby != NULL) return(a->ad_standby(freq, arg)); return TRUE; } -/** +/** * * Call device-specific function to begin capturing of the audio stream. * * - * 音の取り込みを開始するデバイス依存の関数を呼び出す. + * 音の取り込みを開始するデバイス依存の関数を呼び出す. * - * + * * @param a [in] A/D-in work area * @param file_or_dev_name [in] device / file path to open or NULL for default - * + * * @return TRUE on success, FALSE on failure. - * + * * @callergraph * @callgraph - * + * */ boolean adin_begin(ADIn *a, char *file_or_dev_name) @@ -1443,16 +1563,16 @@ adin_begin(ADIn *a, char *file_or_dev_name) } return TRUE; } -/** +/** * * Call device-specific function to end capturing of the audio stream. * * - * 音の取り込みを終了するデバイス依存の関数を呼び出す. + * 音の取り込みを終了するデバイス依存の関数を呼び出す. * - * + * * @param a [in] A/D-in work area - * + * * @return TRUE on success, FALSE on failure. 
* * @callergraph @@ -1468,19 +1588,19 @@ adin_end(ADIn *a) return TRUE; } -/** +/** * * Free memories of A/D-in work area. * * - * 音取り込み用ワークエリアのメモリを開放する. + * 音取り込み用ワークエリアのメモリを開放する. * - * + * * @param recog [in] engine instance * * @callergraph * @callgraph - * + * */ void adin_free_param(Recog *recog) diff --git a/libjulius/src/default.c b/libjulius/src/default.c index a4949f1b..46810f1b 100644 --- a/libjulius/src/default.c +++ b/libjulius/src/default.c @@ -87,6 +87,14 @@ jconf_set_default_values(Jconf *j) j->detect.fvad_mode = -1; j->detect.fvad_smoothnum = 5; j->detect.fvad_thres = 0.5; + j->detect.auto_gain_control_flag = FALSE; + j->detect.agc.overflow_thres = 30000; + j->detect.agc.scale_max = 25.0; + j->detect.agc.scale_max_relative_first = 3.0; + j->detect.agc.level_factor_first = 3.0; + j->detect.agc.scale_up_rate = 1.3; + j->detect.agc.scale_down_rate = 0.8; + j->detect.agc.scale_down_overflow_rate = 0.7; #endif /* HAVE_LIBFVAD */ j->preprocess.strip_zero_sample = TRUE; diff --git a/libjulius/src/m_chkparam.c b/libjulius/src/m_chkparam.c index e63c1ed5..85d587a8 100644 --- a/libjulius/src/m_chkparam.c +++ b/libjulius/src/m_chkparam.c @@ -350,22 +350,22 @@ j_jconf_finalize(Jconf *jconf) /* check parameter to be passed to libfvad */ if (jconf->detect.fvad_mode < -1) jconf->detect.fvad_mode = -1; if (jconf->detect.fvad_mode > 3) { - jlog("WARNING: m_chkparam: invalud value for \"-fvad\": %d\n", jconf->detect.fvad_mode); + jlog("WARNING: m_chkparam: invalid value for \"-fvad\": %d\n", jconf->detect.fvad_mode); jlog("WARNING: m_chkparam: maximum aggressiveness is 3, use 3 now\n"); jconf->detect.fvad_mode = 3; } if (jconf->detect.fvad_smoothnum < 1) { - jlog("WARNING: m_chkparam: invalud value for 1st arg of \"-fvad_param\": %d\n", jconf->detect.fvad_smoothnum); + jlog("WARNING: m_chkparam: invalid value for 1st arg of \"-fvad_param\": %d\n", jconf->detect.fvad_smoothnum); jlog("WARNING: m_chkparam: num should be >= 1, use 1 now\n"); jconf->detect.fvad_smoothnum 
= 1; } if (jconf->detect.fvad_thres < 0.0) { - jlog("WARNING: m_chkparam: invalud value for 2nd arg of \"-fvad_param\": %f\n", jconf->detect.fvad_thres); + jlog("WARNING: m_chkparam: invalid value for 2nd arg of \"-fvad_param\": %f\n", jconf->detect.fvad_thres); jlog("WARNING: m_chkparam: thres should be >= 0.0, use 0.0 now\n"); jconf->detect.fvad_thres = 0.0; } if (jconf->detect.fvad_thres > 1.0) { - jlog("WARNING: m_chkparam: invalud value for 2nd arg of \"-fvad_param\": %f\n", jconf->detect.fvad_thres); + jlog("WARNING: m_chkparam: invalid value for 2nd arg of \"-fvad_param\": %f\n", jconf->detect.fvad_thres); jlog("WARNING: m_chkparam: thres should be <= 1.0, use 1.0 now\n"); jconf->detect.fvad_thres = 1.0; } diff --git a/libjulius/src/m_info.c b/libjulius/src/m_info.c index aa5d4e51..ac262992 100644 --- a/libjulius/src/m_info.c +++ b/libjulius/src/m_info.c @@ -1,19 +1,19 @@ /** * @file m_info.c - * + * * * @brief システム情報の出力 * - * + * * * @brief Output system informations. * - * + * * @author Akinobu Lee * @date Thu May 12 14:14:01 2005 * * $Revision: 1.23 $ - * + * */ /* * Copyright (c) 1991-2013 Kawahara Lab., Kyoto University @@ -24,19 +24,19 @@ #include -/** +/** * * Output module overview in a global configuration variables to log. * * - * 全体設定パラメータ内のモジュール構成の概要をログに出力する. + * 全体設定パラメータ内のモジュール構成の概要をログに出力する. 
* - * + * * @param jconf [in] global configuration variables * * @callgraph * @callergraph - * + * */ void print_jconf_overview(Jconf *jconf) @@ -56,9 +56,9 @@ print_jconf_overview(Jconf *jconf) jlog(" LM=%d,", i); i = 0; for(sconf=jconf->search_root;sconf;sconf=sconf->next) i++; jlog(" SR=%d\n", i); - + jlog("\n"); - + jlog(" Acoustic Model (with input parameter spec.):\n"); for(amconf=jconf->am_root;amconf;amconf=amconf->next) { if (amconf->name[0] != '\0') { @@ -75,7 +75,7 @@ print_jconf_overview(Jconf *jconf) } } jlog("\n"); - + jlog(" Language Model:\n"); for(lmconf=jconf->lm_root;lmconf;lmconf=lmconf->next) { if (lmconf->name[0] != '\0') { @@ -218,7 +218,7 @@ print_mfcc_info(FILE *fp, MFCCCalc *mfcc, Jconf *jconf) jlog(" save cep. data to = \"%s\", update at the end of each input\n", mfcc->cmn.save_filename); } jlog("\n"); - + jlog("\t base setup from ="); if (mfcc->htk_loaded == 1 || mfcc->hmm_loaded == 1) { if (mfcc->hmm_loaded == 1) { @@ -243,16 +243,16 @@ print_mfcc_info(FILE *fp, MFCCCalc *mfcc, Jconf *jconf) } -/** +/** * - * エンジンインスタンスの全情報をログに出力する. + * エンジンインスタンスの全情報をログに出力する. * * * Output all informations of an engine instance to log. 
* * * @param recog [in] engine instance - * + * * @callgraph * @callergraph */ @@ -267,7 +267,7 @@ print_engine_info(Recog *recog) RecogProcess *r; jconf = recog->jconf; - + /* set output file pointer to fp */ fp = jlog_get_fp(); if (fp == NULL) return; @@ -277,7 +277,7 @@ print_engine_info(Recog *recog) j_put_compile_defs(fp); j_put_library_defs(fp); jlog("\n"); - + /* print current argument setting to log */ print_jconf_overview(jconf); @@ -286,7 +286,7 @@ print_engine_info(Recog *recog) /* acoustic parameter conditions for this model */ jlog("------------------------------------------------------------\n"); jlog("Speech Analysis Module(s)\n\n"); - + for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) { jlog("[MFCC%02d] for", mfcc->id); @@ -395,7 +395,7 @@ print_engine_info(Recog *recog) #else jlog("disabled\n"); #endif - + if (am->hmminfo->multipath) { jlog(" sp transition penalty = %+2.1f\n", am->config->iwsp_penalty); } @@ -475,7 +475,7 @@ print_engine_info(Recog *recog) jlog("\n"); } } - + if (lm->lmtype == LM_PROB) { if (lm->config->enable_iwspword) { jlog("\tIW-sp word added to dict= \"%s\"\n", lm->config->iwspentry); @@ -500,7 +500,7 @@ print_engine_info(Recog *recog) } } - if (lm->lmtype == LM_PROB) { + if (lm->lmtype == LM_PROB) { jlog("\t(-silhead)head sil word = "); put_voca(fp, lm->winfo, lm->winfo->head_silwid); jlog("\t(-siltail)tail sil word = "); @@ -512,7 +512,7 @@ print_engine_info(Recog *recog) jlog("\tword head = \"%s\"\n", lm->config->wordrecog_head_silence_model_name); jlog("\tword tail = \"%s\"\n", lm->config->wordrecog_tail_silence_model_name); jlog("\ttheir context name = \"%s\"\n", (lm->config->wordrecog_silence_context_name[0] == '\0') ? 
"NULL (blank)" : lm->config->wordrecog_silence_context_name); - + } } @@ -630,7 +630,7 @@ print_engine_info(Recog *recog) #endif jlog("\t(-n)search candidate num= %d\n", r->config->pass2.nbest); jlog("\t(-s) search stack size = %d\n", r->config->pass2.stack_size); - jlog("\t(-m) search overflow = after %d hypothesis poped\n", r->config->pass2.hypo_overflow); + jlog("\t(-m) search overflow = after %d hypothesis popped\n", r->config->pass2.hypo_overflow); jlog("\t 2nd pass method = "); if (r->config->graph.enabled) { #ifdef GRAPHOUT_DYNAMIC @@ -726,7 +726,7 @@ print_engine_info(Recog *recog) if (r->config->compute_only_1pass) { jlog("\tCompute only 1-pass\n"); } - + if (r->config->graph.enabled) { jlog("\n"); jlog("Graph-based output with graph-oriented search:\n"); @@ -763,7 +763,7 @@ print_engine_info(Recog *recog) #endif } - + if (r->config->successive.enabled) { jlog("\tshort pause segmentation = on\n"); jlog("\t sp duration length = %d frames\n", r->config->successive.sp_frame_duration); @@ -831,7 +831,7 @@ print_engine_info(Recog *recog) jlog("based on search-time scores\n"); #endif #endif /* CONFIDENCE_MEASURE */ - + jlog("\n"); jlog("------------------------------------------------------------\n"); @@ -949,11 +949,23 @@ print_engine_info(Recog *recog) jlog("\t chunk size = %d samples\n", jconf->detect.chunk_size); #ifdef HAVE_LIBFVAD if (jconf->detect.fvad_mode < 0) { - jlog("\t FVAD switch value = %d (disabled)\n", jconf->detect.fvad_mode); + jlog("\tWebRTC VAD operating mode = %d (disabled)\n", jconf->detect.fvad_mode); } else { - jlog("\t FVAD switch value = %d (0: moderate - 3: very aggressive to regist to noise\n", jconf->detect.fvad_mode); - jlog("\t FVAD param smoothlen = %d (%dms)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_smoothnum * 10); - jlog("\t FVAD param threshold = %.2f\n", jconf->detect.fvad_thres); + jlog("\tWebRTC VAD operating mode = %d (0-3, larger value rejects noises aggressively)\n", jconf->detect.fvad_mode); + jlog("\tWebRTC 
VAD smoothing len = %d (%dms)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_smoothnum * 10); + jlog("\tWebRTC VAD active thres = %.2f\n", jconf->detect.fvad_thres); + if (jconf->detect.auto_gain_control_flag) { + jlog("\t Auto Gain Control = enabled\n"); + jlog("\t AGC signal overflow threshold = %d\n", jconf->detect.agc.overflow_thres); + jlog("\t AGC maximum allowed scale = %.2f\n", jconf->detect.agc.scale_max); + jlog("\t AGC maximum allowed scale, relative to first-utterance scale) = %.2f\n", jconf->detect.agc.scale_max_relative_first); + jlog("\t AGC target level threshold factor for first-utterance scaling = %.2f\n", jconf->detect.agc.level_factor_first); + jlog("\t AGC rate when moving scale up = %.2f\n", jconf->detect.agc.scale_up_rate); + jlog("\t AGC rate when moving scale down = %.2f\n", jconf->detect.agc.scale_down_rate); + jlog("\t AGC rate when moving scale down at overflow = %.2f\n", jconf->detect.agc.scale_down_overflow_rate); + } else { + jlog("\t Auto Gain Control = disabled\n"); + } } #endif /* HAVE_LIBFVAD */ } else { diff --git a/libjulius/src/m_options.c b/libjulius/src/m_options.c index 8a4a45e2..fe6a3150 100644 --- a/libjulius/src/m_options.c +++ b/libjulius/src/m_options.c @@ -1363,6 +1363,31 @@ opt_parse(int argc, char *argv[], char *cwd, Jconf *jconf) GET_TMPARG; jconf->detect.fvad_thres = (float)atof(tmparg); continue; + } else if (strmatch(argv[i],"-agc")) { /* enable agc */ + if (!check_section(jconf, argv[i], JCONF_OPT_GLOBAL)) return FALSE; + jconf->detect.auto_gain_control_flag = TRUE; + continue; + } else if (strmatch(argv[i],"-noagc")) { /* disable agc */ + if (!check_section(jconf, argv[i], JCONF_OPT_GLOBAL)) return FALSE; + jconf->detect.auto_gain_control_flag = FALSE; + continue; + } else if (strmatch(argv[i],"-agc_param")) { /* disable agc */ + if (!check_section(jconf, argv[i], JCONF_OPT_GLOBAL)) return FALSE; + GET_TMPARG; + jconf->detect.agc.overflow_thres = atoi(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_max = 
(float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_max_relative_first = (float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.level_factor_first = (float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_up_rate = (float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_down_rate = (float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_down_overflow_rate = (float)atof(tmparg); + continue; #endif /* HAVE_LIBFVAD */ } if (argv[i][0] == '-' && strlen(argv[i]) == 2) { diff --git a/libjulius/src/m_usage.c b/libjulius/src/m_usage.c index 6fd3eb5b..59b2dd6c 100644 --- a/libjulius/src/m_usage.c +++ b/libjulius/src/m_usage.c @@ -1,19 +1,19 @@ /** * @file m_usage.c - * + * * * @brief ヘルプを表示する * - * + * * * @brief Print help. * - * + * * @author Akinobu Lee * @date Fri May 13 15:04:34 2005 * * $Revision: 1.25 $ - * + * */ /* * Copyright (c) 1991-2013 Kawahara Lab., Kyoto University @@ -24,14 +24,14 @@ #include -/** +/** * - * ヘルプを表示する. - * + * ヘルプを表示する. + * * * * Output help document. - * + * * * * @param fp [in] file pointer to output help @@ -39,7 +39,7 @@ * @callgraph * @callergraph * @ingroup engine - * + * */ void j_output_argument_help(FILE *fp) @@ -51,7 +51,7 @@ j_output_argument_help(FILE *fp) PLUGIN_ENTRY *p; FUNC_VOID func; #endif - + /* load default values */ jconf = j_jconf_new(); @@ -129,7 +129,7 @@ j_output_argument_help(FILE *fp) #ifdef POWER_REJECT fprintf(fp, " [-powerthres value] rejection threshold of average power (%.1f)\n", jconf->reject.powerthres); #endif - + fprintf(fp, "\n Speech Detection: (default: on=mic/net off=files)\n"); /*fprintf(fp, " [-pausesegment] turn on (force) pause detection\n");*/ /*fprintf(fp, " [-nopausesegment] turn off (force) pause detection\n");*/ @@ -141,8 +141,11 @@ j_output_argument_help(FILE *fp) fprintf(fp, " [-tailmargin msec] tail margin length in msec. 
(%d)\n", jconf->detect.tail_margin_msec); fprintf(fp, " [-chunksize sample] unit length for processing (%d)\n", jconf->detect.chunk_size); #ifdef HAVE_LIBFVAD - fprintf(fp, " [-fvad] FVAD sw (-1=off, 0-3=on / degree (%d)\n", jconf->detect.fvad_mode); - fprintf(fp, " [-fvad_param i f] FVAD parameter (dur/thres) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(fp, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively) (%d)\n", jconf->detect.fvad_mode); + fprintf(fp, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(fp, " [-agc][-noagc] enable/disable additional AGC on WebRTC VAD\n"); + fprintf(fp, " [-agc_param p1 ... p7] AGC parameters (%d %.2f %.2f %.2f %.2f %.2f %.2f)\n", jconf->detect.agc.overflow_thres , jconf->detect.agc.scale_max, jconf->detect.agc.scale_max_relative_first, jconf->detect.agc.level_factor_first, jconf->detect.agc.scale_up_rate, jconf->detect.agc.scale_down_rate, jconf->detect.agc.scale_down_overflow_rate); + #endif /* HAVE_LIBFVAD */ fprintf(fp, "\n GMM utterance verification:\n"); @@ -289,7 +292,7 @@ j_output_argument_help(FILE *fp) fprintf(fp, " [-iwspentry entry] (n-gram) word entry for \"-iwspword\" (%s)\n", IWSPENTRY_DEFAULT); fprintf(fp, " [-adddict dictfile] (n-gram) load extra dictionary\n"); fprintf(fp, " [-addentry entry] (n-gram) load extra word entry\n"); - + fprintf(fp, "\n Isolated Word Recognition:\n"); fprintf(fp, " -w file[,file2...] 
(list of) wordlist file name(s)\n"); fprintf(fp, " -wlist filename file that contains list of wordlists\n"); diff --git a/libsent/include/sent/speech.h b/libsent/include/sent/speech.h index c2a618e2..ff53a1c5 100644 --- a/libsent/include/sent/speech.h +++ b/libsent/include/sent/speech.h @@ -115,7 +115,8 @@ FILE *wrwav_open(char *filename, int sfreq); boolean wrwav_data(FILE *fp, SP16 *buf, int len); boolean wrwav_close(FILE *fp); -/* for an;z/strip.c */ +/* for anlz/strip.c */ +void set_strip_zero_warning(boolean flag); int strip_zero(SP16 a[], int len); #ifdef __cplusplus diff --git a/libsent/src/anlz/strip.c b/libsent/src/anlz/strip.c index 9ed25c4b..405fb246 100644 --- a/libsent/src/anlz/strip.c +++ b/libsent/src/anlz/strip.c @@ -29,6 +29,21 @@ /// Length of zero sample to detect as invalid sequence. #define WINDOWLEN 16 +/// log switch +static boolean strip_zero_warning = TRUE; + +/** + * Switch strip zero warning message + * + * @param flag [in] flag + * + */ +void +set_strip_zero_warning(boolean flag) +{ + strip_zero_warning = flag; +} + /** * Strip zero samples from speech data. * @@ -63,7 +78,7 @@ strip_zero(SP16 a[], int len) } } else { /* deleted (leave uncopied) */ - jlog("Warning: strip: sample %d-%d has zero value, stripped\n", bgn, src-1); + if (strip_zero_warning) jlog("Warning: strip: sample %d-%d has zero value, stripped\n", bgn, src-1); } } a[dst++] = a[src]; @@ -79,7 +94,7 @@ strip_zero(SP16 a[], int len) } } else { /* deleted (leave uncopied) */ - jlog("Warning: strip: sample %d-%d is invalid, stripped\n", bgn, src-1); + if (strip_zero_warning) jlog("Warning: strip: sample %d-%d is invalid, stripped\n", bgn, src-1); } }