From e50d2bcde2cdabd6474d93fdc84e2df1e686fe06 Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sat, 20 Apr 2019 03:38:48 +0900 Subject: [PATCH 01/12] Added experimental agc --- libjulius/src/adin-cut.c | 310 ++++++++++++++++++++++++++------------- 1 file changed, 204 insertions(+), 106 deletions(-) diff --git a/libjulius/src/adin-cut.c b/libjulius/src/adin-cut.c index ae7f6c3e..52b411f0 100644 --- a/libjulius/src/adin-cut.c +++ b/libjulius/src/adin-cut.c @@ -5,42 +5,42 @@ * @brief 音声キャプチャおよび有音区間検出 * * 音声入力デバイスからの音声データの取り込み,および - * 音の存在する区間の検出を行ないます. + * 音の存在する区間の検出を行ないます. * - * 有音区間の検出は,振幅レベルと零交差数を用いて行ないます. + * 有音区間の検出は,振幅レベルと零交差数を用いて行ないます. * 入力断片ごとに,レベルしきい値を越える振幅について零交差数をカウントし, * それが指定した数以上になれば,音の区間開始検出として * 取り込みを開始します. 取り込み中に零交差数が指定数以下になれば, * 取り込みを停止します. 実際には頑健に切り出しを行なうため,開始部と - * 停止部の前後にマージンを持たせて切り出します. - * - * また,オプション指定 (-zmean)により DC offset の除去をここで行ないます. - * offset は最初の @a ZMEANSAMPLES 個のサンプルの平均から計算されます. + * 停止部の前後にマージンを持たせて切り出します. + * + * また,オプション指定 (-zmean)により DC offset の除去をここで行ないます. + * offset は最初の @a ZMEANSAMPLES 個のサンプルの平均から計算されます. * * 音声データの取り込みと並行して入力音声の処理を行ないます. このため, * 取り込んだ音声データはその取り込み単位(live入力では一定時間,音声ファイル - * ではバッファサイズ)ごとに,それらを引数としてコールバック関数が呼ばれます. + * ではバッファサイズ)ごとに,それらを引数としてコールバック関数が呼ばれます. * このコールバック関数としてデータの保存や特徴量抽出, - * (フレーム同期の)認識処理を進める関数を指定します. + * (フレーム同期の)認識処理を進める関数を指定します. * * マイク入力や NetAudio 入力などの Live 入力では, * コールバック内の処理が重く処理が入力の速度に追い付かないと, - * デバイスのバッファが溢れ,入力断片がロストする場合があります. + * デバイスのバッファが溢れ,入力断片がロストする場合があります. * このエラーを防ぐため,実行環境で pthread が使用可能である場合, - * 音声取り込み・区間検出部は本体と独立したスレッドで動作します. + * 音声取り込み・区間検出部は本体と独立したスレッドで動作します. * この場合,このスレッドは本スレッドとバッファ @a speech を介して - * 以下のように協調動作します. - * + * 以下のように協調動作します. + * * - Thread 1: 音声取り込み・音区間検出スレッド - * - デバイスから音声データを読み込みながら音区間検出を行なう. + * - デバイスから音声データを読み込みながら音区間検出を行なう. * 検出した音区間のサンプルはバッファ @a speech の末尾に逐次 - * 追加される. + * 追加される. * - このスレッドは起動時から本スレッドから独立して動作し, - * 上記の動作を行ない続ける. + * 上記の動作を行ない続ける. 
* - Thread 2: 音声処理・認識処理を行なう本スレッド * - バッファ @a speech を一定時間ごとに監視し,新たなサンプルが * Thread 1 によって追加されたらそれらを処理し,処理が終了した - * 分バッファを詰める. + * 分バッファを詰める. * * * @@ -96,7 +96,7 @@ * @date Sat Feb 12 13:20:53 2005 * * $Revision: 1.22 $ - * + * */ /* * Copyright (c) 1991-2013 Kawahara Lab., Kyoto University @@ -117,20 +117,20 @@ /// Define this if you want to output a debug message for threading #undef THREAD_DEBUG /// Enable some fixes relating adinnet+module -#define TMP_FIX_200602 +#define TMP_FIX_200602 -/** +/** * * @brief Set up parameters for A/D-in and input detection. * * Set variables in work area according to the configuration values. - * + * * * * @brief 音声切り出し用各種パラメータをセット * - * 設定を元に切り出し用のパラメータを計算し,ワークエリアにセットします. - * + * 設定を元に切り出し用のパラメータを計算し,ワークエリアにセットします. + * * * @param adin [in] AD-in work area * @param jconf [in] configuration data @@ -207,7 +207,7 @@ adin_setup_param(ADIn *adin, Jconf *jconf) if (adin->adin_cut_on) { init_count_zc_e(&(adin->zc), adin->c_length); } - + adin->need_init = TRUE; adin->rehash = FALSE; @@ -237,17 +237,17 @@ adin_setup_param(ADIn *adin, Jconf *jconf) } -/** +/** * * Purge samples already processed in the temporary buffer. * * * テンポラリバッファにある処理されたサンプルをパージする. * - * + * * @param a [in] AD-in work area * @param from [in] Purge samples in range [0..from-1]. 
- * + * */ static void adin_purge(ADIn *a, int from) @@ -298,7 +298,7 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) result = 1; else result = 0; - + /* flush processed samples */ k = 0; for (j = i; j < a->fvad_speechlen; j++) { @@ -306,12 +306,43 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) k++; } a->fvad_speechlen = k; - + return result; } #endif /* HAVE_LIBFVAD */ -/** +#ifdef HAVE_LIBFVAD +/* work area for auto gain control */ +static int fvad_cont_count = 0; /* continuous count of status keep */ +static int fvad_last_result = 0; /* keeps last fvad result */ +static int fvad_level_max = 0; /* maximum input level in cycle buffer */ +static int fvad_first_time = 0; /* flag to detect the first speech */ + +/* defines for auto gain control */ +#define FVAD_AGC_CAP 30000 /* upper cap */ +#define FVAD_AGC_INC_COEF 1.2 /* scale increase coef */ +#define FVAD_AGC_DEC_COEF 0.8 /* scale decrease coef */ + +/* change scale and update cycle buffer */ +static int +update_audio_scale(Recog *recog, float scale) { + ADIn *a = recog->adin; + int i, len; + int zc; + + zc_copy_buffer(&(a->zc), a->cbuf, &len); + for(i = 0; i < len; i++) a->cbuf[i] = a->cbuf[i] * scale / a->level_coef; + reset_count_zc_e(&(a->zc), a->thres, a->c_length, a->c_offset); + zc = count_zc_e(&(a->zc), a->cbuf, len); + jlog("INFO: audio scale adjusted from %f to %f toward threshold %d\n", recog->adin->level_coef, scale, a->thres); + recog->adin->level_coef = scale; + recog->jconf->preprocess.level_coef = scale; + + return zc; +} +#endif /* HAVE_LIBFVAD */ + +/** * * @brief Main A/D-in and sound detection function * @@ -340,40 +371,40 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) * * When the argument "ad_check()" specified, it will be called periodically. * When it returns less than 0, this function will be terminated. - * + * * * * @brief 音声入力と音検出を行うメイン関数 * - * ここでは音声入力の取り込み,音区間の開始・終了の検出を行います. + * ここでは音声入力の取り込み,音区間の開始・終了の検出を行います. * - * スレッドモード時,この関数は独立したAD-inスレッドとしてデタッチされます. 
+ * スレッドモード時,この関数は独立したAD-inスレッドとしてデタッチされます. * (adin_thread_create()), 音入力を検知するとこの関数はワークエリア内の * speech[] にトリガしたサンプルを記録し,かつ transfer_online を TRUE に * セットします. Julius のメイン処理スレッド (adin_go()) は * adin_thread_process() に移行し,そこで transfer_online 時に speech[] を - * 参照しながら認識処理を行います. + * 参照しながら認識処理を行います. * * 非スレッドモード時は,メイン処理関数 adin_go() は直接この関数を呼び, - * 認識処理はこの内部で直接行われます. + * 認識処理はこの内部で直接行われます. * * スレッドモードはマイク入力など,入力が無限で処理の遅延がデータの * 取りこぼしを招くような live input で用いられます. 一方,ファイル入力 - * やadinnet 入力のような buffered input では非スレッドモードが用いられます. + * やadinnet 入力のような buffered input では非スレッドモードが用いられます. * * 引数の ad_process は,取り込んだサンプルに対して処理を行う関数を * 指定します. リアルタイム認識を行う場合は,ここに第1パスの認識処理を - * 行う関数が指定されます. 返り値が 1 であれば,入力をここで区切ります. - * -1 であればエラー終了します. - * + * 行う関数が指定されます. 返り値が 1 であれば,入力をここで区切ります. + * -1 であればエラー終了します. + * * 引数の ad_check は一定処理ごとに繰り返し呼ばれる関数を指定します. この - * 関数の返り値が 0 以下だった場合,入力を即時中断して関数を終了します. + * 関数の返り値が 0 以下だった場合,入力を即時中断して関数を終了します. * * * @param ad_process [in] function to process triggerted input. * @param ad_check [in] function to be called periodically. * @param recog [in] engine instance - * + * * @return 2 when input termination requested by ad_process(), 1 when * if detect end of an input segment (down trigger detected after up * trigger), 0 when reached end of input device, -1 on error, -2 when @@ -394,7 +425,9 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco int end_status = 0; /* return value */ boolean transfer_online_local; /* local repository of transfer_online */ int zc; /* count of zero cross */ - +#ifdef HAVE_LIBFVAD + int fv; +#endif /* HAVE_LIBFVAD */ a = recog->adin; /* @@ -404,7 +437,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco * swap buffer for re-starting after short tail silence * * Each samples are first read to buffer[], then passed to count_zc_e() - * to find trigger. Samples between trigger and end of speech are + * to find trigger. 
Samples between trigger and end of speech are * passed to (*ad_process) with pointer to the first sample and its length. * */ @@ -460,7 +493,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco mic input - samples exist in a device buffer tcpip input - samples exist in a socket file input - samples in a file - + Return value is the number of read samples. If no data exists in the device (in case of mic input), ad_read() will return 0. If reached end of stream (in case end of file or @@ -488,11 +521,11 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco a->input_side_segment = TRUE; end_status = 0; } - /* now the input has been ended, - we should not get further speech input in the next loop, + /* now the input has been ended, + we should not get further speech input in the next loop, instead just process the samples in the temporary buffer until the entire data is processed. */ - a->end_of_stream = TRUE; + a->end_of_stream = TRUE; cnt = 0; /* no new input */ /* in case the first trial of ad_read() fails, exit this loop */ if (a->bp == 0) break; @@ -543,7 +576,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco sub_zmean(&(a->buffer[a->bp]), cnt); } } - + /* current len = current samples in buffer */ a->current_len = a->bp + cnt; } @@ -591,7 +624,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco /* When not adin_cut mode, all incoming data is valid. So is_valid_data should be set to TRUE when some input first comes till this input ends. So, if some data comes, set is_valid_data to - TRUE here. */ + TRUE here. 
*/ if (!a->adin_cut_on && a->is_valid_data == FALSE && a->current_len > 0) { a->is_valid_data = TRUE; callback_exec(CALLBACK_EVENT_SPEECH_START, recog); @@ -600,18 +633,18 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco /******************************************************/ /* prepare for processing samples in temporary buffer */ /******************************************************/ - + wstep = a->chunk_size; /* process unit (should be smaller than cycle buffer) */ /* imax: total length that should be processed at one ad_read() call */ - /* if in real-time mode and not threaded, recognition process + /* if in real-time mode and not threaded, recognition process will be called and executed as the ad_process() callback within this function. If the recognition speed is over the real time, processing all the input samples at the loop below may result in the significant delay of getting next input, that may result in the buffer overflow of the device (namely a microphone device will suffer from this). So, in non-threaded mode, in order to avoid buffer overflow and - input frame dropping, we will leave here by processing + input frame dropping, we will leave here by processing only one segment [0..wstep], and leave the rest in the temporary buffer. */ #ifdef HAVE_PTHREAD @@ -620,7 +653,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco #else imax = (a->current_len < wstep) ? 
a->current_len : wstep; /* one step */ #endif - + /* wstep: unit length for the loop below */ if (wstep > a->current_len) wstep = a->current_len; @@ -651,19 +684,84 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco /* the cycle buffer in count_zc_e() holds the last samples of (head_margin) miliseconds, and the zerocross over the threshold level are counted within the cycle buffer */ - + /* store the new data to cycle buffer and update the count */ /* return zero-cross num in the cycle buffer */ zc = count_zc_e(&(a->zc), &(a->buffer[i]), wstep); - + +#ifdef HAVE_LIBFVAD + /*********************/ + /* auto gain control */ + /*********************/ + + /* get voice/noise status from fvad */ + fv = fvad_proceed(a, &(a->buffer[i]), wstep); + if (a->fvad) { + float scale; + + /* check if voice/noise status has been kept for the entire cycle buffer */ + if (fvad_last_result == fv) { + fvad_cont_count += wstep; + /* also keep maximum level for the entire cycle buffer */ + if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; + } else { + fvad_cont_count = wstep; + fvad_level_max = a->zc.level; + } + fvad_last_result = fv; + + if (a->zc.level > FVAD_AGC_CAP && fvad_cont_count > a->c_length) { + /* detect input overflow at last chunk, immediately reduce the scale under the cap */ + zc = update_audio_scale(recog, (float)FVAD_AGC_CAP / a->zc.level); + /* reset detection */ + fvad_cont_count = 0; + } + if (fv == 1 && fvad_cont_count > a->c_length) { + /* voice segment of a certain length found */ + if (fvad_first_time == 0) { + fvad_first_time = 1; + /* this is first time: if amplitude is below level threshold, immediately raise the scale to go over the threshold */ + scale = 2.0 * a->thres / fvad_level_max; + if (scale > 1.0f) { + /* set new scale */ + zc = update_audio_scale(recog, scale); + printf("first scale=%f to %f, level=%d to %d, thres=%d\n", recog->adin->level_coef, scale, fvad_level_max, a->zc.level, a->thres); + /* update 
max after scaling */ + if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; + } + /* reset detection */ + fvad_cont_count = 0; + } else if (fvad_level_max < a->thres * 2.0) { + /* too low amplitude of the voice part, increase scale gradually */ + scale = recog->adin->level_coef * FVAD_AGC_INC_COEF; + zc = update_audio_scale(recog, scale); + printf("up scale=%f to %f, level=%d to %d, thres=%d\n", recog->adin->level_coef, scale, fvad_level_max, a->zc.level, a->thres); + /* update max after scaling */ + if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; + /* does not reset detection, continues */ + } + } + if (fv == 0 && fvad_cont_count > a->c_length) { + /* noise segment of a certain length found */ + if (fvad_level_max > a->thres * 2.0) { + /* mis-detecting long noise as speech, decrease scale gradually */ + scale = recog->adin->level_coef * FVAD_AGC_DEC_COEF; + zc = update_audio_scale(recog, scale); + /* update max after scaling */ + fvad_level_max *= FVAD_AGC_DEC_COEF; + printf("down scale=%f\n", recog->adin->level_coef); + } + } + } +#endif /* HAVE_LIBFVAD */ if ( #ifdef HAVE_LIBFVAD /* trigger when both libfvad and julius VAD are triggered */ /* process input in libfvad and get VAD result */ - fvad_proceed(a, &(a->buffer[i]), wstep) == 1 && + fv == 1 && #endif /* HAVE_LIBFVAD */ zc > a->noise_zerocross) { /* now triggering */ - + if (a->is_valid_data == FALSE) { /*****************************************************/ /* process off, trigger on: detect speech triggering */ @@ -738,18 +836,18 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco } } } - + } else { /* is_valid_data == TRUE */ /******************************************************/ /* process on, trigger on: we are in a speech segment */ /******************************************************/ - + if (a->nc > 0) { - + /*************************************/ /* re-triggering in trailing silence */ /*************************************/ - + 
#ifdef THREAD_DEBUG jlog("DEBUG: re-triggered\n"); #endif @@ -767,7 +865,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco #endif ) { #endif - + /*************************************************/ /* process swap buffer stored while tail silence */ /*************************************************/ @@ -819,13 +917,13 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco } #endif } - } + } } else if (a->is_valid_data == TRUE) { - + /*******************************************************/ /* process on, trigger off: processing tailing silence */ /*******************************************************/ - + #ifdef THREAD_DEBUG jlog("DEBUG: TRAILING SILENCE\n"); #endif @@ -842,21 +940,21 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco a->nc++; } } /* end of triggering handlers */ - - + + /********************************************************************/ /* process the current segment buffer[i...i+wstep] if process == on */ /********************************************************************/ - + if (a->adin_cut_on && a->is_valid_data && a->nc > 0 && a->rest_tail == 0) { - + /* The current trailing silence is now longer than the user- specified tail margin length, so the current samples should not be processed now. But if 're-triggering' occurs in the trailing silence later, they should be processed then. 
So we just store the overed samples in swapbuf[] and not process them now */ - + #ifdef THREAD_DEBUG jlog("DEBUG: tail silence over, store to swap buffer (nc=%d, rest_tail=%d, sblen=%d-%d)\n", a->nc, a->rest_tail, a->sblen, a->sblen+wstep); #endif @@ -865,7 +963,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco } memcpy(&(a->swapbuf[a->sblen]), &(a->buffer[i]), wstep * sizeof(SP16)); a->sblen += wstep; - + } else { /* we are in a normal speech segment (nc == 0), or @@ -873,7 +971,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco The current trailing silence is shorter than the user- specified tail margin length, so the current samples should be processed now as same as the normal speech segment */ - + #ifdef TMP_FIX_200602 if (!a->adin_cut_on || a->is_valid_data == TRUE) { #else @@ -944,7 +1042,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco } } /* end of current segment processing */ - + if (a->adin_cut_on && a->is_valid_data && a->nc >= a->nc_max) { /*************************************/ /* process on, trailing silence over */ @@ -979,7 +1077,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco /*********************************************************/ i += wstep; /* increment to next wstep samples */ } - + /* purge processed samples and update queue */ adin_purge(a, i); @@ -1001,7 +1099,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco end_status = (a->bp) ? 1 : 0; } } - + return(end_status); } @@ -1021,11 +1119,11 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco * * A/D-in スレッドにてトリガした入力サンプルを保存するコールバック. 
* - * + * * @param now [in] triggered fragment * @param len [in] length of above * @param recog [in] engine instance - * + * * @return always 0, to tell caller to just continue the input */ static int @@ -1059,7 +1157,7 @@ adin_store_buffer(SP16 *now, int len, Recog *recog) * * A/D-in スレッドのメイン関数. * - * + * * @param dummy [in] a dummy data, not used. */ static void @@ -1089,7 +1187,7 @@ adin_thread_input_main(void *dummy) * Start new A/D-in thread, and initialize buffer. * * - * バッファを初期化して A/D-in スレッドを開始する. + * バッファを初期化して A/D-in スレッドを開始する. * * @param recog [in] engine instance * @@ -1200,13 +1298,13 @@ adin_thread_cancel(Recog *recog) * * この関数は A/D-in スレッドによってサンプルが保存されるのを待ち, * 保存されたサンプルを順次処理していきます. 引数や返り値は adin_cut() と - * 同一です. + * 同一です. * - * + * * @param ad_process [in] function to process triggerted input. * @param ad_check [in] function to be called periodically. * @param recog [in] engine instance - * + * * @return 2 when input termination requested by ad_process(), 1 when * if detect end of an input segment (down trigger detected after up * trigger), 0 when reached end of input device, -1 on error, -2 when @@ -1330,7 +1428,7 @@ adin_thread_process(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Rec pthread_mutex_unlock(&(a->mutex)); break; } - usleep(50000); /* wait = 0.05sec*/ + usleep(50000); /* wait = 0.05sec*/ } } @@ -1360,13 +1458,13 @@ adin_thread_process(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Rec * * スレッドモードでは,この関数は adin_thead_process() を呼び出し, * 非スレッドモードでは adin_cut() を直接呼び出す. 引数や返り値は - * adin_cut() と同一である. + * adin_cut() と同一である. * - * + * * @param ad_process [in] function to process triggerted input. * @param ad_check [in] function to be called periodically. 
* @param recog [in] engine instance - * + * * @return 2 when input termination requested by ad_process(), 1 when * if detect end of an input segment (down trigger detected after up * trigger), 0 when reached end of input device, -1 on error, -2 when @@ -1374,7 +1472,7 @@ adin_thread_process(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Rec * * @callergraph * @callgraph - * + * */ int adin_go(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Recog *recog) @@ -1389,23 +1487,23 @@ adin_go(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Recog return(adin_cut(ad_process, ad_check, recog)); } -/** +/** * * Call device-specific initialization. * * - * デバイス依存の初期化関数を呼び出す. + * デバイス依存の初期化関数を呼び出す. * - * + * * @param a [in] A/D-in work area * @param freq [in] sampling frequency * @param arg [in] device-dependent argument - * + * * @return TRUE if succeeded, FALSE if failed. - * + * * @callergraph * @callgraph - * + * */ boolean adin_standby(ADIn *a, int freq, void *arg) @@ -1414,22 +1512,22 @@ adin_standby(ADIn *a, int freq, void *arg) if (a->ad_standby != NULL) return(a->ad_standby(freq, arg)); return TRUE; } -/** +/** * * Call device-specific function to begin capturing of the audio stream. * * - * 音の取り込みを開始するデバイス依存の関数を呼び出す. + * 音の取り込みを開始するデバイス依存の関数を呼び出す. * - * + * * @param a [in] A/D-in work area * @param file_or_dev_name [in] device / file path to open or NULL for default - * + * * @return TRUE on success, FALSE on failure. - * + * * @callergraph * @callgraph - * + * */ boolean adin_begin(ADIn *a, char *file_or_dev_name) @@ -1443,16 +1541,16 @@ adin_begin(ADIn *a, char *file_or_dev_name) } return TRUE; } -/** +/** * * Call device-specific function to end capturing of the audio stream. * * - * 音の取り込みを終了するデバイス依存の関数を呼び出す. + * 音の取り込みを終了するデバイス依存の関数を呼び出す. * - * + * * @param a [in] A/D-in work area - * + * * @return TRUE on success, FALSE on failure. 
* * @callergraph @@ -1468,19 +1566,19 @@ adin_end(ADIn *a) return TRUE; } -/** +/** * * Free memories of A/D-in work area. * * - * 音取り込み用ワークエリアのメモリを開放する. + * 音取り込み用ワークエリアのメモリを開放する. * - * + * * @param recog [in] engine instance * * @callergraph * @callgraph - * + * */ void adin_free_param(Recog *recog) From fdf460293b109d054a127850d0d4fa1f06a0f241 Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sat, 20 Apr 2019 10:41:47 +0900 Subject: [PATCH 02/12] Cosmetic text fix --- Sample.jconf | 54 +++++++++++++++++++------------------- adinrec/adinrec.c | 46 ++++++++++++++++---------------- adintool/options.c | 14 +++++----- doc/Options.md | 4 +-- libjulius/src/m_info.c | 58 ++++++++++++++++++++--------------------- libjulius/src/m_usage.c | 28 ++++++++++---------- 6 files changed, 102 insertions(+), 102 deletions(-) diff --git a/Sample.jconf b/Sample.jconf index b419da1c..09eb270c 100644 --- a/Sample.jconf +++ b/Sample.jconf @@ -96,14 +96,14 @@ #-rejectlong -1 # reject longer input (msec) -1 to disable #### -#### Speech detection by libfvad +#### Speech detection by WebRTC VAD (libfvad) #### -#-fvad -1 # disable libfvad -#-fvad 0 # enable on mode 0 (least aggressive to filtering out non-speech) -#-fvad 1 # enable on mode 1 (moderately aggressive to filtering out non-speech) -#-fvad 2 # enable on mode 2 (aggressive to filtering out non-speech) -#-fvad 3 # enable on mode 3 (very aggressive to filtering out non-speech) -#-fvad_param 5 0.5 # optinal parameter: smoothing frames, trigger threshold +#-fvad -1 # disable WebRTC VAD +#-fvad 0 # enable WebRTC VAD on mode 0 (least aggressive to filtering out non-speech) +#-fvad 1 # enable WebRTC VAD on mode 1 (moderately aggressive to filtering out non-speech) +#-fvad 2 # enable WebRTC VAD on mode 2 (aggressive to filtering out non-speech) +#-fvad 3 # enable WebRTC VAD on mode 3 (very aggressive to filtering out non-speech) +#-fvad_param 5 0.5 # optional parameter: smoothing frames, trigger threshold #### #### Input rejection by 
average power (EXPERIMENTAL) @@ -117,7 +117,7 @@ #### #### Gaussian Mixture Model #### -#### GMM will be used for input rejection by accumurated score, or +#### GMM will be used for input rejection by accumulated score, or #### for GMM-based frontend VAD when "--enable-gmm-vad" specified. #### #### NOTE: If you use MFCC for the GMM which is different from AM, you @@ -188,13 +188,13 @@ ## Create a new AM configuration set, and switch current to it. ## You should give a unique name. -#-AM name +#-AM name ## Create a new LM configuration set, and switch current to it. ## You should give a unique name. -#-LM name +#-LM name -## Create a new Search configuration set with AM and LM, and switch +## Create a new Search configuration set with AM and LM, and switch ## current to it. AM and LM name can be either name or ID number. #-SR name am_name_or_id lm_name_or_id @@ -208,7 +208,7 @@ ## This option is only a switcher and can be used anywhere anytime. # -GLOBAL -## This option disables the strict section checkings and back to 4.0 +## This option disables the strict section checks and back to 4.0 # -nosectioncheck ###################################################################### @@ -231,7 +231,7 @@ #-mapunk "" # word to which unknown words should be mapped #-iwspword # add a pause word to the dictionary #-iwspentry " [sp] sp sp" # word that will be added by "-iwspword" -#-sepnum 150 # num of high freq words to linearize +#-sepnum 150 # num of high freq words to linearize #-adddict dictfile # append additional word dictionary #-addword entry # append additional word entry @@ -271,7 +271,7 @@ #### the AM defines the required parameter. You can use different MFCC #### type for each AM. #### For GMM, the same parameter should be specified after "-AM_GMM" -#### +#### #### When using multiple AM, the values of "-smpPeriod", "-smpFreq", #### "-fsize" and "-fshift" should be the same among all AM. 
#### @@ -332,7 +332,7 @@ #-dnnconf file # DNN configuration file ## Others -#-htkconf configfile # load analysis settings from HTK Config file +#-htkconf configfile # load analysis settings from HTK Config file ###################################################################### #### RECOGNIZER (-SR) @@ -341,7 +341,7 @@ #### Default values for beam width and LM weights will change #### according to compile-time setup of JuliusLib and model specification. #### Please see the startup log for the actual values. -#### +#### #### #### parameter (common) @@ -387,34 +387,34 @@ #-spdur 10 # # of frames to detect a short pause #-pausemodels string # comma-separated pause model names #### for decoder-VAD -#-spmargin 40 # backstep margin at trigger up (frame) +#-spmargin 40 # back-step margin at trigger up (frame) #-spdelay 4 # decision delay at trigger up (frame) -#### +#### #### lattice output -#### +#### #-lattice # output result in word graph (aka -graphout) #-graphrange 0 # merge same words nearby, -1 to disable merge #-graphcut 80 # graph depth cut threshold (in depth) -#-graphboundloop 20 # max itertations for boundary adjustment loop -#-graphsearchdelay # activate an alternate generation algorithm +#-graphboundloop 20 # max iterations for boundary adjustment loop +#-graphsearchdelay # activate an alternate generation algorithm #-nographsearchdelay # disable "-graphsearchdelay" -#### +#### #### confusion network output -#### +#### #-confnet # enable confusion network output #-noconfnet # disable confusion network output -#### +#### #### multi-grammar output (for grammar and isolated word) -#### +#### #-multigramout # output max hypo for each grammar #-nomultigramout # disable "-multigramout" -#### +#### #### forced alignment -#### +#### #-walign # enable alignment for result at word level #-palign # enable alignment for result at phoneme level #-salign # enable alignment for result at state level diff --git a/adinrec/adinrec.c b/adinrec/adinrec.c index 
3a291743..f478a0aa 100644 --- a/adinrec/adinrec.c +++ b/adinrec/adinrec.c @@ -1,19 +1,19 @@ /** * @file adinrec.c - * + * * * @brief マイクから一発話をファイルへ記録する * - * + * * * @brief Record a speech segment from microphone to a file * - * + * * @author Akinobu LEE * @date Wed Mar 23 20:33:01 2005 * * $Revision: 1.13 $ - * + * */ /* * Copyright (c) 1991-2013 Kawahara Lab., Kyoto University @@ -35,7 +35,7 @@ static char *filename = NULL; ///< Output file name static boolean stout = FALSE; ///< True if output to stdout static boolean use_raw = FALSE; ///< Output in RAW format if TRUE -/** +/** * ヘルプを表示して終了する * Print help and exit */ @@ -53,8 +53,8 @@ opt_help(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " [-tailmargin msec] tail margin length (%d)\n", jconf->detect.tail_margin_msec); fprintf(stderr, " [-chunksize sample] chunk size for processing (%d)\n", jconf->detect.chunk_size); #ifdef HAVE_LIBFVAD - fprintf(stderr, " [-fvad] FVAD sw (-1=off, 0 - 3) (%d)\n", jconf->detect.fvad_mode); - fprintf(stderr, " [-fvad_param i f] FVAD parameter (dur/thres) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(stderr, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively) (%d)\n", jconf->detect.fvad_mode); + fprintf(stderr, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); #endif /* HAVE_LIBFVAD */ fprintf(stderr, " [-nostrip] not strip off zero samples\n"); fprintf(stderr, " [-zmean] remove DC by zero mean\n"); @@ -83,21 +83,21 @@ opt_freq(Jconf *jconf, char *arg[], int argnum) return TRUE; } -/** +/** * * 録音されたサンプル列を処理するコールバック関数 - * + * * @param now [in] 録音されたサンプル列 * @param len [in] 長さ(サンプル数) - * + * * @return エラー時 -1,処理成功時 0,処理成功+区間終端検出時 1 を返す. 
* * * Callback handler of recorded sample fragments - * + * * @param now [in] recorded fragments of speech sample * @param len [in] length of above in samples - * + * * @return -1 on device error (require caller to exit and terminate input), * 0 on success (allow caller to continue), * 1 on succeeded but segmentation detected (require caller to exit but @@ -155,11 +155,11 @@ adin_callback_file(SP16 *now, int len, Recog *recog) return -1; } } - + speechlen += len; - + /* progress bar in dots */ - fprintf(stderr, "."); + fprintf(stderr, "."); return(0); } @@ -182,7 +182,7 @@ close_file() } } fprintf(stderr, "\n%d samples (%d bytes, %.2f sec.) recorded\n", speechlen, size, (float)speechlen / (float)sfreq); -} +} /* Interrupt signal handling */ static void @@ -196,21 +196,21 @@ interrupt_record(int signum) } -/** +/** * * メイン関数 - * + * * @param argc [in] 引数列の長さ * @param argv [in] 引数列 - * - * @return + * + * @return * エラー時 1,通常終了時 0 を返す. * * Main function. - * + * * @param argc [in] number of argument. * @param argv [in] array of arguments. - * + * * @return 1 on error, 0 on success. 
* */ @@ -266,7 +266,7 @@ main(int argc, char *argv[]) /* set Julius default parameters for unspecified acoustic parameters */ apply_para(&(jconf->am_root->analysis.para), &(jconf->am_root->analysis.para_default)); - + /* set some values */ jconf->input.sfreq = jconf->am_root->analysis.para.smp_freq; jconf->input.period = jconf->am_root->analysis.para.smp_period; diff --git a/adintool/options.c b/adintool/options.c index 22c17058..5f5a60c4 100644 --- a/adintool/options.c +++ b/adintool/options.c @@ -33,7 +33,7 @@ show_help_and_exit(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " vecnet to vecnet server as feature vector (I'm client)\n"); fprintf(stderr, " stdout standard tty output\n"); fprintf(stderr, " none output nothing\n"); - + fprintf(stderr, "I/O options:\n"); #ifdef USE_NETAUDIO fprintf(stderr, " -NA (netaudio) NetAudio server host:unit\n"); @@ -47,7 +47,7 @@ show_help_and_exit(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, "Feature extraction options (other than in jconf):\n"); fprintf(stderr, " -paramtype desc parameter type in HTK format\n"); fprintf(stderr, " -veclen num total vector length\n"); - + fprintf(stderr, "Recording and Pause segmentation options:\n"); fprintf(stderr, " (input segmentation: on for file/mic/stdin, off for adinnet)\n"); @@ -63,10 +63,10 @@ show_help_and_exit(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " [-tailmargin msec] tail margin length (%d)\n", jconf->detect.tail_margin_msec); fprintf(stderr, " [-chunksize sample] chunk size for processing (%d)\n", jconf->detect.chunk_size); #ifdef HAVE_LIBFVAD - fprintf(stderr, " [-fvad] FVAD sw (-1=off, 0 - 3) (%d)\n", jconf->detect.fvad_mode); - fprintf(stderr, " [-fvad_param i f] FVAD parameter (dur/thres) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(stderr, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively, -1 to disable) (%d)\n", jconf->detect.fvad_mode); + fprintf(stderr, " [-fvad_param i f] 
WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); #endif /* HAVE_LIBFVAD */ - + fprintf(stderr, " [-nostrip] do not strip zero samples\n"); fprintf(stderr, " [-zmean] remove DC by zero mean\n"); fprintf(stderr, " [-raw] output in RAW format\n"); @@ -74,7 +74,7 @@ show_help_and_exit(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " [-loosesync] loose sync of resume among servers\n"); fprintf(stderr, " [-rewind msec] rewind input if spoken while pause at resume\n"); fprintf(stderr, " [-C jconffile] load jconf to set parameters (ignore other options\n"); - + fprintf(stderr, "\nLibrary configuration: "); confout_version(stderr); confout_audio(stderr); @@ -346,7 +346,7 @@ void register_options_to_julius() j_add_option("-h", 0, 0, "display this help", show_help_and_exit); j_add_option("-help", 0, 0, "display this help", show_help_and_exit); j_add_option("--help", 0, 0, "display this help", show_help_and_exit); - + } /* end of options.c */ diff --git a/doc/Options.md b/doc/Options.md index ff051695..81c172bc 100644 --- a/doc/Options.md +++ b/doc/Options.md @@ -329,11 +329,11 @@ small. ### -fvad mode -Set libfvad-based VAD mode. `mode` is an integer value from -1 to 3, specify -1 to disable, 0 for moderate detection, 3 for most aggressive detection (more likely to drop speech-like noises). Default value is -1 (disabled) +Enable WebRTC VAD (libfvad-based VAD) mode. Setting `mode` to 0, 1, 2 or 3 enables WebRTC based VAD. `mode` is an integer value from -1 to 3, specify -1 to disable, 0 for weakest noise rejection (accepts all speech, but often wrongly accept noises), 3 for most aggressive noise rejection. Default value is -1 (disabled) ### -fvad_param nFrame threshold -Set libfvad detailed parameter. `nFrame` is the number of smoothing frame. `threshold` is the threshold to detect speech trigger [0.0-1.0]. Default values are 5 and 0.5 respectively. 
+Set WebRTC VAD's detailed parameters. `nFrame` is the length of smoothing frame. `threshold` is the threshold to detect speech trigger [0.0-1.0]. Default values are 5 and 0.5, respectively. ## Input rejection options (category `GLOBAL`) diff --git a/libjulius/src/m_info.c b/libjulius/src/m_info.c index aa5d4e51..5ecaa2a6 100644 --- a/libjulius/src/m_info.c +++ b/libjulius/src/m_info.c @@ -1,19 +1,19 @@ /** * @file m_info.c - * + * * * @brief システム情報の出力 * - * + * * * @brief Output system informations. * - * + * * @author Akinobu Lee * @date Thu May 12 14:14:01 2005 * * $Revision: 1.23 $ - * + * */ /* * Copyright (c) 1991-2013 Kawahara Lab., Kyoto University @@ -24,19 +24,19 @@ #include -/** +/** * * Output module overview in a global configuration variables to log. * * - * 全体設定パラメータ内のモジュール構成の概要をログに出力する. + * 全体設定パラメータ内のモジュール構成の概要をログに出力する. * - * + * * @param jconf [in] global configuration variables * * @callgraph * @callergraph - * + * */ void print_jconf_overview(Jconf *jconf) @@ -56,9 +56,9 @@ print_jconf_overview(Jconf *jconf) jlog(" LM=%d,", i); i = 0; for(sconf=jconf->search_root;sconf;sconf=sconf->next) i++; jlog(" SR=%d\n", i); - + jlog("\n"); - + jlog(" Acoustic Model (with input parameter spec.):\n"); for(amconf=jconf->am_root;amconf;amconf=amconf->next) { if (amconf->name[0] != '\0') { @@ -75,7 +75,7 @@ print_jconf_overview(Jconf *jconf) } } jlog("\n"); - + jlog(" Language Model:\n"); for(lmconf=jconf->lm_root;lmconf;lmconf=lmconf->next) { if (lmconf->name[0] != '\0') { @@ -218,7 +218,7 @@ print_mfcc_info(FILE *fp, MFCCCalc *mfcc, Jconf *jconf) jlog(" save cep. data to = \"%s\", update at the end of each input\n", mfcc->cmn.save_filename); } jlog("\n"); - + jlog("\t base setup from ="); if (mfcc->htk_loaded == 1 || mfcc->hmm_loaded == 1) { if (mfcc->hmm_loaded == 1) { @@ -243,16 +243,16 @@ print_mfcc_info(FILE *fp, MFCCCalc *mfcc, Jconf *jconf) } -/** +/** * - * エンジンインスタンスの全情報をログに出力する. + * エンジンインスタンスの全情報をログに出力する. 
* * * Output all informations of an engine instance to log. * * * @param recog [in] engine instance - * + * * @callgraph * @callergraph */ @@ -267,7 +267,7 @@ print_engine_info(Recog *recog) RecogProcess *r; jconf = recog->jconf; - + /* set output file pointer to fp */ fp = jlog_get_fp(); if (fp == NULL) return; @@ -277,7 +277,7 @@ print_engine_info(Recog *recog) j_put_compile_defs(fp); j_put_library_defs(fp); jlog("\n"); - + /* print current argument setting to log */ print_jconf_overview(jconf); @@ -286,7 +286,7 @@ print_engine_info(Recog *recog) /* acoustic parameter conditions for this model */ jlog("------------------------------------------------------------\n"); jlog("Speech Analysis Module(s)\n\n"); - + for(mfcc=recog->mfcclist;mfcc;mfcc=mfcc->next) { jlog("[MFCC%02d] for", mfcc->id); @@ -395,7 +395,7 @@ print_engine_info(Recog *recog) #else jlog("disabled\n"); #endif - + if (am->hmminfo->multipath) { jlog(" sp transition penalty = %+2.1f\n", am->config->iwsp_penalty); } @@ -475,7 +475,7 @@ print_engine_info(Recog *recog) jlog("\n"); } } - + if (lm->lmtype == LM_PROB) { if (lm->config->enable_iwspword) { jlog("\tIW-sp word added to dict= \"%s\"\n", lm->config->iwspentry); @@ -500,7 +500,7 @@ print_engine_info(Recog *recog) } } - if (lm->lmtype == LM_PROB) { + if (lm->lmtype == LM_PROB) { jlog("\t(-silhead)head sil word = "); put_voca(fp, lm->winfo, lm->winfo->head_silwid); jlog("\t(-siltail)tail sil word = "); @@ -512,7 +512,7 @@ print_engine_info(Recog *recog) jlog("\tword head = \"%s\"\n", lm->config->wordrecog_head_silence_model_name); jlog("\tword tail = \"%s\"\n", lm->config->wordrecog_tail_silence_model_name); jlog("\ttheir context name = \"%s\"\n", (lm->config->wordrecog_silence_context_name[0] == '\0') ? 
"NULL (blank)" : lm->config->wordrecog_silence_context_name); - + } } @@ -726,7 +726,7 @@ print_engine_info(Recog *recog) if (r->config->compute_only_1pass) { jlog("\tCompute only 1-pass\n"); } - + if (r->config->graph.enabled) { jlog("\n"); jlog("Graph-based output with graph-oriented search:\n"); @@ -763,7 +763,7 @@ print_engine_info(Recog *recog) #endif } - + if (r->config->successive.enabled) { jlog("\tshort pause segmentation = on\n"); jlog("\t sp duration length = %d frames\n", r->config->successive.sp_frame_duration); @@ -831,7 +831,7 @@ print_engine_info(Recog *recog) jlog("based on search-time scores\n"); #endif #endif /* CONFIDENCE_MEASURE */ - + jlog("\n"); jlog("------------------------------------------------------------\n"); @@ -949,11 +949,11 @@ print_engine_info(Recog *recog) jlog("\t chunk size = %d samples\n", jconf->detect.chunk_size); #ifdef HAVE_LIBFVAD if (jconf->detect.fvad_mode < 0) { - jlog("\t FVAD switch value = %d (disabled)\n", jconf->detect.fvad_mode); + jlog("\tWebRTC VAD operating mode = %d (disabled)\n", jconf->detect.fvad_mode); } else { - jlog("\t FVAD switch value = %d (0: moderate - 3: very aggressive to regist to noise\n", jconf->detect.fvad_mode); - jlog("\t FVAD param smoothlen = %d (%dms)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_smoothnum * 10); - jlog("\t FVAD param threshold = %.2f\n", jconf->detect.fvad_thres); + jlog("\tWebRTC VAD operating mode = %d (0-3, larger value rejects noises aggressively)\n", jconf->detect.fvad_mode); + jlog("\tWebRTC VAD smoothing len = %d (%dms)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_smoothnum * 10); + jlog("\tWebRTC VAD active thres = %.2f\n", jconf->detect.fvad_thres); } #endif /* HAVE_LIBFVAD */ } else { diff --git a/libjulius/src/m_usage.c b/libjulius/src/m_usage.c index 6fd3eb5b..798ab29d 100644 --- a/libjulius/src/m_usage.c +++ b/libjulius/src/m_usage.c @@ -1,19 +1,19 @@ /** * @file m_usage.c - * + * * * @brief ヘルプを表示する * - * + * * * @brief Print help. 
* - * + * * @author Akinobu Lee * @date Fri May 13 15:04:34 2005 * * $Revision: 1.25 $ - * + * */ /* * Copyright (c) 1991-2013 Kawahara Lab., Kyoto University @@ -24,14 +24,14 @@ #include -/** +/** * - * ヘルプを表示する. - * + * ヘルプを表示する. + * * * * Output help document. - * + * * * * @param fp [in] file pointer to output help @@ -39,7 +39,7 @@ * @callgraph * @callergraph * @ingroup engine - * + * */ void j_output_argument_help(FILE *fp) @@ -51,7 +51,7 @@ j_output_argument_help(FILE *fp) PLUGIN_ENTRY *p; FUNC_VOID func; #endif - + /* load default values */ jconf = j_jconf_new(); @@ -129,7 +129,7 @@ j_output_argument_help(FILE *fp) #ifdef POWER_REJECT fprintf(fp, " [-powerthres value] rejection threshold of average power (%.1f)\n", jconf->reject.powerthres); #endif - + fprintf(fp, "\n Speech Detection: (default: on=mic/net off=files)\n"); /*fprintf(fp, " [-pausesegment] turn on (force) pause detection\n");*/ /*fprintf(fp, " [-nopausesegment] turn off (force) pause detection\n");*/ @@ -141,8 +141,8 @@ j_output_argument_help(FILE *fp) fprintf(fp, " [-tailmargin msec] tail margin length in msec. 
(%d)\n", jconf->detect.tail_margin_msec); fprintf(fp, " [-chunksize sample] unit length for processing (%d)\n", jconf->detect.chunk_size); #ifdef HAVE_LIBFVAD - fprintf(fp, " [-fvad] FVAD sw (-1=off, 0-3=on / degree (%d)\n", jconf->detect.fvad_mode); - fprintf(fp, " [-fvad_param i f] FVAD parameter (dur/thres) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(fp, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively) (%d)\n", jconf->detect.fvad_mode); + fprintf(fp, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); #endif /* HAVE_LIBFVAD */ fprintf(fp, "\n GMM utterance verification:\n"); @@ -289,7 +289,7 @@ j_output_argument_help(FILE *fp) fprintf(fp, " [-iwspentry entry] (n-gram) word entry for \"-iwspword\" (%s)\n", IWSPENTRY_DEFAULT); fprintf(fp, " [-adddict dictfile] (n-gram) load extra dictionary\n"); fprintf(fp, " [-addentry entry] (n-gram) load extra word entry\n"); - + fprintf(fp, "\n Isolated Word Recognition:\n"); fprintf(fp, " -w file[,file2...] 
(list of) wordlist file name(s)\n"); fprintf(fp, " -wlist filename file that contains list of wordlists\n"); From f248fdb770a3247de072b0925335624c6b757fae Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sat, 20 Apr 2019 17:41:57 +0900 Subject: [PATCH 03/12] Fix red screen on audio scale changes --- adintool/mainloop.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/adintool/mainloop.c b/adintool/mainloop.c index 34ec6941..4ed367fb 100644 --- a/adintool/mainloop.c +++ b/adintool/mainloop.c @@ -1098,7 +1098,7 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) #endif /* AUTO_ADJUST_THRESHOLD */ // clear screen - if (recog->jconf->preprocess.level_coef == 1.0f) { + if (recog->jconf->preprocess.level_coef != 0.0f) { // fill black SDL_SetRenderDrawColor(s->renderer, 0, 0, 0, 0xFF); } else { @@ -1224,6 +1224,10 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) } + +// keep audio scale +static float stored_scale; + // check events on SDL static int sdl_check_command() @@ -1267,10 +1271,11 @@ sdl_check_command() case SDLK_m: // 'm' -> input mute if (event.key.state != SDL_PRESSED || event.key.repeat != 0) break; - if (recog->jconf->preprocess.level_coef == 1.0f) { - recog->jconf->preprocess.level_coef = recog->adin->level_coef = 0.00f; + if (recog->jconf->preprocess.level_coef != 0.0f) { + stored_scale = recog->jconf->preprocess.level_coef; + recog->jconf->preprocess.level_coef = recog->adin->level_coef = 0.0f; } else { - recog->jconf->preprocess.level_coef = recog->adin->level_coef = 1.0f; + recog->jconf->preprocess.level_coef = recog->adin->level_coef = stored_scale; } break; case SDLK_c: From 77a344d84d14e749e14857cd21cb162db954cf38 Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sat, 20 Apr 2019 17:43:37 +0900 Subject: [PATCH 04/12] supress audio strip-zero warnings on quiet (demo) mode. 
--- libsent/include/sent/speech.h | 3 ++- libsent/src/anlz/strip.c | 19 +++++++++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/libsent/include/sent/speech.h b/libsent/include/sent/speech.h index c2a618e2..ff53a1c5 100644 --- a/libsent/include/sent/speech.h +++ b/libsent/include/sent/speech.h @@ -115,7 +115,8 @@ FILE *wrwav_open(char *filename, int sfreq); boolean wrwav_data(FILE *fp, SP16 *buf, int len); boolean wrwav_close(FILE *fp); -/* for an;z/strip.c */ +/* for anlz/strip.c */ +void set_strip_zero_warning(boolean flag); int strip_zero(SP16 a[], int len); #ifdef __cplusplus diff --git a/libsent/src/anlz/strip.c b/libsent/src/anlz/strip.c index 9ed25c4b..405fb246 100644 --- a/libsent/src/anlz/strip.c +++ b/libsent/src/anlz/strip.c @@ -29,6 +29,21 @@ /// Length of zero sample to detect as invalid sequence. #define WINDOWLEN 16 +/// log switch +static boolean strip_zero_warning = TRUE; + +/** + * Switch strip zero warning message + * + * @param flag [in] flag + * + */ +void +set_strip_zero_warning(boolean flag) +{ + strip_zero_warning = flag; +} + /** * Strip zero samples from speech data. 
* @@ -63,7 +78,7 @@ strip_zero(SP16 a[], int len) } } else { /* deleted (leave uncopied) */ - jlog("Warning: strip: sample %d-%d has zero value, stripped\n", bgn, src-1); + if (strip_zero_warning) jlog("Warning: strip: sample %d-%d has zero value, stripped\n", bgn, src-1); } } a[dst++] = a[src]; @@ -79,7 +94,7 @@ strip_zero(SP16 a[], int len) } } else { /* deleted (leave uncopied) */ - jlog("Warning: strip: sample %d-%d is invalid, stripped\n", bgn, src-1); + if (strip_zero_warning) jlog("Warning: strip: sample %d-%d is invalid, stripped\n", bgn, src-1); } } From eddb2b3b0c374c67c16b768ad17ebd357fc2593c Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sat, 20 Apr 2019 17:45:02 +0900 Subject: [PATCH 05/12] Refine AGC parameters, some typo fixes --- libjulius/src/adin-cut.c | 79 +++++++++++++++++++++++++++------------- 1 file changed, 54 insertions(+), 25 deletions(-) diff --git a/libjulius/src/adin-cut.c b/libjulius/src/adin-cut.c index 52b411f0..955281af 100644 --- a/libjulius/src/adin-cut.c +++ b/libjulius/src/adin-cut.c @@ -157,6 +157,7 @@ adin_setup_param(ADIn *adin, Jconf *jconf) adin->adin_cut_on = adin->silence_cut_default; } adin->strip_flag = jconf->preprocess.strip_zero_sample; + if (verbose_flag == FALSE) set_strip_zero_warning(FALSE); adin->thres = jconf->detect.level_thres; #ifdef HAVE_PTHREAD if (adin->enable_thread && jconf->decodeopt.segment) { @@ -283,7 +284,7 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) ret = fvad_process(a->fvad, &(a->fvad_speech[i]), a->fvad_framesize); if (ret < 0) { /* error */ - jlog("ERROR: fvad_proceed: internal error occured at fvad_process()\n"); + jlog("ERROR: fvad_proceed: internal error occurred at fvad_process()\n"); break; } a->fvad_lastresult[a->fvad_lastp] = ret; @@ -317,29 +318,43 @@ static int fvad_cont_count = 0; /* continuous count of status keep */ static int fvad_last_result = 0; /* keeps last fvad result */ static int fvad_level_max = 0; /* maximum input level in cycle buffer */ static int 
fvad_first_time = 0; /* flag to detect the first speech */ +static float fvad_first_rate; /* defines for auto gain control */ #define FVAD_AGC_CAP 30000 /* upper cap */ -#define FVAD_AGC_INC_COEF 1.2 /* scale increase coef */ -#define FVAD_AGC_DEC_COEF 0.8 /* scale decrease coef */ +#define FVAD_AGC_INC_COEF 1.5 /* scale increase coef */ +#define FVAD_AGC_DEC_COEF 0.6 /* scale decrease coef */ +#define FVAD_AGC_MAX_COEF 32.0 /* maximum scale */ +#define FVAD_AGC_DEC_OVERFLOW 0.6 /* scale decrease coef at overflow */ +#define FVAD_AGC_FIRST_BOOST_SCALE 3.0 /* additional scale for first adjustment */ +#define FVAD_AGC_UPDATE_MAX_RATE 3.0 /* maximum rate of current/first */ /* change scale and update cycle buffer */ static int -update_audio_scale(Recog *recog, float scale) { +update_audio_scale(Recog *recog, float scale, int totallen) { ADIn *a = recog->adin; int i, len; int zc; + float totalsec; + int hour, minutes; + float second; + + totalsec = (float)totallen / (float)recog->jconf->input.sfreq; + hour = (int)totalsec / 3600; + minutes = (int)((totalsec - hour * 3600) / 60); + second = totalsec - hour * 3600 - minutes * 60; zc_copy_buffer(&(a->zc), a->cbuf, &len); for(i = 0; i < len; i++) a->cbuf[i] = a->cbuf[i] * scale / a->level_coef; reset_count_zc_e(&(a->zc), a->thres, a->c_length, a->c_offset); zc = count_zc_e(&(a->zc), a->cbuf, len); - jlog("INFO: audio scale adjusted from %f to %f toward threshold %d\n", recog->adin->level_coef, scale, a->thres); + if (verbose_flag) jlog("STAT: AGC: %.2f to %.2f at %02d:%02d:%02.2f\n", recog->adin->level_coef, scale, hour, minutes, second); recog->adin->level_coef = scale; recog->jconf->preprocess.level_coef = scale; return zc; } + #endif /* HAVE_LIBFVAD */ /** @@ -696,9 +711,9 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco /* get voice/noise status from fvad */ fv = fvad_proceed(a, &(a->buffer[i]), wstep); - if (a->fvad) { + if (a->fvad && recog->jconf->detect.auto_gain_control_flag) { 
float scale; - + int total_processed_len = a->total_captured_len - a->current_len + i + wstep - a->zc.valid_len; /* check if voice/noise status has been kept for the entire cycle buffer */ if (fvad_last_result == fv) { fvad_cont_count += wstep; @@ -712,44 +727,58 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco if (a->zc.level > FVAD_AGC_CAP && fvad_cont_count > a->c_length) { /* detect input overflow at last chunk, immediately reduce the scale under the cap */ - zc = update_audio_scale(recog, (float)FVAD_AGC_CAP / a->zc.level); - /* reset detection */ - fvad_cont_count = 0; + if (verbose_flag) jlog("STAT: AGC: too loud (>%d)\n", FVAD_AGC_CAP); + zc = update_audio_scale(recog, (float)recog->adin->level_coef * FVAD_AGC_DEC_OVERFLOW, total_processed_len); + /* update max after scaling */ + fvad_level_max *= FVAD_AGC_DEC_OVERFLOW; + /* does not reset detection, continues */ } if (fv == 1 && fvad_cont_count > a->c_length) { /* voice segment of a certain length found */ if (fvad_first_time == 0) { fvad_first_time = 1; /* this is first time: if amplitude is below level threshold, immediately raise the scale to go over the threshold */ - scale = 2.0 * a->thres / fvad_level_max; + scale = FVAD_AGC_FIRST_BOOST_SCALE * a->thres / fvad_level_max; if (scale > 1.0f) { /* set new scale */ - zc = update_audio_scale(recog, scale); - printf("first scale=%f to %f, level=%d to %d, thres=%d\n", recog->adin->level_coef, scale, fvad_level_max, a->zc.level, a->thres); + if (verbose_flag) jlog("STAT: AGC: first speech segment, force adjustment\n"); + if (scale > FVAD_AGC_MAX_COEF) scale = FVAD_AGC_MAX_COEF; + fvad_first_rate = scale; + zc = update_audio_scale(recog, scale, total_processed_len); /* update max after scaling */ if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; } /* reset detection */ fvad_cont_count = 0; - } else if (fvad_level_max < a->thres * 2.0) { + } else if (fvad_level_max < a->thres) { /* too low amplitude of the voice 
part, increase scale gradually */ - scale = recog->adin->level_coef * FVAD_AGC_INC_COEF; - zc = update_audio_scale(recog, scale); - printf("up scale=%f to %f, level=%d to %d, thres=%d\n", recog->adin->level_coef, scale, fvad_level_max, a->zc.level, a->thres); - /* update max after scaling */ - if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; - /* does not reset detection, continues */ + if (fvad_first_time == 1 && recog->adin->level_coef >= fvad_first_rate * FVAD_AGC_UPDATE_MAX_RATE) { + fvad_cont_count = 0; + } else { + scale = recog->adin->level_coef * FVAD_AGC_INC_COEF; + if (scale > FVAD_AGC_MAX_COEF) scale = FVAD_AGC_MAX_COEF; + if (fvad_first_time == 1 && scale > fvad_first_rate * FVAD_AGC_UPDATE_MAX_RATE) { + scale = fvad_first_rate * FVAD_AGC_UPDATE_MAX_RATE; + } + zc = update_audio_scale(recog, scale, total_processed_len); + /* update max after scaling */ + if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; + /* does not reset detection, continues */ + } } } if (fv == 0 && fvad_cont_count > a->c_length) { /* noise segment of a certain length found */ - if (fvad_level_max > a->thres * 2.0) { + if (fvad_level_max > a->thres) { /* mis-detecting long noise as speech, decrease scale gradually */ scale = recog->adin->level_coef * FVAD_AGC_DEC_COEF; - zc = update_audio_scale(recog, scale); - /* update max after scaling */ - fvad_level_max *= FVAD_AGC_DEC_COEF; - printf("down scale=%f\n", recog->adin->level_coef); + if (scale <= 0.0) { + if (verbose_flag) jlog("STAT: AGC: too small scale %f, ignored\n", scale); + } else { + zc = update_audio_scale(recog, scale, total_processed_len); + /* update max after scaling */ + fvad_level_max *= FVAD_AGC_DEC_COEF; + } } } } From 5715474719b14dc613b79618fde41190b3db2d5e Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sat, 20 Apr 2019 17:45:37 +0900 Subject: [PATCH 06/12] Added new option "-agc", "-noagc" for experimental AGC --- Sample.jconf | 2 ++ adinrec/adinrec.c | 1 + adintool/options.c | 1 + 
doc/Options.md | 4 ++++ doc/VAD.md | 9 ++++++--- libjulius/include/julius/jconf.h | 5 +++++ libjulius/src/default.c | 1 + libjulius/src/m_info.c | 7 ++++++- libjulius/src/m_options.c | 8 ++++++++ libjulius/src/m_usage.c | 1 + 10 files changed, 35 insertions(+), 4 deletions(-) diff --git a/Sample.jconf b/Sample.jconf index 09eb270c..e7a6f523 100644 --- a/Sample.jconf +++ b/Sample.jconf @@ -104,6 +104,8 @@ #-fvad 2 # enable WebRTC VAD on mode 2 (aggressive to filtering out non-speech) #-fvad 3 # enable WebRTC VAD on mode 3 (very aggressive to filtering out non-speech) #-fvad_param 5 0.5 # optional parameter: smoothing frames, trigger threshold +#-agc # enable auto gain control. Should be specified with -fvad. +#-noagc # disable auto gain control. #### #### Input rejection by average power (EXPERIMENTAL) diff --git a/adinrec/adinrec.c b/adinrec/adinrec.c index f478a0aa..cc7e885c 100644 --- a/adinrec/adinrec.c +++ b/adinrec/adinrec.c @@ -55,6 +55,7 @@ opt_help(Jconf *jconf, char *arg[], int argnum) #ifdef HAVE_LIBFVAD fprintf(stderr, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively) (%d)\n", jconf->detect.fvad_mode); fprintf(stderr, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(stderr, " [-agc][-noagc] enable/disable additional AGC on WebRTC VAD\n"); #endif /* HAVE_LIBFVAD */ fprintf(stderr, " [-nostrip] not strip off zero samples\n"); fprintf(stderr, " [-zmean] remove DC by zero mean\n"); diff --git a/adintool/options.c b/adintool/options.c index 5f5a60c4..162ec15b 100644 --- a/adintool/options.c +++ b/adintool/options.c @@ -65,6 +65,7 @@ show_help_and_exit(Jconf *jconf, char *arg[], int argnum) #ifdef HAVE_LIBFVAD fprintf(stderr, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively, -1 to disable) (%d)\n", jconf->detect.fvad_mode); fprintf(stderr, " [-fvad_param i f] WebRTC VAD parameters 
(smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(stderr, " [-agc][-noagc] enable/disable additional AGC on WebRTC VAD\n"); #endif /* HAVE_LIBFVAD */ fprintf(stderr, " [-nostrip] do not strip zero samples\n"); diff --git a/doc/Options.md b/doc/Options.md index 81c172bc..016b8f3f 100644 --- a/doc/Options.md +++ b/doc/Options.md @@ -335,6 +335,10 @@ Enable WebRTC VAD (libfvad-based VAD) mode. Setting `mode` to 0, 1, 2 or 3 enab Set WebRTC VAD's detailed parameters. `nFrame` is the length of smoothing frame. `threshold` is the threshold to detect speech trigger [0.0-1.0]. Default values are 5 and 0.5, respectively. +### -agc, -noagc + +Enable / disable auto gain control (AGC). This function depends on WebRTC VAD module, so WebRTC VAD should always enabled when using AGC, for example, `-fvad 2 -agc`. This feature is highly experimental, not tested well so use with care. Default is disabled. (From 2019/4/20) + ## Input rejection options (category `GLOBAL`) Two simple front-end input rejection methods are implemented, based diff --git a/doc/VAD.md b/doc/VAD.md index 05575f14..03c04fbf 100644 --- a/doc/VAD.md +++ b/doc/VAD.md @@ -23,9 +23,10 @@ The block diagram of detection modules are as follows: ![Block diagram of VAD modules](image/vad-module.png) -All VAD detector is disabled by default for buffered processing. For stream -processing, the level and zero cross threshold detector is enabled by default. -Other detectors should be set up and enabled by options. +For stream processing, the level and zero cross threshold detector is enabled by default. +Other detectors like the WebRTC detector can be enabled by options. + +For buffered processing, all VAD detectors are disabled by default. You can enable the detectors for buffered input by specifying option [-cutsilence](https://github.com/julius-speech/julius/blob/master/doc/Options.md#-cutsilence--nocutsilence).
## Level and zero cross threshold detector @@ -80,6 +81,8 @@ and zero cross threshold detector and run WebRTC detector only, leave it enabled i.e. "`-lv 1`" to enforce the threshold detector to always pass through the input. +After 2019/4/20, you can also test an experimental AGC (auto gain control) feature. When `-agc` is specified together with the WebRTC VAD detector, an additional auto gain control will be activated together with the WebRTC VAD. This feature is highly experimental, not tested well, so use with care. + ## Static GMM based detector Gaussian mixture model (GMM) based speech detector. Requires the voice / noise diff --git a/libjulius/include/julius/jconf.h b/libjulius/include/julius/jconf.h index 5fbeefe4..4b853aa0 100644 --- a/libjulius/include/julius/jconf.h +++ b/libjulius/include/julius/jconf.h @@ -1011,6 +1011,11 @@ typedef struct __Jconf__ { * value is 0.5. */ float fvad_thres; + + /** + * (LIBFVAD) switch AGC + */ + boolean auto_gain_control_flag; #endif /* HAVE_LIBFVAD */ } detect; diff --git a/libjulius/src/default.c b/libjulius/src/default.c index a4949f1b..e8b54c26 100644 --- a/libjulius/src/default.c +++ b/libjulius/src/default.c @@ -87,6 +87,7 @@ jconf_set_default_values(Jconf *j) j->detect.fvad_mode = -1; j->detect.fvad_smoothnum = 5; j->detect.fvad_thres = 0.5; + j->detect.auto_gain_control_flag = FALSE; #endif /* HAVE_LIBFVAD */ j->preprocess.strip_zero_sample = TRUE; diff --git a/libjulius/src/m_info.c b/libjulius/src/m_info.c index 5ecaa2a6..f707b15b 100644 --- a/libjulius/src/m_info.c +++ b/libjulius/src/m_info.c @@ -630,7 +630,7 @@ print_engine_info(Recog *recog) #endif jlog("\t(-n)search candidate num= %d\n", r->config->pass2.nbest); jlog("\t(-s) search stack size = %d\n", r->config->pass2.stack_size); - jlog("\t(-m) search overflow = after %d hypothesis poped\n", r->config->pass2.hypo_overflow); + jlog("\t(-m) search overflow = after %d hypothesis popped\n", r->config->pass2.hypo_overflow); jlog("\t 2nd pass method = "); if 
(r->config->graph.enabled) { #ifdef GRAPHOUT_DYNAMIC @@ -954,6 +954,11 @@ print_engine_info(Recog *recog) jlog("\tWebRTC VAD operating mode = %d (0-3, larger value rejects noises aggressively)\n", jconf->detect.fvad_mode); jlog("\tWebRTC VAD smoothing len = %d (%dms)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_smoothnum * 10); jlog("\tWebRTC VAD active thres = %.2f\n", jconf->detect.fvad_thres); + if (jconf->detect.auto_gain_control_flag) { + jlog("\t Auto Gain Control = enabled\n"); + } else { + jlog("\t Auto Gain Control = disabled\n"); + } } #endif /* HAVE_LIBFVAD */ } else { diff --git a/libjulius/src/m_options.c b/libjulius/src/m_options.c index 8a4a45e2..12e0c3bd 100644 --- a/libjulius/src/m_options.c +++ b/libjulius/src/m_options.c @@ -1363,6 +1363,14 @@ opt_parse(int argc, char *argv[], char *cwd, Jconf *jconf) GET_TMPARG; jconf->detect.fvad_thres = (float)atof(tmparg); continue; + } else if (strmatch(argv[i],"-agc")) { /* enable agc */ + if (!check_section(jconf, argv[i], JCONF_OPT_GLOBAL)) return FALSE; + jconf->detect.auto_gain_control_flag = TRUE; + continue; + } else if (strmatch(argv[i],"-noagc")) { /* disable agc */ + if (!check_section(jconf, argv[i], JCONF_OPT_GLOBAL)) return FALSE; + jconf->detect.auto_gain_control_flag = FALSE; + continue; #endif /* HAVE_LIBFVAD */ } if (argv[i][0] == '-' && strlen(argv[i]) == 2) { diff --git a/libjulius/src/m_usage.c b/libjulius/src/m_usage.c index 798ab29d..be167eb1 100644 --- a/libjulius/src/m_usage.c +++ b/libjulius/src/m_usage.c @@ -143,6 +143,7 @@ j_output_argument_help(FILE *fp) #ifdef HAVE_LIBFVAD fprintf(fp, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively) (%d)\n", jconf->detect.fvad_mode); fprintf(fp, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); + fprintf(fp, " [-agc][-noagc] enable/disable additional AGC on WebRTC VAD\n"); #endif /* HAVE_LIBFVAD */ 
fprintf(fp, "\n GMM utterance verification:\n"); From bb1c350b24168b9e22e03d258b0ee96dd9f5d3d9 Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sat, 20 Apr 2019 17:46:08 +0900 Subject: [PATCH 07/12] not change trailing spaces in sources --- .editorconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.editorconfig b/.editorconfig index 0aea39b4..1164c0a5 100644 --- a/.editorconfig +++ b/.editorconfig @@ -4,5 +4,5 @@ root = true indent_style = tab indent_size = 2 tab_width = 8 -trim_trailing_whitespace = true +trim_trailing_whitespace = false insert_final_newline = true From a1d47652f7721927a0ab572f364c8c97b611cacc Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sun, 21 Apr 2019 16:08:29 +0900 Subject: [PATCH 08/12] update documents for AGC. --- adintool/README.md | 10 ++++++++-- doc/Options.md | 6 +++++- doc/VAD.md | 2 +- 3 files changed, 14 insertions(+), 4 deletions(-) diff --git a/adintool/README.md b/adintool/README.md index 94778509..544df242 100644 --- a/adintool/README.md +++ b/adintool/README.md @@ -22,7 +22,7 @@ GUI version: ## Description `adintool` analyzes speech input, detects speech segments skipping silence, and -records the detected segments in various ways. +records the detected segments in various ways. It accepts all Julius options. Input waveform: @@ -47,7 +47,7 @@ Output waveform / feature vector: - none This tool uses Julius's internal VAD module for speech detection. The detection -algorithm and parameters are the same as Julius. +algorithm and parameters are the same as Julius. It also accepts all Julius options. The default audio format is 16 bit, 1 channel in Microsoft WAV format. @@ -80,6 +80,12 @@ Record utterances one by one, into file "test0001.wav", "test0002.wav", ... % adintool -in mic -out file -filename test ``` +Use WebRTC-based VAD and experimental AGC. 
+ +```shell +% adintool -in mic -out file -filename test -fvad 3 -agc +``` + Record only one utterance into "test.wav" ```shell diff --git a/doc/Options.md b/doc/Options.md index 016b8f3f..160726a0 100644 --- a/doc/Options.md +++ b/doc/Options.md @@ -337,7 +337,11 @@ Set WebRTC VAD's detailed parameters. `nFrame` is the length of smoothing frame ### -agc, -noagc -Enable / disable auto gain control (AGC). This function depends on WebRTC VAD module, so WebRTC VAD should always enabled when using AGC, for example, `-fvad 2 -agc`. This feature is highly experimental, not tested well so use with care. Default is disabled. (From 2019/4/20) +Enable / disable supplemental auto gain control (AGC). This feature scales up captured audio automatically by looking at the input level and results of WebRTC VAD. This is soft AGC, applying no change to the hardware volume of the capture device. Requires WebRTC VAD to be enabled together, so use with `-fvad 2` or `-fvad 3`. This feature is highly experimental and not tested well. Default is disabled. (Added 2019/4/20) + +### -agc_param i1 p1 p2 p3 p4 p5 p6 + +Set AGC parameters. `i1` is a level threshold value to detect signal overflow, `p1` is the maximum allowed scale factor, `p2` is the maximum allowed scale, relative to first-utterance scale. `p3` is target level magnifying factor for first-utterance scaling, `p4` is the magnifying rate when moving the scale up, `p5` is the magnifying rate when moving scale down, `p6` is the special scaling down factor applied when detecting signal overflow. ## Input rejection options (category `GLOBAL`) diff --git a/doc/VAD.md b/doc/VAD.md index 03c04fbf..d1abd467 100644 --- a/doc/VAD.md +++ b/doc/VAD.md @@ -81,7 +81,7 @@ and zero cross threshold detector and run WebRTC detector only, leave it enabled i.e. "`-lv 1`" to enforce the threshold detector to always pass through the input. -After 2019/4/20, you can also test an experimental AGC (auto gain control) feature. 
When `-agc` is specified together with the WebRTC VAD detector, an additional auto gain control will be activated together with the WebRTC VAD. This feature is highly experimental, not tested well, so use with care. +After 2019/4/20, you can also test an experimental AGC (auto gain control) feature. When `-agc` is specified together with the WebRTC VAD detector, an additional auto gain control will be activated together with the WebRTC VAD. This is soft AGC that scales audio after capturing, so no change will be made to the hardware volume. This feature is highly experimental and not tested well. You can tune the parameters by `-agc_param` options. ## Static GMM based detector From c73055ccc233f440522b58423aa397acc7a6f9b0 Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sun, 21 Apr 2019 16:09:56 +0900 Subject: [PATCH 09/12] AGC parameters are jconf-nized. --- libjulius/include/julius/jconf.h | 17 +++++++-- libjulius/include/julius/recog.h | 1 + libjulius/src/adin-cut.c | 63 ++++++++++++++------------------ libjulius/src/default.c | 7 ++++ libjulius/src/m_info.c | 7 ++++ libjulius/src/m_options.c | 17 +++++++++ libjulius/src/m_usage.c | 2 + 7 files changed, 76 insertions(+), 38 deletions(-) diff --git a/libjulius/include/julius/jconf.h b/libjulius/include/julius/jconf.h index 4b853aa0..d7af0d92 100644 --- a/libjulius/include/julius/jconf.h +++ b/libjulius/include/julius/jconf.h @@ -995,18 +995,18 @@ typedef struct __Jconf__ { * aggressive, put focus on picking up all possible speech, likely * to accepting speech-like noise part. Larger value indicates * very aggressive detection, putting focus on accepting truly - * speech only part, aggressively dropping amiguous part. + * speech only part, aggressively dropping ambiguous part. */ int fvad_mode; /** * (LIBFVAD) number of frames for smoothing. Last N frames (where 1 - * frame is fixed to 10ms) value are averaged to get stational VAD. + * frame is fixed to 10ms) value are averaged to get stable VAD. 
*/ int fvad_smoothnum; /** - * (LIBFVAD) speech likelihood threshold value to finaly detect + * (LIBFVAD) speech likelihood threshold value to finally detect * speech trigger. Value should be between 0.0 and 1.0. Typical * value is 0.5. */ @@ -1016,6 +1016,17 @@ typedef struct __Jconf__ { * (LIBFVAD) switch AGC */ boolean auto_gain_control_flag; + + struct { + int overflow_thres; ///< signal overflow threshold + float scale_max; ///< maximum allowed scale (absolute) + float scale_max_relative_first; ///< maximum allowed scale (relative to first-utterance scale) + float level_factor_first; ///< target level threshold factor for first-utterance scaling + float scale_up_rate; ///< rate when moving scale up + float scale_down_rate; ///< rate when moving scale down + float scale_down_overflow_rate; ///< rate when moving scale down by overflow + } agc; + #endif /* HAVE_LIBFVAD */ } detect; diff --git a/libjulius/include/julius/recog.h b/libjulius/include/julius/recog.h index c1c54546..7e0603c2 100644 --- a/libjulius/include/julius/recog.h +++ b/libjulius/include/julius/recog.h @@ -454,6 +454,7 @@ typedef struct __adin__ { int *fvad_lastresult; ///< working buffer to hold last N results int fvad_lastp; ///< current pointer fot lastresult buffer float fvad_thres; ///< threshold to detect speech + boolean fvad_last_voice; ///< TRUE if last result was voice #endif /* HAVE_LIBFVAD */ } ADIn; diff --git a/libjulius/src/adin-cut.c b/libjulius/src/adin-cut.c index 955281af..1ff7b7d3 100644 --- a/libjulius/src/adin-cut.c +++ b/libjulius/src/adin-cut.c @@ -231,6 +231,7 @@ adin_setup_param(ADIn *adin, Jconf *jconf) adin->fvad_lastresult = (int *)mymalloc(sizeof(int) * adin->fvad_lastresultnum); for (i = 0; i < adin->fvad_lastresultnum; i++) adin->fvad_lastresult[i] = 0; adin->fvad_lastp = 0; + adin->fvad_last_voice = FALSE; } #endif /* HAVE_LIBFVAD */ @@ -261,14 +262,14 @@ adin_purge(ADIn *a, int from) #ifdef HAVE_LIBFVAD /* proceed libfvad detection: return 1 for speech part, 0 
for non-speech part */ -static int +static boolean fvad_proceed(ADIn *a, SP16 *speech, int samplenum) { int i, j, k; int ret, result; float sum; - if (a->fvad == NULL) return 1; + if (a->fvad == NULL) return TRUE; if (a->fvad_speechlen + samplenum > MAXSPEECHLEN) { /* buffer overflow */ @@ -296,9 +297,9 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) sum /= (float)a->fvad_lastresultnum; /* judge */ if (sum >= a->fvad_thres) - result = 1; + a->fvad_last_voice = TRUE; else - result = 0; + a->fvad_last_voice = FALSE; /* flush processed samples */ k = 0; @@ -308,27 +309,18 @@ fvad_proceed(ADIn *a, SP16 *speech, int samplenum) } a->fvad_speechlen = k; - return result; + return a->fvad_last_voice; } #endif /* HAVE_LIBFVAD */ #ifdef HAVE_LIBFVAD /* work area for auto gain control */ static int fvad_cont_count = 0; /* continuous count of status keep */ -static int fvad_last_result = 0; /* keeps last fvad result */ +static boolean fvad_last_result = FALSE; /* keeps last fvad result */ static int fvad_level_max = 0; /* maximum input level in cycle buffer */ static int fvad_first_time = 0; /* flag to detect the first speech */ static float fvad_first_rate; -/* defines for auto gain control */ -#define FVAD_AGC_CAP 30000 /* upper cap */ -#define FVAD_AGC_INC_COEF 1.5 /* scale increase coef */ -#define FVAD_AGC_DEC_COEF 0.6 /* scale decrease coef */ -#define FVAD_AGC_MAX_COEF 32.0 /* maximum scale */ -#define FVAD_AGC_DEC_OVERFLOW 0.6 /* scale decrease coef at overflow */ -#define FVAD_AGC_FIRST_BOOST_SCALE 3.0 /* additional scale for first adjustment */ -#define FVAD_AGC_UPDATE_MAX_RATE 3.0 /* maximum rate of current/first */ - /* change scale and update cycle buffer */ static int update_audio_scale(Recog *recog, float scale, int totallen) { @@ -441,7 +433,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco boolean transfer_online_local; /* local repository of transfer_online */ int zc; /* count of zero cross */ #ifdef HAVE_LIBFVAD - int 
fv; + boolean fv; #endif /* HAVE_LIBFVAD */ a = recog->adin; @@ -725,61 +717,62 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco } fvad_last_result = fv; - if (a->zc.level > FVAD_AGC_CAP && fvad_cont_count > a->c_length) { + if (a->zc.level > recog->jconf->detect.agc.overflow_thres && fvad_cont_count > a->c_length) { /* detect input overflow at last chunk, immediately reduce the scale under the cap */ - if (verbose_flag) jlog("STAT: AGC: too loud (>%d)\n", FVAD_AGC_CAP); - zc = update_audio_scale(recog, (float)recog->adin->level_coef * FVAD_AGC_DEC_OVERFLOW, total_processed_len); + if (verbose_flag) jlog("STAT: AGC: too loud (>%d)\n", recog->jconf->detect.agc.overflow_thres); + zc = update_audio_scale(recog, (float)recog->adin->level_coef * recog->jconf->detect.agc.scale_down_overflow_rate, total_processed_len); /* update max after scaling */ - fvad_level_max *= FVAD_AGC_DEC_OVERFLOW; + fvad_level_max *= recog->jconf->detect.agc.scale_down_overflow_rate; /* does not reset detection, continues */ } - if (fv == 1 && fvad_cont_count > a->c_length) { + if (fv == TRUE && fvad_cont_count > a->c_length) { /* voice segment of a certain length found */ if (fvad_first_time == 0) { fvad_first_time = 1; /* this is first time: if amplitude is below level threshold, immediately raise the scale to go over the threshold */ - scale = FVAD_AGC_FIRST_BOOST_SCALE * a->thres / fvad_level_max; + scale = recog->jconf->detect.agc.level_factor_first * a->thres / fvad_level_max; if (scale > 1.0f) { /* set new scale */ if (verbose_flag) jlog("STAT: AGC: first speech segment, force adjustment\n"); - if (scale > FVAD_AGC_MAX_COEF) scale = FVAD_AGC_MAX_COEF; + if (scale > recog->jconf->detect.agc.scale_max) scale = recog->jconf->detect.agc.scale_max; fvad_first_rate = scale; zc = update_audio_scale(recog, scale, total_processed_len); /* update max after scaling */ if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; } - /* reset detection */ - 
fvad_cont_count = 0; } else if (fvad_level_max < a->thres) { /* too low amplitude of the voice part, increase scale gradually */ - if (fvad_first_time == 1 && recog->adin->level_coef >= fvad_first_rate * FVAD_AGC_UPDATE_MAX_RATE) { + if (fvad_first_time == 1 && recog->adin->level_coef >= fvad_first_rate * recog->jconf->detect.agc.scale_max_relative_first) { fvad_cont_count = 0; } else { - scale = recog->adin->level_coef * FVAD_AGC_INC_COEF; - if (scale > FVAD_AGC_MAX_COEF) scale = FVAD_AGC_MAX_COEF; - if (fvad_first_time == 1 && scale > fvad_first_rate * FVAD_AGC_UPDATE_MAX_RATE) { - scale = fvad_first_rate * FVAD_AGC_UPDATE_MAX_RATE; + scale = recog->adin->level_coef * recog->jconf->detect.agc.scale_up_rate; + if (scale > recog->jconf->detect.agc.scale_max) scale = recog->jconf->detect.agc.scale_max; + if (fvad_first_time == 1 && scale > fvad_first_rate * recog->jconf->detect.agc.scale_max_relative_first) { + scale = fvad_first_rate * recog->jconf->detect.agc.scale_max_relative_first; } zc = update_audio_scale(recog, scale, total_processed_len); /* update max after scaling */ if (fvad_level_max < a->zc.level) fvad_level_max = a->zc.level; - /* does not reset detection, continues */ } } + /* reset detection */ + fvad_cont_count = 0; } - if (fv == 0 && fvad_cont_count > a->c_length) { + if (fv == FALSE && fvad_cont_count > a->c_length) { /* noise segment of a certain length found */ if (fvad_level_max > a->thres) { /* mis-detecting long noise as speech, decrease scale gradually */ - scale = recog->adin->level_coef * FVAD_AGC_DEC_COEF; + scale = recog->adin->level_coef * recog->jconf->detect.agc.scale_down_rate; if (scale <= 0.0) { if (verbose_flag) jlog("STAT: AGC: too small scale %f, ignored\n", scale); } else { zc = update_audio_scale(recog, scale, total_processed_len); /* update max after scaling */ - fvad_level_max *= FVAD_AGC_DEC_COEF; + fvad_level_max *= recog->jconf->detect.agc.scale_down_rate; } } + /* reset detection */ + fvad_cont_count = 0; } } #endif /* 
HAVE_LIBFVAD */ @@ -787,7 +780,7 @@ adin_cut(int (*ad_process)(SP16 *, int, Recog *), int (*ad_check)(Recog *), Reco #ifdef HAVE_LIBFVAD /* trigger when both libfvad and julius VAD are triggered */ /* process input in libfvad and get VAD result */ - fv == 1 && + fv == TRUE && #endif /* HAVE_LIBFVAD */ zc > a->noise_zerocross) { /* now triggering */ diff --git a/libjulius/src/default.c b/libjulius/src/default.c index e8b54c26..46810f1b 100644 --- a/libjulius/src/default.c +++ b/libjulius/src/default.c @@ -88,6 +88,13 @@ jconf_set_default_values(Jconf *j) j->detect.fvad_smoothnum = 5; j->detect.fvad_thres = 0.5; j->detect.auto_gain_control_flag = FALSE; + j->detect.agc.overflow_thres = 30000; + j->detect.agc.scale_max = 25.0; + j->detect.agc.scale_max_relative_first = 3.0; + j->detect.agc.level_factor_first = 3.0; + j->detect.agc.scale_up_rate = 1.3; + j->detect.agc.scale_down_rate = 0.8; + j->detect.agc.scale_down_overflow_rate = 0.7; #endif /* HAVE_LIBFVAD */ j->preprocess.strip_zero_sample = TRUE; diff --git a/libjulius/src/m_info.c b/libjulius/src/m_info.c index f707b15b..ac262992 100644 --- a/libjulius/src/m_info.c +++ b/libjulius/src/m_info.c @@ -956,6 +956,13 @@ print_engine_info(Recog *recog) jlog("\tWebRTC VAD active thres = %.2f\n", jconf->detect.fvad_thres); if (jconf->detect.auto_gain_control_flag) { jlog("\t Auto Gain Control = enabled\n"); + jlog("\t AGC signal overflow threshold = %d\n", jconf->detect.agc.overflow_thres); + jlog("\t AGC maximum allowed scale = %.2f\n", jconf->detect.agc.scale_max); + jlog("\t AGC maximum allowed scale, relative to first-utterance scale) = %.2f\n", jconf->detect.agc.scale_max_relative_first); + jlog("\t AGC target level threshold factor for first-utterance scaling = %.2f\n", jconf->detect.agc.level_factor_first); + jlog("\t AGC rate when moving scale up = %.2f\n", jconf->detect.agc.scale_up_rate); + jlog("\t AGC rate when moving scale down = %.2f\n", jconf->detect.agc.scale_down_rate); + jlog("\t AGC rate when moving 
scale down at overflow = %.2f\n", jconf->detect.agc.scale_down_overflow_rate); } else { jlog("\t Auto Gain Control = disabled\n"); } diff --git a/libjulius/src/m_options.c b/libjulius/src/m_options.c index 12e0c3bd..fe6a3150 100644 --- a/libjulius/src/m_options.c +++ b/libjulius/src/m_options.c @@ -1371,6 +1371,23 @@ opt_parse(int argc, char *argv[], char *cwd, Jconf *jconf) if (!check_section(jconf, argv[i], JCONF_OPT_GLOBAL)) return FALSE; jconf->detect.auto_gain_control_flag = FALSE; continue; + } else if (strmatch(argv[i],"-agc_param")) { /* set agc parameters */ + if (!check_section(jconf, argv[i], JCONF_OPT_GLOBAL)) return FALSE; + GET_TMPARG; + jconf->detect.agc.overflow_thres = atoi(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_max = (float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_max_relative_first = (float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.level_factor_first = (float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_up_rate = (float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_down_rate = (float)atof(tmparg); + GET_TMPARG; + jconf->detect.agc.scale_down_overflow_rate = (float)atof(tmparg); + continue; #endif /* HAVE_LIBFVAD */ } if (argv[i][0] == '-' && strlen(argv[i]) == 2) { diff --git a/libjulius/src/m_usage.c b/libjulius/src/m_usage.c index be167eb1..59b2dd6c 100644 --- a/libjulius/src/m_usage.c +++ b/libjulius/src/m_usage.c @@ -144,6 +144,8 @@ j_output_argument_help(FILE *fp) fprintf(fp, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively) (%d)\n", jconf->detect.fvad_mode); fprintf(fp, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); fprintf(fp, " [-agc][-noagc] enable/disable additional AGC on WebRTC VAD\n"); + fprintf(fp, " [-agc_param p1 ... 
p7] AGC parameters (%d %.2f %.2f %.2f %.2f %.2f %.2f)\n", jconf->detect.agc.overflow_thres , jconf->detect.agc.scale_max, jconf->detect.agc.scale_max_relative_first, jconf->detect.agc.level_factor_first, jconf->detect.agc.scale_up_rate, jconf->detect.agc.scale_down_rate, jconf->detect.agc.scale_down_overflow_rate); + #endif /* HAVE_LIBFVAD */ fprintf(fp, "\n GMM utterance verification:\n"); From d1bc3bc6f58624e63a0d2a0ac3f92346f8c9a57d Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sun, 21 Apr 2019 16:10:56 +0900 Subject: [PATCH 10/12] Added options for agc parameters --- adinrec/adinrec.c | 1 + adintool/options.c | 1 + 2 files changed, 2 insertions(+) diff --git a/adinrec/adinrec.c b/adinrec/adinrec.c index cc7e885c..f4f452ca 100644 --- a/adinrec/adinrec.c +++ b/adinrec/adinrec.c @@ -56,6 +56,7 @@ opt_help(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively) (%d)\n", jconf->detect.fvad_mode); fprintf(stderr, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); fprintf(stderr, " [-agc][-noagc] enable/disable additional AGC on WebRTC VAD\n"); + fprintf(stderr, " [-agc_param p1 ... 
p7] AGC parameters (%d %.2f %.2f %.2f %.2f %.2f %.2f)\n", jconf->detect.agc.overflow_thres , jconf->detect.agc.scale_max, jconf->detect.agc.scale_max_relative_first, jconf->detect.agc.level_factor_first, jconf->detect.agc.scale_up_rate, jconf->detect.agc.scale_down_rate, jconf->detect.agc.scale_down_overflow_rate); #endif /* HAVE_LIBFVAD */ fprintf(stderr, " [-nostrip] not strip off zero samples\n"); fprintf(stderr, " [-zmean] remove DC by zero mean\n"); diff --git a/adintool/options.c b/adintool/options.c index 162ec15b..3d96dbe5 100644 --- a/adintool/options.c +++ b/adintool/options.c @@ -66,6 +66,7 @@ show_help_and_exit(Jconf *jconf, char *arg[], int argnum) fprintf(stderr, " [-fvad mode] enable WebRTC VAD (0-3, larger value rejects noises aggressively, -1 to disable) (%d)\n", jconf->detect.fvad_mode); fprintf(stderr, " [-fvad_param i f] WebRTC VAD parameters (smoothing duration (frames), thres([0-1])) (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres); fprintf(stderr, " [-agc][-noagc] enable/disable additional AGC on WebRTC VAD\n"); + fprintf(stderr, " [-agc_param p1 ... p7] AGC parameters (%d %.2f %.2f %.2f %.2f %.2f %.2f)\n", jconf->detect.agc.overflow_thres , jconf->detect.agc.scale_max, jconf->detect.agc.scale_max_relative_first, jconf->detect.agc.level_factor_first, jconf->detect.agc.scale_up_rate, jconf->detect.agc.scale_down_rate, jconf->detect.agc.scale_down_overflow_rate); #endif /* HAVE_LIBFVAD */ fprintf(stderr, " [-nostrip] do not strip zero samples\n"); From 481a3bd778d4e6acabe30d8c859d01919a0ac538 Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sun, 21 Apr 2019 16:12:34 +0900 Subject: [PATCH 11/12] Support drawing part of "non-detected but WebRTC-detected voice" by green at adintool-gui. 
--- adintool/adintool.h | 5 +++++ adintool/mainloop.c | 46 +++++++++++++++++++++++++++++++++++++++------ 2 files changed, 45 insertions(+), 6 deletions(-) diff --git a/adintool/adintool.h b/adintool/adintool.h index 2ce31d73..8cf67d6c 100644 --- a/adintool/adintool.h +++ b/adintool/adintool.h @@ -51,6 +51,11 @@ enum{SPOUT_NONE, SPOUT_FILE, SPOUT_STDOUT, SPOUT_ADINNET, SPOUT_VECTORNET}; #define WAVE_TICK_FLAG_PROCESSED 0x01 // audio tick flag: set to indicate that an input segment was triggered down #define WAVE_TICK_FLAG_TRIGGER 0x02 +#ifdef HAVE_LIBFVAD +// audio tick flag: set to indicate that an input segment was determined as voice by fvad +#define WAVE_TICK_FLAG_FVAD_VOICED 0x04 +#endif /* HAVE_LIBFVAD */ + #ifdef AUTO_ADJUST_THRESHOLD // mean / var computing window length in seconds diff --git a/adintool/mainloop.c b/adintool/mainloop.c index 4ed367fb..ba029d5f 100644 --- a/adintool/mainloop.c +++ b/adintool/mainloop.c @@ -954,6 +954,11 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) if (recog->adin->is_valid_data == TRUE) { s->flag[s->bp] |= WAVE_TICK_FLAG_PROCESSED; } +#ifdef HAVE_LIBFVAD + if (recog->adin->fvad_last_voice == TRUE) { + s->flag[s->bp] |= WAVE_TICK_FLAG_FVAD_VOICED; + } +#endif /* HAVE_LIBFVAD */ s->is_valid_flag = (recog->adin->is_valid_data == TRUE) ? 
1 : 0; #ifdef AUTO_ADJUST_THRESHOLD @@ -1163,19 +1168,32 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) j++; if (j >= s->items) j -= s->items; } - m = s->rectflags[0] & WAVE_TICK_FLAG_PROCESSED; + + short process_flag = WAVE_TICK_FLAG_PROCESSED; +#ifdef HAVE_LIBFVAD + process_flag |= WAVE_TICK_FLAG_FVAD_VOICED; +#endif + m = s->rectflags[0] & process_flag; k = 0; miny = viewport.h; startx = 0; for(i = 0; i < s->items; i++) { - if ((s->rectflags[i] & WAVE_TICK_FLAG_PROCESSED) != m) { - SDL_SetRenderDrawColor(s->renderer, 255 * m, 128, 255 - 128 * m, 255); + if ((s->rectflags[i] & process_flag) != m) { + if (m & WAVE_TICK_FLAG_PROCESSED) { + SDL_SetRenderDrawColor(s->renderer, 255, 128, 128, 255); +#ifdef HAVE_LIBFVAD + } else if (m & WAVE_TICK_FLAG_FVAD_VOICED) { + SDL_SetRenderDrawColor(s->renderer, 128, 255, 128, 255); +#endif + } else { + SDL_SetRenderDrawColor(s->renderer, 0, 128, 255, 255); + } SDL_RenderFillRects(s->renderer, &(s->rects[k]), i - k); - m = s->rectflags[i]; - if ((s->rectflags[i] & WAVE_TICK_FLAG_PROCESSED) != 0) { + if ((m & WAVE_TICK_FLAG_PROCESSED) == 0 && (s->rectflags[i] & WAVE_TICK_FLAG_PROCESSED) != 0) { startx = i; miny = viewport.h; } + m = s->rectflags[i] & process_flag; k = i; } if ((s->rectflags[i] & WAVE_TICK_FLAG_TRIGGER) != 0) { @@ -1193,9 +1211,25 @@ static void draw_wave(Recog *recog, SP16 *now, int len, void *data) if (miny > viewport.h - (s->rects[i].y + s->rects[i].h)) miny = viewport.h - (s->rects[i].y + s->rects[i].h); } - SDL_SetRenderDrawColor(s->renderer, 255 * m, 128, 255 - 128 * m, 255); + if (m & WAVE_TICK_FLAG_PROCESSED) { + SDL_SetRenderDrawColor(s->renderer, 255, 128, 128, 255); + } else if (m & WAVE_TICK_FLAG_FVAD_VOICED) { + SDL_SetRenderDrawColor(s->renderer, 128, 255, 128, 255); + } else { + SDL_SetRenderDrawColor(s->renderer, 0, 128, 255, 255); + } SDL_RenderFillRects(s->renderer, &(s->rects[k]), s->items - k); +#ifdef HAVE_LIBFVAD + /* draw current scale at last */ + r.w = 
WAVE_TICK_WIDTH; + r.h = recog->adin->level_coef * viewport.h * 0.025; + r.x = viewport.w - r.w; + r.y = viewport.h * 0.5 - r.h; + SDL_SetRenderDrawColor(s->renderer, 255, 0, 0, 255); + SDL_RenderDrawRect(s->renderer, &r); +#endif /* HAVE_LIBFVAD */ + #ifdef AUTO_ADJUST_THRESHOLD /* draw last mean/var box */ From 7685726e4ddabc6aa7c58f951114100aae08dcfe Mon Sep 17 00:00:00 2001 From: Akinobu Lee Date: Sun, 21 Apr 2019 16:13:27 +0900 Subject: [PATCH 12/12] Typo fixes --- libjulius/src/m_chkparam.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libjulius/src/m_chkparam.c b/libjulius/src/m_chkparam.c index e63c1ed5..85d587a8 100644 --- a/libjulius/src/m_chkparam.c +++ b/libjulius/src/m_chkparam.c @@ -350,22 +350,22 @@ j_jconf_finalize(Jconf *jconf) /* check parameter to be passed to libfvad */ if (jconf->detect.fvad_mode < -1) jconf->detect.fvad_mode = -1; if (jconf->detect.fvad_mode > 3) { - jlog("WARNING: m_chkparam: invalud value for \"-fvad\": %d\n", jconf->detect.fvad_mode); + jlog("WARNING: m_chkparam: invalid value for \"-fvad\": %d\n", jconf->detect.fvad_mode); jlog("WARNING: m_chkparam: maximum aggressiveness is 3, use 3 now\n"); jconf->detect.fvad_mode = 3; } if (jconf->detect.fvad_smoothnum < 1) { - jlog("WARNING: m_chkparam: invalud value for 1st arg of \"-fvad_param\": %d\n", jconf->detect.fvad_smoothnum); + jlog("WARNING: m_chkparam: invalid value for 1st arg of \"-fvad_param\": %d\n", jconf->detect.fvad_smoothnum); jlog("WARNING: m_chkparam: num should be >= 1, use 1 now\n"); jconf->detect.fvad_smoothnum = 1; } if (jconf->detect.fvad_thres < 0.0) { - jlog("WARNING: m_chkparam: invalud value for 2nd arg of \"-fvad_param\": %f\n", jconf->detect.fvad_thres); + jlog("WARNING: m_chkparam: invalid value for 2nd arg of \"-fvad_param\": %f\n", jconf->detect.fvad_thres); jlog("WARNING: m_chkparam: thres should be >= 0.0, use 0.0 now\n"); jconf->detect.fvad_thres = 0.0; } if (jconf->detect.fvad_thres > 1.0) { - jlog("WARNING: 
m_chkparam: invalud value for 2nd arg of \"-fvad_param\": %f\n", jconf->detect.fvad_thres); + jlog("WARNING: m_chkparam: invalid value for 2nd arg of \"-fvad_param\": %f\n", jconf->detect.fvad_thres); jlog("WARNING: m_chkparam: thres should be <= 1.0, use 1.0 now\n"); jconf->detect.fvad_thres = 1.0; }