julius-speech · LeeAkinobu · Apr 19, 2019 · Apr 20, 2019 · Apr 20, 2019 · Apr 20, 2019
diff --git a/.editorconfig b/.editorconfig
@@ -4,5 +4,5 @@ root = true
 indent_style = tab
 indent_size = 2
 tab_width = 8
-trim_trailing_whitespace = true
+trim_trailing_whitespace = false
 insert_final_newline = true
diff --git a/Sample.jconf b/Sample.jconf
@@ -96,14 +96,16 @@
 #-rejectlong -1			# reject longer input (msec) -1 to disable
 
 ####
-#### Speech detection by libfvad
+#### Speech detection by WebRTC VAD (libfvad)
 ####
-#-fvad -1           # disable libfvad
-#-fvad 0            # enable on mode 0 (least aggressive to filtering out non-speech)
-#-fvad 1            # enable on mode 1 (moderately aggressive to filtering out non-speech)
-#-fvad 2            # enable on mode 2 (aggressive to filtering out non-speech)
-#-fvad 3            # enable on mode 3 (very aggressive to filtering out non-speech)
-#-fvad_param 5 0.5  # optinal parameter: smoothing frames, trigger threshold
+#-fvad -1           # disable WebRTC VAD
+#-fvad 0            # enable WebRTC VAD on mode 0 (least aggressive in filtering out non-speech)
+#-fvad 1            # enable WebRTC VAD on mode 1 (moderately aggressive in filtering out non-speech)
+#-fvad 2            # enable WebRTC VAD on mode 2 (aggressive in filtering out non-speech)
+#-fvad 3            # enable WebRTC VAD on mode 3 (very aggressive in filtering out non-speech)
+#-fvad_param 5 0.5  # optional parameter: smoothing frames, trigger threshold
+#-agc               # enable auto gain control.  Should be specified with -fvad. 
+#-noagc             # disable auto gain control.
 
 ####
 #### Input rejection by average power (EXPERIMENTAL)
@@ -117,7 +119,7 @@
 ####
 #### Gaussian Mixture Model
 ####
-####  GMM will be used for input rejection by accumurated score, or
+####  GMM will be used for input rejection by accumulated score, or
 ####  for GMM-based frontend VAD when "--enable-gmm-vad" specified.
 ####
 ####  NOTE: If you use MFCC for the GMM which is different from AM, you
@@ -188,13 +190,13 @@
 
 ## Create a new AM configuration set, and switch current to it.
 ## You should give a unique name.
-#-AM name			
+#-AM name
 
 ## Create a new LM configuration set, and switch current to it.
 ## You should give a unique name.
-#-LM name			
+#-LM name
 
-## Create a new Search configuration set with AM and LM, and switch 
+## Create a new Search configuration set with AM and LM, and switch
 ## current to it.  AM and LM name can be either name or ID number.
 #-SR name am_name_or_id lm_name_or_id
 
@@ -208,7 +210,7 @@
 ## This option is only a switcher and can be used anywhere anytime.
 # -GLOBAL
 
-## This option disables the strict section checkings and back to 4.0
+## This option disables the strict section checks and reverts to the 4.0 behavior
 # -nosectioncheck
 
 ######################################################################
@@ -231,7 +233,7 @@
 #-mapunk "<unk>"		# word to which unknown words should be mapped
 #-iwspword			# add a pause word to the dictionary
 #-iwspentry "<UNK> [sp] sp sp"	# word that will be added by "-iwspword"
-#-sepnum 150			# num of high freq words to linearize 
+#-sepnum 150			# num of high freq words to linearize
 #-adddict dictfile              # append additional word dictionary
 #-addword entry                 # append additional word entry
 
@@ -271,7 +273,7 @@
 #### the AM defines the required parameter.  You can use different MFCC
 #### type for each AM.
 #### For GMM, the same parameter should be specified after "-AM_GMM"
-#### 
+####
 #### When using multiple AM,  the values of "-smpPeriod", "-smpFreq",
 #### "-fsize" and "-fshift" should be the same among all AM.
 ####
@@ -332,7 +334,7 @@
 #-dnnconf file			# DNN configuration file
 
 ## Others
-#-htkconf configfile		# load analysis settings from HTK Config file 
+#-htkconf configfile		# load analysis settings from HTK Config file
 
 ######################################################################
 #### RECOGNIZER (-SR)
@@ -341,7 +343,7 @@
 #### Default values for beam width and LM weights will change
 #### according to compile-time setup of JuliusLib and model specification.
 #### Please see the startup log for the actual values.
-#### 
+####
 
 ####
 #### parameter (common)
@@ -387,34 +389,34 @@
 #-spdur 10			# # of frames to detect a short pause
 #-pausemodels string		# comma-separated pause model names
 #### for decoder-VAD
-#-spmargin 40			# backstep margin at trigger up (frame)
+#-spmargin 40			# back-step margin at trigger up (frame)
 #-spdelay 4			# decision delay at trigger up (frame)
 
-#### 
+####
 #### lattice output
-#### 
+####
 #-lattice			# output result in word graph (aka -graphout)
 #-graphrange 0			# merge same words nearby, -1 to disable merge
 #-graphcut 80			# graph depth cut threshold (in depth)
-#-graphboundloop 20		# max itertations for boundary adjustment loop
-#-graphsearchdelay		# activate an alternate generation algorithm 
+#-graphboundloop 20		# max iterations for boundary adjustment loop
+#-graphsearchdelay		# activate an alternate generation algorithm
 #-nographsearchdelay		# disable "-graphsearchdelay"
 
-#### 
+####
 #### confusion network output
-#### 
+####
 #-confnet			# enable confusion network output
 #-noconfnet			# disable confusion network output
 
-#### 
+####
 #### multi-grammar output (for grammar and isolated word)
-#### 
+####
 #-multigramout			# output max hypo for each grammar
 #-nomultigramout		# disable "-multigramout"
 
-#### 
+####
 #### forced alignment
-#### 
+####
 #-walign			# enable alignment for result at word level
 #-palign			# enable alignment for result at phoneme level
 #-salign			# enable alignment for result at state level

diff --git a/adinrec/adinrec.c b/adinrec/adinrec.c
@@ -1,19 +1,19 @@
 /**
  * @file   adinrec.c
- * 
+ *
  * <JA>
  * @brief  マイクから一発話をファイルへ記録する
  * </JA>
- * 
+ *
  * <EN>
  * @brief  Record a speech segment from microphone to a file
  * </EN>
- * 
+ *
  * @author Akinobu LEE
  * @date   Wed Mar 23 20:33:01 2005
  *
  * $Revision: 1.13 $
- * 
+ *
  */
 /*
  * Copyright (c) 1991-2013 Kawahara Lab., Kyoto University
@@ -35,7 +35,7 @@ static char *filename = NULL;	///< Output file name
 static boolean stout = FALSE;	///< True if output to stdout
 static boolean use_raw = FALSE;	///< Output in RAW format if TRUE
 
-/** 
+/**
  * <JA>ヘルプを表示して終了する</JA>
  * <EN>Print help and exit</EN>
  */
@@ -53,8 +53,10 @@ opt_help(Jconf *jconf, char *arg[], int argnum)
   fprintf(stderr, "    [-tailmargin msec]    tail margin length          (%d)\n", jconf->detect.tail_margin_msec);
   fprintf(stderr, "    [-chunksize sample]   chunk size for processing   (%d)\n", jconf->detect.chunk_size);
 #ifdef HAVE_LIBFVAD
-  fprintf(stderr, "    [-fvad]               FVAD sw (-1=off, 0 - 3)     (%d)\n", jconf->detect.fvad_mode);
-  fprintf(stderr, "    [-fvad_param i f]     FVAD parameter (dur/thres)  (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres);
+  fprintf(stderr, "    [-fvad mode]          enable WebRTC VAD (0-3, larger value rejects noises aggressively) (%d)\n", jconf->detect.fvad_mode);
+  fprintf(stderr, "    [-fvad_param i f]     WebRTC VAD parameters (smoothing duration (frames), thres([0-1]))  (%d %.2f)\n", jconf->detect.fvad_smoothnum, jconf->detect.fvad_thres);
+  fprintf(stderr, "    [-agc][-noagc]        enable/disable additional AGC on WebRTC VAD\n");
+  fprintf(stderr, "    [-agc_param p1 ... p7]  AGC parameters   (%d %.2f %.2f %.2f %.2f %.2f %.2f)\n", jconf->detect.agc.overflow_thres , jconf->detect.agc.scale_max, jconf->detect.agc.scale_max_relative_first, jconf->detect.agc.level_factor_first, jconf->detect.agc.scale_up_rate, jconf->detect.agc.scale_down_rate, jconf->detect.agc.scale_down_overflow_rate);
 #endif /* HAVE_LIBFVAD */
   fprintf(stderr, "    [-nostrip]            not strip off zero samples\n");
   fprintf(stderr, "    [-zmean]              remove DC by zero mean\n");
@@ -83,21 +85,21 @@ opt_freq(Jconf *jconf, char *arg[], int argnum)
   return TRUE;
 }
 
-/** 
+/**
  * <JA>
  * 録音されたサンプル列を処理するコールバック関数
- * 
+ *
  * @param now [in] 録音されたサンプル列
  * @param len [in] 長さ（サンプル数）
- * 
+ *
  * @return エラー時 -1，処理成功時 0，処理成功＋区間終端検出時 1 を返す．
  * </JA>
  * <EN>
  * Callback handler of recorded sample fragments
- * 
+ *
  * @param now [in] recorded fragments of speech sample
  * @param len [in] length of above in samples
- * 
+ *
  * @return -1 on device error (require caller to exit and terminate input),
  * 0 on success (allow caller to continue),
  * 1 on succeeded but segmentation detected (require caller to exit but
@@ -155,11 +157,11 @@ adin_callback_file(SP16 *now, int len, Recog *recog)
       return -1;
     }
   }
-  
+
   speechlen += len;
-  
+
   /* progress bar in dots */
-  fprintf(stderr, ".");		
+  fprintf(stderr, ".");
   return(0);
 }
 
@@ -182,7 +184,7 @@ close_file()
     }
   }
   fprintf(stderr, "\n%d samples (%d bytes, %.2f sec.) recorded\n", speechlen, size, (float)speechlen / (float)sfreq);
-}  
+}
 
 /* Interrupt signal handling */
 static void
@@ -196,21 +198,21 @@ interrupt_record(int signum)
 }
 
 
-/** 
+/**
  * <JA>
  * メイン関数
- * 
+ *
  * @param argc [in] 引数列の長さ
  * @param argv [in] 引数列
- * 
- * @return 
+ *
+ * @return
  * </JA>エラー時 1，通常終了時 0 を返す．
  * <EN>
  * Main function.
- * 
+ *
  * @param argc [in] number of argument.
  * @param argv [in] array of arguments.
- * 
+ *
  * @return 1 on error, 0 on success.
  * </EN>
  */
@@ -266,7 +268,7 @@ main(int argc, char *argv[])
 
   /* set Julius default parameters for unspecified acoustic parameters */
   apply_para(&(jconf->am_root->analysis.para), &(jconf->am_root->analysis.para_default));
-  
+
   /* set some values */
   jconf->input.sfreq = jconf->am_root->analysis.para.smp_freq;
   jconf->input.period = jconf->am_root->analysis.para.smp_period;

diff --git a/adintool/README.md b/adintool/README.md
@@ -22,7 +22,7 @@ GUI version:
 ## Description
 
 `adintool` analyzes speech input, detects speech segments skipping silence, and
-records the detected segments in various ways.
+records the detected segments in various ways.  It accepts all Julius options.
 
 Input waveform:
 
@@ -47,7 +47,7 @@ Output waveform / feature vector:
 - none
 
 This tool uses Julius's internal VAD module for speech detection. The detection
-algorithm and parameters are the same as Julius.
+algorithm and parameters are the same as Julius.  It also accepts all Julius options.
 
 The default audio format is 16 bit, 1 channel in Microsoft WAV format.
 
@@ -80,6 +80,12 @@ Record utterances one by one, into file "test0001.wav", "test0002.wav", ...
 % adintool -in mic -out file -filename test
 ```
 
+Use WebRTC-based VAD and experimental AGC.
+
+```shell
+% adintool -in mic -out file -filename test -fvad 3 -agc
+```
+
 Record only one utterance into "test.wav"
 
 ```shell

diff --git a/adintool/adintool.h b/adintool/adintool.h
@@ -51,6 +51,11 @@ enum{SPOUT_NONE, SPOUT_FILE, SPOUT_STDOUT, SPOUT_ADINNET, SPOUT_VECTORNET};
 #define WAVE_TICK_FLAG_PROCESSED 0x01
 // audio tick flag: set to indicate that an input segment was triggered down
 #define WAVE_TICK_FLAG_TRIGGER 0x02
+#ifdef HAVE_LIBFVAD
+// audio tick flag: set to indicate that an input segment was determined as voice by fvad
+#define WAVE_TICK_FLAG_FVAD_VOICED 0x04
+#endif /* HAVE_LIBFVAD */
+
 
 #ifdef AUTO_ADJUST_THRESHOLD
 // mean / var computing window length in seconds