From 19c7937c54ba074f0e6915d926ccdb60c046a37e Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Sat, 29 May 2021 10:41:51 +0800 Subject: [PATCH 1/2] failed nbest 4-gram --- .../simple_v1/mmi_att_transformer_decode.py | 16 +++++++--- egs/librispeech/asr/simple_v1/run.sh | 31 +++++++++++++++++-- snowfall/models/conformer.py | 28 ++++++++++++----- 3 files changed, 62 insertions(+), 13 deletions(-) diff --git a/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py b/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py index 89d5a506..39f1aba3 100755 --- a/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py +++ b/egs/librispeech/asr/simple_v1/mmi_att_transformer_decode.py @@ -220,6 +220,11 @@ def get_parser(): 'If it is negative, then rescore with the whole lattice.'\ 'CAUTION: You have to reduce max_duration in case of CUDA OOM' ) + parser.add_argument( + '--output-dir', + type=str, + default='exp/', + help='output dir for err and recog text') return parser @@ -285,7 +290,8 @@ def main(): num_classes=len(phone_ids) + 1, # +1 for the blank symbol subsampling_factor=4, num_decoder_layers=num_decoder_layers, - vgg_frontend=True) + vgg_frontend=True, + is_espnet_structure=True) elif model_type == "contextnet": model = ContextNet( num_features=80, @@ -378,7 +384,8 @@ def main(): # load dataset librispeech = LibriSpeechAsrDataModule(args) - test_sets = ['test-clean', 'test-other'] + # test_sets = ['test-clean', 'test-other'] + test_sets = ['test-clean'] # test_sets = ['test-other'] for test_set, test_dl in zip(test_sets, librispeech.test_dataloaders()): logging.info(f'* DECODING: {test_set}') @@ -393,13 +400,14 @@ def main(): use_whole_lattice=use_whole_lattice, output_beam_size=output_beam_size) - recog_path = exp_dir / f'recogs-{test_set}.txt' + output_dir = Path(args.output_dir) + recog_path = output_dir / f'recogs-{test_set}.txt' store_transcripts(path=recog_path, texts=results) logging.info(f'The transcripts are stored in {recog_path}') # The following prints out WERs, per-word error statistics and aligned # ref/hyp pairs. - errs_filename = exp_dir / f'errs-{test_set}.txt' + errs_filename = output_dir / f'errs-{test_set}.txt' with open(errs_filename, 'w') as f: write_error_stats(f, test_set, results) logging.info('Wrote detailed error stats to {}'.format(errs_filename)) diff --git a/egs/librispeech/asr/simple_v1/run.sh b/egs/librispeech/asr/simple_v1/run.sh index 17127046..23b2c114 100755 --- a/egs/librispeech/asr/simple_v1/run.sh +++ b/egs/librispeech/asr/simple_v1/run.sh @@ -7,7 +7,7 @@ set -eou pipefail -stage=0 +stage=7 if [ $stage -le 1 ]; then local/download_lm.sh "openslr.org/resources/11" data/local/lm @@ -74,6 +74,33 @@ fi if [ $stage -le 7 ]; then # python3 ./decode.py # ctc decoding - python3 ./mmi_bigram_decode.py --epoch 9 + # python3 ./mmi_bigram_decode.py --epoch 9 # python3 ./mmi_mbr_decode.py + mkdir -p exp/ + export CUDA_VISIBLE_DEVICES=3 + ln -sf /home/storage14/guoliyong/open-source/snowfall/egs/librispeech/asr/simple_v1/exp-conformer-noam-mmi-att-musan-sa-vgg ./ + ln -sf /ceph-ly/open-source/lm_resocre_snowfall/snowfall/egs/librispeech/asr/simple_v1/exp/data ./exp/ + ln -sf /ceph-ly/open-source/to_submit/espnet_snowfall/snowfall/egs/librispeech/asr/simple_v1/data ./ + output_dir=result_dir + mkdir -p $output_dir + python3 ./mmi_att_transformer_decode.py \ + --output-dir $output_dir \ + --num-paths -1 \ + --max-duration 300 \ + --attention-dim 512 \ + --use-lm-rescoring True \ + --avg 15 \ + --epoch 19 +fi + +if [ $stage -le 8 ]; then + export CUDA_VISIBLE_DEVICES=3 + python3 ./mmi_att_transformer_decode.py \ + --output-dir $output_dir \ + --num-paths 1000 \ + --max-duration 300 \ + --attention-dim 512 \ + --use-lm-rescoring True \ + --avg 15 \ + --epoch 19 fi diff --git a/snowfall/models/conformer.py b/snowfall/models/conformer.py index 4bf921b2..23c9653b 100644 --- a/snowfall/models/conformer.py +++ b/snowfall/models/conformer.py @@ -36,7 +36,8 @@ def __init__(self, num_features: int, num_classes: int, subsampling_factor: int d_model: int = 256, nhead: int = 4, dim_feedforward: int = 2048, num_encoder_layers: int = 12, num_decoder_layers: int = 6, dropout: float = 0.1, cnn_module_kernel: int = 31, - normalize_before: bool = True, vgg_frontend: bool = False) -> None: + normalize_before: bool = True, vgg_frontend: bool = False, + is_espnet_structure: bool = False) -> None: super(Conformer, self).__init__(num_features=num_features, num_classes=num_classes, subsampling_factor=subsampling_factor, d_model=d_model, nhead=nhead, dim_feedforward=dim_feedforward, num_encoder_layers=num_encoder_layers, num_decoder_layers=num_decoder_layers, @@ -44,8 +45,12 @@ def __init__(self, num_features: int, num_classes: int, subsampling_factor: int self.encoder_pos = RelPositionalEncoding(d_model, dropout) - encoder_layer = ConformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, cnn_module_kernel, normalize_before) + encoder_layer = ConformerEncoderLayer(d_model, nhead, dim_feedforward, dropout, cnn_module_kernel, normalize_before, is_espnet_structure) self.encoder = ConformerEncoder(encoder_layer, num_encoder_layers) + self.normalize_before = normalize_before + self.is_espnet_structure = is_espnet_structure + if self.normalize_before and self.is_espnet_structure: + self.after_norm = nn.LayerNorm(d_model) def encode(self, x: Tensor, supervisions: Optional[Dict] = None) -> Tuple[Tensor, Optional[Tensor]]: """ @@ -65,6 +70,8 @@ def encode(self, x: Tensor, supervisions: Optional[Dict] = None) -> Tuple[Tensor mask = encoder_padding_mask(x.size(0), supervisions) mask = mask.to(x.device) if mask != None else None x = self.encoder(x, pos_emb, src_key_padding_mask=mask) # (T, B, F) + if self.normalize_before and self.is_espnet_structure: + x = self.after_norm(x) return x, mask @@ -90,9 +97,10 @@ class ConformerEncoderLayer(nn.Module): """ def __init__(self, d_model: int, nhead: int, dim_feedforward: int = 2048, dropout: float = 0.1, - cnn_module_kernel: int = 31, normalize_before: bool = True) -> None: + cnn_module_kernel: int = 31, normalize_before: bool = True, + is_espnet_structure=False) -> None: super(ConformerEncoderLayer, self).__init__() - self.self_attn = RelPositionMultiheadAttention(d_model, nhead, dropout=0.0) + self.self_attn = RelPositionMultiheadAttention(d_model, nhead, dropout=0.0, is_espnet_structure=is_espnet_structure) self.feed_forward = nn.Sequential( nn.Linear(d_model, dim_feedforward), @@ -319,7 +327,8 @@ class RelPositionMultiheadAttention(nn.Module): >>> attn_output, attn_output_weights = multihead_attn(query, key, value, pos_emb) """ - def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.) -> None: + def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0., + is_espnet_structure: bool = False) -> None: super(RelPositionMultiheadAttention, self).__init__() self.embed_dim = embed_dim self.num_heads = num_heads @@ -338,6 +347,7 @@ def __init__(self, embed_dim: int, num_heads: int, dropout: float = 0.) -> None: self.pos_bias_v = nn.Parameter(torch.Tensor(num_heads, self.head_dim)) self._reset_parameters() + self.is_espnet_structure = is_espnet_structure def _reset_parameters(self) -> None: nn.init.xavier_uniform_(self.in_proj.weight) @@ -538,7 +548,8 @@ def multi_head_attention_forward(self, query: Tensor, _b = _b[_start:] v = nn.functional.linear(value, _w, _b) - q = q * scaling + if not self.is_espnet_structure: + q = q * scaling if attn_mask is not None: assert attn_mask.dtype == torch.float32 or attn_mask.dtype == torch.float64 or \ @@ -596,7 +607,10 @@ def multi_head_attention_forward(self, query: Tensor, matrix_bd = torch.matmul(q_with_bias_v, p.transpose(-2, -1)) # (batch, head, time1, 2*time1-1) matrix_bd = self.rel_shift(matrix_bd) - attn_output_weights = (matrix_ac + matrix_bd) # (batch, head, time1, time2) + if not self.is_espnet_structure: + attn_output_weights = (matrix_ac + matrix_bd) # (batch, head, time1, time2) + else: + attn_output_weights = (matrix_ac + matrix_bd) * scaling # (batch, head, time1, time2) attn_output_weights = attn_output_weights.view(bsz * num_heads, tgt_len, -1) From 5d5c1dd970a302b2bd2553da42bb9c2cd60d6b07 Mon Sep 17 00:00:00 2001 From: Guo Liyong Date: Sat, 29 May 2021 11:23:21 +0800 Subject: [PATCH 2/2] change epochs of average --- egs/librispeech/asr/simple_v1/run.sh | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/egs/librispeech/asr/simple_v1/run.sh b/egs/librispeech/asr/simple_v1/run.sh index 23b2c114..61046847 100755 --- a/egs/librispeech/asr/simple_v1/run.sh +++ b/egs/librispeech/asr/simple_v1/run.sh @@ -83,24 +83,36 @@ if [ $stage -le 7 ]; then ln -sf /ceph-ly/open-source/to_submit/espnet_snowfall/snowfall/egs/librispeech/asr/simple_v1/data ./ output_dir=result_dir mkdir -p $output_dir + # no rescore + python3 ./mmi_att_transformer_decode.py \ + --output-dir $output_dir \ + --num-paths -1 \ + --max-duration 300 \ + --attention-dim 512 \ + --use-lm-rescoring False \ + --avg 16 \ + --epoch 19 + + # lattice rescore python3 ./mmi_att_transformer_decode.py \ --output-dir $output_dir \ --num-paths -1 \ --max-duration 300 \ --attention-dim 512 \ --use-lm-rescoring True \ - --avg 15 \ + --avg 16 \ --epoch 19 fi if [ $stage -le 8 ]; then export CUDA_VISIBLE_DEVICES=3 + # nbest rescore python3 ./mmi_att_transformer_decode.py \ --output-dir $output_dir \ --num-paths 1000 \ --max-duration 300 \ --attention-dim 512 \ --use-lm-rescoring True \ - --avg 15 \ + --avg 16 \ --epoch 19 fi