MyrtleSoftware · giuseppeCoccia · Nov 22, 2019 · Nov 22, 2019 · Nov 27, 2019 · Nov 27, 2019
diff --git a/docs/source/myrtlespeech/data/index.rst b/docs/source/myrtlespeech/data/index.rst
@@ -9,3 +9,4 @@
     alphabet
     dataset/index
     preprocess
+    sampler
diff --git a/docs/source/myrtlespeech/data/sampler.rst b/docs/source/myrtlespeech/data/sampler.rst
@@ -0,0 +1,16 @@
+=========
+ sampler
+=========
+
+.. automodule:: myrtlespeech.data.sampler
+     :members:
+     :show-inheritance:
+
+.. autoclass:: myrtlespeech.data.sampler.SequentialRandomSampler
+    :members:
+    :show-inheritance:
+
+
+.. autoclass:: myrtlespeech.data.sampler.SortaGrad
+    :members:
+    :show-inheritance:
diff --git a/src/myrtlespeech/builders/task_config.py b/src/myrtlespeech/builders/task_config.py
@@ -1,11 +1,13 @@
 import multiprocessing
 from typing import Tuple
+from typing import Union
 
 import torch
 from myrtlespeech.builders.dataset import build as build_dataset
 from myrtlespeech.builders.speech_to_text import build as build_stt
 from myrtlespeech.data.batch import seq_to_seq_collate_fn
-from myrtlespeech.data.sampler import RandomBatchSampler
+from myrtlespeech.data.sampler import SequentialRandomSampler
+from myrtlespeech.data.sampler import SortaGrad
 from myrtlespeech.model.seq_to_seq import SeqToSeq
 from myrtlespeech.protos import task_config_pb2
 
@@ -114,15 +116,35 @@ def target_transform(target):
         add_seq_len_to_transforms=True,
     )
 
-    shuffle = task_config.train_config.shuffle_batches_before_every_epoch
-    train_loader = torch.utils.data.DataLoader(
-        dataset=train_dataset,
-        batch_sampler=RandomBatchSampler(
+    shuffle_str = task_config.train_config.WhichOneof("shuffle_strategy")
+    batch_sampler: Union[SortaGrad, SequentialRandomSampler]
+    if shuffle_str == "sorta_grad":
+        batch_sampler = SortaGrad(
+            indices=range(len(train_dataset)),
+            batch_size=task_config.train_config.batch_size,
+            shuffle=True,
+            drop_last=False,
+        )
+    elif shuffle_str == "random_batches":
+        batch_sampler = SequentialRandomSampler(
             indices=range(len(train_dataset)),
             batch_size=task_config.train_config.batch_size,
-            shuffle=shuffle,
+            shuffle=True,
             drop_last=False,
-        ),
+        )
+    elif shuffle_str == "sequential_batches":
+        batch_sampler = SequentialRandomSampler(
+            indices=range(len(train_dataset)),
+            batch_size=task_config.train_config.batch_size,
+            shuffle=False,
+            drop_last=False,
+        )
+    else:
+        raise ValueError(f"unsupported shuffle strategy {shuffle_str}")
+
+    train_loader = torch.utils.data.DataLoader(
+        dataset=train_dataset,
+        batch_sampler=batch_sampler,
         num_workers=num_workers,
         collate_fn=seq_to_seq_collate_fn,
         pin_memory=torch.cuda.is_available(),

diff --git a/src/myrtlespeech/configs/deep_speech_1_en.config b/src/myrtlespeech/configs/deep_speech_1_en.config
@@ -60,7 +60,8 @@ train_config {
       }
     }
   }
-  shuffle_batches_before_every_epoch: true;
+  random_batches {
+  }
 }
 
 eval_config {

diff --git a/src/myrtlespeech/configs/deep_speech_2_en.config b/src/myrtlespeech/configs/deep_speech_2_en.config
@@ -111,7 +111,8 @@ train_config {
       }
     }
   }
-  shuffle_batches_before_every_epoch: true;
+  sorta_grad {
+  }
 }
 
 eval_config {

diff --git a/src/myrtlespeech/data/sampler.py b/src/myrtlespeech/data/sampler.py
@@ -1,14 +1,48 @@
 import random
+from typing import Iterable
+from typing import Optional
 
 
-class RandomBatchSampler:
-    """TODO"""
+class SequentialRandomSampler:
+    """A sequential or random iterable over batches of indices.
 
-    def __init__(self, indices, batch_size, shuffle, drop_last=False):
+    The iterator used each time this iterable is iterated over will yield
+    batches of indices either sequentially (i.e. in-order) or randomly (uniform
+    without replacement).
+    This iterable records the number of times it has returned an iterator. A
+    sequential iterator is returned if the current count is in `sequential`.
+
+    Args:
+        indices: Data with which batches are created.
+        batch_size: Batch dimension.
+        shuffle: Set to True to reshuffle the order of the batches on every
+            random pass; if False, every pass yields batches in-order.
+        drop_last: Set to True to drop the last incomplete batch, if the
+            dataset size is not divisible by the batch size. If False and the
+            size of dataset is not divisible by the batch size, then the last
+            batch will be smaller.
+        n_iterators: Number of iterators returned so far.
+        sequential: Counts at which to return a sequential iterator.
+
+    Yields:
+        Batches from `indices`.
+    """
+
+    def __init__(
+        self,
+        indices: Iterable[int],
+        batch_size: int,
+        shuffle: bool,
+        drop_last: bool = False,
+        n_iterators: int = 0,
+        sequential: Optional[set] = None,
+    ):
         self.shuffle = shuffle
         self.batch_indices = self._batch_indices(
             indices, batch_size, drop_last
         )
+        self._n_iterators = n_iterators
+        self._sequential = sequential or set()
 
     def _batch_indices(self, indices, batch_size, drop_last):
         batches = []
@@ -23,10 +57,53 @@ def _batch_indices(self, indices, batch_size, drop_last):
         return batches
 
     def __iter__(self):
-        if self.shuffle:
-            random.shuffle(self.batch_indices)
-        for b in self.batch_indices:
-            yield b
+        indices = list(range(len(self.batch_indices)))
+        if self.shuffle and self._n_iterators not in self._sequential:
+            random.shuffle(indices)
+        self._n_iterators += 1
+        for index in indices:
+            yield self.batch_indices[index]
 
     def __len__(self):
         return len(self.batch_indices)
+
+
+class SortaGrad(SequentialRandomSampler):
+    """An iterable over batch indices according to the SortaGrad strategy.
+
+    The SortaGrad curriculum learning strategy iterates over batches from the
+    batched dataset sequentially for the first pass and then randomly for all
+    other passes. See `Deep Speech 2 <https://arxiv.org/abs/1512.02595>`_ paper
+    for more information.
+
+    Args:
+        indices: Data with which batches are created.
+        batch_size: Batch dimension.
+        shuffle: Set to True to reshuffle the order of the batches on every
+            random pass; if False, every pass yields batches in-order.
+        drop_last: Set to True to drop the last incomplete batch, if the
+            dataset size is not divisible by the batch size. If False and the
+            size of dataset is not divisible by the batch size, then the last
+            batch will be smaller.
+        start_epoch: Passes already started; only pass 0 is sequential.
+
+    Yields:
+        Batches from `indices`.
+    """
+
+    def __init__(
+        self,
+        indices: Iterable[int],
+        batch_size: int,
+        shuffle: bool,
+        drop_last: bool = False,
+        start_epoch: int = 0,
+    ):
+        super().__init__(
+            indices,
+            batch_size,
+            shuffle,
+            drop_last,
+            n_iterators=start_epoch,
+            sequential={0},
+        )
diff --git a/src/myrtlespeech/protos/shuffle_strategy.proto b/src/myrtlespeech/protos/shuffle_strategy.proto
@@ -0,0 +1,15 @@
+syntax = "proto3";
+
+package myrtlespeech.protos;
+
+
+message SequentialBatches {
+}
+
+
+message RandomBatches {
+}
+
+
+message SortaGrad {
+}
diff --git a/src/myrtlespeech/protos/train_config.proto b/src/myrtlespeech/protos/train_config.proto
@@ -5,6 +5,7 @@ package myrtlespeech.protos;
 import "myrtlespeech/protos/dataset.proto";
 import "myrtlespeech/protos/lr_scheduler.proto";
 import "myrtlespeech/protos/optimizer.proto";
+import "myrtlespeech/protos/shuffle_strategy.proto";
 
 
 // Configuration for training.
@@ -29,8 +30,12 @@ message TrainConfig {
 
   Dataset dataset = 9;
 
-  oneof supported_shuffles {
+  oneof shuffle_strategy {
+    // Maintain a sequential batch order.
+    SequentialBatches sequential_batches = 10;
     // Shuffle batches before every epoch.
-    bool shuffle_batches_before_every_epoch = 10;
+    RandomBatches random_batches = 11;
+    // Sequential for the first epoch and random for the following ones.
+    SortaGrad sorta_grad = 12;
   }
 }
diff --git a/tests/data/test_batch.py b/tests/data/test_batch.py
@@ -102,7 +102,7 @@ def test_pad_sequences_returns_tensor_with_correct_values(
 
 
 def test_seq_to_seq_collate_fn() -> None:
-    """Unit test to ensure seq_to_seq_collate_fn returns correct values."""
+    """Ensure seq_to_seq_collate_fn returns correct values"""
     inputs = [rand([1, 2, 3]), rand([1, 2, 5])]
     seq_lens = tensor([3, 5])
-Original file line number
+Diff line change
@@ Expand Up / @@ -60,7 +60,8 @@ train_config { @@
           }
         }
       }
-      shuffle_batches_before_every_epoch: true;
+      random_batches {
+      }
     }
     eval_config {
@@ Expand Down @@