diff --git a/README.md b/README.md
index da2fc59..c1603ce 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,7 @@ bash tools/dist_train.sh configs/soft_teacher/soft_teacher_faster_rcnn_r50_caffe
 ```
 - To train model on **new dataset**:
 
-The core idea is to convert a new dataset to coco format. Details about it can be found in the [adding new dataset](https://github.com/open-mmlab/mmdetection/blob/master/docs/tutorials/customize_dataset.md).
+The core idea is to convert a new dataset to coco format. Details about it can be found in the [adding new dataset](https://github.com/open-mmlab/mmdetection/blob/master/docs/tutorials/customize_dataset.md). To generate the json annotation file for a folder of unlabeled images, see also `tools/dataset/unlabeled_json.py`.
 
 
 
diff --git a/requirements.txt b/requirements.txt
index f3d6626..e388d91 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ torchvision
 mmcv-full
 wandb
 prettytable
+imagesize
diff --git a/ssod/datasets/samplers/semi_sampler.py b/ssod/datasets/samplers/semi_sampler.py
index 89b0b07..2ab2687 100644
--- a/ssod/datasets/samplers/semi_sampler.py
+++ b/ssod/datasets/samplers/semi_sampler.py
@@ -51,6 +51,7 @@ def __init__(
         self.size_of_dataset = []
         cumulative_sizes = [0] + self.cumulative_sizes
 
+        data_names = ['supervised', 'unsupervised']
         for i, _ in enumerate(self.group_sizes):
             size_of_dataset = 0
             cur_group_inds = np.where(self.flag == i)[0]
@@ -62,6 +63,9 @@ def __init__(
                     )
                 )[0]
                 size_per_dataset = len(cur_group_cur_dataset)
+                assert size_per_dataset is not 0, (
+                    f'{data_names[j]} dataset does not contain examples from both'
+                    ' h > w and  w > h aspect ratio groups')
                 size_of_dataset = max(
                     size_of_dataset, np.ceil(size_per_dataset / self.sample_ratio[j])
                 )
diff --git a/tools/dataset/unlabeled_json.py b/tools/dataset/unlabeled_json.py
new file mode 100644
index 0000000..9b3db19
--- /dev/null
+++ b/tools/dataset/unlabeled_json.py
@@ -0,0 +1,47 @@
+"""Generate unlabeled coco dataset json annotations from a folder of images.
+Uses imagesize for significant speedup over reading images into memory.
+
+Example:
+python tools/dataset/unlabeled_json.py --img-dir <img/path/> --json-out <json/save/path.json>
+"""
+
+import argparse
+import glob
+import imagesize
+import json
+
+
+def folder_to_json(img_dir, json_out_path):
+
+    ext = ('*.jpg', '*.jpeg', '*.png')
+    paths = [p for paths in [glob.glob(img_dir + e) for e in ext]
+        for p in paths]
+    assert len(paths) > 0
+
+    images = []
+    for i, p in enumerate(paths):
+        w, h = imagesize.get(p)
+        name = p.split('/')[-1]
+
+        per_image_dict = dict(
+            id=i,
+            file_name=name,
+            width=w,
+            height=h
+            )
+
+        images.append(per_image_dict)
+
+    data = dict(categories=[])
+    data['images'] = images
+    with open(json_out_path, 'w') as f:
+        json.dump(data, f)
+
+if __name__ == "__main__":
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--img-dir", type=str)
+    parser.add_argument("--json-out", type=str)
+    args = parser.parse_args()
+
+    folder_to_json(args.img_dir, args.json_out)