diff --git a/README.md b/README.md
index da2fc59..c1603ce 100644
--- a/README.md
+++ b/README.md
@@ -142,7 +142,7 @@ bash tools/dist_train.sh configs/soft_teacher/soft_teacher_faster_rcnn_r50_caffe
```
- To train model on **new dataset**:
-The core idea is to convert a new dataset to coco format. Details about it can be found in the [adding new dataset](https://github.com/open-mmlab/mmdetection/blob/master/docs/tutorials/customize_dataset.md).
+The core idea is to convert a new dataset to COCO format. Details about it can be found in the [adding new dataset](https://github.com/open-mmlab/mmdetection/blob/master/docs/tutorials/customize_dataset.md) tutorial. To generate a COCO-format annotation file for a folder of unlabeled images, see `tools/dataset/unlabeled_json.py`.
diff --git a/requirements.txt b/requirements.txt
index f3d6626..e388d91 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -3,3 +3,4 @@ torchvision
mmcv-full
wandb
prettytable
+imagesize
diff --git a/ssod/datasets/samplers/semi_sampler.py b/ssod/datasets/samplers/semi_sampler.py
index 89b0b07..2ab2687 100644
--- a/ssod/datasets/samplers/semi_sampler.py
+++ b/ssod/datasets/samplers/semi_sampler.py
@@ -51,6 +51,7 @@ def __init__(
self.size_of_dataset = []
cumulative_sizes = [0] + self.cumulative_sizes
+ data_names = ['supervised', 'unsupervised']
for i, _ in enumerate(self.group_sizes):
size_of_dataset = 0
cur_group_inds = np.where(self.flag == i)[0]
@@ -62,6 +63,9 @@ def __init__(
)
)[0]
size_per_dataset = len(cur_group_cur_dataset)
+ # An empty slice means dataset j has no sample in aspect-ratio group i.
+ # Compare by value: `is not 0` tests object identity, not equality.
+ assert size_per_dataset != 0, (
+ f'{data_names[j]} dataset does not contain examples from both'
+ ' h > w and w > h aspect ratio groups')
size_of_dataset = max(
size_of_dataset, np.ceil(size_per_dataset / self.sample_ratio[j])
)
diff --git a/tools/dataset/unlabeled_json.py b/tools/dataset/unlabeled_json.py
new file mode 100644
index 0000000..9b3db19
--- /dev/null
+++ b/tools/dataset/unlabeled_json.py
@@ -0,0 +1,47 @@
+"""Generate unlabeled coco dataset json annotations from a folder of images.
+Uses imagesize for significant speedup over reading images into memory.
+
+Example:
+    python tools/dataset/unlabeled_json.py --img-dir path/to/images --json-out path/to/unlabeled.json
+"""
+
+import argparse
+import glob
+import json
+import os
+
+import imagesize
+
+
+def folder_to_json(img_dir: str, json_out_path: str) -> None:
+    """Write a COCO-style JSON file listing the images found in a folder.
+
+    Only files directly inside ``img_dir`` whose names match ``*.jpg``,
+    ``*.jpeg`` or ``*.png`` are collected (the search is not recursive).
+    The output contains an empty ``categories`` list and an ``images``
+    list; no ``annotations`` key is written.
+
+    Args:
+        img_dir: Folder to scan. A trailing path separator is optional.
+        json_out_path: Path of the JSON file to create or overwrite.
+
+    Raises:
+        AssertionError: If no matching image is found in ``img_dir``.
+    """
+    patterns = ('*.jpg', '*.jpeg', '*.png')
+    # Escape the folder so glob metacharacters in its name are literal, and
+    # join with os.path.join so a missing trailing separator does not turn
+    # the pattern into e.g. "folder*.jpg".
+    base = glob.escape(img_dir)
+    # glob returns matches in arbitrary order; sort so that image ids are
+    # reproducible across runs and machines.
+    paths = sorted(
+        p
+        for pattern in patterns
+        for p in glob.glob(os.path.join(base, pattern))
+    )
+    # Raised explicitly (rather than via ``assert``) so the check still runs
+    # under ``python -O``; the exception type is unchanged.
+    if not paths:
+        raise AssertionError(
+            f'no {", ".join(patterns)} files found in {img_dir!r}')
+
+    images = []
+    for i, p in enumerate(paths):
+        # imagesize.get yields (width, height) for the file at ``p``.
+        w, h = imagesize.get(p)
+        images.append(
+            dict(
+                id=i,
+                # Store only the base name, independent of the OS separator.
+                file_name=os.path.basename(p),
+                width=w,
+                height=h,
+            )
+        )
+
+    data = dict(categories=[], images=images)
+    with open(json_out_path, 'w') as f:
+        json.dump(data, f)
+
+if __name__ == "__main__":
+    # Command-line entry point. Both options are mandatory: without them
+    # argparse would pass None on to folder_to_json, which then fails with
+    # an unrelated TypeError instead of a usage message.
+    parser = argparse.ArgumentParser(
+        description="Generate a COCO-style JSON file for a folder of "
+                    "unlabeled images.")
+    parser.add_argument(
+        "--img-dir", type=str, required=True,
+        help="folder containing the .jpg/.jpeg/.png images")
+    parser.add_argument(
+        "--json-out", type=str, required=True,
+        help="path of the JSON file to write")
+    args = parser.parse_args()
+
+    folder_to_json(args.img_dir, args.json_out)