diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..7756454
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,5 @@
+.idea/
+build/
+data/
+__pycache__
+output
diff --git a/README.md b/README.md
index 55d7d1f..35e84a4 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 # High-level Semantic Feature Detection: A New Perspective for Pedestrian Detection
 Keras implementation of [CSP] accepted by CVPR 2019.
 ## Introduction
-This paper provides a new perspective for detecting pedestrians where detection is formulated as Center and Scale Prediction (CSP), the pipeline is illustrated in the following. For more details, please refer to our [paper](./docs/2019CVPR-CSP.pdf).
+This paper provides a new perspective for detecting pedestrians where detection is formulated as Center and Scale Prediction (CSP), the pipeline is illustrated in the following. For more details, please refer to our [paper](http://openaccess.thecvf.com/content_CVPR_2019/papers/Liu_High-Level_Semantic_Feature_Detection_A_New_Perspective_for_Pedestrian_Detection_CVPR_2019_paper.pdf).
 ![img01](./docs/pipeline.png)
 
 Besides the superority on pedestrian detection demonstrated in the paper, we take a step further towards the generablity of CSP and validate it on face detection. Experimental reults on WiderFace benchmark also show the competitiveness of CSP.
@@ -10,10 +10,10 @@ Besides the superority on pedestrian detection demonstrated in the paper, we tak
 
 ### Dependencies
 
-* Python 2.7
-* Tensorflow 1.4.1
-* Keras 2.0.6
-* OpenCV 3.4.1.15
+* Python >= 3.6
+* Tensorflow >= 1.4.1
+* Keras >= 2.0.6
+* OpenCV >= 3.4.1.15 (note that versions other than 3.4.1.15 will result in different performance on Caltech)
 
 ## Contents
 1. [Installation](#installation)
@@ -33,6 +33,17 @@ Besides the superority on pedestrian detection demonstrated in the paper, we tak
   pip install -r requirements.txt
 ```
 
+3. Build dependencies
+```
+  python setup.py build_ext --inplace
+```
+
+4. Download pretrained resnet50 weights (basenet only):
+```
+./download_weights.sh
+```
+
+
 ### Preparation
 1. Download the dataset.
 
diff --git a/download_weights.sh b/download_weights.sh
new file mode 100755
index 0000000..8f263fa
--- /dev/null
+++ b/download_weights.sh
@@ -0,0 +1,6 @@
+#!/usr/bin/env bash
+# Download the pretrained ResNet-50 weights into data/models/.
+set -e
+mkdir -p data/models
+# Certificate checking is left enabled so the weights file cannot be swapped in transit.
+wget -O ./data/models/resnet50_weights_tf_dim_ordering_tf_kernels.h5 https://github.com/fchollet/deep-learning-models/releases/download/v0.2/resnet50_weights_tf_dim_ordering_tf_kernels.h5
diff --git a/eval_city/cocoapi/PythonAPI/pycocotools/coco.py b/eval_city/cocoapi/PythonAPI/pycocotools/coco.py
index dc9972b..7b8b37e 100755
--- a/eval_city/cocoapi/PythonAPI/pycocotools/coco.py
+++ b/eval_city/cocoapi/PythonAPI/pycocotools/coco.py
@@ -58,7 +58,7 @@
 import sys
 PYTHON_VERSION = sys.version_info[0]
 if PYTHON_VERSION == 2:
-    from urllib import urlretrieve
+    from urllib import urlretrieve  # Python 2 keeps urlretrieve in urllib; urllib.request is Python 3 only
 elif PYTHON_VERSION == 3:
     from urllib.request import urlretrieve
 
@@ -83,7 +83,7 @@ def __init__(self, annotation_file=None):
             tic = time.time()
             dataset = json.load(open(annotation_file, 'r'))
             assert type(dataset)==dict, 'annotation file format {} not supported'.format(type(dataset))
-            print('Done (t={:0.2f}s)'.format(time.time()- tic))
+            print(('Done (t={:0.2f}s)'.format(time.time()- tic)))
             self.dataset = dataset
             self.createIndex()
 
@@ -123,8 +123,8 @@ def info(self):
         Print information about the annotation file.
         :return:
         """
-        for key, value in self.dataset['info'].items():
-            print('{}: {}'.format(key, value))
+        for key, value in list(self.dataset['info'].items()):
+            print(('{}: {}'.format(key, value)))
 
     def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
         """
@@ -187,7 +187,7 @@ def getImgIds(self, imgIds=[], catIds=[]):
         catIds = catIds if _isArrayLike(catIds) else [catIds]
 
         if len(imgIds) == len(catIds) == 0:
-            ids = self.imgs.keys()
+            ids = list(self.imgs.keys())
         else:
             ids = set(imgIds)
             for i, catId in enumerate(catIds):
@@ -292,7 +292,7 @@ def showAnns(self, anns):
             ax.add_collection(p)
         elif datasetType == 'captions':
             for ann in anns:
-                print(ann['caption'])
+                print((ann['caption']))
 
     def loadRes(self, resFile):
         """
@@ -305,7 +305,7 @@ def loadRes(self, resFile):
 
         print('Loading and preparing results...')
         tic = time.time()
-        if type(resFile) == str or type(resFile) == unicode:
+        if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):  # unicode exists only on Python 2
             anns = json.load(open(resFile))
         elif type(resFile) == np.ndarray:
             anns = self.loadNumpyAnnotations(resFile)
@@ -349,7 +349,7 @@ def loadRes(self, resFile):
                 ann['area'] = (x1-x0)*(y1-y0)
                 ann['id'] = id + 1
                 ann['bbox'] = [x0,y0,x1-x0,y1-y0]
-        print('DONE (t={:0.2f}s)'.format(time.time()- tic))
+        print(('DONE (t={:0.2f}s)'.format(time.time()- tic)))
 
         res.dataset['annotations'] = anns
         res.createIndex()
@@ -366,7 +366,7 @@ def download(self, tarDir = None, imgIds = [] ):
             print('Please specify target directory')
             return -1
         if len(imgIds) == 0:
-            imgs = self.imgs.values()
+            imgs = list(self.imgs.values())
         else:
             imgs = self.loadImgs(imgIds)
         N = len(imgs)
@@ -377,7 +377,7 @@ def download(self, tarDir = None, imgIds = [] ):
             fname = os.path.join(tarDir, img['file_name'])
             if not os.path.exists(fname):
                 urlretrieve(img['coco_url'], fname)
-            print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
+            print(('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)))
 
     def loadNumpyAnnotations(self, data):
         """
@@ -387,13 +387,13 @@ def loadNumpyAnnotations(self, data):
         """
         print('Converting ndarray to lists...')
         assert(type(data) == np.ndarray)
-        print(data.shape)
+        print((data.shape))
         assert(data.shape[1] == 7)
         N = data.shape[0]
         ann = []
         for i in range(N):
             if i % 1000000 == 0:
-                print('{}/{}'.format(i,N))
+                print(('{}/{}'.format(i,N)))
             ann += [{
                 'image_id'  : int(data[i, 0]),
                 'bbox'  : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
diff --git a/eval_city/cocoapi/PythonAPI/pycocotools/cocoeval.py b/eval_city/cocoapi/PythonAPI/pycocotools/cocoeval.py
index 7a4b4ad..84ab931 100755
--- a/eval_city/cocoapi/PythonAPI/pycocotools/cocoeval.py
+++ b/eval_city/cocoapi/PythonAPI/pycocotools/cocoeval.py
@@ -130,8 +130,8 @@ def evaluate(self):
         # add backward compatibility if useSegm is specified in params
         if not p.useSegm is None:
             p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
-            print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
-        print('Evaluate annotation type *{}*'.format(p.iouType))
+            print(('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)))
+        print(('Evaluate annotation type *{}*'.format(p.iouType)))
         p.imgIds = list(np.unique(p.imgIds))
         if p.useCats:
             p.catIds = list(np.unique(p.catIds))
@@ -159,7 +159,7 @@ def evaluate(self):
              ]
         self._paramsEval = copy.deepcopy(self.params)
         toc = time.time()
-        print('DONE (t={:0.2f}s).'.format(toc-tic))
+        print(('DONE (t={:0.2f}s).'.format(toc-tic)))
 
     def computeIoU(self, imgId, catId):
         p = self.params
@@ -346,7 +346,7 @@ def accumulate(self, p = None):
         # get inds to evaluate
         k_list = [n for n, k in enumerate(p.catIds)  if k in setK]
         m_list = [m for n, m in enumerate(p.maxDets) if m in setM]
-        a_list = [n for n, a in enumerate(map(lambda x: tuple(x), p.areaRng)) if a in setA]
+        a_list = [n for n, a in enumerate([tuple(x) for x in p.areaRng]) if a in setA]
         i_list = [n for n, i in enumerate(p.imgIds)  if i in setI]
         I0 = len(_pe.imgIds)
         A0 = len(_pe.areaRng)
@@ -418,7 +418,7 @@ def accumulate(self, p = None):
             'scores': scores,
         }
         toc = time.time()
-        print('DONE (t={:0.2f}s).'.format( toc-tic))
+        print(('DONE (t={:0.2f}s).'.format( toc-tic)))
 
     def summarize(self):
         '''
@@ -454,7 +454,7 @@ def _summarize( ap=1, iouThr=None, areaRng='all', maxDets=100 ):
                 mean_s = -1
             else:
                 mean_s = np.mean(s[s>-1])
-            print(iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s))
+            print((iStr.format(titleStr, typeStr, iouStr, areaRng, maxDets, mean_s)))
             return mean_s
         def _summarizeDets():
             stats = np.zeros((12,))
diff --git a/eval_city/eval_script/coco.py b/eval_city/eval_script/coco.py
index f870138..576371a 100755
--- a/eval_city/eval_script/coco.py
+++ b/eval_city/eval_script/coco.py
@@ -58,7 +58,7 @@
 import sys
 PYTHON_VERSION = sys.version_info[0]
 if PYTHON_VERSION == 2:
-    from urllib import urlretrieve
+    from urllib import urlretrieve  # Python 2 keeps urlretrieve in urllib; urllib.request is Python 3 only
 elif PYTHON_VERSION == 3:
     from urllib.request import urlretrieve
 
@@ -118,8 +118,8 @@ def info(self):
         Print information about the annotation file.
         :return:
         """
-        for key, value in self.dataset['info'].items():
-            print('{}: {}'.format(key, value))
+        for key, value in list(self.dataset['info'].items()):
+            print(('{}: {}'.format(key, value)))
 
     def getAnnIds(self, imgIds=[], catIds=[], areaRng=[], iscrowd=None):
         """
@@ -182,7 +182,7 @@ def getImgIds(self, imgIds=[], catIds=[]):
         catIds = catIds if type(catIds) == list else [catIds]
 
         if len(imgIds) == len(catIds) == 0:
-            ids = self.imgs.keys()
+            ids = list(self.imgs.keys())
         else:
             ids = set(imgIds)
             for i, catId in enumerate(catIds):
@@ -287,7 +287,7 @@ def showAnns(self, anns):
             ax.add_collection(p)
         elif datasetType == 'captions':
             for ann in anns:
-                print(ann['caption'])
+                print((ann['caption']))
 
     def loadRes(self, resFile):
         """
@@ -300,7 +300,7 @@ def loadRes(self, resFile):
 
         # print('Loading and preparing results...')
         tic = time.time()
-        if type(resFile) == str or type(resFile) == unicode:
+        if type(resFile) == str or (PYTHON_VERSION == 2 and type(resFile) == unicode):  # unicode exists only on Python 2
             anns = json.load(open(resFile))
         elif type(resFile) == np.ndarray:
             anns = self.loadNumpyAnnotations(resFile)
@@ -364,7 +364,7 @@ def download(self, tarDir = None, imgIds = [] ):
             print('Please specify target directory')
             return -1
         if len(imgIds) == 0:
-            imgs = self.imgs.values()
+            imgs = list(self.imgs.values())
         else:
             imgs = self.loadImgs(imgIds)
         N = len(imgs)
@@ -375,7 +375,7 @@ def download(self, tarDir = None, imgIds = [] ):
             fname = os.path.join(tarDir, img['file_name'])
             if not os.path.exists(fname):
                 urlretrieve(img['coco_url'], fname)
-            print('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic))
+            print(('downloaded {}/{} images (t={:0.1f}s)'.format(i, N, time.time()- tic)))
 
     def loadNumpyAnnotations(self, data):
         """
@@ -385,13 +385,13 @@ def loadNumpyAnnotations(self, data):
         """
         print('Converting ndarray to lists...')
         assert(type(data) == np.ndarray)
-        print(data.shape)
+        print((data.shape))
         assert(data.shape[1] == 7)
         N = data.shape[0]
         ann = []
         for i in range(N):
             if i % 1000000 == 0:
-                print('{}/{}'.format(i,N))
+                print(('{}/{}'.format(i,N)))
             ann += [{
                 'image_id'  : int(data[i, 0]),
                 'bbox'  : [ data[i, 1], data[i, 2], data[i, 3], data[i, 4] ],
diff --git a/eval_city/eval_script/coco.pyc b/eval_city/eval_script/coco.pyc
deleted file mode 100755
index 1f3d2b2..0000000
Binary files a/eval_city/eval_script/coco.pyc and /dev/null differ
diff --git a/eval_city/eval_script/eval_MR_multisetup.py b/eval_city/eval_script/eval_MR_multisetup.py
index 1173b71..e16fc3d 100755
--- a/eval_city/eval_script/eval_MR_multisetup.py
+++ b/eval_city/eval_script/eval_MR_multisetup.py
@@ -122,7 +122,7 @@ def evaluate(self, id_setup):
         # add backward compatibility if useSegm is specified in params
         if not p.useSegm is None:
             p.iouType = 'segm' if p.useSegm == 1 else 'bbox'
-            print('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType))
+            print(('useSegm (deprecated) is not None. Running {} evaluation'.format(p.iouType)))
         # print('Evaluate annotation type *{}*'.format(p.iouType))
         p.imgIds = list(np.unique(p.imgIds))
         if p.useCats:
@@ -446,7 +446,7 @@ def _summarize(iouThr=None, maxDets=100 ):
                 mean_s = np.log(mrs[mrs<2])
                 mean_s = np.mean(mean_s)
                 mean_s = np.exp(mean_s)
-            print(iStr.format(titleStr, typeStr,setupStr, iouStr, heightStr, occlStr, mean_s*100))
+            print((iStr.format(titleStr, typeStr,setupStr, iouStr, heightStr, occlStr, mean_s*100)))
             # res_file.write(iStr.format(titleStr, typeStr,setupStr, iouStr, heightStr, occlStr, mean_s*100))
             res_file.write(str(mean_s * 100))
             res_file.write('\n')
diff --git a/eval_city/eval_script/eval_MR_multisetup.pyc b/eval_city/eval_script/eval_MR_multisetup.pyc
deleted file mode 100644
index a0c9e6b..0000000
Binary files a/eval_city/eval_script/eval_MR_multisetup.pyc and /dev/null differ
diff --git a/eval_city/eval_script/eval_demo.py b/eval_city/eval_script/eval_demo.py
index b32ae63..7d7c6d5 100755
--- a/eval_city/eval_script/eval_demo.py
+++ b/eval_city/eval_script/eval_demo.py
@@ -8,7 +8,7 @@
 annFile = '../val_gt.json'
 main_path = '../../output/valresults/city/h/off'
 for f in sorted(os.listdir(main_path)):
-    print f
+    print(f)
     # initialize COCO detections api
     dt_path = os.path.join(main_path, f)
     resFile = os.path.join(dt_path,'val_dt.json')
diff --git a/generate_cache_caltech.py b/generate_cache_caltech.py
index cad6ec8..c1923d2 100755
--- a/generate_cache_caltech.py
+++ b/generate_cache_caltech.py
@@ -1,5 +1,5 @@
 import os
-import cPickle
+import pickle
 import numpy as np
 import matplotlib.pyplot as plt
 
@@ -18,46 +18,47 @@
 box_count = 0
 files = sorted(os.listdir(all_anno_path))
 for l in range(len(files)):
-	gtname = files[l]
-	imgname = files[l].split('.')[0]+'.jpg'
-	img_path = os.path.join(all_img_path, imgname)
-	gt_path = os.path.join(all_anno_path, gtname)
+    gtname = files[l]
+    imgname = files[l].split('.')[0] + '.jpg'
+    img_path = os.path.join(all_img_path, imgname)
+    gt_path = os.path.join(all_anno_path, gtname)
 
-	boxes = []
-	ig_boxes = []
-	with open(gt_path, 'rb') as fid:
-		lines = fid.readlines()
-	if len(lines)>1:
-		for i in range(1, len(lines)):
-			info = lines[i].strip().split(' ')
-			label = info[0]
-			occ, ignore = info[5], info[10]
-			x1, y1 = max(int(float(info[1])), 0), max(int(float(info[2])), 0)
-			w, h = min(int(float(info[3])), cols - x1 - 1), min(int(float(info[4])), rows - y1 - 1)
-			box = np.array([int(x1), int(y1), int(x1) + int(w), int(y1) + int(h)])
-			if int(ignore) == 0:
-				boxes.append(box)
-			else:
-				ig_boxes.append(box)
-	boxes = np.array(boxes)
-	ig_boxes = np.array(ig_boxes)
+    boxes = []
+    ig_boxes = []
+    with open(gt_path, 'r') as fid:  # text mode: the lines are split on a str separator below
+        lines = fid.readlines()
+    if len(lines) > 1:
+        for i in range(1, len(lines)):
+            info = lines[i].strip().split(' ')
+            label = info[0]
+            occ, ignore = info[5], info[10]
+            x1, y1 = max(int(float(info[1])), 0), max(int(float(info[2])), 0)
+            w, h = min(int(float(info[3])), cols - x1 - 1), min(int(float(info[4])), rows - y1 - 1)
+            box = np.array([int(x1), int(y1), int(x1) + int(w), int(y1) + int(h)])
+            if int(ignore) == 0:
+                boxes.append(box)
+            else:
+                ig_boxes.append(box)
+    boxes = np.array(boxes)
+    ig_boxes = np.array(ig_boxes)
 
-	annotation = {}
-	annotation['filepath'] = img_path
-	box_count += len(boxes)
-	iggt_count += len(ig_boxes)
-	annotation['bboxes'] = boxes
-	annotation['ignoreareas'] = ig_boxes
-	if len(boxes) == 0:
-		image_data_nogt.append(annotation)
-	else:
-		image_data_gt.append(annotation)
-		valid_count += 1
-print '{} images and {} valid images, {} valid gt and {} ignored gt'.format(len(files), valid_count, box_count, iggt_count)
+    annotation = {}
+    annotation['filepath'] = img_path
+    box_count += len(boxes)
+    iggt_count += len(ig_boxes)
+    annotation['bboxes'] = boxes
+    annotation['ignoreareas'] = ig_boxes
+    if len(boxes) == 0:
+        image_data_nogt.append(annotation)
+    else:
+        image_data_gt.append(annotation)
+        valid_count += 1
+print('{} images and {} valid images, {} valid gt and {} ignored gt'.format(len(files), valid_count, box_count,
+                                                                            iggt_count))
 
 if not os.path.exists(res_path_gt):
-	with open(res_path_gt, 'wb') as fid:
-		cPickle.dump(image_data_gt, fid, cPickle.HIGHEST_PROTOCOL)
+    with open(res_path_gt, 'wb') as fid:
+        pickle.dump(image_data_gt, fid, pickle.HIGHEST_PROTOCOL)
 if not os.path.exists(res_path_nogt):
-	with open(res_path_nogt, 'wb') as fid:
-		cPickle.dump(image_data_nogt, fid, cPickle.HIGHEST_PROTOCOL)
\ No newline at end of file
+    with open(res_path_nogt, 'wb') as fid:
+        pickle.dump(image_data_nogt, fid, pickle.HIGHEST_PROTOCOL)
diff --git a/generate_cache_city.py b/generate_cache_city.py
index 8172b6e..00c230e 100755
--- a/generate_cache_city.py
+++ b/generate_cache_city.py
@@ -1,7 +1,6 @@
-from __future__ import division
 import os
 import cv2
-import cPickle
+import pickle
 import numpy as np
 from scipy import io as scio
 import time
@@ -15,54 +14,56 @@
 rows, cols = 1024, 2048
 
 for type in types:
-	anno_path = os.path.join(all_anno_path, 'anno_'+type+'.mat')
-	res_path = os.path.join('data/cache/cityperson', type)
-	image_data = []
-	annos = scio.loadmat(anno_path)
-	index = 'anno_'+type+'_aligned'
-	valid_count = 0
-	iggt_count = 0
-	box_count = 0
-	for l in range(len(annos[index][0])):
-		anno = annos[index][0][l]
-		cityname = anno[0][0][0][0].encode()
-		imgname = anno[0][0][1][0].encode()
-		gts = anno[0][0][2]
-		img_path = os.path.join(all_img_path, type + '/'+ cityname+'/'+imgname)
-		boxes = []
-		ig_boxes = []
-		vis_boxes = []
-		for i in range(len(gts)):
-			label, x1, y1, w, h = gts[i, :5]
-			x1, y1 = max(int(x1), 0), max(int(y1), 0)
-			w, h = min(int(w), cols - x1 -1), min(int(h), rows - y1 -1)
-			xv1, yv1, wv, hv = gts[i, 6:]
-			xv1, yv1 = max(int(xv1), 0), max(int(yv1), 0)
-			wv, hv = min(int(wv), cols - xv1 - 1), min(int(hv), rows - yv1 - 1)
+    anno_path = os.path.join(all_anno_path, 'anno_' + type + '.mat')
+    res_path = os.path.join('data/cache/cityperson', type)
+    image_data = []
+    annos = scio.loadmat(anno_path)
+    index = 'anno_' + type + '_aligned'
+    valid_count = 0
+    iggt_count = 0
+    box_count = 0
+    for l in range(len(annos[index][0])):
+        anno = annos[index][0][l]
+        cityname = str(anno[0][0][0][0])  # keep str: .encode() yields bytes on Python 3 and breaks the path join below
+        imgname = str(anno[0][0][1][0])
+        gts = anno[0][0][2]
+        img_path = os.path.join(all_img_path, type + '/' + cityname + '/' + imgname)
+        boxes = []
+        ig_boxes = []
+        vis_boxes = []
+        for i in range(len(gts)):
+            label, x1, y1, w, h = gts[i, :5]
+            x1, y1 = max(int(x1), 0), max(int(y1), 0)
+            w, h = min(int(w), cols - x1 - 1), min(int(h), rows - y1 - 1)
+            xv1, yv1, wv, hv = gts[i, 6:]
+            xv1, yv1 = max(int(xv1), 0), max(int(yv1), 0)
+            wv, hv = min(int(wv), cols - xv1 - 1), min(int(hv), rows - yv1 - 1)
 
-			if label == 1 and h>=50:
-				box = np.array([int(x1), int(y1), int(x1)+int(w), int(y1)+int(h)])
-				boxes.append(box)
-				vis_box = np.array([int(xv1), int(yv1), int(xv1)+int(wv), int(yv1)+int(hv)])
-				vis_boxes.append(vis_box)
-			else:
-				ig_box = np.array([int(x1), int(y1), int(x1)+int(w), int(y1)+int(h)])
-				ig_boxes.append(ig_box)
-		boxes = np.array(boxes)
-		vis_boxes = np.array(vis_boxes)
-		ig_boxes = np.array(ig_boxes)
+            if label == 1 and h >= 50:
+                box = np.array([int(x1), int(y1), int(x1) + int(w), int(y1) + int(h)])
+                boxes.append(box)
+                vis_box = np.array([int(xv1), int(yv1), int(xv1) + int(wv), int(yv1) + int(hv)])
+                vis_boxes.append(vis_box)
+            else:
+                ig_box = np.array([int(x1), int(y1), int(x1) + int(w), int(y1) + int(h)])
+                ig_boxes.append(ig_box)
+        boxes = np.array(boxes)
+        vis_boxes = np.array(vis_boxes)
+        ig_boxes = np.array(ig_boxes)
 
-		if len(boxes)>0:
-			valid_count += 1
-		annotation = {}
-		annotation['filepath'] = img_path
-		box_count += len(boxes)
-		iggt_count += len(ig_boxes)
-		annotation['bboxes'] = boxes
-		annotation['vis_bboxes'] = vis_boxes
-		annotation['ignoreareas'] = ig_boxes
-		image_data.append(annotation)
-	if not os.path.exists(res_path):
-		with open(res_path, 'wb') as fid:
-			cPickle.dump(image_data, fid, cPickle.HIGHEST_PROTOCOL)
-	print '{} has {} images and {} valid images, {} valid gt and {} ignored gt'.format(type, len(annos[index][0]), valid_count, box_count, iggt_count)
+        if len(boxes) > 0:
+            valid_count += 1
+        annotation = {}
+        annotation['filepath'] = img_path
+        box_count += len(boxes)
+        iggt_count += len(ig_boxes)
+        annotation['bboxes'] = boxes
+        annotation['vis_bboxes'] = vis_boxes
+        annotation['ignoreareas'] = ig_boxes
+        image_data.append(annotation)
+    if not os.path.exists(res_path):
+        with open(res_path, 'wb') as fid:
+            pickle.dump(image_data, fid, pickle.HIGHEST_PROTOCOL)
+    print('{} has {} images and {} valid images, {} valid gt and {} ignored gt'.format(type, len(annos[index][0]),
+                                                                                       valid_count, box_count,
+                                                                                       iggt_count))
diff --git a/generate_cache_wider.py b/generate_cache_wider.py
index cc37f8b..15145c0 100755
--- a/generate_cache_wider.py
+++ b/generate_cache_wider.py
@@ -1,12 +1,12 @@
 import os
 import cv2
-import cPickle
+import pickle
 import numpy as np
 import matplotlib.pyplot as plt
 
 root_dir = 'data/WiderFace/'
 img_path = os.path.join(root_dir, 'WIDER_train/images')
-anno_path = os.path.join(root_dir, 'wider_face_split','wider_face_train_bbx_gt.txt')
+anno_path = os.path.join(root_dir, 'wider_face_split', 'wider_face_train_bbx_gt.txt')
 # anno_path = os.path.join(root_dir, 'wider_face_split','wider_face_test_filelist.txt')
 
 res_path = 'data/cache/train'
@@ -16,38 +16,38 @@
 img_count = 0
 box_count = 0
 with open(anno_path, 'rb') as fid:
-	lines = fid.readlines()
+    lines = fid.readlines()
 num_lines = len(lines)
 
 index = 0
-while index<num_lines:
-	filename = lines[index].strip()
-	img_count += 1
-	if img_count%1000 == 0:
-		print img_count
-	num_obj = int(lines[index+1])
-	filepath = os.path.join(img_path, filename)
-	img = cv2.imread(filepath)
-	img_height, img_width = img.shape[:2]
-	boxes = []
-	if num_obj>0:
-		for i in range(num_obj):
-			info = lines[index+2+i].strip().split(' ')
-			x1, y1 = max(int(info[0]), 0), max(int(info[1]), 0)
-			w, h = min(int(info[2]), img_width - x1 - 1), min(int(info[3]), img_height - y1 - 1)
-			if w>=5 and h>=5:
-				box = np.array([x1, y1, x1+w, y1+h])
-				boxes.append(box)
-	boxes = np.array(boxes)
-	box_count += len(boxes)
-	if len(boxes)>0:
-		valid_count += 1
-		annotation = {}
-		annotation['filepath'] = filepath
-		annotation['bboxes'] = boxes
-		image_data.append(annotation)
-	index += (2+num_obj)
+while index < num_lines:
+    filename = lines[index].strip()
+    img_count += 1
+    if img_count % 1000 == 0:
+        print(img_count)
+    num_obj = int(lines[index + 1])
+    filepath = os.path.join(img_path, filename)
+    img = cv2.imread(filepath)
+    img_height, img_width = img.shape[:2]
+    boxes = []
+    if num_obj > 0:
+        for i in range(num_obj):
+            info = lines[index + 2 + i].strip().split(' ')
+            x1, y1 = max(int(info[0]), 0), max(int(info[1]), 0)
+            w, h = min(int(info[2]), img_width - x1 - 1), min(int(info[3]), img_height - y1 - 1)
+            if w >= 5 and h >= 5:
+                box = np.array([x1, y1, x1 + w, y1 + h])
+                boxes.append(box)
+    boxes = np.array(boxes)
+    box_count += len(boxes)
+    if len(boxes) > 0:
+        valid_count += 1
+        annotation = {}
+        annotation['filepath'] = filepath
+        annotation['bboxes'] = boxes
+        image_data.append(annotation)
+    index += (2 + num_obj)
 
-print '{} images and {} valid images and {} boxes'.format(img_count, valid_count,box_count)
+print('{} images and {} valid images and {} boxes'.format(img_count, valid_count, box_count))
 with open(res_path, 'wb') as fid:
-	cPickle.dump(image_data, fid, cPickle.HIGHEST_PROTOCOL)
\ No newline at end of file
+    pickle.dump(image_data, fid, pickle.HIGHEST_PROTOCOL)
diff --git a/keras_csp/__init__.pyc b/keras_csp/__init__.pyc
deleted file mode 100644
index 676371a..0000000
Binary files a/keras_csp/__init__.pyc and /dev/null differ
diff --git a/keras_csp/bbox_process.py b/keras_csp/bbox_process.py
index 97a8107..d2f0553 100644
--- a/keras_csp/bbox_process.py
+++ b/keras_csp/bbox_process.py
@@ -1,18 +1,18 @@
-from __future__ import division
 import numpy as np
-from nms_wrapper import nms
+from keras_csp.nms_wrapper import nms
 
-def parse_det(Y, C, score=0.1, down=4,scale='h'):
+
+def parse_det(Y, C, score=0.1, down=4, scale='h'):
     seman = Y[0][0, :, :, 0]
-    if scale=='h':
-        height = np.exp(Y[1][0, :, :, 0])*down
-        width = 0.41*height
-    elif scale=='w':
-        width = np.exp(Y[1][0, :, :, 0])*down
-        height = width/0.41
-    elif scale=='hw':
-        height = np.exp(Y[1][0, :, :, 0])*down
-        width = np.exp(Y[1][0, :, :, 1])*down
+    if scale == 'h':
+        height = np.exp(Y[1][0, :, :, 0]) * down
+        width = 0.41 * height
+    elif scale == 'w':
+        width = np.exp(Y[1][0, :, :, 0]) * down
+        height = width / 0.41
+    elif scale == 'hw':
+        height = np.exp(Y[1][0, :, :, 0]) * down
+        width = np.exp(Y[1][0, :, :, 1]) * down
     y_c, x_c = np.where(seman > score)
     boxs = []
     if len(y_c) > 0:
@@ -20,13 +20,14 @@ def parse_det(Y, C, score=0.1, down=4,scale='h'):
             h = height[y_c[i], x_c[i]]
             w = width[y_c[i], x_c[i]]
             s = seman[y_c[i], x_c[i]]
-            x1, y1 = max(0, (x_c[i]+0.5) * down - w / 2), max(0, (y_c[i]+0.5) * down - h / 2)
+            x1, y1 = max(0, (x_c[i] + 0.5) * down - w / 2), max(0, (y_c[i] + 0.5) * down - h / 2)
             boxs.append([x1, y1, min(x1 + w, C.size_test[1]), min(y1 + h, C.size_test[0]), s])
         boxs = np.asarray(boxs, dtype=np.float32)
         keep = nms(boxs, 0.5, usegpu=False, gpu_id=0)
         boxs = boxs[keep, :]
     return boxs
 
+
 def parse_det_top(Y, C, score=0.1):
     seman = Y[0][0, :, :, 0]
     height = Y[1][0, :, :, 0]
@@ -44,6 +45,7 @@ def parse_det_top(Y, C, score=0.1):
         boxs = boxs[keep, :]
     return boxs
 
+
 def parse_det_bottom(Y, C, score=0.1):
     seman = Y[0][0, :, :, 0]
     height = Y[1][0, :, :, 0]
@@ -54,14 +56,15 @@ def parse_det_bottom(Y, C, score=0.1):
             h = np.exp(height[y_c[i], x_c[i]]) * 4
             w = 0.41 * h
             s = seman[y_c[i], x_c[i]]
-            x1, y1 = max(0, x_c[i] * 4 + 2 - w / 2), max(0, y_c[i] * 4 + 2-h)
+            x1, y1 = max(0, x_c[i] * 4 + 2 - w / 2), max(0, y_c[i] * 4 + 2 - h)
             boxs.append([x1, y1, min(x1 + w, C.size_test[1]), min(y1 + h, C.size_test[0]), s])
         boxs = np.asarray(boxs, dtype=np.float32)
         keep = nms(boxs, 0.5, usegpu=False, gpu_id=0)
         boxs = boxs[keep, :]
     return boxs
 
-def parse_det_offset(Y, C, score=0.1,down=4):
+
+def parse_det_offset(Y, C, score=0.1, down=4):
     seman = Y[0][0, :, :, 0]
     height = Y[1][0, :, :, 0]
     offset_y = Y[2][0, :, :, 0]
@@ -71,7 +74,7 @@ def parse_det_offset(Y, C, score=0.1,down=4):
     if len(y_c) > 0:
         for i in range(len(y_c)):
             h = np.exp(height[y_c[i], x_c[i]]) * down
-            w = 0.41*h
+            w = 0.41 * h
             o_y = offset_y[y_c[i], x_c[i]]
             o_x = offset_x[y_c[i], x_c[i]]
             s = seman[y_c[i], x_c[i]]
@@ -82,7 +85,8 @@ def parse_det_offset(Y, C, score=0.1,down=4):
         boxs = boxs[keep, :]
     return boxs
 
-def parse_wider_offset(Y, C, score=0.1,down=4,nmsthre=0.5):
+
+def parse_wider_offset(Y, C, score=0.1, down=4, nmsthre=0.5):
     seman = Y[0][0, :, :, 0]
     height = Y[1][0, :, :, 0]
     width = Y[1][0, :, :, 1]
@@ -101,12 +105,13 @@ def parse_wider_offset(Y, C, score=0.1,down=4,nmsthre=0.5):
             x1, y1 = min(x1, C.size_test[1]), min(y1, C.size_test[0])
             boxs.append([x1, y1, min(x1 + w, C.size_test[1]), min(y1 + h, C.size_test[0]), s])
         boxs = np.asarray(boxs, dtype=np.float32)
-        #keep = nms(boxs, nmsthre, usegpu=False, gpu_id=0)
-        #boxs = boxs[keep, :]
-	boxs = soft_bbox_vote(boxs,thre=nmsthre)
+        # keep = nms(boxs, nmsthre, usegpu=False, gpu_id=0)
+        # boxs = boxs[keep, :]
+        boxs = soft_bbox_vote(boxs, thre=nmsthre)
     return boxs
 
-def soft_bbox_vote(det,thre=0.35,score=0.05):
+
+def soft_bbox_vote(det, thre=0.35, score=0.05):
     if det.shape[0] <= 1:
         return det
     order = det[:, 4].ravel().argsort()[::-1]
@@ -160,7 +165,8 @@ def soft_bbox_vote(det,thre=0.35,score=0.05):
     dets = dets[order, :]
     return dets
 
-def bbox_vote(det,thre):
+
+def bbox_vote(det, thre):
     if det.shape[0] <= 1:
         return det
     order = det[:, 4].ravel().argsort()[::-1]
diff --git a/keras_csp/bbox_process.pyc b/keras_csp/bbox_process.pyc
deleted file mode 100644
index cf0149b..0000000
Binary files a/keras_csp/bbox_process.pyc and /dev/null differ
diff --git a/keras_csp/bbox_transform.py b/keras_csp/bbox_transform.py
index b7992ea..fbc8796 100644
--- a/keras_csp/bbox_transform.py
+++ b/keras_csp/bbox_transform.py
@@ -7,6 +7,7 @@
 
 import numpy as np
 
+
 def bbox_transform(ex_rois, gt_rois):
     ex_widths = ex_rois[:, 2] - ex_rois[:, 0] + 1.0
     ex_heights = ex_rois[:, 3] - ex_rois[:, 1] + 1.0
@@ -27,6 +28,7 @@ def bbox_transform(ex_rois, gt_rois):
         (targets_dx, targets_dy, targets_dw, targets_dh)).transpose()
     return targets
 
+
 def bbox_transform_inv(boxes, deltas):
     if boxes.shape[0] == 0:
         return np.zeros((0, deltas.shape[1]), dtype=deltas.dtype)
@@ -60,7 +62,8 @@ def bbox_transform_inv(boxes, deltas):
 
     return pred_boxes
 
-def compute_targets(ex_rois, gt_rois, classifier_regr_std,std):
+
+def compute_targets(ex_rois, gt_rois, classifier_regr_std, std):
     """Compute bounding-box regression targets for an image."""
 
     assert ex_rois.shape[0] == gt_rois.shape[0]
@@ -68,11 +71,12 @@ def compute_targets(ex_rois, gt_rois, classifier_regr_std,std):
     assert gt_rois.shape[1] == 4
 
     targets = bbox_transform(ex_rois, gt_rois)
-	# Optionally normalize targets by a precomputed mean and stdev
+    # Optionally normalize targets by a precomputed mean and stdev
     if std:
-		targets = targets/np.array(classifier_regr_std)
+        targets = targets / np.array(classifier_regr_std)
     return targets
 
+
 def clip_boxes(boxes, im_shape):
     # boxes[:, 0::4] = np.maximum(np.minimum(boxes[:, 0::4], im_shape[1] - 1), 0)
     # # y1 >= 0
diff --git a/keras_csp/bbox_transform.pyc b/keras_csp/bbox_transform.pyc
deleted file mode 100644
index bf6f91e..0000000
Binary files a/keras_csp/bbox_transform.pyc and /dev/null differ
diff --git a/keras_csp/config.py b/keras_csp/config.py
index f46b12f..c739c8a 100755
--- a/keras_csp/config.py
+++ b/keras_csp/config.py
@@ -1,27 +1,26 @@
-
 class Config(object):
-	def __init__(self):
-		self.gpu_ids = '0'
-		self.onegpu = 2
-		self.num_epochs = 150
-		self.add_epoch = 0
-		self.iter_per_epoch = 2000
-		self.init_lr = 1e-4
-		self.alpha = 0.999
+    def __init__(self):
+        self.gpu_ids = '0'
+        self.onegpu = 2
+        self.num_epochs = 150
+        self.add_epoch = 0
+        self.iter_per_epoch = 2000
+        self.init_lr = 1e-4
+        self.alpha = 0.999
 
-		# setting for network architechture
-		self.network = 'resnet50' # or 'mobilenet'
-		self.point = 'center' # or 'top', 'bottom
-		self.scale = 'h' # or 'w', 'hw'
-		self.num_scale = 1 # 1 for height (or width) prediction, 2 for height+width prediction
-		self.offset = False # append offset prediction or not
-		self.down = 4 # downsampling rate of the feature map for detection
-		self.radius = 2 # surrounding areas of positives for the scale map
+        # setting for network architecture
+        self.network = 'resnet50'  # or 'mobilenet'
+        self.point = 'center'  # or 'top', 'bottom'
+        self.scale = 'h'  # or 'w', 'hw'
+        self.num_scale = 1  # 1 for height (or width) prediction, 2 for height+width prediction
+        self.offset = False  # append offset prediction or not
+        self.down = 4  # downsampling rate of the feature map for detection
+        self.radius = 2  # surrounding areas of positives for the scale map
 
-		# setting for data augmentation
-		self.use_horizontal_flips = True
-		self.brightness = (0.5, 2, 0.5)
-		self.size_train = (336, 448)
+        # setting for data augmentation
+        self.use_horizontal_flips = True
+        self.brightness = (0.5, 2)
+        self.size_train = (336, 448)
 
-		# image channel-wise mean to subtract, the order is BGR
-		self.img_channel_mean = [103.939, 116.779, 123.68]
+        # image channel-wise mean to subtract, the order is BGR
+        self.img_channel_mean = [103.939, 116.779, 123.68]
diff --git a/keras_csp/config.pyc b/keras_csp/config.pyc
deleted file mode 100644
index 4f122d0..0000000
Binary files a/keras_csp/config.pyc and /dev/null differ
diff --git a/keras_csp/data_augment.py b/keras_csp/data_augment.py
index 599d69b..247ab76 100644
--- a/keras_csp/data_augment.py
+++ b/keras_csp/data_augment.py
@@ -1,4 +1,4 @@
-from __future__ import division
+
 import cv2
 import numpy as np
 import copy
@@ -85,20 +85,21 @@ def random_crop(image, gts, igs, crop_size, limit=8):
 
     return cropped_image, gts, igs
 
+
 def random_pave(image, gts, igs, pave_size, limit=8):
     img_height, img_width = image.shape[0:2]
     pave_h, pave_w = pave_size
     # paved_image = np.zeros((pave_h, pave_w, 3), dtype=image.dtype)
-    paved_image = np.ones((pave_h, pave_w, 3), dtype=image.dtype)*np.mean(image,dtype=int)
-    pave_x = int(np.random.randint(0, pave_w-img_width+1))
-    pave_y = int(np.random.randint(0, pave_h-img_height+1))
-    paved_image[pave_y:pave_y+img_height, pave_x:pave_x+img_width] = image
+    paved_image = np.ones((pave_h, pave_w, 3), dtype=image.dtype) * np.mean(image, dtype=int)
+    pave_x = int(np.random.randint(0, pave_w - img_width + 1))
+    pave_y = int(np.random.randint(0, pave_h - img_height + 1))
+    paved_image[pave_y:pave_y + img_height, pave_x:pave_x + img_width] = image
     # pave detections
     if len(igs) > 0:
         igs[:, 0:4:2] += pave_x
         igs[:, 1:4:2] += pave_y
-        keep_inds = ((igs[:, 2] - igs[:, 0]) >=8) & \
-                    ((igs[:, 3] - igs[:, 1]) >=8)
+        keep_inds = ((igs[:, 2] - igs[:, 0]) >= 8) & \
+                    ((igs[:, 3] - igs[:, 1]) >= 8)
         igs = igs[keep_inds]
 
     if len(gts) > 0:
diff --git a/keras_csp/data_augment.pyc b/keras_csp/data_augment.pyc
deleted file mode 100644
index 5029909..0000000
Binary files a/keras_csp/data_augment.pyc and /dev/null differ
diff --git a/keras_csp/data_generators.py b/keras_csp/data_generators.py
index 37018b5..ad51a46 100644
--- a/keras_csp/data_generators.py
+++ b/keras_csp/data_generators.py
@@ -1,292 +1,302 @@
-from __future__ import absolute_import
-from __future__ import division
 # import numpy as np
 # import cv2
 import random
 from . import data_augment
 from .bbox_transform import *
 
-def calc_gt_center(C, img_data,r=2, down=4,scale='h',offset=True):
-	def gaussian(kernel):
-		sigma = ((kernel-1) * 0.5 - 1) * 0.3 + 0.8
-		s = 2*(sigma**2)
-		dx = np.exp(-np.square(np.arange(kernel) - int(kernel / 2)) / s)
-		return np.reshape(dx,(-1,1))
-	gts = np.copy(img_data['bboxes'])
-	igs = np.copy(img_data['ignoreareas'])
-	scale_map = np.zeros((int(C.size_train[0]/down), int(C.size_train[1]/down), 2))
-	if scale=='hw':
-		scale_map = np.zeros((int(C.size_train[0] / down), int(C.size_train[1] / down), 3))
-	if offset:
-		offset_map = np.zeros((int(C.size_train[0] / down), int(C.size_train[1] / down), 3))
-	seman_map = np.zeros((int(C.size_train[0]/down), int(C.size_train[1]/down), 3))
-	seman_map[:,:,1] = 1
-	if len(igs) > 0:
-		igs = igs/down
-		for ind in range(len(igs)):
-			x1,y1,x2,y2 = int(igs[ind,0]), int(igs[ind,1]), int(np.ceil(igs[ind,2])), int(np.ceil(igs[ind,3]))
-			seman_map[y1:y2, x1:x2,1] = 0
-	if len(gts)>0:
-		gts = gts/down
-		for ind in range(len(gts)):
-			# x1, y1, x2, y2 = int(round(gts[ind, 0])), int(round(gts[ind, 1])), int(round(gts[ind, 2])), int(round(gts[ind, 3]))
-			x1, y1, x2, y2 = int(np.ceil(gts[ind, 0])), int(np.ceil(gts[ind, 1])), int(gts[ind, 2]), int(gts[ind, 3])
-			c_x, c_y = int((gts[ind, 0] + gts[ind, 2]) / 2), int((gts[ind, 1] + gts[ind, 3]) / 2)
-			dx = gaussian(x2-x1)
-			dy = gaussian(y2-y1)
-			gau_map = np.multiply(dy, np.transpose(dx))
-			seman_map[y1:y2, x1:x2,0] = np.maximum(seman_map[y1:y2, x1:x2,0], gau_map)
-			seman_map[y1:y2, x1:x2,1] = 1
-			seman_map[c_y, c_x, 2] = 1
-
-			if scale == 'h':
-				scale_map[c_y-r:c_y+r+1, c_x-r:c_x+r+1, 0] = np.log(gts[ind, 3] - gts[ind, 1])
-				scale_map[c_y-r:c_y+r+1, c_x-r:c_x+r+1, 1] = 1
-			elif scale=='w':
-				scale_map[c_y-r:c_y+r+1, c_x-r:c_x+r+1, 0] = np.log(gts[ind, 2] - gts[ind, 0])
-				scale_map[c_y-r:c_y+r+1, c_x-r:c_x+r+1, 1] = 1
-			elif scale=='hw':
-				scale_map[c_y-r:c_y+r+1, c_x-r:c_x+r+1, 0] = np.log(gts[ind, 3] - gts[ind, 1])
-				scale_map[c_y-r:c_y+r+1, c_x-r:c_x+r+1, 1] = np.log(gts[ind, 2] - gts[ind, 0])
-				scale_map[c_y-r:c_y+r+1, c_x-r:c_x+r+1, 2] = 1
-			if offset:
-				offset_map[c_y, c_x, 0] = (gts[ind, 1] + gts[ind, 3]) / 2 - c_y - 0.5
-				offset_map[c_y, c_x, 1] = (gts[ind, 0] + gts[ind, 2]) / 2 - c_x - 0.5
-				offset_map[c_y, c_x, 2] = 1
-
-	if offset:
-		return seman_map,scale_map,offset_map
-	else:
-		return seman_map, scale_map
-
-def calc_gt_top(C, img_data,r=2):
-	def gaussian(kernel):
-		sigma = ((kernel-1) * 0.5 - 1) * 0.3 + 0.8
-		s = 2*(sigma**2)
-		dx = np.exp(-np.square(np.arange(kernel) - int(kernel / 2)) / s)
-		return np.reshape(dx,(-1,1))
-	gts = np.copy(img_data['bboxes'])
-	igs = np.copy(img_data['ignoreareas'])
-	scale_map = np.zeros((int(C.size_train[0]/4), int(C.size_train[1]/4), 2))
-	seman_map = np.zeros((int(C.size_train[0]/4), int(C.size_train[1]/4), 3))
-	seman_map[:,:,1] = 1
-	if len(igs) > 0:
-		igs = igs/4
-		for ind in range(len(igs)):
-			x1,y1,x2,y2 = int(igs[ind,0]), int(igs[ind,1]), int(np.ceil(igs[ind,2])), int(np.ceil(igs[ind,3]))
-			seman_map[y1:y2, x1:x2,1] = 0
-	if len(gts)>0:
-		gts = gts/4
-		for ind in range(len(gts)):
-			x1, y1, x2, y2 = int(round(gts[ind, 0])), int(round(gts[ind, 1])), int(round(gts[ind, 2])), int(round(gts[ind, 3]))
-			w = x2-x1
-			c_x = int((gts[ind, 0] + gts[ind, 2]) / 2)
-
-			dx = gaussian(w)
-			dy = gaussian(w)
-			gau_map = np.multiply(dy, np.transpose(dx))
-
-			ty = np.maximum(0,int(round(y1-w/2)))
-			ot = ty-int(round(y1-w/2))
-			seman_map[ty:ty+w-ot, x1:x2,0] = np.maximum(seman_map[ty:ty+w-ot, x1:x2,0], gau_map[ot:,:])
-			seman_map[ty:ty+w-ot, x1:x2,1] = 1
-			seman_map[y1, c_x, 2] = 1
-
-			scale_map[y1-r:y1+r+1, c_x-r:c_x+r+1, 0] = np.log(gts[ind,3]-gts[ind,1])
-			scale_map[y1-r:y1+r+1, c_x-r:c_x+r+1, 1] = 1
-	return seman_map,scale_map
+
+def calc_gt_center(C, img_data, r=2, down=4, scale='h', offset=True):
+    def gaussian(kernel):
+        sigma = ((kernel - 1) * 0.5 - 1) * 0.3 + 0.8
+        s = 2 * (sigma ** 2)
+        dx = np.exp(-np.square(np.arange(kernel) - int(kernel / 2)) / s)
+        return np.reshape(dx, (-1, 1))
+
+    gts = np.copy(img_data['bboxes'])
+    igs = np.copy(img_data['ignoreareas'])
+    scale_map = np.zeros((int(C.size_train[0] / down), int(C.size_train[1] / down), 2))
+    if scale == 'hw':
+        scale_map = np.zeros((int(C.size_train[0] / down), int(C.size_train[1] / down), 3))
+    if offset:
+        offset_map = np.zeros((int(C.size_train[0] / down), int(C.size_train[1] / down), 3))
+    seman_map = np.zeros((int(C.size_train[0] / down), int(C.size_train[1] / down), 3))
+    seman_map[:, :, 1] = 1
+    if len(igs) > 0:
+        igs = igs / down
+        for ind in range(len(igs)):
+            x1, y1, x2, y2 = int(igs[ind, 0]), int(igs[ind, 1]), int(np.ceil(igs[ind, 2])), int(np.ceil(igs[ind, 3]))
+            seman_map[y1:y2, x1:x2, 1] = 0
+    if len(gts) > 0:
+        gts = gts / down
+        for ind in range(len(gts)):
+            # x1, y1, x2, y2 = int(round(gts[ind, 0])), int(round(gts[ind, 1])), int(round(gts[ind, 2])), int(round(gts[ind, 3]))
+            x1, y1, x2, y2 = int(np.ceil(gts[ind, 0])), int(np.ceil(gts[ind, 1])), int(gts[ind, 2]), int(gts[ind, 3])
+            c_x, c_y = int((gts[ind, 0] + gts[ind, 2]) / 2), int((gts[ind, 1] + gts[ind, 3]) / 2)
+            dx = gaussian(x2 - x1)
+            dy = gaussian(y2 - y1)
+            gau_map = np.multiply(dy, np.transpose(dx))
+            seman_map[y1:y2, x1:x2, 0] = np.maximum(seman_map[y1:y2, x1:x2, 0], gau_map)
+            seman_map[y1:y2, x1:x2, 1] = 1
+            seman_map[c_y, c_x, 2] = 1
+
+            if scale == 'h':
+                scale_map[c_y - r:c_y + r + 1, c_x - r:c_x + r + 1, 0] = np.log(gts[ind, 3] - gts[ind, 1])
+                scale_map[c_y - r:c_y + r + 1, c_x - r:c_x + r + 1, 1] = 1
+            elif scale == 'w':
+                scale_map[c_y - r:c_y + r + 1, c_x - r:c_x + r + 1, 0] = np.log(gts[ind, 2] - gts[ind, 0])
+                scale_map[c_y - r:c_y + r + 1, c_x - r:c_x + r + 1, 1] = 1
+            elif scale == 'hw':
+                scale_map[c_y - r:c_y + r + 1, c_x - r:c_x + r + 1, 0] = np.log(gts[ind, 3] - gts[ind, 1])
+                scale_map[c_y - r:c_y + r + 1, c_x - r:c_x + r + 1, 1] = np.log(gts[ind, 2] - gts[ind, 0])
+                scale_map[c_y - r:c_y + r + 1, c_x - r:c_x + r + 1, 2] = 1
+            if offset:
+                offset_map[c_y, c_x, 0] = (gts[ind, 1] + gts[ind, 3]) / 2 - c_y - 0.5
+                offset_map[c_y, c_x, 1] = (gts[ind, 0] + gts[ind, 2]) / 2 - c_x - 0.5
+                offset_map[c_y, c_x, 2] = 1
+
+    if offset:
+        return seman_map, scale_map, offset_map
+    else:
+        return seman_map, scale_map
+
+
+def calc_gt_top(C, img_data, r=2):
+    def gaussian(kernel):
+        sigma = ((kernel - 1) * 0.5 - 1) * 0.3 + 0.8
+        s = 2 * (sigma ** 2)
+        dx = np.exp(-np.square(np.arange(kernel) - int(kernel / 2)) / s)
+        return np.reshape(dx, (-1, 1))
+
+    gts = np.copy(img_data['bboxes'])
+    igs = np.copy(img_data['ignoreareas'])
+    scale_map = np.zeros((int(C.size_train[0] / 4), int(C.size_train[1] / 4), 2))
+    seman_map = np.zeros((int(C.size_train[0] / 4), int(C.size_train[1] / 4), 3))
+    seman_map[:, :, 1] = 1
+    if len(igs) > 0:
+        igs = igs / 4
+        for ind in range(len(igs)):
+            x1, y1, x2, y2 = int(igs[ind, 0]), int(igs[ind, 1]), int(np.ceil(igs[ind, 2])), int(np.ceil(igs[ind, 3]))
+            seman_map[y1:y2, x1:x2, 1] = 0
+    if len(gts) > 0:
+        gts = gts / 4
+        for ind in range(len(gts)):
+            x1, y1, x2, y2 = int(round(gts[ind, 0])), int(round(gts[ind, 1])), int(round(gts[ind, 2])), int(
+                round(gts[ind, 3]))
+            w = x2 - x1
+            c_x = int((gts[ind, 0] + gts[ind, 2]) / 2)
+
+            dx = gaussian(w)
+            dy = gaussian(w)
+            gau_map = np.multiply(dy, np.transpose(dx))
+
+            ty = np.maximum(0, int(round(y1 - w / 2)))
+            ot = ty - int(round(y1 - w / 2))
+            seman_map[ty:ty + w - ot, x1:x2, 0] = np.maximum(seman_map[ty:ty + w - ot, x1:x2, 0], gau_map[ot:, :])
+            seman_map[ty:ty + w - ot, x1:x2, 1] = 1
+            seman_map[y1, c_x, 2] = 1
+
+            scale_map[y1 - r:y1 + r + 1, c_x - r:c_x + r + 1, 0] = np.log(gts[ind, 3] - gts[ind, 1])
+            scale_map[y1 - r:y1 + r + 1, c_x - r:c_x + r + 1, 1] = 1
+    return seman_map, scale_map
+
 
 def calc_gt_bottom(C, img_data, r=2):
-	def gaussian(kernel):
-		sigma = ((kernel-1) * 0.5 - 1) * 0.3 + 0.8
-		s = 2*(sigma**2)
-		dx = np.exp(-np.square(np.arange(kernel) - int(kernel / 2)) / s)
-		return np.reshape(dx,(-1,1))
-	gts = np.copy(img_data['bboxes'])
-	igs = np.copy(img_data['ignoreareas'])
-	scale_map = np.zeros((int(C.size_train[0]/4), int(C.size_train[1]/4), 2))
-	seman_map = np.zeros((int(C.size_train[0]/4), int(C.size_train[1]/4), 3))
-	seman_map[:,:,1] = 1
-	if len(igs) > 0:
-		igs = igs/4
-		for ind in range(len(igs)):
-			x1,y1,x2,y2 = int(igs[ind,0]), int(igs[ind,1]), int(np.ceil(igs[ind,2])), int(np.ceil(igs[ind,3]))
-			seman_map[y1:y2, x1:x2,1] = 0
-	if len(gts)>0:
-		gts = gts/4
-		for ind in range(len(gts)):
-			x1, y1, x2, y2 = int(np.ceil(gts[ind, 0])), int(np.ceil(gts[ind, 1])), int(gts[ind, 2]), int(gts[ind, 3])
-			y2 = np.minimum(int(C.random_crop[0] / 4) - 1, y2)
-			w = x2 - x1
-			c_x = int((gts[ind, 0] + gts[ind, 2]) / 2)
-			dx = gaussian(w)
-			dy = gaussian(w)
-			gau_map = np.multiply(dy, np.transpose(dx))
-
-			by = np.minimum(int(C.random_crop[0]/4)-1, int(round(y2+w/2)))
-			ob = int(round(y2+w/2))-by
-			seman_map[by-w+ob:by, x1:x2, 0] = np.maximum(seman_map[by-w+ob:by, x1:x2, 0], gau_map[:w-ob, :])
-			seman_map[by-w+ob:by, x1:x2, 1] = 1
-			seman_map[y2, c_x, 2] = 1
-
-			scale_map[y2-r:y2+r+1, c_x-r:c_x+r+1, 0] = np.log(gts[ind,3]-gts[ind,1])
-			scale_map[y2-r:y2+r+1, c_x-r:c_x+r+1, 1] = 1
-
-	return seman_map,scale_map
-
-def get_data(ped_data, C, batchsize = 8):
-	current_ped = 0
-	while True:
-		x_img_batch, y_seman_batch, y_height_batch, y_offset_batch = [], [], [], []
-		if current_ped>len(ped_data)-batchsize:
-			random.shuffle(ped_data)
-			current_ped = 0
-		for img_data in ped_data[current_ped:current_ped + batchsize]:
-			try:
-				img_data, x_img = data_augment.augment(img_data, C)
-				if C.offset:
-					y_seman, y_height, y_offset = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=True)
-				else:
-					if C.point == 'top':
-						y_seman, y_height = calc_gt_top(C, img_data)
-					elif C.point == 'bottom':
-						y_seman, y_height = calc_gt_bottom(C, img_data)
-					else:
-						y_seman, y_height = calc_gt_center(C, img_data,down=C.down, scale=C.scale, offset=False)
-
-				x_img = x_img.astype(np.float32)
-				x_img[:, :, 0] -= C.img_channel_mean[0]
-				x_img[:, :, 1] -= C.img_channel_mean[1]
-				x_img[:, :, 2] -= C.img_channel_mean[2]
-
-				x_img_batch.append(np.expand_dims(x_img, axis=0))
-				y_seman_batch.append(np.expand_dims(y_seman, axis=0))
-				y_height_batch.append(np.expand_dims(y_height, axis=0))
-				if C.offset:
-					y_offset_batch.append(np.expand_dims(y_offset, axis=0))
-			except Exception as e:
-				print ('get_batch_gt:',e)
-		x_img_batch = np.concatenate(x_img_batch,axis=0)
-		y_seman_batch = np.concatenate(y_seman_batch, axis=0)
-		y_height_batch = np.concatenate(y_height_batch, axis=0)
-		if C.offset:
-			y_offset_batch = np.concatenate(y_offset_batch, axis=0)
-		current_ped += batchsize
-		if C.offset:
-			yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch), np.copy(y_offset_batch)]
-		else:
-			yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch)]
-
-def get_data_hybrid(ped_data, emp_data, C, batchsize = 8,hyratio=0.5):
-	current_ped = 0
-	current_emp = 0
-	batchsize_ped = int(batchsize * hyratio)
-	batchsize_emp = batchsize - batchsize_ped
-	while True:
-		x_img_batch, y_seman_batch, y_height_batch, y_offset_batch = [], [], [], []
-		if current_ped>len(ped_data)-batchsize_ped:
-			random.shuffle(ped_data)
-			current_ped = 0
-		if current_emp>len(emp_data)-batchsize_emp:
-			random.shuffle(emp_data)
-			current_emp = 0
-		for img_data in ped_data[current_ped:current_ped + batchsize_ped]:
-			try:
-				img_data, x_img = data_augment.augment(img_data, C)
-				if C.offset:
-					y_seman, y_height, y_offset = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=C.offset)
-				else:
-					if C.point == 'top':
-						y_seman, y_height = calc_gt_top(C, img_data)
-					elif C.point == 'bottom':
-						y_seman, y_height = calc_gt_bottom(C, img_data)
-					else:
-						y_seman, y_height = calc_gt_center(C, img_data,down=C.down, scale=C.scale, offset=False)
-
-				x_img = x_img.astype(np.float32)
-				x_img[:, :, 0] -= C.img_channel_mean[0]
-				x_img[:, :, 1] -= C.img_channel_mean[1]
-				x_img[:, :, 2] -= C.img_channel_mean[2]
-
-				x_img_batch.append(np.expand_dims(x_img, axis=0))
-				y_seman_batch.append(np.expand_dims(y_seman, axis=0))
-				y_height_batch.append(np.expand_dims(y_height, axis=0))
-				if C.offset:
-					y_offset_batch.append(np.expand_dims(y_offset, axis=0))
-
-			except Exception as e:
-				print ('get_batch_gt:',e)
-		for img_data in emp_data[current_emp:current_emp + batchsize_emp]:
-			try:
-				img_data, x_img = data_augment.augment(img_data, C)
-				if C.offset:
-					y_seman, y_height, y_offset = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=C.offset)
-				else:
-					if C.point == 'top':
-						y_seman, y_height = calc_gt_top(C, img_data)
-					elif C.point == 'bottom':
-						y_seman, y_height = calc_gt_bottom(C, img_data)
-					else:
-						y_seman, y_height = calc_gt_center(C, img_data,down=C.down, scale=C.scale, offset=False)
-
-				x_img = x_img.astype(np.float32)
-				x_img[:, :, 0] -= C.img_channel_mean[0]
-				x_img[:, :, 1] -= C.img_channel_mean[1]
-				x_img[:, :, 2] -= C.img_channel_mean[2]
-
-				x_img_batch.append(np.expand_dims(x_img, axis=0))
-				y_seman_batch.append(np.expand_dims(y_seman, axis=0))
-				y_height_batch.append(np.expand_dims(y_height, axis=0))
-				if C.offset:
-					y_offset_batch.append(np.expand_dims(y_offset, axis=0))
-			except Exception as e:
-				print ('get_batch_gt_emp:',e)
-		x_img_batch = np.concatenate(x_img_batch,axis=0)
-		y_seman_batch = np.concatenate(y_seman_batch, axis=0)
-		y_height_batch = np.concatenate(y_height_batch, axis=0)
-		if C.offset:
-			y_offset_batch = np.concatenate(y_offset_batch, axis=0)
-		current_ped += batchsize_ped
-		current_emp += batchsize_emp
-		if C.offset:
-			yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch), np.copy(y_offset_batch)]
-		else:
-			yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch)]
-
-def get_data_wider(ped_data, C, batchsize = 8):
-	current_ped = 0
-	while True:
-		x_img_batch, y_seman_batch, y_height_batch, y_offset_batch = [], [], [], []
-		if current_ped>len(ped_data)-batchsize:
-			random.shuffle(ped_data)
-			current_ped = 0
-		for img_data in ped_data[current_ped:current_ped + batchsize]:
-			try:
-				img_data, x_img = data_augment.augment_wider(img_data, C)
-				if C.offset:
-					y_seman, y_height, y_offset = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=True)
-				else:
-					y_seman, y_height = calc_gt_center(C, img_data,down=C.down, scale=C.scale, offset=False)
-
-				x_img = x_img.astype(np.float32)
-				x_img[:, :, 0] -= C.img_channel_mean[0]
-				x_img[:, :, 1] -= C.img_channel_mean[1]
-				x_img[:, :, 2] -= C.img_channel_mean[2]
-
-				x_img_batch.append(np.expand_dims(x_img, axis=0))
-				y_seman_batch.append(np.expand_dims(y_seman, axis=0))
-				y_height_batch.append(np.expand_dims(y_height, axis=0))
-				if C.offset:
-					y_offset_batch.append(np.expand_dims(y_offset, axis=0))
-			except Exception as e:
-				print ('get_batch_gt:',e)
-		x_img_batch = np.concatenate(x_img_batch,axis=0)
-		y_seman_batch = np.concatenate(y_seman_batch, axis=0)
-		y_height_batch = np.concatenate(y_height_batch, axis=0)
-		if C.offset:
-			y_offset_batch = np.concatenate(y_offset_batch, axis=0)
-		current_ped += batchsize
-		if C.offset:
-			yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch), np.copy(y_offset_batch)]
-		else:
-			yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch)]
+    def gaussian(kernel):
+        sigma = ((kernel - 1) * 0.5 - 1) * 0.3 + 0.8
+        s = 2 * (sigma ** 2)
+        dx = np.exp(-np.square(np.arange(kernel) - int(kernel / 2)) / s)
+        return np.reshape(dx, (-1, 1))
+
+    gts = np.copy(img_data['bboxes'])
+    igs = np.copy(img_data['ignoreareas'])
+    scale_map = np.zeros((int(C.size_train[0] / 4), int(C.size_train[1] / 4), 2))
+    seman_map = np.zeros((int(C.size_train[0] / 4), int(C.size_train[1] / 4), 3))
+    seman_map[:, :, 1] = 1
+    if len(igs) > 0:
+        igs = igs / 4
+        for ind in range(len(igs)):
+            x1, y1, x2, y2 = int(igs[ind, 0]), int(igs[ind, 1]), int(np.ceil(igs[ind, 2])), int(np.ceil(igs[ind, 3]))
+            seman_map[y1:y2, x1:x2, 1] = 0
+    if len(gts) > 0:
+        gts = gts / 4
+        for ind in range(len(gts)):
+            x1, y1, x2, y2 = int(np.ceil(gts[ind, 0])), int(np.ceil(gts[ind, 1])), int(gts[ind, 2]), int(gts[ind, 3])
+            y2 = np.minimum(int(C.random_crop[0] / 4) - 1, y2)
+            w = x2 - x1
+            c_x = int((gts[ind, 0] + gts[ind, 2]) / 2)
+            dx = gaussian(w)
+            dy = gaussian(w)
+            gau_map = np.multiply(dy, np.transpose(dx))
+
+            by = np.minimum(int(C.random_crop[0] / 4) - 1, int(round(y2 + w / 2)))
+            ob = int(round(y2 + w / 2)) - by
+            seman_map[by - w + ob:by, x1:x2, 0] = np.maximum(seman_map[by - w + ob:by, x1:x2, 0], gau_map[:w - ob, :])
+            seman_map[by - w + ob:by, x1:x2, 1] = 1
+            seman_map[y2, c_x, 2] = 1
+
+            scale_map[y2 - r:y2 + r + 1, c_x - r:c_x + r + 1, 0] = np.log(gts[ind, 3] - gts[ind, 1])
+            scale_map[y2 - r:y2 + r + 1, c_x - r:c_x + r + 1, 1] = 1
+
+    return seman_map, scale_map
+
+
+def get_data(ped_data, C, batchsize=8):
+    current_ped = 0
+    while True:
+        x_img_batch, y_seman_batch, y_height_batch, y_offset_batch = [], [], [], []
+        if current_ped > len(ped_data) - batchsize:
+            random.shuffle(ped_data)
+            current_ped = 0
+        for img_data in ped_data[current_ped:current_ped + batchsize]:
+            try:
+                img_data, x_img = data_augment.augment(img_data, C)
+                if C.offset:
+                    y_seman, y_height, y_offset = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=True)
+                else:
+                    if C.point == 'top':
+                        y_seman, y_height = calc_gt_top(C, img_data)
+                    elif C.point == 'bottom':
+                        y_seman, y_height = calc_gt_bottom(C, img_data)
+                    else:
+                        y_seman, y_height = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=False)
+
+                x_img = x_img.astype(np.float32)
+                x_img[:, :, 0] -= C.img_channel_mean[0]
+                x_img[:, :, 1] -= C.img_channel_mean[1]
+                x_img[:, :, 2] -= C.img_channel_mean[2]
+
+                x_img_batch.append(np.expand_dims(x_img, axis=0))
+                y_seman_batch.append(np.expand_dims(y_seman, axis=0))
+                y_height_batch.append(np.expand_dims(y_height, axis=0))
+                if C.offset:
+                    y_offset_batch.append(np.expand_dims(y_offset, axis=0))
+            except Exception as e:
+                print(('get_batch_gt:', e))
+        x_img_batch = np.concatenate(x_img_batch, axis=0)
+        y_seman_batch = np.concatenate(y_seman_batch, axis=0)
+        y_height_batch = np.concatenate(y_height_batch, axis=0)
+        if C.offset:
+            y_offset_batch = np.concatenate(y_offset_batch, axis=0)
+        current_ped += batchsize
+        if C.offset:
+            yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch), np.copy(y_offset_batch)]
+        else:
+            yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch)]
+
+
+def get_data_hybrid(ped_data, emp_data, C, batchsize=8, hyratio=0.5):
+    current_ped = 0
+    current_emp = 0
+    batchsize_ped = int(batchsize * hyratio)
+    batchsize_emp = batchsize - batchsize_ped
+    while True:
+        x_img_batch, y_seman_batch, y_height_batch, y_offset_batch = [], [], [], []
+        if current_ped > len(ped_data) - batchsize_ped:
+            random.shuffle(ped_data)
+            current_ped = 0
+        if current_emp > len(emp_data) - batchsize_emp:
+            random.shuffle(emp_data)
+            current_emp = 0
+        for img_data in ped_data[current_ped:current_ped + batchsize_ped]:
+            try:
+                img_data, x_img = data_augment.augment(img_data, C)
+                if C.offset:
+                    y_seman, y_height, y_offset = calc_gt_center(C, img_data, down=C.down, scale=C.scale,
+                                                                 offset=C.offset)
+                else:
+                    if C.point == 'top':
+                        y_seman, y_height = calc_gt_top(C, img_data)
+                    elif C.point == 'bottom':
+                        y_seman, y_height = calc_gt_bottom(C, img_data)
+                    else:
+                        y_seman, y_height = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=False)
+
+                x_img = x_img.astype(np.float32)
+                x_img[:, :, 0] -= C.img_channel_mean[0]
+                x_img[:, :, 1] -= C.img_channel_mean[1]
+                x_img[:, :, 2] -= C.img_channel_mean[2]
+
+                x_img_batch.append(np.expand_dims(x_img, axis=0))
+                y_seman_batch.append(np.expand_dims(y_seman, axis=0))
+                y_height_batch.append(np.expand_dims(y_height, axis=0))
+                if C.offset:
+                    y_offset_batch.append(np.expand_dims(y_offset, axis=0))
+
+            except Exception as e:
+                print(('get_batch_gt:', e))
+        for img_data in emp_data[current_emp:current_emp + batchsize_emp]:
+            try:
+                img_data, x_img = data_augment.augment(img_data, C)
+                if C.offset:
+                    y_seman, y_height, y_offset = calc_gt_center(C, img_data, down=C.down, scale=C.scale,
+                                                                 offset=C.offset)
+                else:
+                    if C.point == 'top':
+                        y_seman, y_height = calc_gt_top(C, img_data)
+                    elif C.point == 'bottom':
+                        y_seman, y_height = calc_gt_bottom(C, img_data)
+                    else:
+                        y_seman, y_height = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=False)
+
+                x_img = x_img.astype(np.float32)
+                x_img[:, :, 0] -= C.img_channel_mean[0]
+                x_img[:, :, 1] -= C.img_channel_mean[1]
+                x_img[:, :, 2] -= C.img_channel_mean[2]
+
+                x_img_batch.append(np.expand_dims(x_img, axis=0))
+                y_seman_batch.append(np.expand_dims(y_seman, axis=0))
+                y_height_batch.append(np.expand_dims(y_height, axis=0))
+                if C.offset:
+                    y_offset_batch.append(np.expand_dims(y_offset, axis=0))
+            except Exception as e:
+                print(('get_batch_gt_emp:', e))
+        x_img_batch = np.concatenate(x_img_batch, axis=0)
+        y_seman_batch = np.concatenate(y_seman_batch, axis=0)
+        y_height_batch = np.concatenate(y_height_batch, axis=0)
+        if C.offset:
+            y_offset_batch = np.concatenate(y_offset_batch, axis=0)
+        current_ped += batchsize_ped
+        current_emp += batchsize_emp
+        if C.offset:
+            yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch), np.copy(y_offset_batch)]
+        else:
+            yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch)]
+
+
+def get_data_wider(ped_data, C, batchsize=8):
+    current_ped = 0
+    while True:
+        x_img_batch, y_seman_batch, y_height_batch, y_offset_batch = [], [], [], []
+        if current_ped > len(ped_data) - batchsize:
+            random.shuffle(ped_data)
+            current_ped = 0
+        for img_data in ped_data[current_ped:current_ped + batchsize]:
+            try:
+                img_data, x_img = data_augment.augment_wider(img_data, C)
+                if C.offset:
+                    y_seman, y_height, y_offset = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=True)
+                else:
+                    y_seman, y_height = calc_gt_center(C, img_data, down=C.down, scale=C.scale, offset=False)
+
+                x_img = x_img.astype(np.float32)
+                x_img[:, :, 0] -= C.img_channel_mean[0]
+                x_img[:, :, 1] -= C.img_channel_mean[1]
+                x_img[:, :, 2] -= C.img_channel_mean[2]
+
+                x_img_batch.append(np.expand_dims(x_img, axis=0))
+                y_seman_batch.append(np.expand_dims(y_seman, axis=0))
+                y_height_batch.append(np.expand_dims(y_height, axis=0))
+                if C.offset:
+                    y_offset_batch.append(np.expand_dims(y_offset, axis=0))
+            except Exception as e:
+                print(('get_batch_gt:', e))
+        x_img_batch = np.concatenate(x_img_batch, axis=0)
+        y_seman_batch = np.concatenate(y_seman_batch, axis=0)
+        y_height_batch = np.concatenate(y_height_batch, axis=0)
+        if C.offset:
+            y_offset_batch = np.concatenate(y_offset_batch, axis=0)
+        current_ped += batchsize
+        if C.offset:
+            yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch), np.copy(y_offset_batch)]
+        else:
+            yield np.copy(x_img_batch), [np.copy(y_seman_batch), np.copy(y_height_batch)]
diff --git a/keras_csp/data_generators.pyc b/keras_csp/data_generators.pyc
deleted file mode 100644
index dbe5135..0000000
Binary files a/keras_csp/data_generators.pyc and /dev/null differ
diff --git a/keras_csp/keras_layer_L2Normalization.py b/keras_csp/keras_layer_L2Normalization.py
index 5f1b9c2..7b15e33 100644
--- a/keras_csp/keras_layer_L2Normalization.py
+++ b/keras_csp/keras_layer_L2Normalization.py
@@ -22,6 +22,7 @@
 from keras.engine.topology import Layer
 import numpy as np
 
+
 class L2Normalization(Layer):
     '''
     Performs L2 normalization on the input tensor with a learnable scaling parameter
diff --git a/keras_csp/keras_layer_L2Normalization.pyc b/keras_csp/keras_layer_L2Normalization.pyc
deleted file mode 100644
index f1b924b..0000000
Binary files a/keras_csp/keras_layer_L2Normalization.pyc and /dev/null differ
diff --git a/keras_csp/losses.py b/keras_csp/losses.py
index 499994d..1e870dd 100644
--- a/keras_csp/losses.py
+++ b/keras_csp/losses.py
@@ -2,63 +2,65 @@
 from keras.objectives import categorical_crossentropy
 
 if K.image_dim_ordering() == 'tf':
-	import tensorflow as tf
+    import tensorflow as tf
 
 epsilon = 1e-4
 
 
 def cls_center(y_true, y_pred):
+    classification_loss = K.binary_crossentropy(y_pred[:, :, :, 0], y_true[:, :, :, 2])
+    # firstly we compute the focal weight
+    positives = y_true[:, :, :, 2]
+    negatives = y_true[:, :, :, 1] - y_true[:, :, :, 2]
+    foreground_weight = positives * (1.0 - y_pred[:, :, :, 0]) ** 2.0
+    # foreground_weight = positives
+    background_weight = negatives * ((1.0 - y_true[:, :, :, 0]) ** 4.0) * (y_pred[:, :, :, 0] ** 2.0)
+    # background_weight = negatives * ((1.0 - y_true[:, :, :, 0])**4.0)*(0.01 ** 2.0)
 
-	classification_loss = K.binary_crossentropy(y_pred[:, :, :, 0], y_true[:, :, :, 2])
-	# firstly we compute the focal weight
-	positives = y_true[:, :, :, 2]
-	negatives = y_true[:, :, :, 1]-y_true[:, :, :, 2]
-	foreground_weight = positives * (1.0 - y_pred[:, :, :, 0]) ** 2.0
-	# foreground_weight = positives
-	background_weight = negatives * ((1.0 - y_true[:, :, :, 0])**4.0)*(y_pred[:, :, :, 0] ** 2.0)
-	# background_weight = negatives * ((1.0 - y_true[:, :, :, 0])**4.0)*(0.01 ** 2.0)
+    # foreground_weight = y_true[:, :, :, 0] * (1- y_pred[:, :, :, 0]) ** 2.0
+    # background_weight = negatives * y_pred[:, :, :, 0] ** 2.0
 
-	# foreground_weight = y_true[:, :, :, 0] * (1- y_pred[:, :, :, 0]) ** 2.0
-	# background_weight = negatives * y_pred[:, :, :, 0] ** 2.0
+    focal_weight = foreground_weight + background_weight
 
-	focal_weight = foreground_weight + background_weight
+    assigned_boxes = tf.reduce_sum(y_true[:, :, :, 2])
+    class_loss = 0.01 * tf.reduce_sum(focal_weight * classification_loss) / tf.maximum(1.0, assigned_boxes)
 
-	assigned_boxes = tf.reduce_sum(y_true[:, :, :, 2])
-	class_loss = 0.01*tf.reduce_sum(focal_weight*classification_loss) / tf.maximum(1.0, assigned_boxes)
+    # assigned_boxes = tf.reduce_sum(tf.reduce_sum(y_true[:, :, :, 1], axis=-1), axis=-1)
+    # class_loss = tf.reduce_sum(tf.reduce_sum(classification_loss, axis=-1), axis=-1) / tf.maximum(1.0, assigned_boxes)
 
-	# assigned_boxes = tf.reduce_sum(tf.reduce_sum(y_true[:, :, :, 1], axis=-1), axis=-1)
-	# class_loss = tf.reduce_sum(tf.reduce_sum(classification_loss, axis=-1), axis=-1) / tf.maximum(1.0, assigned_boxes)
+    return class_loss
 
-	return class_loss
 
 def regr_h(y_true, y_pred):
+    absolute_loss = tf.abs(y_true[:, :, :, 0] - y_pred[:, :, :, 0]) / (y_true[:, :, :, 0] + 1e-10)
+    square_loss = 0.5 * ((y_true[:, :, :, 0] - y_pred[:, :, :, 0]) / (y_true[:, :, :, 0] + 1e-10)) ** 2
 
-	absolute_loss = tf.abs(y_true[:, :, :, 0] - y_pred[:, :, :, 0])/(y_true[:, :, :, 0]+1e-10)
-	square_loss = 0.5 * ((y_true[:, :, :, 0] - y_pred[:, :, :, 0])/(y_true[:, :, :, 0]+1e-10)) ** 2
+    l1_loss = y_true[:, :, :, 1] * tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5)
 
-	l1_loss = y_true[:, :, :, 1]*tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5)
+    assigned_boxes = tf.reduce_sum(y_true[:, :, :, 1])
+    class_loss = tf.reduce_sum(l1_loss) / tf.maximum(1.0, assigned_boxes)
 
-	assigned_boxes = tf.reduce_sum(y_true[:, :, :, 1])
-	class_loss = tf.reduce_sum(l1_loss) / tf.maximum(1.0, assigned_boxes)
+    return class_loss
 
-	return class_loss
 
 def regr_hw(y_true, y_pred):
-	absolute_loss = tf.abs(y_true[:, :, :, :2] - y_pred[:, :, :, :]) / (y_true[:, :, :, :2] + 1e-10)
-	square_loss = 0.5 * ((y_true[:, :, :, :2] - y_pred[:, :, :, :]) / (y_true[:, :, :, :2] + 1e-10)) ** 2
-	loss = y_true[:, :, :, 2] * tf.reduce_sum(tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5),axis=-1)
-	assigned_boxes = tf.reduce_sum(y_true[:, :, :, 2])
-	class_loss = tf.reduce_sum(loss) / tf.maximum(1.0, assigned_boxes)
+    absolute_loss = tf.abs(y_true[:, :, :, :2] - y_pred[:, :, :, :]) / (y_true[:, :, :, :2] + 1e-10)
+    square_loss = 0.5 * ((y_true[:, :, :, :2] - y_pred[:, :, :, :]) / (y_true[:, :, :, :2] + 1e-10)) ** 2
+    loss = y_true[:, :, :, 2] * tf.reduce_sum(tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5),
+                                              axis=-1)
+    assigned_boxes = tf.reduce_sum(y_true[:, :, :, 2])
+    class_loss = tf.reduce_sum(loss) / tf.maximum(1.0, assigned_boxes)
 
-	return class_loss
+    return class_loss
 
-def regr_offset(y_true, y_pred):
 
-	absolute_loss = tf.abs(y_true[:, :, :, :2] - y_pred[:, :, :, :])
-	square_loss = 0.5 * (y_true[:, :, :, :2] - y_pred[:, :, :, :]) ** 2
-	l1_loss = y_true[:, :, :, 2] * tf.reduce_sum(tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5), axis=-1)
+def regr_offset(y_true, y_pred):
+    absolute_loss = tf.abs(y_true[:, :, :, :2] - y_pred[:, :, :, :])
+    square_loss = 0.5 * (y_true[:, :, :, :2] - y_pred[:, :, :, :]) ** 2
+    l1_loss = y_true[:, :, :, 2] * tf.reduce_sum(
+        tf.where(tf.less(absolute_loss, 1.0), square_loss, absolute_loss - 0.5), axis=-1)
 
-	assigned_boxes = tf.reduce_sum(y_true[:, :, :, 2])
-	class_loss = 0.1*tf.reduce_sum(l1_loss) / tf.maximum(1.0, assigned_boxes)
+    assigned_boxes = tf.reduce_sum(y_true[:, :, :, 2])
+    class_loss = 0.1 * tf.reduce_sum(l1_loss) / tf.maximum(1.0, assigned_boxes)
 
-	return class_loss
+    return class_loss
diff --git a/keras_csp/losses.pyc b/keras_csp/losses.pyc
deleted file mode 100644
index 47003c5..0000000
Binary files a/keras_csp/losses.pyc and /dev/null differ
diff --git a/keras_csp/mobilenet.py b/keras_csp/mobilenet.py
index d5b936d..e8855f8 100644
--- a/keras_csp/mobilenet.py
+++ b/keras_csp/mobilenet.py
@@ -1,6 +1,6 @@
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
+
+
+
 from keras.layers import *
 from keras import backend as K
 import numpy as np
@@ -477,12 +477,13 @@ def nn_p2p3p4p5(img_input=None, alpha=1.0, depth_multiplier=1,  trainable=True):
 
     return [x_class, x_regr]
 
+
 # focal loss like
 def prior_probability_onecls(num_class=1, probability=0.01):
-	def f(shape, dtype=keras.backend.floatx()):
-		assert(shape[0] % num_class == 0)
-		# set bias to -log((1 - p)/p) for foregound
-		result = np.ones(shape, dtype=dtype) * -math.log((1 - probability) / probability)
-		# set bias to -log(p/(1 - p)) for background
-		return result
-	return f
+    def f(shape, dtype=keras.backend.floatx()):
+        assert(shape[0] % num_class == 0)
+        # set bias to -log((1 - p)/p) for foregound
+        result = np.ones(shape, dtype=dtype) * -math.log((1 - probability) / probability)
+        # set bias to -log(p/(1 - p)) for background
+        return result
+    return f
diff --git a/keras_csp/mobilenet.pyc b/keras_csp/mobilenet.pyc
deleted file mode 100644
index 71946eb..0000000
Binary files a/keras_csp/mobilenet.pyc and /dev/null differ
diff --git a/keras_csp/nms/__init__.pyc b/keras_csp/nms/__init__.pyc
deleted file mode 100644
index f3d0bba..0000000
Binary files a/keras_csp/nms/__init__.pyc and /dev/null differ
diff --git a/keras_csp/nms_wrapper.py b/keras_csp/nms_wrapper.py
index c34cb15..7bde7d0 100644
--- a/keras_csp/nms_wrapper.py
+++ b/keras_csp/nms_wrapper.py
@@ -4,9 +4,9 @@
 # Licensed under The MIT License [see LICENSE for details]
 # Written by Ross Girshick
 # --------------------------------------------------------
-
-from nms.gpu_nms import gpu_nms
-from nms.cpu_nms import cpu_nms
+import pyximport; pyximport.install()
+from keras_csp.nms.gpu_nms import gpu_nms
+from keras_csp.nms.cpu_nms import cpu_nms
 import numpy as np
 
 def soft_nms(dets, sigma=0.5, Nt=0.3, threshold=0.001, method=1):
diff --git a/keras_csp/nms_wrapper.pyc b/keras_csp/nms_wrapper.pyc
deleted file mode 100644
index 7dd5db0..0000000
Binary files a/keras_csp/nms_wrapper.pyc and /dev/null differ
diff --git a/keras_csp/parallel_model.py b/keras_csp/parallel_model.py
index d5d42b2..01c7af8 100644
--- a/keras_csp/parallel_model.py
+++ b/keras_csp/parallel_model.py
@@ -71,8 +71,8 @@ def make_parallel(self):
             with tf.device('/gpu:%d' % i):
                 with tf.name_scope('tower_%d' % i):
                     # Run a slice of inputs through this replica
-                    zipped_inputs = zip(self.inner_model.input_names,
-                                        self.inner_model.inputs)
+                    zipped_inputs = list(zip(self.inner_model.input_names,
+                                        self.inner_model.inputs))
                     inputs = [
                         KL.Lambda(lambda s: input_slices[name][i],
                                   output_shape=lambda s: (None,)+s[1:])(tensor)
@@ -146,8 +146,8 @@ def build_model(x_train, num_classes):
     x_train = np.expand_dims(x_train, -1).astype('float32') / 255
     x_test = np.expand_dims(x_test, -1).astype('float32') / 255
 
-    print('x_train shape:', x_train.shape)
-    print('x_test shape:', x_test.shape)
+    print(('x_train shape:', x_train.shape))
+    print(('x_test shape:', x_test.shape))
 
     # Build data generator and model
     datagen = ImageDataGenerator()
diff --git a/keras_csp/parallel_model.pyc b/keras_csp/parallel_model.pyc
deleted file mode 100644
index f2ff41b..0000000
Binary files a/keras_csp/parallel_model.pyc and /dev/null differ
diff --git a/keras_csp/resnet50.py b/keras_csp/resnet50.py
index 64591fc..037e60d 100644
--- a/keras_csp/resnet50.py
+++ b/keras_csp/resnet50.py
@@ -1,9 +1,9 @@
 # -*- coding: utf-8 -*-
 
 
-from __future__ import print_function
-from __future__ import absolute_import
-from __future__ import division
+
+
+
 from keras.layers import *
 from keras import backend as K
 from .keras_layer_L2Normalization import L2Normalization
diff --git a/keras_csp/resnet50.pyc b/keras_csp/resnet50.pyc
deleted file mode 100644
index c11bf96..0000000
Binary files a/keras_csp/resnet50.pyc and /dev/null differ
diff --git a/keras_csp/utilsfunc.py b/keras_csp/utilsfunc.py
index ce05628..127595c 100644
--- a/keras_csp/utilsfunc.py
+++ b/keras_csp/utilsfunc.py
@@ -1,71 +1,78 @@
-from __future__ import division
 import cv2
 import numpy as np
 
+
 def format_img_size(img, C):
-	""" formats the image size based on config """
-	img_min_side = float(C.im_size)
-	(height,width,_) = img.shape
-
-	if width <= height:
-		ratio = img_min_side/width
-		new_height = int(ratio * height)
-		new_width = int(img_min_side)
-	else:
-		ratio = img_min_side/height
-		new_width = int(ratio * width)
-		new_height = int(img_min_side)
-	img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
-	return img, ratio
+    """ formats the image size based on config """
+    img_min_side = float(C.im_size)
+    (height, width, _) = img.shape
+
+    if width <= height:
+        ratio = img_min_side / width
+        new_height = int(ratio * height)
+        new_width = int(img_min_side)
+    else:
+        ratio = img_min_side / height
+        new_width = int(ratio * width)
+        new_height = int(img_min_side)
+    img = cv2.resize(img, (new_width, new_height), interpolation=cv2.INTER_CUBIC)
+    return img, ratio
+
+
 def format_img_channels(img, C):
-	""" formats the image channels based on config """
-	# img = img[:, :, (2, 1, 0)]
-	img = img.astype(np.float32)
-	img[:, :, 0] -= C.img_channel_mean[0]
-	img[:, :, 1] -= C.img_channel_mean[1]
-	img[:, :, 2] -= C.img_channel_mean[2]
-	# img /= C.img_scaling_factor
-	# img = np.transpose(img, (2, 0, 1))
-	img = np.expand_dims(img, axis=0)
-	return img
+    """ formats the image channels based on config """
+    # img = img[:, :, (2, 1, 0)]
+    img = img.astype(np.float32)
+    img[:, :, 0] -= C.img_channel_mean[0]
+    img[:, :, 1] -= C.img_channel_mean[1]
+    img[:, :, 2] -= C.img_channel_mean[2]
+    # img /= C.img_scaling_factor
+    # img = np.transpose(img, (2, 0, 1))
+    img = np.expand_dims(img, axis=0)
+    return img
+
 
 def format_img_batch(img, C):
-	""" formats the image channels based on config """
-	# img = img[:, :, (2, 1, 0)]
-	img = img.astype(np.float32)
-	img[:, :, :, 0] -= C.img_channel_mean[0]
-	img[:, :, :, 1] -= C.img_channel_mean[1]
-	img[:, :, :, 2] -= C.img_channel_mean[2]
-	# img /= C.img_scaling_factor
-	# img = np.transpose(img, (2, 0, 1))
-	# img = np.expand_dims(img, axis=0)
-	return img
+    """ formats the image channels based on config """
+    # img = img[:, :, (2, 1, 0)]
+    img = img.astype(np.float32)
+    img[:, :, :, 0] -= C.img_channel_mean[0]
+    img[:, :, :, 1] -= C.img_channel_mean[1]
+    img[:, :, :, 2] -= C.img_channel_mean[2]
+    # img /= C.img_scaling_factor
+    # img = np.transpose(img, (2, 0, 1))
+    # img = np.expand_dims(img, axis=0)
+    return img
+
 
 def format_img(img, C):
-	""" formats an image for model prediction based on config """
-	# img, ratio = format_img_size(img, C)
-	img = format_img_channels(img, C)
-	return img #return img, ratio
+    """ formats an image for model prediction based on config """
+    # img, ratio = format_img_size(img, C)
+    img = format_img_channels(img, C)
+    return img  # return img, ratio
+
 
 def format_img_inria(img, C):
-	img_h, img_w = img.shape[:2]
-	# img_h_new, img_w_new = int(round(img_h/16)*16), int(round(img_w/16)*16)
-	# img = cv2.resize(img, (img_w_new, img_h_new))
-	img_h_new, img_w_new = int(np.ceil(img_h/16)*16), int(np.ceil(img_w/16)*16)
-	paved_image = np.zeros((img_h_new, img_w_new, 3), dtype=img.dtype)
-	paved_image[0:img_h,0:img_w] = img
-	img = format_img_channels(paved_image, C)
-	return img
+    img_h, img_w = img.shape[:2]
+    # img_h_new, img_w_new = int(round(img_h/16)*16), int(round(img_w/16)*16)
+    # img = cv2.resize(img, (img_w_new, img_h_new))
+    img_h_new, img_w_new = int(np.ceil(img_h / 16) * 16), int(np.ceil(img_w / 16) * 16)
+    paved_image = np.zeros((img_h_new, img_w_new, 3), dtype=img.dtype)
+    paved_image[0:img_h, 0:img_w] = img
+    img = format_img_channels(paved_image, C)
+    return img
+
 
 def format_img_ratio(img, C, ratio):
-	img = img.astype(np.float32)
-	img[:, :, 0] -= C.img_channel_mean[0]
-	img[:, :, 1] -= C.img_channel_mean[1]
-	img[:, :, 2] -= C.img_channel_mean[2]
-	img = cv2.resize(img, None, None, fx=ratio, fy=ratio)
-	# img = cv2.resize(img, None, None, fx=ratio, fy=ratio, interpolation=cv2.INTER_CUBIC)
-	img = np.expand_dims(img, axis=0)
-	return img #return img, ratio
+    img = img.astype(np.float32)
+    img[:, :, 0] -= C.img_channel_mean[0]
+    img[:, :, 1] -= C.img_channel_mean[1]
+    img[:, :, 2] -= C.img_channel_mean[2]
+    img = cv2.resize(img, None, None, fx=ratio, fy=ratio)
+    # img = cv2.resize(img, None, None, fx=ratio, fy=ratio, interpolation=cv2.INTER_CUBIC)
+    img = np.expand_dims(img, axis=0)
+    return img  # return img, ratio
+
 
 def preprocess_input_test(x):
     x = x.astype(np.float32)
@@ -75,24 +82,26 @@ def preprocess_input_test(x):
     x = np.expand_dims(x, axis=0)
     return x
 
+
 # Method to transform the coordinates of the bounding box to its original size
 def get_real_coordinates(ratio, x1, y1, x2, y2):
+    real_x1 = int(round(x1 // ratio))
+    real_y1 = int(round(y1 // ratio))
+    real_x2 = int(round(x2 // ratio))
+    real_y2 = int(round(y2 // ratio))
 
-	real_x1 = int(round(x1 // ratio))
-	real_y1 = int(round(y1 // ratio))
-	real_x2 = int(round(x2 // ratio))
-	real_y2 = int(round(y2 // ratio))
+    return (real_x1, real_y1, real_x2, real_y2)
 
-	return (real_x1, real_y1, real_x2 ,real_y2)
 
 def intersection(ai, bi, area):
-	x = max(ai[0], bi[0])
-	y = max(ai[1], bi[1])
-	w = min(ai[2], bi[2]) - x
-	h = min(ai[3], bi[3]) - y
-	if w < 0 or h < 0:
-		return 0
-	return w*h/area
+    x = max(ai[0], bi[0])
+    y = max(ai[1], bi[1])
+    w = min(ai[2], bi[2]) - x
+    h = min(ai[3], bi[3]) - y
+    if w < 0 or h < 0:
+        return 0
+    return w * h / area
+
 
 def box_grid_overlap(bboxes, get_img_output_length):
     width, height = 960, 540
@@ -109,7 +118,9 @@ def box_grid_overlap(bboxes, get_img_output_length):
     if num_bboxes > 0:
         # get the GT box coordinates, and resize to account for image resizing
         gta = np.zeros((num_bboxes, 4))
-        gta[:,0],gta[:,1],gta[:,2],gta[:,3] = bboxes[:,0],bboxes[:,1],bboxes[:,2]+bboxes[:,0],bboxes[:,3]+bboxes[:,1]
+        gta[:, 0], gta[:, 1] = bboxes[:, 0], bboxes[:, 1]
+        gta[:, 2] = bboxes[:, 2] + bboxes[:, 0]
+        gta[:, 3] = bboxes[:, 3] + bboxes[:, 1]
         # for bbox_num in range(num_bboxes):
         #     # get the GT box coordinates, and resize to account for image resizing
         #     gta[bbox_num, 0] = bboxes[bbox_num][0]
@@ -124,62 +135,66 @@ def box_grid_overlap(bboxes, get_img_output_length):
                 y2_anc = downscale * (jy + 1)
                 best_op = 0
                 for b in range(num_bboxes):
-                    grid = [x1_anc,y1_anc,x2_anc,y2_anc]
-                    op = intersection(grid,gta[b,:],downscale**2)
-                    best_op = op if op>best_op else best_op
-                y_grid_overlap[jy,ix] = best_op
+                    grid = [x1_anc, y1_anc, x2_anc, y2_anc]
+                    op = intersection(grid, gta[b, :], downscale ** 2)
+                    best_op = op if op > best_op else best_op
+                y_grid_overlap[jy, ix] = best_op
+
+    y_grid_overlap = np.expand_dims(y_grid_overlap.reshape((1, -1)), axis=0)
+    return y_grid_overlap
 
-    y_grid_overlap = np.expand_dims(y_grid_overlap.reshape((1,-1)), axis=0)
-    return  y_grid_overlap
 
 def integrate_motion_score(bboxes, probs, pred, stride=16):
-	if len(bboxes) == 0:
-		return []
-	probs = probs.reshape((-1,1))
-	pred_anchor_score = np.zeros((probs.shape[0], 1))
-	for i in range(len(bboxes)):
-		x1, y1, x2, y2 = int(bboxes[i][0]/stride), int(bboxes[i][1]/stride), int(bboxes[i][2]/stride), int(bboxes[i][3]/stride)
-		pred_anchor_score[i,0] = np.sum(pred[y1:y2, x1:x2])/((x2-x1)*(y2-y1))
-	# alpha, belta = 2/0.7, 0.1
-	# pred_anchor_score = np.where(pred_anchor_score>0.7, pred_anchor_score*alpha, pred_anchor_score)
-	# pred_anchor_score = np.maximum(pred_anchor_score*alpha, np.ones_like(pred_anchor_score)*belta)
-	# all_probs = pred_anchor_score
-	all_probs = probs*pred_anchor_score
-	return all_probs
+    if len(bboxes) == 0:
+        return []
+    probs = probs.reshape((-1, 1))
+    pred_anchor_score = np.zeros((probs.shape[0], 1))
+    for i in range(len(bboxes)):
+        x1, y1, x2, y2 = int(bboxes[i][0] / stride), int(bboxes[i][1] / stride), int(bboxes[i][2] / stride), int(
+            bboxes[i][3] / stride)
+        pred_anchor_score[i, 0] = np.sum(pred[y1:y2, x1:x2]) / ((x2 - x1) * (y2 - y1))
+    # alpha, belta = 2/0.7, 0.1
+    # pred_anchor_score = np.where(pred_anchor_score>0.7, pred_anchor_score*alpha, pred_anchor_score)
+    # pred_anchor_score = np.maximum(pred_anchor_score*alpha, np.ones_like(pred_anchor_score)*belta)
+    # all_probs = pred_anchor_score
+    all_probs = probs * pred_anchor_score
+    return all_probs
+
 
 def box_encoder_pp(anchors, boxes, Y1):
-	A = np.copy(anchors[:, :, :, :4])
-	A = A.reshape((-1, 4))
-
-	# 1 calculate the iou scores
-	max_overlaps = np.zeros((anchors.shape[0] * anchors.shape[1] * anchors.shape[2],), dtype=np.float32)
-	if len(boxes) > 0:
-		boxes[:, 2] += boxes[:, 0]
-		boxes[:, 3] += boxes[:, 1]
-		overlaps = bbox_overlaps(np.ascontiguousarray(A, dtype=np.float64),
-								 np.ascontiguousarray(boxes, dtype=np.float64))
-		max_overlaps = overlaps.max(axis=1)
-	# normalize the iou scores
-	if np.max(max_overlaps) > 0:
-		max_overlaps = (max_overlaps - np.min(max_overlaps)) / np.max(max_overlaps)
-	# 2 calculate the rpn scores
-	rpn_score = Y1.reshape((-1)).astype(np.float32)
-	inds = np.where(max_overlaps == 0)
-	rpn_score[inds] = np.min(rpn_score)
-	scores = (rpn_score + max_overlaps) / 2
-	scores = np.expand_dims(scores.reshape((1,-1)).astype(np.float32), axis=0)
-	return scores
+    A = np.copy(anchors[:, :, :, :4])
+    A = A.reshape((-1, 4))
+
+    # 1 calculate the iou scores
+    max_overlaps = np.zeros((anchors.shape[0] * anchors.shape[1] * anchors.shape[2],), dtype=np.float32)
+    if len(boxes) > 0:
+        boxes[:, 2] += boxes[:, 0]
+        boxes[:, 3] += boxes[:, 1]
+        overlaps = bbox_overlaps(np.ascontiguousarray(A, dtype=np.float64),
+                                 np.ascontiguousarray(boxes, dtype=np.float64))
+        max_overlaps = overlaps.max(axis=1)
+    # normalize the iou scores
+    if np.max(max_overlaps) > 0:
+        max_overlaps = (max_overlaps - np.min(max_overlaps)) / np.max(max_overlaps)
+    # 2 calculate the rpn scores
+    rpn_score = Y1.reshape((-1)).astype(np.float32)
+    inds = np.where(max_overlaps == 0)
+    rpn_score[inds] = np.min(rpn_score)
+    scores = (rpn_score + max_overlaps) / 2
+    scores = np.expand_dims(scores.reshape((1, -1)).astype(np.float32), axis=0)
+    return scores
+
 
 def box_encoder_iou(anchors, boxes):
-	A = np.copy(anchors[:, :, :, :4])
-	A = A.reshape((-1, 4))
-
-	max_overlaps = np.zeros((anchors.shape[0] * anchors.shape[1] * anchors.shape[2],), dtype=np.float32)
-	if len(boxes) > 0:
-		boxes[:, 2] += boxes[:, 0]
-		boxes[:, 3] += boxes[:, 1]
-		overlaps = bbox_overlaps(np.ascontiguousarray(A, dtype=np.float64),
-								 np.ascontiguousarray(boxes, dtype=np.float64))
-		max_overlaps = overlaps.max(axis=1)
-	scores = np.expand_dims(max_overlaps.reshape((1,-1)).astype(np.float32), axis=0)
-	return scores
+    A = np.copy(anchors[:, :, :, :4])
+    A = A.reshape((-1, 4))
+
+    max_overlaps = np.zeros((anchors.shape[0] * anchors.shape[1] * anchors.shape[2],), dtype=np.float32)
+    if len(boxes) > 0:
+        boxes[:, 2] += boxes[:, 0]
+        boxes[:, 3] += boxes[:, 1]
+        overlaps = bbox_overlaps(np.ascontiguousarray(A, dtype=np.float64),
+                                 np.ascontiguousarray(boxes, dtype=np.float64))
+        max_overlaps = overlaps.max(axis=1)
+    scores = np.expand_dims(max_overlaps.reshape((1, -1)).astype(np.float32), axis=0)
+    return scores
diff --git a/keras_csp/utilsfunc.pyc b/keras_csp/utilsfunc.pyc
deleted file mode 100644
index 01c074a..0000000
Binary files a/keras_csp/utilsfunc.pyc and /dev/null differ
diff --git a/requirements.txt b/requirements.txt
index 002965c..06f8421 100755
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,7 +1,7 @@
-tensorflow-gpu==1.4.1
+tensorflow-gpu==1.14.0
 easydict==1.6
 joblib==0.10.3
-numpy==1.12.0
-opencv-python==3.4.1.15
-Pillow==4.0.0
+numpy==1.16.2
+opencv-python==4.1.0.25
+Pillow==6.1.0
 keras==2.0.6
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..fdafaf2
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,130 @@
+import os
+from Cython.Distutils import build_ext
+from distutils.core import setup, Extension
+from Cython.Build import cythonize
+import numpy as np
+
+try:
+    numpy_include = np.get_include()
+except AttributeError:
+    numpy_include = np.get_numpy_include()
+
+
+def customize_compiler_for_nvcc(self):
+    """Patch a distutils compiler instance so it dispatches ``.cu`` to nvcc.
+
+    Subclassing UnixCCompiler and getting the subclass injected while still
+    having distutils.sysconfig.customize_compiler run on it is not trivial,
+    so the instance is patched in place instead: ``.cu`` is registered as a
+    source extension and ``_compile`` is wrapped to pick the compiler and the
+    per-compiler arguments for each source file."""
+
+    # tell the compiler it can process .cu
+    self.src_extensions.append('.cu')
+
+    # save references to the default compiler_so and _compile method
+    default_compiler_so = self.compiler_so
+    default_compile = self._compile
+
+    # now redefine the _compile method. This gets executed for each
+    # object but distutils doesn't have the ability to change compilers
+    # based on source extension: we add it.
+    def _compile(obj, src, ext, cc_args, extra_postargs, pp_opts):
+        if os.path.splitext(src)[1] == '.cu':
+            # use nvcc for .cu files, with only the 'nvcc' subset of the
+            # extra_compile_args dict given to the Extension class
+            self.set_executable('compiler_so', CUDA['nvcc'])
+            postargs = extra_postargs['nvcc']
+        else:
+            postargs = extra_postargs['gcc']
+        try:
+            default_compile(obj, src, ext, cc_args, postargs, pp_opts)
+        finally:
+            # reset compiler_so (possibly switched to nvcc) even on failure
+            self.compiler_so = default_compiler_so
+
+    # inject our redefined _compile method into the compiler instance
+    self._compile = _compile
+
+
+class custom_build_ext(build_ext):
+    """build_ext that runs customize_compiler_for_nvcc on its compiler first."""
+    def build_extensions(self):
+        customize_compiler_for_nvcc(self.compiler)
+        build_ext.build_extensions(self)
+
+
+def find_in_path(name, path):
+    """Return the absolute path of *name* in search path *path*, or None."""
+    # Adapted from
+    # http://code.activestate.com/recipes/52224-find-a-file-given-a-search-path/
+    candidates = (os.path.join(entry, name) for entry in path.split(os.pathsep))
+    found = next((c for c in candidates if os.path.exists(c)), None)
+    if found is None:
+        return None
+    return os.path.abspath(found)
+
+
+def locate_cuda():
+    """Locate the CUDA environment on the system.
+
+    Returns a dict with keys 'home', 'nvcc', 'include', and 'lib64' mapping
+    to the corresponding paths. $CUDAHOME is used when set; otherwise 'nvcc'
+    is searched on $PATH and then in /usr/local/cuda/bin. Raises
+    EnvironmentError when nvcc or any of the derived paths does not exist.
+    """
+    if 'CUDAHOME' in os.environ:
+        home = os.environ['CUDAHOME']
+        nvcc = os.path.join(home, 'bin', 'nvcc')
+    else:
+        # search PATH for nvcc; os.defpath covers an unset PATH (no KeyError)
+        default_path = os.path.join(os.sep, 'usr', 'local', 'cuda', 'bin')
+        search_path = os.environ.get('PATH', os.defpath) + os.pathsep + default_path
+        nvcc = find_in_path('nvcc', search_path)
+        if nvcc is None:
+            raise EnvironmentError('The nvcc binary could not be '
+                                   'located in your $PATH. Either add it to your path, or set $CUDAHOME')
+        home = os.path.dirname(os.path.dirname(nvcc))
+
+    cudaconfig = {'home': home, 'nvcc': nvcc,
+                  'include': os.path.join(home, 'include'),
+                  'lib64': os.path.join(home, 'lib64')}
+    for k, v in cudaconfig.items():
+        if not os.path.exists(v):
+            raise EnvironmentError('The CUDA %s path could not be located in %s' % (k, v))
+
+    return cudaconfig
+
+
+CUDA = locate_cuda()
+
+ext_modules = [
+    Extension(
+        "keras_csp.nms.cpu_nms",
+        ["keras_csp/nms/cpu_nms.pyx"],
+        extra_compile_args={'gcc': ["-Wno-cpp", "-Wno-unused-function"]},
+        include_dirs=[numpy_include]
+    ),
+    Extension('keras_csp.nms.gpu_nms',
+              ['keras_csp/nms/nms_kernel.cu', 'keras_csp/nms/gpu_nms.pyx'],
+              library_dirs=[CUDA['lib64']],
+              libraries=['cudart'],
+              language='c++',
+              runtime_library_dirs=[CUDA['lib64']],
+              # this dict syntax is specific to this build system:
+              # some compiler args are passed only to nvcc and the others only to gcc;
+              # the trick is implemented in customize_compiler_for_nvcc() above
+              extra_compile_args={'gcc': ["-Wno-unused-function"],
+                                  'nvcc': ['-arch=sm_35',
+                                           '--ptxas-options=-v',
+                                           '-c',
+                                           '--compiler-options',
+                                           "'-fPIC'"]},
+              include_dirs=[numpy_include, CUDA['include']]
+              )
+]
+
+setup(
+    ext_modules=ext_modules,
+    cmdclass={'build_ext': custom_build_ext},
+)
diff --git a/test_caltech.py b/test_caltech.py
index 8ece107..7cf8926 100644
--- a/test_caltech.py
+++ b/test_caltech.py
@@ -1,7 +1,6 @@
-from __future__ import division
 import os
 import time
-import cPickle
+import pickle
 from keras.layers import Input
 from keras.models import Model
 from keras_csp import config, bbox_process
@@ -13,9 +12,9 @@
 C.offset = True
 cache_path = 'data/cache/caltech/test'
 with open(cache_path, 'rb') as fid:
-	val_data = cPickle.load(fid)
+    val_data = pickle.load(fid, encoding='latin1')
 num_imgs = len(val_data)
-print 'num of val samples: {}'.format(num_imgs)
+print('num of val samples: {}'.format(num_imgs))
 
 C.size_test = (480, 640)
 input_shape_img = (C.size_test[0], C.size_test[1], 3)
@@ -34,53 +33,53 @@
     out_path = 'output/valresults/caltech/%s/nooff' % (C.scale)
 
 if not os.path.exists(out_path):
-	os.makedirs(out_path)
+    os.makedirs(out_path)
 files = sorted(os.listdir(w_path))
 for w_ind in range(51, 121):
-	for f in files:
-		if f.split('_')[0] == 'net' and int(f.split('_')[1][1:]) == w_ind:
-			cur_file = f
-			break
-	weight1 = os.path.join(w_path, cur_file)
-	print 'load weights from {}'.format(weight1)
-	model.load_weights(weight1, by_name=True)
-	res_path = os.path.join(out_path, '%03d'%int(str(w_ind)))
+    for f in files:
+        if f.split('_')[0] == 'net' and int(f.split('_')[1][1:]) == w_ind:
+            cur_file = f
+            break
+    weight1 = os.path.join(w_path, cur_file)
+    print('load weights from {}'.format(weight1))
+    model.load_weights(weight1, by_name=True)
+    res_path = os.path.join(out_path, '%03d' % int(str(w_ind)))
 
-	print res_path
-	if not os.path.exists(res_path):
-		os.mkdir(res_path)
-	for st in range(6, 11):
-		set_path = os.path.join(res_path, 'set' + '%02d' % st)
-		if not os.path.exists(set_path):
-			os.mkdir(set_path)
+    print(res_path)
+    if not os.path.exists(res_path):
+        os.mkdir(res_path)
+    for st in range(6, 11):
+        set_path = os.path.join(res_path, 'set' + '%02d' % st)
+        if not os.path.exists(set_path):
+            os.mkdir(set_path)
 
-	start_time = time.time()
-	for f in range(num_imgs):
-		filepath = val_data[f]['filepath']
-		filepath_next = val_data[f + 1]['filepath'] if f < num_imgs - 1 else val_data[f]['filepath']
-		set = filepath.split('/')[-1].split('_')[0]
-		video = filepath.split('/')[-1].split('_')[1]
-		frame_number = int(filepath.split('/')[-1].split('_')[2][1:6]) + 1
-		frame_number_next = int(filepath_next.split('/')[-1].split('_')[2][1:6]) + 1
-		set_path = os.path.join(res_path, set)
-		video_path = os.path.join(set_path, video + '.txt')
-		if os.path.exists(video_path):
-			continue
-		if frame_number == 30:
-			res_all = []
-		img = cv2.imread(filepath)
-		x_rcnn = format_img(img, C)
-		Y = model.predict(x_rcnn)
+    start_time = time.time()
+    for f in range(num_imgs):
+        filepath = val_data[f]['filepath']
+        filepath_next = val_data[f + 1]['filepath'] if f < num_imgs - 1 else val_data[f]['filepath']
+        set = filepath.split('/')[-1].split('_')[0]
+        video = filepath.split('/')[-1].split('_')[1]
+        frame_number = int(filepath.split('/')[-1].split('_')[2][1:6]) + 1
+        frame_number_next = int(filepath_next.split('/')[-1].split('_')[2][1:6]) + 1
+        set_path = os.path.join(res_path, set)
+        video_path = os.path.join(set_path, video + '.txt')
+        if os.path.exists(video_path):
+            continue
+        if frame_number == 30:
+            res_all = []
+        img = cv2.imread(filepath)
+        x_rcnn = format_img(img, C)
+        Y = model.predict(x_rcnn)
 
-		if C.offset:
-			boxes = bbox_process.parse_det_offset(Y, C, score=0.01,down=4)
-		else:
-			boxes = bbox_process.parse_det(Y, C, score=0.01, down=4, scale=C.scale)
+        if C.offset:
+            boxes = bbox_process.parse_det_offset(Y, C, score=0.01, down=4)
+        else:
+            boxes = bbox_process.parse_det(Y, C, score=0.01, down=4, scale=C.scale)
 
-		if len(boxes)>0:
-			f_res = np.repeat(frame_number, len(boxes), axis=0).reshape((-1, 1))
-			boxes[:, [2, 3]] -= boxes[:, [0, 1]]
-			res_all += np.concatenate((f_res, boxes), axis=-1).tolist()
-		if frame_number_next == 30 or f == num_imgs - 1:
-			np.savetxt(video_path, np.array(res_all), fmt='%6f')
-	print time.time() - start_time
+        if len(boxes) > 0:
+            f_res = np.repeat(frame_number, len(boxes), axis=0).reshape((-1, 1))
+            boxes[:, [2, 3]] -= boxes[:, [0, 1]]
+            res_all += np.concatenate((f_res, boxes), axis=-1).tolist()
+        if frame_number_next == 30 or f == num_imgs - 1:
+            np.savetxt(video_path, np.array(res_all), fmt='%6f')
+    print(time.time() - start_time)
diff --git a/test_city.py b/test_city.py
index 644659d..4f787a9 100644
--- a/test_city.py
+++ b/test_city.py
@@ -1,7 +1,6 @@
-from __future__ import division
 import os
 import time
-import cPickle
+import pickle
 from keras.layers import Input
 from keras.models import Model
 from keras_csp import config, bbox_process
@@ -12,18 +11,18 @@
 C.offset = True
 cache_path = 'data/cache/cityperson/val_500'
 with open(cache_path, 'rb') as fid:
-	val_data = cPickle.load(fid)
+    val_data = pickle.load(fid, encoding='latin1')
 num_imgs = len(val_data)
-print 'num of val samples: {}'.format(num_imgs)
+print('num of val samples: {}'.format(num_imgs))
 
 C.size_test = (1024, 2048)
 input_shape_img = (C.size_test[0], C.size_test[1], 3)
 img_input = Input(shape=input_shape_img)
 
 # define the base network (resnet here, can be MobileNet, etc)
-if C.network=='resnet50':
+if C.network == 'resnet50':
     from keras_csp import resnet50 as nn
-elif C.network=='mobilenet':
+elif C.network == 'mobilenet':
     from keras_csp import mobilenet as nn
 else:
     raise NotImplementedError('Not support network: {}'.format(C.network))
@@ -39,37 +38,39 @@
     w_path = 'output/valmodels/city/%s/nooff' % (C.scale)
     out_path = 'output/valresults/city/%s/nooff' % (C.scale)
 if not os.path.exists(out_path):
-	os.makedirs(out_path)
+    os.makedirs(out_path)
 files = sorted(os.listdir(w_path))
 # get the results from epoch 51 to epoch 150
-for w_ind in range(51,151):
-	for f in files:
-		if f.split('_')[0] == 'net' and int(f.split('_')[1][1:]) == w_ind:
-			cur_file = f
-			break
-	weight1 = os.path.join(w_path, cur_file)
-	print 'load weights from {}'.format(weight1)
-	model.load_weights(weight1, by_name=True)
-	res_path = os.path.join(out_path, '%03d'%int(str(w_ind)))
-	if not os.path.exists(res_path):
-		os.makedirs(res_path)
-	print res_path
-	res_file = os.path.join(res_path, 'val_det.txt')
-	res_all = []
-	start_time = time.time()
-	for f in range(num_imgs):
-		filepath = val_data[f]['filepath']
-		img = cv2.imread(filepath)
-		x_rcnn = format_img(img, C)
-		Y = model.predict(x_rcnn)
+for w_ind in range(150, 151):
+    for f in files:
+        if f.split('_')[0] == 'net' and int(f.split('_')[1][1:]) == w_ind:
+            cur_file = f
+            break
+    weight1 = os.path.join(w_path, cur_file)
+    print('load weights from {}'.format(weight1))
+    model.load_weights(weight1, by_name=True)
+    res_path = os.path.join(out_path, '%03d' % int(str(w_ind)))
+    if not os.path.exists(res_path):
+        os.makedirs(res_path)
+    print(res_path)
+    res_file = os.path.join(res_path, 'val_det.txt')
+    res_all = []
+    start_time = time.time()
+    for f in range(num_imgs):
+        filepath = val_data[f]['filepath']
+        img = cv2.imread(filepath)
+        if img is None:
+            raise RuntimeError("image at %s not found" % filepath)
+        x_rcnn = format_img(img, C)
+        Y = model.predict(x_rcnn)
 
-		if C.offset:
-			boxes = bbox_process.parse_det_offset(Y, C, score=0.1,down=4)
-		else:
-			boxes = bbox_process.parse_det(Y, C, score=0.1, down=4, scale=C.scale)
-		if len(boxes)>0:
-			f_res = np.repeat(f+1, len(boxes), axis=0).reshape((-1, 1))
-			boxes[:, [2, 3]] -= boxes[:, [0, 1]]
-			res_all += np.concatenate((f_res, boxes), axis=-1).tolist()
-	np.savetxt(res_file, np.array(res_all), fmt='%6f')
-	print time.time() - start_time
+        if C.offset:
+            boxes = bbox_process.parse_det_offset(Y, C, score=0.1, down=4)
+        else:
+            boxes = bbox_process.parse_det(Y, C, score=0.1, down=4, scale=C.scale)
+        if len(boxes) > 0:
+            f_res = np.repeat(f + 1, len(boxes), axis=0).reshape((-1, 1))
+            boxes[:, [2, 3]] -= boxes[:, [0, 1]]
+            res_all += np.concatenate((f_res, boxes), axis=-1).tolist()
+    np.savetxt(res_file, np.array(res_all), fmt='%6f')
+    print(time.time() - start_time)
diff --git a/test_wider_ms.py b/test_wider_ms.py
index 4e02316..b8b2169 100644
--- a/test_wider_ms.py
+++ b/test_wider_ms.py
@@ -1,7 +1,6 @@
-from __future__ import division
 import os
 import time
-import cPickle
+import pickle
 from keras.layers import Input
 from keras.models import Model
 from keras_csp import config, bbox_process
@@ -14,9 +13,9 @@
 C.num_scale = 2
 cache_path = 'data/cache/widerface/val'
 with open(cache_path, 'rb') as fid:
-	val_data = cPickle.load(fid)
+    val_data = pickle.load(fid, encoding='latin1')
 num_imgs = len(val_data)
-print 'num of val samples: {}'.format(num_imgs)
+print('num of val samples: {}'.format(num_imgs))
 
 C.size_test = [0, 0]
 input_shape_img = (None, None, 3)
@@ -24,6 +23,7 @@
 
 # define the base network (resnet here, can be MobileNet, etc)
 from keras_csp import resnet50 as nn
+
 # define the network prediction
 preds = nn.nn_p3p4p5(img_input, offset=C.offset, num_scale=C.num_scale, trainable=True)
 model = Model(img_input, preds)
@@ -35,129 +35,139 @@
     w_path = 'output/valmodels/wider/%s/nooff' % (C.scale)
     out_path = 'output/valresults/wider/%s/nooff' % (C.scale)
 if not os.path.exists(out_path):
-	os.makedirs(out_path)
+    os.makedirs(out_path)
 files = sorted(os.listdir(w_path))
 # get the results from epoch 51 to epoch 150
-for w_ind in range(382,383):
-	for f in files:
-		if f.split('_')[0] == 'net' and int(f.split('_')[1][1:]) == w_ind:
-			cur_file = f
-			break
-	weight1 = os.path.join(w_path, cur_file)
-	print 'load weights from {}'.format(weight1)
-	model.load_weights(weight1, by_name=True)
-	res_path = os.path.join(out_path, '%03d'%int(str(w_ind)))
-	if not os.path.exists(res_path):
-		os.makedirs(res_path)
-	print res_path
-
-	start_time = time.time()
-	for f in range(num_imgs):
-		filepath = val_data[f]['filepath']
-		event = filepath.split('/')[-2]
-		event_path = os.path.join(res_path, event)
-		if not os.path.exists(event_path):
-			os.mkdir(event_path)
-		filename = filepath.split('/')[-1].split('.')[0]
-		txtpath = os.path.join(event_path, filename + '.txt')
-		if os.path.exists(txtpath):
-			continue
-
-		img = cv2.imread(filepath)
-
-		def detect_face(img, scale=1, flip=False):
-			img_h, img_w = img.shape[:2]
-			img_h_new, img_w_new = int(np.ceil(scale * img_h / 16) * 16), int(np.ceil(scale * img_w / 16) * 16)
-			scale_h, scale_w = img_h_new / img_h, img_w_new / img_w
-
-			img_s = cv2.resize(img, None, None, fx=scale_w, fy=scale_h, interpolation=cv2.INTER_LINEAR)
-			# img_h, img_w = img_s.shape[:2]
-			# print frame_number
-			C.size_test[0] = img_h_new
-			C.size_test[1] = img_w_new
-
-			if flip:
-				img_sf = cv2.flip(img_s, 1)
-				# x_rcnn = format_img_pad(img_sf, C)
-				x_rcnn = format_img(img_sf, C)
-			else:
-				# x_rcnn = format_img_pad(img_s, C)
-				x_rcnn = format_img(img_s, C)
-			Y = model.predict(x_rcnn)
-			boxes = bbox_process.parse_wider_offset(Y, C, score=0.05, nmsthre=0.6)
-			if len(boxes) > 0:
-				keep_index = np.where(np.minimum(boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]) >= 12)[0]
-				boxes = boxes[keep_index, :]
-			if len(boxes) > 0:
-				if flip:
-					boxes[:, [0, 2]] = img_s.shape[1] - boxes[:, [2, 0]]
-				boxes[:, 0:4:2] = boxes[:, 0:4:2] / scale_w
-				boxes[:, 1:4:2] = boxes[:, 1:4:2] / scale_h
-			else:
-				boxes = np.empty(shape=[0, 5], dtype=np.float32)
-			return boxes
-
-		def im_det_ms_pyramid(image, max_im_shrink):
-			# shrink detecting and shrink only detect big face
-			det_s = np.row_stack((detect_face(image, 0.5), detect_face(image, 0.5, flip=True)))
-			index = np.where(np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1) > 64)[0]
-			det_s = det_s[index, :]
-
-			det_temp = np.row_stack((detect_face(image, 0.75), detect_face(image, 0.75, flip=True)))
-			index = np.where(np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) > 32)[0]
-			det_temp = det_temp[index, :]
-			det_s = np.row_stack((det_s, det_temp))
-
-			det_temp = np.row_stack((detect_face(image, 0.25), detect_face(image, 0.25, flip=True)))
-			index = np.where(np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) > 96)[0]
-			det_temp = det_temp[index, :]
-			det_s = np.row_stack((det_s, det_temp))
-
-			st = [1.25, 1.5, 1.75, 2.0, 2.25]
-			for i in range(len(st)):
-				if (st[i] <= max_im_shrink):
-					det_temp = np.row_stack((detect_face(image, st[i]), detect_face(image, st[i], flip=True)))
-					# Enlarged images are only used to detect small faces.
-					if st[i] == 1.25:
-						index = np.where(
-							np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 128)[0]
-						det_temp = det_temp[index, :]
-					elif st[i] == 1.5:
-						index = np.where(
-							np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 96)[0]
-						det_temp = det_temp[index, :]
-					elif st[i] == 1.75:
-						index = np.where(
-							np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 64)[0]
-						det_temp = det_temp[index, :]
-					elif st[i] == 2.0:
-						index = np.where(
-							np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 48)[0]
-						det_temp = det_temp[index, :]
-					elif st[i] == 2.25:
-						index = np.where(
-							np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 32)[0]
-						det_temp = det_temp[index, :]
-					det_s = np.row_stack((det_s, det_temp))
-			return det_s
-
-		max_im_shrink = (0x7fffffff / 577.0 / (img.shape[0] * img.shape[1])) ** 0.5  # the max size of input image
-		shrink = max_im_shrink if max_im_shrink < 1 else 1
-		det0 = detect_face(img)
-		det1 = detect_face(img, flip=True)
-		det2 = im_det_ms_pyramid(img, max_im_shrink)
-		# merge all test results via bounding box voting
-		det = np.row_stack((det0, det1, det2))
-		keep_index = np.where(np.minimum(det[:, 2] - det[:, 0], det[:, 3] - det[:, 1]) >= 3)[0]
-		det = det[keep_index, :]
-		dets = bbox_process.soft_bbox_vote(det, thre=0.4)
-		keep_index = np.where((dets[:, 2] - dets[:, 0] + 1) * (dets[:, 3] - dets[:, 1] + 1) >= 6 ** 2)[0]
-		dets = dets[keep_index, :]
-
-		with open(txtpath, 'w') as f:
-			f.write('{:s}\n'.format(filename))
-			f.write('{:d}\n'.format(len(dets)))
-			for line in dets:
-				f.write('{:.0f} {:.0f} {:.0f} {:.0f} {:.3f}\n'.
-						format(line[0], line[1], line[2] - line[0] + 1, line[3] - line[1] + 1, line[4]))
-	print time.time() - start_time
\ No newline at end of file
+for w_ind in range(382, 383):
+    for f in files:
+        if f.split('_')[0] == 'net' and int(f.split('_')[1][1:]) == w_ind:
+            cur_file = f
+            break
+    weight1 = os.path.join(w_path, cur_file)
+    print('load weights from {}'.format(weight1))
+    model.load_weights(weight1, by_name=True)
+    res_path = os.path.join(out_path, '%03d' % int(str(w_ind)))
+    if not os.path.exists(res_path):
+        os.makedirs(res_path)
+    print(res_path)
+
+    start_time = time.time()
+    for f in range(num_imgs):
+        filepath = val_data[f]['filepath']
+        event = filepath.split('/')[-2]
+        event_path = os.path.join(res_path, event)
+        if not os.path.exists(event_path):
+            os.mkdir(event_path)
+        filename = filepath.split('/')[-1].split('.')[0]
+        txtpath = os.path.join(event_path, filename + '.txt')
+        if os.path.exists(txtpath):
+            continue
+
+        img = cv2.imread(filepath)
+        if img is None:  # cv2.imread returns None (no exception) when the file cannot be read
+            raise RuntimeError("image at %s not found" % filepath)
+        def detect_face(img, scale=1, flip=False):
+            img_h, img_w = img.shape[:2]
+            img_h_new, img_w_new = int(np.ceil(scale * img_h / 16) * 16), int(np.ceil(scale * img_w / 16) * 16)
+            scale_h, scale_w = img_h_new / img_h, img_w_new / img_w
+
+            img_s = cv2.resize(img, None, None, fx=scale_w, fy=scale_h, interpolation=cv2.INTER_LINEAR)
+            # img_h, img_w = img_s.shape[:2]
+            # print frame_number
+            C.size_test[0] = img_h_new
+            C.size_test[1] = img_w_new
+
+            if flip:
+                img_sf = cv2.flip(img_s, 1)
+                # x_rcnn = format_img_pad(img_sf, C)
+                x_rcnn = format_img(img_sf, C)
+            else:
+                # x_rcnn = format_img_pad(img_s, C)
+                x_rcnn = format_img(img_s, C)
+            Y = model.predict(x_rcnn)
+            boxes = bbox_process.parse_wider_offset(Y, C, score=0.05, nmsthre=0.6)
+            if len(boxes) > 0:
+                keep_index = np.where(np.minimum(boxes[:, 2] - boxes[:, 0], boxes[:, 3] - boxes[:, 1]) >= 12)[0]
+                boxes = boxes[keep_index, :]
+            if len(boxes) > 0:
+                if flip:
+                    boxes[:, [0, 2]] = img_s.shape[1] - boxes[:, [2, 0]]
+                boxes[:, 0:4:2] = boxes[:, 0:4:2] / scale_w
+                boxes[:, 1:4:2] = boxes[:, 1:4:2] / scale_h
+            else:
+                boxes = np.empty(shape=[0, 5], dtype=np.float32)
+            return boxes
+
+
+        def im_det_ms_pyramid(image, max_im_shrink):
+            # shrink detecting and shrink only detect big face
+            det_s = np.row_stack((detect_face(image, 0.5), detect_face(image, 0.5, flip=True)))
+            index = np.where(np.maximum(det_s[:, 2] - det_s[:, 0] + 1, det_s[:, 3] - det_s[:, 1] + 1) > 64)[0]
+            det_s = det_s[index, :]
+
+            det_temp = np.row_stack((detect_face(image, 0.75), detect_face(image, 0.75, flip=True)))
+            index = np.where(np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) > 32)[
+                0]
+            det_temp = det_temp[index, :]
+            det_s = np.row_stack((det_s, det_temp))
+
+            det_temp = np.row_stack((detect_face(image, 0.25), detect_face(image, 0.25, flip=True)))
+            index = np.where(np.maximum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) > 96)[
+                0]
+            det_temp = det_temp[index, :]
+            det_s = np.row_stack((det_s, det_temp))
+
+            st = [1.25, 1.5, 1.75, 2.0, 2.25]
+            for i in range(len(st)):
+                if (st[i] <= max_im_shrink):
+                    det_temp = np.row_stack((detect_face(image, st[i]), detect_face(image, st[i], flip=True)))
+                    # Enlarged images are only used to detect small faces.
+                    if st[i] == 1.25:
+                        index = np.where(
+                            np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 128)[
+                            0]
+                        det_temp = det_temp[index, :]
+                    elif st[i] == 1.5:
+                        index = np.where(
+                            np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 96)[
+                            0]
+                        det_temp = det_temp[index, :]
+                    elif st[i] == 1.75:
+                        index = np.where(
+                            np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 64)[
+                            0]
+                        det_temp = det_temp[index, :]
+                    elif st[i] == 2.0:
+                        index = np.where(
+                            np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 48)[
+                            0]
+                        det_temp = det_temp[index, :]
+                    elif st[i] == 2.25:
+                        index = np.where(
+                            np.minimum(det_temp[:, 2] - det_temp[:, 0] + 1, det_temp[:, 3] - det_temp[:, 1] + 1) < 32)[
+                            0]
+                        det_temp = det_temp[index, :]
+                    det_s = np.row_stack((det_s, det_temp))
+            return det_s
+
+
+        max_im_shrink = (0x7fffffff / 577.0 / (img.shape[0] * img.shape[1])) ** 0.5  # the max size of input image
+        shrink = max_im_shrink if max_im_shrink < 1 else 1
+        det0 = detect_face(img)
+        det1 = detect_face(img, flip=True)
+        det2 = im_det_ms_pyramid(img, max_im_shrink)
+        # merge all test results via bounding box voting
+        det = np.row_stack((det0, det1, det2))
+        keep_index = np.where(np.minimum(det[:, 2] - det[:, 0], det[:, 3] - det[:, 1]) >= 3)[0]
+        det = det[keep_index, :]
+        dets = bbox_process.soft_bbox_vote(det, thre=0.4)
+        keep_index = np.where((dets[:, 2] - dets[:, 0] + 1) * (dets[:, 3] - dets[:, 1] + 1) >= 6 ** 2)[0]
+        dets = dets[keep_index, :]
+
+        with open(txtpath, 'w') as f:
+            f.write('{:s}\n'.format(filename))
+            f.write('{:d}\n'.format(len(dets)))
+            for line in dets:
+                f.write('{:.0f} {:.0f} {:.0f} {:.0f} {:.3f}\n'.
+                        format(line[0], line[1], line[2] - line[0] + 1, line[3] - line[1] + 1, line[4]))
+    print(time.time() - start_time)
diff --git a/train_caltech.py b/train_caltech.py
index 7359d8a..ec6b0f1 100644
--- a/train_caltech.py
+++ b/train_caltech.py
@@ -1,9 +1,8 @@
-from __future__ import division
 import random
 import sys, os
 import time
 import numpy as np
-import cPickle
+import pickle
 from keras.utils import generic_utils
 from keras.optimizers import Adam
 from keras.layers import Input
@@ -28,20 +27,22 @@
 cache_ped = 'data/cache/caltech/train_gt'
 cache_emp = 'data/cache/caltech/train_nogt'
 with open(cache_ped, 'rb') as fid:
-    ped_data = cPickle.load(fid)
+    ped_data = pickle.load(fid, encoding='latin1')
 with open(cache_emp, 'rb') as fid:
-    emp_data = cPickle.load(fid)
+    emp_data = pickle.load(fid, encoding='latin1')
 num_imgs_ped = len(ped_data)
 num_imgs_emp = len(emp_data)
-print ('num of ped and emp samples: {} {}'.format(num_imgs_ped,num_imgs_emp))
+print(('num of ped and emp samples: {} {}'.format(num_imgs_ped, num_imgs_emp)))
 data_gen_train = data_generators.get_data_hybrid(ped_data, emp_data, C, batchsize=batchsize, hyratio=0.5)
 
 # define the base network (resnet here, can be MobileNet, etc)
-if C.network=='resnet50':
+if C.network == 'resnet50':
     from keras_csp import resnet50 as nn
+
     weight_path = 'data/models/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
-elif C.network=='mobilenet':
+elif C.network == 'mobilenet':
     from keras_csp import mobilenet as nn
+
     weight_path = 'data/models/mobilenet_1_0_224_tf_no_top.h5'
 else:
     raise NotImplementedError('Not support network: {}'.format(C.network))
@@ -53,48 +54,47 @@
 preds_tea = nn.nn_p3p4p5(img_input, offset=C.offset, num_scale=C.num_scale, trainable=True)
 
 model = Model(img_input, preds)
-if num_gpu>1:
+if num_gpu > 1:
     from keras_csp.parallel_model import ParallelModel
+
     model = ParallelModel(model, int(num_gpu))
     model_stu = Model(img_input, preds)
 model_tea = Model(img_input, preds_tea)
 
 model.load_weights(weight_path, by_name=True)
 model_tea.load_weights(weight_path, by_name=True)
-print 'load weights from {}'.format(weight_path)
+print('load weights from {}'.format(weight_path))
 
 if C.offset:
     out_path = 'output/valmodels/caltech/%s/off2' % (C.scale)
 else:
     out_path = 'output/valmodels/caltech/%s/nooff' % (C.scale)
 
-
 if not os.path.exists(out_path):
     os.makedirs(out_path)
-res_file = os.path.join(out_path,'records.txt')
+res_file = os.path.join(out_path, 'records.txt')
 
 optimizer = Adam(lr=C.init_lr)
 if C.offset:
     model.compile(optimizer=optimizer, loss=[losses.cls_center, losses.regr_h, losses.regr_offset])
 else:
-    if C.scale=='hw':
+    if C.scale == 'hw':
         model.compile(optimizer=optimizer, loss=[losses.cls_center, losses.regr_hw])
     else:
         model.compile(optimizer=optimizer, loss=[losses.cls_center, losses.regr_h])
 
-
-epoch_length = int(C.iter_per_epoch/batchsize)
+epoch_length = int(C.iter_per_epoch / batchsize)
 iter_num = 0
 add_epoch = 0
 losses = np.zeros((epoch_length, 3))
 
 best_loss = np.Inf
-print('Starting training with lr {} and alpha {}'.format(C.init_lr, C.alpha))
+print(('Starting training with lr {} and alpha {}'.format(C.init_lr, C.alpha)))
 start_time = time.time()
 total_loss_r, cls_loss_r1, regr_loss_r1, offset_loss_r1 = [], [], [], []
 for epoch_num in range(C.num_epochs):
     progbar = generic_utils.Progbar(epoch_length)
-    print('Epoch {}/{}'.format(epoch_num + 1 + add_epoch, C.num_epochs + C.add_epoch))
+    print(('Epoch {}/{}'.format(epoch_num + 1 + add_epoch, C.num_epochs + C.add_epoch)))
     while True:
         try:
             X, Y = next(data_gen_train)
@@ -102,12 +102,13 @@
 
             for l in model_tea.layers:
                 weights_tea = l.get_weights()
-                if len(weights_tea)>0:
+                if len(weights_tea) > 0:
                     if num_gpu > 1:
                         weights_stu = model_stu.get_layer(name=l.name).get_weights()
                     else:
                         weights_stu = model.get_layer(name=l.name).get_weights()
-                    weights_tea = [C.alpha*w_tea + (1-C.alpha)*w_stu for (w_tea, w_stu) in zip(weights_tea, weights_stu)]
+                    weights_tea = [C.alpha * w_tea + (1 - C.alpha) * w_stu for (w_tea, w_stu) in
+                                   zip(weights_tea, weights_stu)]
                     l.set_weights(weights_tea)
             # print loss_s1
             losses[iter_num, 0] = loss_s1[1]
@@ -120,30 +121,32 @@
             iter_num += 1
             if iter_num % 20 == 0:
                 progbar.update(iter_num,
-                               [('cls', np.mean(losses[:iter_num, 0])), ('regr_h', np.mean(losses[:iter_num, 1])), ('offset', np.mean(losses[:iter_num, 2]))])
+                               [('cls', np.mean(losses[:iter_num, 0])), ('regr_h', np.mean(losses[:iter_num, 1])),
+                                ('offset', np.mean(losses[:iter_num, 2]))])
             if iter_num == epoch_length:
                 cls_loss1 = np.mean(losses[:, 0])
                 regr_loss1 = np.mean(losses[:, 1])
                 offset_loss1 = np.mean(losses[:, 2])
-                total_loss = cls_loss1+regr_loss1+offset_loss1
+                total_loss = cls_loss1 + regr_loss1 + offset_loss1
 
                 total_loss_r.append(total_loss)
                 cls_loss_r1.append(cls_loss1)
                 regr_loss_r1.append(regr_loss1)
                 offset_loss_r1.append(offset_loss1)
-                print('Total loss: {}'.format(total_loss))
-                print('Elapsed time: {}'.format(time.time() - start_time))
+                print(('Total loss: {}'.format(total_loss)))
+                print(('Elapsed time: {}'.format(time.time() - start_time)))
 
                 iter_num = 0
                 start_time = time.time()
 
                 if total_loss < best_loss:
-                    print('Total loss decreased from {} to {}, saving weights'.format(best_loss, total_loss))
+                    print(('Total loss decreased from {} to {}, saving weights'.format(best_loss, total_loss)))
                     best_loss = total_loss
-                model_tea.save_weights(os.path.join(out_path, 'net_e{}_l{}.hdf5'.format(epoch_num + 1 + add_epoch, total_loss)))
+                model_tea.save_weights(
+                    os.path.join(out_path, 'net_e{}_l{}.hdf5'.format(epoch_num + 1 + add_epoch, total_loss)))
                 break
         except Exception as e:
-            print ('Exception: {}'.format(e))
+            print(('Exception: {}'.format(e)))
             continue
     records = np.concatenate((np.asarray(total_loss_r).reshape((-1, 1)),
                               np.asarray(cls_loss_r1).reshape((-1, 1)),
@@ -151,4 +154,4 @@
                               np.asarray(offset_loss_r1).reshape((-1, 1)),),
                              axis=-1)
     np.savetxt(res_file, np.array(records), fmt='%.6f')
-print('Training complete, exiting.')
\ No newline at end of file
+print('Training complete, exiting.')
diff --git a/train_city.py b/train_city.py
index 8eb64ac..cd6c58f 100644
--- a/train_city.py
+++ b/train_city.py
@@ -1,9 +1,9 @@
-from __future__ import division
+import glob
 import random
 import sys, os
 import time
 import numpy as np
-import cPickle
+import pickle
 from keras.utils import generic_utils
 from keras.optimizers import Adam
 from keras.layers import Input
@@ -15,7 +15,7 @@
 C = config.Config()
 C.gpu_ids = '0,1,2,3'
 C.onegpu = 2
-C.size_train = (640,1280)
+C.size_train = (640, 1280)
 C.init_lr = 2e-4
 C.num_epochs = 150
 C.offset = True
@@ -27,118 +27,128 @@
 # get the training data
 cache_path = 'data/cache/cityperson/train_h50'
 with open(cache_path, 'rb') as fid:
-    train_data = cPickle.load(fid)
+    train_data = pickle.load(fid, encoding='latin1')
 num_imgs_train = len(train_data)
 random.shuffle(train_data)
-print 'num of training samples: {}'.format(num_imgs_train)
+print('num of training samples: {}'.format(num_imgs_train))
 data_gen_train = data_generators.get_data(train_data, C, batchsize=batchsize)
 
 # define the base network (resnet here, can be MobileNet, etc)
-if C.network=='resnet50':
+if C.network == 'resnet50':
     from keras_csp import resnet50 as nn
+
     weight_path = 'data/models/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
 
+if C.offset:
+    out_path = 'output/valmodels/city/%s/off' % (C.scale)
+else:
+    out_path = 'output/valmodels/city/%s/nooff' % (C.scale)
+if not os.path.exists(out_path):
+    os.makedirs(out_path)
+# Resume from the newest 'net_e<epoch>_l<loss>.hdf5' checkpoint; with none, start at epoch 0.
+epoch = 0
+for ckpt_path in glob.glob(os.path.join(out_path, 'net_e*.hdf5')):
+    ckpt_epoch = int(os.path.basename(ckpt_path).split('net_e')[1].split('_')[0])
+    if ckpt_epoch > epoch:
+        # epoch and path come from the same file, so they can never get out of step
+        epoch = ckpt_epoch
+        weight_path = ckpt_path
+
 input_shape_img = (C.size_train[0], C.size_train[1], 3)
 img_input = Input(shape=input_shape_img)
 # define the network prediction
 preds = nn.nn_p3p4p5(img_input, offset=C.offset, num_scale=C.num_scale, trainable=True)
 preds_tea = nn.nn_p3p4p5(img_input, offset=C.offset, num_scale=C.num_scale, trainable=True)
-
 model = Model(img_input, preds)
-if num_gpu>1:
+
+if num_gpu > 1:
     from keras_csp.parallel_model import ParallelModel
+
     model = ParallelModel(model, int(num_gpu))
     model_stu = Model(img_input, preds)
 model_tea = Model(img_input, preds_tea)
 
 model.load_weights(weight_path, by_name=True)
 model_tea.load_weights(weight_path, by_name=True)
-print 'load weights from {}'.format(weight_path)
+print('load weights from {}'.format(weight_path))
 
-if C.offset:
-    out_path = 'output/valmodels/city/%s/off' % (C.scale)
-else:
-    out_path = 'output/valmodels/city/%s/nooff' % (C.scale)
-if not os.path.exists(out_path):
-    os.makedirs(out_path)
-res_file = os.path.join(out_path,'records.txt')
+res_file = os.path.join(out_path, 'records.txt')
 
 optimizer = Adam(lr=C.init_lr)
 if C.offset:
     model.compile(optimizer=optimizer, loss=[losses.cls_center, losses.regr_h, losses.regr_offset])
 else:
-    if C.scale=='hw':
+    if C.scale == 'hw':
         model.compile(optimizer=optimizer, loss=[losses.cls_center, losses.regr_hw])
     else:
         model.compile(optimizer=optimizer, loss=[losses.cls_center, losses.regr_h])
 
-
-epoch_length = int(C.iter_per_epoch/batchsize)
+epoch_length = int(C.iter_per_epoch / batchsize)
 iter_num = 0
 add_epoch = 0
 losses = np.zeros((epoch_length, 3))
 
 best_loss = np.Inf
-print('Starting training with lr {} and alpha {}'.format(C.init_lr, C.alpha))
+print(('Starting training with lr {} and alpha {}'.format(C.init_lr, C.alpha)))
 start_time = time.time()
 total_loss_r, cls_loss_r1, regr_loss_r1, offset_loss_r1 = [], [], [], []
-for epoch_num in range(C.num_epochs):
+for epoch_num in range(epoch, C.num_epochs):
     progbar = generic_utils.Progbar(epoch_length)
-    print('Epoch {}/{}'.format(epoch_num + 1 + add_epoch, C.num_epochs + C.add_epoch))
+    print(('Epoch {}/{}'.format(epoch_num + 1 + add_epoch, C.num_epochs + C.add_epoch)))
     while True:
-        try:
-            X, Y = next(data_gen_train)
-            loss_s1 = model.train_on_batch(X, Y)
-
-            for l in model_tea.layers:
-                weights_tea = l.get_weights()
-                if len(weights_tea)>0:
-                    if num_gpu > 1:
-                        weights_stu = model_stu.get_layer(name=l.name).get_weights()
-                    else:
-                        weights_stu = model.get_layer(name=l.name).get_weights()
-                    weights_tea = [C.alpha*w_tea + (1-C.alpha)*w_stu for (w_tea, w_stu) in zip(weights_tea, weights_stu)]
-                    l.set_weights(weights_tea)
-            # print loss_s1
-            losses[iter_num, 0] = loss_s1[1]
-            losses[iter_num, 1] = loss_s1[2]
-            if C.offset:
-                losses[iter_num, 2] = loss_s1[3]
-            else:
-                losses[iter_num, 2] = 0
-
-            iter_num += 1
-            if iter_num % 20 == 0:
-                progbar.update(iter_num,
-                               [('cls', np.mean(losses[:iter_num, 0])), ('regr_h', np.mean(losses[:iter_num, 1])), ('offset', np.mean(losses[:iter_num, 2]))])
-            if iter_num == epoch_length:
-                cls_loss1 = np.mean(losses[:, 0])
-                regr_loss1 = np.mean(losses[:, 1])
-                offset_loss1 = np.mean(losses[:, 2])
-                total_loss = cls_loss1+regr_loss1+offset_loss1
-
-                total_loss_r.append(total_loss)
-                cls_loss_r1.append(cls_loss1)
-                regr_loss_r1.append(regr_loss1)
-                offset_loss_r1.append(offset_loss1)
-                print('Total loss: {}'.format(total_loss))
-                print('Elapsed time: {}'.format(time.time() - start_time))
-
-                iter_num = 0
-                start_time = time.time()
-
-                if total_loss < best_loss:
-                    print('Total loss decreased from {} to {}, saving weights'.format(best_loss, total_loss))
-                    best_loss = total_loss
-                model_tea.save_weights(os.path.join(out_path, 'net_e{}_l{}.hdf5'.format(epoch_num + 1 + add_epoch, total_loss)))
-                break
-        except Exception as e:
-            print ('Exception: {}'.format(e))
-            continue
+        X, Y = next(data_gen_train)
+        loss_s1 = model.train_on_batch(X, Y)
+
+        for l in model_tea.layers:
+            weights_tea = l.get_weights()
+            if len(weights_tea) > 0:
+                if num_gpu > 1:
+                    weights_stu = model_stu.get_layer(name=l.name).get_weights()
+                else:
+                    weights_stu = model.get_layer(name=l.name).get_weights()
+                weights_tea = [C.alpha * w_tea + (1 - C.alpha) * w_stu for (w_tea, w_stu) in
+                               zip(weights_tea, weights_stu)]
+                l.set_weights(weights_tea)
+        # print loss_s1
+        losses[iter_num, 0] = loss_s1[1]
+        losses[iter_num, 1] = loss_s1[2]
+        if C.offset:
+            losses[iter_num, 2] = loss_s1[3]
+        else:
+            losses[iter_num, 2] = 0
+
+        iter_num += 1
+        if iter_num % 20 == 0:
+            progbar.update(iter_num,
+                           [('cls', np.mean(losses[:iter_num, 0])), ('regr_h', np.mean(losses[:iter_num, 1])),
+                            ('offset', np.mean(losses[:iter_num, 2]))])
+        if iter_num == epoch_length:
+            cls_loss1 = np.mean(losses[:, 0])
+            regr_loss1 = np.mean(losses[:, 1])
+            offset_loss1 = np.mean(losses[:, 2])
+            total_loss = cls_loss1 + regr_loss1 + offset_loss1
+
+            total_loss_r.append(total_loss)
+            cls_loss_r1.append(cls_loss1)
+            regr_loss_r1.append(regr_loss1)
+            offset_loss_r1.append(offset_loss1)
+            print(('Total loss: {}'.format(total_loss)))
+            print(('Elapsed time: {}'.format(time.time() - start_time)))
+
+            iter_num = 0
+            start_time = time.time()
+
+            if total_loss < best_loss:
+                print(('Total loss decreased from {} to {}, saving weights'.format(best_loss, total_loss)))
+                best_loss = total_loss
+            model_tea.save_weights(
+                os.path.join(out_path, 'net_e{}_l{}.hdf5'.format(epoch_num + 1 + add_epoch, total_loss)))
+            break
+
     records = np.concatenate((np.asarray(total_loss_r).reshape((-1, 1)),
                               np.asarray(cls_loss_r1).reshape((-1, 1)),
                               np.asarray(regr_loss_r1).reshape((-1, 1)),
                               np.asarray(offset_loss_r1).reshape((-1, 1)),),
                              axis=-1)
     np.savetxt(res_file, np.array(records), fmt='%.6f')
-print('Training complete, exiting.')
\ No newline at end of file
+print('Training complete, exiting.')
diff --git a/train_wider.py b/train_wider.py
index 404b2fe..34ffd6f 100644
--- a/train_wider.py
+++ b/train_wider.py
@@ -1,9 +1,8 @@
-from __future__ import division
 import random
 import sys, os
 import time
 import numpy as np
-import cPickle
+import pickle
 from keras.utils import generic_utils
 from keras.optimizers import Adam
 from keras.layers import Input
@@ -15,7 +14,7 @@
 C = config.Config()
 C.gpu_ids = '0,1,2,3,4,5,6,7'
 C.onegpu = 4
-C.size_train = (704,704)
+C.size_train = (704, 704)
 C.init_lr = 2e-4
 C.offset = True
 C.scale = 'hw'
@@ -29,14 +28,15 @@
 # get the training data
 cache_path = 'data/cache/widerface/train'
 with open(cache_path, 'rb') as fid:
-    train_data = cPickle.load(fid)
+    train_data = pickle.load(fid, encoding='latin1')
 num_imgs_train = len(train_data)
-print 'num of training samples: {}'.format(num_imgs_train)
+print('num of training samples: {}'.format(num_imgs_train))
 data_gen_train = data_generators.get_data_wider(train_data, C, batchsize=batchsize)
 
 # define the base network (resnet here, can be MobileNet, etc)
-if C.network=='resnet50':
+if C.network == 'resnet50':
     from keras_csp import resnet50 as nn
+
     weight_path = 'data/models/resnet50_weights_tf_dim_ordering_tf_kernels.h5'
 
 input_shape_img = (C.size_train[0], C.size_train[1], 3)
@@ -46,15 +46,16 @@
 preds_tea = nn.nn_p3p4p5(img_input, offset=C.offset, num_scale=C.num_scale, trainable=True)
 
 model = Model(img_input, preds)
-if num_gpu>1:
+if num_gpu > 1:
     from keras_csp.parallel_model import ParallelModel
+
     model = ParallelModel(model, int(num_gpu))
     model_stu = Model(img_input, preds)
 model_tea = Model(img_input, preds_tea)
 
 model.load_weights(weight_path, by_name=True)
 model_tea.load_weights(weight_path, by_name=True)
-print 'load weights from {}'.format(weight_path)
+print('load weights from {}'.format(weight_path))
 
 if C.offset:
     out_path = 'output/valmodels/wider/%s/off' % (C.scale)
@@ -62,7 +63,7 @@
     out_path = 'output/valmodels/wider/%s/nooff' % (C.scale)
 if not os.path.exists(out_path):
     os.makedirs(out_path)
-res_file = os.path.join(out_path,'records.txt')
+res_file = os.path.join(out_path, 'records.txt')
 
 optimizer = Adam(lr=C.init_lr)
 if C.offset:
@@ -70,18 +71,18 @@
 else:
     model.compile(optimizer=optimizer, loss=[losses.cls_center, losses.regr_hw])
 
-epoch_length = int(C.iter_per_epoch/batchsize)
+epoch_length = int(C.iter_per_epoch / batchsize)
 iter_num = 0
 add_epoch = 0
 losses = np.zeros((epoch_length, 3))
 
 best_loss = np.Inf
 print('Starting training with lr {} and alpha {}'.format(C.init_lr, C.alpha))
 start_time = time.time()
 total_loss_r, cls_loss_r1, regr_loss_r1, offset_loss_r1 = [], [], [], []
 for epoch_num in range(C.num_epochs):
     progbar = generic_utils.Progbar(epoch_length)
     print('Epoch {}/{}'.format(epoch_num + 1 + add_epoch, C.num_epochs + C.add_epoch))
     while True:
         try:
             X, Y = next(data_gen_train)
@@ -89,12 +90,13 @@
 
             for l in model_tea.layers:
                 weights_tea = l.get_weights()
-                if len(weights_tea)>0:
+                if len(weights_tea) > 0:
                     if num_gpu > 1:
                         weights_stu = model_stu.get_layer(name=l.name).get_weights()
                     else:
                         weights_stu = model.get_layer(name=l.name).get_weights()
-                    weights_tea = [C.alpha*w_tea + (1-C.alpha)*w_stu for (w_tea, w_stu) in zip(weights_tea, weights_stu)]
+                    weights_tea = [C.alpha * w_tea + (1 - C.alpha) * w_stu for (w_tea, w_stu) in
+                                   zip(weights_tea, weights_stu)]
                     l.set_weights(weights_tea)
             # print loss_s1
             losses[iter_num, 0] = loss_s1[1]
@@ -107,30 +109,32 @@
             iter_num += 1
             if iter_num % 20 == 0:
                 progbar.update(iter_num,
-                               [('cls', np.mean(losses[:iter_num, 0])), ('regr_h', np.mean(losses[:iter_num, 1])), ('offset', np.mean(losses[:iter_num, 2]))])
+                               [('cls', np.mean(losses[:iter_num, 0])), ('regr_h', np.mean(losses[:iter_num, 1])),
+                                ('offset', np.mean(losses[:iter_num, 2]))])
             if iter_num == epoch_length:
                 cls_loss1 = np.mean(losses[:, 0])
                 regr_loss1 = np.mean(losses[:, 1])
                 offset_loss1 = np.mean(losses[:, 2])
-                total_loss = cls_loss1+regr_loss1+offset_loss1
+                total_loss = cls_loss1 + regr_loss1 + offset_loss1
 
                 total_loss_r.append(total_loss)
                 cls_loss_r1.append(cls_loss1)
                 regr_loss_r1.append(regr_loss1)
                 offset_loss_r1.append(offset_loss1)
                 print('Total loss: {}'.format(total_loss))
                 print('Elapsed time: {}'.format(time.time() - start_time))
 
                 iter_num = 0
                 start_time = time.time()
 
                 if total_loss < best_loss:
                     print('Total loss decreased from {} to {}, saving weights'.format(best_loss, total_loss))
                     best_loss = total_loss
-                model_tea.save_weights(os.path.join(out_path, 'net_e{}_l{}.hdf5'.format(epoch_num + 1 + add_epoch, total_loss)))
+                model_tea.save_weights(
+                    os.path.join(out_path, 'net_e{}_l{}.hdf5'.format(epoch_num + 1 + add_epoch, total_loss)))
                 break
         except Exception as e:
-            print ('Exception: {}'.format(e))
+            print('Exception: {}'.format(e))
             continue
     records = np.concatenate((np.asarray(total_loss_r).reshape((-1, 1)),
                               np.asarray(cls_loss_r1).reshape((-1, 1)),
@@ -138,4 +142,4 @@
                               np.asarray(offset_loss_r1).reshape((-1, 1)),),
                              axis=-1)
     np.savetxt(res_file, np.array(records), fmt='%.6f')
-print('Training complete, exiting.')
\ No newline at end of file
+print('Training complete, exiting.')