diff --git a/.gitignore b/.gitignore
index 0c1ca188..f4cf204c 100644
--- a/.gitignore
+++ b/.gitignore
@@ -35,3 +35,5 @@ projects/loop-algorithm/figures/
 projects/parsers/output/
 
 projects/get-donors-pump-settings/temp-plot\.html
+
+projects/bigdata-processing-pipeline/get_stats/debug/
diff --git a/projects/bigdata-processing-pipeline/__init__.py b/projects/bigdata-processing-pipeline/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/README.md b/projects/bigdata-processing-pipeline/anonymize_and_export_data/README.md
similarity index 100%
rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/README.md
rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/README.md
diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/anonymize-and-export.py b/projects/bigdata-processing-pipeline/anonymize_and_export_data/anonymize-and-export.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/anonymize-and-export.py
rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/anonymize-and-export.py
diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/dataFieldExportList.csv b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/dataFieldExportList.csv
similarity index 100%
rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/dataFieldExportList.csv
rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/dataFieldExportList.csv
diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.csv b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.csv
similarity index 100%
rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.csv
rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.csv
diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.json b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.json
similarity index 100%
rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.json
rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.json
diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.xlsx b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.xlsx
similarity index 100%
rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.xlsx
rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.xlsx
diff --git a/projects/bigdata-processing-pipeline/environment.yml b/projects/bigdata-processing-pipeline/environment.yml
index 4c945436..64ef3601 100644
--- a/projects/bigdata-processing-pipeline/environment.yml
+++ b/projects/bigdata-processing-pipeline/environment.yml
@@ -3,9 +3,8 @@ channels:
   - defaults
 dependencies:
   - python=3.7.3
-  - numpy=1.16.4
   - pandas=0.24.2
+  - spyder=3.3.6
   - pip=19.1.1
-  - spyder=3.3.5
   - pip:
     - python-dotenv==0.10.3
diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/.gitignore b/projects/bigdata-processing-pipeline/estimate_local_time/.gitignore
similarity index 100%
rename from projects/bigdata-processing-pipeline/estimate-local-time/.gitignore
rename to projects/bigdata-processing-pipeline/estimate_local_time/.gitignore
diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/README.md b/projects/bigdata-processing-pipeline/estimate_local_time/README.md
similarity index 100%
rename from projects/bigdata-processing-pipeline/estimate-local-time/README.md
rename to projects/bigdata-processing-pipeline/estimate_local_time/README.md
diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/estimate-local-time.py b/projects/bigdata-processing-pipeline/estimate_local_time/estimate-local-time.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/estimate-local-time/estimate-local-time.py
rename to projects/bigdata-processing-pipeline/estimate_local_time/estimate-local-time.py
diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/estimateLocalTime-batchProcess.py b/projects/bigdata-processing-pipeline/estimate_local_time/estimateLocalTime-batchProcess.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/estimate-local-time/estimateLocalTime-batchProcess.py
rename to projects/bigdata-processing-pipeline/estimate_local_time/estimateLocalTime-batchProcess.py
diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-csv.csv b/projects/bigdata-processing-pipeline/estimate_local_time/example-csv.csv
similarity index 100%
rename from projects/bigdata-processing-pipeline/estimate-local-time/example-csv.csv
rename to projects/bigdata-processing-pipeline/estimate_local_time/example-csv.csv
diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-json.json b/projects/bigdata-processing-pipeline/estimate_local_time/example-json.json
similarity index 100%
rename from projects/bigdata-processing-pipeline/estimate-local-time/example-json.json
rename to projects/bigdata-processing-pipeline/estimate_local_time/example-json.json
diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-xlsx.xlsx b/projects/bigdata-processing-pipeline/estimate_local_time/example-xlsx.xlsx
similarity index 100%
rename from projects/bigdata-processing-pipeline/estimate-local-time/example-xlsx.xlsx
rename to projects/bigdata-processing-pipeline/estimate_local_time/example-xlsx.xlsx
diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/wikipedia-timezone-aliases-2018-04-28.csv b/projects/bigdata-processing-pipeline/estimate_local_time/wikipedia-timezone-aliases-2018-04-28.csv
similarity index 100%
rename from projects/bigdata-processing-pipeline/estimate-local-time/wikipedia-timezone-aliases-2018-04-28.csv
rename to projects/bigdata-processing-pipeline/estimate_local_time/wikipedia-timezone-aliases-2018-04-28.csv
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/README.md b/projects/bigdata-processing-pipeline/get_donor_data/README.md
similarity index 100%
rename from projects/bigdata-processing-pipeline/get-donor-data/README.md
rename to projects/bigdata-processing-pipeline/get_donor_data/README.md
diff --git a/projects/bigdata-processing-pipeline/get_donor_data/__init__.py b/projects/bigdata-processing-pipeline/get_donor_data/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py b/projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py
similarity index 99%
rename from projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py
rename to projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py
index 0d8c4a41..b17f5c9e 100644
--- a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py
+++ b/projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py
@@ -16,6 +16,7 @@
 import requests
 import json
 import argparse
+import pdb
 envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 if envPath not in sys.path:
     sys.path.insert(0, envPath)
@@ -247,7 +248,7 @@ def accept_and_get_list(args):
     )
 
     # polish up the final donor list
-    final_donor_list.sort_values(by="donorGroup", inplace=True)
+    final_donor_list.sort_values(by="userID", inplace=True)
     final_donor_list.reset_index(drop=True, inplace=True)
 
     if args.save_donor_list:
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/accept-new-donors.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/accept-new-donors.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/accept-new-donors.py
rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/accept-new-donors.py
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-all-col-headings.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-all-col-headings.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-all-col-headings.py
rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-all-col-headings.py
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-json-files.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-json-files.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-json-files.py
rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-json-files.py
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-list.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-list.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-list.py
rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-list.py
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get_all_donor_data.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get_all_donor_data.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get_all_donor_data.py
rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get_all_donor_data.py
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py b/projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py
similarity index 95%
rename from projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py
rename to projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py
index 14767119..3a0966d9 100644
--- a/projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py
+++ b/projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py
@@ -25,6 +25,6 @@
 )
 data, _ = get_data(
     donor_group="bigdata",
-    userid_of_shared_user="0d4524bc11",
+    userid="0d4524bc11",
     weeks_of_data=4
     )
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py
similarity index 78%
rename from projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py
rename to projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py
index 15daa252..8e81b372 100644
--- a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py
+++ b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py
@@ -117,12 +117,11 @@ def get_all_data(userid, donor_group):
 
 metadata_path = os.path.join(
     args.data_path,
-    "PHI-" + "2019-07-13" + "-donor-data",
-    "PHI-" + "2019-07-13" + "-metadata"
-
+    phi_date_stamp + "-donor-data",
+    phi_date_stamp + "-metadata"
 )
 
-all_files = glob.glob(os.path.join(metadata_path, "*.csv"))
+all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
 all_metadata = pd.DataFrame()
 for f in all_files:
     temp_meta = pd.read_csv(f)
@@ -137,3 +136,32 @@ def get_all_data(userid, donor_group):
     os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv")
 )
 print("saving metadata...code complete")
+
+
+# %% COMBINE AND SAVE ALL DATASET INFO (METADATA)
+print("combining all dataset metadata")
+
+metadata_path = os.path.join(
+    args.data_path,
+    phi_date_stamp + "-donor-data",
+    phi_date_stamp + "-datasetSummary"
+)
+
+all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
+dataset_metadata = pd.DataFrame()
+for f in all_files:
+    temp_meta = pd.read_csv(f)
+    temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True)
+    userid = f[-32:-22]
+    temp_meta["userid"] = userid
+    dataset_metadata = pd.concat(
+        [dataset_metadata, temp_meta],
+        ignore_index=True,
+        sort=False
+    )
+
+dataset_metadata.to_csv(
+    os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv")
+)
+print("saving all-dataset-info-metadata...code complete")
+
diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py
new file mode 100644
index 00000000..d43b8e9a
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py
@@ -0,0 +1,138 @@
+# -*- coding: utf-8 -*-
+"""get_all_donor_data_batch_process_json.py
+This is a wrapper script that accepts all bigdata donation project donors,
+and then pulls all of their datasets for further processing.
+"""
+
+# %% REQUIRED LIBRARIES
+from accept_new_donors_and_get_donor_list import accept_and_get_list
+import datetime as dt
+import pandas as pd
+import subprocess as sub
+import os
+import glob
+import time
+import argparse
+from multiprocessing import Pool
+
+
+# %% USER INPUTS (choices to be made in order to run the code)
+codeDescription = "accepts new donors (shares) and grab their data"
+parser = argparse.ArgumentParser(description=codeDescription)
+
+parser.add_argument(
+    "-d",
+    "--date-stamp",
+    dest="date_stamp",
+    default=dt.datetime.now().strftime("%Y-%m-%d"),
+    # argparse %-formats help text, so literal percent signs are doubled
+    help="date, in '%%Y-%%m-%%d' format, of the date when donors were accepted"
+)
+
+parser.add_argument(
+    "-o",
+    "--output-data-path",
+    dest="data_path",
+    default=os.path.abspath(
+        os.path.join(os.path.dirname(__file__), "..", "data")
+    ),
+    help="the output path where the data is stored"
+)
+
+parser.add_argument(
+    "-s",
+    "--save-donor-list",
+    dest="save_donor_list",
+    default=True,
+    # without a converter "-s False" arrives as the truthy string "False"
+    type=lambda val: str(val).strip().lower() in ("true", "t", "yes", "1"),
+    help="specify if you want to save the donor list (True/False)"
+)
+
+args = parser.parse_args()
+
+
+# %% FUNCTIONS
+def run_process(func_name, userid, donor_group):
+    """Run ./<func_name> with python in a child process for one donor.
+
+    The child gets the module-level args.date_stamp and args.data_path plus
+    the given donor_group and userid. Its stderr is printed if it is not
+    empty; otherwise its stdout is printed.
+    """
+    command = [
+        "python", os.path.join(".", func_name),
+        "-d", args.date_stamp,
+        "-dg", donor_group,
+        "-u", userid,
+        "-o", args.data_path
+    ]
+    # capture both streams so they can be reported after the child exits
+    child = sub.Popen(command, stdout=sub.PIPE, stderr=sub.PIPE)
+
+    raw_out, raw_err = child.communicate()
+    out_text = raw_out.decode("utf-8")
+    err_text = raw_err.decode("utf-8")
+
+    # anything on stderr takes precedence over the normal output
+    print(err_text if err_text != '' else out_text)
+
+    return
+
+
+def get_all_data(userid, donor_group):
+    """Pull one donor's metadata, then that donor's json dataset."""
+    for script_name in (
+        "get_single_donor_metadata.py", "get_single_tidepool_dataset_json.py"
+    ):
+        run_process(script_name, userid, donor_group)
+
+
+# %% GET LATEST DONOR LIST
+final_donor_list = accept_and_get_list(args)
+
+
+# %% GET DONOR META DATA AND DATASETS
+# one get_all_data call per donor (userID, donorGroup), spread over all cores
+startTime = time.time()
+print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
+pool = Pool(os.cpu_count())  # one worker process per cpu
+pool.starmap(get_all_data, zip(  # blocks until every donor has been pulled
+    final_donor_list["userID"],
+    final_donor_list["donorGroup"]
+))
+pool.close()  # no more tasks will be submitted
+endTime = time.time()
+print(
+  "finshed pulling data at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+)
+total_duration = round((endTime - startTime) / 60, 1)  # seconds -> minutes
+print("total duration was %s minutes" % total_duration)
+
+
+# %% COMBINE AND SAVE ALL DONOR METADATA
+print("combining all metadata")
+phi_date_stamp = "PHI-" + args.date_stamp
+donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data")
+
+metadata_path = os.path.join(
+    args.data_path,
+    phi_date_stamp + "-donor-data",
+    phi_date_stamp + "-metadata"
+)
+
+all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
+all_metadata = pd.DataFrame()  # grows by one file's rows per iteration
+for f in all_files:
+    temp_meta = pd.read_csv(f)  # its "Unnamed: 0" column is renamed below
+    temp_meta.rename(columns={"Unnamed: 0": "userid"}, inplace=True)
+    all_metadata = pd.concat(
+        [all_metadata, temp_meta],
+        ignore_index=True,
+        sort=False
+    )
+
+all_metadata.to_csv(  # written inside the donor-data folder
+    os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv")
+)
+print("saving metadata...code complete")
diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_interim_dataset_summaries.py b/projects/bigdata-processing-pipeline/get_donor_data/get_interim_dataset_summaries.py
new file mode 100644
index 00000000..0fa04201
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_donor_data/get_interim_dataset_summaries.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+
+# %% REQUIRED LIBRARIES
+import datetime as dt
+import pandas as pd
+import os
+import glob
+import argparse
+
+
+# %% FUNCTIONS
+def get_dataset_summaries(
+        save_data_path=os.path.abspath(
+            os.path.join(
+                os.path.dirname(__file__),
+                "..",
+                "data"
+            )
+        ),
+        date_stamp=dt.datetime.now().strftime("%Y-%m-%d"),
+):
+    """Combine all per-user datasetSummary files for date_stamp under
+    save_data_path into one PHI-<date_stamp>-all-dataset-info.csv.gz file.
+    """
+    phi_date_stamp = "PHI-" + date_stamp
+    donor_folder = os.path.join(save_data_path, phi_date_stamp + "-donor-data")
+
+    print("combining all dataset metadata")
+
+    metadata_path = os.path.join(
+        save_data_path,
+        phi_date_stamp + "-donor-data",
+        phi_date_stamp + "-datasetSummary"
+    )
+
+    all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
+    dataset_metadata = pd.DataFrame()
+    n_files = len(all_files)
+    print("there are {} files".format(n_files))
+    f_counter = 1
+    for f in all_files:
+        temp_meta = pd.read_csv(f)
+        temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True)
+        userid = f[-32:-22]  # assumes a 10-character userid in the file name
+        temp_meta["userid"] = userid
+        dataset_metadata = pd.concat(
+            [dataset_metadata, temp_meta],
+            ignore_index=True,
+            sort=False
+        )
+
+        if f_counter % 10 == 0:
+            print("completed file {} of {}".format(f_counter, n_files))
+        f_counter = f_counter + 1
+    dataset_metadata.to_csv(
+        os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv.gz")
+    )
+    print("saving all-dataset-info-metadata...code complete")
+
+    return
+
+
+# %% MAIN
+if __name__ == "__main__":
+    # USER INPUTS (choices to be made in order to run the code)
+    codeDescription = "combine all dataset metadata into a single file"
+    parser = argparse.ArgumentParser(description=codeDescription)
+
+    parser.add_argument(
+        "-d",
+        "--date-stamp",
+        dest="date_stamp",
+        default=dt.datetime.now().strftime("%Y-%m-%d"),
+        # argparse applies %-formatting to help text, so the literal
+        # percent signs of the date format have to be doubled
+        help="date, in '%%Y-%%m-%%d' format, of the date when " +
+        "donors were accepted"
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output-data-path",
+        dest="data_path",
+        default=os.path.abspath(
+            os.path.join(os.path.dirname(__file__), "..", "data")
+        ),
+        help="the output path where the data is stored"
+    )
+
+    args = parser.parse_args()
+
+    # the main function
+    get_dataset_summaries(
+        save_data_path=args.data_path,
+        date_stamp=args.date_stamp
+    )
diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_single_dataset_info.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_dataset_info.py
new file mode 100644
index 00000000..d1ddab75
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_dataset_info.py
@@ -0,0 +1,357 @@
+# -*- coding: utf-8 -*-
+"""get_single_dataset_info.py
+This code takes a tidepool dataset as input, and gives
+a description of the type of data in the dataset.
+"""
+
+
+# %% REQUIRED LIBRARIES
+import pandas as pd
+import datetime as dt
+import numpy as np
+import os
+import ast
+import argparse
+
+
+# %% FUNCTIONS
+def get_type(val):  # name of val's type as a string, e.g. "list"
+    return type(val).__name__
+
+
+def get_len(val):  # thin wrapper around len() for use with Series.apply
+    return len(val)
+
+
+def get_val(val, k):  # returns val[k]; used with Series.apply(..., k=...)
+    return val[k]
+
+
+def literal_return(val):  # parse val as a python literal when possible
+    try:
+        return ast.literal_eval(val)
+    except (ValueError, SyntaxError):
+        return val  # not a literal: hand back the value unchanged
+
+
+def remove_cols(df, cols_to_remove):
+    """Split df into (df without cols_to_remove, df of the removed cols)."""
+    present = list(set(df) & set(cols_to_remove))
+    removed_df = df[present]
+    kept_df = df.drop(columns=present)
+
+    return kept_df, removed_df
+
+
+def make_folder_if_doesnt_exist(folder_paths):
+    """Create each folder (a single path or a list of paths) if missing."""
+    if not isinstance(folder_paths, list):
+        folder_paths = [folder_paths]
+    for folder_path in folder_paths:
+        # exist_ok avoids a race when parallel workers create the same folder
+        os.makedirs(folder_path, exist_ok=True)
+    return
+
+
+def create_output_folder(
+        data_path,
+        date_stamp,
+        folder_name,
+        phi=True
+):
+    """Make sure <data_path>/<stamp>-donor-data/<stamp>-<folder_name> exists
+    and return it; stamp is date_stamp, prefixed with "PHI-" when phi is set.
+    """
+    stamp = "PHI-" + date_stamp if phi else date_stamp
+    dataset_path = os.path.join(
+        data_path, stamp + "-donor-data", stamp + "-" + folder_name
+    )
+    make_folder_if_doesnt_exist(dataset_path)
+
+    return dataset_path
+
+
+def save_df(
+        df,
+        userid,
+        data_path,
+        date_stamp,
+        folder_name,
+        phi=True,
+        name_suffix="",
+):
+    """Save df as a csv.gz file named after userid and return its path.
+
+    The file goes into the folder given by create_output_folder(data_path,
+    date_stamp, folder_name, phi) and is named <userid><name_suffix>.csv.gz,
+    with a "PHI-" prefix when phi is truthy. The .gz extension is expected
+    to make pandas gzip the output (compression is inferred from the path).
+    """
+    output_folder = create_output_folder(
+        data_path=data_path,
+        date_stamp=date_stamp,
+        folder_name=folder_name,
+        phi=phi
+    )
+    # if the data contains phi, add prefix to the file
+    phi_prefix = "PHI-" if phi else ""
+    file_name = phi_prefix + userid + "{}.csv.gz".format(name_suffix)
+    output_path = os.path.join(output_folder, file_name)
+
+    df.to_csv(output_path)
+
+    return output_path
+
+
+def expand_df(df, do_not_expand_list=None):
+    """Expand list and dict values of df's object columns by one level.
+    List items become <col>_<i> columns, dict keys <col>.<key> columns.
+    Returns (expanded df with sorted columns, per-column type summary).
+    """
+    # remove fields that we don't want to flatten (default: remove none)
+    if do_not_expand_list is None:
+        do_not_expand_list = []
+    df, hold_df = remove_cols(df, do_not_expand_list)
+    col_df = pd.DataFrame(df.dtypes, columns=["dtype"])  # original columns
+
+    # go through each dtype that is an object to see if it
+    # contains strings, mixed datatypes, embedded json, or lists
+    col_df["nObjectTypes"] = np.nan
+    col_df["objectType"] = np.nan
+
+    new_df = pd.DataFrame()
+    for col in col_df[col_df["dtype"] == "object"].index:
+        rows = df.index[df[col].notnull()].tolist()
+        # sometimes the object gets wrapped in a string
+        literal_df = pd.DataFrame(df.loc[rows, col].apply(literal_return))
+
+        # see if there are mixed object types
+        type_df = pd.DataFrame(literal_df.loc[rows, col].apply(get_type))
+        unique_types = type_df[col].unique()
+        col_df.loc[col, "nObjectTypes"] = len(unique_types)
+        col_df.loc[col, "objectType"] = str(unique_types)
+
+        # USE UNDERSCORE FOR LIST EXPANSION
+        if "list" in col_df.loc[col, "objectType"]:
+            list_df = pd.DataFrame(literal_df.loc[type_df[col] == "list", col])
+            list_df["len"] = list_df[col].apply(get_len)
+            for i in np.arange(1, list_df["len"].max() + 1):
+                blob_df = pd.DataFrame(
+                    list_df.loc[
+                        list_df["len"] >= i, col
+                        ].apply(get_val, k=i-1)
+                    ).add_suffix('_' + str(i))
+                new_df = pd.concat([new_df, blob_df], axis=1)
+
+        # USE DOT FOR JSON (DICT) EXPANSION
+        if "dict" in col_df.loc[col, "objectType"]:
+            json_blob = literal_df.loc[type_df[col] == "dict", col]
+            blob_df = pd.DataFrame(
+                json_blob.tolist(),
+                index=json_blob.index
+            ).add_prefix(col + '.')
+            new_df = pd.concat([new_df, blob_df], axis=1)
+
+    # merge the dataframes together
+    df = pd.concat([df, new_df, hold_df], axis=1)
+
+    df.sort_index(axis=1, inplace=True)
+
+    return df, col_df
+
+
+def expand_data(starting_df, depth=10):
+    """Expand starting_df layer by layer (at most depth passes of expand_df)
+    and return (summary_df describing every column, fully expanded df).
+    """
+    print("\ninitial df has {} columns".format(len(starting_df.columns)))
+    print("starting expansion ...")
+    temp_df, temp_col = expand_df(starting_df)
+    col_df = temp_col.copy()
+    skip_columns = starting_df.columns.tolist()
+    d = 1
+    n_col_expanded = len(list(temp_df)) - len(list(starting_df))
+    print("{} columns added". format(n_col_expanded))
+
+    while not ((d >= depth) | (len(temp_col) == 0)):
+        print("expanding layer {} ... ".format(d))
+        next_skip_columns = temp_df.columns.tolist()
+        temp_df, temp_col = expand_df(temp_df, skip_columns)
+        skip_columns = next_skip_columns.copy()
+
+        col_df = pd.concat([col_df, temp_col])
+        n_col_expanded = len(list(temp_df)) - len(next_skip_columns)
+        print("{} columns added". format(n_col_expanded))
+        d += 1
+
+    print("expansion complete...getting dataset summary info...")
+    col_df.sort_index(inplace=True)
+
+    # get the start and end time for each data type
+    print("getting data start and end times for each data type ...")
+    col_df["startTime"] = np.nan
+    col_df["endTime"] = np.nan
+    for col in col_df.index:
+        try:
+            start_time = temp_df.loc[temp_df[col].notnull(), ["time"]].min()
+            end_time = temp_df.loc[temp_df[col].notnull(), ["time"]].max()
+            col_df.loc[col, "startTime"] = start_time.values[0]
+            col_df.loc[col, "endTime"] = end_time.values[0]
+        except Exception:  # best effort, e.g. no "time" column in the data
+            print(col, "missing timestamp")
+
+    # get summary information
+    print("getting summary information ...")
+    df_info = pd.DataFrame(temp_df.describe(include='all').T)
+    df_info.loc["_all", ["count", "unique"]] = temp_df.shape
+    df_info.sort_index(inplace=True)
+
+    # add which type (or subtype) each column comes from
+    for typeType in ["type", "subType"]:
+        if typeType in list(starting_df):
+            type_groups = temp_df.groupby(by=typeType)
+            not_null_index = temp_df[typeType].notnull()
+            for type_ in temp_df.loc[not_null_index, typeType].unique():
+                type_df = type_groups.get_group(type_).dropna(
+                    axis=1, how="all"
+                )
+                df_info.loc[type_df.columns, typeType + "=" + type_] = type_
+
+    # get memory size of each data type
+    print("getting memory information ...")
+    mem_usage = pd.DataFrame(
+        temp_df.memory_usage(index=True, deep=True),
+        columns=["memorySize"]
+    )
+    mem_usage.rename(index={"Index": "_all"}, inplace=True)
+    df_info["memorySize"] = mem_usage["memorySize"]
+    df_info.loc["_all", "memorySize"] = temp_df.memory_usage(
+        index=True, deep=True
+    ).sum()
+
+    # combine with col_summary
+    summary_df = pd.concat([col_df, df_info], axis=1, sort=True)
+
+    # get/add a list of string values
+    print("getting a a list of string values ...")
+    str_cols = summary_df[
+        (summary_df["objectType"] == "['str']") &
+        (summary_df["unique"] > 1) &
+        (summary_df["unique"] < 50)
+    ].index
+    for str_col in str_cols:
+        not_null_index = temp_df[str_col].notnull()
+        str_vals = temp_df.loc[not_null_index, str_col].unique().tolist()
+        summary_df.loc[str_col, "strVals"] = str(str_vals)
+
+    print("dataset summary info complete\n")
+
+    return summary_df, temp_df
+
+
+# %% START OF CODE
+def get_dataset_info(data, date_stamp, data_path, userid, save_expanded):
+    """Summarize one dataset; save the summary and, optionally, the expansion.
+
+    data is the dataset as a DataFrame, or a float such as np.nan to have it
+    read from <csvData folder>/PHI-<userid>.csv.gz. The summary is saved to
+    the datasetSummary folder and, if save_expanded is truthy, the expanded
+    data to the expandedData folder, both under data_path for date_stamp.
+    """
+    if userid == "not-specified":
+        userid = input("Enter userid of dataset you want info on:\n")
+
+    if type(data) is float:  # np.nan is a float
+        dataset_folder = create_output_folder(
+            data_path,
+            date_stamp,
+            "csvData"
+        )
+        dataset_path = os.path.join(
+            dataset_folder,
+            "PHI-{}.csv.gz".format(userid)
+        )
+        data = pd.read_csv(dataset_path, low_memory=False, index_col=0)
+
+    # expand embedded lists and json within dataset
+    summary_df, expanded_df = expand_data(data.copy(), depth=10)
+
+    # save summary data
+    _ = save_df(
+        summary_df,
+        userid=userid,
+        data_path=data_path,
+        date_stamp=date_stamp,
+        folder_name="datasetSummary",
+        phi=True,
+        name_suffix="-datasetSummary"
+    )
+
+    if save_expanded:
+        # save expanded data
+        _ = save_df(
+            expanded_df,
+            userid=userid,
+            data_path=data_path,
+            date_stamp=date_stamp,
+            folder_name="expandedData",
+            phi=True,
+            name_suffix="-expandedData"
+        )
+
+
+if __name__ == "__main__":
+    # USER INPUTS (choices to be made in order to run the code)
+    codeDescription = "get an overview of the columns and data in the dataset"
+    parser = argparse.ArgumentParser(description=codeDescription)
+
+    parser.add_argument(
+        "-d",
+        "--date-stamp",
+        dest="date_stamp",
+        default=dt.datetime.now().strftime("%Y-%m-%d"),
+        # argparse applies %-formatting to help text, so the literal
+        # percent signs of the date format have to be doubled
+        help="date, in '%%Y-%%m-%%d' format, of the date when " +
+        "donors were accepted"
+    )
+
+    parser.add_argument(
+        "-u",
+        "--userid",
+        dest="userid",
+        default="not-specified",
+        help="userid of the dataset you are interested in"
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output-data-path",
+        dest="data_path",
+        default=os.path.abspath(
+            os.path.join(os.path.dirname(__file__), "..", "data")
+        ),
+        help="the output path where the data is stored"
+    )
+
+    parser.add_argument(
+        "-s",
+        "--save-expanded-dataset",
+        dest="save_expanded",
+        default=True,
+        # without a converter "-s False" arrives as the truthy string "False"
+        type=lambda val: str(val).strip().lower() in ("true", "t", "yes", "1"),
+        help="specify if you want to save the expanded dataframe " +
+        "(True/False). NOTE: these files can be rather large"
+    )
+
+    args = parser.parse_args()
+
+    # main function
+    get_dataset_info(
+        data=np.nan,
+        date_stamp=args.date_stamp,
+        data_path=args.data_path,
+        userid=args.userid,
+        save_expanded=args.save_expanded
+    )
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_donor_metadata.py
similarity index 99%
rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py
rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_donor_metadata.py
index 3135ff41..e02708a9 100644
--- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_donor_metadata.py
+++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_donor_metadata.py
@@ -233,7 +233,7 @@ def get_and_save_metadata(
     # save data
     meta_output_path = os.path.join(
         metadata_path,
-        'PHI-' + userid + ".csv"
+        'PHI-' + userid + ".csv.gz"
     )
 
     meta_df.to_csv(meta_output_path)
diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py
similarity index 54%
rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py
rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py
index 290b5324..0b3e384f 100644
--- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py
+++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py
@@ -8,6 +8,10 @@
 """
 
 # %% REQUIRED LIBRARIES
+try:
+    from get_single_dataset_info import expand_data, save_df
+except ImportError:  # TODO: there has to be a better way to do this
+    from get_donor_data.get_single_dataset_info import expand_data, save_df
 import pandas as pd
 import datetime as dt
 import numpy as np
@@ -16,7 +20,6 @@
 import getpass
 import requests
 import json
-import pdb
 import argparse
 envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
 if envPath not in sys.path:
@@ -24,93 +27,7 @@
 import environmentalVariables
 
 
-# %% USER INPUTS (choices to be made in order to run the code)
-codeDescription = "get donor metadata"
-parser = argparse.ArgumentParser(description=codeDescription)
-
-parser.add_argument(
-    "-d",
-    "--date-stamp",
-    dest="date_stamp",
-    default=dt.datetime.now().strftime("%Y-%m-%d"),
-    help="date, in '%Y-%m-%d' format, of the date when " +
-    "donors were accepted"
-)
-
-parser.add_argument(
-    "-w",
-    "--weeks-of-data",
-    dest="weeks_of_data",
-    default=52*10,
-    help="enter the number of weeks of data you want to download"
-)
-
-parser.add_argument(
-    "-dg",
-    "--donor-group",
-    dest="donor_group",
-    default=np.nan,
-    help="name of the donor group in the tidepool .env file"
-)
-
-parser.add_argument(
-    "-u",
-    "--userid",
-    dest="userid_of_shared_user",
-    default=np.nan,
-    help="userid of account shared with the donor group or master account"
-)
-
-parser.add_argument(
-    "-a",
-    "--auth",
-    dest="auth",
-    default=np.nan,
-    help="tuple that contains (email, password)"
-)
-
-parser.add_argument(
-    "-e",
-    "--email",
-    dest="email",
-    default=np.nan,
-    help="email address of the master account"
-)
-
-parser.add_argument(
-    "-p",
-    "--password",
-    dest="password",
-    default=np.nan,
-    help="password of the master account"
-)
-
-parser.add_argument(
-    "-o",
-    "--output-data-path",
-    dest="data_path",
-    default=os.path.abspath(
-        os.path.join(
-            os.path.dirname(__file__), "..", "data"
-        )
-    ),
-    help="the output path where the data is stored"
-)
-
-args = parser.parse_args()
-
-
 # %% FUNCTIONS
-def make_folder_if_doesnt_exist(folder_paths):
-    ''' function requires a single path or a list of paths'''
-    if not isinstance(folder_paths, list):
-        folder_paths = [folder_paths]
-    for folder_path in folder_paths:
-        if not os.path.exists(folder_path):
-            os.makedirs(folder_path)
-    return
-
-
 def get_data_api(userid, startDate, endDate, headers):
 
     startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z"
@@ -145,7 +62,7 @@ def get_data_api(userid, startDate, endDate, headers):
 def get_data(
     weeks_of_data=10*52,
     donor_group=np.nan,
-    userid_of_shared_user=np.nan,
+    userid=np.nan,
     auth=np.nan,
     email=np.nan,
     password=np.nan,
@@ -180,8 +97,8 @@ def get_data(
     else:
         sys.exit("Error with " + auth[0] + ":" + str(api_response.status_code))
 
-    if pd.isnull(userid_of_shared_user):
-        userid_of_shared_user = userid_master
+    if pd.isnull(userid):
+        userid = userid_master
         print(
             "getting data for the master account since no shared " +
             "user account was given"
@@ -204,7 +121,7 @@ def get_data(
                 endDate.day + 1
             )
             year_df, endDate = get_data_api(
-                userid_of_shared_user,
+                userid,
                 startDate,
                 endDate,
                 headers
@@ -222,7 +139,7 @@ def get_data(
         )
 
         df, _ = get_data_api(
-            userid_of_shared_user,
+            userid,
             startDate,
             endDate,
             headers
@@ -241,58 +158,170 @@ def get_data(
             auth[0] + ":" + str(api_response.status_code)
         )
 
-    return df, userid_of_shared_user
+    return df, userid
 
 
 # %% START OF CODE
 def get_and_save_dataset(
-    date_stamp=args.date_stamp,
-    data_path=args.data_path,
-    weeks_of_data=args.weeks_of_data,
-    donor_group=args.donor_group,
-    userid_of_shared_user=args.userid_of_shared_user,
-    auth=args.auth,
-    email=args.email,
-    password=args.password
+    date_stamp,
+    data_path,
+    weeks_of_data,
+    donor_group,
+    userid,
+    auth,
+    email,
+    password,
+    expand_dataset
 ):
-    # create output folders if they don't exist
-
-    phi_date_stamp = "PHI-" + date_stamp
-    donor_folder = os.path.join(data_path, phi_date_stamp + "-donor-data")
-
-    dataset_path = os.path.join(
-        donor_folder,
-        phi_date_stamp + "-csvData"
-    )
-    make_folder_if_doesnt_exist(dataset_path)
 
     # get dataset
     data, userid = get_data(
         weeks_of_data=weeks_of_data,
         donor_group=donor_group,
-        userid_of_shared_user=userid_of_shared_user,
+        userid=userid,
         auth=auth,
         email=email,
         password=password
     )
 
-    # save data
-    dataset_output_path = os.path.join(
-        dataset_path,
-        'PHI-' + userid + ".csv"
-    )
+    # if the there is data
+    if len(data) > 1:
+        # save data
+        print("saving csv data...")
+        _ = save_df(
+                data,
+                userid=userid,
+                data_path=data_path,
+                date_stamp=date_stamp,
+                folder_name="csvData",
+                phi=True
+        )
 
-    data.to_csv(dataset_output_path)
+        # get dataset info
+        if expand_dataset:
+            summary_df, expanded_df = expand_data(data)
+            print("saving summary data...")
+            _ = save_df(
+                summary_df,
+                userid=userid,
+                data_path=data_path,
+                date_stamp=date_stamp,
+                folder_name="datasetSummary",
+                phi=True,
+                name_suffix="-datasetSummary"
+            )
+
+            # save expanded data (function arguments, not the script-only `args`)
+            print("saving expanded data...")
+            _ = save_df(
+                expanded_df,
+                userid=userid,
+                data_path=data_path,
+                date_stamp=date_stamp,
+                folder_name="expandedData",
+                phi=True,
+                name_suffix="-expandedData"
+            )
+    else:
+        print("{} has no data".format(userid))
 
 
 if __name__ == "__main__":
+    # USER INPUTS (choices to be made in order to run the code)
+    codeDescription = "get donor metadata"
+    parser = argparse.ArgumentParser(description=codeDescription)
+
+    parser.add_argument(
+        "-d",
+        "--date-stamp",
+        dest="date_stamp",
+        default=dt.datetime.now().strftime("%Y-%m-%d"),
+        help="date, in '%%Y-%%m-%%d' format, of the date when " +
+        "donors were accepted"  # "%%": argparse %-formats help strings
+    )
+
+    parser.add_argument(
+        "-w", "--weeks-of-data",
+        dest="weeks_of_data",
+        type=int,  # command-line values arrive as str
+        default=52*10,
+        help="enter the number of weeks of data you want to download"
+    )
+
+    parser.add_argument(
+        "-dg",
+        "--donor-group",
+        dest="donor_group",
+        default=np.nan,
+        help="name of the donor group in the tidepool .env file"
+    )
+
+    parser.add_argument(
+        "-u",
+        "--userid",
+        dest="userid",
+        default=np.nan,
+        help="userid of account shared with the donor group or master account"
+    )
+
+    parser.add_argument(
+        "-a",
+        "--auth",
+        dest="auth",
+        default=np.nan,
+        help="tuple that contains (email, password)"
+    )
+
+    parser.add_argument(
+        "-e",
+        "--email",
+        dest="email",
+        default=np.nan,
+        help="email address of the master account"
+    )
+
+    parser.add_argument(
+        "-p",
+        "--password",
+        dest="password",
+        default=np.nan,
+        help="password of the master account"
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output-data-path",
+        dest="data_path",
+        default=os.path.abspath(
+            os.path.join(
+                os.path.dirname(__file__), "..", "data"
+            )
+        ),
+        help="the output path where the data is stored"
+    )
+
+    parser.add_argument(
+        "-ex", "--expand-dataset",
+        dest="expand_dataset",
+        default=True,
+        # without a converter the string "False" would be truthy
+        type=lambda v: str(v).strip().lower() in ("true", "t", "1", "yes", "y"),
+        help=(
+            "specify if you want to get/save the expanded dataframe (True/False)"
+            + " NOTE: this process is time consuming")
+    )
+
+    args = parser.parse_args()
+
+    # the main function
     get_and_save_dataset(
         date_stamp=args.date_stamp,
         data_path=args.data_path,
         weeks_of_data=args.weeks_of_data,
         donor_group=args.donor_group,
-        userid_of_shared_user=args.userid_of_shared_user,
+        userid=args.userid,
         auth=args.auth,
         email=args.email,
-        password=args.password
+        password=args.password,
+        expand_dataset=args.expand_dataset
     )
diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py
new file mode 100644
index 00000000..d8496891
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py
@@ -0,0 +1,325 @@
+# -*- coding: utf-8 -*-
+"""get_single_tidepool_dataset_json.py
+In the context of the big data donation
+project, this code grabs donor data and metadata.
+
+This code calls accept_new_donors_and_get_donor_list.py
+to get the most recent donor list
+"""
+
+# %% REQUIRED LIBRARIES
+import pandas as pd
+import datetime as dt
+import numpy as np
+import os
+import sys
+import time
+import getpass
+import requests
+import json
+import argparse
+import pdb
+envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
+if envPath not in sys.path:
+    sys.path.insert(0, envPath)
+import environmentalVariables
+
+# %% GLOBAL VARIABLES
+current_date = dt.datetime.now().strftime("%Y-%m-%d")
+
+# %% FUNCTIONS
+def make_folder_if_doesnt_exist(folder_paths):
+    '''Create every missing folder; accepts a single path or a list of paths.'''
+    if not isinstance(folder_paths, list):
+        folder_paths = [folder_paths]
+    for folder_path in folder_paths:
+        # exist_ok avoids the check-then-create race between parallel processes
+        os.makedirs(folder_path, exist_ok=True)
+    return
+
+
+def get_data_api(userid, startDate, endDate, headers):
+    """Request one user's data for a date range; return (json_data, endDate)."""
+    startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z"
+    endDate = endDate.strftime("%Y-%m-%d") + "T23:59:59.999Z"
+
+    api_call = (
+        "https://api.tidepool.org/data/" + userid + "?" +
+        "endDate=" + endDate + "&" +
+        "startDate=" + startDate + "&" +
+        "dexcom=true" + "&" +
+        "medtronic=true" + "&" +
+        "carelink=true"
+    )
+
+    api_response = requests.get(api_call, headers=headers)
+    if(api_response.ok):
+        print("getting data between %s and %s" % (startDate, endDate))
+        json_data = json.loads(api_response.content.decode())
+
+    else:
+        sys.exit(
+            "ERROR in getting data between %s and %s: %s"
+            % (startDate, endDate, api_response.status_code)
+        )
+    # the returned endDate is the day before this window's start
+    endDate = pd.to_datetime(startDate) - pd.Timedelta(1, unit="d")
+
+    return json_data, endDate
+
+
+def get_data(
+        weeks_of_data=10*52,
+        save_data_path=os.path.abspath(
+            os.path.join(
+                os.path.dirname(__file__),
+                "..",
+                "data",
+                "PHI-" + current_date + "-donor-data",
+                "PHI-" + current_date + "-jsonData",
+            )
+        ),
+        overwrite_hours=24,
+        donor_group=np.nan,
+        userid=np.nan,
+        auth=np.nan,
+        email=np.nan,
+        password=np.nan,
+        save_file="False",
+):
+    """Download a user's Tidepool data; save as json or return a DataFrame."""
+    if pd.notnull(donor_group):
+        if donor_group == "bigdata":
+            dg = ""
+        else:
+            dg = donor_group
+
+        auth = environmentalVariables.get_environmental_variables(dg)
+
+    if pd.isnull(auth):
+        if pd.isnull(email):
+            email = input("Enter Tidepool email address:\n")
+
+        if pd.isnull(password):
+            password = getpass.getpass("Enter password:\n")
+
+        auth = (email, password)
+    # login and keep the session token for the data requests
+    api_call = "https://api.tidepool.org/auth/login"
+    api_response = requests.post(api_call, auth=auth)
+    if(api_response.ok):
+        xtoken = api_response.headers["x-tidepool-session-token"]
+        userid_master = json.loads(api_response.content.decode())["userid"]
+        headers = {
+            "x-tidepool-session-token": xtoken,
+            "Content-Type": "application/json"
+        }
+    else:
+        sys.exit("Error with " + auth[0] + ":" + str(api_response.status_code))
+
+    if pd.isnull(userid):
+        userid = userid_master
+        print(
+            "getting data for the master account since no shared " +
+            "user account was given"
+        )
+
+    print("logging into", auth[0], "...")
+
+    # download user data
+    print("downloading data for {} ...".format(userid))
+    endDate = pd.Timestamp.now() + pd.Timedelta(1, unit="d")
+
+    output_folder = os.path.join(
+        save_data_path,
+        "PHI-" + userid,
+    )
+
+    output_file_path = os.path.join(
+        output_folder,
+        "PHI-{}.json".format(userid)
+    )
+    # skip the download if the folder or file changed within overwrite_hours
+    download_ = True
+    for f in [output_folder, output_file_path]:
+        path_exist = os.path.exists(f)
+        if path_exist:
+            last_save = os.path.getmtime(f)
+            time_threshold = time.time() - (overwrite_hours * 3600)
+            within_time_threshold = last_save > time_threshold
+            if within_time_threshold:
+                download_ = False
+
+    # set first so the final return also works when the download is skipped
+    big_json_file = []
+    if download_:
+
+        if weeks_of_data > 52:
+            years_of_data = int(np.floor(weeks_of_data/52))
+
+            for years in range(0, years_of_data + 1):
+                # offset arithmetic: `day + 1` was invalid at month ends
+                startDate = (
+                    endDate - pd.DateOffset(years=1)
+                    + pd.Timedelta(1, unit="d")
+                )
+                json_data, endDate = get_data_api(
+                    userid,
+                    startDate,
+                    endDate,
+                    headers
+                )
+
+                big_json_file = big_json_file + json_data
+
+        else:
+            startDate = (
+                pd.to_datetime(endDate) - pd.Timedelta(weeks_of_data*7, "d")
+            )
+
+            json_data, _ = get_data_api(
+                userid,
+                startDate,
+                endDate,
+                headers
+                )
+
+            big_json_file = big_json_file + json_data
+
+        # save data
+        if len(big_json_file) > 1:
+            if "T" in str(save_file).upper():
+                make_folder_if_doesnt_exist(output_folder)
+                print("saving data for {}".format(userid))
+                with open(output_file_path, 'w') as outfile:
+                    json.dump(big_json_file, outfile)
+            else:
+                print("{} has data, but will not be saved".format(userid))
+        else:
+            print("{} has no data".format(userid))
+
+        # logout
+        api_call = "https://api.tidepool.org/auth/logout"
+        api_response = requests.post(api_call, auth=auth)
+
+        if(api_response.ok):
+            print("successfully logged out of", auth[0])
+
+        else:
+            sys.exit(
+                "Error with logging out for " +
+                auth[0] + ":" + str(api_response.status_code)
+            )
+    else:
+        print(
+            "skipping bc {}'s data was downloaded (attempted)".format(userid)
+            + " within the last {} hours".format(overwrite_hours)
+        )
+
+    if "T" in str(save_file).upper():
+        return np.nan, userid
+    else:
+        df = pd.DataFrame(big_json_file)
+        return df, userid
+
+
+# %% MAIN
+if __name__ == "__main__":
+    # USER INPUTS (choices to be made in order to run the code)
+    codeDescription = "get donor json file"
+    parser = argparse.ArgumentParser(description=codeDescription)
+
+    parser.add_argument(
+        "-o",
+        "--output-data-path",
+        dest="data_path",
+        default=os.path.abspath(
+            os.path.join(
+                os.path.dirname(__file__),
+                "..",
+                "data",
+                "PHI-" + current_date + "-donor-data",
+                "PHI-" + current_date + "-jsonData",
+            )
+        ),
+        help="the output path where the data is stored"
+    )
+
+    parser.add_argument(
+        "-w", "--weeks-of-data",
+        dest="weeks_of_data",
+        type=int,  # get_data compares this with 52 and multiplies it by 7
+        default=2,  # 52*10,  # go back the last 10 years as default
+        help="enter the number of weeks of data you want to download"
+    )
+
+    parser.add_argument(
+        "-ow", "--over-write",
+        dest="overwrite_hours",
+        type=float,  # get_data multiplies this by 3600
+        default=24,
+        help="if data was downloaded in the last <24> hours, skip download"
+    )
+
+    parser.add_argument(
+        "-dg",
+        "--donor-group",
+        dest="donor_group",
+        default=np.nan,
+        help="name of the donor group in the tidepool .env file"
+    )
+
+    parser.add_argument(
+        "-u",
+        "--userid",
+        dest="userid",
+        default=np.nan,
+        help="userid of account shared with the donor group or master account"
+    )
+
+    parser.add_argument(
+        "-a",
+        "--auth",
+        dest="auth",
+        default=np.nan,
+        help="tuple that contains (email, password)"
+    )
+
+    parser.add_argument(
+        "-e",
+        "--email",
+        dest="email",
+        default=np.nan,
+        help="email address of the master account"
+    )
+
+    parser.add_argument(
+        "-p",
+        "--password",
+        dest="password",
+        default=np.nan,
+        help="password of the master account"
+    )
+
+    parser.add_argument(
+        "-s",
+        "--save_file",
+        dest="save_file",
+        default="true",
+        help="specify whether to save the downloaded donor data"
+    )
+
+    args = parser.parse_args()
+
+    # the main function
+    data, userid = get_data(
+        save_data_path=args.data_path,
+        weeks_of_data=args.weeks_of_data,
+        overwrite_hours=args.overwrite_hours,
+        donor_group=args.donor_group,
+        userid=args.userid,
+        auth=args.auth,
+        email=args.email,
+        password=args.password,
+        save_file=args.save_file,
+    )
diff --git a/projects/bigdata-processing-pipeline/get_stats/__init__.py b/projects/bigdata-processing-pipeline/get_stats/__init__.py
new file mode 100644
index 00000000..e69de29b
diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py
new file mode 100644
index 00000000..3fe2fef9
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py
@@ -0,0 +1,116 @@
+# -*- coding: utf-8 -*-
+"""batch_get_cgm_distributions_and_stats.py
+This is a wrapper script that gets distributions and stats for all donors,
+NOTE: this needs to be refactored because it is currently set up to run
+on json files that are in a snowflake path
+
+"""
+
+# %% REQUIRED LIBRARIES
+import datetime as dt
+import pandas as pd
+import subprocess as sub
+import os
+import glob
+import time
+import argparse
+from multiprocessing import Pool
+
+
+# %% USER INPUTS (choices to be made in order to run the code)
+codeDescription = "get distribution and stats for all donor's json data"
+parser = argparse.ArgumentParser(description=codeDescription)
+
+parser.add_argument(
+    "-i",
+    "--input-json-data-path",
+    dest="json_data_path",
+    default=os.path.abspath(
+        os.path.join(
+            os.path.dirname(__file__), "..", "data", "dremio", "**", "*.json"
+        ),
+    ),
+    help="the path where json data is located"
+)
+
+parser.add_argument(
+    "-d",
+    "--date-stamp",
+    dest="date_stamp",
+    default=dt.datetime.now().strftime("%Y-%m-%d"),
+    help="date, in '%%Y-%%m-%%d' format, of the date when " +
+    "donors were accepted"  # "%%": argparse %-formats help strings
+)
+
+parser.add_argument(
+    "-o",
+    "--output-data-path",
+    dest="data_path",
+    default=os.path.abspath(
+        os.path.join(
+            os.path.dirname(__file__), "..", "data"
+        )
+    ),
+    help="the output path where the data is stored"
+)
+
+args = parser.parse_args()
+
+
+# %% FUNCTIONS
+def run_process(json_data_path):
+    userid = json_data_path[-15:-5]  # the 10 chars before the last 5 (".json")
+
+    # check to see if the file was already processed
+    phi_date_stamp = "PHI-" + args.date_stamp
+
+    metadata_path = os.path.join(
+        args.data_path,
+        phi_date_stamp + "-donor-data",
+        phi_date_stamp + "-cgm-metadata"
+    )
+
+    all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
+    if userid not in str(all_metadata_files):
+        # no metadata file name contains this userid: run the per-donor script
+        p = sub.Popen(
+            [
+                 "python", "get_cgm_distributions_and_stats.py",
+                 "-i", json_data_path,
+                 "-u", userid,
+                 "-d", args.date_stamp,
+                 "-o", args.data_path
+             ],
+            stdout=sub.PIPE,
+            stderr=sub.PIPE
+        )
+        # wait for the child process to exit and collect its stdout/stderr
+        output, errors = p.communicate()
+        output = output.decode("utf-8")
+        errors = errors.decode("utf-8")
+
+        if errors == '':
+            print(output)
+        else:
+            print(errors)
+    else:
+        print(userid, "was already processed")
+
+    return
+
+
+# %% GET A LIST OF DONOR JSON FILE LOCATIONS
+all_files = glob.glob(args.json_data_path, recursive=True)
+
+# use multiple cores to process; the guard keeps spawn-started workers, which
+# re-import this module, from creating pools of their own
+if __name__ == "__main__":
+    startTime = time.time()
+    print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S"))
+    with Pool(int(os.cpu_count())) as pool:
+        pool.map(run_process, all_files)
+    endTime = time.time()
+    finish_stamp = dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
+    print("finshed pulling data at " + finish_stamp)
+    total_duration = round((endTime - startTime) / 60, 1)
+    print("total duration was %s minutes" % total_duration)
diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py
new file mode 100644
index 00000000..b8bac502
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py
@@ -0,0 +1,121 @@
+# -*- coding: utf-8 -*-
+"""combine_cgm_distribution_and_metadata_results.py
+This is a wrapper script that gets distributions and stats for all donors,
+NOTE: this needs to be refactored because it is currently set up to run
+on json files that are in a snowflake path
+
+"""
+
+# %% REQUIRED LIBRARIES
+import pandas as pd
+import numpy as np
+import os
+import glob
+import argparse
+
+
+# %% USER INPUTS (choices to be made in order to run the code)
+codeDescription = "get distribution and stats for all donor's json data"
+parser = argparse.ArgumentParser(description=codeDescription)
+
+parser.add_argument(
+    "-d",
+    "--date-stamp",
+    dest="date_stamp",
+    default="2019-07-17",
+    help="date, in '%%Y-%%m-%%d' format, of the date when " +
+    "donors were accepted"  # "%%": argparse %-formats help strings
+)
+
+parser.add_argument(
+    "-o",
+    "--output-data-path",
+    dest="data_path",
+    default=os.path.abspath(
+        os.path.join(
+            os.path.dirname(__file__), "..", "data"
+        )
+    ),
+    help="the output path where the data is stored"
+)
+
+
+parser.add_argument(
+    "-c",
+    "--chunk-size",
+    dest="chunk_size",
+    default=50,
+    help="the number of distribution files to combine into each saved chunk"
+)
+
+args = parser.parse_args()
+
+
+# %% COMBINE AND SAVE ALL DONOR METADATA
+print("combining all metadata")
+phi_date_stamp = "PHI-" + args.date_stamp
+donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data")
+
+metadata_path = os.path.join(
+    args.data_path,
+    phi_date_stamp + "-donor-data",
+    phi_date_stamp + "-cgm-metadata"
+)
+
+all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
+print("combining {} metaata files".format(len(all_metadata_files)))
+all_metadata = pd.DataFrame()
+for f in all_metadata_files:
+    temp_meta = pd.read_csv(f, low_memory=False)
+    all_metadata = pd.concat(
+        [all_metadata, temp_meta],
+        ignore_index=True,
+        sort=False
+    )
+
+all_metadata.to_csv(
+    os.path.join(
+        donor_folder,
+        phi_date_stamp
+        + "-cgm-metadata-0-{}.csv.gz".format(str(len(all_metadata_files)))
+    )
+)
+print("finished saving metadata...starting distribution data...")
+
+
+# %% COMBINE AND SAVE ALL DISTRIBUTION DATA
+metadata_path = os.path.join(
+    args.data_path,
+    phi_date_stamp + "-donor-data",
+    phi_date_stamp + "-cgm-distributions"
+)
+
+all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
+print("combining {} distribution data files".format(len(all_metadata_files)))
+chunks = np.arange(0, len(all_metadata_files), int(args.chunk_size))
+chunks = np.append(chunks, len(all_metadata_files))
+for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]):
+    print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end)))
+    distribution_metadata = pd.DataFrame()
+    for c_idx in np.arange(chunk_start, chunk_end):
+        temp_meta = pd.read_csv(
+            all_metadata_files[c_idx],
+            index_col=[0],
+            low_memory=False
+        )
+        distribution_metadata = pd.concat(
+            [distribution_metadata, temp_meta],
+            ignore_index=True,
+            sort=False
+        )
+    # save chunk
+    print("saving chunk {}-{}".format(str(chunk_start), str(chunk_end)))
+    distribution_metadata.to_csv(
+        os.path.join(
+            donor_folder,
+            phi_date_stamp + "-cgm-distributions-{}-{}.csv.gz".format(
+                str(chunk_start),
+                str(chunk_end))
+        )
+    )
+print("finished saving all-dataset-distribution-data...code complete")
diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py
new file mode 100644
index 00000000..12abb350
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py
@@ -0,0 +1,92 @@
+# -*- coding: utf-8 -*-
+"""combine_cgm_distribution_results.py
+This is a wrapper script that gets distributions and stats for all donors,
+NOTE: this needs to be refactored because it is currently set up to run
+on json files that are in a snowflake path
+
+"""
+
+# %% REQUIRED LIBRARIES
+import pandas as pd
+import numpy as np
+import os
+import glob
+import argparse
+
+
+# %% USER INPUTS (choices to be made in order to run the code)
+codeDescription = "get distribution and stats for all donor's json data"
+parser = argparse.ArgumentParser(description=codeDescription)
+
+parser.add_argument(
+    "-d",
+    "--date-stamp",
+    dest="date_stamp",
+    default="2019-07-17",
+    help="date, in '%%Y-%%m-%%d' format, of the date when " +
+    "donors were accepted"  # "%%": argparse %-formats help strings
+)
+
+parser.add_argument(
+    "-o",
+    "--output-data-path",
+    dest="data_path",
+    default=os.path.abspath(
+        os.path.join(
+            os.path.dirname(__file__), "..", "data"
+        )
+    ),
+    help="the output path where the data is stored"
+)
+
+parser.add_argument(
+    "-c",
+    "--chunk-size",
+    dest="chunk_size",
+    default=50,
+    help="the number of distribution files to combine into each saved chunk"
+)
+
+args = parser.parse_args()
+
+
+# %% COMBINE AND SAVE ALL DISTRIBUTION DATA
+
+phi_date_stamp = "PHI-" + args.date_stamp
+donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data")
+
+metadata_path = os.path.join(
+    args.data_path,
+    phi_date_stamp + "-donor-data",
+    phi_date_stamp + "-cgm-distributions"
+)
+
+all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz"))
+print("combining {} distribution data files".format(len(all_metadata_files)))
+chunks = np.arange(0, len(all_metadata_files), int(args.chunk_size))
+chunks = np.append(chunks, len(all_metadata_files))
+for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]):
+    print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end)))
+    distribution_metadata = pd.DataFrame()
+    for c_idx in np.arange(chunk_start, chunk_end):
+        temp_meta = pd.read_csv(
+            all_metadata_files[c_idx],
+            index_col=[0],
+            low_memory=False
+        )
+        distribution_metadata = pd.concat(
+            [distribution_metadata, temp_meta],
+            ignore_index=True,
+            sort=False
+        )
+    # save chunk
+    print("saving chunk {}-{}".format(str(chunk_start), str(chunk_end)))
+    distribution_metadata.to_csv(
+        os.path.join(
+            donor_folder,
+            phi_date_stamp + "-cgm-distributions-{}-{}.csv.gz".format(
+                str(chunk_start),
+                str(chunk_end))
+        )
+    )
+print("finished saving all-dataset-distribution-data...code complete")
diff --git a/projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py
new file mode 100644
index 00000000..f691f506
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py
@@ -0,0 +1,2397 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+calculate cgm statistics for a single tidepool (donor) dataset
+'''
+
+
+# %% REQUIRED LIBRARIES
+import os
+import sys
+import hashlib
+import pytz
+import numpy as np
+import pandas as pd
+import datetime as dt
+import glob
+import pdb
+# TODO: figure out how to get rid of these path dependencies
+get_donor_data_path = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..")
+)
+if get_donor_data_path not in sys.path:
+    sys.path.insert(0, get_donor_data_path)
+import environmentalVariables
+from get_donor_data.get_single_donor_metadata import get_shared_metadata
+from get_donor_data.get_single_tidepool_dataset import get_data
+from get_donor_data.get_single_tidepool_dataset_json import make_folder_if_doesnt_exist
+
+# %% CONSTANTS
+# glucose unit conversion factor (mg/dL per mmol/L); a reading in mg/dL is
+# divided by this value to express it in mmol/L
+MGDL_PER_MMOLL = 18.01559
+
+
+# %% FUNCTIONS
+'''
+the functions that are called in this script,
+which includes notes of where the functions came from,
+and whether they were refactored
+'''
+
+
+def get_episodes(
+        df,
+        episode_criterion="cgm < 54",
+        min_duration=5,
+):
+    '''Label runs of consecutive records that meet an episode criterion
+
+    Args:
+        df: a pandas dataframe that has a boolean column named
+            <episode_criterion> and a "roundedUtcTime" column; every record
+            is counted as 5 minutes of data
+        episode_criterion (str): the name of the boolean column that marks
+            the records that meet the criterion (e.g., "cgm < 54")
+        min_duration (int): the minimum length of a run, in minutes, for its
+            records to be flagged as belonging to an episode
+
+    Returns:
+        a new dataframe (with a fresh 0 to n-1 index) that has the columns
+        isEpisode, episodeId, episodeStart, and episodeTotalDuration, each
+        prefixed with
+        "episode.<episode_criterion>.durationThreshold=<min_duration>."
+
+    NOTE:
+        the helper columns "tempGroups" and "episodeId" are also written to
+        the dataframe that is passed in
+    '''
+    # TODO: deal with case where there are nan's in the middle of an episode
+    # it probably makes sense to interpolate between values iff the gap is
+    # <= 1 to 6 points (5 to 30 minutes)
+
+    # put consecutive data that matches in groups: the counter increases
+    # every time the criterion flips between True and False
+    df["tempGroups"] = ((
+        df[episode_criterion] != df[episode_criterion].shift()
+    ).cumsum())
+
+    # records that do not meet the criterion all collapse into episodeId 0
+    df["episodeId"] = (
+        df["tempGroups"] * df[episode_criterion]
+    )
+
+    # group by the episode groups
+    episode_groups = df.groupby("episodeId")
+    episodes = episode_groups["roundedUtcTime"].count().reset_index()
+    episodes["duration"] = episodes["roundedUtcTime"] * 5
+    episodes.rename(columns={"roundedUtcTime": "episodeCounts"}, inplace=True)
+
+    df = pd.merge(df, episodes, on="episodeId", how="left")
+    # multiplying by the criterion zeroes the duration of non-episode records
+    df["episodeDuration"] = (
+        df["duration"] * df[episode_criterion]
+    )
+
+    # mark record as belonging to an episode
+    df["isEpisode"] = (
+        df["episodeDuration"] >= min_duration
+    )
+
+    # get the episode starts so we only count each episode once.
+    # shift with fill_value keeps the column boolean; shifting and then
+    # filling the nan leaves an object column, and ~ on an object column
+    # is not a reliable logical negation
+    previous_in_criterion = df[episode_criterion].shift(1, fill_value=False)
+    df["episodeStart"] = (
+        (df[episode_criterion])
+        & (~previous_in_criterion)
+    )
+
+    # calculate the total duration and attach to start record
+    # which is needed to get the average duration per episode
+    df["episodeTotalDuration"] = (
+        df["episodeStart"] * df["episodeDuration"]
+    )
+    # assign the result back instead of replacing in place on a column
+    # selection, which is not guaranteed to update the dataframe
+    df["episodeTotalDuration"] = (
+        df["episodeTotalDuration"].replace(0, np.nan)
+    )
+
+    episode_prefix = (
+        "episode." + episode_criterion
+        + ".durationThreshold=" + str(min_duration) + "."
+    )
+
+    df = df[[
+        "isEpisode", "episodeId", "episodeStart", "episodeTotalDuration"
+    ]].add_prefix(episode_prefix)
+
+    return df
+
+
+def get_slope(y):
+    '''Least-squares slope of values that are spaced 5 units (minutes) apart
+
+    Args:
+        y: an array of values; the i-th value is paired with x = 5 * i
+
+    Raise:
+        TypeError: if the type name of y does not contain "array"
+
+    Returns:
+        the slope of the ordinary least-squares line through (x, y)
+    '''
+    if "array" not in type(y).__name__:
+        raise TypeError('Expecting a numpy array')
+
+    n_points = len(y)
+    minutes = np.arange(start=0, stop=n_points*5, step=5)
+
+    total_minutes = minutes.sum()
+    numerator = (n_points * (minutes * y).sum()) - (total_minutes * y.sum())
+    denominator = (
+        (n_points * (minutes * minutes).sum())
+        - (total_minutes * total_minutes)
+    )
+
+    return numerator / denominator
+
+
+def expand_entire_dict(ts):
+    '''Expand a series of dictionaries into a dataframe (one column per key)
+
+    Args:
+        ts: a pandas series whose non-null values are dictionaries
+
+    Raise:
+        TypeError: if you don't pass in a pandas series
+
+    Returns:
+        a dataframe that only has rows for the non-null values of ts, and
+        that keeps the index labels of those values
+    '''
+    if "Series" not in type(ts).__name__:
+        raise TypeError('Expecting a pandas time series object')
+
+    populated_labels = ts.index[ts.notnull()]
+    records = ts[populated_labels].tolist()
+
+    return pd.DataFrame(records, index=populated_labels)
+
+
+def expand_embedded_dict(ts, key_):
+    '''Pull one key out of a field that has embedded json
+
+    Args:
+        ts: a pandas series of the field that has embedded json
+        key_: the key that you want to expand
+
+    Raise:
+        TypeError: if you don't pass in a pandas series
+
+    Returns:
+        a new series named "<ts.name>.<key_>" that has the same index as ts;
+        it holds the values of the key where ts is not null, and it is left
+        unfilled when none of the dictionaries has the key
+
+    TODO:
+        could be refactored to allow multiple keys or all keys to be returned
+        could be refactored for speed, e.g., by only getting the one field
+        of interest instead of expanding every key
+    '''
+
+    if "Series" not in type(ts).__name__:
+        raise TypeError('Expecting a pandas time series object')
+
+    expanded_key = pd.Series(name=ts.name + "." + key_, index=ts.index)
+    has_value = ts.notnull()
+
+    # every key is expanded, and then only the key of interest is kept
+    all_keys = expand_entire_dict(ts)
+    if key_ in list(all_keys):
+        expanded_key[has_value] = all_keys[key_].values
+
+    return expanded_key
+
+
+def get_embedded_field(ts, embedded_field):
+    '''Get a field that is nested in one or more embedded dictionaries (json)
+
+    Args:
+        ts: a pandas series of the field that has embedded json
+        embedded_field (str): the location of the nested field, written in
+            "." notation (e.g., "origin.payload.device.model")
+
+    Raise:
+        ValueError: if embedded_field has fewer than 2 "." separated parts
+
+    Returns:
+        new_ts: a new series of the key of interest
+
+    NOTE:
+        the first part of embedded_field is not used for a lookup
+        (presumably it names the column that ts came from); the expansion
+        starts with the second part
+    '''
+    field_path = embedded_field.split(".")
+    if len(field_path) < 2:
+        raise ValueError('Expecting at least 1 embedded field')
+
+    # walk down the nested dictionaries one level at a time
+    new_ts = ts
+    for nested_key in field_path[1:]:
+        new_ts = expand_embedded_dict(new_ts, nested_key)
+
+    return new_ts
+
+
+def add_upload_info_to_cgm_records(groups, df):
+    '''Attach the device information of the upload records to other records
+
+    Args:
+        groups: a pandas groupby object that has a "type" column and that
+            can return the upload records with get_group("upload")
+        df: a dataframe with an "uploadId" column (e.g., the cgm records)
+
+    Returns:
+        df left-merged (uploadId == upload.uploadId) with the available
+        "upload." prefixed device fields; df is returned unchanged when
+        there are no upload records
+    '''
+    upload_locations = [
+        "upload.uploadId",
+        "upload.deviceManufacturers",
+        "upload.deviceModel",
+        "upload.deviceSerialNumber",
+        "upload.deviceTags"
+    ]
+
+    # NOTE: unique() on a grouped column is indexed by the group keys,
+    # so this checks whether there is an "upload" group
+    if "upload" in groups["type"].unique():
+        upload = (
+            groups.get_group("upload")
+            .dropna(axis=1, how="all")
+            .add_prefix("upload.")
+        )
+        # keep the columns in the order of upload_locations; selecting them
+        # through a set intersection made the column order arbitrary
+        available_locations = [
+            location for location in upload_locations
+            if location in list(upload)
+        ]
+        df = pd.merge(
+            left=df,
+            right=upload[available_locations],
+            left_on="uploadId",
+            right_on="upload.uploadId",
+            how="left"
+        )
+
+    return df
+
+
+def expand_heathkit_cgm_fields(df):
+    '''Expand the nested healthkit (origin and payload) fields into columns
+
+    Args:
+        df: a dataframe that may have "origin" and/or "payload" columns of
+            embedded json; three previously expanded origin columns are
+            dropped from it in place
+
+    Returns:
+        df with one extra column per key of each location that is present,
+        named "<location>.<key>"
+    '''
+    # TODO: refactor the code/function that originally grabs
+    # these fields, so we are only doing it once, and so
+    # we don't have to drop the columns for the code below to work.
+    previously_expanded = [
+        'origin.payload.device.name',
+        'origin.payload.device.manufacturer',
+        'origin.payload.sourceRevision.source.name'
+    ]
+    columns_to_drop = [
+        column for column in previously_expanded if column in list(df)
+    ]
+    df.drop(columns=columns_to_drop, inplace=True)
+
+    # the order matters: a parent location has to be expanded before the
+    # locations that are nested inside of it can be found
+    nested_locations = (
+        "origin",
+        "origin.payload",
+        "origin.payload.device",
+        "origin.payload.sourceRevision",
+        "origin.payload.sourceRevision.source",
+        "payload",
+    )
+
+    for location in nested_locations:
+        if location not in list(df):
+            continue
+        expanded = expand_entire_dict(df[location].copy())
+        df = pd.concat([df, expanded.add_prefix(location + ".")], axis=1)
+
+    return df
+
+
+def get_dexcom_cgm_model(df):
+    '''Infer the dexcom cgm model (G4, G5, G6, or G5_G6) of each record
+
+    Every field in dexcom_model_locations that exists in df is cast to
+    string and searched (case insensitive) for "G4", "G5", and "G6". For the
+    transmitter id fields, the length and first character of the string are
+    also used.
+
+    Args:
+        df: a dataframe that already has a "cgmModel" column; the columns
+            "cgmModel" and "cgmModelSensedFrom" are written to it in place
+
+    Returns:
+        the "cgmModel" and "cgmModelSensedFrom" columns of df
+
+    NOTE(review):
+        m_idx only decides whether a location gets checked at all; the
+        assignments below are not limited to the m_idx rows, so a match in
+        a later location overwrites a model that was set from an earlier
+        location. Confirm whether that is intended.
+    '''
+    # add cgm model
+
+    # the locations are checked in this order
+    dexcom_model_locations = [
+        "deviceId",
+        "deviceManufacturers",
+        "upload.deviceManufacturers",
+        "deviceModel",
+        "upload.deviceModel",
+        "deviceSerialNumber",
+        "upload.deviceSerialNumber",
+        "origin.payload.sourceRevision.source.name",
+        "payload.transmitterGeneration",
+        "payload.HKMetadataKeySyncIdentifier",
+        "payload.transmitterId",
+    ]
+
+    for model_location in dexcom_model_locations:
+        # only check if model has NOT been determined, or if it is G5_G6
+        m_idx = (
+            (df["cgmModel"].isnull())
+            | (df["cgmModel"].astype(str).str.contains("G5_G6"))
+        )
+
+        # get index that matches model
+        if ((model_location in list(df)) & (m_idx.sum() > 0)):
+            # NOTE: astype(str) turns missing values into the string "nan"
+            str_list = df[model_location].astype(str).str
+
+            # G4
+            g4_idx = str_list.contains("G4", case=False, na=False)
+            df.loc[g4_idx, "cgmModel"] = "G4"
+            df.loc[g4_idx, "cgmModelSensedFrom"] = model_location
+
+            # G5
+            g5_idx = str_list.contains("G5", case=False, na=False)
+            df.loc[g5_idx, "cgmModel"] = "G5"
+            df.loc[g5_idx, "cgmModelSensedFrom"] = model_location
+
+            # G6
+            g6_idx = str_list.contains("G6", case=False, na=False)
+            df.loc[g6_idx, "cgmModel"] = "G6"
+            df.loc[g6_idx, "cgmModelSensedFrom"] = model_location
+
+            # edge case of g5 and g6: the string mentions both models
+            g5_g6_idx = (g5_idx & g6_idx)
+            df.loc[g5_g6_idx, "cgmModel"] = "G5_G6"
+            df.loc[g5_g6_idx, "cgmModelSensedFrom"] = model_location
+
+            # case of "transmitterId" (and the healthkit sync identifier)
+            if (
+                ("transmitterId" in model_location)
+                | ("payload.HKMetadataKeySyncIdentifier" in model_location)
+            ):
+                # if length of string is 5, then it is likely a G4 sensor
+                length5_idx = str_list.len() == 5
+                df.loc[length5_idx, "cgmModel"] = "G4"
+                df.loc[length5_idx, "cgmModelSensedFrom"] = model_location
+
+                # if length of string > 5  then might be G5 or G6
+                length_gt5_idx = str_list.len() > 5
+
+                # if the id starts with 4 then it is likely a G5
+                starts4_idx = str_list.startswith("4")
+                df.loc[(length_gt5_idx & starts4_idx), "cgmModel"] = "G5"
+                df.loc[(length_gt5_idx & starts4_idx), "cgmModelSensedFrom"] = model_location
+
+                # if the id starts with 2 or 8 then it is likely a G6
+                # (the variable name says 2_6, but "2" and "8" are checked)
+                starts2_6_idx = (
+                    (str_list.startswith("2")) | (str_list.startswith("8"))
+                )
+                df.loc[(length_gt5_idx & starts2_6_idx), "cgmModel"] = "G6"
+                df.loc[(length_gt5_idx & starts2_6_idx), "cgmModelSensedFrom"] = model_location
+
+    return df[["cgmModel", "cgmModelSensedFrom"]]
+
+
+def get_non_dexcom_cgm_model(df):
+    '''Infer the cgm model of non-dexcom records from the deviceId field
+
+    Args:
+        df: a dataframe that already has a "cgmModel" column; the columns
+            "cgmModel" and "cgmModelSensedFrom" are written to it in place
+
+    Returns:
+        the "cgmModel" and "cgmModelSensedFrom" columns of df
+    '''
+    # (model name, regular expression that identifies the model)
+    # NOTE: for medtronic getting pump type not cgm
+    # NOTE: the patterns are applied in order and a later match overwrites
+    # an earlier one, e.g., the tandem G4 will first be written as G5_G6,
+    # and is then overwritten back to G4
+    model_patterns = [
+        ("670G", "MMT-158|MMT-178"),
+        ("640G", "MMT-1511|MMT-1512|MMT-1711|MMT-1712"),
+        ("630G", "MMT-1514|MMT-1515|MMT-1714|MMT-1715"),
+        (
+            "530G",
+            "530G|MedT-551|MedT-751|MedT-554|MedT-754|Veo - 554|Veo - 754"
+        ),
+        ("523_723", "MedT-523|MedT-723|Revel - 523|Revel - 723"),
+        ("LIBRE", "AbbottFreeStyleLibre"),
+        ("G4", "IR1295"),  # animas
+        ("G5_G6", "tandem"),
+        ("G4", "4628003|5448003"),  # tandem G4
+    ]
+
+    for model_location in ["deviceId"]:
+        # only check if model has NOT been determined, or if it is G5_G6
+        undetermined = (
+            df["cgmModel"].isnull()
+            | df["cgmModel"].astype(str).str.contains("G5_G6")
+        )
+        if (model_location not in list(df)) or (undetermined.sum() == 0):
+            continue
+
+        location_text = df[model_location].astype(str).str
+        for model_name, pattern in model_patterns:
+            matches_model = location_text.contains(pattern, na=False)
+            df.loc[matches_model, "cgmModel"] = model_name
+            df.loc[matches_model, "cgmModelSensedFrom"] = model_location
+
+    return df[["cgmModel", "cgmModelSensedFrom"]]
+
+
+def hash_userid(userid, salt):
+    '''Return the sha256 hex digest of the userid followed by the salt
+
+    taken from anonymize-and-export.py
+    refactored name(s) to meet style guide
+    '''
+    salted_userid = (userid + salt).encode()
+
+    return hashlib.sha256(salted_userid).hexdigest()
+
+
+def get_type(val):
+    '''Return the name of the type of val as a string (e.g., "int")'''
+    type_of_val = type(val)
+    return type_of_val.__name__
+
+
+def remove_negative_durations(df):
+    '''Remove the records that have a negative integer duration
+
+    taken from https://github.com/tidepool-org/data-analytics/blob/
+    etn/get-settings-and-events/projects/get-donors-pump-settings/
+    get-users-settings-and-events.py
+
+    refactored name(s) to meet style guide
+    refactored pandas field call to df["field"] instead of df.field
+    refactored because physical activity includes embedded json, whereas
+    the other fields in the data model require a integer
+    refactored so that the negative-duration mask covers every row; the
+    mask used to be built from the integer rows only, which cannot be
+    aligned with the dataframe when some durations are null or not integers
+    TODO: I think that durations are coming in as floats too, so we need
+    to refactor to account for that.
+
+    Args:
+        df: a dataframe that may have a "duration" column
+
+    Returns:
+        df: the dataframe without the negative duration records
+        n_negative_durations: the number of records that were removed, or
+            nan if there is no "duration" column
+    '''
+    if "duration" not in list(df):
+        return df, np.nan
+
+    # only the values whose type name is "int" are checked (same rule as
+    # get_type), given that some durations are embedded json
+    type_ = df["duration"].apply(lambda val: type(val).__name__)
+    valid_index = ((type_ == "int") & (df["duration"].notnull()))
+
+    # rows that are not valid integers are never flagged as negative
+    is_negative = np.zeros(len(df), dtype=bool)
+    is_negative[valid_index.values] = (
+        (df.loc[valid_index, "duration"] < 0).values
+    )
+
+    n_negative_durations = int(is_negative.sum())
+    if n_negative_durations > 0:
+        df = df[~is_negative]
+
+    return df, n_negative_durations
+
+
+def tslim_calibration_fix(df):
+    '''Copy tandem calibration readings from the payload into "value"
+
+    taken from https://github.com/tidepool-org/data-analytics/blob/
+    etn/get-settings-and-events/projects/get-donors-pump-settings/
+    get-users-settings-and-events.py
+
+    refactored name(s) to meet style guide
+    refactored pandas field call to df["field"] instead of df.field
+    refactored to only expand one field
+    refactored so that only the tandem deviceEvent records are written to;
+    the value used to be assigned to every record that has a calibration
+    reading, which set the non-tandem records to nan
+
+    Args:
+        df: a dataframe with "deviceId" and "type" columns, and optionally
+            a "payload" column of embedded json; the columns
+            "payload.calibration_reading" and "value" are written in place
+
+    Returns:
+        df: the dataframe with the corrected values
+        n_cal_readings: the number of tandem calibration readings found
+    '''
+    if "payload" not in list(df):
+        return df, 0
+
+    # expand payload field one level
+    df["payload.calibration_reading"] = (
+        expand_embedded_dict(df["payload"], "calibration_reading")
+    )
+    cal_index = df["payload.calibration_reading"].notnull()
+    if cal_index.sum() == 0:
+        return df, 0
+
+    # na=False so that records without a deviceId never count as tandem
+    tandem_data_index = (
+        (df["deviceId"].str.contains("tan", na=False))
+        & (df["type"] == "deviceEvent")
+    )
+    valid_index = tandem_data_index & cal_index
+    n_cal_readings = sum(valid_index)
+
+    if n_cal_readings > 0:
+        cal_readings = df.loc[valid_index, "payload.calibration_reading"]
+        # if reading is > 30 then it is in the wrong units (mg/dL)
+        if cal_readings.min() > 30:
+            cal_readings = cal_readings / MGDL_PER_MMOLL
+        df.loc[valid_index, "value"] = cal_readings
+
+    return df, n_cal_readings
+
+
+def replace_smoothed_cgm_values(df):
+    '''Overwrite "mg/dL" with "payload.realTimeValue" wherever it exists
+
+    Args:
+        df: a dataframe with a "mg/dL" column, and optionally a
+            "payload.realTimeValue" column; "mg/dL" is updated in place
+
+    Returns:
+        raw_values: the (updated) "mg/dL" column
+        n_replaced: the number of values that were replaced, or nan if
+            there is no "payload.realTimeValue" column
+    '''
+    real_time_column = 'payload.realTimeValue'
+    if real_time_column not in list(df):
+        return df["mg/dL"], np.nan
+
+    has_real_time_value = df[real_time_column].notnull()
+    df.loc[has_real_time_value, "mg/dL"] = (
+        df.loc[has_real_time_value, real_time_column]
+    )
+
+    return df["mg/dL"], has_real_time_value.sum()
+
+
+def get_healthkit_timezone(df):
+    '''Get the timezone that healthkit stores in payload.HKTimeZone
+
+    Args:
+        df: a dataframe that may have a "payload" column of embedded json;
+            the columns "timezone" and "deviceType" are written in place
+
+    Returns:
+        the "timezone" and "deviceType" columns of df, where the records
+        that have an HKTimeZone get that timezone and the "healthkit"
+        deviceType
+
+    NOTE:
+        the branches that could never run were removed (HKTimeZone always
+        exists once the payload is expanded), and when there is no payload
+        an existing timezone or deviceType column is now kept instead of
+        being overwritten with nan
+    TODO: refactor to account for more efficient way to get embedded json
+    '''
+    if "payload" in list(df):
+        df["payload.HKTimeZone"] = (
+            expand_embedded_dict(df["payload"], "HKTimeZone")
+        )
+        hk_tz_idx = df["payload.HKTimeZone"].notnull()
+        if "timezone" in list(df):
+            # the healthkit timezone takes precedence where it exists
+            df.loc[hk_tz_idx, "timezone"] = (
+                df.loc[hk_tz_idx, "payload.HKTimeZone"]
+            )
+            df.loc[hk_tz_idx, "deviceType"] = "healthkit"
+        else:
+            df.loc[hk_tz_idx, "deviceType"] = "healthkit"
+            df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True)
+
+    else:
+        # nothing to expand: only add the columns that are missing
+        for missing_column in ("timezone", "deviceType"):
+            if missing_column not in list(df):
+                df[missing_column] = np.nan
+
+    return df[["timezone", "deviceType"]]
+
+
+def get_and_fill_timezone(df):
+    '''Get the healthkit timezone and fill the gaps between known timezones
+
+    this is new to deal with healthkit data
+    requires that a data frame that contains payload and HKTimeZone is passed
+
+    Args:
+        df: a dataframe that is passed on to get_healthkit_timezone
+
+    Returns:
+        the "timezone" series, where missing values are filled with the
+        previous known timezone (forward fill), and any values that are
+        still missing at the start are filled with the next known timezone
+        (backward fill)
+    '''
+    timezone_and_device_type = get_healthkit_timezone(df)
+
+    # fill on the series itself and return the result; filling in place on
+    # a column selection is not guaranteed to update the dataframe
+    return timezone_and_device_type["timezone"].ffill().bfill()
+
+
+def make_tz_unaware(date_time):
+    '''Return date_time without its tzinfo (the clock time is unchanged)'''
+    tz_unaware_date_time = date_time.replace(tzinfo=None)
+    return tz_unaware_date_time
+
+
+def to_utc_datetime(df):
+    '''Convert the "time" strings into timezone unaware utc datetimes
+
+    this is new to deal with perfomance issue with the previous method
+    of converting to string to datetime with pd.to_datetime()
+
+    Args:
+        df: a dataframe with a "time" column of utc time strings
+
+    Returns:
+        a series of datetimes in utc that carry no timezone information
+    '''
+    utc_time_tz_aware = pd.to_datetime(
+        df["time"],
+        format="%Y-%m-%dT%H:%M:%S",
+        utc=True
+    )
+    # drop the timezone for the whole series at once, instead of calling
+    # a python function on every element
+    utc_tz_unaware = utc_time_tz_aware.dt.tz_localize(None)
+
+    return utc_tz_unaware
+
+
+# apply the large timezone offset correction (AKA Darin's fix)
+def timezone_offset_bug_fix(df):
+    '''Wrap timezone offsets back into the range of -720 to 840 minutes
+
+    this is taken from estimate-local-time.py
+
+    A day (1440 minutes) is subtracted from every timezoneOffset that is
+    > 840, and added to every timezoneOffset that is < -720, until no such
+    offsets are left. The conversionOffset of the same records is shifted
+    by a day in milliseconds in the same direction.
+
+    Args:
+        df: a dataframe; when it has a "timezoneOffset" column, that column
+            and "conversionOffset" are corrected in place
+
+    Returns:
+        df: the corrected dataframe
+
+    TODO: add in unit testing where there is no TZO that is > 840 or < -720
+    '''
+    if "timezoneOffset" not in list(df):
+        return df
+
+    minutes_per_day = 1440
+    milliseconds_per_day = (1440 * 60 * 1000)
+
+    too_large = df["timezoneOffset"] > 840
+    while too_large.sum() > 0:
+        df.loc[too_large, ["conversionOffset"]] = (
+            df.loc[too_large, ["conversionOffset"]] - milliseconds_per_day
+        )
+        df.loc[too_large, ["timezoneOffset"]] = (
+            df.loc[too_large, ["timezoneOffset"]] - minutes_per_day
+        )
+        too_large = df["timezoneOffset"] > 840
+
+    too_small = df["timezoneOffset"] < -720
+    while too_small.sum() > 0:
+        df.loc[too_small, ["conversionOffset"]] = (
+            df.loc[too_small, ["conversionOffset"]] + milliseconds_per_day
+        )
+        df.loc[too_small, ["timezoneOffset"]] = (
+            df.loc[too_small, ["timezoneOffset"]] + minutes_per_day
+        )
+        too_small = df["timezoneOffset"] < -720
+
+    return df
+
+
+def get_local_time(df):
+    '''Add the offset of the inferred timezone to the utc time
+
+    Args:
+        df: a dataframe with "utcTime" and "inferredTimezone" columns
+
+    Returns:
+        a series of utcTime plus the timezone offset (in minutes) that
+        get_timezone_offset returns for each record
+    '''
+    def offset_of_record(record):
+        utc_time, inferred_timezone = record
+        return get_timezone_offset(utc_time, inferred_timezone)
+
+    offset_minutes = df[['utcTime', 'inferredTimezone']].apply(
+        offset_of_record, axis=1
+    )
+
+    return df['utcTime'] + pd.to_timedelta(offset_minutes, unit="m")
+
+
+def round_time(
+        df,
+        time_interval_minutes=5,
+        start_with_first_record=True,
+        return_calculation_columns=False
+):
+    '''
+    A general purpose round time function that rounds the "time"
+    field to nearest <time_interval_minutes> minutes
+    INPUTS:
+        * a dataframe (df) or time series that contains only one time field
+        that you want to round
+        * time_interval_minutes (defaults to 5 minutes given that most cgms
+        output every 5 minutes)
+        * start_with_first_record starts the rounding with the first record
+        if True, and the last record if False (defaults to True)
+        * return_calculation_columns specifies whether the extra columns
+        used to make calculations are kept on the working dataframe; they
+        are not part of the return value
+    OUTPUT:
+        * an array of the rounded times, in the original order of the records
+    NOTES:
+        * the script exits (sys.exit) if df has more than one column
+        * if a dataframe is passed in, it is renamed, sorted, and given
+        extra columns in place
+    refactored name(s) to meet style guide
+    '''
+    # if a time series is passed in, convert to dataframe
+    if "Series" in get_type(df):
+        df = pd.DataFrame(df)
+    columns_ = list(df)
+    if len(columns_) > 1:
+        sys.exit(
+            "Error: df should only have one time column"
+        )
+    else:
+        # work on a fixed column name, whatever the time column is called
+        df.rename(columns={columns_[0]: "t"}, inplace=True)
+
+    df.sort_values(
+        by="t",
+        ascending=start_with_first_record,
+        inplace=True
+    )
+
+    # keep the original index so the original order can be restored later
+    df.reset_index(drop=False, inplace=True)
+    df.rename(columns={"index": "originalIndex"}, inplace=True)
+
+    # calculate the time between consecutive records
+    # (rounded to a multiple of <time_interval_minutes> minutes)
+    df["t_shift"] = df["t"].shift(1)
+    df["timeBetweenRecords"] = round(
+        (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes))
+        + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes)
+    ) * time_interval_minutes
+
+    # separate the data into chunks if timeBetweenRecords is greater than
+    # 2 times the <time_interval_minutes> minutes so the rounding process
+    # starts over
+    big_gaps = list(
+        df.query("abs(timeBetweenRecords) > "
+                 + str(time_interval_minutes * 2)).index
+    )
+    # the chunk boundaries also include the first row and the end of the data
+    big_gaps.insert(0, 0)
+    big_gaps.append(len(df))
+
+    for gap_index in range(0, len(big_gaps) - 1):
+        chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]]
+        first_chunk = df["t"][big_gaps[gap_index]]
+
+        # calculate the time difference between
+        # each time record and the first record
+        # NOTE(review): .loc slices include the end label, so the first row
+        # of the next chunk is also written here; it looks like that row is
+        # overwritten when the next chunk is processed -- confirm
+        df.loc[
+            big_gaps[gap_index]:big_gaps[gap_index+1],
+            "minutesFromFirstRecord"
+        ] = (
+            (chunk - first_chunk).dt.days*(86400/60)
+            + (chunk - first_chunk).dt.seconds/60
+        )
+
+        # then round to the nearest X Minutes
+        # NOTE: the ".000001" ensures that multiples of 2:30 always round up.
+        df.loc[
+            big_gaps[gap_index]:big_gaps[gap_index+1],
+            "roundedMinutesFromFirstRecord"
+        ] = round(
+            (df.loc[
+                big_gaps[gap_index]:big_gaps[gap_index+1],
+                "minutesFromFirstRecord"
+            ] / time_interval_minutes) + 0.000001
+        ) * (time_interval_minutes)
+
+        # the first record of the chunk is rounded on its own, and the rest
+        # of the chunk is placed relative to that rounded time
+        rounded_first_record = (
+            first_chunk + pd.Timedelta("1microseconds")
+        ).round(str(time_interval_minutes) + "min")
+
+        df.loc[
+            big_gaps[gap_index]:big_gaps[gap_index+1],
+            "roundedTime"
+        ] = rounded_first_record + pd.to_timedelta(
+            df.loc[
+                big_gaps[gap_index]:big_gaps[gap_index+1],
+                "roundedMinutesFromFirstRecord"
+            ], unit="m"
+        )
+
+    if return_calculation_columns is False:
+        df.drop(
+            columns=[
+                "timeBetweenRecords",
+                "minutesFromFirstRecord",
+                "roundedMinutesFromFirstRecord"
+            ], inplace=True
+        )
+    # sort back to the original index
+    df.sort_values(by="originalIndex", inplace=True)
+
+    return df["roundedTime"].values
+
+
+def add_upload_time(df):
+    '''Get the upload time of the upload that each record belongs to
+
+    this is taken from a colab notebook that is not in our github
+    given that it has been refactored to account for bug where there are
+    no upload records
+    NOTE: this is a new fix introduced with healthkit data...we now have
+    data that does not have an upload record
+
+    Args:
+        df: a dataframe with "type", "uploadId", and "utcTime" columns
+
+    Returns:
+        an array with one upload time per record: the latest utcTime of the
+        upload records of the uploadId, or the latest utcTime of all of the
+        records of the uploadId when it has no upload record
+    '''
+    is_upload_record = df["type"] == "upload"
+
+    if is_upload_record.any():
+        upload_times = pd.DataFrame(
+            df[is_upload_record].groupby("uploadId")["utcTime"].max()
+        )
+    else:
+        upload_times = pd.DataFrame(columns=["utcTime"])
+
+    # uploadIds that do not have an upload record get the time of their
+    # most recent record
+    ids_with_upload_record = set(
+        df.loc[is_upload_record, "uploadId"].unique()
+    )
+    ids_without_upload_record = (
+        set(df["uploadId"].unique()) - ids_with_upload_record
+    )
+    for upload_id in ids_without_upload_record:
+        upload_times.loc[upload_id, "utcTime"] = (
+            df.loc[df["uploadId"] == upload_id, "utcTime"].max()
+        )
+
+    # the index is named "uploadId" when it came from the groupby,
+    # and "index" when the dataframe started out empty
+    upload_times = upload_times.reset_index().rename(
+        columns={"utcTime": "uploadTime",
+                 "index": "uploadId"}
+    )
+
+    with_upload_time = pd.merge(df, upload_times, how='left', on='uploadId')
+
+    return with_upload_time["uploadTime"].values
+
+
+def remove_invalid_cgm_values(df):
+    '''Drop the cbg records that are < 38 or > 402 mg/dL
+
+    Args:
+        df: a dataframe with "type" and "mg/dL" columns
+
+    Returns:
+        df: the dataframe without the out of range cbg records
+        nRemoved: the number of records that were removed
+    '''
+    n_records_before = len(df)
+
+    is_cbg = df["type"] == "cbg"
+    out_of_range = (df["mg/dL"] < 38) | (df["mg/dL"] > 402)
+    df = df.drop(df[is_cbg & out_of_range].index)
+
+    return df, n_records_before - len(df)
+
+
+def removeDuplicates(df, criteriaDF):
+    '''Keep the first record of every set of records that match on criteriaDF
+
+    Args:
+        df: a dataframe
+        criteriaDF: the column(s) that define what a duplicate is
+
+    Returns:
+        df: the dataframe without the duplicates, with a fresh index
+        nDuplicatesRemoved: the number of records that were removed
+    '''
+    n_records_before = len(df)
+
+    is_repeat = df[criteriaDF].duplicated()
+    deduplicated = df.loc[~is_repeat].reset_index(drop=True)
+
+    return deduplicated, n_records_before - len(deduplicated)
+
+
+def removeCgmDuplicates(df, timeCriterion, valueCriterion="value"):
+    '''Remove the records that have the same time and the same value
+
+    Args:
+        df: a dataframe with an "uploadTime" column; it is sorted in place
+        timeCriterion: the name of the time column to compare
+        valueCriterion: the name of the value column to compare
+
+    Returns:
+        df: the dataframe without the duplicates, sorted by time and upload
+            time (most recent first); the records that have no time are
+            kept as is. df is returned untouched when it has no
+            timeCriterion column.
+        nDuplicatesRemoved: the number of records that were removed
+    '''
+    if timeCriterion not in df:
+        return df, 0
+
+    # most recent first, so the most recently uploaded record is the one
+    # that is kept out of a set of duplicates
+    sort_options = dict(
+        by=[timeCriterion, "uploadTime"],
+        ascending=[False, False],
+        inplace=True
+    )
+    df.sort_values(**sort_options)
+
+    has_time = df[timeCriterion].notnull()
+    without_time = df[~has_time]
+    with_time, nDuplicatesRemoved = (
+        removeDuplicates(df[has_time], [timeCriterion, valueCriterion])
+    )
+
+    df = pd.concat([without_time, with_time])
+    df.sort_values(**sort_options)
+
+    return df, nDuplicatesRemoved
+
+
+# get rid of spike data
+def remove_spike_data(df):
+    '''Drop the records whose origin mentions "spike" (case insensitive)
+
+    Args:
+        df: a dataframe that may have an "origin" column of embedded json;
+            the three origin fields that are checked are added as columns,
+            and the matching records are dropped, in place
+
+    Returns:
+        df: the dataframe without the spike records
+        nRemoved: the number of records that were removed, or nan if there
+            is no "origin" column
+    '''
+    if "origin" not in list(df):
+        return df, np.nan
+
+    n_records_before = len(df)
+    spike_locations = (
+        "origin.payload.device.name",
+        "origin.payload.device.manufacturer",
+        "origin.payload.sourceRevision.source.name",
+    )
+    for spike_loc in spike_locations:
+        df[spike_loc] = get_embedded_field(df["origin"], spike_loc)
+        location_text = df[spike_loc].astype(str).str.lower()
+        # missing values become the text "nan", so they are ruled out
+        # explicitly
+        is_spike = (
+            df[spike_loc].notnull() & location_text.str.contains("spike")
+        )
+        df.drop(df.index[is_spike], inplace=True)
+
+    return df, n_records_before - len(df)
+
+
+# %% ESTIMATE LOCAL TIME FUNCTIONS
+def convert_deprecated_timezone_to_alias(df, tzAlias):
+    '''Replace the timezone names that have exactly one alias with that alias
+
+    Args:
+        df: a dataframe that may have a "timezone" column, which is
+            updated in place
+        tzAlias: a dataframe with the columns "tz" and "alias"
+
+    Returns:
+        df: the dataframe, where every timezone that matches the end of
+            exactly one "tz" entry is replaced by the alias of that entry
+    '''
+    if "timezone" not in df:
+        return df
+
+    observed_timezones = df.timezone.unique()
+    known_timezones = observed_timezones[pd.notnull(observed_timezones)]
+
+    for timezone_name in known_timezones:
+        is_match = tzAlias.tz.str.endswith(timezone_name)
+        alias = tzAlias.loc[is_match, ["alias"]].values
+        # timezones with no match, or with an ambiguous match, are kept
+        if len(alias) == 1:
+            df.loc[df.timezone == timezone_name, ["timezone"]] = alias
+
+    return df
+
+
+def create_contiguous_day_series(df):
+    '''Make a series of every day between the first and last day of the data
+
+    Args:
+        df: a dataframe with a "date" column
+
+    Returns:
+        a dataframe with one "date" column that has every day from the
+        last day down to the first day of df (most recent day first)
+    '''
+    every_day = pd.date_range(df["date"].min(), df["date"].max()).date
+
+    contiguous_days = pd.DataFrame(every_day, columns=["date"])
+    contiguous_days = contiguous_days.sort_values("date", ascending=False)
+
+    return contiguous_days.reset_index(drop=True)
+
+
+def add_device_type(df):
+    '''Define the device type (pump or cgm) from the device tags
+
+    Args:
+        df: a dataframe that may have "deviceTags" and "deviceType"
+            columns; a missing "deviceType" column is added, and
+            "deviceTags" is cast to string, in place
+
+    Returns:
+        the deviceType series, where the records that had no device type
+        are "pump" if the tags contain "pump", or else "cgm" if the tags
+        contain "cgm"; nan is returned if there is no "deviceTags" column
+    '''
+    col_headings = list(df)
+    if "deviceType" not in col_headings:
+        df["deviceType"] = np.nan
+    if "deviceTags" not in col_headings:
+        return np.nan
+
+    # first make sure deviceTag is in string format
+    df["deviceTags"] = df.deviceTags.astype(str)
+    # filter by type not null device tags
+    tagged = df[df["deviceTags"].notnull()].copy()
+
+    # pump is checked before cgm, and a device type that is already
+    # defined (e.g., healthkit) is never overwritten
+    for tag, device_type in (("pump", "pump"), ("cgm", "cgm")):
+        untyped_with_tag = (
+            (tagged["deviceTags"].str.contains(tag))
+            & (tagged["deviceType"].isnull())
+        )
+        tagged.loc[untyped_with_tag, ["deviceType"]] = device_type
+
+    return tagged["deviceType"]
+
+
+def get_timezone_offset(currentDate, currentTimezone):
+    '''Get the utc offset, in minutes, of a timezone on (the day after) a date
+
+    Args:
+        currentDate: a timezone unaware datetime
+        currentTimezone (str): the name of a timezone that pytz knows
+
+    Returns:
+        tzo (int): the utc offset in minutes (e.g., -480 for UTC-08:00)
+
+    NOTE:
+        the offset is read from utcoffset() instead of being rebuilt from
+        the "+HHMM" text, because splitting the text with floor gave the
+        wrong answer for negative offsets that are not whole hours
+        (e.g., -0330 came out as -310 instead of -210)
+    '''
+    tz = pytz.timezone(currentTimezone)
+    # here we add 1 day to the current date to account for changes to/from DST
+    localized_date = tz.localize(currentDate + dt.timedelta(days=1))
+    tzo = int(round(localized_date.utcoffset().total_seconds() / 60))
+
+    return tzo
+
+
+def add_device_day_series(df, dfContDays, deviceTypeName):
+    '''Add the daily timezone offset of one device type to the day series
+
+    Args:
+        df: the records of one device type, with a "date" column and
+            optionally "timezoneOffset" and "timezone" columns
+        dfContDays: the contiguous day series (has a "date" column)
+        deviceTypeName (str): the prefix of the new columns; the timezone
+            columns are only added when it contains "upload"
+
+    Returns:
+        dfContDays left-merged on "date" with the new columns, which are
+        prefixed with "<deviceTypeName>.": the median timezoneOffset per
+        day and, for upload records that have timezones, the most common
+        timezone per day, its offset, and timeProcessing
+    '''
+    if len(df) > 0:
+        dfDayGroups = df.groupby("date")
+        if "timezoneOffset" in df:
+            dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median())
+        else:
+            dfDaySeries = pd.DataFrame(columns=["timezoneOffset"])
+            dfDaySeries.index.name = "date"
+
+        if "upload" in deviceTypeName:
+            # NOTE: `and` is needed here; with `&` both sides are always
+            # evaluated, which raises a KeyError when there is no
+            # "timezone" column
+            if ("timezone" in df) and (df["timezone"].notnull().sum() > 0):
+                dfDaySeries["timezone"] = (
+                    dfDayGroups.timezone.describe()["top"]
+                )
+                # get the timezone offset for the timezone
+                for i in dfDaySeries.index:
+                    if pd.notnull(dfDaySeries.loc[i, "timezone"]):
+                        tzo = get_timezone_offset(
+                                pd.to_datetime(i),
+                                dfDaySeries.loc[i, "timezone"])
+                        dfDaySeries.loc[i, ["timezoneOffset"]] = tzo
+                # NOTE(review): dfDaySeries never has a "timeProcessing"
+                # column at this point, so timeProcessing is always nan;
+                # this check was probably meant to look in df -- confirm
+                if "timeProcessing" in dfDaySeries:
+                    dfDaySeries["timeProcessing"] = \
+                        dfDayGroups.timeProcessing.describe()["top"]
+                else:
+                    dfDaySeries["timeProcessing"] = np.nan
+
+        dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). \
+            rename(columns={deviceTypeName + ".date": "date"})
+
+        dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(),
+                              on="date", how="left")
+
+    else:
+        dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan
+
+    return dfContDays
+
+
+def impute_upload_records(df, contDays, deviceTypeName):
+    '''Carry the timezone of the upload records over to the days without one
+
+    Args:
+        df: the upload records of one device type
+        contDays: the contiguous day series (has a "date" column)
+        deviceTypeName (str): the prefix of the day series columns
+
+    Returns:
+        the day series from add_device_day_series where, going down the
+        rows, a missing timezone is copied from the previous row, the
+        timezone offset is recalculated for the date of the row, and
+        timeProcessing is copied from the previous row when that is known.
+        The timezone and timeProcessing columns are set to nan when there
+        are no records or no timezones.
+    '''
+    timezone_col = deviceTypeName + ".timezone"
+    offset_col = deviceTypeName + ".timezoneOffset"
+    processing_col = deviceTypeName + ".timeProcessing"
+
+    daySeries = add_device_day_series(df, contDays, deviceTypeName)
+
+    if not ((len(df) > 0) and (timezone_col in daySeries)):
+        daySeries[timezone_col] = np.nan
+        daySeries[processing_col] = np.nan
+        return daySeries
+
+    # the first row has no previous row to copy from, so it is skipped
+    for i in daySeries.index[1:]:
+        previous = i - 1
+
+        if pd.isnull(daySeries.loc[i, timezone_col]):
+            daySeries.loc[i, [timezone_col]] = (
+                daySeries.loc[previous, timezone_col]
+            )
+
+        if pd.notnull(daySeries.loc[i, timezone_col]):
+            # the offset of a timezone depends on the date (DST)
+            daySeries.loc[i, offset_col] = get_timezone_offset(
+                pd.to_datetime(daySeries.loc[i, "date"]),
+                daySeries.loc[i, timezone_col]
+            )
+
+        if pd.notnull(daySeries.loc[previous, processing_col]):
+            daySeries.loc[i, processing_col] = (
+                daySeries.loc[previous, processing_col]
+            )
+
+    return daySeries
+
+
+def add_home_timezone(df, contDays):
+    '''Add the most common (home) timezone and its daily offset to the days
+
+    Args:
+        df: a dataframe that may have a "timezone" column
+        contDays: the contiguous day series (has a "date" column), which
+            is updated in place
+
+    Returns:
+        contDays with the columns "home.imputed.timezoneOffset",
+        "home.imputed.timezone", and "home.imputed.timeProcessing"; the
+        first two are nan when df has no timezones, and timeProcessing is
+        always nan
+    '''
+    # NOTE: `and` is needed here; with `&` both sides are always evaluated,
+    # which raises a KeyError when there is no "timezone" column
+    if ("timezone" in df) and (df["timezone"].notnull().sum() > 0):
+        homeTimezone = df["timezone"].describe()["top"]
+        tzo = contDays.date.apply(
+                lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone))
+
+        contDays["home.imputed.timezoneOffset"] = tzo
+        contDays["home.imputed.timezone"] = homeTimezone
+
+    else:
+        contDays["home.imputed.timezoneOffset"] = np.nan
+        contDays["home.imputed.timezone"] = np.nan
+    contDays["home.imputed.timeProcessing"] = np.nan
+
+    return contDays
+
+
+def estimateTzAndTzoWithUploadRecords(cDF):
+    """Seed the ``est.*`` columns from the upload day series.
+
+    The estimated offset is copied from ``upload.timezoneOffset``. When an
+    ``upload.timezone`` column exists, days that have an upload time zone
+    are typed "UPLOAD" and the time zone and time processing values are
+    copied too; otherwise those two columns are NaN. Days whose estimated
+    offset is non-null and differs from the home imputed offset are
+    annotated "travel". Returns ``cDF``.
+    """
+    cDF["est.type"] = np.nan
+    cDF["est.gapSize"] = np.nan
+    cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"]
+    cDF["est.annotations"] = np.nan
+
+    if "upload.timezone" not in cDF:
+        cDF["est.timezone"] = np.nan
+        cDF["est.timeProcessing"] = np.nan
+    else:
+        hasUploadTz = cDF["upload.timezone"].notnull()
+        cDF.loc[hasUploadTz, ["est.type"]] = "UPLOAD"
+        cDF["est.timezone"] = cDF["upload.timezone"]
+        cDF["est.timeProcessing"] = cDF["upload.timeProcessing"]
+
+    estTzo = cDF["est.timezoneOffset"]
+    awayFromHome = (estTzo != cDF["home.imputed.timezoneOffset"])
+    isTravelDay = awayFromHome & estTzo.notnull()
+    cDF.loc[isTravelDay, "est.annotations"] = "travel"
+
+    return cDF
+
+
+def assignTzoFromImputedSeries(df, i, imputedSeries):
+    """Copy row ``i`` of an imputed series into the estimate columns.
+
+    The row is typed "DEVICE" and its estimated offset, time zone and time
+    processing values are taken from the ``<imputedSeries>.*`` columns of
+    the same row. ``df`` is written through ``.loc`` and returned.
+    """
+    df.loc[i, ["est.type"]] = "DEVICE"
+
+    for field in ("timezoneOffset", "timezone", "timeProcessing"):
+        df.loc[i, ["est." + field]] = df.loc[i, imputedSeries + "." + field]
+
+    return df
+
+
+def compareDeviceTzoToImputedSeries(df, sIdx, device):
+    """Infer the time zone of days in ``sIdx`` from the imputed series.
+
+    For each day the device offset is compared, in order, with the pump,
+    cgm and healthkit upload series and the home series. The first series
+    that matches supplies the estimate:
+
+    * equal offsets: the imputed time zone and offset are used (DST is
+      already accounted for in the imputed offset);
+    * otherwise, when the series has a time zone, the day is a DST change
+      day for it, and both offsets lie in that zone's offset range: the
+      imputed values are used and the day is annotated "dst-change-day".
+
+    Returns ``df``.
+    """
+    deviceTzoCol = device + ".timezoneOffset"
+    candidateSeries = ["pump.upload.imputed", "cgm.upload.imputed",
+                       "healthkit.upload.imputed", "home.imputed"]
+
+    for i in sIdx:
+        for imputedSeries in candidateSeries:
+            # nothing left to do once an estimate has been made
+            if pd.notnull(df.loc[i, "est.timezone"]):
+                continue
+
+            seriesTzoCol = imputedSeries + ".timezoneOffset"
+            seriesTzCol = imputedSeries + ".timezone"
+
+            if df.loc[i, deviceTzoCol] == df.loc[i, seriesTzoCol]:
+                assignTzoFromImputedSeries(df, i, imputedSeries)
+                df = addAnnotation(df, i,
+                                   "tz-inferred-from-" + imputedSeries)
+                continue
+
+            # without an imputed time zone there is no DST range to check
+            if pd.isnull(df.loc[i, seriesTzCol]):
+                continue
+
+            imputedTimezone = df.loc[i, seriesTzCol]
+            if not isDSTChangeDay(df.loc[i, "date"], imputedTimezone):
+                continue
+
+            dstRange = getRangeOfTZOsForTimezone(imputedTimezone)
+            deviceInRange = df.loc[i, deviceTzoCol] in dstRange
+            seriesInRange = df.loc[i, seriesTzoCol] in dstRange
+            if deviceInRange and seriesInRange:
+                assignTzoFromImputedSeries(df, i, imputedSeries)
+
+                df = addAnnotation(df, i, "dst-change-day")
+                df = addAnnotation(
+                        df, i, "tz-inferred-from-" + imputedSeries)
+
+    return df
+
+
+def estimateTzAndTzoWithDeviceRecords(cDF):
+    """Estimate TZO (and TZ when possible) from pump and cgm device TZOs.
+
+    Three passes are made over ``cDF``:
+
+    2A. days without a TZO estimate that have a device TZO are compared
+        with the imputed upload and home series;
+    2B. days still without an estimate are compared with the previous
+        day's estimate;
+    2C. DEVICE estimates where the pump and cgm TZO differ by more than
+        60 minutes are marked UNCERTAIN. A difference of up to 60 minutes
+        is tolerated because users often change the time for DST on
+        their devices at different times.
+
+    Returns ``cDF``.
+    """
+    deviceTypes = ["pump", "cgm"]
+
+    def daysNeedingEstimate(frame, deviceType):
+        # days with no TZO estimate yet AND a TZO reported by the device
+        noEstimate = frame["est.timezoneOffset"].isnull()
+        hasDeviceTzo = frame[deviceType + ".timezoneOffset"].notnull()
+        return frame[(noEstimate & hasDeviceTzo)].index
+
+    # 2A. compare the device TZO to the imputed series to infer time zone
+    for deviceType in deviceTypes:
+        pending = daysNeedingEstimate(cDF, deviceType)
+        cDF = compareDeviceTzoToImputedSeries(cDF, pending, deviceType)
+
+    # 2B. fall back to the previous day's TZO / TZ estimate
+    for deviceType in deviceTypes:
+        pending = daysNeedingEstimate(cDF, deviceType)
+        cDF = compareDeviceTzoToPrevDayTzo(cDF, pending, deviceType)
+
+    # 2C. flag DEVICE estimates where pump and cgm disagree by > 60 minutes
+    pumpTzo = cDF["pump.timezoneOffset"]
+    cgmTzo = cDF["cgm.timezoneOffset"]
+    disagreeing = cDF[((cDF["est.type"] == "DEVICE") &
+                       (pumpTzo.notnull()) &
+                       (cgmTzo.notnull()) &
+                       (pumpTzo != cgmTzo))].index
+
+    tzoGap = (cDF.loc[disagreeing, "cgm.timezoneOffset"] -
+              cDF.loc[disagreeing, "pump.timezoneOffset"]).abs()
+    mismatched = tzoGap.index[tzoGap > 60]
+
+    cDF.loc[mismatched, ["est.type"]] = "UNCERTAIN"
+    for i in mismatched:
+        cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch")
+
+    return cDF
+
+
+def imputeTzAndTzo(cDF):
+    """Fill the remaining gaps in the estimated time zone offset.
+
+    When no day has a TZO estimate, every day is marked UNCERTAIN with the
+    annotation "unable-to-impute-tzo". Otherwise gaps that are followed by
+    a day with an estimate are handed, one gap at a time, to
+    ``imputeByTimezone``. Days left over after that loop are filled from
+    the home imputed series when the day just before them has an estimate
+    equal to its home imputed offset, and are marked UNCERTAIN otherwise.
+
+    Returns ``cDF``.
+    """
+
+    # sIndices: days without a TZO estimate; hasTzoIndices: days with one
+    sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index
+    hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index
+    if len(hasTzoIndices) > 0:
+        if len(sIndices) > 0:
+            # remembered before the loop; used below as the end of the
+            # trailing gap
+            lastDay = max(sIndices)
+
+            # keep going while the first missing day is followed by a day
+            # that has an estimate
+            while ((sIndices.min() < max(hasTzoIndices)) &
+                   (len(sIndices) > 0)):
+
+                currentDay, prevDayWithDay, nextDayIdx = \
+                    getImputIndices(cDF, sIndices, hasTzoIndices)
+
+                cDF = imputeByTimezone(cDF, currentDay,
+                                       prevDayWithDay, nextDayIdx)
+
+                # days already annotated as impossible to impute are left
+                # out, otherwise the loop would revisit them forever
+                sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) &
+                                (~cDF["est.annotations"].str.contains(
+                                "unable-to-impute-tzo").fillna(False)))].index
+
+                hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index
+
+            # try to impute to the last day (earliest day) in the dataset
+            # if the last record has a timezone that is the home record, then
+            # impute using the home timezone
+            if len(sIndices) > 0:
+                currentDay = min(sIndices)
+                prevDayWithDay = currentDay - 1
+                gapSize = lastDay - currentDay
+
+                for i in range(currentDay, lastDay + 1):
+                    # the test always looks at the day before the gap, so
+                    # every day in the gap gets the same outcome
+                    if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \
+                      cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]:
+
+                        cDF.loc[i, ["est.type"]] = "IMPUTE"
+
+                        cDF.loc[i, ["est.timezoneOffset"]] = \
+                            cDF.loc[i, "home.imputed.timezoneOffset"]
+
+                        cDF.loc[i, ["est.timezone"]] = \
+                            cDF.loc[i, "home.imputed.timezone"]
+
+                        cDF = addAnnotation(cDF, i, "gap=" + str(gapSize))
+                        cDF.loc[i, ["est.gapSize"]] = gapSize
+
+                    else:
+                        cDF.loc[i, ["est.type"]] = "UNCERTAIN"
+                        cDF = addAnnotation(cDF, i, "unable-to-impute-tzo")
+    else:
+        # nothing to impute from: no day has a TZO estimate
+        cDF["est.type"] = "UNCERTAIN"
+        cDF["est.annotations"] = "unable-to-impute-tzo"
+
+    return cDF
+
+
+def getRangeOfTZOsForTimezone(tz):
+    """Return the offsets covered by time zone ``tz`` in 15-minute steps.
+
+    The offset is sampled with ``getTimezoneOffset`` for 1/1/2017 and
+    5/1/2017; the result is an array running from the smaller to the
+    larger of the two values (inclusive when they are a multiple of 15
+    apart).
+    """
+    sampleDates = ("1/1/2017", "5/1/2017")
+    sampledTzos = [getTimezoneOffset(pd.to_datetime(sampleDate), tz)
+                   for sampleDate in sampleDates]
+
+    lowestTzo = int(min(sampledTzos))
+    highestTzo = int(max(sampledTzos))
+
+    return np.arange(lowestTzo, highestTzo + 1, 15)
+
+
+def getListOfDSTChangeDays(cDF):
+    """Return the dates on which the home time zone offset changes.
+
+    A day is selected when its ``home.imputed.timezoneOffset`` differs
+    from the value in the next row.
+    """
+    homeTzo = cDF["home.imputed.timezoneOffset"]
+    changesOnNextRow = abs(homeTzo - homeTzo.shift(-1)) > 0
+
+    return cDF[changesOnNextRow].date
+
+
+def correctEstimatesAroundDst(df, cDF):
+    """Recompute local time for records close to a DST change day.
+
+    The day-level offset in ``cDF`` is too coarse on the days around a
+    DST change, so for every record dated strictly within two days of a
+    home-time-zone DST change, and that has an estimated time zone, the
+    offset is re-derived from the record's own timestamp.
+
+    Parameters
+    ----------
+    df : record-level DataFrame with ``date``, ``utcTime``,
+        ``est.timezone``, ``est.timezoneOffset`` and ``est.localTime``.
+    cDF : day-level DataFrame used to find the DST change days.
+
+    Returns
+    -------
+    ``df`` with ``est.localTime`` and ``est.timezoneOffset`` updated.
+    """
+
+    # get a list of DST change days for the home time zone
+    dstChangeDays = getListOfDSTChangeDays(cDF)
+
+    # loop through the df within 2 days of a daylight savings time change
+    for d in dstChangeDays:
+        dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) &
+                      (df.date < (d + dt.timedelta(days=2)))].index
+        for dIdx in dstIndex:
+            if pd.notnull(df.loc[dIdx, "est.timezone"]):
+                tz = pytz.timezone(df.loc[dIdx, "est.timezone"])
+                tzRange = getRangeOfTZOsForTimezone(str(tz))
+                minHoursToLocal = min(tzRange)/60
+                # approximate wall-clock time using the smallest offset of
+                # the zone, then let the zone report the offset in effect
+                approxLocalTime = tz.localize(
+                    df.loc[dIdx, "utcTime"] +
+                    dt.timedelta(hours=minHoursToLocal))
+                # utcoffset() is exact for every sign and minute part; the
+                # earlier "%z" arithmetic floored negative half-hour
+                # offsets (e.g. -0330 became -310 instead of -210)
+                tzo = int(approxLocalTime.utcoffset().total_seconds() / 60)
+                localTime = \
+                    df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m")
+                df.loc[dIdx, ["est.localTime"]] = localTime
+                df.loc[dIdx, ["est.timezoneOffset"]] = tzo
+    return df
+
+
+def applyLocalTimeEstimates(df, cDF):
+    """Return the estimated local time of every record in ``df``.
+
+    The day-level estimates in ``cDF`` are left-joined onto the records
+    by ``date``, the estimated offset (minutes) is added to ``utcTime``,
+    and records around DST changes are corrected afterwards. The result
+    is the ``est.localTime`` column as an array.
+    """
+    records = pd.merge(df, cDF, how="left", on="date")
+
+    estimatedOffset = pd.to_timedelta(records["est.timezoneOffset"], unit="m")
+    records["est.localTime"] = records["utcTime"] + estimatedOffset
+
+    records = correctEstimatesAroundDst(records, cDF)
+
+    return records["est.localTime"].values
+
+
+def isDSTChangeDay(currentDate, currentTimezone):
+    """Return True when the offset of ``currentTimezone`` differs between
+    ``currentDate`` and the day before it, as reported by
+    ``getTimezoneOffset``.
+    """
+    today = pd.to_datetime(currentDate)
+    yesterday = pd.to_datetime(currentDate) + dt.timedelta(days=-1)
+
+    tzoToday = getTimezoneOffset(today, currentTimezone)
+    tzoYesterday = getTimezoneOffset(yesterday, currentTimezone)
+
+    return (tzoToday != tzoYesterday)
+
+
+def tzoRangeWithComparisonTz(df, i, comparisonTz):
+    """Return the range of offsets for ``comparisonTz``.
+
+    An empty array is returned when ``comparisonTz`` is null. ``df`` and
+    ``i`` are accepted but not used.
+    """
+    # without a time zone estimate there is no range to calculate
+    if pd.isnull(comparisonTz):
+        return np.array([])
+
+    return getRangeOfTZOsForTimezone(comparisonTz)
+
+
+def tzAndTzoRangePreviousDay(df, i):
+    """Return the previous day's estimated time zone and its offset range.
+
+    The time zone is read from row ``i-1``; the range is empty when that
+    row has no time zone estimate.
+    """
+    previousDayTz = df.loc[i-1, "est.timezone"]
+
+    return previousDayTz, tzoRangeWithComparisonTz(df, i, previousDayTz)
+
+
+def assignTzoFromPreviousDay(df, i, previousDayTz):
+    """Give day ``i`` a DEVICE estimate based on the previous day's TZ.
+
+    The time zone is set to ``previousDayTz``, the offset is recalculated
+    for the date of row ``i``, the time processing value is copied from
+    row ``i-1``, and "tz-inferred-from-prev-day" is appended to the
+    annotations. Returns ``df``.
+    """
+    dayTzo = getTimezoneOffset(pd.to_datetime(df.loc[i, "date"]),
+                               previousDayTz)
+
+    df.loc[i, ["est.type"]] = "DEVICE"
+    df.loc[i, ["est.timezone"]] = previousDayTz
+    df.loc[i, ["est.timezoneOffset"]] = dayTzo
+    df.loc[i, ["est.timeProcessing"]] = df.loc[i-1, "est.timeProcessing"]
+
+    return addAnnotation(df, i, "tz-inferred-from-prev-day")
+
+
+def assignTzoFromDeviceTzo(df, i, device):
+    """Give day ``i`` a DEVICE estimate taken straight from the device TZO.
+
+    The offset comes from ``<device>.timezoneOffset`` and the time
+    processing value from ``<device>.upload.imputed.timeProcessing``. No
+    time zone is assigned. The day is annotated "likely-travel" and
+    "tzo-from-<device>". Returns ``df``.
+    """
+    copiedFrom = {
+        "est.timezoneOffset": device + ".timezoneOffset",
+        "est.timeProcessing": device + ".upload.imputed.timeProcessing",
+    }
+
+    df.loc[i, ["est.type"]] = "DEVICE"
+    for estCol, deviceCol in copiedFrom.items():
+        df.loc[i, [estCol]] = df.loc[i, deviceCol]
+
+    for message in ("likely-travel", "tzo-from-" + device):
+        df = addAnnotation(df, i, message)
+
+    return df
+
+
+def compareDeviceTzoToPrevDayTzo(df, sIdx, device):
+    """Estimate the TZO (and TZ when possible) for the days in ``sIdx`` by
+    comparing the device TZO with the previous day's estimate.
+
+    Index 0 is skipped because every day is compared with row ``i-1``.
+
+    * When the previous day has a TZO estimate, the device TZO is
+      compared with it. If the previous day has a TZO but no TZ, the day
+      only gets an estimate when the two TZOs are equal; otherwise it is
+      left untouched.
+    * When the previous day has no TZO estimate, the device TZO is
+      compared with the home imputed TZO of the same day.
+
+    Depending on the comparison the day is given a DEVICE estimate or is
+    marked UNCERTAIN, and an annotation describing the decision is
+    appended. ``df`` is written through ``.loc`` and returned.
+    """
+
+    for i in sIdx[sIdx > 0]:
+
+        # first see if the previous record has a tzo
+        if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])):
+
+            previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i)
+            timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) -
+                           df.loc[i-1, "est.timezoneOffset"])
+
+            # next see if the previous record has a tz
+            if (pd.notnull(df.loc[i-1, "est.timezone"])):
+
+                if timeDiff == 0:
+                    assignTzoFromPreviousDay(df, i, previousDayTz)
+
+                # see if the previous day's tzo and device tzo are within the
+                # dst range (as that is a common problem with this data)
+                elif ((df.loc[i, device + ".timezoneOffset"] in dstRange)
+                      & (df.loc[i-1, "est.timezoneOffset"] in dstRange)):
+
+                    # then see if it is DST change day
+                    if isDSTChangeDay(df.loc[i, "date"], previousDayTz):
+
+                        df = addAnnotation(df, i, "dst-change-day")
+                        assignTzoFromPreviousDay(df, i, previousDayTz)
+
+                    # if it is not DST change day, then mark this as uncertain
+                    else:
+                        # also, check to see if the difference between device.
+                        # tzo and prev.tzo is less than the expected dst
+                        # difference. There is a known issue where the BtUTC
+                        # procedure puts clock drift into the device.tzo,
+                        # and as a result the tzo can be off by 15, 30,
+                        # or 45 minutes.
+                        if (((df.loc[i, device + ".timezoneOffset"] ==
+                              min(dstRange)) |
+                            (df.loc[i, device + ".timezoneOffset"] ==
+                             max(dstRange))) &
+                           ((df.loc[i-1, "est.timezoneOffset"] ==
+                             min(dstRange)) |
+                            (df.loc[i-1, "est.timezoneOffset"] ==
+                             max(dstRange)))):
+
+                            df.loc[i, ["est.type"]] = "UNCERTAIN"
+                            df = addAnnotation(df, i,
+                                               "likely-dst-error-OR-travel")
+
+                        else:
+
+                            df.loc[i, ["est.type"]] = "UNCERTAIN"
+                            df = addAnnotation(df, i,
+                                               "likely-15-min-dst-error")
+
+                # next see if time difference between device.tzo and prev.tzo
+                # is off by 720 minutes, which is indicative of a common
+                # user AM/PM error
+                elif timeDiff == 720:
+                    df.loc[i, ["est.type"]] = "UNCERTAIN"
+                    df = addAnnotation(df, i, "likely-AM-PM-error")
+
+                # if it doesn't fall into any of these cases, then the
+                # tzo difference is likely due to travel
+                else:
+                    df = assignTzoFromDeviceTzo(df, i, device)
+
+            # the previous day has a tzo but no tz: only accept the device
+            # tzo when it agrees with the previous day's tzo
+            elif timeDiff == 0:
+                df = assignTzoFromDeviceTzo(df, i, device)
+
+        # if there is no previous record to compare with check for dst errors,
+        # and if there are no errors, it is likely a travel day
+        else:
+
+            # NOTE(review): tzAndTzoRangeWithHomeTz is not defined in this
+            # part of the file; presumably it mirrors
+            # tzAndTzoRangePreviousDay using the home time zone -- confirm
+            comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i)
+            timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) -
+                           df.loc[i, "home.imputed.timezoneOffset"])
+
+            if ((df.loc[i, device + ".timezoneOffset"] in dstRange)
+               & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)):
+
+                # see if it is DST change day
+                if isDSTChangeDay(df.loc[i, "date"], comparisonTz):
+
+                    df = addAnnotation(df, i, "dst-change-day")
+                    df.loc[i, ["est.type"]] = "DEVICE"
+                    df.loc[i, ["est.timezoneOffset"]] = \
+                        df.loc[i, device + ".timezoneOffset"]
+                    df.loc[i, ["est.timezone"]] = \
+                        df.loc[i, "home.imputed.timezone"]
+                    df.loc[i, ["est.timeProcessing"]] = \
+                        df.loc[i, device + ".upload.imputed.timeProcessing"]
+
+                # if it is not DST change day, then mark this as uncertain
+                else:
+                    # also, check to see if the difference between device.
+                    # tzo and the home tzo is less than the expected dst
+                    # difference. There is a known issue where the BtUTC
+                    # procedure puts clock drift into the device.tzo,
+                    # and as a result the tzo can be off by 15, 30,
+                    # or 45 minutes.
+                    if (((df.loc[i, device + ".timezoneOffset"] ==
+                          min(dstRange)) |
+                        (df.loc[i, device + ".timezoneOffset"] ==
+                         max(dstRange))) &
+                       ((df.loc[i, "home.imputed.timezoneOffset"] ==
+                         min(dstRange)) |
+                        (df.loc[i, "home.imputed.timezoneOffset"] ==
+                         max(dstRange)))):
+
+                        df.loc[i, ["est.type"]] = "UNCERTAIN"
+                        df = addAnnotation(df, i, "likely-dst-error-OR-travel")
+
+                    else:
+
+                        df.loc[i, ["est.type"]] = "UNCERTAIN"
+                        df = addAnnotation(df, i, "likely-15-min-dst-error")
+
+            # next see if time difference between device.tzo and the home
+            # tzo is off by 720 minutes, which is indicative of a common
+            # user AM/PM error
+            elif timeDiff == 720:
+                df.loc[i, ["est.type"]] = "UNCERTAIN"
+                df = addAnnotation(df, i, "likely-AM-PM-error")
+
+            # if it doesn't fall into any of these cases, then the
+            # tzo difference is likely due to travel
+
+            else:
+                df = assignTzoFromDeviceTzo(df, i, device)
+
+    return df
+
+
+def getImputIndices(df, sIdx, hIdx):
+    """Locate the first gap in the TZO estimates.
+
+    ``sIdx`` holds the indices of days without an estimate and ``hIdx``
+    the indices of days with one. Returns a tuple of
+
+    * the first day without an estimate,
+    * the day just before it,
+    * the closest later day that has an estimate, capped at the last row
+      of ``df``.
+    """
+    firstMissingDay = sIdx.min()
+
+    # distance from the first missing day to every day with an estimate
+    distances = pd.Series(hIdx) - firstMissingDay
+    closestAhead = min(distances[distances >= 0])
+
+    nextDayWithEstimate = min(firstMissingDay + closestAhead, len(df) - 1)
+
+    return firstMissingDay, firstMissingDay - 1, nextDayWithEstimate
+
+
+def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData):
+    """Impute the estimates for the gap ``[currentDay, nextDaywData)``.
+
+    * No day before the gap (``prevDaywData`` < 0): the gap is UNCERTAIN.
+    * Same time zone before and after the gap: every day gets that time
+      zone and the offset calculated for its own date.
+    * Same offset before and after the gap: every day gets that offset
+      (no time zone is assigned).
+    * Anything else: the gap is UNCERTAIN.
+
+    Imputed days are typed "IMPUTE" and record the gap size; uncertain
+    days are annotated "unable-to-impute-tzo". Returns ``df``.
+    """
+    gapSize = (nextDaywData - currentDay)
+    gapDays = range(currentDay, nextDaywData)
+
+    def markGapUncertain(frame):
+        for day in gapDays:
+            frame.loc[day, ["est.type"]] = "UNCERTAIN"
+            frame = addAnnotation(frame, day, "unable-to-impute-tzo")
+        return frame
+
+    def markDayImputed(frame, day):
+        frame.loc[day, ["est.type"]] = "IMPUTE"
+        frame = addAnnotation(frame, day, "gap=" + str(gapSize))
+        frame.loc[day, ["est.gapSize"]] = gapSize
+        return frame
+
+    if not (prevDaywData >= 0):
+        return markGapUncertain(df)
+
+    prevTz = df.loc[prevDaywData, "est.timezone"]
+    if prevTz == df.loc[nextDaywData, "est.timezone"]:
+        for day in gapDays:
+            df.loc[day, ["est.timezone"]] = prevTz
+            df.loc[day, ["est.timezoneOffset"]] = \
+                getTimezoneOffset(pd.to_datetime(df.loc[day, "date"]), prevTz)
+            df = markDayImputed(df, day)
+        return df
+
+    # TODO: this logic should be updated to handle the edge case
+    # where the day before and after the gap have differing TZ, but
+    # the same TZO. In that case the gap should be marked as UNCERTAIN
+    prevTzo = df.loc[prevDaywData, "est.timezoneOffset"]
+    if prevTzo == df.loc[nextDaywData, "est.timezoneOffset"]:
+        for day in gapDays:
+            df.loc[day, ["est.timezoneOffset"]] = prevTzo
+            df = markDayImputed(df, day)
+        return df
+
+    return markGapUncertain(df)
+
+
+def addAnnotation(df, idx, annotationMessage):
+    """Append ``annotationMessage`` to the annotations of row ``idx``.
+
+    An existing annotation is kept and the new message is added after a
+    ", " separator; a null annotation is replaced by the message.
+    Returns ``df``.
+    """
+    existing = df.loc[idx, "est.annotations"]
+
+    if pd.isnull(existing):
+        updated = annotationMessage
+    else:
+        updated = existing + ", " + annotationMessage
+
+    df.loc[idx, ["est.annotations"]] = updated
+
+    return df
+
+
+def getTimezoneOffset(currentDate, currentTimezone):
+    """Return the UTC offset, in minutes, of a time zone for a given date.
+
+    Parameters
+    ----------
+    currentDate : naive datetime (or pandas Timestamp) for the day.
+    currentTimezone : time zone name understood by ``pytz.timezone``.
+
+    Returns
+    -------
+    int : offset from UTC in minutes, negative west of UTC.
+    """
+
+    tz = pytz.timezone(currentTimezone)
+    # here we add 1 day to the current date to account for changes to/from DST
+    localized = tz.localize(currentDate + dt.timedelta(days=1))
+    # utcoffset() is exact for every sign and minute part; parsing "%z"
+    # with floor() turned offsets such as -0330 into -310 instead of -210
+    # and dropped the minutes of offsets between -0100 and +0100
+    tzo = int(localized.utcoffset().total_seconds() / 60)
+
+    return tzo
+
+
+def estimate_local_time(df):
+    """Estimate the local time of every record in ``df``.
+
+    A contiguous day series is built and extended with day series for the
+    uploads, the cgm and pump records that carry a time zone offset, the
+    imputed uploads per device type, and the home time zone. The day-level
+    estimate is then made in three steps (upload records, device offsets,
+    imputation of the remaining gaps) and applied to the records.
+
+    Adds ``date``, ``deviceType`` and, when missing, ``timezoneOffset``
+    columns to ``df``.
+
+    Returns
+    -------
+    (local_time, cDays) : the local time of every record and the day-level
+    DataFrame the estimates were made on.
+    """
+    df["date"] = df["utcTime"].dt.date  # TODO: change this to utcDate later
+    cDays = create_contiguous_day_series(df)
+
+    df["deviceType"] = add_device_type(df)
+    cDays = add_device_day_series(df, cDays, "upload")
+
+    if "timezoneOffset" not in df.columns:
+        df["timezoneOffset"] = np.nan
+
+    # create day series for the cgm (cbg) and pump (bolus) records that
+    # have a time zone offset
+    for recordType, seriesName in (("cbg", "cgm"), ("bolus", "pump")):
+        hasDeviceTzo = df["timezoneOffset"].notnull()
+        devicedf = df[(df["type"] == recordType) & hasDeviceTzo].copy()
+        cDays = add_device_day_series(devicedf, cDays, seriesName)
+
+    # interpolate between upload records of the same deviceType, and create a
+    # day series for interpolated pump, non-hk-cgm, and healthkit uploads
+    for deviceType in ["pump", "cgm", "healthkit"]:
+        uploaddf = df[df["deviceType"] == deviceType].copy()
+        cDays = impute_upload_records(
+            uploaddf, cDays, deviceType + ".upload.imputed"
+        )
+
+    # add a home timezone that also accounts for daylight savings time changes
+    cDays = add_home_timezone(df, cDays)
+
+    # 1. use upload records to estimate tz and tzo
+    cDays = estimateTzAndTzoWithUploadRecords(cDays)
+
+    # 2. use device tzos to estimate tzo and tz (if possible); only pump
+    # and cgm records that have a tzo are used, healthkit and dexcom-api
+    # cgm records are excluded
+    cDays = estimateTzAndTzoWithDeviceRecords(cDays)
+
+    # 3. impute, infer, or interpolate gaps in the estimated tzo and tz
+    cDays = imputeTzAndTzo(cDays)
+
+    # 4. apply the local time estimates to all records
+    return applyLocalTimeEstimates(df, cDays), cDays
+
+
+# %% GET DATA FROM JSON FILE
+# every input and output of this script lives under ../data
+data_path = os.path.join("..", "data")
+
+# donor metadata, one row per donor
+all_donor_metadata = pd.read_csv(
+    os.path.join(data_path,
+                 "PHI-2019-07-17-donor-data",
+                 "PHI-2019-07-17-donor-metadata.csv"),
+    low_memory=False
+)
+
+# glob through the json files that are available
+all_files = glob.glob(
+    os.path.join(data_path, "dremio", "**", "*.json"),
+    recursive=True
+)
+
+# output folders, all siblings inside the donor-data folder
+output_metadata, output_distribution, debug_duplicates, output_stats = (
+    os.path.join(data_path, "PHI-2019-07-17-donor-data", folder_name)
+    for folder_name in (
+        "PHI-2019-07-17-cgm-metadata",
+        "PHI-2019-07-17-cgm-distributions",
+        "PHI-2019-07-17-debug-cgm-duplicates",
+        "PHI-2019-07-17-cgm-stats",
+    )
+)
+
+
+make_folder_if_doesnt_exist(
+    [output_metadata, output_distribution, debug_duplicates, output_stats]
+)
+
+
+# %% START OF CODE
+# lookup table of time zone aliases, read from the working directory
+timezone_aliases = pd.read_csv(
+    "wikipedia-timezone-aliases-2018-04-28.csv", low_memory=False
+)
+
+# donor metadata fields that are carried over for each donor
+donor_metadata_columns = [
+    "userid",
+    "diagnosisType",
+    "diagnosisDate",
+    "biologicalSex",
+    "birthday",
+    "targetTimezone",
+    "targetDevices",
+    "isOtherPerson",
+]
+
+
+## %% load test data on my computer
+## TODO: if data comes in as a .csv, the embedded json fields
+## get saved as a string and need to be unwrapped before those fields
+## can be expanded. IN OTHER WORDS: this code only works with .json data
+#for d_idx in [0]:
+#    userid = "0d4524bc11"
+#    data = pd.read_json(os.path.join(
+#            "..", "data", "dremio", userid, "PHI-{}.json".format(userid)
+#    ))
+
+# %%
+for d_idx in range(0, len(all_files)):
+    data = pd.read_json(all_files[d_idx])
+    userid = all_files[d_idx][-15:-5]
+    metadata = all_donor_metadata.loc[
+        all_donor_metadata["userid"] == userid,
+        donor_metadata_columns
+    ]
+    print("\n", "starting", userid)
+
+    #  HASH USER ID
+    hashid = hash_userid(userid, os.environ['BIGDATA_SALT'])
+    data["userid"] = userid
+    data["hashid"] = hashid
+    metadata["hashid"] = hashid
+
+    #  CLEAN DATA
+    data_fields = list(data)
+
+    # NOTE: moving remove negative durations to type specific cleaning
+    # TODO: ask backend to change "duration" to only include one object type
+
+    # Tslim calibration bug fix
+    data, n_cal_readings = tslim_calibration_fix(data.copy())
+    metadata["nTandemAndPayloadCalReadings"] = n_cal_readings
+
+    # fix large timezoneOffset bug in utcbootstrapping
+    data = timezone_offset_bug_fix(data.copy())
+
+    # add healthkit timezone information
+    # TODO: refactor this function to only require fields that might have hk tz
+    data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy())
+
+    # convert deprecated timezones to their aliases
+    data = convert_deprecated_timezone_to_alias(data, timezone_aliases)
+
+    #  TIME RELATED ITEMS
+    data["utcTime"] = to_utc_datetime(data[["time"]].copy())
+
+    # add upload time to the data, which is needed for:
+    # getting rid of duplicates and useful for getting local time
+
+    data["uploadTime"] = (
+        add_upload_time(data[["type", "uploadId", "utcTime"]].copy())
+    )
+
+#    # estimate local time (refactor of estimate-local-time.py)
+#    data["localTime"], local_time_metadata = estimate_local_time(data.copy())
+#
+# TODO: fix this issue with estimate local time
+#    '''
+#    //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649
+#    FutureWarning: elementwise comparison failed; returning scalar instead,
+#    but in the future will perform elementwise comparison result = method(y)
+#    '''
+
+    # round all data to the nearest 5 minutes
+    data["roundedUtcTime"] = round_time(
+        data["utcTime"].copy(),
+        time_interval_minutes=5,
+        start_with_first_record=True,
+        return_calculation_columns=False
+    )
+
+    #  TIME CATEGORIES
+    data["date"] = data["roundedUtcTime"].dt.date
+
+    # AGE, & YLW
+    # TODO: make this a function
+    if metadata["birthday"].values[0] is not np.nan:
+        bDate = pd.to_datetime(metadata["birthday"].values[0][0:7])
+        data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25)
+    else:
+        data["age"] = np.nan
+
+    if metadata["diagnosisDate"].values[0] is not np.nan:
+        dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7])
+        data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25)
+    else:
+        data["ylw"] = np.nan
+
+    #  GROUP DATA BY TYPE
+    # first sort by upload time (used when removing duplicates)
+    data.sort_values("uploadTime", ascending=False, inplace=True)
+    groups = data.groupby(by="type")
+
+    # check to see if person is looping
+    if "basal" in data["type"].unique():
+        basal = groups.get_group("basal").dropna(axis=1, how="all")
+        if "deliveryType" in list(basal):
+            bd = basal.loc[
+                basal["deliveryType"] == "temp",
+                ["date", "deliveryType"]
+            ]
+            temp_basal_counts = (
+                pd.DataFrame(
+                    bd.groupby("date").deliveryType.count()
+                ).reset_index()
+            )
+            temp_basal_counts.rename(
+                {"deliveryType": "tempBasalCounts"},
+                axis=1,
+                inplace=True
+            )
+            data = pd.merge(data, temp_basal_counts, on="date", how="left")
+            # >= 25 temp basals per day is likely looping
+            data["isLoopDay"] = data["tempBasalCounts"] >= 25
+            # redefine groups with the new data
+            groups = data.groupby(by="type")
+
+        else:
+            data["isLoopDay"] = np.nan
+    else:
+        data["isLoopDay"] = np.nan
+
+    # %% CGM DATA
+    if "cbg" in data["type"].unique():
+        # sort data with
+        metadata["cgmData"] = True
+
+        # filter by cgm
+        cgm = groups.get_group("cbg").copy()
+
+        # sort data
+        cgm.sort_values("roundedUtcTime", ascending=False, inplace=True)
+        cgm.reset_index(drop=False, inplace=True)
+
+        # calculate cgm in mg/dL
+        cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL)
+
+        # get rid of spike data
+        cgm, nSpike = remove_spike_data(cgm.copy())
+        metadata["nSpike"] = nSpike
+
+        # assign upload cgm device info to cgm records in that upload
+        cgm = add_upload_info_to_cgm_records(groups, cgm.copy())
+
+        # check to see if cgm info exists in healthkit locations
+        cgm = expand_heathkit_cgm_fields(cgm.copy())
+
+        # replace smoothed cgm values with raw values (if they exist)
+        # this must run after expand_heathkit_cgm_fields _
+        cgm["mg/dL"], metadata["nSmoothedCgmReplaced"] = (
+            replace_smoothed_cgm_values(cgm.copy())
+        )
+
+        # get cgm models
+        cgm["cgmModel"], cgm["cgmModelSensedFrom"] = np.nan, np.nan
+
+        # dexcom cgm models (G4, G5, G6)
+        cgm[["cgmModel", "cgmModelSensedFrom"]] = (
+            get_dexcom_cgm_model(cgm.copy())
+        )
+
+        # for non dexcom cgms
+        # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem
+        cgm[["cgmModel", "cgmModelSensedFrom"]] = (
+            get_non_dexcom_cgm_model(cgm.copy())
+        )
+
+        # get metadata on cgm models and devices
+        metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum()
+        metadata["uniqueCgmModels"] = str(cgm["cgmModel"].unique())
+        if "deviceId" in list(cgm):
+            metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique())
+
+        #  clean distributions
+        # break up all traces by cgm model
+        combined_cgm_series = pd.DataFrame()
+        cgm_models = cgm.groupby(by="cgmModel")
+
+        for cgm_model in cgm_models.groups.keys():
+            print("working on", cgm_model)
+            temp_cgm = cgm_models.get_group(cgm_model)
+
+            # get rid of cgm values too low/high (< 38 & > 402 mg/dL)
+            temp_cgm, nInvalidCgmValues = remove_invalid_cgm_values(temp_cgm)
+            metadata["nInvalidCgmValues." + cgm_model] = nInvalidCgmValues
+
+            # sort by upload time before getting rid of duplicates
+            temp_cgm.sort_values("uploadTime", ascending=False, inplace=True)
+
+            # get rid of duplicates that have the same ["deviceTime", "mg/dL"]
+            temp_cgm, n_cgm_dups_removed = (
+                removeCgmDuplicates(temp_cgm, "deviceTime", "mg/dL")
+            )
+            metadata["nCgmDuplicatesRemovedDeviceTime." + cgm_model] = (
+                n_cgm_dups_removed
+            )
+
+            # get rid of duplicates that have the same ["time", "mg/dL"]
+            temp_cgm, n_cgm_dups_removed = (
+                removeCgmDuplicates(temp_cgm, "utcTime", "mg/dL")
+            )
+            metadata["nCgmDuplicatesRemovedUtcTime." + cgm_model] = (
+                n_cgm_dups_removed
+            )
+
+            # get rid of duplicates that have the same roundedTime
+            temp_cgm, n_cgm_dups_removed = (
+                removeDuplicates(temp_cgm, "roundedUtcTime")
+            )
+            metadata["nCgmDuplicatesRemovedRoundedTime." + cgm_model] = (
+                n_cgm_dups_removed
+            )
+
+            # create a contiguous 5 minute time series
+            first_day = temp_cgm["roundedUtcTime"].min()
+            metadata["firstCgm." + cgm_model] = first_day
+
+            last_day = temp_cgm["roundedUtcTime"].max()
+            metadata["lastCgm." + cgm_model] = last_day
+
+            rng = pd.date_range(first_day, last_day, freq="5min")
+            contiguous_data = pd.DataFrame(
+                rng,
+                columns=["roundedUtcTime"]
+            ).sort_values(
+                "roundedUtcTime",
+                ascending=False
+            ).reset_index(drop=True)
+
+            # merge with cgm data
+            cgm_series = pd.merge(
+                contiguous_data,
+                temp_cgm[[
+                    "roundedUtcTime", "hashid", "isLoopDay",
+                    "cgmModel", "age", "ylw", "mg/dL"
+                 ]],
+                on="roundedUtcTime",
+                how="left"
+            )
+
+            # sort so that the oldest data point is on top
+            cgm_series.sort_values(
+                "roundedUtcTime", ascending=True, inplace=True
+            )
+            cgm_series.reset_index(drop=True, inplace=True)
+
+            # get dexcom icgm bins
+            value_bins = np.array(
+                [37, 39, 60, 80, 120, 160, 200, 250, 300, 350, 400, 403]
+            )
+            value_bin_names = (
+                "< 40", "40-60", "61-80", "81-120", "121-160", "161-200",
+                "201-250", "251-300", "301-350", "351-400", "> 400"
+            )
+            cgm_series["valueBin"] = pd.cut(
+                cgm_series["mg/dL"], value_bins, labels=value_bin_names
+            )
+
+            # get the previous val
+            cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1)
+
+            # get difference between current and previous val
+            cgm_series["diffFromPrevVal"] = (
+                cgm_series["mg/dL"] - cgm_series["previousVal"]
+            )
+
+            # calculate the rate from previous value (mg/dL/min)
+            cgm_series["rateFromPrevVal"] = cgm_series["diffFromPrevVal"] / 5
+
+            # get dexcom icgm rate bins
+            rate_bins = np.array(
+                [-100, -2.000001, -1.000001, -0.000001, 1, 2, 100]
+            )
+            # NOTE: bracket means include, parentheses means exclude
+            rate_bin_names = (
+                "< -2", "[-2,-1)", "[-1,-0)", "[0,1]", "(1,2]", ">2",
+            )
+            cgm_series["rateBin"] = pd.cut(
+                cgm_series["rateFromPrevVal"], rate_bins, labels=rate_bin_names
+            )
+
+            # throw in the joint (value and rate) category
+            cgm_series["valAndRateBin"] = (
+                cgm_series["valueBin"].astype(str)
+                + " & "
+                + cgm_series["rateBin"].astype(str)
+            )
+
+            # calculate slope (mg/dL/min) over the last 15, 30, and 60 minutes
+            cgm_series["slope15"] = (
+                cgm_series["mg/dL"].rolling(3).apply(get_slope, raw=True)
+            )
+
+            cgm_series["slope30"] = (
+                cgm_series["mg/dL"].rolling(6).apply(get_slope, raw=True)
+            )
+
+            cgm_series["slope60"] = (
+                cgm_series["mg/dL"].rolling(12).apply(get_slope, raw=True)
+            )
+
+            # add in the next value
+            cgm_series["nextVal"] = cgm_series["mg/dL"].shift(-1)
+
+            # get difference or relative increase/decrease of next value
+            cgm_series["relativeNextValue"] = (
+                cgm_series["nextVal"] - cgm_series["mg/dL"]
+            )
+
+            # rate of next value
+            cgm_series["rateToNextVal"] = cgm_series["relativeNextValue"] / 5
+
+            # drop rows where there is no information
+            cgm_series.dropna(subset=['hashid'], inplace=True)
+            metadata["nCgmDataPoints." + cgm_model] = len(cgm_series)
+
+            # append cgm model to a larger table
+            combined_cgm_series = pd.concat(
+                [combined_cgm_series, cgm_series],
+                ignore_index=True
+            )
+        if len(combined_cgm_series) > 0:
+            # sort so that the oldest data point is on top
+            # and that the G5_G6 get deleted if they are a part of a duplicate
+            combined_cgm_series["cgmModel_G5_and_G6"] = (
+                combined_cgm_series["cgmModel"] == "G5_G6"
+            )
+            combined_cgm_series.sort_values(
+                by=["roundedUtcTime", "cgmModel_G5_and_G6", "cgmModel"],
+                ascending=[False, True, False],
+                inplace=True
+            )
+            combined_cgm_series.reset_index(drop=True, inplace=True)
+
+            # add in check to see if there are duplicates between cgm devices
+            nUnique_cgm_times = len(combined_cgm_series["roundedUtcTime"].unique())
+            cgm_len = len(combined_cgm_series)
+            metadata["duplicateCgmDataIssue"] = nUnique_cgm_times != cgm_len
+
+            nDuplicate_cgm = cgm_len - nUnique_cgm_times
+            metadata["nDuplicateCgmDataIssues"] = nDuplicate_cgm
+
+            # if there are still duplicates, get rid of them
+            if nDuplicate_cgm > 0:
+                # save the duplicates for further examination
+                combined_cgm_series.to_csv(os.path.join(
+                    debug_duplicates,
+                    "PHI-" + userid + "-cgm-series-has-cgm-duplicates.csv.gz"
+                ))
+
+                cgm.to_csv(os.path.join(
+                    debug_duplicates,
+                    "PHI-" + userid + "-cgm-data-has-cgm-duplicates.csv.gz"
+                ))
+
+                # get rid of duplicates
+                combined_cgm_series, n_cgm_dups_removed = (
+                    removeDuplicates(combined_cgm_series, "roundedUtcTime")
+                )
+                metadata["nCgmDuplicatesRemovedRoundedTime.atEnd"] = (
+                    n_cgm_dups_removed
+                )
+            metadata["nCgmDataPoints.atEnd"] = len(combined_cgm_series)
+
+            # add whether data is dexcom cgm or not
+            combined_cgm_series["dexcomCgm"] = (
+                combined_cgm_series["cgmModel"].astype(str).str.contains("G4|G5|G6")
+            )
+
+            # save distribution data
+            combined_cgm_series.to_csv(os.path.join(
+                output_distribution,
+                "PHI-" + userid + "-cgm-distribution.csv.gz"
+            ))
+
+            # %% get cgm stats
+            # create a contiguous 5 minute time series of ALL cgm data
+            first_day = combined_cgm_series["roundedUtcTime"].min()
+            metadata["firstCgm." + cgm_model] = first_day
+
+            last_day = combined_cgm_series["roundedUtcTime"].max()
+            metadata["lastCgm." + cgm_model] = last_day
+
+            rng = pd.date_range(first_day, last_day, freq="5min")
+            contiguous_data = pd.DataFrame(
+                rng,
+                columns=["roundedUtcTime"]
+            ).sort_values(
+                "roundedUtcTime",
+                ascending=True
+            ).reset_index(drop=True)
+
+            # merge with combined_cgm_series data
+            all_cgm = pd.merge(
+                contiguous_data,
+                combined_cgm_series[[
+                    'roundedUtcTime', 'hashid', 'cgmModel', 'dexcomCgm',
+                    'age', 'ylw', 'isLoopDay', 'mg/dL',
+                ]],
+                on="roundedUtcTime",
+                how="left"
+            )
+
+            # get cgm stats
+            # get a binary (T/F) of whether we have a cgm value
+            all_cgm["hasCgm"] = all_cgm["mg/dL"].notnull()
+
+            # fill isLoopDay nan with False
+            all_cgm["isLoopDay"].fillna(False, inplace=True)
+
+            # has loop and cgm
+            all_cgm["hasLoopAndCgm"] = (
+                (all_cgm["isLoopDay"]) & (all_cgm["hasCgm"])
+            )
+
+            all_cgm["hasCgmWithoutLoop"] = (
+                (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"])
+            )
+
+            # work with all of the non-null data, even 39 = LOW and 401 = HIGH
+            ts39_401 = all_cgm["mg/dL"].copy()
+
+            # some stats should NOT include 39 or 401
+            all_cgm["mg/dL.40to400"] = (
+                ts39_401.replace(to_replace=39, value=np.nan)
+            )
+
+            all_cgm["mg/dL.40to400"] = (
+                all_cgm["mg/dL.40to400"].replace(
+                    to_replace=401,
+                    value=np.nan
+                )
+            )
+
+            ts40_400 = all_cgm["mg/dL.40to400"].copy()
+
+
+            # for all the less than (<) criteria
+            for cgm_threshold in [40, 54, 70]:
+                all_cgm["cgm < " + str(cgm_threshold)] = (
+                    ts39_401.lt(cgm_threshold)
+                )
+                # get episodes below these thresholds
+                for min_duration in [5, 15]:
+                    episode_ts = get_episodes(
+                        all_cgm[[
+                            "roundedUtcTime",
+                            "hasCgm",
+                            "cgm < " + str(cgm_threshold)
+                        ]].copy(),
+                        episode_criterion="cgm < " + str(cgm_threshold),
+                        min_duration=min_duration
+                    )
+                    all_cgm = pd.concat([all_cgm, episode_ts], axis=1)
+
+            # for all the greater than or equal to (>=) criteria
+                all_cgm["cgm >= " + str(cgm_threshold)] = (
+                    ts39_401.ge(cgm_threshold)
+                )
+
+            # for all the less than or equal to (<=) criteria
+            for cgm_threshold in [140, 180, 250, 300, 400]:
+                all_cgm["cgm <= " + str(cgm_threshold)] = (
+                    ts39_401.le(cgm_threshold)
+                )
+            # for all the greater than (>) criteria
+                all_cgm["cgm > " + str(cgm_threshold)] = (
+                    ts39_401.gt(cgm_threshold)
+                )
+
+            # get all of the cgm ranges
+            # (cgm >= 40) & (cgm < 54)
+            all_cgm["40 <= cgm < 54"] = (
+                (all_cgm["cgm >= 40"]) & (all_cgm["cgm < 54"])
+            )
+
+            # (cgm >= 54) & (cgm < 70)
+            all_cgm["54 <= cgm < 70"] = (
+                (all_cgm["cgm >= 54"]) & (all_cgm["cgm < 70"])
+            )
+
+            # (cgm >= 70) & (cgm <= 140)
+            all_cgm["70 <= cgm <= 140"] = (
+                (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 140"])
+            )
+
+            # (cgm >= 70) & (cgm <= 180)
+            all_cgm["70 <= cgm <= 180"] = (
+                (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 180"])
+            )
+
+            # (cgm > 180) & (cgm <= 250)
+            all_cgm["180 < cgm <= 250"] = (
+                (all_cgm["cgm > 180"]) & (all_cgm["cgm <= 250"])
+            )
+
+            # (cgm > 250) & (cgm <= 400)
+            all_cgm["250 < cgm <= 400"] = (
+                (all_cgm["cgm > 250"]) & (all_cgm["cgm <= 400"])
+            )
+
+            # define the windows to calculate the stats over
+            window_names = ["hour", "day", "week", "month", "quarter", "year"]
+            window_lengths = [12,    288,   288*7,  288*7*4, 288*90,   288*365]
+
+            for w_name, w_len in zip(window_names, window_lengths):
+                # require the full window length for percent calculations
+                w_min = w_len
+
+                # get the start and end times for each window
+                all_cgm[w_name + ".startTime"] = (
+                    all_cgm["roundedUtcTime"].shift(w_len - 1)
+                )
+                all_cgm[w_name + ".endTime"] = all_cgm["roundedUtcTime"]
+
+                # add majority age for the time period
+                all_cgm[w_name + ".age"] = np.round(
+                    all_cgm["age"].rolling(
+                        min_periods=1,
+                        window=w_len
+                    ).mean()
+                )
+
+                # add majority ylw for the time period
+                all_cgm[w_name + ".ylw"] = np.round(
+                    all_cgm["ylw"].rolling(
+                        min_periods=1,
+                        window=w_len
+                    ).median()
+                )
+
+                # get percent time cgm used
+                all_cgm[w_name + ".cgmPercent"] = (
+                    all_cgm["hasCgm"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # get the percent of missing cgm values over this time period
+                all_cgm[w_name + ".missingCgmPercent"] = (
+                    1 - all_cgm[w_name + ".cgmPercent"]
+                )
+
+                # create (T/F) 70 and 80 percent available thresholds
+                # which will be useful for processing later
+                all_cgm[w_name + ".ge70Available"] = (
+                    all_cgm[w_name + ".cgmPercent"] >= 0.7
+                )
+
+                all_cgm[w_name + ".ge80Available"] = (
+                    all_cgm[w_name + ".cgmPercent"] >= 0.8
+                )
+
+                # get percent time Loop was used NOTE: this is
+                # approximate because we use > 24 temp basals per day
+                # ALSO: this is percent time Loop was used while cgm in use
+                all_cgm[w_name + ".loopingAndCgmPercent"] = (
+                    all_cgm["hasLoopAndCgm"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # percent of time cgm without loop
+                all_cgm[w_name + ".cgmWithoutLoopPercent"] = (
+                    all_cgm["hasCgmWithoutLoop"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # get episode stats
+                # TODO: add in hyper events
+                # get episodes below these thresholds
+                for cgm_threshold in [40, 54, 70]:
+                    # get number of episodes per time window
+                    for min_duration in [5, 15]:
+                        "cgm < " + str(cgm_threshold)
+                        episode_name = (
+                            "episode.cgm < " + str(cgm_threshold)
+                            + ".durationThreshold=" + str(min_duration)
+                        )
+                        all_cgm[w_name + ".count." + episode_name] = (
+                            all_cgm[episode_name + ".episodeStart"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).sum()
+                        )
+
+                        # get avg. duration of each episode per time window
+                        all_cgm[w_name + ".avgDuration." + episode_name] = (
+                            all_cgm[episode_name + ".episodeTotalDuration"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).sum() / all_cgm[w_name + ".count." + episode_name]
+                        )
+
+                        # get min duration of each episode per time window
+                        all_cgm[w_name + ".minDuration." + episode_name] = (
+                            all_cgm[episode_name + ".episodeTotalDuration"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).min()
+                        )
+
+                        # get median duration of each episode per time window
+                        all_cgm[w_name + ".medianDuration." + episode_name] = (
+                            all_cgm[episode_name + ".episodeTotalDuration"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).median()
+                        )
+
+                        # get max duration of each episode per time window
+                        all_cgm[w_name + ".maxDuration." + episode_name] = (
+                            all_cgm[episode_name + ".episodeTotalDuration"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).max()
+                        )
+
+                # get percent time in different ranges
+                # % Time < 54
+                all_cgm[w_name + ".lt54Percent"] = (
+                    all_cgm["cgm < 54"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time in 54-70 (cgm >= 54) & (cgm < 70)
+                all_cgm[w_name + ".bt54_70Percent"] = (
+                    all_cgm["54 <= cgm < 70"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time in target range (cgm >= 70) & (cgm <= 180)
+                all_cgm[w_name + ".bt70_180Percent"] = (
+                    all_cgm["70 <= cgm <= 180"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time in 180-250 (cgm > 180) & (cgm <= 250)
+                all_cgm[w_name + ".bt180_250Percent"] = (
+                    all_cgm["180 < cgm <= 250"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time > 250
+                all_cgm[w_name + ".gt250Percent"] = (
+                    all_cgm["cgm > 250"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # check that all of the percentages add up to 1 or 100%
+                all_cgm[w_name + ".percentCheck"] = (
+                     all_cgm[w_name + ".missingCgmPercent"]
+                     + all_cgm[w_name + ".lt54Percent"]
+                     + all_cgm[w_name + ".bt54_70Percent"]
+                     + all_cgm[w_name + ".bt70_180Percent"]
+                     + all_cgm[w_name + ".bt180_250Percent"]
+                     + all_cgm[w_name + ".gt250Percent"]
+                )
+
+                # here are some other less common percent time in ranges
+                # % Time < 70
+                all_cgm[w_name + ".lt70Percent"] = (
+                    all_cgm["cgm < 70"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time in target range (cgm >= 70) & (cgm <= 140)
+                all_cgm[w_name + ".tir70to140Percent"] = (
+                    all_cgm["70 <= cgm <= 140"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # percent time above a threshold
+                # % Time > 180
+                all_cgm[w_name + ".gt180Percent"] = (
+                    all_cgm["cgm > 180"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # quantiles
+                # NOTE: this will increase run time, so only run if you need
+                # 3-4X the processing time since it has to sort the data
+                # TODO: make this an option to the function, once it is made
+                # create a rolling object
+
+                # NOTE: these calculations only require 3 points to make
+                roll39_401 = ts39_401.rolling(min_periods=3, window=w_len)
+                roll40_400 = ts40_400.rolling(min_periods=3, window=w_len)
+
+                # min
+                all_cgm[w_name + ".min"] = roll39_401.min()
+
+                # 10, 25, 75, and 90th percentiles
+                all_cgm[w_name + ".10th"] = roll39_401.quantile(0.10)
+                all_cgm[w_name + ".25th"] = roll39_401.quantile(0.25)
+                all_cgm[w_name + ".75th"] = roll39_401.quantile(0.75)
+                all_cgm[w_name + ".90th"] = roll39_401.quantile(0.90)
+
+                # max
+                all_cgm[w_name + ".max"] = roll39_401.max()
+
+                # median
+                all_cgm[w_name + ".median"] = roll39_401.median()
+
+                # iqr
+                all_cgm[w_name + ".iqr"] = (
+                    all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"]
+                )
+
+                # recalculate percent of measurements available
+                all_cgm[w_name + ".40to400availablePercent"] = (
+                    roll40_400.count() / w_len
+                )
+
+                # get the percent of missing 40-400 values over this time period
+                all_cgm[w_name + ".40to400missingPercent"] = (
+                    1 - all_cgm[w_name + ".40to400availablePercent"]
+                )
+
+                all_cgm[w_name + ".40to400ge70Available"] = (
+                    all_cgm[w_name + ".40to400availablePercent"] >= 0.7
+                )
+
+                all_cgm[w_name + ".40to400ge80Available"] = (
+                    all_cgm[w_name + ".40to400availablePercent"] >= 0.8
+                )
+
+                # mean
+                all_cgm[w_name + ".mean"] = roll40_400.mean()
+
+                # GMI(%) = 3.31 + 0.02392 x [mean glucose in mg/dL]
+                all_cgm[w_name + ".gmi"] = (
+                    3.31 + (0.02392 * all_cgm[w_name + ".mean"])
+                )
+
+                # standard deviation (std)
+                all_cgm[w_name + ".std"] = roll40_400.std()
+
+                # coefficient of variation (cov) = std / mean
+                all_cgm[w_name + ".cov"] = (
+                    all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"]
+                )
+
+            # %% save cgm stats data
+            all_cgm.to_csv(os.path.join(
+                output_stats,
+                "PHI-" + userid + "-cgm-stats.csv.gz"
+            ))
+            # write the most recent example of the 90 day stats
+            # to the metadata
+            quarter_ge80Available_idx = (
+                all_cgm[all_cgm["quarter.ge80Available"]]
+            ).index.max()
+
+            if pd.notnull(quarter_ge80Available_idx):
+                # get the most recent quarter
+                most_recent = all_cgm.loc[
+                    [quarter_ge80Available_idx],
+                    all_cgm.columns
+                ]
+            else:
+                most_recent = all_cgm.loc[
+                    [all_cgm.index.max()],
+                    all_cgm.columns
+                ]
+
+            metadata = pd.merge(
+                metadata,
+                most_recent,
+                on="hashid",
+                how="left"
+            )
+
+        print(metadata.T)
+
+    else:
+        metadata["cgmData"] = False
+        print(d_idx, "no cgm data")
+
+    # save metadata
+    metadata.to_csv(os.path.join(
+        output_metadata,
+        "PHI-" + userid + "-cgm-metadata.csv.gz"
+    ))
+
+    print("finished", d_idx, userid)
diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py
new file mode 100644
index 00000000..4da725b1
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py
@@ -0,0 +1,2453 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+'''
+calculate cgm distributions and stats for a single tidepool (donor) dataset
+that comes from a json file (does NOT work with data saved as csv)
+'''
+
+
+# %% REQUIRED LIBRARIES
+import os
+import sys
+import hashlib
+import pytz
+import numpy as np
+import pandas as pd
+import datetime as dt
+import argparse
+import pdb
+
+get_donor_data_path = os.path.abspath(
+    os.path.join(os.path.dirname(__file__), "..")
+)
+if get_donor_data_path not in sys.path:
+    sys.path.insert(0, get_donor_data_path)
+from get_donor_data.get_single_tidepool_dataset_json import (
+    make_folder_if_doesnt_exist, get_data
+)
+from get_donor_data.get_single_donor_metadata import get_shared_metadata
+
+# %% CONSTANTS
+MGDL_PER_MMOLL = 18.01559
+
+
+# %% FUNCTIONS
+'''
+the functions that are called in this script,
+which includes notes of where the functions came from,
+and whether they were refactored
+'''
+
+def get_episodes(
+        df,
+        episode_criterion="cgm < 54",
+        min_duration=5,
+):
+    '''Label runs of consecutive records that meet a cgm criterion
+
+    Args:
+        df: a pandas dataframe with a "roundedUtcTime" column and a column
+            named by episode_criterion (expected to be boolean). Each record
+            is counted as 5 minutes. NOTE: helper columns are added to df
+            in place, so pass in a copy if the original must be preserved
+        episode_criterion (str): name of the column that is True when the
+            record meets the criterion (e.g., "cgm < 54")
+        min_duration (int): shortest run duration (same units as the
+            5-per-record count) for records to be flagged as isEpisode
+
+    Returns:
+        df: a dataframe with a new default index and four columns:
+            isEpisode, episodeId, episodeStart, and episodeTotalDuration,
+            each prefixed with
+            "episode.<episode_criterion>.durationThreshold=<min_duration>."
+
+    NOTE:
+        min_duration only affects isEpisode; episodeStart and
+        episodeTotalDuration are filled in for every run that meets the
+        criterion, no matter how short the run is
+    '''
+    # TODO: deal with case where there are nan's in the middle of an episode
+    # it probably makes sense to interpolate between values iff the gap is
+    # <= 1 to 6 points (5 to 30 minutes)
+
+    criterion = df[episode_criterion]
+
+    # put consecutive data that matches in groups
+    df["tempGroups"] = (criterion != criterion.shift()).cumsum()
+
+    # records that do not meet the criterion all end up with episodeId 0
+    df["episodeId"] = df["tempGroups"] * criterion
+
+    # group by the episode groups
+    episode_groups = df.groupby("episodeId")
+    episodes = episode_groups["roundedUtcTime"].count().reset_index()
+    episodes["duration"] = episodes["roundedUtcTime"] * 5
+    episodes.rename(columns={"roundedUtcTime": "episodeCounts"}, inplace=True)
+
+    df = pd.merge(df, episodes, on="episodeId", how="left")
+
+    # zero out the duration of records that do not meet the criterion
+    df["episodeDuration"] = (
+        df["duration"] * df[episode_criterion]
+    )
+
+    # mark record as belonging to an episode
+    df["isEpisode"] = (
+        df["episodeDuration"] >= min_duration
+    )
+
+    # get the hypo episode starts so we only count each episode once
+    # NOTE: fill_value keeps the shifted column boolean; shift().fillna()
+    # gives an object column, which ~ does not reliably invert as booleans
+    previous_met = df[episode_criterion].shift(1, fill_value=False)
+    df["episodeStart"] = (
+        (df[episode_criterion])
+        & (~previous_met)
+    )
+
+    # calculate the total duration and attach to start record
+    # which is needed to get the average duration per episode
+    # NOTE: assign the result back, an inplace replace on a selected column
+    # is not guaranteed to change df (e.g., with pandas copy-on-write)
+    df["episodeTotalDuration"] = (
+        df["episodeStart"] * df["episodeDuration"]
+    ).replace(0, np.nan)
+
+    episode_prefix = (
+        "episode." + episode_criterion
+        + ".durationThreshold=" + str(min_duration) + "."
+    )
+
+    df = df[[
+        "isEpisode", "episodeId", "episodeStart", "episodeTotalDuration"
+    ]].add_prefix(episode_prefix)
+
+    return df
+
+
+def get_slope(y):
+    '''Get the least-squares slope of a set of evenly spaced values
+
+    Args:
+        y: a numpy array of values; consecutive values are placed 5 units
+            apart on the x axis (x = 0, 5, 10, ...), presumably minutes
+
+    Raise:
+        TypeError: if you don't pass in a numpy array
+
+    Returns:
+        slope: the slope of the ordinary least-squares line through
+            (0, y[0]), (5, y[1]), ..., in units of y per unit of x, or
+            nan if there are fewer than 2 values
+    '''
+    if not isinstance(y, np.ndarray):
+        raise TypeError('Expecting a numpy array')
+
+    count_ = len(y)
+
+    # with fewer than 2 points the slope is 0/0, so return nan directly
+    # instead of triggering a divide-by-zero warning
+    if count_ < 2:
+        return np.nan
+
+    x = np.arange(start=0, stop=count_*5, step=5)
+
+    sum_x = x.sum()
+    sum_y = y.sum()
+    sum_xy = (x * y).sum()
+    sum_x_squared = (x * x).sum()
+
+    # closed form of the least-squares slope
+    slope = (
+        ((count_ * sum_xy) - (sum_x * sum_y))
+        / ((count_ * sum_x_squared) - (sum_x * sum_x))
+    )
+
+    return slope
+
+
+def expand_entire_dict(ts):
+    '''Expand a field that has embedded json into one column per key
+
+    Args:
+        ts: a pandas time series whose non-null records are expected to
+            be dictionaries
+
+    Raise:
+        TypeError: if you don't pass in a pandas time series
+
+    Returns:
+        temp_df: a dataframe with one row per non-null record of ts
+            (indexed like those records) and one column per key
+    '''
+    if not isinstance(ts, pd.Series):
+        raise TypeError('Expecting a pandas time series object')
+
+    # select with a boolean mask (not by index label) so that duplicated
+    # index labels cannot pull null records back into the selection
+    notnull_mask = ts.notnull()
+    temp_df = pd.DataFrame(
+        ts[notnull_mask].tolist(),
+        index=ts.index[notnull_mask]
+    )
+
+    return temp_df
+
+
+def expand_embedded_dict(ts, key_):
+    '''Expand a single key from a field that has embedded json
+
+    Args:
+        ts: a pandas time series of the field that has embedded json
+        key_: the key that you want to expand
+
+    Raise:
+        TypeError: if you don't pass in a pandas time series
+
+    Returns:
+        key_ts: a new time series of the key of interest. It has the same
+            index as ts and is named "<ts.name>.<key_>" (or just key_ if
+            ts has no name). Records where ts is null are left empty, and
+            so is every record if key_ is not found in the embedded json
+
+    NOTE:
+        this is a new function
+    TODO:
+        could be refactored to allow multiple keys or all keys to be returned
+        could be refactored for speed
+    '''
+
+    if not isinstance(ts, pd.Series):
+        raise TypeError('Expecting a pandas time series object')
+
+    # an unnamed series has no prefix to put in front of the key
+    if ts.name is None:
+        key_name = key_
+    else:
+        key_name = str(ts.name) + "." + key_
+
+    key_ts = pd.Series(name=key_name, index=ts.index)
+    notnull_idx = ts.notnull()
+    # TODO: maybe sped up by only getting the one field of interest?
+    # though, the current method is fairly quick and compact
+    temp_df = expand_entire_dict(ts)
+    if key_ in list(temp_df):
+        # temp_df has one row per non-null record of ts, in the same order
+        key_ts[notnull_idx] = temp_df[key_].values
+
+    return key_ts
+
+
+def get_embedded_field(ts, embedded_field):
+    '''Pull out a field that is nested one or more levels deep in json
+
+    Args:
+        ts: a pandas time series of the field that has embedded json
+        embedded_field (str): the "." separated location of the nested field
+            (e.g., "origin.payload.device.model")
+
+    Raise:
+        ValueError: if embedded_field has fewer than 2 "." separated parts
+
+    Returns:
+        new_ts: a new time series of the key of interest
+
+    NOTE:
+        the "." notation is used to reference nested json
+        the first part of embedded_field is not looked up (presumably it
+        names the field that ts already holds), each of the remaining
+        parts is expanded in turn with expand_embedded_dict
+
+    '''
+    key_path = embedded_field.split(".")
+    if len(key_path) < 2:
+        raise ValueError('Expecting at least 1 embedded field')
+
+    # walk down the nested keys, one level per pass
+    new_ts = ts
+    for nested_key in key_path[1:]:
+        new_ts = expand_embedded_dict(new_ts, nested_key)
+
+    return new_ts
+
+
+def add_upload_info_to_cgm_records(groups, df):
+    '''Attach device information from the upload records to the cgm records
+
+    Args:
+        groups: a pandas groupby object of all the data; the group keys are
+            expected to be the record types (TODO: confirm with the caller)
+        df: a pandas dataframe of cgm records that has an "uploadId" column
+
+    Returns:
+        df: the cgm records left-merged (on the upload id) with whichever
+            of the upload fields listed below are available, or df as it
+            was passed in if there are no upload records
+    '''
+    upload_locations = [
+        "upload.uploadId",
+        "upload.deviceManufacturers",
+        "upload.deviceModel",
+        "upload.deviceSerialNumber",
+        "upload.deviceTags"
+    ]
+
+    if "upload" in groups["type"].unique():
+        # drop the fields that are empty for every upload record
+        upload = (
+            groups.get_group("upload")
+            .dropna(axis=1, how="all")
+            .add_prefix("upload.")
+        )
+
+        # keep the fields in the order listed above; a set intersection
+        # leaves the order of the new columns up to string hashing, which
+        # can change from one run to the next
+        available_locations = [
+            location for location in upload_locations
+            if location in upload.columns
+        ]
+
+        df = pd.merge(
+            left=df,
+            right=upload[available_locations],
+            left_on="uploadId",
+            right_on="upload.uploadId",
+            how="left"
+        )
+
+    return df
+
+
+def expand_heathkit_cgm_fields(df):
+    '''Flatten the embedded healthkit (json) columns of the cgm records
+
+    Args:
+        df: a pandas dataframe of cgm records. NOTE: the previously
+            expanded columns listed below are dropped from df in place
+
+    Returns:
+        df: the dataframe with one new "<location>.<key>" column for every
+            key found in each healthkit location that is present
+    '''
+    # TODO: refactor the code/function that originally grabs
+    # these fields, so we are only doing it once, and so
+    # we don't have to drop the columns for the code below to work.
+    previously_expanded = (
+        'origin.payload.device.name',
+        'origin.payload.device.manufacturer',
+        'origin.payload.sourceRevision.source.name'
+    )
+    columns_to_drop = [
+        col for col in previously_expanded if col in df.columns
+    ]
+    if columns_to_drop:
+        df.drop(columns=columns_to_drop, inplace=True)
+
+    # parents are listed before their children, because expanding a parent
+    # is what creates the child column that gets checked on a later pass
+    nested_locations = (
+        "origin",
+        "origin.payload",
+        "origin.payload.device",
+        "origin.payload.sourceRevision",
+        "origin.payload.sourceRevision.source",
+        "payload",
+    )
+
+    for location in nested_locations:
+        if location not in df.columns:
+            continue
+
+        expanded = expand_entire_dict(df[location].copy())
+        df = pd.concat(
+            [df, expanded.add_prefix(location + ".")],
+            axis=1
+        )
+
+    return df
+
+
+def get_dexcom_cgm_model(df):
+    '''
+    Sense the dexcom cgm model (G4, G5, G6 or G5_G6) from a list of
+    candidate fields, checked in order.
+
+    df must already contain a "cgmModel" column. The "cgmModel" and
+    "cgmModelSensedFrom" columns are written on df in place, and a frame
+    holding just those two columns is returned.
+    '''
+    candidate_fields = [
+        "deviceId",
+        "deviceManufacturers",
+        "upload.deviceManufacturers",
+        "deviceModel",
+        "upload.deviceModel",
+        "deviceSerialNumber",
+        "upload.deviceSerialNumber",
+        "origin.payload.sourceRevision.source.name",
+        "payload.transmitterGeneration",
+        "payload.HKMetadataKeySyncIdentifier",
+        "payload.transmitterId",
+    ]
+
+    def _tag(rows, model, sensed_from):
+        # record the model and the field it was sensed from
+        df.loc[rows, "cgmModel"] = model
+        df.loc[rows, "cgmModelSensedFrom"] = sensed_from
+
+    for field in candidate_fields:
+        # only check if model has NOT been determined, or if it is G5_G6
+        # NOTE(review): this mask only decides whether the field is
+        # examined at all; the assignments below are not limited to the
+        # undetermined rows -- confirm that is intended
+        current_model = df["cgmModel"]
+        undetermined = (
+            current_model.isnull()
+            | current_model.astype(str).str.contains("G5_G6")
+        )
+        if (field not in list(df)) or (undetermined.sum() == 0):
+            continue
+
+        text = df[field].astype(str).str
+        has_g4 = text.contains("G4", case=False, na=False)
+        has_g5 = text.contains("G5", case=False, na=False)
+        has_g6 = text.contains("G6", case=False, na=False)
+
+        # applied in order; a later match overwrites an earlier one, and
+        # the last entry is the edge case of a value naming both g5 and g6
+        for rows, model in (
+            (has_g4, "G4"),
+            (has_g5, "G5"),
+            (has_g6, "G6"),
+            (has_g5 & has_g6, "G5_G6"),
+        ):
+            _tag(rows, model, field)
+
+        # case of "transmitterId"
+        is_transmitter_field = (
+            ("transmitterId" in field)
+            or ("payload.HKMetadataKeySyncIdentifier" in field)
+        )
+        if not is_transmitter_field:
+            continue
+
+        id_length = text.len()
+        longer_than_5 = id_length > 5
+
+        # if length of string is 5, then it is likely a G4 sensor
+        _tag(id_length == 5, "G4", field)
+
+        # if length of string > 5 and it starts with 4, then likely G5
+        _tag(longer_than_5 & text.startswith("4"), "G5", field)
+
+        # if length of string > 5 and it starts with 2 or 8, then likely G6
+        starts_2_or_8 = text.startswith("2") | text.startswith("8")
+        _tag(longer_than_5 & starts_2_or_8, "G6", field)
+
+    return df[["cgmModel", "cgmModelSensedFrom"]]
+
+
+def get_non_dexcom_cgm_model(df):
+    '''
+    Sense the model of non-dexcom devices from the "deviceId" field.
+    (NOTE: for medtronic getting pump type not cgm)
+
+    df must already contain a "cgmModel" column. The "cgmModel" and
+    "cgmModelSensedFrom" columns are written on df in place, and a frame
+    holding just those two columns is returned.
+    '''
+    model_locations = ["deviceId"]
+
+    # (regular expression, model name) pairs, applied in order so a later
+    # match overwrites an earlier one
+    model_patterns = [
+        ("MMT-158|MMT-178", "670G"),
+        ("MMT-1511|MMT-1512|MMT-1711|MMT-1712", "640G"),
+        ("MMT-1514|MMT-1515|MMT-1714|MMT-1715", "630G"),
+        (
+            "530G|MedT-551|MedT-751|MedT-554|MedT-754|Veo - 554|Veo - 754",
+            "530G"
+        ),
+        ("MedT-523|MedT-723|Revel - 523|Revel - 723", "523_723"),
+        ("AbbottFreeStyleLibre", "LIBRE"),
+        # animas
+        ("IR1295", "G4"),
+        # NOTE: the tandem G4 will first be written as G5_G6,
+        # but the next pattern overwrites it back to G4
+        ("tandem", "G5_G6"),
+        ("4628003|5448003", "G4"),
+    ]
+
+    for field in model_locations:
+        # only check if model has NOT been determined, or if it is G5_G6
+        current_model = df["cgmModel"]
+        undetermined = (
+            current_model.isnull()
+            | current_model.astype(str).str.contains("G5_G6")
+        )
+        if (field not in list(df)) or (undetermined.sum() == 0):
+            continue
+
+        text = df[field].astype(str).str
+        for pattern, model_name in model_patterns:
+            matches = text.contains(pattern, na=False)
+            df.loc[matches, "cgmModel"] = model_name
+            df.loc[matches, "cgmModelSensedFrom"] = field
+
+    return df[["cgmModel", "cgmModelSensedFrom"]]
+
+
+def hash_userid(userid, salt):
+    '''
+    Return the hexadecimal SHA-256 digest of userid followed by salt
+    (both strings).
+
+    taken from anonymize-and-export.py
+    refactored name(s) to meet style guide
+    '''
+    salted_id = (userid + salt).encode()
+
+    return hashlib.sha256(salted_id).hexdigest()
+
+
+def get_type(val):
+    '''Return the name of the type of val as a string (e.g., "int").'''
+    type_name = type(val).__name__
+
+    return type_name
+
+
+def remove_negative_durations(df):
+    '''
+    Remove the rows of df that have a negative integer "duration".
+
+    Returns a tuple of (df, n_negative_durations). When df has no
+    "duration" column it is returned untouched and the count is np.nan.
+
+    taken from https://github.com/tidepool-org/data-analytics/blob/
+    etn/get-settings-and-events/projects/get-donors-pump-settings/
+    get-users-settings-and-events.py
+
+    refactored name(s) to meet style guide
+    refactored pandas field call to df["field"] instead of df.field
+    refactored because physical activity includes embedded json, whereas
+    the other fields in the data model require a integer
+    refactored so the drop mask covers every row of df; the previous mask
+    only covered the integer rows, so it could not be aligned with df
+    TODO: I think that durations are coming in as floats too, so we need
+    to refactor to account for that.
+    '''
+    if "duration" not in list(df):
+        return df, np.nan
+
+    def _is_negative_int(duration):
+        # only plain integers are checked (embedded json, floats and
+        # missing values are skipped); bool is excluded explicitly
+        # because it is a subclass of int
+        return (
+            isinstance(duration, int)
+            and not isinstance(duration, bool)
+            and duration < 0
+        )
+
+    negative_index = df["duration"].apply(_is_negative_int).astype(bool)
+    n_negative_durations = int(negative_index.sum())
+    if n_negative_durations > 0:
+        df = df[~negative_index]
+
+    return df, n_negative_durations
+
+
+def tslim_calibration_fix(df):
+    '''
+    Copy the calibration readings that are embedded in the payload of
+    tandem deviceEvent records into the "value" field.
+
+    Returns a tuple of (df, n_cal_readings), where n_cal_readings is the
+    number of tandem deviceEvent rows that carry a calibration reading.
+    df gains a "payload.calibration_reading" column whenever it has a
+    "payload" column.
+
+    taken from https://github.com/tidepool-org/data-analytics/blob/
+    etn/get-settings-and-events/projects/get-donors-pump-settings/
+    get-users-settings-and-events.py
+
+    refactored name(s) to meet style guide
+    refactored pandas field call to df["field"] instead of df.field
+    refactored to only expand one field
+    refactored so only the tandem calibration rows are written; the
+    previous version wrote to every row with a calibration reading, which
+    set "value" to NaN on the rows that were not tandem deviceEvents
+    '''
+    if "payload" not in list(df):
+        return df, 0
+
+    # expand payload field one level
+    df["payload.calibration_reading"] = (
+        expand_embedded_dict(df["payload"], "calibration_reading")
+    )
+    cal_index = df["payload.calibration_reading"].notnull()
+    if cal_index.sum() == 0:
+        return df, 0
+
+    search_for = ['tan']
+    tandem_data_index = (
+        # na=False so records without a deviceId count as "not tandem"
+        df["deviceId"].str.contains('|'.join(search_for), na=False)
+        & (df["type"] == "deviceEvent")
+    )
+    valid_index = tandem_data_index & cal_index
+    n_cal_readings = int(valid_index.sum())
+
+    if n_cal_readings > 0:
+        readings = df.loc[valid_index, "payload.calibration_reading"]
+        # if reading is > 30 then it is in the wrong units; the check is
+        # made on the tandem calibration rows that are being written
+        if readings.min() > 30:
+            readings = readings / MGDL_PER_MMOLL
+        df.loc[valid_index, "value"] = readings
+
+    return df, n_cal_readings
+
+
+def replace_smoothed_cgm_values(df):
+    '''
+    Overwrite "mg/dL" with "payload.realTimeValue" on every row where the
+    latter is not null (df is modified in place).
+
+    Returns a tuple of (the "mg/dL" column, number of values replaced);
+    the count is np.nan when df has no "payload.realTimeValue" column.
+    '''
+    real_time_col = 'payload.realTimeValue'
+    n_replaced = np.nan
+
+    if real_time_col in df.columns:
+        has_real_time_value = df[real_time_col].notnull()
+        n_replaced = has_real_time_value.sum()
+        df.loc[has_real_time_value, "mg/dL"] = (
+            df.loc[has_real_time_value, real_time_col]
+        )
+
+    return df["mg/dL"], n_replaced
+
+
+def get_healthkit_timezone(df):
+    '''
+    Pull the healthkit timezone (HKTimeZone) out of the embedded payload
+    and return the "timezone" and "deviceType" columns.
+
+    * rows that carry an HKTimeZone get that value as their timezone and
+      "healthkit" as their deviceType
+    * when df has no "payload" column there is nothing to pull, so any
+      existing "timezone"/"deviceType" values are kept, and a column that
+      is missing is created as np.nan
+
+    df is modified in place.
+
+    refactored to remove two branches that could never run (the
+    "payload.HKTimeZone" column always exists once it has been assigned),
+    and to stop overwriting existing timezone values with np.nan when
+    there is no payload
+    TODO: refactor to account for more efficient way to get embedded json
+    '''
+    if "payload" in list(df):
+        df["payload.HKTimeZone"] = (
+            expand_embedded_dict(df["payload"], "HKTimeZone")
+        )
+        hk_tz_idx = df["payload.HKTimeZone"].notnull()
+        df.loc[hk_tz_idx, "deviceType"] = "healthkit"
+
+        if "timezone" not in list(df):
+            # the healthkit timezone is the only timezone available
+            df.rename(
+                columns={"payload.HKTimeZone": "timezone"}, inplace=True
+            )
+        else:
+            df.loc[hk_tz_idx, "timezone"] = (
+                df.loc[hk_tz_idx, "payload.HKTimeZone"]
+            )
+    else:
+        for required_col in ("timezone", "deviceType"):
+            if required_col not in list(df):
+                df[required_col] = np.nan
+
+    return df[["timezone", "deviceType"]]
+
+
+def get_and_fill_timezone(df):
+    '''
+    Return the "timezone" column with its gaps filled forward, and then
+    backward.
+
+    this is new to deal with healthkit data
+    requires that a data frame that contains payload and HKTimeZone is passed
+
+    refactored to fill with ffill()/bfill() on the column itself; filling
+    in place through df["timezone"] relies on the deprecated
+    fillna(method=...) and is not applied when pandas copy-on-write is on
+    '''
+    df = get_healthkit_timezone(df)
+
+    return df["timezone"].ffill().bfill()
+
+
+def make_tz_unaware(date_time):
+    '''Return date_time with its tzinfo replaced by None.'''
+    tz_unaware = date_time.replace(tzinfo=None)
+
+    return tz_unaware
+
+
+def to_utc_datetime(df):
+    '''
+    Convert the "time" strings of df into timezone-unaware utc datetimes.
+
+    this is new to deal with perfomance issue with the previous method
+    of converting to string to datetime with pd.to_datetime()
+
+    refactored to drop the timezone with the vectorized
+    .dt.tz_localize(None) instead of a per-element apply
+    '''
+    utc_time_tz_aware = pd.to_datetime(
+        df["time"],
+        format="%Y-%m-%dT%H:%M:%S",
+        utc=True
+    )
+    # tz_localize(None) removes the timezone and keeps the (utc) clock time
+    utc_tz_unaware = utc_time_tz_aware.dt.tz_localize(None)
+
+    return utc_tz_unaware
+
+
+# apply the large timezone offset correction (AKA Darin's fix)
+def timezone_offset_bug_fix(df):
+    '''
+    Shift every timezoneOffset that is > 840 or < -720 minutes by 1440
+    minutes at a time until it is inside that range, and shift the
+    conversionOffset by the same amount expressed in milliseconds.
+
+    df is modified in place and returned; it is returned untouched when it
+    has no "timezoneOffset" column.
+
+    this is taken from estimate-local-time.py
+    TODO: add in unit testing where there is no TZP that is > 840 or < -720
+    '''
+    if "timezoneOffset" not in list(df):
+        return df
+
+    minutes_per_day = 1440
+    milliseconds_per_day = minutes_per_day * 60 * 1000
+
+    too_large = df["timezoneOffset"] > 840
+    while too_large.sum() > 0:
+        df.loc[too_large, "conversionOffset"] = (
+            df.loc[too_large, "conversionOffset"] - milliseconds_per_day
+        )
+        df.loc[too_large, "timezoneOffset"] = (
+            df.loc[too_large, "timezoneOffset"] - minutes_per_day
+        )
+        too_large = df["timezoneOffset"] > 840
+
+    too_small = df["timezoneOffset"] < -720
+    while too_small.sum() > 0:
+        df.loc[too_small, "conversionOffset"] = (
+            df.loc[too_small, "conversionOffset"] + milliseconds_per_day
+        )
+        df.loc[too_small, "timezoneOffset"] = (
+            df.loc[too_small, "timezoneOffset"] + minutes_per_day
+        )
+        too_small = df["timezoneOffset"] < -720
+
+    return df
+
+
+def get_local_time(df):
+    '''
+    Return the local time of every row: "utcTime" plus the offset (in
+    minutes) that get_timezone_offset gives for the row's "utcTime" and
+    "inferredTimezone".
+    '''
+    time_and_timezone = df[['utcTime', 'inferredTimezone']]
+    offset_minutes = time_and_timezone.apply(
+        lambda row: get_timezone_offset(*row), axis=1
+    )
+
+    return df['utcTime'] + pd.to_timedelta(offset_minutes, unit="m")
+
+
+def round_time(
+        df,
+        time_interval_minutes=5,
+        start_with_first_record=True,
+        return_calculation_columns=False
+):
+    '''
+    A general purpose round time function that rounds the "time"
+    field to nearest <time_interval_minutes> minutes
+    INPUTS:
+        * a dataframe (df) or time series that contains only one time field
+        that you want to round
+        * time_interval_minutes (defaults to 5 minutes given that most cgms
+        output every 5 minutes)
+        * start_with_first_record starts the rounding with the first record
+        if True, and the last record if False (defaults to True)
+        * return_calculation_columns specifies whether the extra columns
+        used to make calculations are returned
+    OUTPUT:
+        * the rounded times as an array (the .values of the "roundedTime"
+        column), ordered by the original index of the input
+    NOTES:
+        * a dataframe that is passed in is modified in place (its column is
+        renamed to "t", it is sorted and re-indexed, and the calculation
+        columns are added to it)
+        * the function calls sys.exit if df has more than one column
+        * only the "roundedTime" values are returned, whatever the value of
+        return_calculation_columns
+    refactored name(s) to meet style guide
+    '''
+    # if a time series is passed in, convert to dataframe
+    if "Series" in get_type(df):
+        df = pd.DataFrame(df)
+    columns_ = list(df)
+    if len(columns_) > 1:
+        sys.exit(
+            "Error: df should only have one time column"
+        )
+    else:
+        # work on a known column name, whatever the caller named it
+        df.rename(columns={columns_[0]: "t"}, inplace=True)
+
+    # ascending when rounding starts with the first record,
+    # descending when it starts with the last record
+    df.sort_values(
+        by="t",
+        ascending=start_with_first_record,
+        inplace=True
+    )
+
+    # keep the incoming index so the original order can be restored below;
+    # from here on the index is the position in the sorted order
+    df.reset_index(drop=False, inplace=True)
+    df.rename(columns={"index": "originalIndex"}, inplace=True)
+
+    # calculate the time between consecutive records
+    # (rounded to a multiple of <time_interval_minutes>, in minutes)
+    df["t_shift"] = df["t"].shift(1)
+    df["timeBetweenRecords"] = round(
+        (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes))
+        + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes)
+    ) * time_interval_minutes
+
+    # separate the data into chunks if timeBetweenRecords is greater than
+    # 2 times the <time_interval_minutes> minutes so the rounding process
+    # starts over
+    big_gaps = list(
+        df.query("abs(timeBetweenRecords) > "
+                 + str(time_interval_minutes * 2)).index
+    )
+    # add the start and the end of the data as chunk boundaries
+    big_gaps.insert(0, 0)
+    big_gaps.append(len(df))
+
+    # NOTE(review): the .loc slices below include their end label, while
+    # the chunk slice excludes its end position, so each pass appears to
+    # also touch the first row of the next chunk, which the next pass then
+    # overwrites -- confirm before restructuring this loop
+    for gap_index in range(0, len(big_gaps) - 1):
+        chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]]
+        first_chunk = df["t"][big_gaps[gap_index]]
+
+        # calculate the time difference between
+        # each time record and the first record
+        df.loc[
+            big_gaps[gap_index]:big_gaps[gap_index+1],
+            "minutesFromFirstRecord"
+        ] = (
+            (chunk - first_chunk).dt.days*(86400/60)
+            + (chunk - first_chunk).dt.seconds/60
+        )
+
+        # then round to the nearest X Minutes
+        # NOTE: the ".000001" ensures that mulitples of 2:30 always rounds up.
+        df.loc[
+            big_gaps[gap_index]:big_gaps[gap_index+1],
+            "roundedMinutesFromFirstRecord"
+        ] = round(
+            (df.loc[
+                big_gaps[gap_index]:big_gaps[gap_index+1],
+                "minutesFromFirstRecord"
+            ] / time_interval_minutes) + 0.000001
+        ) * (time_interval_minutes)
+
+        # round the first record of the chunk itself; the 1 microsecond
+        # is added before rounding, like the ".000001" above
+        rounded_first_record = (
+            first_chunk + pd.Timedelta("1microseconds")
+        ).round(str(time_interval_minutes) + "min")
+
+        # rounded time = rounded first record + rounded minutes from it
+        df.loc[
+            big_gaps[gap_index]:big_gaps[gap_index+1],
+            "roundedTime"
+        ] = rounded_first_record + pd.to_timedelta(
+            df.loc[
+                big_gaps[gap_index]:big_gaps[gap_index+1],
+                "roundedMinutesFromFirstRecord"
+            ], unit="m"
+        )
+
+    if return_calculation_columns is False:
+        df.drop(
+            columns=[
+                "timeBetweenRecords",
+                "minutesFromFirstRecord",
+                "roundedMinutesFromFirstRecord"
+            ], inplace=True
+        )
+    # sort back to the original index
+    df.sort_values(by="originalIndex", inplace=True)
+
+    return df["roundedTime"].values
+
+
+def add_upload_time(df):
+    '''
+    Return the upload time of every row of df, matched on "uploadId".
+
+    * an uploadId that has an upload record gets the latest utcTime of its
+      upload record(s)
+    * an uploadId that has no upload record gets the latest utcTime of
+      all the records with that uploadId
+
+    this is taken from a colab notebook that is not in our github
+    given that it has been refactored to account for bug where there are
+    no upload records
+    NOTE: this is a new fix introduced with healthkit data...we now have
+    data that does not have an upload record
+    '''
+    is_upload_record = df["type"] == "upload"
+
+    if "upload" in df["type"].unique():
+        upload_times = pd.DataFrame(
+            df[is_upload_record].groupby("uploadId")["utcTime"].max()
+        )
+    else:
+        upload_times = pd.DataFrame(columns=["utcTime"])
+
+    ids_with_upload_record = set(
+        df.loc[is_upload_record, "uploadId"].unique()
+    )
+    ids_missing_upload_record = (
+        set(df["uploadId"].unique()) - ids_with_upload_record
+    )
+
+    for upload_id in ids_missing_upload_record:
+        upload_times.loc[upload_id, "utcTime"] = (
+            df.loc[df["uploadId"] == upload_id, "utcTime"].max()
+        )
+
+    # the index holds the uploadIds; it is named "uploadId" when it comes
+    # from the groupby, and "index" when the frame started out empty
+    upload_times = upload_times.reset_index().rename(
+        columns={"utcTime": "uploadTime", "index": "uploadId"}
+    )
+
+    with_upload_time = pd.merge(df, upload_times, how='left', on='uploadId')
+
+    return with_upload_time["uploadTime"].values
+
+
+def remove_invalid_cgm_values(df):
+    '''
+    Drop the cbg rows that have a value < 38 or > 402 mg/dL.
+
+    Returns a tuple of (df without those rows, number of rows removed).
+    '''
+    n_before = len(df)
+
+    is_cbg = df["type"] == "cbg"
+    out_of_range = (df["mg/dL"] < 38) | (df["mg/dL"] > 402)
+    df = df.drop(df[is_cbg & out_of_range].index)
+
+    return df, n_before - len(df)
+
+
+def removeDuplicates(df, criteriaDF):
+    '''
+    Keep the first row of every set of rows that share the same values in
+    the criteriaDF column(s), and reset the index.
+
+    Returns a tuple of (de-duplicated df, number of rows removed).
+    '''
+    is_repeat = df[criteriaDF].duplicated()
+    deduplicated = df.loc[~is_repeat].reset_index(drop=True)
+    nDuplicatesRemoved = len(df) - len(deduplicated)
+
+    return deduplicated, nDuplicatesRemoved
+
+
+def removeCgmDuplicates(df, timeCriterion, valueCriterion="value"):
+    '''
+    Remove the rows that repeat both the time (timeCriterion) and the
+    value (valueCriterion) of another row. Among repeats, the row with the
+    latest "uploadTime" is the one that is kept. Rows with a null
+    timeCriterion are never removed.
+
+    Returns a tuple of (df, number of duplicates removed); df is returned
+    untouched, with a count of 0, when it has no timeCriterion column.
+    Otherwise the df that is passed in is sorted in place, and the result
+    is sorted by timeCriterion and "uploadTime", both descending.
+    '''
+    if timeCriterion not in df:
+        return df, 0
+
+    sort_columns = [timeCriterion, "uploadTime"]
+    df.sort_values(by=sort_columns, ascending=[False, False], inplace=True)
+
+    time_is_null = df[timeCriterion].isnull()
+    dfIsNull = df[time_is_null]
+    dfNotNull = df[~time_is_null]
+
+    # same steps as removeDuplicates: keep the first of each set of
+    # repeats, and reset the index
+    is_repeat = dfNotNull[[timeCriterion, valueCriterion]].duplicated()
+    deduplicated = dfNotNull.loc[~is_repeat].reset_index(drop=True)
+    nDuplicatesRemoved = len(dfNotNull) - len(deduplicated)
+
+    df = pd.concat([dfIsNull, deduplicated])
+    df.sort_values(by=sort_columns, ascending=[False, False], inplace=True)
+
+    return df, nDuplicatesRemoved
+
+
+# get rid of spike data
+def remove_spike_data(df):
+    '''
+    Drop (in place) the rows whose embedded origin device name, device
+    manufacturer, or source name contains "spike" (in any letter case).
+
+    Returns a tuple of (df, number of rows removed); the count is np.nan
+    when df has no "origin" column. The three origin fields that are
+    inspected are left on df as new columns.
+    '''
+    if "origin" not in list(df):
+        return df, np.nan
+
+    n_before = len(df)
+    origin_fields = (
+        "origin.payload.device.name",
+        "origin.payload.device.manufacturer",
+        "origin.payload.sourceRevision.source.name",
+    )
+    for origin_field in origin_fields:
+        df[origin_field] = get_embedded_field(df["origin"], origin_field)
+        populated = df[df[origin_field].notnull()]
+        mentions_spike = (
+            populated[origin_field].astype(str).str.lower()
+            .str.contains("spike")
+        )
+        df.drop(populated[mentions_spike].index, inplace=True)
+
+    return df, n_before - len(df)
+
+
+# %% ESTIMATE LOCAL TIME FUNCTIONS
+def convert_deprecated_timezone_to_alias(df, tzAlias):
+    '''
+    Replace each timezone in df with its alias from tzAlias.
+
+    A timezone is replaced only when exactly one row of tzAlias has a "tz"
+    value that ends with it. df is modified in place and returned; it is
+    returned untouched when it has no "timezone" column.
+    '''
+    if "timezone" not in df:
+        return df
+
+    observed_timezones = df["timezone"].unique()
+    observed_timezones = observed_timezones[pd.notnull(observed_timezones)]
+
+    for observed_timezone in observed_timezones:
+        is_match = tzAlias["tz"].str.endswith(observed_timezone)
+        alias = tzAlias.loc[is_match, ["alias"]].values
+        if len(alias) == 1:
+            df.loc[df["timezone"] == observed_timezone, ["timezone"]] = alias
+
+    return df
+
+
+def create_contiguous_day_series(df):
+    '''
+    Return a one-column ("date") dataframe with every calendar day between
+    the earliest and the latest "date" of df, latest day first.
+    '''
+    every_day = pd.date_range(df["date"].min(), df["date"].max()).date
+    contiguous_days = pd.DataFrame(every_day, columns=["date"])
+    contiguous_days = contiguous_days.sort_values("date", ascending=False)
+
+    return contiguous_days.reset_index(drop=True)
+
+
+def add_device_type(df):
+    '''
+    Return the deviceType of each row: rows without a deviceType become
+    "pump" when their deviceTags contain "pump", otherwise "cgm" when
+    their deviceTags contain "cgm". An existing deviceType is kept.
+
+    Returns np.nan (not a series) when df has no "deviceTags" column.
+    On df itself, a missing "deviceType" column is added as np.nan and
+    "deviceTags" is converted to strings.
+    '''
+    col_headings = list(df)
+    if "deviceType" not in col_headings:
+        df["deviceType"] = np.nan
+    if "deviceTags" not in col_headings:
+        return np.nan
+
+    # first make sure deviceTag is in string format
+    df["deviceTags"] = df["deviceTags"].astype(str)
+    # filter by type not null device tags
+    ud = df[df["deviceTags"].notnull()].copy()
+
+    # "pump" is checked first, so it wins when both tags are present
+    for tag in ("pump", "cgm"):
+        untyped_with_tag = (
+            ud["deviceTags"].str.contains(tag) & ud["deviceType"].isnull()
+        )
+        ud.loc[untyped_with_tag, ["deviceType"]] = tag
+
+    return ud["deviceType"]
+
+
+def get_timezone_offset(currentDate, currentTimezone):
+    '''
+    Return the utc offset, in whole minutes, of the timezone named
+    currentTimezone on the day after currentDate.
+
+    currentDate is passed to pytz's localize, which expects a datetime
+    without a timezone; currentTimezone is passed to pytz.timezone.
+
+    refactored to read the offset from utcoffset() instead of splitting
+    the "%z" string into hours and minutes; the split used floor, which
+    is wrong for negative offsets that are not whole hours (e.g., -0330
+    came out as -310 minutes instead of -210), and it returned 0 for
+    offsets of less than an hour
+    '''
+    tz = pytz.timezone(currentTimezone)
+    # here we add 1 day to the current date to account for changes to/from DST
+    localized = tz.localize(currentDate + dt.timedelta(days=1))
+    tzo = int(localized.utcoffset().total_seconds() / 60)
+
+    return tzo
+
+
+def add_device_day_series(df, dfContDays, deviceTypeName):
+    '''
+    Summarize the records of one device type by day, and left-merge the
+    summary onto the contiguous day series (dfContDays) on "date".
+
+    The columns that are added are prefixed with deviceTypeName:
+    * <deviceTypeName>.timezoneOffset is the daily median timezoneOffset
+    * when deviceTypeName contains "upload" and df has timezones:
+      <deviceTypeName>.timezone is the most frequent timezone of the day,
+      the timezoneOffset of that day is replaced by the offset of that
+      timezone, and <deviceTypeName>.timeProcessing is the most frequent
+      timeProcessing of the day (np.nan when df has none)
+
+    When df is empty, only <deviceTypeName>.timezoneOffset is added
+    (as np.nan), directly on dfContDays.
+
+    refactored to test for columns with "and" (the "&" operator evaluates
+    both sides, which raised a KeyError when df had no timezone column),
+    and to look for timeProcessing in df; it was looked for in the day
+    series, which never has it at that point, so it was always np.nan
+    '''
+    if len(df) > 0:
+        dfDayGroups = df.groupby("date")
+        if "timezoneOffset" in df:
+            dfDaySeries = pd.DataFrame(dfDayGroups["timezoneOffset"].median())
+        else:
+            dfDaySeries = pd.DataFrame(columns=["timezoneOffset"])
+            dfDaySeries.index.name = "date"
+
+        if "upload" in deviceTypeName:
+            has_timezone = (
+                ("timezone" in df) and (df["timezone"].notnull().sum() > 0)
+            )
+            if has_timezone:
+                dfDaySeries["timezone"] = (
+                    dfDayGroups["timezone"].describe()["top"]
+                )
+                # get the timezone offset for the timezone
+                for i in dfDaySeries.index:
+                    if pd.notnull(dfDaySeries.loc[i, "timezone"]):
+                        tzo = get_timezone_offset(
+                                pd.to_datetime(i),
+                                dfDaySeries.loc[i, "timezone"])
+                        dfDaySeries.loc[i, ["timezoneOffset"]] = tzo
+
+                # describe() only reports "top" when there are values,
+                # so an all-null timeProcessing is treated as missing
+                has_time_processing = (
+                    ("timeProcessing" in df)
+                    and (df["timeProcessing"].notnull().sum() > 0)
+                )
+                if has_time_processing:
+                    dfDaySeries["timeProcessing"] = (
+                        dfDayGroups["timeProcessing"].describe()["top"]
+                    )
+                else:
+                    dfDaySeries["timeProcessing"] = np.nan
+
+        dfDaySeries = dfDaySeries.add_prefix(deviceTypeName + "."). \
+            rename(columns={deviceTypeName + ".date": "date"})
+
+        dfContDays = pd.merge(dfContDays, dfDaySeries.reset_index(),
+                              on="date", how="left")
+
+    else:
+        dfContDays[deviceTypeName + ".timezoneOffset"] = np.nan
+
+    return dfContDays
+
+
+def impute_upload_records(df, contDays, deviceTypeName):
+    '''
+    Build the day series of the given upload records, and carry the
+    timezone (and timeProcessing) of each day into the following rows.
+
+    Walking down the day series from the second row:
+    * a row without a timezone takes the timezone of the row above it
+    * a row with a timezone gets the timezoneOffset of that timezone on
+      the date of the row
+    * when the row above has a timeProcessing value, the row takes it
+
+    When df is empty, or the day series has no timezone column, the
+    timezone and timeProcessing columns are set to np.nan instead.
+    '''
+    daySeries = add_device_day_series(df, contDays, deviceTypeName)
+
+    tz_col = deviceTypeName + ".timezone"
+    tzo_col = deviceTypeName + ".timezoneOffset"
+    processing_col = deviceTypeName + ".timeProcessing"
+
+    if not ((len(df) > 0) and (tz_col in daySeries)):
+        daySeries[tz_col] = np.nan
+        daySeries[processing_col] = np.nan
+        return daySeries
+
+    # NOTE: i - 1 is used as the label of the previous row, so this relies
+    # on the day series having a consecutive integer index
+    for i in daySeries.index[1:]:
+        if pd.isnull(daySeries.loc[i, tz_col]):
+            daySeries.loc[i, [tz_col]] = daySeries.loc[i-1, tz_col]
+
+        if pd.notnull(daySeries.loc[i, tz_col]):
+            daySeries.loc[i, tzo_col] = get_timezone_offset(
+                pd.to_datetime(daySeries.loc[i, "date"]),
+                daySeries.loc[i, tz_col]
+            )
+
+        if pd.notnull(daySeries.loc[i-1, processing_col]):
+            daySeries.loc[i, processing_col] = (
+                daySeries.loc[i-1, processing_col]
+            )
+
+    return daySeries
+
+
+def add_home_timezone(df, contDays):
+    '''
+    Add the "home" timezone, i.e., the most frequent timezone in df, and
+    its daily timezone offset to the contiguous day series (contDays).
+
+    Adds three columns to contDays (in place) and returns it:
+    * home.imputed.timezoneOffset: offset of the home timezone on each date
+    * home.imputed.timezone: the home timezone
+    * home.imputed.timeProcessing: always np.nan
+    The first two are np.nan when df has no timezone values.
+
+    refactored to test for the timezone column with "and"; the "&"
+    operator evaluates both sides, which raised a KeyError when df had no
+    timezone column
+    '''
+    has_timezone = (
+        ("timezone" in df) and (df["timezone"].notnull().sum() > 0)
+    )
+    if has_timezone:
+        homeTimezone = df["timezone"].describe()["top"]
+        tzo = contDays.date.apply(
+                lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone))
+
+        contDays["home.imputed.timezoneOffset"] = tzo
+        contDays["home.imputed.timezone"] = homeTimezone
+
+    else:
+        contDays["home.imputed.timezoneOffset"] = np.nan
+        contDays["home.imputed.timezone"] = np.nan
+    contDays["home.imputed.timeProcessing"] = np.nan
+
+    return contDays
+
+
+def estimateTzAndTzoWithUploadRecords(cDF):
+    '''
+    Start the estimate (est.*) columns of the contiguous day frame from
+    the upload records.
+
+    * est.timezoneOffset, est.timezone and est.timeProcessing are copied
+      from the upload.* columns (timezone and timeProcessing are np.nan
+      when cDF has no upload.timezone column)
+    * est.type is "UPLOAD" on the days that have an upload timezone
+    * est.annotations is "travel" on the days where the estimated offset
+      is known and differs from the home offset
+    * est.gapSize is created as np.nan
+
+    cDF is modified in place and returned.
+    '''
+    cDF["est.type"] = np.nan
+    cDF["est.gapSize"] = np.nan
+    cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"]
+    cDF["est.annotations"] = np.nan
+
+    has_upload_timezone = "upload.timezone" in cDF
+    if has_upload_timezone:
+        has_timezone = cDF["upload.timezone"].notnull()
+        cDF.loc[has_timezone, ["est.type"]] = "UPLOAD"
+
+    cDF["est.timezone"] = (
+        cDF["upload.timezone"] if has_upload_timezone else np.nan
+    )
+    cDF["est.timeProcessing"] = (
+        cDF["upload.timeProcessing"] if has_upload_timezone else np.nan
+    )
+
+    differs_from_home = (
+        cDF["est.timezoneOffset"] != cDF["home.imputed.timezoneOffset"]
+    )
+    is_travel = differs_from_home & cDF["est.timezoneOffset"].notnull()
+    cDF.loc[is_travel, "est.annotations"] = "travel"
+
+    return cDF
+
+
+def assignTzoFromImputedSeries(df, i, imputedSeries):
+    '''
+    Mark the estimate of row i as a "DEVICE" estimate, and copy the
+    timezoneOffset, timezone and timeProcessing of the given imputed
+    series (e.g., "home.imputed") into the est.* columns of that row.
+
+    df is modified in place and returned.
+    '''
+    df.loc[i, ["est.type"]] = "DEVICE"
+
+    for field in ("timezoneOffset", "timezone", "timeProcessing"):
+        df.loc[i, ["est." + field]] = df.loc[i, imputedSeries + "." + field]
+
+    return df
+
+
+def compareDeviceTzoToImputedSeries(df, sIdx, device):
+    '''
+    For each row in sIdx, try to infer the timezone by comparing the
+    timezone offset (tzo) of the device with the tzo of each imputed
+    series, in the order listed below. The first series that matches is
+    used, and the row is annotated with the series it was inferred from.
+    '''
+    imputed_series_names = (
+        "pump.upload.imputed",
+        "cgm.upload.imputed",
+        "healthkit.upload.imputed",
+        "home.imputed",
+    )
+
+    for i in sIdx:
+        for imputedSeries in imputed_series_names:
+            # skip if the estimate has already been made
+            if pd.notnull(df.loc[i, "est.timezone"]):
+                continue
+
+            device_tzo = df.loc[i, device + ".timezoneOffset"]
+            imputed_tzo = df.loc[i, imputedSeries + ".timezoneOffset"]
+
+            # if the device tzo = imputed tzo, then chose the imputed tz
+            # and tzo; note, dst is accounted for in the imputed tzo
+            if device_tzo == imputed_tzo:
+                assignTzoFromImputedSeries(df, i, imputedSeries)
+                df = addAnnotation(df, i,
+                                   "tz-inferred-from-" + imputedSeries)
+                continue
+
+            # if the imputed series has a timezone estimate, then see if
+            # the current day is a dst change day
+            imputedTimezone = df.loc[i, imputedSeries + ".timezone"]
+            if pd.isnull(imputedTimezone):
+                continue
+            if not isDSTChangeDay(df.loc[i, "date"], imputedTimezone):
+                continue
+
+            # on a dst change day both offsets only have to fall inside
+            # the range of offsets of the timezone
+            dstRange = getRangeOfTZOsForTimezone(imputedTimezone)
+            if (device_tzo in dstRange) and (imputed_tzo in dstRange):
+                assignTzoFromImputedSeries(df, i, imputedSeries)
+                df = addAnnotation(df, i, "dst-change-day")
+                df = addAnnotation(df, i,
+                                   "tz-inferred-from-" + imputedSeries)
+
+    return df
+
+
+def estimateTzAndTzoWithDeviceRecords(cDF):
+    '''
+    Estimate the timezone (TZ) and timezone offset (TZO) of the days that
+    are still missing an estimate, from the pump and cgm device records.
+    '''
+    device_types = ("pump", "cgm")
+
+    def _days_to_estimate(deviceType):
+        # the indices of days where a TZO estimate has not been made AND
+        # where the device (e.g., pump or cgm) TZO has data
+        needs_estimate = (
+            cDF["est.timezoneOffset"].isnull()
+            & cDF[deviceType + ".timezoneOffset"].notnull()
+        )
+        return cDF[needs_estimate].index
+
+    # 2A. use the TZO of the pump or cgm device if it exists on a given day.
+    # In addition, compare the TZO to one of the imputed day series (i.e.,
+    # the upload and home series to see if the TZ can be inferred)
+    for deviceType in device_types:
+        cDF = compareDeviceTzoToImputedSeries(
+            cDF, _days_to_estimate(deviceType), deviceType
+        )
+
+    # 2B. if the TZ cannot be inferred with 2A, then see if the TZ can be
+    # inferred from the previous day's TZO. If the device TZO is equal to the
+    # previous day's TZO, AND if the previous day has a TZ estimate, use the
+    # previous day's TZ estimate for the current day's TZ estimate
+    for deviceType in device_types:
+        cDF = compareDeviceTzoToPrevDayTzo(
+            cDF, _days_to_estimate(deviceType), deviceType
+        )
+
+    # 2C. after 2A and 2B, check the DEVICE estimates to make sure that the
+    # pump and cgm tzo do not differ by more than 60 minutes. If they differ
+    # by more that 60 minutes, then mark the estimate as UNCERTAIN. Also, we
+    # allow the estimates to be off by 60 minutes as there are a lot of cases
+    # where the devices are off because the user changes the time for DST,
+    # at different times
+    pump_tzo = cDF["pump.timezoneOffset"]
+    cgm_tzo = cDF["cgm.timezoneOffset"]
+    devices_disagree = (
+        (cDF["est.type"] == "DEVICE")
+        & pump_tzo.notnull()
+        & cgm_tzo.notnull()
+        & (pump_tzo != cgm_tzo)
+    )
+    sIndices = cDF[devices_disagree].index
+
+    tzoDiffGT60 = abs(cDF.loc[sIndices, "cgm.timezoneOffset"] -
+                      cDF.loc[sIndices, "pump.timezoneOffset"]) > 60
+    mismatched_days = tzoDiffGT60.index[tzoDiffGT60]
+
+    cDF.loc[mismatched_days, ["est.type"]] = "UNCERTAIN"
+    for i in mismatched_days:
+        cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch")
+
+    return cDF
+
+
+def imputeTzAndTzo(cDF):
+    '''
+    Impute the timezone (TZ) and timezone offset (TZO) of the days that
+    are still missing an est.timezoneOffset.
+
+    * if no day has an estimate, every day is marked UNCERTAIN and its
+      annotation is set to "unable-to-impute-tzo"
+    * gaps that come before the last day with an estimate are handed to
+      getImputIndices and imputeByTimezone (defined elsewhere), one gap
+      per pass of the while loop
+    * the days that remain are imputed with the home timezone if the day
+      just before them is at the home offset; otherwise they are marked
+      UNCERTAIN
+
+    Returns cDF.
+    '''
+
+    # days without / with an estimated timezone offset
+    sIndices = cDF[cDF["est.timezoneOffset"].isnull()].index
+    hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index
+    if len(hasTzoIndices) > 0:
+        if len(sIndices) > 0:
+            # largest index that was missing an estimate at the start
+            lastDay = max(sIndices)
+
+            # NOTE(review): "&" evaluates both sides, so sIndices.min() is
+            # also called when sIndices is empty -- confirm that is safe
+            # on the pandas version in use
+            while ((sIndices.min() < max(hasTzoIndices)) &
+                   (len(sIndices) > 0)):
+
+                currentDay, prevDayWithDay, nextDayIdx = \
+                    getImputIndices(cDF, sIndices, hasTzoIndices)
+
+                cDF = imputeByTimezone(cDF, currentDay,
+                                       prevDayWithDay, nextDayIdx)
+
+                # days that still need an estimate; the days that were
+                # annotated as not imputable are left out
+                sIndices = cDF[((cDF["est.timezoneOffset"].isnull()) &
+                                (~cDF["est.annotations"].str.contains(
+                                "unable-to-impute-tzo").fillna(False)))].index
+
+                hasTzoIndices = cDF[cDF["est.timezoneOffset"].notnull()].index
+
+            # try to impute to the last day (earliest day) in the dataset
+            # if the last record has a timezone that is the home record, then
+            # impute using the home timezone
+            if len(sIndices) > 0:
+                currentDay = min(sIndices)
+                # NOTE(review): assumes a row labelled currentDay - 1
+                # exists (i.e., currentDay is not the first row) -- confirm
+                prevDayWithDay = currentDay - 1
+                gapSize = lastDay - currentDay
+
+                for i in range(currentDay, lastDay + 1):
+                    # the day before the gap is at the home offset
+                    if cDF.loc[prevDayWithDay, "est.timezoneOffset"] == \
+                      cDF.loc[prevDayWithDay, "home.imputed.timezoneOffset"]:
+
+                        cDF.loc[i, ["est.type"]] = "IMPUTE"
+
+                        cDF.loc[i, ["est.timezoneOffset"]] = \
+                            cDF.loc[i, "home.imputed.timezoneOffset"]
+
+                        cDF.loc[i, ["est.timezone"]] = \
+                            cDF.loc[i, "home.imputed.timezone"]
+
+                        cDF = addAnnotation(cDF, i, "gap=" + str(gapSize))
+                        cDF.loc[i, ["est.gapSize"]] = gapSize
+
+                    else:
+                        cDF.loc[i, ["est.type"]] = "UNCERTAIN"
+                        cDF = addAnnotation(cDF, i, "unable-to-impute-tzo")
+    else:
+        # nothing to impute from
+        cDF["est.type"] = "UNCERTAIN"
+        cDF["est.annotations"] = "unable-to-impute-tzo"
+
+    return cDF
+
+
+def getRangeOfTZOsForTimezone(tz):
+    '''
+    Return the timezone offsets (in minutes) that span the offsets of the
+    timezone tz on 1/1/2017 and on 5/1/2017, in steps of 15 minutes.
+
+    refactored to call get_timezone_offset, which is the name of the
+    offset function in this file (it was still called by its previous
+    name, getTimezoneOffset)
+    '''
+    minMaxTzo = [get_timezone_offset(pd.to_datetime("1/1/2017"), tz),
+                 get_timezone_offset(pd.to_datetime("5/1/2017"), tz)]
+
+    rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15)
+
+    return rangeOfTzo
+
+
+def getListOfDSTChangeDays(cDF):
+    """Return the dates on which the home timezone offset is about to change.
+
+    A row is selected when its "home.imputed.timezoneOffset" differs from
+    the value in the following row; the last row never qualifies because
+    it has nothing to be compared with.
+
+    Args:
+        cDF: DataFrame with "date" and "home.imputed.timezoneOffset" columns.
+
+    Returns:
+        The "date" values of the selected rows (original index preserved).
+    """
+    homeTzo = cDF["home.imputed.timezoneOffset"]
+
+    # difference between each row and the row that follows it
+    changeToNextRow = (homeTzo - homeTzo.shift(-1)).abs()
+    isChangeDay = changeToNextRow > 0
+
+    return cDF[isChangeDay].date
+
+
+def correctEstimatesAroundDst(df, cDF):
+    """Recompute per-record offsets and local times near DST change days.
+
+    For every DST change day of the home timezone (taken from cDF), each
+    record in df whose date is strictly within 2 days of that change and
+    that has an "est.timezone" gets its "est.timezoneOffset" and
+    "est.localTime" recalculated from the record's own "utcTime".
+
+    Args:
+        df: record-level DataFrame with "date", "utcTime", "est.timezone",
+            "est.timezoneOffset" and "est.localTime" columns. Modified in
+            place through .loc assignments.
+        cDF: day-level DataFrame handed to getListOfDSTChangeDays.
+
+    Returns:
+        df, with the corrected rows.
+    """
+    # get a list of DST change days for the home time zone
+    dstChangeDays = getListOfDSTChangeDays(cDF)
+
+    # loop through the df within 2 days of a daylight savings time change
+    for d in dstChangeDays:
+        dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) &
+                      (df.date < (d + dt.timedelta(days=2)))].index
+        for dIdx in dstIndex:
+            if pd.notnull(df.loc[dIdx, "est.timezone"]):
+                tz = pytz.timezone(df.loc[dIdx, "est.timezone"])
+                tzRange = getRangeOfTZOsForTimezone(str(tz))
+                minHoursToLocal = min(tzRange)/60
+
+                # shift utc by the smallest offset of the timezone to get an
+                # approximate local wall-clock time, then let pytz decide
+                # which offset applies at that moment
+                approxLocalTime = tz.localize(
+                    df.loc[dIdx, "utcTime"] +
+                    dt.timedelta(hours=minHoursToLocal)
+                )
+
+                # read the offset directly instead of parsing strftime("%z"):
+                # splitting the signed HHMM number with floor() mis-handles
+                # negative offsets that are not whole hours (-0330 came out
+                # as -310 minutes instead of -210)
+                tzo = int(round(
+                    approxLocalTime.utcoffset().total_seconds() / 60
+                ))
+
+                localTime = \
+                    df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m")
+                df.loc[dIdx, ["est.localTime"]] = localTime
+                df.loc[dIdx, ["est.timezoneOffset"]] = tzo
+    return df
+
+
+def applyLocalTimeEstimates(df, cDF):
+    """Attach the day-level estimates to every record and derive local time.
+
+    Args:
+        df: record-level DataFrame with "date" and "utcTime" columns.
+        cDF: day-level DataFrame with "date" and "est.timezoneOffset"
+            (offset in minutes) columns.
+
+    Returns:
+        Array of estimated local times, one per row of the left-merged frame.
+    """
+    merged = pd.merge(df, cDF, how="left", on="date")
+
+    # first pass: local time is utc time shifted by the day's estimated offset
+    estimatedOffset = pd.to_timedelta(merged["est.timezoneOffset"], unit="m")
+    merged["est.localTime"] = merged["utcTime"] + estimatedOffset
+
+    # second pass: refine the records that sit close to a DST change
+    merged = correctEstimatesAroundDst(merged, cDF)
+
+    return merged["est.localTime"].values
+
+
+def isDSTChangeDay(currentDate, currentTimezone):
+    """Tell whether the timezone offset differs from the day before.
+
+    Args:
+        currentDate: date (anything pd.to_datetime accepts) to check.
+        currentTimezone: timezone name handed to getTimezoneOffset.
+
+    Returns:
+        True when getTimezoneOffset gives different values for currentDate
+        and for the day before it, otherwise False.
+    """
+    today = pd.to_datetime(currentDate)
+    tzoToday = getTimezoneOffset(today, currentTimezone)
+
+    yesterday = pd.to_datetime(currentDate) + dt.timedelta(days=-1)
+    tzoYesterday = getTimezoneOffset(yesterday, currentTimezone)
+
+    return tzoToday != tzoYesterday
+
+
+def tzoRangeWithComparisonTz(df, i, comparisonTz):
+    """Return the range of timezone offsets for a comparison timezone.
+
+    Args:
+        df: not used by this function.
+        i: not used by this function.
+        comparisonTz: timezone name, or a null value when no estimate exists.
+
+    Returns:
+        The array from getRangeOfTZOsForTimezone when comparisonTz is not
+        null, otherwise an empty numpy array.
+    """
+    # without a timezone to compare against there is no range to calculate
+    if pd.isnull(comparisonTz):
+        return np.array([])
+
+    return getRangeOfTZOsForTimezone(comparisonTz)
+
+
+def tzAndTzoRangePreviousDay(df, i):
+    """Return the previous row's estimated timezone and its offset range.
+
+    Args:
+        df: DataFrame with an "est.timezone" column.
+        i: index label of the current row; row i-1 is the one inspected.
+
+    Returns:
+        Tuple of (timezone stored at row i-1, which may be null, and the
+        offset range computed for it by tzoRangeWithComparisonTz).
+    """
+    previousTz = df.loc[i - 1, "est.timezone"]
+
+    # the range is empty when the previous row has no timezone estimate
+    return previousTz, tzoRangeWithComparisonTz(df, i, previousTz)
+
+
+def assignTzoFromPreviousDay(df, i, previousDayTz):
+    """Fill row i's estimate using the timezone of the previous row.
+
+    Row i is marked as a "DEVICE" estimate, takes previousDayTz as its
+    timezone, gets the offset of that timezone on the row's own date,
+    copies "est.timeProcessing" from row i-1, and is annotated with
+    "tz-inferred-from-prev-day".
+
+    Args:
+        df: day-level DataFrame; written to in place through .loc.
+        i: index label of the row to fill.
+        previousDayTz: timezone name to assign.
+
+    Returns:
+        df with row i updated.
+    """
+    df.loc[i, ["est.type"]] = "DEVICE"
+    df.loc[i, ["est.timezone"]] = previousDayTz
+
+    # the offset is looked up for this row's date, not copied from row i-1
+    rowDate = pd.to_datetime(df.loc[i, "date"])
+    df.loc[i, ["est.timezoneOffset"]] = \
+        getTimezoneOffset(rowDate, previousDayTz)
+
+    previousTimeProcessing = df.loc[i - 1, "est.timeProcessing"]
+    df.loc[i, ["est.timeProcessing"]] = previousTimeProcessing
+
+    return addAnnotation(df, i, "tz-inferred-from-prev-day")
+
+
+def assignTzoFromDeviceTzo(df, i, device):
+    """Fill row i's estimate straight from a device's timezone offset.
+
+    Row i is marked as a "DEVICE" estimate, takes the offset from
+    "<device>.timezoneOffset" and the time processing from
+    "<device>.upload.imputed.timeProcessing", and is annotated with
+    "likely-travel" and "tzo-from-<device>". "est.timezone" is not set.
+
+    Args:
+        df: day-level DataFrame; written to in place through .loc.
+        i: index label of the row to fill.
+        device: column prefix of the device.
+
+    Returns:
+        df with row i updated.
+    """
+    df.loc[i, ["est.type"]] = "DEVICE"
+
+    deviceTzo = df.loc[i, device + ".timezoneOffset"]
+    df.loc[i, ["est.timezoneOffset"]] = deviceTzo
+
+    deviceTimeProcessing = \
+        df.loc[i, device + ".upload.imputed.timeProcessing"]
+    df.loc[i, ["est.timeProcessing"]] = deviceTimeProcessing
+
+    for note in ("likely-travel", "tzo-from-" + device):
+        df = addAnnotation(df, i, note)
+
+    return df
+
+
+def compareDeviceTzoToPrevDayTzo(df, sIdx, device):
+    """Classify each candidate day by comparing its device offset to a
+    reference offset, and fill in or annotate the estimate accordingly.
+
+    For every index in sIdx greater than 0 the "<device>.timezoneOffset"
+    of that row is compared with:
+
+    * the previous row's "est.timezoneOffset" when that row has one, or
+    * the row's own "home.imputed.timezoneOffset" otherwise.
+
+    Depending on the comparison the row is filled from the previous day,
+    filled from the device, or marked "UNCERTAIN" with an annotation
+    ("likely-dst-error-OR-travel", "likely-15-min-dst-error",
+    "likely-AM-PM-error"). Some paths leave the row untouched.
+
+    Args:
+        df: day-level DataFrame; rows are modified in place through .loc.
+        sIdx: index of candidate rows (must support boolean masking with
+            sIdx > 0).
+        device: column prefix of the device whose offsets are used.
+
+    Returns:
+        df with the processed rows updated.
+    """
+
+    # index 0 is skipped because every iteration looks at row i-1
+    for i in sIdx[sIdx > 0]:
+
+        # first see if the previous record has a tzo
+        if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])):
+
+            previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i)
+            timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) -
+                           df.loc[i-1, "est.timezoneOffset"])
+
+            # next see if the previous record has a tz
+            if (pd.notnull(df.loc[i-1, "est.timezone"])):
+
+                if timeDiff == 0:
+                    # the return value is not needed here: the helper
+                    # writes to df in place through .loc
+                    assignTzoFromPreviousDay(df, i, previousDayTz)
+
+                # see if the previous day's tzo and device tzo are within the
+                # dst range (as that is a common problem with this data)
+                elif ((df.loc[i, device + ".timezoneOffset"] in dstRange)
+                      & (df.loc[i-1, "est.timezoneOffset"] in dstRange)):
+
+                    # then see if it is DST change day
+                    if isDSTChangeDay(df.loc[i, "date"], previousDayTz):
+
+                        df = addAnnotation(df, i, "dst-change-day")
+                        assignTzoFromPreviousDay(df, i, previousDayTz)
+
+                    # if it is not DST change day, then mark this as uncertain
+                    else:
+                        # also, check to see if the difference between device.
+                        # tzo and prev.tzo is less than the expected dst
+                        # difference. There is a known issue where the BtUTC
+                        # procedure puts clock drift into the device.tzo,
+                        # and as a result the tzo can be off by 15, 30,
+                        # or 45 minutes.
+                        # both offsets sit exactly on an end of the dst range
+                        if (((df.loc[i, device + ".timezoneOffset"] ==
+                              min(dstRange)) |
+                            (df.loc[i, device + ".timezoneOffset"] ==
+                             max(dstRange))) &
+                           ((df.loc[i-1, "est.timezoneOffset"] ==
+                             min(dstRange)) |
+                            (df.loc[i-1, "est.timezoneOffset"] ==
+                             max(dstRange)))):
+
+                            df.loc[i, ["est.type"]] = "UNCERTAIN"
+                            df = addAnnotation(df, i,
+                                               "likely-dst-error-OR-travel")
+
+                        # at least one offset falls strictly inside the range
+                        else:
+
+                            df.loc[i, ["est.type"]] = "UNCERTAIN"
+                            df = addAnnotation(df, i,
+                                               "likely-15-min-dst-error")
+
+                # next see if time difference between device.tzo and prev.tzo
+                # is off by 720 minutes, which is indicative of a common
+                # user AM/PM error
+                elif timeDiff == 720:
+                    df.loc[i, ["est.type"]] = "UNCERTAIN"
+                    df = addAnnotation(df, i, "likely-AM-PM-error")
+
+                # if it doesn't fall into any of these cases, then the
+                # tzo difference is likely due to travel
+                else:
+                    df = assignTzoFromDeviceTzo(df, i, device)
+
+            # the previous record has a tzo but no tz: only accept the device
+            # tzo when it matches the previous tzo; otherwise the row is left
+            # untouched
+            elif timeDiff == 0:
+                df = assignTzoFromDeviceTzo(df, i, device)
+
+        # if there is no previous record to compare with check for dst errors,
+        # and if there are no errors, it is likely a travel day
+        else:
+
+            # NOTE(review): tzAndTzoRangeWithHomeTz is defined elsewhere;
+            # presumably it returns the home timezone and its offset range
+            # for row i -- confirm against its definition
+            comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i)
+            timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) -
+                           df.loc[i, "home.imputed.timezoneOffset"])
+
+            if ((df.loc[i, device + ".timezoneOffset"] in dstRange)
+               & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)):
+
+                # see if it is DST change day
+                if isDSTChangeDay(df.loc[i, "date"], comparisonTz):
+
+                    df = addAnnotation(df, i, "dst-change-day")
+                    df.loc[i, ["est.type"]] = "DEVICE"
+                    df.loc[i, ["est.timezoneOffset"]] = \
+                        df.loc[i, device + ".timezoneOffset"]
+                    df.loc[i, ["est.timezone"]] = \
+                        df.loc[i, "home.imputed.timezone"]
+                    df.loc[i, ["est.timeProcessing"]] = \
+                        df.loc[i, device + ".upload.imputed.timeProcessing"]
+
+                # if it is not DST change day, then mark this as uncertain
+                else:
+                    # also, check to see if the difference between device.
+                    # tzo and home.tzo is less than the expected dst
+                    # difference. There is a known issue where the BtUTC
+                    # procedure puts clock drift into the device.tzo,
+                    # and as a result the tzo can be off by 15, 30,
+                    # or 45 minutes.
+                    if (((df.loc[i, device + ".timezoneOffset"] ==
+                          min(dstRange)) |
+                        (df.loc[i, device + ".timezoneOffset"] ==
+                         max(dstRange))) &
+                       ((df.loc[i, "home.imputed.timezoneOffset"] ==
+                         min(dstRange)) |
+                        (df.loc[i, "home.imputed.timezoneOffset"] ==
+                         max(dstRange)))):
+
+                        df.loc[i, ["est.type"]] = "UNCERTAIN"
+                        df = addAnnotation(df, i, "likely-dst-error-OR-travel")
+
+                    else:
+
+                        df.loc[i, ["est.type"]] = "UNCERTAIN"
+                        df = addAnnotation(df, i, "likely-15-min-dst-error")
+
+            # next see if time difference between device.tzo and home.tzo
+            # is off by 720 minutes, which is indicative of a common
+            # user AM/PM error
+            elif timeDiff == 720:
+                df.loc[i, ["est.type"]] = "UNCERTAIN"
+                df = addAnnotation(df, i, "likely-AM-PM-error")
+
+            # if it doesn't fall into any of these cases, then the
+            # tzo difference is likely due to travel
+
+            else:
+                df = assignTzoFromDeviceTzo(df, i, device)
+
+    return df
+
+
+def getImputIndices(df, sIdx, hIdx):
+    """Locate the first gap row and the rows that bracket the gap.
+
+    Args:
+        df: DataFrame being imputed; only its length is used.
+        sIdx: index of the rows that still need an estimate.
+        hIdx: index of the rows that already have an offset.
+
+    Returns:
+        Tuple of (smallest index in sIdx, that index minus 1, and the
+        nearest index in hIdx at or after it, capped at len(df) - 1).
+
+    Raises:
+        ValueError: when no index in hIdx is at or after the first gap row.
+    """
+    firstGapIdx = sIdx.min()
+
+    # distance from the first gap row to every row that has an offset
+    distances = pd.Series(hIdx) - firstGapIdx
+    nearestAhead = min(distances[distances >= 0])
+
+    nextIdx = min(firstGapIdx + nearestAhead, len(df) - 1)
+
+    return firstGapIdx, firstGapIdx - 1, nextIdx
+
+
+def imputeByTimezone(df, currentDay, prevDaywData, nextDaywData):
+    """Fill the gap rows between two days that have estimates.
+
+    Rows currentDay .. nextDaywData-1 are treated as the gap:
+
+    * when the rows before and after the gap share the same
+      "est.timezone", the gap rows get that timezone and the offset of
+      that timezone on their own date, and are marked "IMPUTE";
+    * otherwise, when they share the same "est.timezoneOffset", the gap
+      rows get that offset (no timezone) and are marked "IMPUTE";
+    * otherwise, or when prevDaywData is negative, the gap rows are marked
+      "UNCERTAIN" and annotated with "unable-to-impute-tzo".
+
+    Imputed rows are also annotated with "gap=<size>" and get
+    "est.gapSize" set.
+
+    Args:
+        df: day-level DataFrame; written to in place through .loc.
+        currentDay: index of the first gap row.
+        prevDaywData: index of the row just before the gap.
+        nextDaywData: index of the first row after the gap with data.
+
+    Returns:
+        df with the gap rows updated.
+    """
+    gapDays = range(currentDay, nextDaywData)
+    gapSize = (nextDaywData - currentDay)
+
+    def flagAsImputed(frame, rowIdx):
+        # bookkeeping shared by both imputation strategies
+        frame.loc[rowIdx, ["est.type"]] = "IMPUTE"
+        frame = addAnnotation(frame, rowIdx, "gap=" + str(gapSize))
+        frame.loc[rowIdx, ["est.gapSize"]] = gapSize
+        return frame
+
+    def flagGapAsUncertain(frame):
+        for rowIdx in gapDays:
+            frame.loc[rowIdx, ["est.type"]] = "UNCERTAIN"
+            frame = addAnnotation(frame, rowIdx, "unable-to-impute-tzo")
+        return frame
+
+    # without a row before the gap there is nothing to compare against
+    if not prevDaywData >= 0:
+        return flagGapAsUncertain(df)
+
+    # same timezone on both sides of the gap
+    if df.loc[prevDaywData, "est.timezone"] == \
+            df.loc[nextDaywData, "est.timezone"]:
+
+        tz = df.loc[prevDaywData, "est.timezone"]
+
+        for rowIdx in gapDays:
+            df.loc[rowIdx, ["est.timezone"]] = tz
+            df.loc[rowIdx, ["est.timezoneOffset"]] = getTimezoneOffset(
+                pd.to_datetime(df.loc[rowIdx, "date"]), tz
+            )
+            df = flagAsImputed(df, rowIdx)
+
+        return df
+
+    # TODO: this logic should be updated to handle the edge case
+    # where the day before and after the gap have differing TZ, but
+    # the same TZO. In that case the gap should be marked as UNCERTAIN
+    if df.loc[prevDaywData, "est.timezoneOffset"] == \
+            df.loc[nextDaywData, "est.timezoneOffset"]:
+
+        for rowIdx in gapDays:
+            df.loc[rowIdx, ["est.timezoneOffset"]] = \
+                df.loc[prevDaywData, "est.timezoneOffset"]
+            df = flagAsImputed(df, rowIdx)
+
+        return df
+
+    return flagGapAsUncertain(df)
+
+
+def addAnnotation(df, idx, annotationMessage):
+    """Append a message to the "est.annotations" cell of one row.
+
+    Args:
+        df: DataFrame with an "est.annotations" column; written in place.
+        idx: index label of the row to annotate.
+        annotationMessage: text to store.
+
+    Returns:
+        df, where the cell holds annotationMessage if it was null before,
+        or the previous text followed by ", " and annotationMessage.
+    """
+    existing = df.loc[idx, "est.annotations"]
+
+    if pd.isnull(existing):
+        updated = annotationMessage
+    else:
+        updated = existing + ", " + annotationMessage
+
+    df.loc[idx, ["est.annotations"]] = updated
+
+    return df
+
+
+def getTimezoneOffset(currentDate, currentTimezone):
+    """Return the UTC offset, in whole minutes, of a timezone for a date.
+
+    Args:
+        currentDate: naive datetime-like value (e.g. a pandas Timestamp)
+            that pytz can localize.
+        currentTimezone: timezone name accepted by pytz.timezone.
+
+    Returns:
+        int: signed offset from UTC in minutes (e.g. -480, 330, -210).
+    """
+
+    tz = pytz.timezone(currentTimezone)
+    # here we add 1 day to the current date to account for changes to/from DST
+    localized = tz.localize(currentDate + dt.timedelta(days=1))
+
+    # read the offset directly instead of parsing strftime("%z"): splitting
+    # the signed HHMM number with floor() mis-handles negative offsets that
+    # are not whole hours (-0330 came out as -310 minutes instead of -210)
+    tzo = int(round(localized.utcoffset().total_seconds() / 60))
+
+    return tzo
+
+
+def estimate_local_time(df):
+    """Estimate the local time of every record in df.
+
+    Builds a day-level table, adds day series for uploads, cgm and pump
+    records, estimates a timezone and offset per day in three passes
+    (upload records, device offsets, imputation of gaps), and finally
+    applies those estimates to the individual records.
+
+    Args:
+        df: record-level DataFrame with at least "utcTime" and "type"
+            columns. The columns "date", "deviceType" and (when missing)
+            "timezoneOffset" are added to it in place.
+
+    Returns:
+        Tuple of (array of estimated local times, day-level DataFrame).
+    """
+    df["date"] = df["utcTime"].dt.date  # TODO: change this to utcDate later
+    day_series = create_contiguous_day_series(df)
+
+    df["deviceType"] = add_device_type(df)
+    day_series = add_device_day_series(df, day_series, "upload")
+
+    # records without any device offset still need the column to exist
+    if "timezoneOffset" not in df.columns:
+        df["timezoneOffset"] = np.nan
+
+    # create day series for the cgm and pump records that carry an offset
+    for record_type, series_name in (("cbg", "cgm"), ("bolus", "pump")):
+        has_offset = df["timezoneOffset"].notnull()
+        device_df = df[(df["type"] == record_type) & has_offset].copy()
+        day_series = add_device_day_series(device_df, day_series, series_name)
+
+    # interpolate between upload records of the same deviceType, and create a
+    # day series for interpolated pump, non-hk-cgm, and healthkit uploads
+    for upload_device in ("pump", "cgm", "healthkit"):
+        uploads = df[df["deviceType"] == upload_device].copy()
+        series_prefix = upload_device + ".upload.imputed"
+        day_series = impute_upload_records(uploads, day_series, series_prefix)
+
+    # add a home timezone that also accounts for daylight savings time changes
+    day_series = add_home_timezone(df, day_series)
+
+    # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO
+    day_series = estimateTzAndTzoWithUploadRecords(day_series)
+
+    # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE)
+    # estimates can be made from pump and cgm df that have a TZO
+    # NOTE: the healthkit and dexcom-api cgm df are excluded
+    day_series = estimateTzAndTzoWithDeviceRecords(day_series)
+
+    # 3. impute, infer, or interpolate gaps in the estimated tzo and tz
+    day_series = imputeTzAndTzo(day_series)
+
+    # 4. APPLY LOCAL TIME ESTIMATES TO ALL df
+    return applyLocalTimeEstimates(df, day_series), day_series
+
+
+# %% MAIN FUNCTION
+def get_distribution_and_stats(
+        json_data_path,
+        userid,
+        date_stamp,
+        save_data_path
+):
+
+    phi_date = "PHI-" + date_stamp
+
+    output_metadata = os.path.join(
+        save_data_path,
+        phi_date + "-donor-data",
+        phi_date + "-cgm-metadata"
+    )
+
+    output_distribution = os.path.join(
+        save_data_path,
+        phi_date + "-donor-data",
+        phi_date + "-cgm-distributions"
+    )
+    debug_duplicates = os.path.join(
+        save_data_path,
+        phi_date + "-donor-data",
+        phi_date + "-debug-cgm-duplicates"
+    )
+    output_stats = os.path.join(
+        save_data_path,
+        phi_date + "-donor-data",
+        phi_date + "-cgm-stats"
+    )
+
+    make_folder_if_doesnt_exist(
+        [output_metadata, output_distribution, debug_duplicates, output_stats]
+    )
+
+    timezone_aliases = pd.read_csv(
+        "wikipedia-timezone-aliases-2018-04-28.csv",
+        low_memory=False
+    )
+
+    donor_metadata_columns = [
+        'userid',
+        'diagnosisType',
+        'diagnosisDate',
+        'biologicalSex',
+        'birthday',
+        'targetTimezone',
+        'targetDevices',
+        'isOtherPerson',
+    ]
+
+    # load in data or pull in data
+    if pd.notnull(json_data_path):
+        data = pd.read_json(json_data_path)
+
+    else:
+        data, userid = get_data(
+            save_file="false"
+        )
+
+    # load in donor metadata
+    donor_meta_path = os.path.join(
+        save_data_path,
+        phi_date + "-donor-data",
+        phi_date + "-donor-metadata.csv"
+    )
+    if os.path.exists(donor_meta_path):
+
+        all_donor_metadata = pd.read_csv(
+            donor_meta_path,
+            low_memory=False
+        )
+
+        metadata = all_donor_metadata.loc[
+            all_donor_metadata["userid"] == userid,
+            donor_metadata_columns
+        ]
+    else:
+        metadata, _ = get_shared_metadata(
+            donor_group="bigdata",
+            userid_of_shared_user=userid
+        )
+
+    print("starting", userid)
+
+    #  HASH USER ID
+    hashid = hash_userid(userid, os.environ['BIGDATA_SALT'])
+    data["userid"] = userid
+    data["hashid"] = hashid
+    metadata["hashid"] = hashid
+
+    #  CLEAN DATA
+
+    # NOTE: moving remove negative durations to type specific cleaning
+    # TODO: ask backend to change "duration" to only include one object type
+
+    # Tslim calibration bug fix
+    data, n_cal_readings = tslim_calibration_fix(data.copy())
+    metadata["nTandemAndPayloadCalReadings"] = n_cal_readings
+
+    # fix large timezoneOffset bug in utcbootstrapping
+    data = timezone_offset_bug_fix(data.copy())
+
+    # add healthkit timezone information
+    # TODO: refactor this function to only require fields that might have hk tz
+    data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy())
+
+    # convert deprecated timezones to their aliases
+    data = convert_deprecated_timezone_to_alias(data, timezone_aliases)
+
+    #  TIME RELATED ITEMS
+    data["utcTime"] = to_utc_datetime(data[["time"]].copy())
+
+    # add upload time to the data, which is needed for:
+    # getting rid of duplicates and useful for getting local time
+
+    data["uploadTime"] = (
+        add_upload_time(data[["type", "uploadId", "utcTime"]].copy())
+    )
+
+#    # estimate local time (refactor of estimate-local-time.py)
+#    data["localTime"], local_time_metadata = estimate_local_time(data.copy())
+#
+# TODO: fix this issue with estimate local time
+#    '''
+#    //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649
+#    FutureWarning: elementwise comparison failed; returning scalar instead,
+#    but in the future will perform elementwise comparison result = method(y)
+#    '''
+
+    # round all data to the nearest 5 minutes
+    data["roundedUtcTime"] = round_time(
+        data["utcTime"].copy(),
+        time_interval_minutes=5,
+        start_with_first_record=True,
+        return_calculation_columns=False
+    )
+
+    #  TIME CATEGORIES
+    data["date"] = data["roundedUtcTime"].dt.date
+
+    # AGE, & YLW
+    # TODO: make this a function
+    if pd.notnull(metadata["birthday"].values[0]):
+        bDate = pd.to_datetime(metadata["birthday"].values[0][0:7])
+        data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25)
+    else:
+        data["age"] = np.nan
+
+    if pd.notnull(metadata["diagnosisDate"].values[0]):
+        dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7])
+        data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25)
+    else:
+        data["ylw"] = np.nan
+
+    #  GROUP DATA BY TYPE
+    # first sort by upload time (used when removing duplicates)
+    data.sort_values("uploadTime", ascending=False, inplace=True)
+    groups = data.groupby(by="type")
+
+    # check to see if person is looping
+    if "basal" in data["type"].unique():
+        basal = groups.get_group("basal").dropna(axis=1, how="all")
+        if "deliveryType" in list(basal):
+            bd = basal.loc[
+                basal["deliveryType"] == "temp",
+                ["date", "deliveryType"]
+            ]
+            temp_basal_counts = (
+                pd.DataFrame(
+                    bd.groupby("date").deliveryType.count()
+                ).reset_index()
+            )
+            temp_basal_counts.rename(
+                {"deliveryType": "tempBasalCounts"},
+                axis=1,
+                inplace=True
+            )
+            data = pd.merge(data, temp_basal_counts, on="date", how="left")
+            # >= 25 temp basals per day is likely looping
+            data["isLoopDay"] = data["tempBasalCounts"] >= 25
+            # redefine groups with the new data
+            groups = data.groupby(by="type")
+
+        else:
+            data["isLoopDay"] = np.nan
+    else:
+        data["isLoopDay"] = np.nan
+
+    # %% CGM DATA
+    if "cbg" in data["type"].unique():
+        # flag that this donor has cgm data
+        metadata["cgmData"] = True
+
+        # filter by cgm
+        cgm = groups.get_group("cbg").copy()
+
+        # sort data
+        cgm.sort_values("roundedUtcTime", ascending=False, inplace=True)
+        cgm.reset_index(drop=False, inplace=True)
+
+        # calculate cgm in mg/dL
+        cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL)
+
+        # get rid of spike data
+        cgm, nSpike = remove_spike_data(cgm.copy())
+        metadata["nSpike"] = nSpike
+
+        # assign upload cgm device info to cgm records in that upload
+        cgm = add_upload_info_to_cgm_records(groups, cgm.copy())
+
+        # check to see if cgm info exists in healthkit locations
+        cgm = expand_heathkit_cgm_fields(cgm.copy())
+
+        # replace smoothed cgm values with raw values (if they exist)
+        # this must run after expand_heathkit_cgm_fields _
+        cgm["mg/dL"], metadata["nSmoothedCgmReplaced"] = (
+            replace_smoothed_cgm_values(cgm.copy())
+        )
+
+        # get cgm models
+        cgm["cgmModel"], cgm["cgmModelSensedFrom"] = np.nan, np.nan
+
+        # dexcom cgm models (G4, G5, G6)
+        cgm[["cgmModel", "cgmModelSensedFrom"]] = (
+            get_dexcom_cgm_model(cgm.copy())
+        )
+
+        # for non dexcom cgms
+        # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem
+        cgm[["cgmModel", "cgmModelSensedFrom"]] = (
+            get_non_dexcom_cgm_model(cgm.copy())
+        )
+
+        # get metadata on cgm models and devices
+        metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum()
+        metadata["uniqueCgmModels"] = str(cgm["cgmModel"].unique())
+        if "deviceId" in list(cgm):
+            metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique())
+
+        #  clean distributions
+        # break up all traces by cgm model
+        combined_cgm_series = pd.DataFrame()
+        cgm_models = cgm.groupby(by="cgmModel")
+
+        for cgm_model in cgm_models.groups.keys():
+            print("working on", cgm_model)
+            temp_cgm = cgm_models.get_group(cgm_model)
+
+            # get rid of cgm values too low/high (< 38 & > 402 mg/dL)
+            temp_cgm, nInvalidCgmValues = remove_invalid_cgm_values(temp_cgm)
+            metadata["nInvalidCgmValues." + cgm_model] = nInvalidCgmValues
+
+            # sort by upload time before getting rid of duplicates
+            temp_cgm.sort_values("uploadTime", ascending=False, inplace=True)
+
+            # get rid of duplicates that have the same ["deviceTime", "mg/dL"]
+            temp_cgm, n_cgm_dups_removed = (
+                removeCgmDuplicates(temp_cgm, "deviceTime", "mg/dL")
+            )
+            metadata["nCgmDuplicatesRemovedDeviceTime." + cgm_model] = (
+                n_cgm_dups_removed
+            )
+
+            # get rid of duplicates that have the same ["time", "mg/dL"]
+            temp_cgm, n_cgm_dups_removed = (
+                removeCgmDuplicates(temp_cgm, "utcTime", "mg/dL")
+            )
+            metadata["nCgmDuplicatesRemovedUtcTime." + cgm_model] = (
+                n_cgm_dups_removed
+            )
+
+            # get rid of duplicates that have the same roundedTime
+            temp_cgm, n_cgm_dups_removed = (
+                removeDuplicates(temp_cgm, "roundedUtcTime")
+            )
+            metadata["nCgmDuplicatesRemovedRoundedTime." + cgm_model] = (
+                n_cgm_dups_removed
+            )
+
+            # create a contiguous 5 minute time series
+            first_day = temp_cgm["roundedUtcTime"].min()
+            metadata["firstCgm." + cgm_model] = first_day
+
+            last_day = temp_cgm["roundedUtcTime"].max()
+            metadata["lastCgm." + cgm_model] = last_day
+
+            rng = pd.date_range(first_day, last_day, freq="5min")
+            contiguous_data = pd.DataFrame(
+                rng,
+                columns=["roundedUtcTime"]
+            ).sort_values(
+                "roundedUtcTime",
+                ascending=False
+            ).reset_index(drop=True)
+
+            # merge with cgm data
+            cgm_series = pd.merge(
+                contiguous_data,
+                temp_cgm[[
+                    "roundedUtcTime", "hashid", "isLoopDay",
+                    "cgmModel", "age", "ylw", "mg/dL"
+                 ]],
+                on="roundedUtcTime",
+                how="left"
+            )
+
+            # sort so that the oldest data point is on top
+            cgm_series.sort_values(
+                "roundedUtcTime", ascending=True, inplace=True
+            )
+            cgm_series.reset_index(drop=True, inplace=True)
+
+            # get dexcom icgm bins
+            value_bins = np.array(
+                [37, 39, 60, 80, 120, 160, 200, 250, 300, 350, 400, 403]
+            )
+            value_bin_names = (
+                "< 40", "40-60", "61-80", "81-120", "121-160", "161-200",
+                "201-250", "251-300", "301-350", "351-400", "> 400"
+            )
+            cgm_series["valueBin"] = pd.cut(
+                cgm_series["mg/dL"], value_bins, labels=value_bin_names
+            )
+
+            # get the previous val
+            cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1)
+
+            # get difference between current and previous val
+            cgm_series["diffFromPrevVal"] = (
+                cgm_series["mg/dL"] - cgm_series["previousVal"]
+            )
+
+            # calculate the rate from previous value (mg/dL/min)
+            cgm_series["rateFromPrevVal"] = cgm_series["diffFromPrevVal"] / 5
+
+            # get dexcom icgm rate bins
+            rate_bins = np.array(
+                [-100, -2.000001, -1.000001, -0.000001, 1, 2, 100]
+            )
+            # NOTE: bracket means include, parentheses means exclude
+            rate_bin_names = (
+                "< -2", "[-2,-1)", "[-1,-0)", "[0,1]", "(1,2]", ">2",
+            )
+            cgm_series["rateBin"] = pd.cut(
+                cgm_series["rateFromPrevVal"], rate_bins, labels=rate_bin_names
+            )
+
+            # throw in the joint value-and-rate category
+            cgm_series["valAndRateBin"] = (
+                cgm_series["valueBin"].astype(str)
+                + " & "
+                + cgm_series["rateBin"].astype(str)
+            )
+
+            # calculate slope (mg/dL/min) over the last 15, 30, and 60 minutes
+            cgm_series["slope15"] = (
+                cgm_series["mg/dL"].rolling(3).apply(get_slope, raw=True)
+            )
+
+            cgm_series["slope30"] = (
+                cgm_series["mg/dL"].rolling(6).apply(get_slope, raw=True)
+            )
+
+            cgm_series["slope60"] = (
+                cgm_series["mg/dL"].rolling(12).apply(get_slope, raw=True)
+            )
+
+            # add in the next value
+            cgm_series["nextVal"] = cgm_series["mg/dL"].shift(-1)
+
+            # get difference or relative increase/decrease of next value
+            cgm_series["relativeNextValue"] = (
+                cgm_series["nextVal"] - cgm_series["mg/dL"]
+            )
+
+            # rate of next value
+            cgm_series["rateToNextVal"] = cgm_series["relativeNextValue"] / 5
+
+            # drop rows where there is no information
+            cgm_series.dropna(subset=['hashid'], inplace=True)
+            metadata["nCgmDataPoints." + cgm_model] = len(cgm_series)
+
+            # append cgm model to a larger table
+            combined_cgm_series = pd.concat(
+                [combined_cgm_series, cgm_series],
+                ignore_index=True
+            )
+        if len(combined_cgm_series) > 0:
+            # sort so that the oldest data point is on top
+            # and that the G5_G6 get deleted if they are a part of a duplicate
+            combined_cgm_series["cgmModel_G5_and_G6"] = (
+                combined_cgm_series["cgmModel"] == "G5_G6"
+            )
+            combined_cgm_series.sort_values(
+                by=["roundedUtcTime", "cgmModel_G5_and_G6", "cgmModel"],
+                ascending=[False, True, False],
+                inplace=True
+            )
+
+            combined_cgm_series.reset_index(drop=True, inplace=True)
+
+            # add in check to see if there are duplicates between cgm devices
+            nUnique_cgm_times = len(combined_cgm_series["roundedUtcTime"].unique())
+            cgm_len = len(combined_cgm_series)
+            metadata["duplicateCgmDataIssue"] = nUnique_cgm_times != cgm_len
+
+            nDuplicate_cgm = cgm_len - nUnique_cgm_times
+            metadata["nDuplicateCgmDataIssues"] = nDuplicate_cgm
+
+            # if there are still duplicates, get rid of them
+            if nDuplicate_cgm > 0:
+                # save the duplicates for further examination
+                combined_cgm_series.to_csv(os.path.join(
+                    debug_duplicates,
+                    "PHI-" + userid + "-cgm-series-has-cgm-duplicates.csv.gz"
+                ))
+
+                cgm.to_csv(os.path.join(
+                    debug_duplicates,
+                    "PHI-" + userid + "-cgm-data-has-cgm-duplicates.csv.gz"
+                ))
+
+                # get rid of duplicates
+                combined_cgm_series, n_cgm_dups_removed = (
+                    removeDuplicates(combined_cgm_series, "roundedUtcTime")
+                )
+                metadata["nCgmDuplicatesRemovedRoundedTime.atEnd"] = (
+                    n_cgm_dups_removed
+                )
+            metadata["nCgmDataPoints.atEnd"] = len(combined_cgm_series)
+
+            # add whether data is dexcom cgm or not
+            combined_cgm_series["dexcomCgm"] = (
+                combined_cgm_series["cgmModel"].astype(str).str.contains("G4|G5|G6")
+            )
+
+            # save distribution data
+            combined_cgm_series.to_csv(os.path.join(
+                output_distribution,
+                "PHI-" + userid + "-cgm-distribution.csv.gz"
+            ))
+
+            # %% get cgm stats
+            # create a contiguous 5 minute time series of ALL cgm data
+            first_day = combined_cgm_series["roundedUtcTime"].min()
+            metadata["firstCgm." + cgm_model] = first_day
+
+            last_day = combined_cgm_series["roundedUtcTime"].max()
+            metadata["lastCgm." + cgm_model] = last_day
+
+            rng = pd.date_range(first_day, last_day, freq="5min")
+            contiguous_data = pd.DataFrame(
+                rng,
+                columns=["roundedUtcTime"]
+            ).sort_values(
+                "roundedUtcTime",
+                ascending=True
+            ).reset_index(drop=True)
+
+            # merge with combined_cgm_series data
+            all_cgm = pd.merge(
+                contiguous_data,
+                combined_cgm_series[[
+                    'roundedUtcTime', 'hashid', 'cgmModel', 'dexcomCgm',
+                    'age', 'ylw', 'isLoopDay', 'mg/dL',
+                ]],
+                on="roundedUtcTime",
+                how="left"
+            )
+
+            # get cgm stats
+            # get a binary (T/F) of whether we have a cgm value
+            all_cgm["hasCgm"] = all_cgm["mg/dL"].notnull()
+
+            # fill isLoopDay nan with False
+            all_cgm["isLoopDay"].fillna(False, inplace=True)
+
+            # has loop and cgm
+            all_cgm["hasLoopAndCgm"] = (
+                (all_cgm["isLoopDay"]) & (all_cgm["hasCgm"])
+            )
+
+            all_cgm["hasCgmWithoutLoop"] = (
+                (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"])
+            )
+
+            # work with all of the non-null data, even 39 = LOW and 401 = HIGH
+            ts39_401 = all_cgm["mg/dL"].copy()
+
+            # some stats should NOT include 39 or 401
+            all_cgm["mg/dL.40to400"] = (
+                ts39_401.replace(to_replace=39, value=np.nan)
+            )
+
+            all_cgm["mg/dL.40to400"] = (
+                all_cgm["mg/dL.40to400"].replace(
+                    to_replace=401,
+                    value=np.nan
+                )
+            )
+
+            ts40_400 = all_cgm["mg/dL.40to400"].copy()
+
+            # for all the less than (<) criteria
+            for cgm_threshold in [40, 54, 70]:
+                all_cgm["cgm < " + str(cgm_threshold)] = (
+                    ts39_401.lt(cgm_threshold)
+                )
+                # get episodes below these thresholds
+                for min_duration in [5, 15]:
+                    episode_ts = get_episodes(
+                        all_cgm[[
+                            "roundedUtcTime",
+                            "hasCgm",
+                            "cgm < " + str(cgm_threshold)
+                        ]].copy(),
+                        episode_criterion="cgm < " + str(cgm_threshold),
+                        min_duration=min_duration
+                    )
+                    all_cgm = pd.concat([all_cgm, episode_ts], axis=1)
+
+            # for all the greter than or equal to (>=) criteria
+                all_cgm["cgm >= " + str(cgm_threshold)] = (
+                    ts39_401.ge(cgm_threshold)
+                )
+
+            # for all the the less than or equal to (<=) criteria
+            for cgm_threshold in [140, 180, 250, 300, 400]:
+                all_cgm["cgm <= " + str(cgm_threshold)] = (
+                    ts39_401.le(cgm_threshold)
+                )
+            # for all the the greter than (>) criteria
+                all_cgm["cgm > " + str(cgm_threshold)] = (
+                    ts39_401.gt(cgm_threshold)
+                )
+
+            # get all of the cgm ranges
+            # (cgm >= 40) & (cgm < 54)
+            all_cgm["40 <= cgm < 54"] = (
+                (all_cgm["cgm >= 40"]) & (all_cgm["cgm < 54"])
+            )
+
+            # (cgm >= 54) & (cgm < 70)
+            all_cgm["54 <= cgm < 70"] = (
+                (all_cgm["cgm >= 54"]) & (all_cgm["cgm < 70"])
+            )
+
+            # (cgm >= 70) & (cgm <= 140)
+            all_cgm["70 <= cgm <= 140"] = (
+                (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 140"])
+            )
+
+            # (cgm >= 70) & (cgm <= 180)
+            all_cgm["70 <= cgm <= 180"] = (
+                (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 180"])
+            )
+
+            # (cgm > 180) & (cgm <= 250)
+            all_cgm["180 < cgm <= 250"] = (
+                (all_cgm["cgm > 180"]) & (all_cgm["cgm <= 250"])
+            )
+
+            # (cgm > 250) & (cgm <= 400)
+            all_cgm["250 < cgm <= 400"] = (
+                (all_cgm["cgm > 250"]) & (all_cgm["cgm <= 400"])
+            )
+
+            # derfine the windows to calculate the stats over
+            window_names = ["hour", "day", "week", "month", "quarter", "year"]
+            window_lengths = [12,    288,   288*7,  288*7*4, 288*90,   288*365]
+
+            for w_name, w_len in zip(window_names, window_lengths):
+                # require lenth of window for percent calculations
+                w_min = w_len
+
+                # get the start and end times for each window
+                all_cgm[w_name + ".startTime"] = (
+                    all_cgm["roundedUtcTime"].shift(w_len - 1)
+                )
+                all_cgm[w_name + ".endTime"] = all_cgm["roundedUtcTime"]
+
+                # add majority age for the time period
+                all_cgm[w_name + ".age"] = np.round(
+                    all_cgm["age"].rolling(
+                        min_periods=1,
+                        window=w_len
+                    ).mean()
+                )
+
+                # add majority ylw for the time period
+                all_cgm[w_name + ".ylw"] = np.round(
+                    all_cgm["ylw"].rolling(
+                        min_periods=1,
+                        window=w_len
+                    ).median()
+                )
+
+                # get percent time cgm used
+                all_cgm[w_name + ".cgmPercent"] = (
+                    all_cgm["hasCgm"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # get the total number of non-null values over this time period
+                all_cgm[w_name + ".missingCgmPercent"] = (
+                    1 - all_cgm[w_name + ".cgmPercent"]
+                )
+
+                # create (T/F) 70 and 80 percent available thresholds
+                # which will be useful for processing later
+                all_cgm[w_name + ".ge70Available"] = (
+                    all_cgm[w_name + ".cgmPercent"] >= 0.7
+                )
+
+                all_cgm[w_name + ".ge80Available"] = (
+                    all_cgm[w_name + ".cgmPercent"] >= 0.8
+                )
+
+                # get percent time Loop was used NOTE: this is
+                # approximate because we use > 24 temp basals per day
+                # ALSO: this is percent time Loop was used while cgm in use
+                all_cgm[w_name + ".loopingAndCgmPercent"] = (
+                    all_cgm["hasLoopAndCgm"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # percent of time cgm without loop
+                all_cgm[w_name + ".cgmWithoutLoopPercent"] = (
+                    all_cgm["hasCgmWithoutLoop"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # get episode stats
+                # TODO: add in hyper events
+                # get episodes below these thresholds
+                for cgm_threshold in [40, 54, 70]:
+                    # get number of episodes per time window
+                    for min_duration in [5, 15]:
+                        "cgm < " + str(cgm_threshold)
+                        episode_name = (
+                            "episode.cgm < " + str(cgm_threshold)
+                            + ".durationThreshold=" + str(min_duration)
+                        )
+                        all_cgm[w_name + ".count." + episode_name] = (
+                            all_cgm[episode_name + ".episodeStart"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).sum()
+                        )
+
+                        # get avg. duration of each episode per time window
+                        all_cgm[w_name + ".avgDuration." + episode_name] = (
+                            all_cgm[episode_name + ".episodeTotalDuration"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).sum() / all_cgm[w_name + ".count." + episode_name]
+                        )
+
+                        # get min duration of each episode per time window
+                        all_cgm[w_name + ".minDuration." + episode_name] = (
+                            all_cgm[episode_name + ".episodeTotalDuration"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).min()
+                        )
+
+                        # get median duration of each episode per time window
+                        all_cgm[w_name + ".medianDuration." + episode_name] = (
+                            all_cgm[episode_name + ".episodeTotalDuration"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).median()
+                        )
+
+                        # get max duration of each episode per time window
+                        all_cgm[w_name + ".maxDuration." + episode_name] = (
+                            all_cgm[episode_name + ".episodeTotalDuration"].rolling(
+                                min_periods=1,
+                                window=w_len
+                            ).max()
+                        )
+
+                # get percent time in different ranges
+                # % Time < 54
+                all_cgm[w_name + ".lt54Percent"] = (
+                    all_cgm["cgm < 54"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time in 54-70 (cgm >= 54) & (cgm < 70)
+                all_cgm[w_name + ".bt54_70Percent"] = (
+                    all_cgm["54 <= cgm < 70"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time in target range (cgm >= 70) & (cgm <= 180)
+                all_cgm[w_name + ".bt70_180Percent"] = (
+                    all_cgm["70 <= cgm <= 180"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time in 180-250 (cgm > 180) & (cgm <= 250)
+                all_cgm[w_name + ".bt180_250Percent"] = (
+                    all_cgm["180 < cgm <= 250"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time > 250
+                all_cgm[w_name + ".gt250Percent"] = (
+                    all_cgm["cgm > 250"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # check that all of the percentages add of to 1 or 100%
+                all_cgm[w_name + ".percentCheck"] = (
+                     all_cgm[w_name + ".missingCgmPercent"]
+                     + all_cgm[w_name + ".lt54Percent"]
+                     + all_cgm[w_name + ".bt54_70Percent"]
+                     + all_cgm[w_name + ".bt70_180Percent"]
+                     + all_cgm[w_name + ".bt180_250Percent"]
+                     + all_cgm[w_name + ".gt250Percent"]
+                )
+
+                # here are some other less common percent time in ranges
+                # % Time < 70
+                all_cgm[w_name + ".lt70Percent"] = (
+                    all_cgm["cgm < 70"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # % Time in target range (cgm >= 70) & (cgm <= 140)
+                all_cgm[w_name + ".tir70to140Percent"] = (
+                    all_cgm["70 <= cgm <= 140"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # percent time above a threshold
+                # % Time > 180
+                all_cgm[w_name + ".gt180Percent"] = (
+                    all_cgm["cgm > 180"].rolling(
+                        min_periods=w_min,
+                        window=w_len
+                    ).sum() / w_len
+                )
+
+                # quantiles
+                # NOTE: this will increase run time, so only run if you need
+                # 3-4X the processing time since it has to sort the data
+                # TODO: make this an option to the function, once it is made
+                # create a rolling object
+
+                # NOTE: these calculations only require 3 points to make
+                roll39_401 = ts39_401.rolling(min_periods=3, window=w_len)
+                roll40_400 = ts40_400.rolling(min_periods=3, window=w_len)
+
+                # min
+                all_cgm[w_name + ".min"] = roll39_401.min()
+
+                # 10, 25, 75, and 90th percentiles
+                all_cgm[w_name + ".10th"] = roll39_401.quantile(0.10)
+                all_cgm[w_name + ".25th"] = roll39_401.quantile(0.25)
+                all_cgm[w_name + ".75th"] = roll39_401.quantile(0.75)
+                all_cgm[w_name + ".90th"] = roll39_401.quantile(0.90)
+
+                # max
+                all_cgm[w_name + ".max"] = roll39_401.max()
+
+                # median
+                all_cgm[w_name + ".median"] = roll39_401.median()
+
+                # iqr
+                all_cgm[w_name + ".iqr"] = (
+                    all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"]
+                )
+
+                # recalcuate percent of measurements available
+                all_cgm[w_name + ".40to400availablePercent"] = (
+                    roll40_400.count() / w_len
+                )
+
+                # get the total number of non-null values over this time period
+                all_cgm[w_name + ".40to400missingPercent"] = (
+                    1 - all_cgm[w_name + ".40to400availablePercent"]
+                )
+
+                all_cgm[w_name + ".40to400ge70Available"] = (
+                    all_cgm[w_name + ".40to400availablePercent"] >= 0.7
+                )
+
+                all_cgm[w_name + ".40to400ge80Available"] = (
+                    all_cgm[w_name + ".40to400availablePercent"] >= 0.8
+                )
+
+                # mean
+                all_cgm[w_name + ".mean"] = roll40_400.mean()
+
+                # GMI(%) = 3.31 + 0.02392 x [mean glucose in mg/dL]
+                all_cgm[w_name + ".gmi"] = (
+                    3.31 + (0.02392 * all_cgm[w_name + ".mean"])
+                )
+
+                # standard deviation (std)
+                all_cgm[w_name + ".std"] = roll40_400.std()
+
+                # coefficient of variation (cov) = std / mean
+                all_cgm[w_name + ".cov"] = (
+                    all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"]
+                )
+
+            # %% save cgm stats data
+            all_cgm.to_csv(os.path.join(
+                output_stats,
+                "PHI-" + userid + "-cgm-stats.csv.gz"
+            ))
+            # write the most recent example of the 90 day stats
+            # to the metadata
+            quarter_ge80Available_idx = (
+                all_cgm[all_cgm["quarter.ge80Available"]]
+            ).index.max()
+
+            if pd.notnull(quarter_ge80Available_idx):
+                # get the most recent quarter
+                most_recent = all_cgm.loc[
+                    [quarter_ge80Available_idx],
+                    all_cgm.columns
+                ]
+            else:
+                most_recent = all_cgm.loc[
+                    [all_cgm.index.max()],
+                    all_cgm.columns
+                ]
+
+            metadata = pd.merge(
+                metadata,
+                most_recent,
+                on="hashid",
+                how="left"
+            )
+
+        print(metadata.T)
+
+    else:
+        metadata["cgmData"] = False
+        print(userid, " has no cgm data")
+
+    # save metadata
+    metadata.to_csv(os.path.join(
+        output_metadata,
+        "PHI-" + userid + "-cgm-metadata.csv.gz"
+    ))
+
+    print("finished with", userid, "\n")
+
+    return
+
+
+# %% MAIN
+if __name__ == "__main__":
+    # USER INPUTS (choices to be made in order to run the code)
+    codeDescription = "get distribution and stats for donor json data"
+    parser = argparse.ArgumentParser(description=codeDescription)
+
+    parser.add_argument(
+        "-i",
+        "--input-json-data-path",
+        dest="json_data_path",
+        default=np.nan,
+        help=(
+            "the path where the json data is located, defaults to none and"
+            + " will download your data using  your Tidepool credentials"
+        )
+    )
+
+    parser.add_argument(
+        "-u",
+        "--userid",
+        dest="userid",
+        default=np.nan,
+        help="userid and filename"
+    )
+
+    parser.add_argument(
+        "-d",
+        "--date-stamp",
+        dest="date_stamp",
+        default=dt.datetime.now().strftime("%Y-%m-%d"),
+        help="date, in '%Y-%m-%d' format, of the date when " +
+        "donors were accepted"
+    )
+
+    parser.add_argument(
+        "-o",
+        "--output-data-path",
+        dest="data_path",
+        default=os.path.abspath(
+            os.path.join(
+                os.path.dirname(__file__), "..", "data"
+            )
+        ),
+        help="the output path where the data is stored"
+    )
+
+    args = parser.parse_args()
+
+    # the main function
+    get_distribution_and_stats(
+        json_data_path=args.json_data_path,
+        userid=args.userid,
+        date_stamp=args.date_stamp,
+        save_data_path=args.data_path,
+    )
diff --git a/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv b/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv
new file mode 100644
index 00000000..01370b69
--- /dev/null
+++ b/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv
@@ -0,0 +1,206 @@
+﻿tz,alias
+Africa/Addis_Ababa,Africa/Nairobi
+Africa/Asmara,Africa/Nairobi
+Africa/Bamako,Africa/Abidjan
+Africa/Bangui,Africa/Lagos
+Africa/Banjul,Africa/Abidjan
+Africa/Blantyre,Africa/Maputo
+Africa/Brazzaville,Africa/Lagos
+Africa/Bujumbura,Africa/Maputo
+Africa/Conakry,Africa/Abidjan
+Africa/Dakar,Africa/Abidjan
+Africa/Dar_es_Salaam,Africa/Nairobi
+Africa/Djibouti,Africa/Nairobi
+Africa/Douala,Africa/Lagos
+Africa/Freetown,Africa/Abidjan
+Africa/Gaborone,Africa/Maputo
+Africa/Harare,Africa/Maputo
+Africa/Kampala,Africa/Nairobi
+Africa/Kigali,Africa/Maputo
+Africa/Kinshasa,Africa/Lagos
+Africa/Libreville,Africa/Lagos
+Africa/Lome,Africa/Abidjan
+Africa/Luanda,Africa/Lagos
+Africa/Lubumbashi,Africa/Maputo
+Africa/Lusaka,Africa/Maputo
+Africa/Malabo,Africa/Lagos
+Africa/Maseru,Africa/Johannesburg
+Africa/Mbabane,Africa/Johannesburg
+Africa/Mogadishu,Africa/Nairobi
+Africa/Niamey,Africa/Lagos
+Africa/Nouakchott,Africa/Abidjan
+Africa/Ouagadougou,Africa/Abidjan
+Africa/Porto-Novo,Africa/Lagos
+Africa/Sao_Tome,Africa/Lagos
+Africa/Timbuktu,Africa/Abidjan
+America/Anguilla,America/Port_of_Spain
+America/Antigua,America/Port_of_Spain
+America/Argentina/ComodRivadavia,America/Argentina/Catamarca
+America/Aruba,America/Curacao
+America/Atka,America/Adak
+America/Buenos_Aires,America/Argentina/Buenos_Aires
+America/Catamarca,America/Argentina/Catamarca
+America/Cayman,America/Panama
+America/Coral_Harbour,America/Atikokan
+America/Cordoba,America/Argentina/Cordoba
+America/Dominica,America/Port_of_Spain
+America/Ensenada,America/Tijuana
+America/Fort_Wayne,America/Indiana/Indianapolis
+America/Grenada,America/Port_of_Spain
+America/Guadeloupe,America/Port_of_Spain
+America/Indianapolis,America/Indiana/Indianapolis
+America/Jujuy,America/Argentina/Jujuy
+America/Knox_IN,America/Indiana/Knox
+America/Kralendijk,America/Curacao
+America/Louisville,America/Kentucky/Louisville
+America/Lower_Princes,America/Curacao
+America/Marigot,America/Port_of_Spain
+America/Mendoza,America/Argentina/Mendoza
+America/Montreal,America/Toronto
+America/Montserrat,America/Port_of_Spain
+America/Porto_Acre,America/Rio_Branco
+America/Rosario,America/Argentina/Cordoba
+America/Santa_Isabel,America/Tijuana
+America/Shiprock,America/Denver
+America/St_Barthelemy,America/Port_of_Spain
+America/St_Kitts,America/Port_of_Spain
+America/St_Lucia,America/Port_of_Spain
+America/St_Thomas,America/Port_of_Spain
+America/St_Vincent,America/Port_of_Spain
+America/Tortola,America/Port_of_Spain
+America/Virgin,America/Port_of_Spain
+Antarctica/McMurdo,Pacific/Auckland
+Antarctica/South_Pole,Pacific/Auckland
+Arctic/Longyearbyen,Europe/Oslo
+Asia/Aden,Asia/Riyadh
+Asia/Ashkhabad,Asia/Ashgabat
+Asia/Bahrain,Asia/Qatar
+Asia/Calcutta,Asia/Kolkata
+Asia/Chongqing,Asia/Shanghai
+Asia/Chungking,Asia/Shanghai
+Asia/Dacca,Asia/Dhaka
+Asia/Harbin,Asia/Shanghai
+Asia/Istanbul,Europe/Istanbul
+Asia/Kashgar,Asia/Urumqi
+Asia/Katmandu,Asia/Kathmandu
+Asia/Kuwait,Asia/Riyadh
+Asia/Macao,Asia/Macau
+Asia/Muscat,Asia/Dubai
+Asia/Phnom_Penh,Asia/Bangkok
+Asia/Rangoon,Asia/Yangon
+Asia/Saigon,Asia/Ho_Chi_Minh
+Asia/Tel_Aviv,Asia/Jerusalem
+Asia/Thimbu,Asia/Thimphu
+Asia/Ujung_Pandang,Asia/Makassar
+Asia/Ulan_Bator,Asia/Ulaanbaatar
+Asia/Vientiane,Asia/Bangkok
+Atlantic/Faeroe,Atlantic/Faroe
+Atlantic/Jan_Mayen,Europe/Oslo
+Atlantic/St_Helena,Africa/Abidjan
+Australia/ACT,Australia/Sydney
+Australia/Canberra,Australia/Sydney
+Australia/LHI,Australia/Lord_Howe
+Australia/North,Australia/Darwin
+Australia/NSW,Australia/Sydney
+Australia/Queensland,Australia/Brisbane
+Australia/South,Australia/Adelaide
+Australia/Tasmania,Australia/Hobart
+Australia/Victoria,Australia/Melbourne
+Australia/West,Australia/Perth
+Australia/Yancowinna,Australia/Broken_Hill
+Brazil/Acre,America/Rio_Branco
+Brazil/DeNoronha,America/Noronha
+Brazil/East,America/Sao_Paulo
+Brazil/West,America/Manaus
+Canada/Atlantic,America/Halifax
+Canada/Central,America/Winnipeg
+Canada/Eastern,America/Toronto
+Canada/Mountain,America/Edmonton
+Canada/Newfoundland,America/St_Johns
+Canada/Pacific,America/Vancouver
+Canada/Saskatchewan,America/Regina
+Canada/Yukon,America/Whitehorse
+Chile/Continental,America/Santiago
+Chile/EasterIsland,Pacific/Easter
+Cuba,America/Havana
+Egypt,Africa/Cairo
+Eire,Europe/Dublin
+Etc/GMT+0,Etc/GMT
+Etc/GMT-0,Etc/GMT
+Etc/GMT0,Etc/GMT
+Etc/Greenwich,Etc/GMT
+Etc/Universal,Etc/UTC
+Etc/Zulu,Etc/UTC
+Europe/Belfast,Europe/London
+Europe/Bratislava,Europe/Prague
+Europe/Busingen,Europe/Zurich
+Europe/Guernsey,Europe/London
+Europe/Isle_of_Man,Europe/London
+Europe/Jersey,Europe/London
+Europe/Ljubljana,Europe/Belgrade
+Europe/Mariehamn,Europe/Helsinki
+Europe/Nicosia,Asia/Nicosia
+Europe/Podgorica,Europe/Belgrade
+Europe/San_Marino,Europe/Rome
+Europe/Sarajevo,Europe/Belgrade
+Europe/Skopje,Europe/Belgrade
+Europe/Tiraspol,Europe/Chisinau
+Europe/Vaduz,Europe/Zurich
+Europe/Vatican,Europe/Rome
+Europe/Zagreb,Europe/Belgrade
+GB,Europe/London
+GB-Eire,Europe/London
+GMT,Etc/GMT
+GMT+0,Etc/GMT
+GMT0,Etc/GMT
+GMT-0,Etc/GMT
+Greenwich,Etc/GMT
+Hongkong,Asia/Hong_Kong
+Iceland,Atlantic/Reykjavik
+Indian/Antananarivo,Africa/Nairobi
+Indian/Comoro,Africa/Nairobi
+Indian/Mayotte,Africa/Nairobi
+Iran,Asia/Tehran
+Israel,Asia/Jerusalem
+Jamaica,America/Jamaica
+Japan,Asia/Tokyo
+Kwajalein,Pacific/Kwajalein
+Libya,Africa/Tripoli
+Mexico/BajaNorte,America/Tijuana
+Mexico/BajaSur,America/Mazatlan
+Mexico/General,America/Mexico_City
+Navajo,America/Denver
+NZ,Pacific/Auckland
+NZ-CHAT,Pacific/Chatham
+Pacific/Johnston,Pacific/Honolulu
+Pacific/Midway,Pacific/Pago_Pago
+Pacific/Ponape,Pacific/Pohnpei
+Pacific/Saipan,Pacific/Guam
+Pacific/Samoa,Pacific/Pago_Pago
+Pacific/Truk,Pacific/Chuuk
+Pacific/Yap,Pacific/Chuuk
+Poland,Europe/Warsaw
+Portugal,Europe/Lisbon
+PRC,Asia/Shanghai
+ROC,Asia/Taipei
+ROK,Asia/Seoul
+Singapore,Asia/Singapore
+Turkey,Europe/Istanbul
+UCT,Etc/UCT
+Universal,Etc/UTC
+US/Alaska,America/Anchorage
+US/Aleutian,America/Adak
+US/Arizona,America/Phoenix
+US/Central,America/Chicago
+US/East-Indiana,America/Indiana/Indianapolis
+US/Eastern,America/New_York
+US/Hawaii,Pacific/Honolulu
+US/Indiana-Starke,America/Indiana/Knox
+US/Michigan,America/Detroit
+US/Mountain,America/Denver
+US/Pacific,America/Los_Angeles
+US/Pacific-New,America/Los_Angeles
+US/Samoa,Pacific/Pago_Pago
+UTC,Etc/UTC
+W-SU,Europe/Moscow
+Zulu,Etc/UTC
\ No newline at end of file
diff --git a/projects/bigdata-processing-pipeline/qualify-data/README.md b/projects/bigdata-processing-pipeline/qualify_data/README.md
similarity index 100%
rename from projects/bigdata-processing-pipeline/qualify-data/README.md
rename to projects/bigdata-processing-pipeline/qualify_data/README.md
diff --git a/projects/bigdata-processing-pipeline/qualify-data/deprecated/qualify-data.py b/projects/bigdata-processing-pipeline/qualify_data/deprecated/qualify-data.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/qualify-data/deprecated/qualify-data.py
rename to projects/bigdata-processing-pipeline/qualify_data/deprecated/qualify-data.py
diff --git a/projects/bigdata-processing-pipeline/qualify-data/qualify_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/qualify_data/qualify_all_donor_data_batch_process.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/qualify-data/qualify_all_donor_data_batch_process.py
rename to projects/bigdata-processing-pipeline/qualify_data/qualify_all_donor_data_batch_process.py
diff --git a/projects/bigdata-processing-pipeline/qualify-data/qualify_single_dataset.py b/projects/bigdata-processing-pipeline/qualify_data/qualify_single_dataset.py
similarity index 100%
rename from projects/bigdata-processing-pipeline/qualify-data/qualify_single_dataset.py
rename to projects/bigdata-processing-pipeline/qualify_data/qualify_single_dataset.py
diff --git a/projects/bigdata-processing-pipeline/qualify-data/tidepool-qualification-criteria.json b/projects/bigdata-processing-pipeline/qualify_data/tidepool-qualification-criteria.json
similarity index 100%
rename from projects/bigdata-processing-pipeline/qualify-data/tidepool-qualification-criteria.json
rename to projects/bigdata-processing-pipeline/qualify_data/tidepool-qualification-criteria.json