diff --git a/.gitignore b/.gitignore index 0c1ca188..f4cf204c 100644 --- a/.gitignore +++ b/.gitignore @@ -35,3 +35,5 @@ projects/loop-algorithm/figures/ projects/parsers/output/ projects/get-donors-pump-settings/temp-plot\.html + +projects/bigdata-processing-pipeline/get_stats/debug/ diff --git a/projects/bigdata-processing-pipeline/__init__.py b/projects/bigdata-processing-pipeline/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/README.md b/projects/bigdata-processing-pipeline/anonymize_and_export_data/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/README.md rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/README.md diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/anonymize-and-export.py b/projects/bigdata-processing-pipeline/anonymize_and_export_data/anonymize-and-export.py similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/anonymize-and-export.py rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/anonymize-and-export.py diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/dataFieldExportList.csv b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/dataFieldExportList.csv similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/dataFieldExportList.csv rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/dataFieldExportList.csv diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.csv b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.csv similarity index 100% rename from 
projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.csv rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.csv diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.json b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.json similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.json rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.json diff --git a/projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.xlsx b/projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.xlsx similarity index 100% rename from projects/bigdata-processing-pipeline/anonymize-and-export-data/example-data/jill-jellyfish-lite.xlsx rename to projects/bigdata-processing-pipeline/anonymize_and_export_data/example-data/jill-jellyfish-lite.xlsx diff --git a/projects/bigdata-processing-pipeline/environment.yml b/projects/bigdata-processing-pipeline/environment.yml index 4c945436..64ef3601 100644 --- a/projects/bigdata-processing-pipeline/environment.yml +++ b/projects/bigdata-processing-pipeline/environment.yml @@ -3,9 +3,8 @@ channels: - defaults dependencies: - python=3.7.3 - - numpy=1.16.4 - pandas=0.24.2 + - spyder=3.3.6 - pip=19.1.1 - - spyder=3.3.5 - pip: - python-dotenv==0.10.3 diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/.gitignore b/projects/bigdata-processing-pipeline/estimate_local_time/.gitignore similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/.gitignore rename to projects/bigdata-processing-pipeline/estimate_local_time/.gitignore diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/README.md 
b/projects/bigdata-processing-pipeline/estimate_local_time/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/README.md rename to projects/bigdata-processing-pipeline/estimate_local_time/README.md diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/estimate-local-time.py b/projects/bigdata-processing-pipeline/estimate_local_time/estimate-local-time.py similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/estimate-local-time.py rename to projects/bigdata-processing-pipeline/estimate_local_time/estimate-local-time.py diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/estimateLocalTime-batchProcess.py b/projects/bigdata-processing-pipeline/estimate_local_time/estimateLocalTime-batchProcess.py similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/estimateLocalTime-batchProcess.py rename to projects/bigdata-processing-pipeline/estimate_local_time/estimateLocalTime-batchProcess.py diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-csv.csv b/projects/bigdata-processing-pipeline/estimate_local_time/example-csv.csv similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/example-csv.csv rename to projects/bigdata-processing-pipeline/estimate_local_time/example-csv.csv diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-json.json b/projects/bigdata-processing-pipeline/estimate_local_time/example-json.json similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/example-json.json rename to projects/bigdata-processing-pipeline/estimate_local_time/example-json.json diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/example-xlsx.xlsx b/projects/bigdata-processing-pipeline/estimate_local_time/example-xlsx.xlsx similarity index 100% rename from 
projects/bigdata-processing-pipeline/estimate-local-time/example-xlsx.xlsx rename to projects/bigdata-processing-pipeline/estimate_local_time/example-xlsx.xlsx diff --git a/projects/bigdata-processing-pipeline/estimate-local-time/wikipedia-timezone-aliases-2018-04-28.csv b/projects/bigdata-processing-pipeline/estimate_local_time/wikipedia-timezone-aliases-2018-04-28.csv similarity index 100% rename from projects/bigdata-processing-pipeline/estimate-local-time/wikipedia-timezone-aliases-2018-04-28.csv rename to projects/bigdata-processing-pipeline/estimate_local_time/wikipedia-timezone-aliases-2018-04-28.csv diff --git a/projects/bigdata-processing-pipeline/get-donor-data/README.md b/projects/bigdata-processing-pipeline/get_donor_data/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/README.md rename to projects/bigdata-processing-pipeline/get_donor_data/README.md diff --git a/projects/bigdata-processing-pipeline/get_donor_data/__init__.py b/projects/bigdata-processing-pipeline/get_donor_data/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py b/projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py similarity index 99% rename from projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py rename to projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py index 0d8c4a41..b17f5c9e 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/accept_new_donors_and_get_donor_list.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/accept_new_donors_and_get_donor_list.py @@ -16,6 +16,7 @@ import requests import json import argparse +import pdb envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if envPath not in sys.path: sys.path.insert(0, envPath) @@ -247,7 +248,7 @@ def 
accept_and_get_list(args): ) # polish up the final donor list - final_donor_list.sort_values(by="donorGroup", inplace=True) + final_donor_list.sort_values(by="userID", inplace=True) final_donor_list.reset_index(drop=True, inplace=True) if args.save_donor_list: diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/accept-new-donors.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/accept-new-donors.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/accept-new-donors.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/accept-new-donors.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-all-col-headings.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-all-col-headings.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-all-col-headings.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-all-col-headings.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-json-files.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-json-files.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-json-files.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-json-files.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-list.py b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-list.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get-donor-list.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get-donor-list.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/deprecated/get_all_donor_data.py 
b/projects/bigdata-processing-pipeline/get_donor_data/deprecated/get_all_donor_data.py similarity index 100% rename from projects/bigdata-processing-pipeline/get-donor-data/deprecated/get_all_donor_data.py rename to projects/bigdata-processing-pipeline/get_donor_data/deprecated/get_all_donor_data.py diff --git a/projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py b/projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py similarity index 95% rename from projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py rename to projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py index 14767119..3a0966d9 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/example_get_all_data_for_single_user.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/example_get_all_data_for_single_user.py @@ -25,6 +25,6 @@ ) data, _ = get_data( donor_group="bigdata", - userid_of_shared_user="0d4524bc11", + userid="0d4524bc11", weeks_of_data=4 ) diff --git a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py similarity index 78% rename from projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py index 15daa252..8e81b372 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_all_donor_data_batch_process.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process.py @@ -117,12 +117,11 @@ def get_all_data(userid, donor_group): metadata_path = os.path.join( args.data_path, - "PHI-" + "2019-07-13" + "-donor-data", - "PHI-" + "2019-07-13" + "-metadata" - + phi_date_stamp + "-donor-data", + phi_date_stamp + "-metadata" ) -all_files = 
glob.glob(os.path.join(metadata_path, "*.csv")) +all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) all_metadata = pd.DataFrame() for f in all_files: temp_meta = pd.read_csv(f) @@ -137,3 +136,32 @@ def get_all_data(userid, donor_group): os.path.join(donor_folder, phi_date_stamp + "-donor-metadata.csv") ) print("saving metadata...code complete") + + +# %% COMBINE AND SAVE ALL DATASET INFO (METADATA) +print("combining all dataset metadata") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-datasetSummary" +) + +all_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +dataset_metadata = pd.DataFrame() +for f in all_files: + temp_meta = pd.read_csv(f) + temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True) + userid = f[-32:-22] + temp_meta["userid"] = userid + dataset_metadata = pd.concat( + [dataset_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +dataset_metadata.to_csv( + os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv") +) +print("saving all-dataset-info-metadata...code complete") + diff --git a/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py new file mode 100644 index 00000000..d43b8e9a --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_all_donor_data_batch_process_json.py @@ -0,0 +1,138 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that accepts all bigdata donation project donors, +and then pulls of their datasets for further processing. 
# %% FUNCTIONS
def run_process(func_name, userid, donor_group):
    """Run one per-donor helper script in a child process and echo its output.

    The child is started as
    ``<python> <script> -d <date_stamp> -dg <donor_group> -u <userid>
    -o <data_path>``; the date stamp and data path are read from the
    module-level ``args`` produced by ``parser.parse_args()``.

    Args:
        func_name: file name of the helper script. It is resolved relative
            to the directory that contains this file, so the batch script
            no longer has to be launched from that directory.
        userid: value passed to the child's ``-u`` option.
        donor_group: value passed to the child's ``-dg`` option.

    Returns:
        None. Whatever the child wrote to stdout and stderr is printed.
    """
    import sys  # local import: this module has no file-level ``import sys``

    script_dir = os.path.dirname(os.path.abspath(__file__))
    func_path = os.path.join(script_dir, func_name)

    p = sub.Popen(
        [
            # use the interpreter that is running this script so the child
            # sees the same environment (a bare "python" depends on PATH)
            sys.executable or "python", func_path,
            "-d", args.date_stamp,
            "-dg", donor_group,
            "-u", userid,
            "-o", args.data_path
        ],
        stdout=sub.PIPE,
        stderr=sub.PIPE
    )

    output, errors = p.communicate()
    output = output.decode("utf-8", errors="replace")
    errors = errors.decode("utf-8", errors="replace")

    # print both streams: a warning on stderr must not hide the child's
    # normal progress output
    if output:
        print(output)
    if errors:
        print(errors)
    if p.returncode != 0:
        print(
            "{} exited with code {} for userid {}".format(
                func_name, p.returncode, userid
            )
        )

    return


def get_all_data(userid, donor_group):
    """Pull the metadata and then the json dataset for a single donor.

    Args:
        userid: forwarded to ``run_process``.
        donor_group: forwarded to ``run_process``.

    Returns:
        None.
    """
    run_process("get_single_donor_metadata.py", userid, donor_group)
    run_process("get_single_tidepool_dataset_json.py", userid, donor_group)

    return
# %% FUNCTIONS
def get_dataset_summaries(
    save_data_path=os.path.abspath(
        os.path.join(
            os.path.dirname(__file__),
            "..",
            "data"
        )
    ),
    date_stamp=dt.datetime.now().strftime("%Y-%m-%d"),
):
    """Combine every per-user dataset summary into a single csv.gz file.

    Reads all ``*.csv.gz`` files in
    ``<save_data_path>/PHI-<date_stamp>-donor-data/PHI-<date_stamp>-datasetSummary``,
    tags each row with the userid taken from the file name, and writes the
    stacked result to
    ``<save_data_path>/PHI-<date_stamp>-donor-data/PHI-<date_stamp>-all-dataset-info.csv.gz``.

    Args:
        save_data_path: root folder that holds the dated donor-data folder.
        date_stamp: date string used to build the ``PHI-<date>`` folder and
            file names. NOTE: the default is evaluated once, when the module
            is imported.

    Returns:
        None. The combined table is written to disk.
    """
    # use the function arguments (not the script-level ``args``) so the
    # function also works when it is imported and called from other code
    phi_date_stamp = "PHI-" + date_stamp
    donor_folder = os.path.join(save_data_path, phi_date_stamp + "-donor-data")

    print("combining all dataset metadata")

    metadata_path = os.path.join(
        donor_folder,
        phi_date_stamp + "-datasetSummary"
    )

    # sorted so the row order of the output does not depend on the
    # arbitrary order returned by glob
    all_files = sorted(glob.glob(os.path.join(metadata_path, "*.csv.gz")))
    n_files = len(all_files)
    print("there are {} files".format(n_files))

    file_prefix = "PHI-"
    file_suffix = "-datasetSummary.csv.gz"
    summaries = []
    for f_counter, f in enumerate(all_files, start=1):
        temp_meta = pd.read_csv(f)
        temp_meta.rename(columns={"Unnamed: 0": "col_name"}, inplace=True)

        # summary files are named PHI-<userid>-datasetSummary.csv.gz; strip
        # the fixed prefix/suffix so userids of any length are recovered
        file_name = os.path.basename(f)
        if file_name.startswith(file_prefix) and file_name.endswith(file_suffix):
            userid = file_name[len(file_prefix):-len(file_suffix)]
        else:
            # unexpected name: fall back to the original fixed-width slice
            userid = f[-32:-22]
        temp_meta["userid"] = userid
        summaries.append(temp_meta)

        if f_counter % 10 == 0:
            print("completed file {} of {}".format(f_counter, n_files))

    # one concat at the end instead of re-copying the growing frame per file
    if summaries:
        dataset_metadata = pd.concat(
            summaries,
            ignore_index=True,
            sort=False
        )
    else:
        dataset_metadata = pd.DataFrame()

    dataset_metadata.to_csv(
        os.path.join(donor_folder, phi_date_stamp + "-all-dataset-info.csv.gz")
    )
    print("saving all-dataset-info-metadata...code complete")

    return


# %% MAIN
if __name__ == "__main__":
    # USER INPUTS (choices to be made in order to run the code)
    # the description used to say "get donor json file", which was copied
    # from another script and does not describe this one
    codeDescription = "combine all dataset summaries into a single file"
    parser = argparse.ArgumentParser(description=codeDescription)

    parser.add_argument(
        "-d",
        "--date-stamp",
        dest="date_stamp",
        default=dt.datetime.now().strftime("%Y-%m-%d"),
        # argparse %-formats help text, so literal percent signs are doubled
        help="date, in '%%Y-%%m-%%d' format, of the date when " +
        "donors were accepted"
    )

    parser.add_argument(
        "-o",
        "--output-data-path",
        dest="data_path",
        default=os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "..", "data"
            )
        ),
        help="the output path where the data is stored"
    )

    args = parser.parse_args()

    # the main function
    get_dataset_summaries(
        save_data_path=args.data_path,
        date_stamp=args.date_stamp
    )
# %% FUNCTIONS
def get_type(val):
    """Return the name of the Python type of ``val`` (e.g. ``"dict"``)."""
    return type(val).__name__


def get_len(val):
    """Return ``len(val)``; applied to list cells by ``expand_df``."""
    return len(val)


def get_val(val, k):
    """Return ``val[k]``; applied by ``expand_df`` to pick the k-th item."""
    return val[k]


def literal_return(val):
    """Parse ``val`` as a Python literal, returning it unchanged on failure.

    Used to unwrap lists/dicts that were stored as strings.
    """
    try:
        return ast.literal_eval(val)
    except (ValueError, SyntaxError):
        return val


def remove_cols(df, cols_to_remove):
    """Split ``df`` into the columns to keep and the columns set aside.

    Args:
        df: input dataframe.
        cols_to_remove: column names to take out; names that are not in
            ``df`` are ignored.

    Returns:
        ``(df_without_cols, removed_cols_df)``.
    """
    wanted = set(cols_to_remove)
    # walk df's own columns so the removed columns come back in a stable
    # order (a set intersection has no defined order)
    temp_remove_cols = [col for col in df.columns if col in wanted]
    tempDf = df[temp_remove_cols]
    df = df.drop(columns=temp_remove_cols)

    return df, tempDf


def make_folder_if_doesnt_exist(folder_paths):
    ''' function requires a single path or a list of paths'''
    if not isinstance(folder_paths, list):
        folder_paths = [folder_paths]
    for folder_path in folder_paths:
        # exist_ok avoids the check-then-create race when several worker
        # processes try to create the same output folder at once
        os.makedirs(folder_path, exist_ok=True)
    return


def create_output_folder(
    data_path,
    date_stamp,
    folder_name,
    phi=True
):
    """Create (if needed) and return the dated output folder.

    The folder is
    ``<data_path>/<stamp>-donor-data/<stamp>-<folder_name>`` where
    ``<stamp>`` is ``date_stamp``, prefixed with ``"PHI-"`` when ``phi``
    is true.
    """
    if phi:
        date_stamp = "PHI-" + date_stamp
    donor_folder = os.path.join(data_path, date_stamp + "-donor-data")
    dataset_path = os.path.join(
        donor_folder,
        date_stamp + "-" + folder_name
    )
    make_folder_if_doesnt_exist(dataset_path)

    return dataset_path


def save_df(
    df,
    userid,
    data_path,
    date_stamp,
    folder_name,
    phi=True,
    name_suffix="",
):
    """Write ``df`` to ``[PHI-]<userid><name_suffix>.csv.gz``.

    The file goes into the folder returned by ``create_output_folder``.

    Returns:
        The full path of the file that was written.
    """
    output_folder = create_output_folder(
        data_path=data_path,
        date_stamp=date_stamp,
        folder_name=folder_name,
        phi=phi
    )

    # if the data contains phi, add prefix to the file
    if phi:
        phi_prefix = 'PHI-'
    else:
        phi_prefix = ''
    output_path = os.path.join(
        output_folder,
        phi_prefix + userid + "{}.csv.gz".format(name_suffix)
    )

    df.to_csv(output_path)

    return output_path


def expand_df(df, do_not_expand_list=None):
    """Expand one layer of embedded lists and dicts into new columns.

    Every object-dtype column (other than those in ``do_not_expand_list``)
    is inspected. List cells become ``<col>_1``, ``<col>_2``, ... columns
    and dict cells become ``<col>.<key>`` columns. The original columns
    are kept.

    Args:
        df: dataframe to expand.
        do_not_expand_list: column names to leave untouched; defaults to
            no columns.

    Returns:
        ``(expanded_df, col_df)`` where ``expanded_df`` has its columns
        sorted by name and ``col_df`` has one row per inspected column with
        ``dtype``, ``nObjectTypes`` and ``objectType``.
    """
    # None instead of a shared mutable [] default
    if do_not_expand_list is None:
        do_not_expand_list = []

    # remove fields that we don't want to flatten
    df, hold_df = remove_cols(df, do_not_expand_list)

    # get a description of the original columns
    col_df = df.dtypes.to_frame("dtype")

    # go through each dtype that is an object to see if it
    # contains strings, mixed datatypes, embedded json, or lists
    col_df["nObjectTypes"] = np.nan
    # object dtype because this column is filled with strings below
    col_df["objectType"] = pd.Series(np.nan, index=col_df.index, dtype=object)

    new_frames = []
    for col in col_df[col_df["dtype"] == "object"].index:
        rows = df.index[df[col].notnull()].tolist()

        # sometimes the object gets wrapped in a string
        literal_vals = df.loc[rows, col].apply(literal_return)

        # see if there are mixed object types
        type_names = literal_vals.apply(get_type)
        # plain object array so str() gives the "['str']" form that
        # expand_data compares against
        unique_types = np.asarray(type_names.unique(), dtype=object)
        col_df.loc[col, "nObjectTypes"] = len(unique_types)
        col_df.loc[col, "objectType"] = str(unique_types)
        type_set = set(unique_types)

        # USE UNDERSCORE FOR LIST EXPANSION
        if "list" in type_set:
            list_vals = literal_vals[type_names == "list"]
            list_lens = list_vals.apply(get_len)

            for i in range(1, int(list_lens.max()) + 1):
                # only lists that are long enough have an i-th item
                picked = list_vals[list_lens >= i].apply(get_val, k=i - 1)
                new_frames.append(picked.to_frame().add_suffix('_' + str(i)))

        # USE DOT FOR JSON (DICT) EXPANSION
        if "dict" in type_set:
            json_blob = literal_vals[type_names == "dict"]
            blob_df = pd.DataFrame(
                json_blob.tolist(),
                index=json_blob.index
            ).add_prefix(col + '.')
            new_frames.append(blob_df)

    # merge the dataframes together (a single concat instead of growing a
    # frame inside the loop)
    df = pd.concat([df] + new_frames + [hold_df], axis=1)

    df.sort_index(axis=1, inplace=True)

    return df, col_df


def expand_data(starting_df, depth=10):
    """Repeatedly expand embedded lists/dicts and summarize the result.

    ``expand_df`` is applied layer by layer, skipping columns that existed
    before the previous layer, until ``depth`` layers have run or a layer
    inspects no columns.

    Args:
        starting_df: dataset to expand.
        depth: maximum number of expansion layers.

    Returns:
        ``(summary_df, expanded_df)``. ``summary_df`` has one row per column
        (plus an ``_all`` row) with dtype/object-type info, start and end
        ``time``, ``describe`` statistics, ``type=``/``subType=`` membership
        columns, memory size and, for low-cardinality string columns, the
        list of distinct values.
    """
    print("\ninitial df has {} columns".format(len(starting_df.columns)))
    print("starting expansion ...")
    temp_df, temp_col = expand_df(starting_df)
    col_df = temp_col.copy()
    skip_columns = starting_df.columns.tolist()
    d = 1
    n_col_expanded = len(list(temp_df)) - len(list(starting_df))
    print("{} columns added". format(n_col_expanded))

    while d < depth and len(temp_col) > 0:
        print("expanding layer {} ... ".format(d))
        next_skip_columns = temp_df.columns.tolist()
        temp_df, temp_col = expand_df(temp_df, skip_columns)
        skip_columns = next_skip_columns.copy()

        col_df = pd.concat([col_df, temp_col])
        n_col_expanded = len(list(temp_df)) - len(next_skip_columns)
        print("{} columns added". format(n_col_expanded))
        d += 1

    print("expansion complete...getting dataset summary info...")

    col_df.sort_index(inplace=True)

    # get the start and end time for each data type
    print("getting data start and end times for each data type ...")
    # object dtype: the values written below are whatever "time" holds
    col_df["startTime"] = pd.Series(np.nan, index=col_df.index, dtype=object)
    col_df["endTime"] = pd.Series(np.nan, index=col_df.index, dtype=object)
    for col in col_df.index:
        try:
            has_value = temp_df[col].notnull()
            start_time = temp_df.loc[has_value, ["time"]].min()
            end_time = temp_df.loc[has_value, ["time"]].max()
            col_df.loc[col, "startTime"] = start_time.values[0]
            col_df.loc[col, "endTime"] = end_time.values[0]
        except Exception:
            # best effort (e.g. no "time" column); Exception rather than a
            # bare except so Ctrl-C / SystemExit still propagate
            print(col, "missing timestamp")

    # get summary information
    print("getting summary information ...")
    df_info = pd.DataFrame(temp_df.describe(include='all').T)
    # the "_all" row stores the shape: count = n rows, unique = n columns.
    # set one cell at a time so a missing "unique" column is simply created
    n_rows, n_cols = temp_df.shape
    df_info.loc["_all", "count"] = n_rows
    df_info.loc["_all", "unique"] = n_cols
    df_info.sort_index(inplace=True)

    # add which type (or subtype) each column comes from
    for typeType in ["type", "subType"]:
        if typeType in list(starting_df):
            type_groups = temp_df.groupby(by=typeType)
            not_null_index = temp_df[typeType].notnull()
            for type_ in temp_df.loc[not_null_index, typeType].unique():
                type_df = type_groups.get_group(type_).dropna(
                    axis=1,
                    how="all"
                )
                df_info.loc[
                    type_df.columns, "{}={}".format(typeType, type_)
                ] = type_

    # get memory size of each data type
    print("getting memory information ...")
    mem_usage = temp_df.memory_usage(
        index=True, deep=True
    ).to_frame("memorySize")
    mem_usage.rename(index={"Index": "_all"}, inplace=True)
    df_info["memorySize"] = mem_usage["memorySize"]
    df_info.loc["_all", "memorySize"] = temp_df.memory_usage(
        index=True, deep=True
    ).sum()

    # combine with col_summary
    summary_df = pd.concat([col_df, df_info], axis=1, sort=True)

    # get/add a list of string values
    print("getting a a list of string values ...")
    str_cols = summary_df[
        ((summary_df["objectType"] == "['str']") &
         (summary_df["unique"] > 1) &
         (summary_df["unique"] < 50)
         )
    ].index
    for str_col in str_cols:
        not_null_index = temp_df[str_col].notnull()
        str_vals = temp_df.loc[not_null_index, str_col].unique().tolist()
        summary_df.loc[str_col, "strVals"] = str(str_vals)

    print("dataset summary info complete\n")

    return summary_df, temp_df


def _str_to_bool(value):
    """argparse ``type=`` helper that turns "True"/"False" text into a bool."""
    if isinstance(value, bool):
        return value
    text = str(value).strip().lower()
    if text in ("true", "t", "yes", "y", "1"):
        return True
    if text in ("false", "f", "no", "n", "0"):
        return False
    raise argparse.ArgumentTypeError(
        "expected True or False, got {!r}".format(value)
    )


# %% START OF CODE
def get_dataset_info(
    data,
    date_stamp,
    data_path,
    userid,
    save_expanded
):
    """Summarize one dataset and save the summary (and expanded data).

    Args:
        data: the dataset as a dataframe, or a float (``np.nan``) to load
            ``PHI-<userid>.csv.gz`` from the ``csvData`` output folder.
        date_stamp: date string used to build the output folder names.
        data_path: root folder for the output.
        userid: dataset owner; ``"not-specified"`` prompts for it.
        save_expanded: when true, the expanded dataset is saved as well.

    Returns:
        None. Results are written with ``save_df``.
    """
    if userid == "not-specified":
        userid = input("Enter userid of dataset you want info on:\n")

    if isinstance(data, float):  # np.nan is a float
        dataset_folder = create_output_folder(
            data_path,
            date_stamp,
            "csvData"
        )
        dataset_path = os.path.join(
            dataset_folder,
            "PHI-{}.csv.gz".format(userid)
        )
        data = pd.read_csv(dataset_path, low_memory=False, index_col=0)

    # expand embedded lists and json within dataset
    summary_df, expanded_df = expand_data(data.copy(), depth=10)

    # save summary data
    _ = save_df(
        summary_df,
        userid=userid,
        data_path=data_path,
        date_stamp=date_stamp,
        folder_name="datasetSummary",
        phi=True,
        name_suffix="-datasetSummary"
    )

    if save_expanded:
        # save expanded data; use this function's own arguments (not the
        # script-level ``args``) so it also works when imported
        _ = save_df(
            expanded_df,
            userid=userid,
            data_path=data_path,
            date_stamp=date_stamp,
            folder_name="expandedData",
            phi=True,
            name_suffix="-expandedData"
        )


if __name__ == "__main__":
    # USER INPUTS (choices to be made in order to run the code)
    codeDescription = "get an overview of the columns and data in the dataset"
    parser = argparse.ArgumentParser(description=codeDescription)

    parser.add_argument(
        "-d",
        "--date-stamp",
        dest="date_stamp",
        default=dt.datetime.now().strftime("%Y-%m-%d"),
        # argparse %-formats help text, so literal percent signs are doubled
        help="date, in '%%Y-%%m-%%d' format, of the date when " +
        "donors were accepted"
    )

    parser.add_argument(
        "-u",
        "--userid",
        dest="userid",
        default="not-specified",
        help="userid of the dataset you are interested in"
    )

    parser.add_argument(
        "-o",
        "--output-data-path",
        dest="data_path",
        default=os.path.abspath(
            os.path.join(
                os.path.dirname(__file__), "..", "data"
            )
        ),
        help="the output path where the data is stored"
    )

    parser.add_argument(
        "-s",
        "--save-expanded-dataset",
        dest="save_expanded",
        default=True,
        # without a type, "-s False" is the truthy string "False"
        type=_str_to_bool,
        help=(
            "specify if you want to save the expanded datafram (True/False)"
            + "NOTE: these files can be rather large"
        )
    )

    args = parser.parse_args()

    # main function
    get_dataset_info(
        data=np.nan,
        date_stamp=args.date_stamp,
        data_path=args.data_path,
        userid=args.userid,
        save_expanded=args.save_expanded
    )
a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py similarity index 54% rename from projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py rename to projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py index 290b5324..0b3e384f 100644 --- a/projects/bigdata-processing-pipeline/get-donor-data/get_single_tidepool_dataset.py +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset.py @@ -8,6 +8,10 @@ """ # %% REQUIRED LIBRARIES +try: + from get_single_dataset_info import expand_data, save_df +except: # TODO: there has to be a better way to do this + from get_donor_data.get_single_dataset_info import expand_data, save_df import pandas as pd import datetime as dt import numpy as np @@ -16,7 +20,6 @@ import getpass import requests import json -import pdb import argparse envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) if envPath not in sys.path: @@ -24,93 +27,7 @@ import environmentalVariables -# %% USER INPUTS (choices to be made in order to run the code) -codeDescription = "get donor metadata" -parser = argparse.ArgumentParser(description=codeDescription) - -parser.add_argument( - "-d", - "--date-stamp", - dest="date_stamp", - default=dt.datetime.now().strftime("%Y-%m-%d"), - help="date, in '%Y-%m-%d' format, of the date when " + - "donors were accepted" -) - -parser.add_argument( - "-w", - "--weeks-of-data", - dest="weeks_of_data", - default=52*10, - help="enter the number of weeks of data you want to download" -) - -parser.add_argument( - "-dg", - "--donor-group", - dest="donor_group", - default=np.nan, - help="name of the donor group in the tidepool .env file" -) - -parser.add_argument( - "-u", - "--userid", - dest="userid_of_shared_user", - default=np.nan, - help="userid of account shared with the donor group or master account" -) - 
-parser.add_argument( - "-a", - "--auth", - dest="auth", - default=np.nan, - help="tuple that contains (email, password)" -) - -parser.add_argument( - "-e", - "--email", - dest="email", - default=np.nan, - help="email address of the master account" -) - -parser.add_argument( - "-p", - "--password", - dest="password", - default=np.nan, - help="password of the master account" -) - -parser.add_argument( - "-o", - "--output-data-path", - dest="data_path", - default=os.path.abspath( - os.path.join( - os.path.dirname(__file__), "..", "data" - ) - ), - help="the output path where the data is stored" -) - -args = parser.parse_args() - - # %% FUNCTIONS -def make_folder_if_doesnt_exist(folder_paths): - ''' function requires a single path or a list of paths''' - if not isinstance(folder_paths, list): - folder_paths = [folder_paths] - for folder_path in folder_paths: - if not os.path.exists(folder_path): - os.makedirs(folder_path) - return - - def get_data_api(userid, startDate, endDate, headers): startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z" @@ -145,7 +62,7 @@ def get_data_api(userid, startDate, endDate, headers): def get_data( weeks_of_data=10*52, donor_group=np.nan, - userid_of_shared_user=np.nan, + userid=np.nan, auth=np.nan, email=np.nan, password=np.nan, @@ -180,8 +97,8 @@ def get_data( else: sys.exit("Error with " + auth[0] + ":" + str(api_response.status_code)) - if pd.isnull(userid_of_shared_user): - userid_of_shared_user = userid_master + if pd.isnull(userid): + userid = userid_master print( "getting data for the master account since no shared " + "user account was given" @@ -204,7 +121,7 @@ def get_data( endDate.day + 1 ) year_df, endDate = get_data_api( - userid_of_shared_user, + userid, startDate, endDate, headers @@ -222,7 +139,7 @@ def get_data( ) df, _ = get_data_api( - userid_of_shared_user, + userid, startDate, endDate, headers @@ -241,58 +158,170 @@ def get_data( auth[0] + ":" + str(api_response.status_code) ) - return df, 
userid_of_shared_user + return df, userid # %% START OF CODE def get_and_save_dataset( - date_stamp=args.date_stamp, - data_path=args.data_path, - weeks_of_data=args.weeks_of_data, - donor_group=args.donor_group, - userid_of_shared_user=args.userid_of_shared_user, - auth=args.auth, - email=args.email, - password=args.password + date_stamp, + data_path, + weeks_of_data, + donor_group, + userid, + auth, + email, + password, + expand_dataset ): - # create output folders if they don't exist - - phi_date_stamp = "PHI-" + date_stamp - donor_folder = os.path.join(data_path, phi_date_stamp + "-donor-data") - - dataset_path = os.path.join( - donor_folder, - phi_date_stamp + "-csvData" - ) - make_folder_if_doesnt_exist(dataset_path) # get dataset data, userid = get_data( weeks_of_data=weeks_of_data, donor_group=donor_group, - userid_of_shared_user=userid_of_shared_user, + userid=userid, auth=auth, email=email, password=password ) - # save data - dataset_output_path = os.path.join( - dataset_path, - 'PHI-' + userid + ".csv" - ) + # if the there is data + if len(data) > 1: + # save data + print("saving csv data...") + _ = save_df( + data, + userid=userid, + data_path=data_path, + date_stamp=date_stamp, + folder_name="csvData", + phi=True + ) - data.to_csv(dataset_output_path) + # get dataset info + if expand_dataset: + summary_df, expanded_df = expand_data(data) + print("saving summary data...") + _ = save_df( + summary_df, + userid=userid, + data_path=data_path, + date_stamp=date_stamp, + folder_name="datasetSummary", + phi=True, + name_suffix="-datasetSummary" + ) + + # save expanded data + print("saving expanded data...") + _ = save_df( + expanded_df, + userid=userid, + data_path=args.data_path, + date_stamp=args.date_stamp, + folder_name="expandedData", + phi=True, + name_suffix="-expandedData" + ) + else: + print("{} has no data".format(userid)) if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get donor 
metadata" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + parser.add_argument( + "-w", + "--weeks-of-data", + dest="weeks_of_data", + default=52*10, + help="enter the number of weeks of data you want to download" + ) + + parser.add_argument( + "-dg", + "--donor-group", + dest="donor_group", + default=np.nan, + help="name of the donor group in the tidepool .env file" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default=np.nan, + help="userid of account shared with the donor group or master account" + ) + + parser.add_argument( + "-a", + "--auth", + dest="auth", + default=np.nan, + help="tuple that contains (email, password)" + ) + + parser.add_argument( + "-e", + "--email", + dest="email", + default=np.nan, + help="email address of the master account" + ) + + parser.add_argument( + "-p", + "--password", + dest="password", + default=np.nan, + help="password of the master account" + ) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + parser.add_argument( + "-ex", + "--expand-dataset", + dest="expand_dataset", + default=True, + help=( + "specify if you want to get/save the expanded datafram (True/False)" + + "NOTE: this process is time consuming" + ) + ) + + args = parser.parse_args() + + # the main function get_and_save_dataset( date_stamp=args.date_stamp, data_path=args.data_path, weeks_of_data=args.weeks_of_data, donor_group=args.donor_group, - userid_of_shared_user=args.userid_of_shared_user, + userid=args.userid, auth=args.auth, email=args.email, - password=args.password + password=args.password, + expand_dataset=args.expand_dataset ) diff 
--git a/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py new file mode 100644 index 00000000..d8496891 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_donor_data/get_single_tidepool_dataset_json.py @@ -0,0 +1,325 @@ +# -*- coding: utf-8 -*- +"""get_donor_data_and_metadata.py +In the context of the big data donation +project, this code grabs donor data and metadata. + +This code calls accept_new_donors_and_get_donor_list.py +to get the most recent donor list +""" + +# %% REQUIRED LIBRARIES +import pandas as pd +import datetime as dt +import numpy as np +import os +import sys +import time +import getpass +import requests +import json +import argparse +import pdb +envPath = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) +if envPath not in sys.path: + sys.path.insert(0, envPath) +import environmentalVariables + +# %% GLOBAL VARIABLES +current_date = dt.datetime.now().strftime("%Y-%m-%d") + +# %% FUNCTIONS +def make_folder_if_doesnt_exist(folder_paths): + ''' function requires a single path or a list of paths''' + if not isinstance(folder_paths, list): + folder_paths = [folder_paths] + for folder_path in folder_paths: + if not os.path.exists(folder_path): + os.makedirs(folder_path) + return + + +def get_data_api(userid, startDate, endDate, headers): + + startDate = startDate.strftime("%Y-%m-%d") + "T00:00:00.000Z" + endDate = endDate.strftime("%Y-%m-%d") + "T23:59:59.999Z" + + api_call = ( + "https://api.tidepool.org/data/" + userid + "?" 
+ + "endDate=" + endDate + "&" + + "startDate=" + startDate + "&" + + "dexcom=true" + "&" + + "medtronic=true" + "&" + + "carelink=true" + ) + + api_response = requests.get(api_call, headers=headers) + if(api_response.ok): + print("getting data between %s and %s" % (startDate, endDate)) + json_data = json.loads(api_response.content.decode()) + + else: + sys.exit( + "ERROR in getting data between %s and %s" % (startDate, endDate), + api_response.status_code + ) + + endDate = pd.to_datetime(startDate) - pd.Timedelta(1, unit="d") + + return json_data, endDate + + +def get_data( + weeks_of_data=10*52, + save_data_path=os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "data", + "PHI-" + current_date + "-donor-data", + "PHI-" + current_date + "-jsonData", + ) + ), + overwrite_hours=24, + donor_group=np.nan, + userid=np.nan, + auth=np.nan, + email=np.nan, + password=np.nan, + save_file="False", +): + # login + if pd.notnull(donor_group): + if donor_group == "bigdata": + dg = "" + else: + dg = donor_group + + auth = environmentalVariables.get_environmental_variables(dg) + + if pd.isnull(auth): + if pd.isnull(email): + email = input("Enter Tidepool email address:\n") + + if pd.isnull(password): + password = getpass.getpass("Enter password:\n") + + auth = (email, password) + + api_call = "https://api.tidepool.org/auth/login" + api_response = requests.post(api_call, auth=auth) + if(api_response.ok): + xtoken = api_response.headers["x-tidepool-session-token"] + userid_master = json.loads(api_response.content.decode())["userid"] + headers = { + "x-tidepool-session-token": xtoken, + "Content-Type": "application/json" + } + else: + sys.exit("Error with " + auth[0] + ":" + str(api_response.status_code)) + + if pd.isnull(userid): + userid = userid_master + print( + "getting data for the master account since no shared " + + "user account was given" + ) + + print("logging into", auth[0], "...") + + # download user data + print("downloading data for {} 
...".format(userid)) + endDate = pd.datetime.now() + pd.Timedelta(1, unit="d") + + output_folder = os.path.join( + save_data_path, + "PHI-" + userid, + ) + + output_file_path = os.path.join( + output_folder, + "PHI-{}.json".format(userid) + ) + + download_ = True + for f in [output_folder, output_file_path]: + path_exist = os.path.exists(f) + if path_exist: + last_save = os.path.getmtime(f) + time_threshold = time.time() - (overwrite_hours * 3600) + within_time_threshold = last_save > time_threshold + if within_time_threshold: + download_ = False + + if download_: + + big_json_file = [] + + if weeks_of_data > 52: + years_of_data = int(np.floor(weeks_of_data/52)) + + for years in range(0, years_of_data + 1): + startDate = pd.datetime( + endDate.year - 1, + endDate.month, + endDate.day + 1 + ) + json_data, endDate = get_data_api( + userid, + startDate, + endDate, + headers + ) + + big_json_file = big_json_file + json_data + + else: + startDate = ( + pd.to_datetime(endDate) - pd.Timedelta(weeks_of_data*7, "d") + ) + + json_data, _ = get_data_api( + userid, + startDate, + endDate, + headers + ) + + big_json_file = big_json_file + json_data + + # save data + if len(big_json_file) > 1: + if "T" in str(save_file).upper(): + make_folder_if_doesnt_exist(output_folder) + print("saving data for {}".format(userid)) + with open(output_file_path, 'w') as outfile: + json.dump(big_json_file, outfile) + else: + print("{} has data, but will not be saved".format(userid)) + else: + print("{} has no data".format(userid)) + + # logout + api_call = "https://api.tidepool.org/auth/logout" + api_response = requests.post(api_call, auth=auth) + + if(api_response.ok): + print("successfully logged out of", auth[0]) + + else: + sys.exit( + "Error with logging out for " + + auth[0] + ":" + str(api_response.status_code) + ) + else: + print( + "skipping bc {}'s data was downloaded (attempted)".format(userid) + + " within the last {} hours".format(overwrite_hours) + ) + + if "T" in 
str(save_file).upper(): + return np.nan, userid + else: + df = pd.DataFrame(big_json_file) + return df, userid + + +# %% MAIN +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get donor json file" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), + "..", + "data", + "PHI-" + current_date + "-donor-data", + "PHI-" + current_date + "-jsonData", + ) + ), + help="the output path where the data is stored" + ) + + parser.add_argument( + "-w", + "--weeks-of-data", + dest="weeks_of_data", + default=2, # 52*10, # go back the last 10 years as default + help="enter the number of weeks of data you want to download" + ) + + parser.add_argument( + "-ow", + "--over-write", + dest="overwrite_hours", + default=24, + help="if data was downloaded in the last <24> hours, skip download" + ) + + parser.add_argument( + "-dg", + "--donor-group", + dest="donor_group", + default=np.nan, + help="name of the donor group in the tidepool .env file" + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default=np.nan, + help="userid of account shared with the donor group or master account" + ) + + parser.add_argument( + "-a", + "--auth", + dest="auth", + default=np.nan, + help="tuple that contains (email, password)" + ) + + parser.add_argument( + "-e", + "--email", + dest="email", + default=np.nan, + help="email address of the master account" + ) + + parser.add_argument( + "-p", + "--password", + dest="password", + default=np.nan, + help="password of the master account" + ) + + parser.add_argument( + "-s", + "--save_file", + dest="save_file", + default="true", + help="specify whether to save the downloaded donor data" + ) + + args = parser.parse_args() + + # the main function + data, userid = get_data( + save_data_path=args.data_path, + 
weeks_of_data=args.weeks_of_data, + overwrite_hours=args.overwrite_hours, + donor_group=args.donor_group, + userid=args.userid, + auth=args.auth, + email=args.email, + password=args.password, + save_file=args.save_file, + ) diff --git a/projects/bigdata-processing-pipeline/get_stats/__init__.py b/projects/bigdata-processing-pipeline/get_stats/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py new file mode 100644 index 00000000..3fe2fef9 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/batch_get_cgm_distributions_and_stats.py @@ -0,0 +1,116 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that gets distributions and stats for all donors, +NOTE: this needs to be refactored because it is currently set up to run +on json files that are in a snowflake path + +""" + +# %% REQUIRED LIBRARIES +import datetime as dt +import pandas as pd +import subprocess as sub +import os +import glob +import time +import argparse +from multiprocessing import Pool + + +# %% USER INPUTS (choices to be made in order to run the code) +codeDescription = "get distribution and stats for all donor's json data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-i", + "--input-json-data-path", + dest="json_data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data", "dremio", "**", "*.json" + ), + ), + help="the path where json data is located" +) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + 
os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +args = parser.parse_args() + + +# %% FUNCTIONS +def run_process(json_data_path): + userid = json_data_path[-15:-5] + + # check to see if the file was already processed + phi_date_stamp = "PHI-" + args.date_stamp + + metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-metadata" + ) + + all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) + if userid not in str(all_metadata_files): + + p = sub.Popen( + [ + "python", "get_cgm_distributions_and_stats.py", + "-i", json_data_path, + "-u", userid, + "-d", args.date_stamp, + "-o", args.data_path + ], + stdout=sub.PIPE, + stderr=sub.PIPE + ) + + output, errors = p.communicate() + output = output.decode("utf-8") + errors = errors.decode("utf-8") + + if errors == '': + print(output) + else: + print(errors) + else: + print(userid, "was already processed") + + return + + +# %% GET A LIST OF DONOR JSON FILE LOCATIONS +all_files = glob.glob(args.json_data_path, recursive=True) + +# use multiple cores to process +startTime = time.time() +print("starting at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S")) +pool = Pool(int(os.cpu_count())) +pool.map(run_process, all_files) +pool.close() +endTime = time.time() +print( + "finshed pulling data at " + dt.datetime.now().strftime("%Y-%m-%d %H:%M:%S") +) +total_duration = round((endTime - startTime) / 60, 1) +print("total duration was %s minutes" % total_duration) diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py new file mode 100644 index 00000000..b8bac502 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_and_metadata_results.py @@ -0,0 +1,121 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This 
is a wrapper script that gets distributions and stats for all donors, +NOTE: this needs to be refactored because it is currently set up to run +on json files that are in a snowflake path + +""" + +# %% REQUIRED LIBRARIES +import pandas as pd +import numpy as np +import os +import glob +import argparse + + +# %% USER INPUTS (choices to be made in order to run the code) +codeDescription = "get distribution and stats for all donor's json data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default="2019-07-17", + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + + +parser.add_argument( + "-c", + "--chunk-size", + dest="chunk_size", + default=50, + help="the output path where the data is stored" +) + +args = parser.parse_args() + + +# %% COMBINE AND SAVE ALL DONOR METADATA +print("combining all metadata") +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-metadata" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +print("combining {} metaata files".format(len(all_metadata_files))) +all_metadata = pd.DataFrame() +for f in all_metadata_files: + temp_meta = pd.read_csv(f, low_memory=False) + all_metadata = pd.concat( + [all_metadata, temp_meta], + ignore_index=True, + sort=False + ) + +all_metadata.to_csv( + os.path.join( + donor_folder, + phi_date_stamp + + "-cgm-metadata-0-{}.csv.gz".format(str(len(all_metadata_files))) + ) +) +print("finished saving metadata...starting distribution data...") + + +# %% COMBINE AND SAVE ALL 
DISTRIBUTION DATA +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-distributions" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +print("combining {} distribution data files".format(len(all_metadata_files))) +chunks = np.arange(0, len(all_metadata_files), int(args.chunk_size)) +chunks = np.append(chunks, len(all_metadata_files)) +for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]): + print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata = pd.DataFrame() + for c_idx in np.arange(chunk_start, chunk_end): + temp_meta = pd.read_csv( + all_metadata_files[c_idx], + index_col=[0], + low_memory=False + ) + distribution_metadata = pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + # save chunk + print("saving chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata.to_csv( + os.path.join( + donor_folder, + phi_date_stamp + "-cgm-distributions-{}-{}.csv.gz".format( + str(chunk_start), + str(chunk_end)) + ) + ) +print("finished saving all-dataset-distribution-data...code complete") diff --git a/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py new file mode 100644 index 00000000..12abb350 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/combine_cgm_distribution_results.py @@ -0,0 +1,92 @@ +# -*- coding: utf-8 -*- +"""accept_donors_and_pull_data.py +This is a wrapper script that gets distributions and stats for all donors, +NOTE: this needs to be refactored because it is currently set up to run +on json files that are in a snowflake path + +""" + +# %% REQUIRED LIBRARIES +import pandas as pd +import numpy as np +import os +import glob +import argparse + + +# %% USER INPUTS (choices to be made in order to run the code) +codeDescription = "get 
distribution and stats for all donor's json data" +parser = argparse.ArgumentParser(description=codeDescription) + +parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default="2019-07-17", + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" +) + +parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" +) + +parser.add_argument( + "-c", + "--chunk-size", + dest="chunk_size", + default=50, + help="the output path where the data is stored" +) + +args = parser.parse_args() + + +# %% COMBINE AND SAVE ALL DISTRIBUTION DATA + +phi_date_stamp = "PHI-" + args.date_stamp +donor_folder = os.path.join(args.data_path, phi_date_stamp + "-donor-data") + +metadata_path = os.path.join( + args.data_path, + phi_date_stamp + "-donor-data", + phi_date_stamp + "-cgm-distributions" +) + +all_metadata_files = glob.glob(os.path.join(metadata_path, "*.csv.gz")) +print("combining {} distribution data files".format(len(all_metadata_files))) +chunks = np.arange(0, len(all_metadata_files), int(args.chunk_size)) +chunks = np.append(chunks, len(all_metadata_files)) +for chunk_start, chunk_end in zip(chunks[0:-1], chunks[1:]): + print("starting chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata = pd.DataFrame() + for c_idx in np.arange(chunk_start, chunk_end): + temp_meta = pd.read_csv( + all_metadata_files[c_idx], + index_col=[0], + low_memory=False + ) + distribution_metadata = pd.concat( + [distribution_metadata, temp_meta], + ignore_index=True, + sort=False + ) + # save chunk + print("saving chunk {}-{}".format(str(chunk_start), str(chunk_end))) + distribution_metadata.to_csv( + os.path.join( + donor_folder, + phi_date_stamp + "-cgm-distributions-{}-{}.csv.gz".format( + str(chunk_start), + str(chunk_end)) + ) + ) +print("finished saving 
all-dataset-distribution-data...code complete") diff --git a/projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py b/projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py new file mode 100644 index 00000000..f691f506 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/development-versions/get_cgm_distributions_v3.py @@ -0,0 +1,2397 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +calculate cgm statsistics for a single tidepool (donor) dataset +''' + + +# %% REQUIRED LIBRARIES +import os +import sys +import hashlib +import pytz +import numpy as np +import pandas as pd +import datetime as dt +import glob +import pdb +# TODO: figure out how to get rid of these path dependcies +get_donor_data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) +if get_donor_data_path not in sys.path: + sys.path.insert(0, get_donor_data_path) +import environmentalVariables +from get_donor_data.get_single_donor_metadata import get_shared_metadata +from get_donor_data.get_single_tidepool_dataset import get_data +from get_donor_data.get_single_tidepool_dataset_json import make_folder_if_doesnt_exist + +# %% CONSTANTS +MGDL_PER_MMOLL = 18.01559 + + +# %% FUNCTIONS +''' +the functions that are called in this script, +which includes notes of where the functions came from, +and whether they were refactored +''' + + +def get_episodes( + df, + episode_criterion="cgm < 54", + min_duration=5, +): + # TODO: deal with case where there are nan's in the middle of an episode + # it probably makes sense to interpolate between values iff the gap is + # <= 1 to 6 points (5 to 30 minutes) + + # put consecutive data that matches in groups + df["tempGroups"] = (( + df[episode_criterion] != df[episode_criterion].shift() + ).cumsum()) + + df["episodeId"] = ( + df["tempGroups"] * df[episode_criterion] + ) + + # group by the episode groups + episode_groups = df.groupby("episodeId") + 
episodes = episode_groups["roundedUtcTime"].count().reset_index() + episodes["duration"] = episodes["roundedUtcTime"] * 5 + episodes.rename(columns={"roundedUtcTime": "episodeCounts"}, inplace=True) + + df = pd.merge(df, episodes, on="episodeId", how="left") + df["episodeDuration"] = ( + df["duration"] * df[episode_criterion] + ) + + # mark record as belonging to an episode + df["isEpisode"] = ( + df["episodeDuration"] >= min_duration + ) + + # get the hypo episode starts so we only count each episode once + df["episodeStart"] = ( + (df[episode_criterion]) + & (~df[episode_criterion].shift(1).fillna(False)) +# & (df["hasCgm"]) +# & (df["hasCgm"].shift(1)) + ) + + # calculate the total duration and attach to start record + # which is needed to get the average duration per episode + df["episodeTotalDuration"] = ( + df["episodeStart"] * df["episodeDuration"] + ) + df["episodeTotalDuration"].replace(0, np.nan, inplace=True) + + episode_prefix = ( + "episode." + episode_criterion + + ".durationThreshold=" + str(min_duration) + "." 
+ ) + + df = df[[ + "isEpisode", "episodeId", "episodeStart", "episodeTotalDuration" + ]].add_prefix(episode_prefix) + + return df + + +def get_slope(y): + if "array" not in type(y).__name__: + raise TypeError('Expecting a numpy array') + + count_ = len(y) + + x = np.arange(start=0, stop=count_*5, step=5) + + sum_x = x.sum() + sum_y = y.sum() + sum_xy = (x * y).sum() + sum_x_squared = (x * x).sum() + + slope = ( + ((count_ * sum_xy) - (sum_x * sum_y)) + / ((count_ * sum_x_squared) - (sum_x * sum_x)) + ) + + return slope + + +def expand_entire_dict(ts): + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + notnull_idx = ts.index[ts.notnull()] + temp_df = pd.DataFrame( + ts[notnull_idx].tolist(), + index=notnull_idx + ) + + return temp_df + + +def expand_embedded_dict(ts, key_): + '''Expanded a single field that has embedded json + + Args: + ts: a pandas time series of the field that has embedded json + key_: the key that you want to expand + + Raise: + TypeError: if you don't pass in a pandas time series + + Returns: + key_ts: a new time series of the key of interest + + NOTE: + this is new function + TODO: + could be refactored to allow multiple keys or all keys to be returned + could be refactored for speed as the current process + ''' + + if "Series" not in type(ts).__name__: + raise TypeError('Expecting a pandas time series object') + key_ts = pd.Series(name=ts.name + "." + key_, index=ts.index) + notnull_idx = ts.notnull() + # TODO: maybe sped up by only getting the one field of interest? 
+ # though, the current method is fairly quick and compact + temp_df = expand_entire_dict(ts) + if key_ in list(temp_df): + key_ts[notnull_idx] = temp_df[key_].values + + return key_ts + + +def get_embedded_field(ts, embedded_field): + '''get a field that is nested in more than 1 embedded dictionary (json) + + Args: + ts: a pandas time series of the field that has embedded json + embedded_field (str): the location of the field that is deeply nested + (e.g., "origin.payload.device.model") + + Raise: + ValueError: if you don't pass in a pandas time series + + Returns: + new_ts: a new time series of the key of interest + + NOTE: + this is new function + the "." notation is used to reference nested json + + ''' + field_list = embedded_field.split(".") + if len(field_list) < 2: + raise ValueError('Expecting at least 1 embedded field') + + new_ts = expand_embedded_dict(ts, field_list[1]) + for i in range(2, len(field_list)): + new_ts = expand_embedded_dict(new_ts, field_list[i]) + + return new_ts + + +def add_upload_info_to_cgm_records(groups, df): + upload_locations = [ + "upload.uploadId", + "upload.deviceManufacturers", + "upload.deviceModel", + "upload.deviceSerialNumber", + "upload.deviceTags" + ] + + if "upload" in groups["type"].unique(): + upload = groups.get_group("upload").dropna(axis=1, how="all").add_prefix("upload.") + df = pd.merge( + left=df, + right=upload[list(set(upload_locations) & set(list(upload)))], + left_on="uploadId", + right_on="upload.uploadId", + how="left" + ) + + return df + + +def expand_heathkit_cgm_fields(df): + # TODO: refactor the code/function that originally grabs + # these fields, so we are only doing it once, and so + # we don't have to drop the columns for the code below to work. 
+ drop_columns = [ + 'origin.payload.device.name', + 'origin.payload.device.manufacturer', + 'origin.payload.sourceRevision.source.name' + ] + for drop_col in drop_columns: + if drop_col in list(df): + df.drop(columns=[drop_col], inplace=True) + + healthkit_locations = [ + "origin", + "origin.payload", + "origin.payload.device", + "origin.payload.sourceRevision", + "origin.payload.sourceRevision.source", + "payload", + ] + + for hk_loc in healthkit_locations: + if hk_loc in list(df): + temp_df = ( + expand_entire_dict(df[hk_loc].copy()).add_prefix(hk_loc + ".") + ) + df = pd.concat([df, temp_df], axis=1) + + return df + + +def get_dexcom_cgm_model(df): + # add cgm model + + dexcom_model_locations = [ + "deviceId", + "deviceManufacturers", + "upload.deviceManufacturers", + "deviceModel", + "upload.deviceModel", + "deviceSerialNumber", + "upload.deviceSerialNumber", + "origin.payload.sourceRevision.source.name", + "payload.transmitterGeneration", + "payload.HKMetadataKeySyncIdentifier", + "payload.transmitterId", + ] + + for model_location in dexcom_model_locations: + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + # G4 + g4_idx = str_list.contains("G4", case=False, na=False) + df.loc[g4_idx, "cgmModel"] = "G4" + df.loc[g4_idx, "cgmModelSensedFrom"] = model_location + + # G5 + g5_idx = str_list.contains("G5", case=False, na=False) + df.loc[g5_idx, "cgmModel"] = "G5" + df.loc[g5_idx, "cgmModelSensedFrom"] = model_location + + # G6 + g6_idx = str_list.contains("G6", case=False, na=False) + df.loc[g6_idx, "cgmModel"] = "G6" + df.loc[g6_idx, "cgmModelSensedFrom"] = model_location + + # edge case of g5 and g6 + g5_g6_idx = (g5_idx & g6_idx) + df.loc[g5_g6_idx, "cgmModel"] = "G5_G6" + df.loc[g5_g6_idx, 
"cgmModelSensedFrom"] = model_location + + # case of "transmitterId" + if ( + ("transmitterId" in model_location) + | ("payload.HKMetadataKeySyncIdentifier" in model_location) + ): + # if length of string is 5, then it is likely a G4 sensor + length5_idx = str_list.len() == 5 + df.loc[length5_idx, "cgmModel"] = "G4" + df.loc[length5_idx, "cgmModelSensedFrom"] = model_location + + # if length of string > 5 then might be G5 or G6 + length_gt5_idx = str_list.len() > 5 + + # if sensor stats with 4 then likely G5 + starts4_idx = str_list.startswith("4") + df.loc[(length_gt5_idx & starts4_idx), "cgmModel"] = "G5" + df.loc[(length_gt5_idx & starts4_idx), "cgmModelSensedFrom"] = model_location + + # if sensor stats with 2 or 8 then likely G6 + starts2_6_idx = ( + (str_list.startswith("2")) | (str_list.startswith("8")) + ) + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModel"] = "G6" + df.loc[(length_gt5_idx & starts2_6_idx), "cgmModelSensedFrom"] = model_location + + return df[["cgmModel", "cgmModelSensedFrom"]] + + +def get_non_dexcom_cgm_model(df): + # non-dexcom cgm model query + model_locations = ["deviceId"] + + # model types (NOTE: for medtronic getting pump type not cgm) + models_670G = "MMT-158|MMT-178" + models_640G = "MMT-1511|MMT-1512|MMT-1711|MMT-1712" + models_630G = "MMT-1514|MMT-1515|MMT-1714|MMT-1715" + models_530G = ( + "530G|MedT-551|MedT-751|MedT-554|MedT-754|Veo - 554|Veo - 754" + ) + models_523_723 = "MedT-523|MedT-723|Revel - 523|Revel - 723" # 523/723 + models_libre = "AbbottFreeStyleLibre" + models_animas = "IR1295" + # NOTE: the tandem G4 will first be written as G5_G6, + # but the logic should overwrite back to G4 + models_tandem_G5_G6 = "tandem" + models_tandem_G4 = "4628003|5448003" + + non_dex_models = [ + models_670G, models_640G, models_630G, models_530G, models_523_723, + models_libre, models_animas, models_tandem_G5_G6, models_tandem_G4 + ] + + non_dex_model_names = [ + "670G", "640G", "630G", "530G", "523_723", + "LIBRE", "G4", "G5_G6", 
"G4" + ] + + for model_location in model_locations: + # only check if model has NOT been determined, or if it is G5_G6 + m_idx = ( + (df["cgmModel"].isnull()) + | (df["cgmModel"].astype(str).str.contains("G5_G6")) + ) + + # get index that matches model + if ((model_location in list(df)) & (m_idx.sum() > 0)): + str_list = df[model_location].astype(str).str + + for non_dex_model, model_name in zip( + non_dex_models, non_dex_model_names + ): + + model_idx = str_list.contains(non_dex_model, na=False) + df.loc[model_idx, "cgmModel"] = model_name + df.loc[model_idx, "cgmModelSensedFrom"] = model_location + + return df[["cgmModel", "cgmModelSensedFrom"]] + + +def hash_userid(userid, salt): + ''' + taken from anonymize-and-export.py + refactored name(s) to meet style guide + ''' + usr_string = userid + salt + hash_user = hashlib.sha256(usr_string.encode()) + hashid = hash_user.hexdigest() + + return hashid + + +def get_type(val): + return type(val).__name__ + + +def remove_negative_durations(df): + ''' + taken from https://github.com/tidepool-org/data-analytics/blob/ + etn/get-settings-and-events/projects/get-donors-pump-settings/ + get-users-settings-and-events.py + + refactored name(s) to meet style guide + refactored pandas field call to df["field"] instead of df.field + refactored because physical activity includes embedded json, whereas + the other fields in the data model require a integer + TODO: I think that durations are coming in as floats too, so we need + to refactor to account for that. 
def tslim_calibration_fix(df):
    '''
    taken from https://github.com/tidepool-org/data-analytics/blob/
    etn/get-settings-and-events/projects/get-donors-pump-settings/
    get-users-settings-and-events.py

    refactored name(s) to meet style guide
    refactored pandas field call to df["field"] instead of df.field
    refactored to only expand one field

    Copies the calibration reading stored in the payload of tandem
    deviceEvent records into the "value" column.

    INPUTS:
        * df: dataframe; if it has a "payload" column the
          "calibration_reading" entry is expanded into
          "payload.calibration_reading" (df is modified in place)
    RETURNS:
        * (df, n_cal_readings) where n_cal_readings is the number of tandem
          deviceEvent rows that carry a calibration reading (0 if the
          required columns are missing)
    '''
    n_cal_readings = 0

    # expand payload field one level
    if "payload" in df:
        df["payload.calibration_reading"] = (
            expand_embedded_dict(df["payload"], "calibration_reading")
        )

    # nothing to fix unless every column used below is available
    required_columns = ("payload.calibration_reading", "deviceId", "type")
    if any(column not in df for column in required_columns):
        return df, n_cal_readings

    cal_index = df["payload.calibration_reading"].notnull()
    # na=False keeps rows without a deviceId out of the mask instead of
    # propagating NaN into the boolean "&" below
    tandem_data_index = (
        df["deviceId"].str.contains("tan", na=False)
        & (df["type"] == "deviceEvent")
    )
    valid_index = tandem_data_index & cal_index
    n_cal_readings = int(valid_index.sum())

    if n_cal_readings > 0:
        readings = df.loc[valid_index, "payload.calibration_reading"]
        # if reading is > 30 then it is in the wrong units
        if readings.min() > 30:
            readings = readings / MGDL_PER_MMOLL
        # write back to exactly the rows the readings came from; writing to
        # every calibration row would blank the value of non-tandem rows
        # because pandas aligns the right-hand side on the index
        df.loc[valid_index, "value"] = readings

    return df, n_cal_readings
def get_healthkit_timezone(df):
    '''
    Pull the HealthKit time zone out of the payload and use it to fill the
    "timezone" and "deviceType" columns.

    INPUTS:
        * df: dataframe, modified in place. If it has a "payload" column the
          "HKTimeZone" entry is expanded into "payload.HKTimeZone".
    RETURNS:
        * df[["timezone", "deviceType"]]; rows that have a HealthKit time
          zone get that time zone and deviceType "healthkit". Existing
          values in rows without a HealthKit time zone are left untouched,
          and a column that does not exist yet is created as NaN.

    TODO: refactor to account for more efficient way to get embedded json
    '''
    if "payload" in df:
        df["payload.HKTimeZone"] = (
            expand_embedded_dict(df["payload"], "HKTimeZone")
        )

    if "payload.HKTimeZone" in df:
        hk_tz_idx = df["payload.HKTimeZone"].notnull()
        if "timezone" in df:
            df.loc[hk_tz_idx, "timezone"] = (
                df.loc[hk_tz_idx, "payload.HKTimeZone"]
            )
        else:
            # no timezone column yet, so the healthkit column becomes it
            df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True)
        df.loc[hk_tz_idx, "deviceType"] = "healthkit"

    # only create what is missing; never overwrite a timezone or deviceType
    # that the data already carries
    for column in ("timezone", "deviceType"):
        if column not in df:
            df[column] = np.nan

    return df[["timezone", "deviceType"]]
def round_time(
    df,
    time_interval_minutes=5,
    start_with_first_record=True,
    return_calculation_columns=False
):
    '''
    A general purpose round time function that rounds the "time"
    field to nearest minutes
    INPUTS:
        * a dataframe (df) or time series that contains only one time field
          that you want to round
        * time_interval_minutes (defaults to 5 minutes given that most cgms
          output every 5 minutes)
        * start_with_first_record starts the rounding with the first record
          if True, and the last record if False (defaults to True)
        * return_calculation_columns specifies whether the extra columns
          used to make calculations are kept on the working dataframe
          (the returned array is the same either way)
    RETURNS:
        * array of rounded times, ordered by the original index
    NOTE: when a dataframe (rather than a series) is passed in, it is
    modified in place (renamed, re-indexed, and given helper columns)
    refactored name(s) to meet style guide
    '''
    # if a time series is passed in, convert to dataframe
    if isinstance(df, pd.Series):
        df = pd.DataFrame(df)
    columns_ = list(df)
    if len(columns_) > 1:
        sys.exit(
            "Error: df should only have one time column"
        )
    df.rename(columns={columns_[0]: "t"}, inplace=True)

    # nothing to round (the chunk logic below needs at least one record)
    if len(df) == 0:
        return df["t"].values

    df.sort_values(
        by="t",
        ascending=start_with_first_record,
        inplace=True
    )

    # name the index before resetting it, so the saved column is always
    # called "originalIndex" even when the incoming index has its own name
    df.index = df.index.rename("originalIndex")
    df.reset_index(drop=False, inplace=True)

    # calculate the time between consecutive records
    interval_seconds = 60 * time_interval_minutes
    df["t_shift"] = df["t"].shift(1)
    time_step = df["t"] - df["t_shift"]
    df["timeBetweenRecords"] = (
        time_step.dt.days * (86400 / interval_seconds)
        + time_step.dt.seconds / interval_seconds
    ).round() * time_interval_minutes

    # separate the data into chunks if timeBetweenRecords is greater than
    # 2 times the minutes so the rounding process
    # starts over
    is_big_gap = df["timeBetweenRecords"].abs() > (time_interval_minutes * 2)
    big_gaps = [0] + list(df.index[is_big_gap]) + [len(df)]

    for chunk_start, chunk_stop in zip(big_gaps[:-1], big_gaps[1:]):
        # .loc slices include the end label, so stop one row early
        rows = slice(chunk_start, chunk_stop - 1)
        first_chunk = df.loc[chunk_start, "t"]

        # calculate the time difference between
        # each time record and the first record
        elapsed = df.loc[rows, "t"] - first_chunk
        df.loc[rows, "minutesFromFirstRecord"] = (
            elapsed.dt.days * (86400 / 60) + elapsed.dt.seconds / 60
        )

        # then round to the nearest X Minutes
        # NOTE: the ".000001" ensures that multiples of 2:30 always rounds up.
        df.loc[rows, "roundedMinutesFromFirstRecord"] = (
            (df.loc[rows, "minutesFromFirstRecord"] / time_interval_minutes)
            + 0.000001
        ).round() * time_interval_minutes

        rounded_first_record = (
            first_chunk + pd.Timedelta(microseconds=1)
        ).round(str(time_interval_minutes) + "min")

        df.loc[rows, "roundedTime"] = rounded_first_record + pd.to_timedelta(
            df.loc[rows, "roundedMinutesFromFirstRecord"], unit="m"
        )

    if return_calculation_columns is False:
        df.drop(
            columns=[
                "timeBetweenRecords",
                "minutesFromFirstRecord",
                "roundedMinutesFromFirstRecord"
            ], inplace=True
        )
    # sort back to the original index
    df.sort_values(by="originalIndex", inplace=True)

    return df["roundedTime"].values
def remove_invalid_cgm_values(df):
    '''
    Remove cgm ("cbg") records whose "mg/dL" value is below 38 or above 402.

    INPUTS:
        * df: dataframe with "type" and "mg/dL" columns
    RETURNS:
        * (df, nRemoved): a new dataframe without the invalid cgm rows, and
          the number of rows that were removed
    '''
    nBefore = len(df)
    # remove values < 38 and > 402 mg/dL
    is_cgm = df["type"] == "cbg"
    out_of_range = (df["mg/dL"] < 38) | (df["mg/dL"] > 402)
    # filter with a row mask rather than dropping by index label, so that a
    # valid row sharing an index label with an invalid row is not removed
    df = df[~(is_cgm & out_of_range)].copy()
    nRemoved = nBefore - len(df)

    return df, nRemoved
def get_timezone_offset(currentDate, currentTimezone):
    '''
    Return the UTC offset, in whole minutes, of a time zone on a given date.

    INPUTS:
        * currentDate: naive datetime (or pandas Timestamp) to evaluate
        * currentTimezone: time zone name understood by pytz.timezone
    RETURNS:
        * tzo: int, minutes east of UTC (negative for zones west of UTC)
    '''
    tz = pytz.timezone(currentTimezone)
    # here we add 1 day to the current date to account for changes to/from DST
    localized = tz.localize(currentDate + dt.timedelta(days=1))
    # read the offset directly instead of parsing the "%z" string; splitting
    # the +/-HHMM number with floor() gave the wrong answer for negative
    # offsets that are not whole hours (e.g., -0330 came out as -310)
    tzo = int(localized.utcoffset().total_seconds() / 60)

    return tzo
def add_home_timezone(df, contDays):
    '''
    Add the "home" time zone series to the contiguous day series.

    The home time zone is the most frequent non-null value of
    df["timezone"]; its offset is evaluated for every day in contDays.

    INPUTS:
        * df: dataframe that may have a "timezone" column
        * contDays: dataframe with a "date" column (modified in place)
    RETURNS:
        * contDays with "home.imputed.timezoneOffset",
          "home.imputed.timezone", and "home.imputed.timeProcessing" added
          (all NaN if df has no usable timezone; timeProcessing is always
          NaN)
    '''
    # "and" short-circuits, so a df without a timezone column falls through
    # to the NaN branch instead of raising a KeyError
    has_timezone = (
        ("timezone" in df) and (df["timezone"].notnull().sum() > 0)
    )

    if has_timezone:
        homeTimezone = df["timezone"].describe()["top"]
        tzo = contDays["date"].apply(
            lambda x: get_timezone_offset(pd.to_datetime(x), homeTimezone))

        contDays["home.imputed.timezoneOffset"] = tzo
        contDays["home.imputed.timezone"] = homeTimezone

    else:
        contDays["home.imputed.timezoneOffset"] = np.nan
        contDays["home.imputed.timezone"] = np.nan

    # always create this column: the device comparison step reads
    # <series>.timeProcessing for every imputed series, including home
    contDays["home.imputed.timeProcessing"] = np.nan

    return contDays
def estimateTzAndTzoWithDeviceRecords(cDF):
    '''
    Estimate the time zone offset (and time zone, when it can be inferred)
    from the pump and cgm day series.

    INPUTS:
        * cDF: contiguous day series with the est.*, pump.*, and cgm.*
          timezoneOffset columns
    RETURNS:
        * cDF with the device based estimates and annotations applied
    '''
    deviceTypes = ["pump", "cgm"]

    def daysWithoutEstimate(daySeries, deviceType):
        # days where a TZO estimate has not been made AND where the device
        # (e.g., pump or cgm) TZO has data
        noEstimate = daySeries["est.timezoneOffset"].isnull()
        hasDeviceTzo = daySeries[deviceType + ".timezoneOffset"].notnull()
        return daySeries[(noEstimate & hasDeviceTzo)].index

    # 2A. compare the device TZO to the imputed day series (i.e., the upload
    # and home series) to see if the TZ can be inferred
    for deviceType in deviceTypes:
        cDF = compareDeviceTzoToImputedSeries(
            cDF, daysWithoutEstimate(cDF, deviceType), deviceType
        )

    # 2B. for the days that are still missing an estimate, compare the
    # device TZO to the previous day's estimate
    for deviceType in deviceTypes:
        cDF = compareDeviceTzoToPrevDayTzo(
            cDF, daysWithoutEstimate(cDF, deviceType), deviceType
        )

    # 2C. check the DEVICE estimates: if the pump and cgm tzo differ by more
    # than 60 minutes, mark the estimate as UNCERTAIN. A difference of up to
    # 60 minutes is allowed because users often change the device clocks
    # for DST at different times
    isDeviceEstimate = cDF["est.type"] == "DEVICE"
    hasPumpTzo = cDF["pump.timezoneOffset"].notnull()
    hasCgmTzo = cDF["cgm.timezoneOffset"].notnull()
    tzoDiffers = cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"]
    candidateDays = cDF[
        (isDeviceEstimate & hasPumpTzo & hasCgmTzo & tzoDiffers)
    ].index

    tzoDifference = (
        cDF.loc[candidateDays, "cgm.timezoneOffset"]
        - cDF.loc[candidateDays, "pump.timezoneOffset"]
    ).abs()
    mismatchDays = tzoDifference.index[tzoDifference > 60]

    cDF.loc[mismatchDays, ["est.type"]] = "UNCERTAIN"
    for day in mismatchDays:
        cDF = addAnnotation(cDF, day, "pump-cgm-tzo-mismatch")

    return cDF
"home.imputed.timezoneOffset"]: + + cDF.loc[i, ["est.type"]] = "IMPUTE" + + cDF.loc[i, ["est.timezoneOffset"]] = \ + cDF.loc[i, "home.imputed.timezoneOffset"] + + cDF.loc[i, ["est.timezone"]] = \ + cDF.loc[i, "home.imputed.timezone"] + + cDF = addAnnotation(cDF, i, "gap=" + str(gapSize)) + cDF.loc[i, ["est.gapSize"]] = gapSize + + else: + cDF.loc[i, ["est.type"]] = "UNCERTAIN" + cDF = addAnnotation(cDF, i, "unable-to-impute-tzo") + else: + cDF["est.type"] = "UNCERTAIN" + cDF["est.annotations"] = "unable-to-impute-tzo" + + return cDF + + +def getRangeOfTZOsForTimezone(tz): + minMaxTzo = [getTimezoneOffset(pd.to_datetime("1/1/2017"), tz), + getTimezoneOffset(pd.to_datetime("5/1/2017"), tz)] + + rangeOfTzo = np.arange(int(min(minMaxTzo)), int(max(minMaxTzo))+1, 15) + + return rangeOfTzo + + +def getListOfDSTChangeDays(cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = \ + cDF[abs(cDF["home.imputed.timezoneOffset"] - + cDF["home.imputed.timezoneOffset"].shift(-1)) > 0].date + + return dstChangeDays + + +def correctEstimatesAroundDst(df, cDF): + + # get a list of DST change days for the home time zone + dstChangeDays = getListOfDSTChangeDays(cDF) + + # loop through the df within 2 days of a daylight savings time change + for d in dstChangeDays: + dstIndex = df[(df.date > (d + dt.timedelta(days=-2))) & + (df.date < (d + dt.timedelta(days=2)))].index + for dIdx in dstIndex: + if pd.notnull(df.loc[dIdx, "est.timezone"]): + tz = pytz.timezone(df.loc[dIdx, "est.timezone"]) + tzRange = getRangeOfTZOsForTimezone(str(tz)) + minHoursToLocal = min(tzRange)/60 + tzoNum = int(tz.localize(df.loc[dIdx, "utcTime"] + + dt.timedelta(hours=minHoursToLocal)).strftime("%z")) + tzoHours = np.floor(tzoNum / 100) + tzoMinutes = round((tzoNum / 100 - tzoHours) * 100, 0) + tzoSign = np.sign(tzoHours) + tzo = int((tzoHours * 60) + (tzoMinutes * tzoSign)) + localTime = \ + df.loc[dIdx, "utcTime"] + pd.to_timedelta(tzo, unit="m") + df.loc[dIdx, ["est.localTime"]] = 
def tzoRangeWithComparisonTz(df, i, comparisonTz):
    '''
    Return the range of timezone offset values for the comparison time zone.

    INPUTS:
        * df, i: not used by this function
        * comparisonTz: time zone name, or a null value if there is no
          time zone to compare against
    RETURNS:
        * the result of getRangeOfTZOsForTimezone(comparisonTz), or an empty
          array when comparisonTz is null
    '''
    # without a time zone estimate there is no range to compare against
    if pd.isnull(comparisonTz):
        return np.array([])

    return getRangeOfTZOsForTimezone(comparisonTz)
addAnnotation(df, i, "tzo-from-" + device) + + return df + + +def compareDeviceTzoToPrevDayTzo(df, sIdx, device): + + for i in sIdx[sIdx > 0]: + + # first see if the previous record has a tzo + if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])): + + previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i-1, "est.timezoneOffset"]) + + # next see if the previous record has a tz + if (pd.notnull(df.loc[i-1, "est.timezone"])): + + if timeDiff == 0: + assignTzoFromPreviousDay(df, i, previousDayTz) + + # see if the previous day's tzo and device tzo are within the + # dst range (as that is a common problem with this data) + elif ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i-1, "est.timezoneOffset"] in dstRange)): + + # then see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], previousDayTz): + + df = addAnnotation(df, i, "dst-change-day") + assignTzoFromPreviousDay(df, i, previousDayTz) + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. There is a known issue where the BtUTC + # procedure puts clock drift into the device.tzo, + # and as a result the tzo can be off by 15, 30, + # or 45 minutes. 
+ if (((df.loc[i, device + ".timezoneOffset"] == + min(dstRange)) | + (df.loc[i, device + ".timezoneOffset"] == + max(dstRange))) & + ((df.loc[i-1, "est.timezoneOffset"] == + min(dstRange)) | + (df.loc[i-1, "est.timezoneOffset"] == + max(dstRange)))): + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-dst-error-OR-travel") + + else: + + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, + "likely-15-min-dst-error") + + # next see if time difference between device.tzo and prev.tzo + # is off by 720 minutes, which is indicative of a common + # user AM/PM error + elif timeDiff == 720: + df.loc[i, ["est.type"]] = "UNCERTAIN" + df = addAnnotation(df, i, "likely-AM-PM-error") + + # if it doesn't fall into any of these cases, then the + # tzo difference is likely due to travel + else: + df = assignTzoFromDeviceTzo(df, i, device) + + elif timeDiff == 0: + df = assignTzoFromDeviceTzo(df, i, device) + + # if there is no previous record to compare with check for dst errors, + # and if there are no errors, it is likely a travel day + else: + + comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i) + timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) - + df.loc[i, "home.imputed.timezoneOffset"]) + + if ((df.loc[i, device + ".timezoneOffset"] in dstRange) + & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)): + + # see if it is DST change day + if isDSTChangeDay(df.loc[i, "date"], comparisonTz): + + df = addAnnotation(df, i, "dst-change-day") + df.loc[i, ["est.type"]] = "DEVICE" + df.loc[i, ["est.timezoneOffset"]] = \ + df.loc[i, device + ".timezoneOffset"] + df.loc[i, ["est.timezone"]] = \ + df.loc[i, "home.imputed.timezone"] + df.loc[i, ["est.timeProcessing"]] = \ + df.loc[i, device + ".upload.imputed.timeProcessing"] + + # if it is not DST change day, then mark this as uncertain + else: + # also, check to see if the difference between device. + # tzo and prev.tzo is less than the expected dst + # difference. 
def getImputIndices(df, sIdx, hIdx):
    '''
    Find the day indices that bound the next gap to impute.

    INPUTS:
        * df: day series (only its length is used)
        * sIdx: index of the days that do not have a tzo estimate
        * hIdx: index of the days that do have a tzo estimate
    RETURNS:
        * (currentDayIdx, prevDayIdx, nextDayIdx): the first day without an
          estimate, the day before it, and the closest day at or after it
          that has an estimate (capped at the last row of df)
    '''
    currentDayIdx = sIdx.min()

    # distance from the current day to every day that has an estimate;
    # only the days at or after the current day are candidates
    daysFromCurrent = pd.Series(hIdx) - currentDayIdx
    daysToNextEstimate = min(daysFromCurrent[daysFromCurrent >= 0])

    nextDayIdx = min(currentDayIdx + daysToNextEstimate, len(df) - 1)

    return currentDayIdx, currentDayIdx - 1, nextDayIdx
def getTimezoneOffset(currentDate, currentTimezone):
    '''
    Return the UTC offset, in whole minutes, of a time zone on a given date.

    INPUTS:
        * currentDate: naive datetime (or pandas Timestamp) to evaluate
        * currentTimezone: time zone name understood by pytz.timezone
    RETURNS:
        * tzo: int, minutes east of UTC (negative for zones west of UTC)
    '''
    tz = pytz.timezone(currentTimezone)
    # here we add 1 day to the current date to account for changes to/from DST
    localized = tz.localize(currentDate + dt.timedelta(days=1))
    # read the offset directly instead of parsing the "%z" string; splitting
    # the +/-HHMM number with floor() gave the wrong answer for negative
    # offsets that are not whole hours (e.g., -0330 came out as -310)
    tzo = int(localized.utcoffset().total_seconds() / 60)

    return tzo
(df["timezoneOffset"].notnull())].copy() + cDays = add_device_day_series(cgmdf, cDays, "cgm") + + # create day series for pump df + pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() + cDays = add_device_day_series(pumpdf, cDays, "pump") + + # interpolate between upload records of the same deviceType, and create a + # day series for interpolated pump, non-hk-cgm, and healthkit uploads + for deviceType in ["pump", "cgm", "healthkit"]: + tempUploaddf = df[df["deviceType"] == deviceType].copy() + cDays = impute_upload_records( + tempUploaddf, cDays, deviceType + ".upload.imputed" + ) + + # add a home timezone that also accounts for daylight savings time changes + cDays = add_home_timezone(df, cDays) + + # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO + cDays = estimateTzAndTzoWithUploadRecords(cDays) + + # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) + # estimates can be made from pump and cgm df that have a TZO + # NOTE: the healthkit and dexcom-api cgm df are excluded + cDays = estimateTzAndTzoWithDeviceRecords(cDays) + + # 3. impute, infer, or interpolate gaps in the estimated tzo and tz + cDays = imputeTzAndTzo(cDays) + + # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df + local_time = applyLocalTimeEstimates(df, cDays) + + return local_time, cDays + + +# %% GET DATA FROM JSON FILE +data_path = os.path.join("..", "data") +all_donor_metadata = pd.read_csv( + os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-donor-metadata.csv"), + low_memory=False +) + +# glob through the json files that are available +all_files = glob.glob( + os.path.join( + data_path, + "dremio", + "**", + "*.json" + ), + recursive=True +) + +output_metadata = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-cgm-metadata" +) +output_distribution = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-cgm-distributions" +) +debug_duplicates = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-debug-cgm-duplicates" +) +output_stats = os.path.join( + data_path, + "PHI-2019-07-17-donor-data", + "PHI-2019-07-17-cgm-stats" +) + + +make_folder_if_doesnt_exist( + [output_metadata, output_distribution, debug_duplicates, output_stats] +) + + +# %% START OF CODE +timezone_aliases = pd.read_csv( + "wikipedia-timezone-aliases-2018-04-28.csv", + low_memory=False +) + +donor_metadata_columns = [ + 'userid', + 'diagnosisType', + 'diagnosisDate', + 'biologicalSex', + 'birthday', + 'targetTimezone', + 'targetDevices', + 'isOtherPerson', +] + + +## %% load test data on my computer +## TODO: if data comes in as a .csv, the embedded json fields +## get saved as a string and need to be unwrapped before those fields +## can be expanded. 
IN OTHER WORDS: this code only works with .json data +#for d_idx in [0]: +# userid = "0d4524bc11" +# data = pd.read_json(os.path.join( +# "..", "data", "dremio", userid, "PHI-{}.json".format(userid) +# )) + +# %% +for d_idx in range(0, len(all_files)): + data = pd.read_json(all_files[d_idx]) + userid = all_files[d_idx][-15:-5] + metadata = all_donor_metadata.loc[ + all_donor_metadata["userid"] == userid, + donor_metadata_columns + ] + print("\n", "starting", userid) + + # HASH USER ID + hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) + data["userid"] = userid + data["hashid"] = hashid + metadata["hashid"] = hashid + + # CLEAN DATA + data_fields = list(data) + + # NOTE: moving remove negative durations to type specific cleaning + # TODO: ask backend to change "duration" to only include one object type + + # Tslim calibration bug fix + data, n_cal_readings = tslim_calibration_fix(data.copy()) + metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + + # fix large timzoneOffset bug in utcbootstrapping + data = timezone_offset_bug_fix(data.copy()) + + # add healthkit timezome information + # TODO: refactor this function to only require fields that might have hk tz + data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) + + # convert deprecated timezones to their aliases + data = convert_deprecated_timezone_to_alias(data, timezone_aliases) + + # TIME RELATED ITEMS + data["utcTime"] = to_utc_datetime(data[["time"]].copy()) + + # add upload time to the data, which is needed for: + # getting rid of duplicates and useful for getting local time + + data["uploadTime"] = ( + add_upload_time(data[["type", "uploadId", "utcTime"]].copy()) + ) + +# # estimate local time (refactor of estimate-local-time.py) +# data["localTime"], local_time_metadata = estimate_local_time(data.copy()) +# +# TODO: fix this issue with estimate local time +# ''' +# //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649 +# FutureWarning: elementwise 
comparison failed; returning scalar instead, +# but in the future will perform elementwise comparison result = method(y) +# ''' + + # round all data to the nearest 5 minutes + data["roundedUtcTime"] = round_time( + data["utcTime"].copy(), + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False + ) + + # TIME CATEGORIES + data["date"] = data["roundedUtcTime"].dt.date + + # AGE, & YLW + # TODO: make this a function + if metadata["birthday"].values[0] is not np.nan: + bDate = pd.to_datetime(metadata["birthday"].values[0][0:7]) + data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + else: + data["age"] = np.nan + + if metadata["diagnosisDate"].values[0] is not np.nan: + dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) + data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) + else: + data["ylw"] = np.nan + + # GROUP DATA BY TYPE + # first sort by upload time (used when removing dumplicates) + data.sort_values("uploadTime", ascending=False, inplace=True) + groups = data.groupby(by="type") + + # check to see if person is looping + if "basal" in data["type"].unique(): + basal = groups.get_group("basal").dropna(axis=1, how="all") + if "deliveryType" in list(basal): + bd = basal.loc[ + basal["deliveryType"] == "temp", + ["date", "deliveryType"] + ] + temp_basal_counts = ( + pd.DataFrame( + bd.groupby("date").deliveryType.count() + ).reset_index() + ) + temp_basal_counts.rename( + {"deliveryType": "tempBasalCounts"}, + axis=1, + inplace=True + ) + data = pd.merge(data, temp_basal_counts, on="date", how="left") + # >= 25 temp basals per day is likely looping + data["isLoopDay"] = data["tempBasalCounts"] >= 25 + # redefine groups with the new data + groups = data.groupby(by="type") + + else: + data["isLoopDay"] = np.nan + else: + data["isLoopDay"] = np.nan + + # %% CGM DATA + if "cbg" in data["type"].unique(): + # sort data with + metadata["cgmData"] = True + + # filter by cgm + cgm = 
groups.get_group("cbg").copy() + + # sort data + cgm.sort_values("roundedUtcTime", ascending=False, inplace=True) + cgm.reset_index(drop=False, inplace=True) + + # calculate cgm in mg/dL + cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) + + # get rid of spike data + cgm, nSpike = remove_spike_data(cgm.copy()) + metadata["nSpike"] = nSpike + + # assign upload cgm device info to cgm records in that upload + cgm = add_upload_info_to_cgm_records(groups, cgm.copy()) + + # check to see if cgm info exists in healthkit locations + cgm = expand_heathkit_cgm_fields(cgm.copy()) + + # replace smoothed cgm values with raw values (if they exist) + # this must run after expand_heathkit_cgm_fields _ + cgm["mg/dL"], metadata["nSmoothedCgmReplaced"] = ( + replace_smoothed_cgm_values(cgm.copy()) + ) + + # get cgm models + cgm["cgmModel"], cgm["cgmModelSensedFrom"] = np.nan, np.nan + + # dexcom cgm models (G4, G5, G6) + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_dexcom_cgm_model(cgm.copy()) + ) + + # for non dexcom cgms + # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_non_dexcom_cgm_model(cgm.copy()) + ) + + # get metadata on cgm models and devices + metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum() + metadata["uniqueCgmModels"] = str(cgm["cgmModel"].unique()) + if "deviceId" in list(cgm): + metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique()) + + # clean distributions + # break up all traces by cgm model + combined_cgm_series = pd.DataFrame() + cgm_models = cgm.groupby(by="cgmModel") + + for cgm_model in cgm_models.groups.keys(): + print("working on", cgm_model) + temp_cgm = cgm_models.get_group(cgm_model) + + # get rid of cgm values too low/high (< 38 & > 402 mg/dL) + temp_cgm, nInvalidCgmValues = remove_invalid_cgm_values(temp_cgm) + metadata["nInvalidCgmValues." 
+ cgm_model] = nInvalidCgmValues + + # sort by upload time before getting rid of duplicates + temp_cgm.sort_values("uploadTime", ascending=False, inplace=True) + + # get rid of duplicates that have the same ["deviceTime", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "deviceTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedDeviceTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same ["time", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "utcTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedUtcTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same roundedTime + temp_cgm, n_cgm_dups_removed = ( + removeDuplicates(temp_cgm, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # create a contiguous 5 minute time series + first_day = temp_cgm["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = temp_cgm["roundedUtcTime"].max() + metadata["lastCgm." 
+ cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=False + ).reset_index(drop=True) + + # merge with cgm data + cgm_series = pd.merge( + contiguous_data, + temp_cgm[[ + "roundedUtcTime", "hashid", "isLoopDay", + "cgmModel", "age", "ylw", "mg/dL" + ]], + on="roundedUtcTime", + how="left" + ) + + # sort so that the oldest data point is on top + cgm_series.sort_values( + "roundedUtcTime", ascending=True, inplace=True + ) + cgm_series.reset_index(drop=True, inplace=True) + + # get dexcom icgm bins + value_bins = np.array( + [37, 39, 60, 80, 120, 160, 200, 250, 300, 350, 400, 403] + ) + value_bin_names = ( + "< 40", "40-60", "61-80", "81-120", "121-160", "161-200", + "201-250", "251-300", "301-350", "351-400", "> 400" + ) + cgm_series["valueBin"] = pd.cut( + cgm_series["mg/dL"], value_bins, labels=value_bin_names + ) + + # get the previous val + cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1) + + # get difference between current and previous val + cgm_series["diffFromPrevVal"] = ( + cgm_series["mg/dL"] - cgm_series["previousVal"] + ) + + # calculate the rate from previous value (mg/dL/min) + cgm_series["rateFromPrevVal"] = cgm_series["diffFromPrevVal"] / 5 + + # get dexcom icgm rate bins + rate_bins = np.array( + [-100, -2.000001, -1.000001, -0.000001, 1, 2, 100] + ) + # NOTE: bracket means include, parentheses means exclude + rate_bin_names = ( + "< -2", "[-2,-1)", "[-1,-0)", "[0,1]", "(1,2]", ">2", + ) + cgm_series["rateBin"] = pd.cut( + cgm_series["rateFromPrevVal"], rate_bins, labels=rate_bin_names + ) + + # through in the join category + cgm_series["valAndRateBin"] = ( + cgm_series["valueBin"].astype(str) + + " & " + + cgm_series["rateBin"].astype(str) + ) + + # calculate slope (mg/dL/min) over the last 15, 30, and 60 minutes + cgm_series["slope15"] = ( + cgm_series["mg/dL"].rolling(3).apply(get_slope, 
raw=True) + ) + + cgm_series["slope30"] = ( + cgm_series["mg/dL"].rolling(6).apply(get_slope, raw=True) + ) + + cgm_series["slope60"] = ( + cgm_series["mg/dL"].rolling(12).apply(get_slope, raw=True) + ) + + # add in the next value + cgm_series["nextVal"] = cgm_series["mg/dL"].shift(-1) + + # get difference or relative increase/decrease of next value + cgm_series["relativeNextValue"] = ( + cgm_series["nextVal"] - cgm_series["mg/dL"] + ) + + # rate of next value + cgm_series["rateToNextVal"] = cgm_series["relativeNextValue"] / 5 + + # drop rows where there is no information + cgm_series.dropna(subset=['hashid'], inplace=True) + metadata["nCgmDataPoints." + cgm_model] = len(cgm_series) + + # append cgm model to a larger table + combined_cgm_series = pd.concat( + [combined_cgm_series, cgm_series], + ignore_index=True + ) + if len(combined_cgm_series) > 0: + # sort so that the oldest data point is on top + # and that the G5_G6 get deleted if they are apart of a duplicate + combined_cgm_series["cgmModel_G5_and_G6"] = ( + combined_cgm_series["cgmModel"] == "G5_G6" + ) + combined_cgm_series.sort_values( + by=["roundedUtcTime", "cgmModel_G5_and_G6", "cgmModel"], + ascending=[False, True, False], + inplace=True + ) + combined_cgm_series.reset_index(drop=True, inplace=True) + + # add in check to see if there are duplicates between cgm devices + nUnique_cgm_times = len(combined_cgm_series["roundedUtcTime"].unique()) + cgm_len = len(combined_cgm_series) + metadata["duplicateCgmDataIssue"] = nUnique_cgm_times != cgm_len + + nDuplicate_cgm = cgm_len - nUnique_cgm_times + metadata["nDuplicateCgmDataIssues"] = nDuplicate_cgm + + # if there are still duplicates, get rid of them + if nDuplicate_cgm > 0: + # save the duplicates for further examination + combined_cgm_series.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-series-has-cgm-duplicates.csv.gz" + )) + + cgm.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-data-has-cgm-duplicates.csv.gz" + 
)) + + # get rid of duplicates + combined_cgm_series, n_cgm_dups_removed = ( + removeDuplicates(combined_cgm_series, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime.atEnd"] = ( + n_cgm_dups_removed + ) + metadata["nCgmDataPoints.atEnd"] = len(combined_cgm_series) + + # add whether data is dexcom cgm or not + combined_cgm_series["dexcomCgm"] = ( + combined_cgm_series["cgmModel"].astype(str).str.contains("G4|G5|G6") + ) + + # save distribution data + combined_cgm_series.to_csv(os.path.join( + output_distribution, + "PHI-" + userid + "-cgm-distribution.csv.gz" + )) + + # %% get cgm stats + # create a contiguous 5 minute time series of ALL cgm data + first_day = combined_cgm_series["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = combined_cgm_series["roundedUtcTime"].max() + metadata["lastCgm." + cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=True + ).reset_index(drop=True) + + # merge with combined_cgm_series data + all_cgm = pd.merge( + contiguous_data, + combined_cgm_series[[ + 'roundedUtcTime', 'hashid', 'cgmModel', 'dexcomCgm', + 'age', 'ylw', 'isLoopDay', 'mg/dL', + ]], + on="roundedUtcTime", + how="left" + ) + + # get cgm stats + # get a binary (T/F) of whether we have a cgm value + all_cgm["hasCgm"] = all_cgm["mg/dL"].notnull() + + # fill isLoopDay nan with False + all_cgm["isLoopDay"].fillna(False, inplace=True) + + # has loop and cgm + all_cgm["hasLoopAndCgm"] = ( + (all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + all_cgm["hasCgmWithoutLoop"] = ( + (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + # work with all of the non-null data, even 39 = LOW and 401 = HIGH + ts39_401 = all_cgm["mg/dL"].copy() + + # some stats should NOT include 39 or 401 + all_cgm["mg/dL.40to400"] = ( + ts39_401.replace(to_replace=39, value=np.nan) + ) + + 
all_cgm["mg/dL.40to400"] = ( + all_cgm["mg/dL.40to400"].replace( + to_replace=401, + value=np.nan + ) + ) + + ts40_400 = all_cgm["mg/dL.40to400"].copy() + + + # for all the less than (<) criteria + for cgm_threshold in [40, 54, 70]: + all_cgm["cgm < " + str(cgm_threshold)] = ( + ts39_401.lt(cgm_threshold) + ) + # get episodes below these thresholds + for min_duration in [5, 15]: + episode_ts = get_episodes( + all_cgm[[ + "roundedUtcTime", + "hasCgm", + "cgm < " + str(cgm_threshold) + ]].copy(), + episode_criterion="cgm < " + str(cgm_threshold), + min_duration=min_duration + ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) + + # for all the greter than or equal to (>=) criteria + all_cgm["cgm >= " + str(cgm_threshold)] = ( + ts39_401.ge(cgm_threshold) + ) + + # for all the the less than or equal to (<=) criteria + for cgm_threshold in [140, 180, 250, 300, 400]: + all_cgm["cgm <= " + str(cgm_threshold)] = ( + ts39_401.le(cgm_threshold) + ) + # for all the the greter than (>) criteria + all_cgm["cgm > " + str(cgm_threshold)] = ( + ts39_401.gt(cgm_threshold) + ) + + # get all of the cgm ranges + # (cgm >= 40) & (cgm < 54) + all_cgm["40 <= cgm < 54"] = ( + (all_cgm["cgm >= 40"]) & (all_cgm["cgm < 54"]) + ) + + # (cgm >= 54) & (cgm < 70) + all_cgm["54 <= cgm < 70"] = ( + (all_cgm["cgm >= 54"]) & (all_cgm["cgm < 70"]) + ) + + # (cgm >= 70) & (cgm <= 140) + all_cgm["70 <= cgm <= 140"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 140"]) + ) + + # (cgm >= 70) & (cgm <= 180) + all_cgm["70 <= cgm <= 180"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 180"]) + ) + + # (cgm > 180) & (cgm <= 250) + all_cgm["180 < cgm <= 250"] = ( + (all_cgm["cgm > 180"]) & (all_cgm["cgm <= 250"]) + ) + + # (cgm > 250) & (cgm <= 400) + all_cgm["250 < cgm <= 400"] = ( + (all_cgm["cgm > 250"]) & (all_cgm["cgm <= 400"]) + ) + + # derfine the windows to calculate the stats over + window_names = ["hour", "day", "week", "month", "quarter", "year"] + window_lengths = [12, 288, 288*7, 288*7*4, 
288*90, 288*365] + + for w_name, w_len in zip(window_names, window_lengths): + # require lenth of window for percent calculations + w_min = w_len + + # get the start and end times for each window + all_cgm[w_name + ".startTime"] = ( + all_cgm["roundedUtcTime"].shift(w_len - 1) + ) + all_cgm[w_name + ".endTime"] = all_cgm["roundedUtcTime"] + + # add majority age for the time period + all_cgm[w_name + ".age"] = np.round( + all_cgm["age"].rolling( + min_periods=1, + window=w_len + ).mean() + ) + + # add majority ylw for the time period + all_cgm[w_name + ".ylw"] = np.round( + all_cgm["ylw"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get percent time cgm used + all_cgm[w_name + ".cgmPercent"] = ( + all_cgm["hasCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".missingCgmPercent"] = ( + 1 - all_cgm[w_name + ".cgmPercent"] + ) + + # create (T/F) 70 and 80 percent available thresholds + # which will be useful for processing later + all_cgm[w_name + ".ge70Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.7 + ) + + all_cgm[w_name + ".ge80Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.8 + ) + + # get percent time Loop was used NOTE: this is + # approximate because we use > 24 temp basals per day + # ALSO: this is percent time Loop was used while cgm in use + all_cgm[w_name + ".loopingAndCgmPercent"] = ( + all_cgm["hasLoopAndCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent of time cgm without loop + all_cgm[w_name + ".cgmWithoutLoopPercent"] = ( + all_cgm["hasCgmWithoutLoop"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get episode stats + # TODO: add in hyper events + # get episodes below these thresholds + for cgm_threshold in [40, 54, 70]: + # get number of episodes per time window + for min_duration in [5, 15]: + "cgm < " + str(cgm_threshold) + episode_name = ( + 
"episode.cgm < " + str(cgm_threshold) + + ".durationThreshold=" + str(min_duration) + ) + all_cgm[w_name + ".count." + episode_name] = ( + all_cgm[episode_name + ".episodeStart"].rolling( + min_periods=1, + window=w_len + ).sum() + ) + + # get avg. duration of each episode per time window + all_cgm[w_name + ".avgDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).sum() / all_cgm[w_name + ".count." + episode_name] + ) + + # get min duration of each episode per time window + all_cgm[w_name + ".minDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).min() + ) + + # get median duration of each episode per time window + all_cgm[w_name + ".medianDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get max duration of each episode per time window + all_cgm[w_name + ".maxDuration." 
+ episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).max() + ) + + # get percent time in different ranges + # % Time < 54 + all_cgm[w_name + ".lt54Percent"] = ( + all_cgm["cgm < 54"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 54-70 (cgm >= 54) & (cgm < 70) + all_cgm[w_name + ".bt54_70Percent"] = ( + all_cgm["54 <= cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 180) + all_cgm[w_name + ".bt70_180Percent"] = ( + all_cgm["70 <= cgm <= 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 180-250 (cgm > 180) & (cgm <= 250) + all_cgm[w_name + ".bt180_250Percent"] = ( + all_cgm["180 < cgm <= 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time > 250 + all_cgm[w_name + ".gt250Percent"] = ( + all_cgm["cgm > 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # check that all of the percentages add of to 1 or 100% + all_cgm[w_name + ".percentCheck"] = ( + all_cgm[w_name + ".missingCgmPercent"] + + all_cgm[w_name + ".lt54Percent"] + + all_cgm[w_name + ".bt54_70Percent"] + + all_cgm[w_name + ".bt70_180Percent"] + + all_cgm[w_name + ".bt180_250Percent"] + + all_cgm[w_name + ".gt250Percent"] + ) + + # here are some other less common percent time in ranges + # % Time < 70 + all_cgm[w_name + ".lt70Percent"] = ( + all_cgm["cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 140) + all_cgm[w_name + ".tir70to140Percent"] = ( + all_cgm["70 <= cgm <= 140"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent time above a threshold + # % Time > 180 + all_cgm[w_name + ".gt180Percent"] = ( + all_cgm["cgm > 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # quantiles + # NOTE: this will 
increase run time, so only run if you need + # 3-4X the processing time since it has to sort the data + # TODO: make this an option to the function, once it is made + # create a rolling object + + # NOTE: these calculations only require 3 points to make + roll39_401 = ts39_401.rolling(min_periods=3, window=w_len) + roll40_400 = ts40_400.rolling(min_periods=3, window=w_len) + + # min + all_cgm[w_name + ".min"] = roll39_401.min() + + # 10, 25, 75, and 90th percentiles + all_cgm[w_name + ".10th"] = roll39_401.quantile(0.10) + all_cgm[w_name + ".25th"] = roll39_401.quantile(0.25) + all_cgm[w_name + ".75th"] = roll39_401.quantile(0.75) + all_cgm[w_name + ".90th"] = roll39_401.quantile(0.90) + + # max + all_cgm[w_name + ".max"] = roll39_401.max() + + # median + all_cgm[w_name + ".median"] = roll39_401.median() + + # iqr + all_cgm[w_name + ".iqr"] = ( + all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] + ) + + # recalcuate percent of measurements available + all_cgm[w_name + ".40to400availablePercent"] = ( + roll40_400.count() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".40to400missingPercent"] = ( + 1 - all_cgm[w_name + ".40to400availablePercent"] + ) + + all_cgm[w_name + ".40to400ge70Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.7 + ) + + all_cgm[w_name + ".40to400ge80Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.8 + ) + + # mean + all_cgm[w_name + ".mean"] = roll40_400.mean() + + # GMI(%) = 3.31 + 0.02392 x [mean glucose in mg/dL] + all_cgm[w_name + ".gmi"] = ( + 3.31 + (0.02392 * all_cgm[w_name + ".mean"]) + ) + + # standard deviation (std) + all_cgm[w_name + ".std"] = roll40_400.std() + + # coefficient of variation (cov) = std / mean + all_cgm[w_name + ".cov"] = ( + all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] + ) + + # %% save cgm stats data + all_cgm.to_csv(os.path.join( + output_stats, + "PHI-" + userid + "-cgm-stats.csv.gz" + )) + # write the most 
recent example of the 90 day stats + # to the metadata + quarter_ge80Available_idx = ( + all_cgm[all_cgm["quarter.ge80Available"]] + ).index.max() + + if pd.notnull(quarter_ge80Available_idx): + # get the most recent quarter + most_recent = all_cgm.loc[ + [quarter_ge80Available_idx], + all_cgm.columns + ] + else: + most_recent = all_cgm.loc[ + [all_cgm.index.max()], + all_cgm.columns + ] + + metadata = pd.merge( + metadata, + most_recent, + on="hashid", + how="left" + ) + + print(metadata.T) + + else: + metadata["cgmData"] = False + print(d_idx, "no cgm data") + + # save metadata + metadata.to_csv(os.path.join( + output_metadata, + "PHI-" + userid + "-cgm-metadata.csv.gz" + )) + + print("finished", d_idx, userid) diff --git a/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py new file mode 100644 index 00000000..4da725b1 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/get_cgm_distributions_and_stats.py @@ -0,0 +1,2453 @@ +#!/usr/bin/env python3 +# -*- coding: utf-8 -*- +''' +calculate cgm distributions and stats for a single tidepool (donor) dataset +from a data that comes from a json file (does NOT work with data save as csv) +''' + + +# %% REQUIRED LIBRARIES +import os +import sys +import hashlib +import pytz +import numpy as np +import pandas as pd +import datetime as dt +import argparse +import pdb + +get_donor_data_path = os.path.abspath( + os.path.join(os.path.dirname(__file__), "..") +) +if get_donor_data_path not in sys.path: + sys.path.insert(0, get_donor_data_path) +from get_donor_data.get_single_tidepool_dataset_json import ( + make_folder_if_doesnt_exist, get_data +) +from get_donor_data.get_single_donor_metadata import get_shared_metadata + +# %% CONSTANTS +MGDL_PER_MMOLL = 18.01559 + + +# %% FUNCTIONS +''' +the functions that are called in this script, +which includes notes of where the functions came from, +and whether 
def get_episodes(
    df,
    episode_criterion="cgm < 54",
    min_duration=5,
):
    """Flag runs of consecutive records that satisfy a boolean criterion.

    Args:
        df: frame holding a "roundedUtcTime" column and the boolean column
            named by ``episode_criterion``; it is not modified
        episode_criterion (str): name of the boolean column that marks
            records meeting the criterion
        min_duration (int): minimum run length, in minutes, for a run to
            count as an episode (each record is counted as 5 minutes)

    Returns:
        DataFrame with a fresh 0..n-1 index and four columns, each prefixed
        with "episode.<criterion>.durationThreshold=<min_duration>.":
            isEpisode: record belongs to a run of at least min_duration
            episodeId: id of the run (0 for records outside the criterion)
            episodeStart: first record of a run that qualifies as an episode
            episodeTotalDuration: run duration in minutes on the start
                record, NaN everywhere else
    """
    # TODO: deal with case where there are nan's in the middle of an episode
    # it probably makes sense to interpolate between values iff the gap is
    # <= 1 to 6 points (5 to 30 minutes)

    # work on positional copies so the caller's frame is left untouched;
    # the result has always carried a fresh RangeIndex
    criterion = df[episode_criterion].reset_index(drop=True)
    times = df["roundedUtcTime"].reset_index(drop=True)

    # put consecutive data that matches in groups
    run_id = (criterion != criterion.shift()).cumsum()
    episode_id = run_id * criterion

    # duration of the run each record belongs to (5 minutes per record),
    # zeroed for records that do not meet the criterion
    run_minutes = times.groupby(episode_id).transform("count") * 5
    episode_duration = run_minutes * criterion

    # mark record as belonging to an episode
    is_episode = episode_duration >= min_duration

    # get the episode starts so we only count each episode once.
    # fill_value keeps the shifted series boolean, so "~" is a logical not,
    # and is_episode makes the duration threshold apply to the starts too
    # (previously runs shorter than min_duration were still counted).
    previous = criterion.shift(1, fill_value=False)
    episode_start = criterion & ~previous & is_episode

    # calculate the total duration and attach to start record
    # which is needed to get the average duration per episode
    episode_total_duration = (
        (episode_start * episode_duration).replace(0, np.nan)
    )

    episode_prefix = (
        "episode." + episode_criterion
        + ".durationThreshold=" + str(min_duration) + "."
    )

    result = pd.DataFrame({
        "isEpisode": is_episode,
        "episodeId": episode_id,
        "episodeStart": episode_start,
        "episodeTotalDuration": episode_total_duration,
    })

    return result.add_prefix(episode_prefix)
def add_upload_info_to_cgm_records(groups, df):
    """Attach device information from upload records to cgm records.

    Args:
        groups: pandas groupby object over the full dataset; an "upload"
            group is looked up among its group keys
        df: cgm records with an "uploadId" column

    Returns:
        df left-merged (on uploadId) with whichever of the upload fields
        listed below are present and non-empty in the upload records, each
        prefixed with "upload.". df is returned unchanged when there is no
        "upload" group or the upload records carry no uploadId.
    """
    upload_locations = [
        "upload.uploadId",
        "upload.deviceManufacturers",
        "upload.deviceModel",
        "upload.deviceSerialNumber",
        "upload.deviceTags"
    ]

    # group keys are the values of the column the caller grouped by
    if "upload" not in groups.groups:
        return df

    upload = (
        groups.get_group("upload")
        .dropna(axis=1, how="all")
        .add_prefix("upload.")
    )

    # without the join key there is nothing to merge on
    if "upload.uploadId" not in upload.columns:
        return df

    # keep the declared order so the output columns are deterministic
    # (a set intersection gave an arbitrary column order)
    available = [col for col in upload_locations if col in upload.columns]

    return pd.merge(
        left=df,
        right=upload[available],
        left_on="uploadId",
        right_on="upload.uploadId",
        how="left"
    )
def get_dexcom_cgm_model(df):
    """Label cgm records with a Dexcom model (G4, G5, G6 or G5_G6).

    Each column named in ``dexcom_model_locations`` that exists in df is
    cast to str and searched, in list order, for "G4", "G5" and "G6"
    (case-insensitive). Later matches overwrite earlier ones, so both the
    order of the locations and the order of the checks matter.

    Args:
        df: cgm records; a "cgmModel" column must already exist, and
            "cgmModelSensedFrom" is expected to exist as well since it is
            selected in the return (the visible caller creates both).
            df is modified in place.

    Returns:
        DataFrame with the two columns "cgmModel" and "cgmModelSensedFrom"
    """
    # add cgm model

    # columns that may mention the cgm model, searched in this order
    dexcom_model_locations = [
        "deviceId",
        "deviceManufacturers",
        "upload.deviceManufacturers",
        "deviceModel",
        "upload.deviceModel",
        "deviceSerialNumber",
        "upload.deviceSerialNumber",
        "origin.payload.sourceRevision.source.name",
        "payload.transmitterGeneration",
        "payload.HKMetadataKeySyncIdentifier",
        "payload.transmitterId",
    ]

    for model_location in dexcom_model_locations:
        # only check if model has NOT been determined, or if it is G5_G6
        # NOTE(review): m_idx only decides whether this location is examined
        # at all; the assignments below are not restricted to m_idx, so rows
        # that already have a model can be overwritten -- confirm intended
        m_idx = (
            (df["cgmModel"].isnull())
            | (df["cgmModel"].astype(str).str.contains("G5_G6"))
        )

        # get index that matches model
        if ((model_location in list(df)) & (m_idx.sum() > 0)):
            # string accessor over the column cast to str (nulls become text)
            str_list = df[model_location].astype(str).str

            # G4
            g4_idx = str_list.contains("G4", case=False, na=False)
            df.loc[g4_idx, "cgmModel"] = "G4"
            df.loc[g4_idx, "cgmModelSensedFrom"] = model_location

            # G5
            g5_idx = str_list.contains("G5", case=False, na=False)
            df.loc[g5_idx, "cgmModel"] = "G5"
            df.loc[g5_idx, "cgmModelSensedFrom"] = model_location

            # G6
            g6_idx = str_list.contains("G6", case=False, na=False)
            df.loc[g6_idx, "cgmModel"] = "G6"
            df.loc[g6_idx, "cgmModelSensedFrom"] = model_location

            # edge case of g5 and g6: the text mentions both models
            g5_g6_idx = (g5_idx & g6_idx)
            df.loc[g5_g6_idx, "cgmModel"] = "G5_G6"
            df.loc[g5_g6_idx, "cgmModelSensedFrom"] = model_location

            # case of "transmitterId": infer the model from the id itself
            if (
                ("transmitterId" in model_location)
                | ("payload.HKMetadataKeySyncIdentifier" in model_location)
            ):
                # if length of string is 5, then it is likely a G4 sensor
                length5_idx = str_list.len() == 5
                df.loc[length5_idx, "cgmModel"] = "G4"
                df.loc[length5_idx, "cgmModelSensedFrom"] = model_location

                # if length of string > 5 then might be G5 or G6
                length_gt5_idx = str_list.len() > 5

                # if the id starts with 4 then likely G5
                starts4_idx = str_list.startswith("4")
                df.loc[(length_gt5_idx & starts4_idx), "cgmModel"] = "G5"
                df.loc[(length_gt5_idx & starts4_idx), "cgmModelSensedFrom"] = model_location

                # if the id starts with 2 or 8 then likely G6
                # (the variable is named 2_6 but the check is for 2 or 8)
                starts2_6_idx = (
                    (str_list.startswith("2")) | (str_list.startswith("8"))
                )
                df.loc[(length_gt5_idx & starts2_6_idx), "cgmModel"] = "G6"
                df.loc[(length_gt5_idx & starts2_6_idx), "cgmModelSensedFrom"] = model_location

    return df[["cgmModel", "cgmModelSensedFrom"]]
def remove_negative_durations(df):
    '''
    remove the records that have a negative (integer) duration

    taken from https://github.com/tidepool-org/data-analytics/blob/
    etn/get-settings-and-events/projects/get-donors-pump-settings/
    get-users-settings-and-events.py

    refactored name(s) to meet style guide
    refactored pandas field call to df["field"] instead of df.field
    refactored because physical activity includes embedded json, whereas
    the other fields in the data model require a integer

    Args:
        df: dataframe that may contain a "duration" column

    Returns:
        df: the dataframe without the negative duration records
        n_negative_durations: number of records removed, or np.nan if df
            has no "duration" column

    NOTE:
        the mask covers every row of df. Previously df was filtered with a
        mask that only covered the int-typed rows, which pandas cannot align
        when some durations are embedded json or missing.

    TODO: I think that durations are coming in as floats too, so we need
    to refactor to account for that.
    '''
    if "duration" not in list(df):
        return df, np.nan

    # only python ints are compared, embedded json and nulls are never < 0
    is_negative = df["duration"].apply(
        lambda val: type(val).__name__ == "int" and val < 0
    ).astype(bool)

    n_negative_durations = int(is_negative.sum())
    if n_negative_durations > 0:
        # positional mask, so the index of df does not need to be unique
        df = df[~is_negative.values]

    return df, n_negative_durations
def get_healthkit_timezone(df):
    '''
    get the timezone that healthkit embeds in the payload (HKTimeZone)

    Args:
        df: dataframe that may contain "payload", "timezone" and
            "deviceType" columns. df is modified in place.

    Returns:
        the "timezone" and "deviceType" columns of df. Records with an
        HKTimeZone get that timezone and the deviceType "healthkit"; all
        other records keep the values they already had (or null).

    NOTE:
        when there is no payload column the existing timezone and
        deviceType values are kept. They used to be overwritten with null.

    TODO: refactor to account for more efficient way to get embedded json
    '''
    if "payload" in list(df):
        df["payload.HKTimeZone"] = (
            expand_embedded_dict(df["payload"], "HKTimeZone")
        )
        hk_tz_idx = df["payload.HKTimeZone"].notnull()

        if "timezone" not in list(df):
            df.loc[hk_tz_idx, "deviceType"] = "healthkit"
            df.rename(columns={"payload.HKTimeZone": "timezone"}, inplace=True)
        else:
            df.loc[hk_tz_idx, "timezone"] = (
                df.loc[hk_tz_idx, "payload.HKTimeZone"]
            )
            df.loc[hk_tz_idx, "deviceType"] = "healthkit"

    # make sure both output columns exist, without touching existing values
    for output_column in ["timezone", "deviceType"]:
        if output_column not in list(df):
            df[output_column] = np.nan

    return df[["timezone", "deviceType"]]
def round_time(
    df,
    time_interval_minutes=5,
    start_with_first_record=True,
    return_calculation_columns=False
):
    '''
    A general purpose round time function that rounds the "time"
    field to the nearest time_interval_minutes minutes

    The records are sorted by time and split into chunks wherever the
    (rounded) time between consecutive records is more than 2 times the
    interval. Within a chunk every record is rounded relative to the first
    record of that chunk, so the rounding starts over after each big gap.

    INPUTS:
        * a dataframe (df) or time series that contains only one time field
        that you want to round
        * time_interval_minutes (defaults to 5 minutes given that most cgms
        output every 5 minutes)
        * start_with_first_record starts the rounding with the first record
        if True, and the last record if False (defaults to True)
        * return_calculation_columns specifies whether the extra columns
        used to make calculations are kept on df. NOTE: the return value
        is the array of rounded times either way.
    OUTPUT:
        * the values of the rounded time column, sorted by the original
        index
    refactored name(s) to meet style guide

    NOTE(review): when a dataframe is passed in, it is renamed, sorted and
    re-indexed in place, so the caller's object is modified -- confirm
    that all callers pass a series or a copy.
    '''
    # if a time series is passed in, convert to dataframe
    if "Series" in get_type(df):
        df = pd.DataFrame(df)
    columns_ = list(df)
    if len(columns_) > 1:
        # NOTE(review): sys.exit raises SystemExit, which stops the whole
        # process rather than just reporting a bad argument
        sys.exit(
            "Error: df should only have one time column"
        )
    else:
        df.rename(columns={columns_[0]: "t"}, inplace=True)

    # descending order makes the last record the reference of each chunk
    df.sort_values(
        by="t",
        ascending=start_with_first_record,
        inplace=True
    )

    # keep the original index so the original order can be restored.
    # NOTE(review): reset_index only names the new column "index" when the
    # index has no name; a named index would not be renamed to
    # "originalIndex" -- confirm that callers never pass a named index.
    df.reset_index(drop=False, inplace=True)
    df.rename(columns={"index": "originalIndex"}, inplace=True)

    # calculate the time between consecutive records, expressed in minutes
    # and rounded to a multiple of the interval
    df["t_shift"] = df["t"].shift(1)
    df["timeBetweenRecords"] = round(
        (df["t"] - df["t_shift"]).dt.days*(86400/(60 * time_interval_minutes))
        + (df["t"] - df["t_shift"]).dt.seconds/(60 * time_interval_minutes)
    ) * time_interval_minutes

    # separate the data into chunks if timeBetweenRecords is greater than
    # 2 times the interval (in minutes) so the rounding process
    # starts over
    big_gaps = list(
        df.query("abs(timeBetweenRecords) > "
                 + str(time_interval_minutes * 2)).index
    )
    # chunk boundaries: start of the data, every big gap, end of the data
    big_gaps.insert(0, 0)
    big_gaps.append(len(df))

    for gap_index in range(0, len(big_gaps) - 1):
        chunk = df["t"][big_gaps[gap_index]:big_gaps[gap_index+1]]
        first_chunk = df["t"][big_gaps[gap_index]]

        # calculate the time difference between
        # each time record and the first record
        # NOTE(review): the .loc slices below include the end label, i.e.
        # the first row of the next chunk. That row presumably gets a null
        # here and is overwritten on the next pass -- confirm.
        df.loc[
            big_gaps[gap_index]:big_gaps[gap_index+1],
            "minutesFromFirstRecord"
        ] = (
            (chunk - first_chunk).dt.days*(86400/60)
            + (chunk - first_chunk).dt.seconds/60
        )

        # then round to the nearest X Minutes
        # NOTE: the ".000001" ensures that multiples of 2:30 always round up.
        df.loc[
            big_gaps[gap_index]:big_gaps[gap_index+1],
            "roundedMinutesFromFirstRecord"
        ] = round(
            (df.loc[
                big_gaps[gap_index]:big_gaps[gap_index+1],
                "minutesFromFirstRecord"
            ] / time_interval_minutes) + 0.000001
        ) * (time_interval_minutes)

        # the reference record is rounded too (the extra microsecond plays
        # the same tie-breaking role as the ".000001" above)
        rounded_first_record = (
            first_chunk + pd.Timedelta("1microseconds")
        ).round(str(time_interval_minutes) + "min")

        # rounded time = rounded reference + rounded minutes from reference
        df.loc[
            big_gaps[gap_index]:big_gaps[gap_index+1],
            "roundedTime"
        ] = rounded_first_record + pd.to_timedelta(
            df.loc[
                big_gaps[gap_index]:big_gaps[gap_index+1],
                "roundedMinutesFromFirstRecord"
            ], unit="m"
        )

    # "t_shift" is not part of this list, so it stays on df
    if return_calculation_columns is False:
        df.drop(
            columns=[
                "timeBetweenRecords",
                "minutesFromFirstRecord",
                "roundedMinutesFromFirstRecord"
            ], inplace=True
        )
    # sort back to the original index
    df.sort_values(by="originalIndex", inplace=True)

    return df["roundedTime"].values
def remove_invalid_cgm_values(df):
    '''
    remove the cgm (cbg) records that are outside of the 38 to 402 mg/dL
    range; all other record types are left alone

    Args:
        df: dataframe with a "type" and a "mg/dL" column

    Returns:
        df: the dataframe without the invalid cgm records
        nRemoved: the number of records that were removed

    NOTE:
        rows are selected with a boolean mask. They used to be dropped by
        index label, which also removed valid rows whenever they shared an
        index label with an invalid row.
    '''
    nBefore = len(df)

    # remove values < 38 and > 402 mg/dL
    is_cbg = df["type"] == "cbg"
    out_of_range = (df["mg/dL"] < 38) | (df["mg/dL"] > 402)
    df = df.loc[~(is_cbg & out_of_range)]

    nRemoved = nBefore - len(df)

    return df, nRemoved
def get_timezone_offset(currentDate, currentTimezone):
    '''
    get the utc offset, in minutes, of a timezone around a given date

    Args:
        currentDate: timezone-naive datetime (or pandas Timestamp)
        currentTimezone (str): name of a timezone that pytz knows

    Returns:
        tzo (int): the number of minutes that are added to utc to get the
            local time (e.g., -480 for US/Pacific in the winter)

    NOTE:
        the offset is taken from utcoffset(). It used to be rebuilt from
        the "%z" string with floor(), which is wrong for negative offsets
        that are not whole hours: "-0330" (America/St_Johns) was returned
        as -310 instead of -210.
    '''
    tz = pytz.timezone(currentTimezone)
    # here we add 1 day to the current date to account for changes to/from DST
    localized_date = tz.localize(currentDate + dt.timedelta(days=1))
    tzo = int(localized_date.utcoffset().total_seconds() // 60)

    return tzo
def impute_upload_records(df, contDays, deviceTypeName):
    '''
    build the day series of a device type and carry its timezone (and time
    processing) from each day to the next row of the series

    Args:
        df: the records of one device type
        contDays: the contiguous day series that the result is merged into
        deviceTypeName (str): prefix of the columns that are added

    Returns:
        daySeries: contDays with the deviceTypeName columns added
    '''
    tz_col = deviceTypeName + ".timezone"
    tzo_col = deviceTypeName + ".timezoneOffset"
    processing_col = deviceTypeName + ".timeProcessing"

    daySeries = add_device_day_series(df, contDays, deviceTypeName)

    has_timezones = ((len(df) > 0) & (tz_col in daySeries))
    if not has_timezones:
        # nothing to impute from, so the columns are just added as nulls
        daySeries[tz_col] = np.nan
        daySeries[processing_col] = np.nan
        return daySeries

    for i in daySeries.index[1:]:
        previous = i - 1

        # fill a missing timezone with the one from the previous row
        if pd.isnull(daySeries[tz_col][i]):
            daySeries.loc[i, [tz_col]] = daySeries.loc[previous, tz_col]

        # the offset depends on the date, so it is recalculated per day
        if pd.notnull(daySeries[tz_col][i]):
            tz = daySeries.loc[i, tz_col]
            day = pd.to_datetime(daySeries.loc[i, "date"])
            daySeries.loc[i, tzo_col] = get_timezone_offset(day, tz)

        # the time processing of the previous row is carried along as well
        if pd.notnull(daySeries[processing_col][previous]):
            daySeries.loc[i, processing_col] = (
                daySeries.loc[previous, processing_col]
            )

    return daySeries
def estimateTzAndTzoWithUploadRecords(cDF):
    '''
    first estimate of the timezone (tz) and timezone offset (tzo) of each
    day, taken from the upload records

    Args:
        cDF: the contiguous day series; needs the "upload.timezoneOffset"
            and "home.imputed.timezoneOffset" columns, and
            "upload.timeProcessing" whenever "upload.timezone" is present

    Returns:
        cDF with the "est." columns added. Days with an upload timezone
        get the type "UPLOAD", and days where the offset differs from the
        home offset are annotated with "travel".

    NOTE:
        the columns that hold text are created as object columns. They
        used to be created as float (NaN) columns that were upcast as soon
        as the first label was written into them.
    '''
    cDF["est.type"] = pd.Series(np.nan, index=cDF.index, dtype=object)
    cDF["est.gapSize"] = np.nan
    cDF["est.timezoneOffset"] = cDF["upload.timezoneOffset"]
    cDF["est.annotations"] = pd.Series(np.nan, index=cDF.index, dtype=object)

    if "upload.timezone" in cDF:
        cDF.loc[cDF["upload.timezone"].notnull(), ["est.type"]] = "UPLOAD"
        cDF["est.timezone"] = cDF["upload.timezone"]
        cDF["est.timeProcessing"] = cDF["upload.timeProcessing"]
    else:
        cDF["est.timezone"] = pd.Series(
            np.nan, index=cDF.index, dtype=object
        )
        cDF["est.timeProcessing"] = pd.Series(
            np.nan, index=cDF.index, dtype=object
        )

    # an estimated offset that is not the home offset is marked as travel
    is_travel = (
        (cDF["est.timezoneOffset"] != cDF["home.imputed.timezoneOffset"])
        & (pd.notnull(cDF["est.timezoneOffset"]))
    )
    cDF.loc[is_travel, "est.annotations"] = "travel"

    return cDF
def estimateTzAndTzoWithDeviceRecords(cDF):
    '''
    estimate the timezone offset (tzo), and if possible the timezone (tz),
    from the pump and cgm records for the days that do not have an estimate

    Args:
        cDF: the contiguous day series with the "est.", "pump." and "cgm."
            columns

    Returns:
        cDF with the device based estimates and annotations filled in
    '''
    # 2A. compare the device TZO to the imputed day series (i.e., the upload
    # and home series) to see if the TZ can be inferred.
    # 2B. if that fails, compare the device TZO to the previous day's TZO,
    # and use the previous day's TZ estimate if they agree.
    # Each step is run for the pump first and then for the cgm, and 2A is
    # finished for both devices before 2B starts.
    comparison_steps = [
        compareDeviceTzoToImputedSeries,
        compareDeviceTzoToPrevDayTzo,
    ]
    for compare_device_tzo in comparison_steps:
        for deviceType in ["pump", "cgm"]:
            # days where a TZO estimate has not been made AND where the
            # device (e.g., pump or cgm) TZO has data
            needs_estimate = (
                (cDF["est.timezoneOffset"].isnull())
                & (cDF[deviceType + ".timezoneOffset"].notnull())
            )
            sIndices = cDF[needs_estimate].index
            cDF = compare_device_tzo(cDF, sIndices, deviceType)

    # 2C. check the DEVICE estimates to make sure that the pump and cgm tzo
    # do not differ by more than 60 minutes. A difference of up to 60
    # minutes is allowed because there are a lot of cases where the devices
    # are off because the user changes the time for DST at different times.
    devices_disagree = (
        (cDF["est.type"] == "DEVICE")
        & (cDF["pump.timezoneOffset"].notnull())
        & (cDF["cgm.timezoneOffset"].notnull())
        & (cDF["pump.timezoneOffset"] != cDF["cgm.timezoneOffset"])
    )
    sIndices = cDF[devices_disagree].index

    cgm_tzo = cDF.loc[sIndices, "cgm.timezoneOffset"]
    pump_tzo = cDF.loc[sIndices, "pump.timezoneOffset"]
    tzoDiffGT60 = abs(cgm_tzo - pump_tzo) > 60

    # days that differ by more than 60 minutes are marked as UNCERTAIN
    idx = tzoDiffGT60.index[tzoDiffGT60]
    cDF.loc[idx, ["est.type"]] = "UNCERTAIN"
    for i in idx:
        cDF = addAnnotation(cDF, i, "pump-cgm-tzo-mismatch")

    return cDF
def getListOfDSTChangeDays(cDF):
    '''
    get the days where the home timezone offset differs from the offset in
    the next row of the day series (i.e., the DST change days of the home
    timezone)

    Args:
        cDF: day series with "date" and "home.imputed.timezoneOffset"

    Returns:
        dstChangeDays: the "date" values of the days where the offset changes
    '''
    home_tzo = cDF["home.imputed.timezoneOffset"]
    # the last row has no next row, so it is never a change day
    offset_changes = abs(home_tzo - home_tzo.shift(-1)) > 0
    dstChangeDays = cDF.loc[offset_changes, "date"]

    return dstChangeDays
def tzoRangeWithComparisonTz(df, i, comparisonTz):
    '''
    get the range of timezone offsets of the timezone we compare against

    Args:
        df: not used, kept for the callers
        i: not used, kept for the callers
        comparisonTz: name of a timezone, or null if there is no estimate

    Returns:
        rangeTzos: the offsets of comparisonTz, or an empty array when
            comparisonTz is null
    '''
    # without a timezone estimate there is no range to compare against
    if pd.isnull(comparisonTz):
        return np.array([])

    return getRangeOfTZOsForTimezone(comparisonTz)
def compareDeviceTzoToPrevDayTzo(df, sIdx, device):
    '''
    try to estimate the timezone (tz) and timezone offset (tzo) of the days
    in sIdx by comparing the device tzo with the estimate of the previous
    row (i-1), or with the home timezone if the previous row has no estimate

    Args:
        df: the contiguous day series with the "est.", "home.imputed." and
            device columns
        sIdx: index of the days to estimate (index 0 is skipped, as it has
            no previous row)
        device (str): prefix of the device columns (e.g., "pump" or "cgm")

    Returns:
        df with est.type, est.timezone, est.timezoneOffset,
        est.timeProcessing and est.annotations updated for the days that
        could be classified
    '''

    for i in sIdx[sIdx > 0]:

        # first see if the previous record has a tzo
        if (pd.notnull(df.loc[i-1, "est.timezoneOffset"])):

            previousDayTz, dstRange = tzAndTzoRangePreviousDay(df, i)
            timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) -
                           df.loc[i-1, "est.timezoneOffset"])

            # next see if the previous record has a tz
            if (pd.notnull(df.loc[i-1, "est.timezone"])):

                # same offset as the previous day, so use its timezone.
                # NOTE: the return value is not needed here, the helper
                # writes to df in place through df.loc
                if timeDiff == 0:
                    assignTzoFromPreviousDay(df, i, previousDayTz)

                # see if the previous day's tzo and device tzo are within the
                # dst range (as that is a common problem with this data)
                elif ((df.loc[i, device + ".timezoneOffset"] in dstRange)
                      & (df.loc[i-1, "est.timezoneOffset"] in dstRange)):

                    # then see if it is DST change day
                    if isDSTChangeDay(df.loc[i, "date"], previousDayTz):

                        df = addAnnotation(df, i, "dst-change-day")
                        assignTzoFromPreviousDay(df, i, previousDayTz)

                    # if it is not DST change day, then mark this as uncertain
                    else:
                        # also, check to see if the difference between device.
                        # tzo and prev.tzo is less than the expected dst
                        # difference. There is a known issue where the BtUTC
                        # procedure puts clock drift into the device.tzo,
                        # and as a result the tzo can be off by 15, 30,
                        # or 45 minutes.
                        # Both offsets on an end of the dst range means
                        # that they differ by the full dst difference.
                        if (((df.loc[i, device + ".timezoneOffset"] ==
                              min(dstRange)) |
                            (df.loc[i, device + ".timezoneOffset"] ==
                             max(dstRange))) &
                            ((df.loc[i-1, "est.timezoneOffset"] ==
                              min(dstRange)) |
                             (df.loc[i-1, "est.timezoneOffset"] ==
                              max(dstRange)))):

                            df.loc[i, ["est.type"]] = "UNCERTAIN"
                            df = addAnnotation(df, i,
                                               "likely-dst-error-OR-travel")

                        # otherwise one of the offsets is in between
                        else:

                            df.loc[i, ["est.type"]] = "UNCERTAIN"
                            df = addAnnotation(df, i,
                                               "likely-15-min-dst-error")

                # next see if time difference between device.tzo and prev.tzo
                # is off by 720 minutes, which is indicative of a common
                # user AM/PM error
                elif timeDiff == 720:
                    df.loc[i, ["est.type"]] = "UNCERTAIN"
                    df = addAnnotation(df, i, "likely-AM-PM-error")

                # if it doesn't fall into any of these cases, then the
                # tzo difference is likely due to travel
                else:
                    df = assignTzoFromDeviceTzo(df, i, device)

            # the previous day has a tzo but no tz: the device tzo is only
            # used if it matches the previous tzo, otherwise the day is
            # left without an estimate
            elif timeDiff == 0:
                df = assignTzoFromDeviceTzo(df, i, device)

        # if there is no previous record to compare with check for dst errors,
        # and if there are no errors, it is likely a travel day
        else:

            # NOTE(review): tzAndTzoRangeWithHomeTz is not defined in the
            # part of the file that was reviewed -- confirm that it exists
            comparisonTz, dstRange = tzAndTzoRangeWithHomeTz(df, i)
            timeDiff = abs((df.loc[i, device + ".timezoneOffset"]) -
                           df.loc[i, "home.imputed.timezoneOffset"])

            if ((df.loc[i, device + ".timezoneOffset"] in dstRange)
                & (df.loc[i, "home.imputed.timezoneOffset"] in dstRange)):

                # see if it is DST change day
                if isDSTChangeDay(df.loc[i, "date"], comparisonTz):

                    # use the device tzo together with the home timezone
                    df = addAnnotation(df, i, "dst-change-day")
                    df.loc[i, ["est.type"]] = "DEVICE"
                    df.loc[i, ["est.timezoneOffset"]] = \
                        df.loc[i, device + ".timezoneOffset"]
                    df.loc[i, ["est.timezone"]] = \
                        df.loc[i, "home.imputed.timezone"]
                    df.loc[i, ["est.timeProcessing"]] = \
                        df.loc[i, device + ".upload.imputed.timeProcessing"]

                # if it is not DST change day, then mark this as uncertain
                else:
                    # also, check to see if the difference between device.
                    # tzo and the home tzo is less than the expected dst
                    # difference. There is a known issue where the BtUTC
                    # procedure puts clock drift into the device.tzo,
                    # and as a result the tzo can be off by 15, 30,
                    # or 45 minutes.
                    if (((df.loc[i, device + ".timezoneOffset"] ==
                          min(dstRange)) |
                        (df.loc[i, device + ".timezoneOffset"] ==
                         max(dstRange))) &
                        ((df.loc[i, "home.imputed.timezoneOffset"] ==
                          min(dstRange)) |
                         (df.loc[i, "home.imputed.timezoneOffset"] ==
                          max(dstRange)))):

                        df.loc[i, ["est.type"]] = "UNCERTAIN"
                        df = addAnnotation(df, i, "likely-dst-error-OR-travel")

                    else:

                        df.loc[i, ["est.type"]] = "UNCERTAIN"
                        df = addAnnotation(df, i, "likely-15-min-dst-error")

            # next see if time difference between device.tzo and the home tzo
            # is off by 720 minutes, which is indicative of a common
            # user AM/PM error
            elif timeDiff == 720:
                df.loc[i, ["est.type"]] = "UNCERTAIN"
                df = addAnnotation(df, i, "likely-AM-PM-error")

            # if it doesn't fall into any of these cases, then the
            # tzo difference is likely due to travel

            else:
                df = assignTzoFromDeviceTzo(df, i, device)

    return df
def addAnnotation(df, idx, annotationMessage):
    """Append an annotation message to row ``idx`` of ``est.annotations``.

    If the cell already holds a non-null value, the message is appended
    after a ``", "`` separator; otherwise the cell is set to the message.

    Args:
        df: dataframe with an ``est.annotations`` column; modified in place.
        idx: row label to annotate.
        annotationMessage: text to record.

    Returns:
        The same dataframe object, for call-site reassignment.
    """
    if pd.notnull(df.loc[idx, "est.annotations"]):
        df.loc[idx, ["est.annotations"]] = df.loc[idx, "est.annotations"] + \
            ", " + annotationMessage
    else:
        df.loc[idx, ["est.annotations"]] = annotationMessage

    return df


def getTimezoneOffset(currentDate, currentTimezone):
    """Return the UTC offset, in minutes, of a timezone one day after a date.

    Args:
        currentDate: naive datetime-like value (it is passed to
            ``tz.localize`` after one day is added).
        currentTimezone: timezone name accepted by ``pytz.timezone``.

    Returns:
        Signed integer number of minutes east of UTC (negative for zones
        west of UTC).
    """
    tz = pytz.timezone(currentTimezone)
    # here we add 1 day to the current date to account for changes to/from DST
    localizedDate = tz.localize(currentDate + dt.timedelta(days=1))

    # Take the offset straight from the localized datetime rather than
    # parsing the "+HHMM" text from strftime("%z"): splitting that number
    # with floor() mis-handles negative offsets that are not whole hours
    # (e.g. -0330 came out as -310 instead of -210) and drops the sign of
    # offsets smaller than one hour.
    tzo = int(round(localizedDate.utcoffset().total_seconds() / 60))

    return tzo
(df["timezoneOffset"].notnull())].copy() + cDays = add_device_day_series(cgmdf, cDays, "cgm") + + # create day series for pump df + pumpdf = df[(df.type == "bolus") & (df.timezoneOffset.notnull())].copy() + cDays = add_device_day_series(pumpdf, cDays, "pump") + + # interpolate between upload records of the same deviceType, and create a + # day series for interpolated pump, non-hk-cgm, and healthkit uploads + for deviceType in ["pump", "cgm", "healthkit"]: + tempUploaddf = df[df["deviceType"] == deviceType].copy() + cDays = impute_upload_records( + tempUploaddf, cDays, deviceType + ".upload.imputed" + ) + + # add a home timezone that also accounts for daylight savings time changes + cDays = add_home_timezone(df, cDays) + + # 1. USE UPLOAD RECORDS TO ESTIMATE TZ AND TZO + cDays = estimateTzAndTzoWithUploadRecords(cDays) + + # 2. USE DEVICE TZOs TO ESTIMATE TZO AND TZ (IF POSSIBLE) + # estimates can be made from pump and cgm df that have a TZO + # NOTE: the healthkit and dexcom-api cgm df are excluded + cDays = estimateTzAndTzoWithDeviceRecords(cDays) + + # 3. impute, infer, or interpolate gaps in the estimated tzo and tz + cDays = imputeTzAndTzo(cDays) + + # 4. 
APPLY LOCAL TIME ESTIMATES TO ALL df + local_time = applyLocalTimeEstimates(df, cDays) + + return local_time, cDays + + +# %% MAIN FUNCTION +def get_distribution_and_stats( + json_data_path, + userid, + date_stamp, + save_data_path +): + + phi_date = "PHI-" + date_stamp + + output_metadata = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-cgm-metadata" + ) + + output_distribution = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-cgm-distributions" + ) + debug_duplicates = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-debug-cgm-duplicates" + ) + output_stats = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-cgm-stats" + ) + + make_folder_if_doesnt_exist( + [output_metadata, output_distribution, debug_duplicates, output_stats] + ) + + timezone_aliases = pd.read_csv( + "wikipedia-timezone-aliases-2018-04-28.csv", + low_memory=False + ) + + donor_metadata_columns = [ + 'userid', + 'diagnosisType', + 'diagnosisDate', + 'biologicalSex', + 'birthday', + 'targetTimezone', + 'targetDevices', + 'isOtherPerson', + ] + + # load in data or pull in data + if pd.notnull(json_data_path): + data = pd.read_json(json_data_path) + + else: + data, userid = get_data( + save_file="false" + ) + + # load in donor metadata + donor_meta_path = os.path.join( + save_data_path, + phi_date + "-donor-data", + phi_date + "-donor-metadata.csv" + ) + if os.path.exists(donor_meta_path): + + all_donor_metadata = pd.read_csv( + donor_meta_path, + low_memory=False + ) + + metadata = all_donor_metadata.loc[ + all_donor_metadata["userid"] == userid, + donor_metadata_columns + ] + else: + metadata, _ = get_shared_metadata( + donor_group="bigdata", + userid_of_shared_user=userid + ) + + print("starting", userid) + + # HASH USER ID + hashid = hash_userid(userid, os.environ['BIGDATA_SALT']) + data["userid"] = userid + data["hashid"] = hashid + metadata["hashid"] = hashid + + # CLEAN DATA + + # NOTE: 
moving remove negative durations to type specific cleaning + # TODO: ask backend to change "duration" to only include one object type + + # Tslim calibration bug fix + data, n_cal_readings = tslim_calibration_fix(data.copy()) + metadata["nTandemAndPayloadCalReadings"] = n_cal_readings + + # fix large timzoneOffset bug in utcbootstrapping + data = timezone_offset_bug_fix(data.copy()) + + # add healthkit timezome information + # TODO: refactor this function to only require fields that might have hk tz + data[["timezone", "deviceType"]] = get_healthkit_timezone(data.copy()) + + # convert deprecated timezones to their aliases + data = convert_deprecated_timezone_to_alias(data, timezone_aliases) + + # TIME RELATED ITEMS + data["utcTime"] = to_utc_datetime(data[["time"]].copy()) + + # add upload time to the data, which is needed for: + # getting rid of duplicates and useful for getting local time + + data["uploadTime"] = ( + add_upload_time(data[["type", "uploadId", "utcTime"]].copy()) + ) + +# # estimate local time (refactor of estimate-local-time.py) +# data["localTime"], local_time_metadata = estimate_local_time(data.copy()) +# +# TODO: fix this issue with estimate local time +# ''' +# //anaconda3/envs/tbddp/lib/python3.7/site-packages/pandas/core/ops.py:1649 +# FutureWarning: elementwise comparison failed; returning scalar instead, +# but in the future will perform elementwise comparison result = method(y) +# ''' + + # round all data to the nearest 5 minutes + data["roundedUtcTime"] = round_time( + data["utcTime"].copy(), + time_interval_minutes=5, + start_with_first_record=True, + return_calculation_columns=False + ) + + # TIME CATEGORIES + data["date"] = data["roundedUtcTime"].dt.date + + # AGE, & YLW + # TODO: make this a function + if pd.notnull(metadata["birthday"].values[0]): + bDate = pd.to_datetime(metadata["birthday"].values[0][0:7]) + data["age"] = np.floor((data["roundedUtcTime"] - bDate).dt.days/365.25) + else: + data["age"] = np.nan + + if 
pd.notnull(metadata["diagnosisDate"].values[0]): + dDate = pd.to_datetime(metadata["diagnosisDate"].values[0][0:7]) + data["ylw"] = np.floor((data["roundedUtcTime"] - dDate).dt.days/365.25) + else: + data["ylw"] = np.nan + + # GROUP DATA BY TYPE + # first sort by upload time (used when removing dumplicates) + data.sort_values("uploadTime", ascending=False, inplace=True) + groups = data.groupby(by="type") + + # check to see if person is looping + if "basal" in data["type"].unique(): + basal = groups.get_group("basal").dropna(axis=1, how="all") + if "deliveryType" in list(basal): + bd = basal.loc[ + basal["deliveryType"] == "temp", + ["date", "deliveryType"] + ] + temp_basal_counts = ( + pd.DataFrame( + bd.groupby("date").deliveryType.count() + ).reset_index() + ) + temp_basal_counts.rename( + {"deliveryType": "tempBasalCounts"}, + axis=1, + inplace=True + ) + data = pd.merge(data, temp_basal_counts, on="date", how="left") + # >= 25 temp basals per day is likely looping + data["isLoopDay"] = data["tempBasalCounts"] >= 25 + # redefine groups with the new data + groups = data.groupby(by="type") + + else: + data["isLoopDay"] = np.nan + else: + data["isLoopDay"] = np.nan + + # %% CGM DATA + if "cbg" in data["type"].unique(): + # sort data with + metadata["cgmData"] = True + + # filter by cgm + cgm = groups.get_group("cbg").copy() + + # sort data + cgm.sort_values("roundedUtcTime", ascending=False, inplace=True) + cgm.reset_index(drop=False, inplace=True) + + # calculate cgm in mg/dL + cgm["mg/dL"] = round(cgm["value"] * MGDL_PER_MMOLL) + + # get rid of spike data + cgm, nSpike = remove_spike_data(cgm.copy()) + metadata["nSpike"] = nSpike + + # assign upload cgm device info to cgm records in that upload + cgm = add_upload_info_to_cgm_records(groups, cgm.copy()) + + # check to see if cgm info exists in healthkit locations + cgm = expand_heathkit_cgm_fields(cgm.copy()) + + # replace smoothed cgm values with raw values (if they exist) + # this must run after 
expand_heathkit_cgm_fields _ + cgm["mg/dL"], metadata["nSmoothedCgmReplaced"] = ( + replace_smoothed_cgm_values(cgm.copy()) + ) + + # get cgm models + cgm["cgmModel"], cgm["cgmModelSensedFrom"] = np.nan, np.nan + + # dexcom cgm models (G4, G5, G6) + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_dexcom_cgm_model(cgm.copy()) + ) + + # for non dexcom cgms + # 670G, 640G, 630G, 530G, 523/723, libre, animas, and tandem + cgm[["cgmModel", "cgmModelSensedFrom"]] = ( + get_non_dexcom_cgm_model(cgm.copy()) + ) + + # get metadata on cgm models and devices + metadata["nMissingCgmModels"] = cgm["cgmModel"].isnull().sum() + metadata["uniqueCgmModels"] = str(cgm["cgmModel"].unique()) + if "deviceId" in list(cgm): + metadata["uniqueCgmDevices"] = str(cgm["deviceId"].unique()) + + # clean distributions + # break up all traces by cgm model + combined_cgm_series = pd.DataFrame() + cgm_models = cgm.groupby(by="cgmModel") + + for cgm_model in cgm_models.groups.keys(): + print("working on", cgm_model) + temp_cgm = cgm_models.get_group(cgm_model) + + # get rid of cgm values too low/high (< 38 & > 402 mg/dL) + temp_cgm, nInvalidCgmValues = remove_invalid_cgm_values(temp_cgm) + metadata["nInvalidCgmValues." + cgm_model] = nInvalidCgmValues + + # sort by upload time before getting rid of duplicates + temp_cgm.sort_values("uploadTime", ascending=False, inplace=True) + + # get rid of duplicates that have the same ["deviceTime", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "deviceTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedDeviceTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same ["time", "mg/dL"] + temp_cgm, n_cgm_dups_removed = ( + removeCgmDuplicates(temp_cgm, "utcTime", "mg/dL") + ) + metadata["nCgmDuplicatesRemovedUtcTime." 
+ cgm_model] = ( + n_cgm_dups_removed + ) + + # get rid of duplicates that have the same roundedTime + temp_cgm, n_cgm_dups_removed = ( + removeDuplicates(temp_cgm, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime." + cgm_model] = ( + n_cgm_dups_removed + ) + + # create a contiguous 5 minute time series + first_day = temp_cgm["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = temp_cgm["roundedUtcTime"].max() + metadata["lastCgm." + cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=False + ).reset_index(drop=True) + + # merge with cgm data + cgm_series = pd.merge( + contiguous_data, + temp_cgm[[ + "roundedUtcTime", "hashid", "isLoopDay", + "cgmModel", "age", "ylw", "mg/dL" + ]], + on="roundedUtcTime", + how="left" + ) + + # sort so that the oldest data point is on top + cgm_series.sort_values( + "roundedUtcTime", ascending=True, inplace=True + ) + cgm_series.reset_index(drop=True, inplace=True) + + # get dexcom icgm bins + value_bins = np.array( + [37, 39, 60, 80, 120, 160, 200, 250, 300, 350, 400, 403] + ) + value_bin_names = ( + "< 40", "40-60", "61-80", "81-120", "121-160", "161-200", + "201-250", "251-300", "301-350", "351-400", "> 400" + ) + cgm_series["valueBin"] = pd.cut( + cgm_series["mg/dL"], value_bins, labels=value_bin_names + ) + + # get the previous val + cgm_series["previousVal"] = cgm_series["mg/dL"].shift(1) + + # get difference between current and previous val + cgm_series["diffFromPrevVal"] = ( + cgm_series["mg/dL"] - cgm_series["previousVal"] + ) + + # calculate the rate from previous value (mg/dL/min) + cgm_series["rateFromPrevVal"] = cgm_series["diffFromPrevVal"] / 5 + + # get dexcom icgm rate bins + rate_bins = np.array( + [-100, -2.000001, -1.000001, -0.000001, 1, 2, 100] + ) + # NOTE: bracket means include, parentheses means exclude + 
rate_bin_names = ( + "< -2", "[-2,-1)", "[-1,-0)", "[0,1]", "(1,2]", ">2", + ) + cgm_series["rateBin"] = pd.cut( + cgm_series["rateFromPrevVal"], rate_bins, labels=rate_bin_names + ) + + # through in the join category + cgm_series["valAndRateBin"] = ( + cgm_series["valueBin"].astype(str) + + " & " + + cgm_series["rateBin"].astype(str) + ) + + # calculate slope (mg/dL/min) over the last 15, 30, and 60 minutes + cgm_series["slope15"] = ( + cgm_series["mg/dL"].rolling(3).apply(get_slope, raw=True) + ) + + cgm_series["slope30"] = ( + cgm_series["mg/dL"].rolling(6).apply(get_slope, raw=True) + ) + + cgm_series["slope60"] = ( + cgm_series["mg/dL"].rolling(12).apply(get_slope, raw=True) + ) + + # add in the next value + cgm_series["nextVal"] = cgm_series["mg/dL"].shift(-1) + + # get difference or relative increase/decrease of next value + cgm_series["relativeNextValue"] = ( + cgm_series["nextVal"] - cgm_series["mg/dL"] + ) + + # rate of next value + cgm_series["rateToNextVal"] = cgm_series["relativeNextValue"] / 5 + + # drop rows where there is no information + cgm_series.dropna(subset=['hashid'], inplace=True) + metadata["nCgmDataPoints." 
+ cgm_model] = len(cgm_series) + + # append cgm model to a larger table + combined_cgm_series = pd.concat( + [combined_cgm_series, cgm_series], + ignore_index=True + ) + if len(combined_cgm_series) > 0: + # sort so that the oldest data point is on top + # and that the G5_G6 get deleted if they are apart of a duplicate + combined_cgm_series["cgmModel_G5_and_G6"] = ( + combined_cgm_series["cgmModel"] == "G5_G6" + ) + combined_cgm_series.sort_values( + by=["roundedUtcTime", "cgmModel_G5_and_G6", "cgmModel"], + ascending=[False, True, False], + inplace=True + ) + + combined_cgm_series.reset_index(drop=True, inplace=True) + + # add in check to see if there are duplicates between cgm devices + nUnique_cgm_times = len(combined_cgm_series["roundedUtcTime"].unique()) + cgm_len = len(combined_cgm_series) + metadata["duplicateCgmDataIssue"] = nUnique_cgm_times != cgm_len + + nDuplicate_cgm = cgm_len - nUnique_cgm_times + metadata["nDuplicateCgmDataIssues"] = nDuplicate_cgm + + # if there are still duplicates, get rid of them + if nDuplicate_cgm > 0: + # save the duplicates for further examination + combined_cgm_series.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-series-has-cgm-duplicates.csv.gz" + )) + + cgm.to_csv(os.path.join( + debug_duplicates, + "PHI-" + userid + "-cgm-data-has-cgm-duplicates.csv.gz" + )) + + # get rid of duplicates + combined_cgm_series, n_cgm_dups_removed = ( + removeDuplicates(combined_cgm_series, "roundedUtcTime") + ) + metadata["nCgmDuplicatesRemovedRoundedTime.atEnd"] = ( + n_cgm_dups_removed + ) + metadata["nCgmDataPoints.atEnd"] = len(combined_cgm_series) + + # add whether data is dexcom cgm or not + combined_cgm_series["dexcomCgm"] = ( + combined_cgm_series["cgmModel"].astype(str).str.contains("G4|G5|G6") + ) + + # save distribution data + combined_cgm_series.to_csv(os.path.join( + output_distribution, + "PHI-" + userid + "-cgm-distribution.csv.gz" + )) + + # %% get cgm stats + # create a contiguous 5 minute time series of 
ALL cgm data + first_day = combined_cgm_series["roundedUtcTime"].min() + metadata["firstCgm." + cgm_model] = first_day + + last_day = combined_cgm_series["roundedUtcTime"].max() + metadata["lastCgm." + cgm_model] = last_day + + rng = pd.date_range(first_day, last_day, freq="5min") + contiguous_data = pd.DataFrame( + rng, + columns=["roundedUtcTime"] + ).sort_values( + "roundedUtcTime", + ascending=True + ).reset_index(drop=True) + + # merge with combined_cgm_series data + all_cgm = pd.merge( + contiguous_data, + combined_cgm_series[[ + 'roundedUtcTime', 'hashid', 'cgmModel', 'dexcomCgm', + 'age', 'ylw', 'isLoopDay', 'mg/dL', + ]], + on="roundedUtcTime", + how="left" + ) + + # get cgm stats + # get a binary (T/F) of whether we have a cgm value + all_cgm["hasCgm"] = all_cgm["mg/dL"].notnull() + + # fill isLoopDay nan with False + all_cgm["isLoopDay"].fillna(False, inplace=True) + + # has loop and cgm + all_cgm["hasLoopAndCgm"] = ( + (all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + all_cgm["hasCgmWithoutLoop"] = ( + (~all_cgm["isLoopDay"]) & (all_cgm["hasCgm"]) + ) + + # work with all of the non-null data, even 39 = LOW and 401 = HIGH + ts39_401 = all_cgm["mg/dL"].copy() + + # some stats should NOT include 39 or 401 + all_cgm["mg/dL.40to400"] = ( + ts39_401.replace(to_replace=39, value=np.nan) + ) + + all_cgm["mg/dL.40to400"] = ( + all_cgm["mg/dL.40to400"].replace( + to_replace=401, + value=np.nan + ) + ) + + ts40_400 = all_cgm["mg/dL.40to400"].copy() + + # for all the less than (<) criteria + for cgm_threshold in [40, 54, 70]: + all_cgm["cgm < " + str(cgm_threshold)] = ( + ts39_401.lt(cgm_threshold) + ) + # get episodes below these thresholds + for min_duration in [5, 15]: + episode_ts = get_episodes( + all_cgm[[ + "roundedUtcTime", + "hasCgm", + "cgm < " + str(cgm_threshold) + ]].copy(), + episode_criterion="cgm < " + str(cgm_threshold), + min_duration=min_duration + ) + all_cgm = pd.concat([all_cgm, episode_ts], axis=1) + + # for all the greter than or equal to 
(>=) criteria + all_cgm["cgm >= " + str(cgm_threshold)] = ( + ts39_401.ge(cgm_threshold) + ) + + # for all the the less than or equal to (<=) criteria + for cgm_threshold in [140, 180, 250, 300, 400]: + all_cgm["cgm <= " + str(cgm_threshold)] = ( + ts39_401.le(cgm_threshold) + ) + # for all the the greter than (>) criteria + all_cgm["cgm > " + str(cgm_threshold)] = ( + ts39_401.gt(cgm_threshold) + ) + + # get all of the cgm ranges + # (cgm >= 40) & (cgm < 54) + all_cgm["40 <= cgm < 54"] = ( + (all_cgm["cgm >= 40"]) & (all_cgm["cgm < 54"]) + ) + + # (cgm >= 54) & (cgm < 70) + all_cgm["54 <= cgm < 70"] = ( + (all_cgm["cgm >= 54"]) & (all_cgm["cgm < 70"]) + ) + + # (cgm >= 70) & (cgm <= 140) + all_cgm["70 <= cgm <= 140"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 140"]) + ) + + # (cgm >= 70) & (cgm <= 180) + all_cgm["70 <= cgm <= 180"] = ( + (all_cgm["cgm >= 70"]) & (all_cgm["cgm <= 180"]) + ) + + # (cgm > 180) & (cgm <= 250) + all_cgm["180 < cgm <= 250"] = ( + (all_cgm["cgm > 180"]) & (all_cgm["cgm <= 250"]) + ) + + # (cgm > 250) & (cgm <= 400) + all_cgm["250 < cgm <= 400"] = ( + (all_cgm["cgm > 250"]) & (all_cgm["cgm <= 400"]) + ) + + # derfine the windows to calculate the stats over + window_names = ["hour", "day", "week", "month", "quarter", "year"] + window_lengths = [12, 288, 288*7, 288*7*4, 288*90, 288*365] + + for w_name, w_len in zip(window_names, window_lengths): + # require lenth of window for percent calculations + w_min = w_len + + # get the start and end times for each window + all_cgm[w_name + ".startTime"] = ( + all_cgm["roundedUtcTime"].shift(w_len - 1) + ) + all_cgm[w_name + ".endTime"] = all_cgm["roundedUtcTime"] + + # add majority age for the time period + all_cgm[w_name + ".age"] = np.round( + all_cgm["age"].rolling( + min_periods=1, + window=w_len + ).mean() + ) + + # add majority ylw for the time period + all_cgm[w_name + ".ylw"] = np.round( + all_cgm["ylw"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get percent time 
cgm used + all_cgm[w_name + ".cgmPercent"] = ( + all_cgm["hasCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".missingCgmPercent"] = ( + 1 - all_cgm[w_name + ".cgmPercent"] + ) + + # create (T/F) 70 and 80 percent available thresholds + # which will be useful for processing later + all_cgm[w_name + ".ge70Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.7 + ) + + all_cgm[w_name + ".ge80Available"] = ( + all_cgm[w_name + ".cgmPercent"] >= 0.8 + ) + + # get percent time Loop was used NOTE: this is + # approximate because we use > 24 temp basals per day + # ALSO: this is percent time Loop was used while cgm in use + all_cgm[w_name + ".loopingAndCgmPercent"] = ( + all_cgm["hasLoopAndCgm"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent of time cgm without loop + all_cgm[w_name + ".cgmWithoutLoopPercent"] = ( + all_cgm["hasCgmWithoutLoop"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # get episode stats + # TODO: add in hyper events + # get episodes below these thresholds + for cgm_threshold in [40, 54, 70]: + # get number of episodes per time window + for min_duration in [5, 15]: + "cgm < " + str(cgm_threshold) + episode_name = ( + "episode.cgm < " + str(cgm_threshold) + + ".durationThreshold=" + str(min_duration) + ) + all_cgm[w_name + ".count." + episode_name] = ( + all_cgm[episode_name + ".episodeStart"].rolling( + min_periods=1, + window=w_len + ).sum() + ) + + # get avg. duration of each episode per time window + all_cgm[w_name + ".avgDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).sum() / all_cgm[w_name + ".count." + episode_name] + ) + + # get min duration of each episode per time window + all_cgm[w_name + ".minDuration." 
+ episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).min() + ) + + # get median duration of each episode per time window + all_cgm[w_name + ".medianDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).median() + ) + + # get max duration of each episode per time window + all_cgm[w_name + ".maxDuration." + episode_name] = ( + all_cgm[episode_name + ".episodeTotalDuration"].rolling( + min_periods=1, + window=w_len + ).max() + ) + + # get percent time in different ranges + # % Time < 54 + all_cgm[w_name + ".lt54Percent"] = ( + all_cgm["cgm < 54"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 54-70 (cgm >= 54) & (cgm < 70) + all_cgm[w_name + ".bt54_70Percent"] = ( + all_cgm["54 <= cgm < 70"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 180) + all_cgm[w_name + ".bt70_180Percent"] = ( + all_cgm["70 <= cgm <= 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time in 180-250 (cgm > 180) & (cgm <= 250) + all_cgm[w_name + ".bt180_250Percent"] = ( + all_cgm["180 < cgm <= 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # % Time > 250 + all_cgm[w_name + ".gt250Percent"] = ( + all_cgm["cgm > 250"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # check that all of the percentages add of to 1 or 100% + all_cgm[w_name + ".percentCheck"] = ( + all_cgm[w_name + ".missingCgmPercent"] + + all_cgm[w_name + ".lt54Percent"] + + all_cgm[w_name + ".bt54_70Percent"] + + all_cgm[w_name + ".bt70_180Percent"] + + all_cgm[w_name + ".bt180_250Percent"] + + all_cgm[w_name + ".gt250Percent"] + ) + + # here are some other less common percent time in ranges + # % Time < 70 + all_cgm[w_name + ".lt70Percent"] = ( + all_cgm["cgm < 70"].rolling( + min_periods=w_min, + window=w_len + 
).sum() / w_len + ) + + # % Time in target range (cgm >= 70) & (cgm <= 140) + all_cgm[w_name + ".tir70to140Percent"] = ( + all_cgm["70 <= cgm <= 140"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # percent time above a threshold + # % Time > 180 + all_cgm[w_name + ".gt180Percent"] = ( + all_cgm["cgm > 180"].rolling( + min_periods=w_min, + window=w_len + ).sum() / w_len + ) + + # quantiles + # NOTE: this will increase run time, so only run if you need + # 3-4X the processing time since it has to sort the data + # TODO: make this an option to the function, once it is made + # create a rolling object + + # NOTE: these calculations only require 3 points to make + roll39_401 = ts39_401.rolling(min_periods=3, window=w_len) + roll40_400 = ts40_400.rolling(min_periods=3, window=w_len) + + # min + all_cgm[w_name + ".min"] = roll39_401.min() + + # 10, 25, 75, and 90th percentiles + all_cgm[w_name + ".10th"] = roll39_401.quantile(0.10) + all_cgm[w_name + ".25th"] = roll39_401.quantile(0.25) + all_cgm[w_name + ".75th"] = roll39_401.quantile(0.75) + all_cgm[w_name + ".90th"] = roll39_401.quantile(0.90) + + # max + all_cgm[w_name + ".max"] = roll39_401.max() + + # median + all_cgm[w_name + ".median"] = roll39_401.median() + + # iqr + all_cgm[w_name + ".iqr"] = ( + all_cgm[w_name + ".75th"] - all_cgm[w_name + ".25th"] + ) + + # recalcuate percent of measurements available + all_cgm[w_name + ".40to400availablePercent"] = ( + roll40_400.count() / w_len + ) + + # get the total number of non-null values over this time period + all_cgm[w_name + ".40to400missingPercent"] = ( + 1 - all_cgm[w_name + ".40to400availablePercent"] + ) + + all_cgm[w_name + ".40to400ge70Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.7 + ) + + all_cgm[w_name + ".40to400ge80Available"] = ( + all_cgm[w_name + ".40to400availablePercent"] >= 0.8 + ) + + # mean + all_cgm[w_name + ".mean"] = roll40_400.mean() + + # GMI(%) = 3.31 + 0.02392 x [mean glucose in mg/dL] + 
all_cgm[w_name + ".gmi"] = ( + 3.31 + (0.02392 * all_cgm[w_name + ".mean"]) + ) + + # standard deviation (std) + all_cgm[w_name + ".std"] = roll40_400.std() + + # coefficient of variation (cov) = std / mean + all_cgm[w_name + ".cov"] = ( + all_cgm[w_name + ".std"] / all_cgm[w_name + ".mean"] + ) + + # %% save cgm stats data + all_cgm.to_csv(os.path.join( + output_stats, + "PHI-" + userid + "-cgm-stats.csv.gz" + )) + # write the most recent example of the 90 day stats + # to the metadata + quarter_ge80Available_idx = ( + all_cgm[all_cgm["quarter.ge80Available"]] + ).index.max() + + if pd.notnull(quarter_ge80Available_idx): + # get the most recent quarter + most_recent = all_cgm.loc[ + [quarter_ge80Available_idx], + all_cgm.columns + ] + else: + most_recent = all_cgm.loc[ + [all_cgm.index.max()], + all_cgm.columns + ] + + metadata = pd.merge( + metadata, + most_recent, + on="hashid", + how="left" + ) + + print(metadata.T) + + else: + metadata["cgmData"] = False + print(userid, " has no cgm data") + + # save metadata + metadata.to_csv(os.path.join( + output_metadata, + "PHI-" + userid + "-cgm-metadata.csv.gz" + )) + + print("finished with", userid, "\n") + + return + + +# %% MAIN +if __name__ == "__main__": + # USER INPUTS (choices to be made in order to run the code) + codeDescription = "get distribution and stats for donor json data" + parser = argparse.ArgumentParser(description=codeDescription) + + parser.add_argument( + "-i", + "--input-json-data-path", + dest="json_data_path", + default=np.nan, + help=( + "the path where the json data is located, defaults to none and" + + " will download your data using your Tidepool credentials" + ) + ) + + parser.add_argument( + "-u", + "--userid", + dest="userid", + default=np.nan, + help="userid and filename" + ) + + parser.add_argument( + "-d", + "--date-stamp", + dest="date_stamp", + default=dt.datetime.now().strftime("%Y-%m-%d"), + help="date, in '%Y-%m-%d' format, of the date when " + + "donors were accepted" + ) + + 
parser.add_argument( + "-o", + "--output-data-path", + dest="data_path", + default=os.path.abspath( + os.path.join( + os.path.dirname(__file__), "..", "data" + ) + ), + help="the output path where the data is stored" + ) + + args = parser.parse_args() + + # the main function + get_distribution_and_stats( + json_data_path=args.json_data_path, + userid=args.userid, + date_stamp=args.date_stamp, + save_data_path=args.data_path, + ) diff --git a/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv b/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv new file mode 100644 index 00000000..01370b69 --- /dev/null +++ b/projects/bigdata-processing-pipeline/get_stats/wikipedia-timezone-aliases-2018-04-28.csv @@ -0,0 +1,206 @@ +tz,alias +Africa/Addis_Ababa,Africa/Nairobi +Africa/Asmara,Africa/Nairobi +Africa/Bamako,Africa/Abidjan +Africa/Bangui,Africa/Lagos +Africa/Banjul,Africa/Abidjan +Africa/Blantyre,Africa/Maputo +Africa/Brazzaville,Africa/Lagos +Africa/Bujumbura,Africa/Maputo +Africa/Conakry,Africa/Abidjan +Africa/Dakar,Africa/Abidjan +Africa/Dar_es_Salaam,Africa/Nairobi +Africa/Djibouti,Africa/Nairobi +Africa/Douala,Africa/Lagos +Africa/Freetown,Africa/Abidjan +Africa/Gaborone,Africa/Maputo +Africa/Harare,Africa/Maputo +Africa/Kampala,Africa/Nairobi +Africa/Kigali,Africa/Maputo +Africa/Kinshasa,Africa/Lagos +Africa/Libreville,Africa/Lagos +Africa/Lome,Africa/Abidjan +Africa/Luanda,Africa/Lagos +Africa/Lubumbashi,Africa/Maputo +Africa/Lusaka,Africa/Maputo +Africa/Malabo,Africa/Lagos +Africa/Maseru,Africa/Johannesburg +Africa/Mbabane,Africa/Johannesburg +Africa/Mogadishu,Africa/Nairobi +Africa/Niamey,Africa/Lagos +Africa/Nouakchott,Africa/Abidjan +Africa/Ouagadougou,Africa/Abidjan +Africa/Porto-Novo,Africa/Lagos +Africa/Sao_Tome,Africa/Lagos +Africa/Timbuktu,Africa/Abidjan +America/Anguilla,America/Port_of_Spain +America/Antigua,America/Port_of_Spain 
+America/Argentina/ComodRivadavia,America/Argentina/Catamarca +America/Aruba,America/Curacao +America/Atka,America/Adak +America/Buenos_Aires,America/Argentina/Buenos_Aires +America/Catamarca,America/Argentina/Catamarca +America/Cayman,America/Panama +America/Coral_Harbour,America/Atikokan +America/Cordoba,America/Argentina/Cordoba +America/Dominica,America/Port_of_Spain +America/Ensenada,America/Tijuana +America/Fort_Wayne,America/Indiana/Indianapolis +America/Grenada,America/Port_of_Spain +America/Guadeloupe,America/Port_of_Spain +America/Indianapolis,America/Indiana/Indianapolis +America/Jujuy,America/Argentina/Jujuy +America/Knox_IN,America/Indiana/Knox +America/Kralendijk,America/Curacao +America/Louisville,America/Kentucky/Louisville +America/Lower_Princes,America/Curacao +America/Marigot,America/Port_of_Spain +America/Mendoza,America/Argentina/Mendoza +America/Montreal,America/Toronto +America/Montserrat,America/Port_of_Spain +America/Porto_Acre,America/Rio_Branco +America/Rosario,America/Argentina/Cordoba +America/Santa_Isabel,America/Tijuana +America/Shiprock,America/Denver +America/St_Barthelemy,America/Port_of_Spain +America/St_Kitts,America/Port_of_Spain +America/St_Lucia,America/Port_of_Spain +America/St_Thomas,America/Port_of_Spain +America/St_Vincent,America/Port_of_Spain +America/Tortola,America/Port_of_Spain +America/Virgin,America/Port_of_Spain +Antarctica/McMurdo,Pacific/Auckland +Antarctica/South_Pole,Pacific/Auckland +Arctic/Longyearbyen,Europe/Oslo +Asia/Aden,Asia/Riyadh +Asia/Ashkhabad,Asia/Ashgabat +Asia/Bahrain,Asia/Qatar +Asia/Calcutta,Asia/Kolkata +Asia/Chongqing,Asia/Shanghai +Asia/Chungking,Asia/Shanghai +Asia/Dacca,Asia/Dhaka +Asia/Harbin,Asia/Shanghai +Asia/Istanbul,Europe/Istanbul +Asia/Kashgar,Asia/Urumqi +Asia/Katmandu,Asia/Kathmandu +Asia/Kuwait,Asia/Riyadh +Asia/Macao,Asia/Macau +Asia/Muscat,Asia/Dubai +Asia/Phnom_Penh,Asia/Bangkok +Asia/Rangoon,Asia/Yangon +Asia/Saigon,Asia/Ho_Chi_Minh +Asia/Tel_Aviv,Asia/Jerusalem
+Asia/Thimbu,Asia/Thimphu +Asia/Ujung_Pandang,Asia/Makassar +Asia/Ulan_Bator,Asia/Ulaanbaatar +Asia/Vientiane,Asia/Bangkok +Atlantic/Faeroe,Atlantic/Faroe +Atlantic/Jan_Mayen,Europe/Oslo +Atlantic/St_Helena,Africa/Abidjan +Australia/ACT,Australia/Sydney +Australia/Canberra,Australia/Sydney +Australia/LHI,Australia/Lord_Howe +Australia/North,Australia/Darwin +Australia/NSW,Australia/Sydney +Australia/Queensland,Australia/Brisbane +Australia/South,Australia/Adelaide +Australia/Tasmania,Australia/Hobart +Australia/Victoria,Australia/Melbourne +Australia/West,Australia/Perth +Australia/Yancowinna,Australia/Broken_Hill +Brazil/Acre,America/Rio_Branco +Brazil/DeNoronha,America/Noronha +Brazil/East,America/Sao_Paulo +Brazil/West,America/Manaus +Canada/Atlantic,America/Halifax +Canada/Central,America/Winnipeg +Canada/Eastern,America/Toronto +Canada/Mountain,America/Edmonton +Canada/Newfoundland,America/St_Johns +Canada/Pacific,America/Vancouver +Canada/Saskatchewan,America/Regina +Canada/Yukon,America/Whitehorse +Chile/Continental,America/Santiago +Chile/EasterIsland,Pacific/Easter +Cuba,America/Havana +Egypt,Africa/Cairo +Eire,Europe/Dublin +Etc/GMT+0,Etc/GMT +Etc/GMT-0,Etc/GMT +Etc/GMT0,Etc/GMT +Etc/Greenwich,Etc/GMT +Etc/Universal,Etc/UTC +Etc/Zulu,Etc/UTC +Europe/Belfast,Europe/London +Europe/Bratislava,Europe/Prague +Europe/Busingen,Europe/Zurich +Europe/Guernsey,Europe/London +Europe/Isle_of_Man,Europe/London +Europe/Jersey,Europe/London +Europe/Ljubljana,Europe/Belgrade +Europe/Mariehamn,Europe/Helsinki +Europe/Nicosia,Asia/Nicosia +Europe/Podgorica,Europe/Belgrade +Europe/San_Marino,Europe/Rome +Europe/Sarajevo,Europe/Belgrade +Europe/Skopje,Europe/Belgrade +Europe/Tiraspol,Europe/Chisinau +Europe/Vaduz,Europe/Zurich +Europe/Vatican,Europe/Rome +Europe/Zagreb,Europe/Belgrade +GB,Europe/London +GB-Eire,Europe/London +GMT,Etc/GMT +GMT+0,Etc/GMT +GMT0,Etc/GMT +GMT-0,Etc/GMT +Greenwich,Etc/GMT +Hongkong,Asia/Hong_Kong +Iceland,Atlantic/Reykjavik
+Indian/Antananarivo,Africa/Nairobi +Indian/Comoro,Africa/Nairobi +Indian/Mayotte,Africa/Nairobi +Iran,Asia/Tehran +Israel,Asia/Jerusalem +Jamaica,America/Jamaica +Japan,Asia/Tokyo +Kwajalein,Pacific/Kwajalein +Libya,Africa/Tripoli +Mexico/BajaNorte,America/Tijuana +Mexico/BajaSur,America/Mazatlan +Mexico/General,America/Mexico_City +Navajo,America/Denver +NZ,Pacific/Auckland +NZ-CHAT,Pacific/Chatham +Pacific/Johnston,Pacific/Honolulu +Pacific/Midway,Pacific/Pago_Pago +Pacific/Ponape,Pacific/Pohnpei +Pacific/Saipan,Pacific/Guam +Pacific/Samoa,Pacific/Pago_Pago +Pacific/Truk,Pacific/Chuuk +Pacific/Yap,Pacific/Chuuk +Poland,Europe/Warsaw +Portugal,Europe/Lisbon +PRC,Asia/Shanghai +ROC,Asia/Taipei +ROK,Asia/Seoul +Singapore,Asia/Singapore +Turkey,Europe/Istanbul +UCT,Etc/UCT +Universal,Etc/UTC +US/Alaska,America/Anchorage +US/Aleutian,America/Adak +US/Arizona,America/Phoenix +US/Central,America/Chicago +US/East-Indiana,America/Indiana/Indianapolis +US/Eastern,America/New_York +US/Hawaii,Pacific/Honolulu +US/Indiana-Starke,America/Indiana/Knox +US/Michigan,America/Detroit +US/Mountain,America/Denver +US/Pacific,America/Los_Angeles +US/Pacific-New,America/Los_Angeles +US/Samoa,Pacific/Pago_Pago +UTC,Etc/UTC +W-SU,Europe/Moscow +Zulu,Etc/UTC \ No newline at end of file diff --git a/projects/bigdata-processing-pipeline/qualify-data/README.md b/projects/bigdata-processing-pipeline/qualify_data/README.md similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/README.md rename to projects/bigdata-processing-pipeline/qualify_data/README.md diff --git a/projects/bigdata-processing-pipeline/qualify-data/deprecated/qualify-data.py b/projects/bigdata-processing-pipeline/qualify_data/deprecated/qualify-data.py similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/deprecated/qualify-data.py rename to projects/bigdata-processing-pipeline/qualify_data/deprecated/qualify-data.py diff --git 
a/projects/bigdata-processing-pipeline/qualify-data/qualify_all_donor_data_batch_process.py b/projects/bigdata-processing-pipeline/qualify_data/qualify_all_donor_data_batch_process.py similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/qualify_all_donor_data_batch_process.py rename to projects/bigdata-processing-pipeline/qualify_data/qualify_all_donor_data_batch_process.py diff --git a/projects/bigdata-processing-pipeline/qualify-data/qualify_single_dataset.py b/projects/bigdata-processing-pipeline/qualify_data/qualify_single_dataset.py similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/qualify_single_dataset.py rename to projects/bigdata-processing-pipeline/qualify_data/qualify_single_dataset.py diff --git a/projects/bigdata-processing-pipeline/qualify-data/tidepool-qualification-criteria.json b/projects/bigdata-processing-pipeline/qualify_data/tidepool-qualification-criteria.json similarity index 100% rename from projects/bigdata-processing-pipeline/qualify-data/tidepool-qualification-criteria.json rename to projects/bigdata-processing-pipeline/qualify_data/tidepool-qualification-criteria.json