From 917d73921813fe1539b10107dcea8653805d7277 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 08:20:07 -0600
Subject: [PATCH 01/78] update gitignore

---
 .gitignore | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/.gitignore b/.gitignore
index 8b55d7e1..ba5690ed 100644
--- a/.gitignore
+++ b/.gitignore
@@ -21,6 +21,9 @@ work-record-archive
 export
 internal
 data
+figures
+isf-basal-figures
+fonts
 
 # Test
 htmlcov

From f295c17461dd7475d9d74fd5378102f72cacb04b Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 08:35:38 -0600
Subject: [PATCH 02/78] initial commit

---
 .../get-users-settings-and-events.py          | 99 +++++++++++++++++++
 1 file changed, 99 insertions(+)
 create mode 100644 projects/predict-simulate/get-users-settings-and-events.py

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
new file mode 100644
index 00000000..a044a30c
--- /dev/null
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -0,0 +1,99 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+description: get users settings and events
+version: 0.0.1
+created: 2019-01-11
+author: Ed Nykaza
+dependencies:
+    *
+license: BSD-2-Clause
+"""
+
+
+# %% REQUIRED LIBRARIES
+import pandas as pd
+import datetime as dt
+import numpy as np
+import os
+import sys
+import shutil
+import glob
+import argparse
+import hashlib
+import ast
+import time
+
+
+# %% USER INPUTS (ADD THIS IN LATER)
+#codeDescription = "Get user's settings and events"
+#parser = argparse.ArgumentParser(description=codeDescription)
+
+
+# %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
+
+
+# %% ID & HASHID
+
+
+# %% AGE & YLW
+
+
+# %% UPLOAD DATE
+
+
+# %% TIME (UTC, TIMEZONE, AND EVENTUALLY LOCAL TIME)
+
+
+# %% PUMP AND CGM DEVICE ()
+
+
+# %% ISF
+
+
+# %% CIR
+
+
+# %% INSULIN ACTIVITY DURATION
+
+
+# %% MAX BASAL RATE
+
+
+# %% MAX BOLUS AMOUNT
+
+
+# %% CORRECTION TARGET
+
+
+# %% BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
+
+
+# %% LOOP DATA (BINARY T/F)
+
+
+# %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
+
+
+# %% CGM DATA
+
+
+# %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
+
+
+# %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
+
+
+# %% SAVE RESULTS
+
+
+# %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
+
+
+# %% V2 DATA TO GRAB
+# ALERT SETTINGS
+# ESTIMATED LOCAL TIME
+# GLYCEMIC OUTCOMES
+# DO NOT ROUND DATA
+# INFUSION SITE CHANGES
+# CGM CALIBRATIONS

From d21d5dd9bda49df2f648f17a95f47f9066459cd9 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 11:38:16 -0600
Subject: [PATCH 03/78] add ISF and CIR

---
 .../get-users-settings-and-events.py          | 282 +++++++++++++++++-
 1 file changed, 274 insertions(+), 8 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index a044a30c..54af995a 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -15,6 +15,7 @@
 import pandas as pd
 import datetime as dt
 import numpy as np
+import tidals as td
 import os
 import sys
 import shutil
@@ -23,6 +24,7 @@
 import hashlib
 import ast
 import time
+import pdb
 
 
 # %% USER INPUTS (ADD THIS IN LATER)
@@ -30,28 +32,286 @@
 #parser = argparse.ArgumentParser(description=codeDescription)
 
 
+# %% FUNCTIONS
+
+dataFieldExportList = [
+        'activeSchedule', 'alarmType', 'annotations.code', 'annotations.threshold',
+        'annotations.value', 'basalSchedules', 'bgInput', 'bgTarget', 'bgTarget.high', 'bgTarget.low',
+        'bgTarget.range', 'bgTarget.start', 'bgTarget.target', 'bgTargets', 'bolus', 'carbInput',
+        'carbRatio', 'carbRatios', 'carbRatio.amount', 'carbRatio.start', 'change.agent',
+        'change.from', 'change.to', 'clockDriftOffset', 'computerTime', 'conversionOffset',
+        'deliveryType', 'deviceId', 'deviceManufacturers', 'deviceModel', 'deviceSerialNumber',
+        'deviceTags', 'deviceTime', 'duration', 'expectedDuration', 'expectedExtended',
+        'expectedNormal', 'extended', 'highAlerts.enabled', 'highAlerts.level',
+        'highAlerts.snooze', 'id', 'insulinCarbRatio', 'insulinOnBoard', 'insulinSensitivity',
+        'insulinSensitivity.amount', 'insulinSensitivity.start', 'insulinSensitivities',
+        'lowAlerts.enabled', 'lowAlerts.level', 'lowAlerts.snooze', 'normal',
+        'outOfRangeAlerts.enabled', 'outOfRangeAlerts.snooze',
+        'payload.calibration_reading', 'payload.Status', 'payload.Trend Arrow',
+        'payload.Trend Rate', 'percent', 'primeTarget', 'rate', 'rateOfChangeAlerts.fallRate.enabled',
+        'rateOfChangeAlerts.fallRate.rate', 'rateOfChangeAlerts.riseRate.enabled',
+        'rateOfChangeAlerts.riseRate.rate', 'reason.resumed', 'reason.suspended', 'recommended.carb',
+        'recommended.correction', 'recommended.net', 'scheduleName', 'status', 'subType',
+        'time', 'timeProcessing', 'timezone', 'timezoneOffset', 'transmitterId', 'type', 'units',
+        'units.bg', 'units.carb', 'uploadId', 'value', 'version'
+]
+
+# CLEAN DATA FUNCTIONS
+def removeNegativeDurations(df):
+    if "duration" in list(df):
+        nNegativeDurations = sum(df.duration < 0)
+        if nNegativeDurations > 0:
+            df = df[~(df.duration < 0)]
+
+    return df, nNegativeDurations
+
+
+def removeInvalidCgmValues(df):
+
+    nBefore = len(df)
+    # remove values < 38 and > 402 mg/dL
+    df = df.drop(df[((df.type == "cbg") &
+                     (df.value < 2.109284236597303))].index)
+    df = df.drop(df[((df.type == "cbg") &
+                     (df.value > 22.314006924003046))].index)
+    nRemoved = nBefore - len(df)
+
+    return df, nRemoved
+
+
+def tslimCalibrationFix(df):
+    searchfor = ['tan']
+    tandemDataIndex = ((df.deviceId.str.contains('|'.join(searchfor))) &
+                       (df.type == "deviceEvent"))
+
+    if "payload.calibration_reading" in list(df):
+        payloadCalReadingIndex = df["payload.calibration_reading"].notnull()
+
+        nTandemAndPayloadCalReadings = sum(tandemDataIndex &
+                                           payloadCalReadingIndex)
+
+        if nTandemAndPayloadCalReadings > 0:
+            # if reading is > 30 then it is in the wrong units
+            if df["payload.calibration_reading"].min() > 30:
+                df.loc[payloadCalReadingIndex, "value"] = \
+                    df[tandemDataIndex & payloadCalReadingIndex] \
+                    ["payload.calibration_reading"] / 18.01559
+            else:
+                df.loc[payloadCalReadingIndex, "value"] = \
+                    df[tandemDataIndex &
+                        payloadCalReadingIndex]["payload.calibration_reading"]
+    else:
+        nTandemAndPayloadCalReadings = 0
+    return df, nTandemAndPayloadCalReadings
+
+
+# OTHER
+def tempRemoveFields(df):
+    removeFields = ["basalSchedules",
+                    "bgTarget",
+                    "bgTargets",
+                    "carbRatio",
+                    "carbRatios",
+                    "insulinSensitivity",
+                    "insulinSensitivities"]
+
+    tempRemoveFields = list(set(df) & set(removeFields))
+    tempDf = df[tempRemoveFields]
+    df = df.drop(columns=tempRemoveFields)
+
+    return df, tempDf
+
+
+def removeBrackets(df, fieldName):
+    if fieldName in list(df):
+        df.loc[df[fieldName].notnull(), fieldName] = \
+            df.loc[df[fieldName].notnull(), fieldName].str[0]
+
+    return df
+
+
+def flattenJson(df, dataFieldsForExport):
+
+    # remove fields that we don't want to flatten
+    df, holdData = tempRemoveFields(df)
+
+    # remove [] from annotations field
+    df = removeBrackets(df, "annotations")
+
+    # get a list of data types of column headings
+    columnHeadings = list(df)  # ["payload", "suppressed"]
+
+    # loop through each columnHeading
+    newDataFrame = pd.DataFrame()
+
+    for colHead in columnHeadings:
+        # if the df field has embedded json
+        if any(isinstance(item, dict) for item in df[colHead]):
+            # grab the data that is in brackets
+            jsonBlob = df[colHead][df[colHead].astype(str).str[0] == "{"]
+
+            # replace those values with nan
+            df.loc[jsonBlob.index, colHead] = np.nan
+
+            # turn jsonBlob to dataframe
+            newDataFrame = pd.concat([newDataFrame, pd.DataFrame(jsonBlob.tolist(),
+                                        index=jsonBlob.index).add_prefix(colHead + '.')], axis=1)
+
+    newColHeadings = list(newDataFrame)
+
+    # put df back into the main dataframe
+    # and add the fields that were removed back in
+    columnFilter = list(set(newColHeadings) & set(dataFieldsForExport))
+    tempDataFrame = newDataFrame.filter(items=columnFilter)
+    df = pd.concat([df, tempDataFrame, holdData], axis=1)
+
+    return df
+
+
+def mergeWizardWithBolus(df):
+
+    if "wizard" in data["type"].unique():
+        bolusData = data[data.type == "bolus"].copy().dropna(axis=1, how="all")
+        wizardData = data[data.type == "wizard"].copy().dropna(axis=1, how="all")
+
+        # merge the wizard data with the bolus data
+        wizardData["calculatorId"] = wizardData["id"]
+        wizardDataFields = [
+            "bgInput",
+            "bgTarget.high",
+            "bgTarget.low",
+            "bgTarget.range",
+            "bgTarget.target",
+            "bolus",
+            "carbInput",
+            "calculatorId",
+            "insulinCarbRatio",
+            "insulinOnBoard",
+            "insulinSensitivity",
+            "recommended.carb",
+            "recommended.correction",
+            "recommended.net",
+            "units",
+        ]
+        keepTheseWizardFields = \
+            set(wizardDataFields).intersection(list(wizardData))
+        bolusData = pd.merge(bolusData,
+                             wizardData[list(keepTheseWizardFields)],
+                             how="left",
+                             left_on="id",
+                             right_on="bolus")
+
+        mergedBolusData = bolusData.drop("bolus", axis=1)
+    else:
+        mergedBolusData = pd.DataFrame()
+
+    return mergedBolusData
+
+
+def addUploadDate(df):
+    uploadTimes = pd.DataFrame(df[df.type == "upload"].groupby("uploadId").time.describe()["top"])
+    uploadTimes.reset_index(inplace=True)
+    uploadTimes.rename(columns={"top": "uploadTime"}, inplace=True)
+    df = pd.merge(df, uploadTimes, how='left', on='uploadId')
+    df["uploadTime"] = pd.to_datetime(df["uploadTime"])
+
+    return df
+
+
+def mmolL_to_mgdL(mmolL):
+    return mmolL * 18.01559
+
+
 # %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
+dataPulledDate = "2018-09-28"
+phiDate = "PHI-" + dataPulledDate
+donorPath = os.path.join("..", "bigdata-processing-pipeline", "data", phiDate + "-donor-data")
+
+donorList = phiDate + "-uniqueDonorList.csv"
+donors = td.load.load_csv(os.path.join(donorPath, donorList))
+
+# this is where the loop will go:
+dIndex = 2379
+
+# %% ID, HASHID, AGE, & YLW
+userID = donors.userID[dIndex]
+hashID = donors.hashID[dIndex]
+bDate = pd.to_datetime(donors.bDay[dIndex][0:7])
+dDate = pd.to_datetime(donors.dDay[dIndex][0:7])
 
 
-# %% ID & HASHID
+# %% LOAD IN DONOR JSON DATA
+metadata = pd.DataFrame(index=[dIndex])
+jsonDataPath = os.path.join(donorPath, phiDate + "-donorJsonData")
+jsonFileName = os.path.join(jsonDataPath, "PHI-" + userID + ".json")
 
+if os.path.exists(jsonFileName):
+    fileSize = os.stat(jsonFileName).st_size
+    metadata["fileSizeKB"] = fileSize / 1000
+    if fileSize > 1000:
+        data = td.load.load_json(jsonFileName)
+        # sort the data by time
+        data.sort_values("time", inplace=True)
 
-# %% AGE & YLW
+        # flatten the embedded json
+        data = flattenJson(data, dataFieldExportList)
 
 
-# %% UPLOAD DATE
 
+# %% CLEAN DATA
+        # remove negative durations
+        data, nNegativeDurations = removeNegativeDurations(data)
+        metadata["nNegativeDurations"] = nNegativeDurations
 
-# %% TIME (UTC, TIMEZONE, AND EVENTUALLY LOCAL TIME)
+        # get rid of cgm values too low/high (< 38 & > 402 mg/dL)
+        data, nInvalidCgmValues = removeInvalidCgmValues(data)
+        metadata["nInvalidCgmValues"] = nInvalidCgmValues
 
+        # Tslim calibration bug fix
+        data, nTandemAndPayloadCalReadings = tslimCalibrationFix(data)
+        metadata["nTandemAndPayloadCalReadings"] = nTandemAndPayloadCalReadings
 
-# %% PUMP AND CGM DEVICE ()
 
+# %% ADD UPLOAD DATE
+        # attach upload time to each record, for resolving duplicates
+        if "upload" in data.type.unique():
+            data = addUploadDate(data)
 
-# %% ISF
 
+# %% TIME (UTC, TIMEZONE, DAY AND EVENTUALLY LOCAL TIME)
+            data["utcTime"] = pd.to_datetime(data["time"])
+            data["timezone"].fillna(method='ffill', inplace=True)
+            data["timezone"].fillna(method='bfill', inplace=True)
+            data["day"] = pd.DatetimeIndex(data["utcTime"]).date
+
+# %% ID, HASHID, AGE, & YLW
+            data["userID"] = userID
+            data["hashID"] = hashID
+            data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
+            data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
+
+
+# %% FORMAT BOLUS DATA
+            bolus = mergeWizardWithBolus(data)
+            if len(bolus) > 0:
+                # get rid of duplicates that have the same ["time", "normal"]
+                bolus, nBolusDuplicatesRemoved = \
+                    td.clean.remove_duplicates(bolus, bolus[["time", "normal"]])
+                metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
+
+
+# %% ISF, CIR
+                if "insulinSensitivities" in list(bolus):
+                    pdb.set_trace()
+
+                # ISF
+                bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
+                bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
+                isf = bolus.loc[bolus["isf"].notnull(), ["utcTime", "isf", "isf_mmolL_U"]]
+
+                # CIR
+                cir = bolus.loc[bolus["insulinCarbRatio"].notnull(), ["utcTime", "insulinCarbRatio"]]
 
-# %% CIR
 
 
 # %% INSULIN ACTIVITY DURATION
@@ -88,11 +348,17 @@
 
 
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
-
+        else:
+            metadata["flags"] = "no bolus wizard data"
+    else:
+        metadata["flags"] = "file contains no data"
+else:
+    metadata["flags"] = "file does not exist"
 
 # %% V2 DATA TO GRAB
 # ALERT SETTINGS
 # ESTIMATED LOCAL TIME
+# PUMP AND CGM DEVICE ()
 # GLYCEMIC OUTCOMES
 # DO NOT ROUND DATA
 # INFUSION SITE CHANGES

From 16297bf1f09b2695a165c536e28b496157a0578a Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 13:42:40 -0600
Subject: [PATCH 04/78] update to rounded time for tidal data analytics python
 tools

---
 tidepool-analysis-tools/tidals/clean/clean.py | 82 +++++++++++--------
 1 file changed, 50 insertions(+), 32 deletions(-)

diff --git a/tidepool-analysis-tools/tidals/clean/clean.py b/tidepool-analysis-tools/tidals/clean/clean.py
index 9a4f1836..ca61844f 100644
--- a/tidepool-analysis-tools/tidals/clean/clean.py
+++ b/tidepool-analysis-tools/tidals/clean/clean.py
@@ -17,48 +17,66 @@ def remove_duplicates(df, criteriaDF):
 
 
 def round_time(df, timeIntervalMinutes=5, timeField="time",
-               roundedTimeFieldName="roundedTime", verbose=False):
+               roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
+               verbose=False):
+    '''
+    A general purpose round time function that rounds the "time"
+    field to nearest <timeIntervalMinutes> minutes
+    INPUTS:
+        * a dataframe (df) that contains a time field that you want to round
+        * timeIntervalMinutes (defaults to 5 minutes given that most cgms output every 5 minutes)
+        * timeField to round (defaults to the UTC time "time" field)
+        * roundedTimeFieldName is a user specified column name (defaults to roundedTime)
+        * startWithFirstRecord starts the rounding with the first record if True, and the last record if False (defaults to True)
+        * verbose specifies whether the extra columns used to make calculations are returned
+    '''
+
     import pandas as pd
-    # A general purpose round time function that rounds the
-    # "time" field to nearest <timeIntervalMinutes> minutes
-    # INPUTS:
-    #   * a dataframe (df) that contains a time field
-    #   * timeIntervalMinutes defaults to 5 minutes given that most cgms output every 5 minutes
-    #   * timeField defaults to UTC time "time"
-    #   * verbose specifies whether the "TIB" and "TIB_cumsum" columns are returned
-
-    df.sort_values(by=timeField, ascending=True, inplace=True)
+    df.sort_values(by=timeField, ascending=startWithFirstRecord, inplace=True)
     df.reset_index(drop=True, inplace=True)
 
-    # calculate the time-in-between (TIB) consecutive records
-    t = pd.to_datetime(df.time)
-    t_shift = pd.to_datetime(df.time.shift(1))
-    df["TIB"] = round((t - t_shift).dt.days*(86400/(60 * timeIntervalMinutes)) +
-                      (t - t_shift).dt.seconds/(60 * timeIntervalMinutes)) * timeIntervalMinutes
+    # make sure the time field is in the right form
+    t = pd.to_datetime(df[timeField])
+
+    # calculate the time between consecutive records
+    t_shift = pd.to_datetime(df[timeField].shift(1))
+    df["timeBetweenRecords"] = \
+        round((t - t_shift).dt.days*(86400/(60 * timeIntervalMinutes)) +
+              (t - t_shift).dt.seconds/(60 * timeIntervalMinutes)) * timeIntervalMinutes
 
-    # separate the data into chunks if TIB is greater than <timeIntervalMinutes> minutes
-    # so that rounding process can start over
-    largeGaps = list(df.query("TIB > " + str(timeIntervalMinutes)).index)
+    # separate the data into chunks if timeBetweenRecords is greater than
+    # 2 times the <timeIntervalMinutes> minutes so the rounding process starts over
+    largeGaps = list(df.query("abs(timeBetweenRecords) > " + str(timeIntervalMinutes * 2)).index)
     largeGaps.insert(0, 0)
     largeGaps.append(len(df))
 
-    # loop through each chunk to get the cumulative sum and the rounded time
     for gIndex in range(0, len(largeGaps) - 1):
-
-        df.loc[largeGaps[gIndex], "TIB"] = 0
-
-        df.loc[largeGaps[gIndex]:(largeGaps[gIndex + 1] - 1), "TIB_cumsum"] = \
-            df.loc[largeGaps[gIndex]:(largeGaps[gIndex + 1] - 1), "TIB"].cumsum()
-
-        df.loc[largeGaps[gIndex]:(largeGaps[gIndex + 1] - 1), roundedTimeFieldName] = \
-            pd.to_datetime(df.loc[largeGaps[gIndex], timeField]).round(str(timeIntervalMinutes) + "min") + \
-            pd.to_timedelta(df.loc[largeGaps[gIndex]:(largeGaps[gIndex + 1] - 1), "TIB_cumsum"], unit="m")
-
-    # sort descendingly by time and drop fieldsfields
-    df.sort_values(by=timeField, ascending=False, inplace=True)
+        chunk = t[largeGaps[gIndex]:largeGaps[gIndex+1]]
+        firstRecordChunk = t[largeGaps[gIndex]]
+
+        # calculate the time difference between each time record and the first record
+        df.loc[largeGaps[gIndex]:largeGaps[gIndex+1], "minutesFromFirstRecord"] = \
+            (chunk - firstRecordChunk).dt.days*(86400/(60)) + (chunk - firstRecordChunk).dt.seconds/(60)
+
+        # then round to the nearest X Minutes
+        # NOTE: the ".000001" ensures that mulitples of 2:30 always rounds up.
+        df.loc[largeGaps[gIndex]:largeGaps[gIndex+1], "roundedMinutesFromFirstRecord"] = \
+            round((df.loc[largeGaps[gIndex]:largeGaps[gIndex+1],
+                          "minutesFromFirstRecord"] / timeIntervalMinutes) + 0.000001) * (timeIntervalMinutes)
+
+        roundedFirstRecord = (firstRecordChunk + pd.Timedelta("1microseconds")).round(str(timeIntervalMinutes) + "min")
+        df.loc[largeGaps[gIndex]:largeGaps[gIndex+1], roundedTimeFieldName] = \
+            roundedFirstRecord + \
+            pd.to_timedelta(df.loc[largeGaps[gIndex]:largeGaps[gIndex+1],
+                                   "roundedMinutesFromFirstRecord"], unit="m")
+
+    # sort by time and drop fieldsfields
+    df.sort_values(by=timeField, ascending=startWithFirstRecord, inplace=True)
     df.reset_index(drop=True, inplace=True)
     if verbose is False:
-        df.drop(columns=["TIB", "TIB_cumsum"], inplace=True)
+        df.drop(columns=["timeBetweenRecords",
+                         "minutesFromFirstRecord",
+                         "roundedMinutesFromFirstRecord"], inplace=True)
 
     return df
 

From c8db909b4f9eeb0cd8d7ba07c1010cff906dad17 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 13:42:51 -0600
Subject: [PATCH 05/78] add bolus events

---
 .../get-users-settings-and-events.py          | 166 +++++++++++++++++-
 1 file changed, 162 insertions(+), 4 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 54af995a..33492488 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -222,6 +222,123 @@ def mmolL_to_mgdL(mmolL):
     return mmolL * 18.01559
 
 
+def round_time(df, timeIntervalMinutes=5, timeField="time",
+               roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
+               verbose=False):
+    '''
+    A general purpose round time function that rounds the "time"
+    field to nearest <timeIntervalMinutes> minutes
+    INPUTS:
+        * a dataframe (df) that contains a time field that you want to round
+        * timeIntervalMinutes (defaults to 5 minutes given that most cgms output every 5 minutes)
+        * timeField to round (defaults to the UTC time "time" field)
+        * roundedTimeFieldName is a user specified column name (defaults to roundedTime)
+        * startWithFirstRecord starts the rounding with the first record if True, and the last record if False (defaults to True)
+        * verbose specifies whether the extra columns used to make calculations are returned
+    '''
+
+    df.sort_values(by=timeField, ascending=startWithFirstRecord, inplace=True)
+    df.reset_index(drop=True, inplace=True)
+
+    # make sure the time field is in the right form
+    t = pd.to_datetime(df[timeField])
+
+    # calculate the time between consecutive records
+    t_shift = pd.to_datetime(df[timeField].shift(1))
+    df["timeBetweenRecords"] = \
+        round((t - t_shift).dt.days*(86400/(60 * timeIntervalMinutes)) +
+              (t - t_shift).dt.seconds/(60 * timeIntervalMinutes)) * timeIntervalMinutes
+
+    # separate the data into chunks if timeBetweenRecords is greater than
+    # 2 times the <timeIntervalMinutes> minutes so the rounding process starts over
+    largeGaps = list(df.query("abs(timeBetweenRecords) > " + str(timeIntervalMinutes * 2)).index)
+    largeGaps.insert(0, 0)
+    largeGaps.append(len(df))
+
+    for gIndex in range(0, len(largeGaps) - 1):
+        chunk = t[largeGaps[gIndex]:largeGaps[gIndex+1]]
+        firstRecordChunk = t[largeGaps[gIndex]]
+
+        # calculate the time difference between each time record and the first record
+        df.loc[largeGaps[gIndex]:largeGaps[gIndex+1], "minutesFromFirstRecord"] = \
+            (chunk - firstRecordChunk).dt.days*(86400/(60)) + (chunk - firstRecordChunk).dt.seconds/(60)
+
+        # then round to the nearest X Minutes
+        # NOTE: the ".000001" ensures that mulitples of 2:30 always rounds up.
+        df.loc[largeGaps[gIndex]:largeGaps[gIndex+1], "roundedMinutesFromFirstRecord"] = \
+            round((df.loc[largeGaps[gIndex]:largeGaps[gIndex+1],
+                          "minutesFromFirstRecord"] / timeIntervalMinutes) + 0.000001) * (timeIntervalMinutes)
+
+        roundedFirstRecord = (firstRecordChunk + pd.Timedelta("1microseconds")).round(str(timeIntervalMinutes) + "min")
+        df.loc[largeGaps[gIndex]:largeGaps[gIndex+1], roundedTimeFieldName] = \
+            roundedFirstRecord + \
+            pd.to_timedelta(df.loc[largeGaps[gIndex]:largeGaps[gIndex+1],
+                                   "roundedMinutesFromFirstRecord"], unit="m")
+
+    # sort by time and drop fieldsfields
+    df.sort_values(by=timeField, ascending=startWithFirstRecord, inplace=True)
+    df.reset_index(drop=True, inplace=True)
+    if verbose is False:
+        df.drop(columns=["timeBetweenRecords",
+                         "minutesFromFirstRecord",
+                         "roundedMinutesFromFirstRecord"], inplace=True)
+
+    return df
+
+
+def get_descriptive_stats(df, newName, dataSubType):
+
+    newDf = df[dataSubType].describe().add_suffix(newName)
+
+    newDf[("rangeOf" + newName)] = \
+        newDf[("max" + newName)] - \
+        newDf[("min" + newName)]
+
+    return newDf
+
+
+def get_bolusDaySummary(bolusData):
+
+    if "extended" not in bolusData:
+        bolusData["extended"] = 0
+
+    bolusByDay = bolusData.groupby(bolusData["day"])
+
+    # total bolus insulin for each day
+    bolusDaySummary = pd.DataFrame(bolusByDay.normal.sum())
+    bolusDaySummary = bolusDaySummary.rename(columns={"normal":"totalAmountOfNormalBolusInsulin"})
+
+    bolusDaySummary["totalAmountOfExtendedBolusInsulin"] = bolusByDay.extended.sum().fillna(0.0)
+    bolusDaySummary["totalAmountOfBolusInsulin"] = bolusDaySummary["totalAmountOfNormalBolusInsulin"].fillna(0.0) + \
+                                           bolusDaySummary["totalAmountOfExtendedBolusInsulin"].fillna(0.0)
+
+    # bolus range for normal boluses
+    normalBasalDF = get_descriptive_stats(bolusByDay, "NormalBolusAmountPerBolus", "normal")
+    bolusDaySummary = pd.concat([bolusDaySummary, normalBasalDF], axis = 1)
+
+    # total number of bolus types per day
+    bolusTypePerDay = bolusData.groupby(["day",
+                                         "subType"]).size().unstack()
+
+    bolusDaySummary["numberOfNormalBoluses"] = bolusTypePerDay["normal"].fillna(0)
+
+    if "square" not in list(bolusTypePerDay):
+        bolusDaySummary["numberOfSquareBoluses"] = 0
+    else:
+        bolusDaySummary["numberOfSquareBoluses"] = bolusTypePerDay["square"].fillna(0)
+
+    if "dual/square" not in list(bolusTypePerDay):
+        bolusDaySummary["numberOfDualBoluses"] = 0
+    else:
+        bolusDaySummary["numberOfDualBoluses"] = bolusTypePerDay["dual/square"].fillna(0)
+
+    bolusDaySummary["numberOfAllBolusTypes"] = bolusDaySummary["numberOfNormalBoluses"] + \
+                                        bolusDaySummary["numberOfSquareBoluses"] + \
+                                        bolusDaySummary["numberOfDualBoluses"]
+
+    return bolusDaySummary
+
+
 # %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
 dataPulledDate = "2018-09-28"
 phiDate = "PHI-" + dataPulledDate
@@ -284,12 +401,26 @@ def mmolL_to_mgdL(mmolL):
             data["timezone"].fillna(method='bfill', inplace=True)
             data["day"] = pd.DatetimeIndex(data["utcTime"]).date
 
+            # round to the nearest 5 minutes
+            # TODO: once roundTime is pushed to tidals repository then this line can be replaced
+            # with td.clean.round_time
+            data = round_time(data, timeIntervalMinutes=5, timeField="time",
+                              roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
+                              verbose=False)
+
+
 # %% ID, HASHID, AGE, & YLW
             data["userID"] = userID
             data["hashID"] = hashID
             data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
             data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
 
+            commonColumnHeadings = ["hashID",
+                                    "age",
+                                    "ylw",
+                                    "utcTime",
+                                    "roundedTime"]
+
 
 # %% FORMAT BOLUS DATA
             bolus = mergeWizardWithBolus(data)
@@ -301,18 +432,45 @@ def mmolL_to_mgdL(mmolL):
 
 
 # %% ISF, CIR
+                # ISF
                 if "insulinSensitivities" in list(bolus):
                     pdb.set_trace()
 
-                # ISF
                 bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
                 bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
-                isf = bolus.loc[bolus["isf"].notnull(), ["utcTime", "isf", "isf_mmolL_U"]]
+
+                isfCH = commonColumnHeadings.copy()
+                isfCH.extend(["isf", "isf_mmolL_U"])
+                isf = bolus.loc[bolus["isf"].notnull(), isfCH]
 
                 # CIR
-                cir = bolus.loc[bolus["insulinCarbRatio"].notnull(), ["utcTime", "insulinCarbRatio"]]
+                if "carbRatios" in list(bolus):
+                    pdb.set_trace()
+
+                cirCH = commonColumnHeadings.copy()
+                cirCH.extend(["insulinCarbRatio"])
+                cir = bolus.loc[bolus["insulinCarbRatio"].notnull(), cirCH]
 
 
+# %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
+                # get a summary of boluses per day
+                bolusDaySummary = get_bolusDaySummary(bolus)
+
+                if "extended" not in bolus:
+                    bolus["extended"] = np.nan
+                    bolus["duration"] = np.nan
+
+                bolusCH = commonColumnHeadings.copy()
+                bolusCH.extend(["normal", "carbInput", "subType",
+                                "insulinOnBoard", "bgInput"])
+                bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
+                bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
+                bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
+                                                          "bgInput": "bg_mmolL"})
+                bolusEvents["bg_mgdL"] = mmolL_to_mgdL(bolusEvents["bg_mmolL"])
+                bolusEvents["eventType"] = "correction"
+                bolusEvents.loc[bolusEvents["carbInput"] == 0, "eventType"] = "meal"
+
 
 # %% INSULIN ACTIVITY DURATION
 
@@ -332,7 +490,7 @@ def mmolL_to_mgdL(mmolL):
 # %% LOOP DATA (BINARY T/F)
 
 
-# %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
+
 
 
 # %% CGM DATA

From 7418a81d03a46a3ff034df573b2eb335f5c8154f Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 16:04:59 -0600
Subject: [PATCH 06/78] flatten embedded json to cover all column headings

---
 .../get-users-settings-and-events.py          | 64 +++++++++++++++++--
 1 file changed, 58 insertions(+), 6 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 33492488..4fa8ebd7 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -122,6 +122,18 @@ def tempRemoveFields(df):
     return df, tempDf
 
 
+def tempRemoveFieldsV2(df):
+    removeFields = ["suppressed",
+                    "recommended",
+                    "payload"]
+
+    tempRemoveFields = list(set(df) & set(removeFields))
+    tempDf = df[tempRemoveFields]
+    df = df.drop(columns=tempRemoveFields)
+
+    return df, tempDf
+
+
 def removeBrackets(df, fieldName):
     if fieldName in list(df):
         df.loc[df[fieldName].notnull(), fieldName] = \
@@ -133,7 +145,7 @@ def removeBrackets(df, fieldName):
 def flattenJson(df, dataFieldsForExport):
 
     # remove fields that we don't want to flatten
-    df, holdData = tempRemoveFields(df)
+    #df, holdData = tempRemoveFields(df)
 
     # remove [] from annotations field
     df = removeBrackets(df, "annotations")
@@ -161,18 +173,57 @@ def flattenJson(df, dataFieldsForExport):
 
     # put df back into the main dataframe
     # and add the fields that were removed back in
+    pdb.set_trace
     columnFilter = list(set(newColHeadings) & set(dataFieldsForExport))
     tempDataFrame = newDataFrame.filter(items=columnFilter)
-    df = pd.concat([df, tempDataFrame, holdData], axis=1)
+    df = pd.concat([df, tempDataFrame], axis=1)
+    #df = pd.concat([df, tempDataFrame, holdData], axis=1)
 
     return df
 
 
+def flattenJsonV2(df, nEmbeddings):
+    # repeat this N times
+    for nEmbed in range(0, nEmbeddings):
+        # remove fields that we don't want to flatten
+        df, holdData = tempRemoveFieldsV2(df)
+
+        # get a list of data types of column headings
+        columnHeadings = list(df)  # ["payload", "suppressed"]
+
+        # loop through each columnHeading
+        newDataFrame = pd.DataFrame()
+
+        for colHead in columnHeadings:
+            if any(isinstance(item, list) for item in df[colHead]):
+                listBlob = df[colHead][df[colHead].astype(str).str[0] == "["]
+                df.loc[listBlob.index, colHead] = df.loc[listBlob.index, colHead].str[0]
+
+            # if the df field has embedded json
+            if any(isinstance(item, dict) for item in df[colHead]):
+                # grab the data that is in brackets
+                jsonBlob = df[colHead][df[colHead].astype(str).str[0] == "{"]
+
+                # replace those values with nan
+                df.loc[jsonBlob.index, colHead] = np.nan
+
+                # turn jsonBlob to dataframe
+                newDataFrame = pd.concat([newDataFrame, pd.DataFrame(jsonBlob.tolist(),
+                                         index=jsonBlob.index).add_prefix(colHead + '.')], axis=1)
+
+        df = pd.concat([df, newDataFrame, holdData], axis=1)
+
+    df.sort_index(axis=1, inplace=True)
+
+    return df
+
+
+
 def mergeWizardWithBolus(df):
 
-    if "wizard" in data["type"].unique():
-        bolusData = data[data.type == "bolus"].copy().dropna(axis=1, how="all")
-        wizardData = data[data.type == "wizard"].copy().dropna(axis=1, how="all")
+    if "wizard" in df["type"].unique():
+        bolusData = df[df.type == "bolus"].copy().dropna(axis=1, how="all")
+        wizardData = df[df.type == "wizard"].copy().dropna(axis=1, how="all")
 
         # merge the wizard data with the bolus data
         wizardData["calculatorId"] = wizardData["id"]
@@ -371,7 +422,8 @@ def get_bolusDaySummary(bolusData):
         data.sort_values("time", inplace=True)
 
         # flatten the embedded json
-        data = flattenJson(data, dataFieldExportList)
+        #data = flattenJson(data, dataFieldExportList)
+        data = flattenJsonV2(data, 2)
 
 
 

From a2887789c7e871dc72cfbf9b126702f2c5fd3797 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 16:05:26 -0600
Subject: [PATCH 07/78] update bolus events to include isf and cir associated
 with events

---
 .../get-users-settings-and-events.py          | 41 +++++++------------
 1 file changed, 14 insertions(+), 27 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 4fa8ebd7..1c3fe4a1 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -474,7 +474,7 @@ def get_bolusDaySummary(bolusData):
                                     "roundedTime"]
 
 
-# %% FORMAT BOLUS DATA
+# %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
             bolus = mergeWizardWithBolus(data)
             if len(bolus) > 0:
                 # get rid of duplicates that have the same ["time", "normal"]
@@ -482,29 +482,6 @@ def get_bolusDaySummary(bolusData):
                     td.clean.remove_duplicates(bolus, bolus[["time", "normal"]])
                 metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
 
-
-# %% ISF, CIR
-                # ISF
-                if "insulinSensitivities" in list(bolus):
-                    pdb.set_trace()
-
-                bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
-                bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
-
-                isfCH = commonColumnHeadings.copy()
-                isfCH.extend(["isf", "isf_mmolL_U"])
-                isf = bolus.loc[bolus["isf"].notnull(), isfCH]
-
-                # CIR
-                if "carbRatios" in list(bolus):
-                    pdb.set_trace()
-
-                cirCH = commonColumnHeadings.copy()
-                cirCH.extend(["insulinCarbRatio"])
-                cir = bolus.loc[bolus["insulinCarbRatio"].notnull(), cirCH]
-
-
-# %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
                 # get a summary of boluses per day
                 bolusDaySummary = get_bolusDaySummary(bolus)
 
@@ -512,9 +489,19 @@ def get_bolusDaySummary(bolusData):
                     bolus["extended"] = np.nan
                     bolus["duration"] = np.nan
 
+                # ISF associated with bolus event
+                if "insulinSensitivities" in list(bolus):
+                    pdb.set_trace()
+                if "carbRatios" in list(bolus):
+                    pdb.set_trace()
+
+                bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
+                bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
+
                 bolusCH = commonColumnHeadings.copy()
                 bolusCH.extend(["normal", "carbInput", "subType",
-                                "insulinOnBoard", "bgInput"])
+                                "insulinOnBoard", "bgInput",
+                                "isf", "isf_mmolL_U", "insulinCarbRatio"])
                 bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
                 bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
                 bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
@@ -524,8 +511,8 @@ def get_bolusDaySummary(bolusData):
                 bolusEvents.loc[bolusEvents["carbInput"] == 0, "eventType"] = "meal"
 
 
-# %% INSULIN ACTIVITY DURATION
-
+# %% PUMP SETTINGS
+                pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
 
 # %% MAX BASAL RATE
 

From ace918ba6c402d31dc2942ec7fc9667dfcd6b9ef Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 16:30:34 -0600
Subject: [PATCH 08/78] get ISF from pump settings

---
 .../get-users-settings-and-events.py          | 22 +++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 1c3fe4a1..1fbad34f 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -469,9 +469,7 @@ def get_bolusDaySummary(bolusData):
 
             commonColumnHeadings = ["hashID",
                                     "age",
-                                    "ylw",
-                                    "utcTime",
-                                    "roundedTime"]
+                                    "ylw"]
 
 
 # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
@@ -499,7 +497,7 @@ def get_bolusDaySummary(bolusData):
                 bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
 
                 bolusCH = commonColumnHeadings.copy()
-                bolusCH.extend(["normal", "carbInput", "subType",
+                bolusCH.extend(["utcTime", "roundedTime", "normal", "carbInput", "subType",
                                 "insulinOnBoard", "bgInput",
                                 "isf", "isf_mmolL_U", "insulinCarbRatio"])
                 bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
@@ -514,6 +512,22 @@ def get_bolusDaySummary(bolusData):
 # %% PUMP SETTINGS
                 pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
 
+                #ISF
+                if "insulinSensitivity.amount" in list(pumpSettings):
+                    isfColHead = "insulinSensitivity"
+                else:
+                    isfColHead = "insulinSensitivities"
+
+                pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
+                pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
+                pumpSettings["time"] = pd.to_datetime(pumpSettings["day"]) + \
+                    pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
+
+                isfCH = commonColumnHeadings.copy()
+                isfCH.extend(["time", "isf", "isf_mmolL_U"])
+                isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfCH]
+
+
 # %% MAX BASAL RATE
 
 

From f58ed6474ac1161b6a8efa512084abbc0141f16f Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 16:37:52 -0600
Subject: [PATCH 09/78] get CIR from the pump settings

---
 .../get-users-settings-and-events.py          | 20 ++++++++++++++++---
 1 file changed, 17 insertions(+), 3 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 1fbad34f..ddcde6ca 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -512,7 +512,7 @@ def get_bolusDaySummary(bolusData):
 # %% PUMP SETTINGS
                 pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
 
-                #ISF
+                # ISF
                 if "insulinSensitivity.amount" in list(pumpSettings):
                     isfColHead = "insulinSensitivity"
                 else:
@@ -520,13 +520,27 @@ def get_bolusDaySummary(bolusData):
 
                 pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
                 pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
-                pumpSettings["time"] = pd.to_datetime(pumpSettings["day"]) + \
+                pumpSettings["isfTime"] = pd.to_datetime(pumpSettings["day"]) + \
                     pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
 
                 isfCH = commonColumnHeadings.copy()
-                isfCH.extend(["time", "isf", "isf_mmolL_U"])
+                isfCH.extend(["isfTime", "isf", "isf_mmolL_U"])
                 isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfCH]
 
+                # CIR
+                if "carbRatio.amount" in list(pumpSettings):
+                    cirColHead = "carbRatio"
+                else:
+                    cirColHead = "carbRatios"
+
+                pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
+                pumpSettings["cirTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                    pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
+
+                cirCH = commonColumnHeadings.copy()
+                cirCH.extend(["cirTime", "cir"])
+                cir = pumpSettings.loc[pumpSettings["cir"].notnull(), cirCH]
+
 
 # %% MAX BASAL RATE
 

From 964c3dc8de6dd7a73f7ead352502370d84dbe6b3 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 17:01:53 -0600
Subject: [PATCH 10/78] add the correction target from the pump settings

---
 .../get-users-settings-and-events.py          | 19 ++++++++++++++++---
 1 file changed, 16 insertions(+), 3 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index ddcde6ca..f40b61d6 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -542,13 +542,26 @@ def get_bolusDaySummary(bolusData):
                 cir = pumpSettings.loc[pumpSettings["cir"].notnull(), cirCH]
 
 
-# %% MAX BASAL RATE
+                # CORRECTION TARGET
+                if "bgTarget.start" in list(pumpSettings):
+                    bgTargetColHead = "bgTarget"
+                else:
+                    bgTargetColHead = "bgTargets"
 
+                pumpSettings["correctionTargetLow_mmolL"] = pumpSettings[bgTargetColHead + ".low"]
+                pumpSettings["correctionTargetLow"] = \
+                    mmolL_to_mgdL(pumpSettings["correctionTargetLow_mmolL"])
 
-# %% MAX BOLUS AMOUNT
+                pumpSettings["correctionTargetHigh_mmolL"] = pumpSettings[bgTargetColHead + ".high"]
+                pumpSettings["correctionTargetHigh"] = \
+                    mmolL_to_mgdL(pumpSettings["correctionTargetHigh_mmolL"])
 
+                pumpSettings["correctionTargetTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                    pd.to_timedelta(pumpSettings[bgTargetColHead + ".start"], unit="ms")
 
-# %% CORRECTION TARGET
+                ctCH = commonColumnHeadings.copy()
+                ctCH.extend(["correctionTargetTime", "correctionTargetLow", "correctionTargetHigh"])
+                correctionTarget = pumpSettings.loc[pumpSettings["correctionTargetLow"].notnull(), ctCH]
 
 
 # %% BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))

From cd79cbdfc2f52d9e3669a108ad5b830e4bbd2328 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 17:12:30 -0600
Subject: [PATCH 11/78] clean up unused pieces of code

---
 .../get-users-settings-and-events.py          | 103 +-----------------
 1 file changed, 3 insertions(+), 100 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index f40b61d6..b1b80492 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -13,17 +13,9 @@
 
 # %% REQUIRED LIBRARIES
 import pandas as pd
-import datetime as dt
 import numpy as np
 import tidals as td
 import os
-import sys
-import shutil
-import glob
-import argparse
-import hashlib
-import ast
-import time
 import pdb
 
 
@@ -34,28 +26,6 @@
 
 # %% FUNCTIONS
 
-dataFieldExportList = [
-        'activeSchedule', 'alarmType', 'annotations.code', 'annotations.threshold',
-        'annotations.value', 'basalSchedules', 'bgInput', 'bgTarget', 'bgTarget.high', 'bgTarget.low',
-        'bgTarget.range', 'bgTarget.start', 'bgTarget.target', 'bgTargets', 'bolus', 'carbInput',
-        'carbRatio', 'carbRatios', 'carbRatio.amount', 'carbRatio.start', 'change.agent',
-        'change.from', 'change.to', 'clockDriftOffset', 'computerTime', 'conversionOffset',
-        'deliveryType', 'deviceId', 'deviceManufacturers', 'deviceModel', 'deviceSerialNumber',
-        'deviceTags', 'deviceTime', 'duration', 'expectedDuration', 'expectedExtended',
-        'expectedNormal', 'extended', 'highAlerts.enabled', 'highAlerts.level',
-        'highAlerts.snooze', 'id', 'insulinCarbRatio', 'insulinOnBoard', 'insulinSensitivity',
-        'insulinSensitivity.amount', 'insulinSensitivity.start', 'insulinSensitivities',
-        'lowAlerts.enabled', 'lowAlerts.level', 'lowAlerts.snooze', 'normal',
-        'outOfRangeAlerts.enabled', 'outOfRangeAlerts.snooze',
-        'payload.calibration_reading', 'payload.Status', 'payload.Trend Arrow',
-        'payload.Trend Rate', 'percent', 'primeTarget', 'rate', 'rateOfChangeAlerts.fallRate.enabled',
-        'rateOfChangeAlerts.fallRate.rate', 'rateOfChangeAlerts.riseRate.enabled',
-        'rateOfChangeAlerts.riseRate.rate', 'reason.resumed', 'reason.suspended', 'recommended.carb',
-        'recommended.correction', 'recommended.net', 'scheduleName', 'status', 'subType',
-        'time', 'timeProcessing', 'timezone', 'timezoneOffset', 'transmitterId', 'type', 'units',
-        'units.bg', 'units.carb', 'uploadId', 'value', 'version'
-]
-
 # CLEAN DATA FUNCTIONS
 def removeNegativeDurations(df):
     if "duration" in list(df):
@@ -107,22 +77,6 @@ def tslimCalibrationFix(df):
 
 # OTHER
 def tempRemoveFields(df):
-    removeFields = ["basalSchedules",
-                    "bgTarget",
-                    "bgTargets",
-                    "carbRatio",
-                    "carbRatios",
-                    "insulinSensitivity",
-                    "insulinSensitivities"]
-
-    tempRemoveFields = list(set(df) & set(removeFields))
-    tempDf = df[tempRemoveFields]
-    df = df.drop(columns=tempRemoveFields)
-
-    return df, tempDf
-
-
-def tempRemoveFieldsV2(df):
     removeFields = ["suppressed",
                     "recommended",
                     "payload"]
@@ -134,59 +88,11 @@ def tempRemoveFieldsV2(df):
     return df, tempDf
 
 
-def removeBrackets(df, fieldName):
-    if fieldName in list(df):
-        df.loc[df[fieldName].notnull(), fieldName] = \
-            df.loc[df[fieldName].notnull(), fieldName].str[0]
-
-    return df
-
-
-def flattenJson(df, dataFieldsForExport):
-
-    # remove fields that we don't want to flatten
-    #df, holdData = tempRemoveFields(df)
-
-    # remove [] from annotations field
-    df = removeBrackets(df, "annotations")
-
-    # get a list of data types of column headings
-    columnHeadings = list(df)  # ["payload", "suppressed"]
-
-    # loop through each columnHeading
-    newDataFrame = pd.DataFrame()
-
-    for colHead in columnHeadings:
-        # if the df field has embedded json
-        if any(isinstance(item, dict) for item in df[colHead]):
-            # grab the data that is in brackets
-            jsonBlob = df[colHead][df[colHead].astype(str).str[0] == "{"]
-
-            # replace those values with nan
-            df.loc[jsonBlob.index, colHead] = np.nan
-
-            # turn jsonBlob to dataframe
-            newDataFrame = pd.concat([newDataFrame, pd.DataFrame(jsonBlob.tolist(),
-                                        index=jsonBlob.index).add_prefix(colHead + '.')], axis=1)
-
-    newColHeadings = list(newDataFrame)
-
-    # put df back into the main dataframe
-    # and add the fields that were removed back in
-    pdb.set_trace
-    columnFilter = list(set(newColHeadings) & set(dataFieldsForExport))
-    tempDataFrame = newDataFrame.filter(items=columnFilter)
-    df = pd.concat([df, tempDataFrame], axis=1)
-    #df = pd.concat([df, tempDataFrame, holdData], axis=1)
-
-    return df
-
-
-def flattenJsonV2(df, nEmbeddings):
+def flattenJson(df, nEmbeddings):
     # repeat this N times
     for nEmbed in range(0, nEmbeddings):
         # remove fields that we don't want to flatten
-        df, holdData = tempRemoveFieldsV2(df)
+        df, holdData = tempRemoveFields(df)
 
         # get a list of data types of column headings
         columnHeadings = list(df)  # ["payload", "suppressed"]
@@ -218,7 +124,6 @@ def flattenJsonV2(df, nEmbeddings):
     return df
 
 
-
 def mergeWizardWithBolus(df):
 
     if "wizard" in df["type"].unique():
@@ -422,9 +327,7 @@ def get_bolusDaySummary(bolusData):
         data.sort_values("time", inplace=True)
 
         # flatten the embedded json
-        #data = flattenJson(data, dataFieldExportList)
-        data = flattenJsonV2(data, 2)
-
+        data = flattenJson(data, 2)
 
 
 # %% CLEAN DATA

From 73e62289430fe74f3a88ebb10bbfa28ef8f8f117 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 17:21:50 -0600
Subject: [PATCH 12/78] add logic for missing data

---
 .../get-users-settings-and-events.py          | 88 ++++++++++---------
 1 file changed, 47 insertions(+), 41 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index b1b80492..40ccf7bf 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -413,63 +413,65 @@ def get_bolusDaySummary(bolusData):
 
 
 # %% PUMP SETTINGS
-                pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
+                if "pumpSettings" in data.type.unique():
+                    pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
 
-                # ISF
-                if "insulinSensitivity.amount" in list(pumpSettings):
-                    isfColHead = "insulinSensitivity"
-                else:
-                    isfColHead = "insulinSensitivities"
+                    # ISF
+                    if "insulinSensitivity.amount" in list(pumpSettings):
+                        isfColHead = "insulinSensitivity"
+                    else:
+                        isfColHead = "insulinSensitivities"
 
-                pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
-                pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
-                pumpSettings["isfTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                    pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
+                    pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
+                    pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
+                    pumpSettings["isfTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                        pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
 
-                isfCH = commonColumnHeadings.copy()
-                isfCH.extend(["isfTime", "isf", "isf_mmolL_U"])
-                isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfCH]
+                    isfCH = commonColumnHeadings.copy()
+                    isfCH.extend(["isfTime", "isf", "isf_mmolL_U"])
+                    isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfCH]
 
-                # CIR
-                if "carbRatio.amount" in list(pumpSettings):
-                    cirColHead = "carbRatio"
-                else:
-                    cirColHead = "carbRatios"
+                    # CIR
+                    if "carbRatio.amount" in list(pumpSettings):
+                        cirColHead = "carbRatio"
+                    else:
+                        cirColHead = "carbRatios"
 
-                pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
-                pumpSettings["cirTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                    pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
+                    pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
+                    pumpSettings["cirTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                        pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
 
-                cirCH = commonColumnHeadings.copy()
-                cirCH.extend(["cirTime", "cir"])
-                cir = pumpSettings.loc[pumpSettings["cir"].notnull(), cirCH]
+                    cirCH = commonColumnHeadings.copy()
+                    cirCH.extend(["cirTime", "cir"])
+                    cir = pumpSettings.loc[pumpSettings["cir"].notnull(), cirCH]
 
 
-                # CORRECTION TARGET
-                if "bgTarget.start" in list(pumpSettings):
-                    bgTargetColHead = "bgTarget"
-                else:
-                    bgTargetColHead = "bgTargets"
+                    # CORRECTION TARGET
+                    if "bgTarget.start" in list(pumpSettings):
+                        bgTargetColHead = "bgTarget"
+                    else:
+                        bgTargetColHead = "bgTargets"
 
-                pumpSettings["correctionTargetLow_mmolL"] = pumpSettings[bgTargetColHead + ".low"]
-                pumpSettings["correctionTargetLow"] = \
-                    mmolL_to_mgdL(pumpSettings["correctionTargetLow_mmolL"])
+                    pumpSettings["correctionTargetLow_mmolL"] = pumpSettings[bgTargetColHead + ".low"]
+                    pumpSettings["correctionTargetLow"] = \
+                        mmolL_to_mgdL(pumpSettings["correctionTargetLow_mmolL"])
 
-                pumpSettings["correctionTargetHigh_mmolL"] = pumpSettings[bgTargetColHead + ".high"]
-                pumpSettings["correctionTargetHigh"] = \
-                    mmolL_to_mgdL(pumpSettings["correctionTargetHigh_mmolL"])
+                    pumpSettings["correctionTargetHigh_mmolL"] = pumpSettings[bgTargetColHead + ".high"]
+                    pumpSettings["correctionTargetHigh"] = \
+                        mmolL_to_mgdL(pumpSettings["correctionTargetHigh_mmolL"])
 
-                pumpSettings["correctionTargetTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                    pd.to_timedelta(pumpSettings[bgTargetColHead + ".start"], unit="ms")
+                    pumpSettings["correctionTargetTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                        pd.to_timedelta(pumpSettings[bgTargetColHead + ".start"], unit="ms")
 
-                ctCH = commonColumnHeadings.copy()
-                ctCH.extend(["correctionTargetTime", "correctionTargetLow", "correctionTargetHigh"])
-                correctionTarget = pumpSettings.loc[pumpSettings["correctionTargetLow"].notnull(), ctCH]
+                    ctCH = commonColumnHeadings.copy()
+                    ctCH.extend(["correctionTargetTime", "correctionTargetLow", "correctionTargetHigh"])
+                    correctionTarget = pumpSettings.loc[pumpSettings["correctionTargetLow"].notnull(), ctCH]
 
 
 # %% BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
 
 
+
 # %% LOOP DATA (BINARY T/F)
 
 
@@ -489,8 +491,12 @@ def get_bolusDaySummary(bolusData):
 
 
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
+                else:
+                    metadata["flags"] = "no pump settings"
+            else:
+                metadata["flags"] = "no bolus wizard data"
         else:
-            metadata["flags"] = "no bolus wizard data"
+            metadata["flags"] = "no upload data"
     else:
         metadata["flags"] = "file contains no data"
 else:

From 71f90006ebceb57ff734f74dc3afd5e29f90231b Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 20:59:18 -0600
Subject: [PATCH 13/78] get actual basal rates and scheduled basal rates

---
 .../get-users-settings-and-events.py          | 92 ++++++++++++++++---
 1 file changed, 80 insertions(+), 12 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 40ccf7bf..8831f4ab 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -88,17 +88,16 @@ def tempRemoveFields(df):
     return df, tempDf
 
 
-def flattenJson(df, nEmbeddings):
-    # repeat this N times
-    for nEmbed in range(0, nEmbeddings):
-        # remove fields that we don't want to flatten
-        df, holdData = tempRemoveFields(df)
+def flattenJson(df):
 
-        # get a list of data types of column headings
-        columnHeadings = list(df)  # ["payload", "suppressed"]
+    # remove fields that we don't want to flatten
+    df, holdData = tempRemoveFields(df)
 
-        # loop through each columnHeading
-        newDataFrame = pd.DataFrame()
+    # get a list of data types of column headings
+    columnHeadings = list(df)
+
+    # loop through each columnHeading
+    newDataFrame = pd.DataFrame()
 
         for colHead in columnHeadings:
             if any(isinstance(item, list) for item in df[colHead]):
@@ -295,6 +294,32 @@ def get_bolusDaySummary(bolusData):
     return bolusDaySummary
 
 
+def get_basalDaySummary(basal):
+    # group data by day
+    basalByDay = basal.groupby(basal["day"])
+
+    # total basal insulin per day
+    basalDaySummary = pd.DataFrame(basalByDay.totalAmountOfBasalInsulin.sum())
+
+    # total number of basals types per day
+    basalTypePerDay = basal.groupby(["day", "deliveryType"]).size().unstack()
+
+    basalDaySummary["numberOfScheduledBasals"] = basalTypePerDay["scheduled"].fillna(0)
+    if "suspend" not in list(basalTypePerDay):
+        basalDaySummary["numberOfSuspendedBasals"] = 0
+    else:
+        basalDaySummary["numberOfSuspendedBasals"] = basalTypePerDay["suspend"].fillna(0)
+    if "temp" not in list(basalTypePerDay):
+        basalDaySummary["numberOfTempBasals"] = 0
+    else:
+        basalDaySummary["numberOfTempBasals"] = basalTypePerDay["temp"].fillna(0)
+
+    basalDaySummary["totalNumberOfBasals"] = basalDaySummary["numberOfScheduledBasals"] + \
+                                 basalDaySummary["numberOfTempBasals"]
+
+    return basalDaySummary
+
+
 # %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
 dataPulledDate = "2018-09-28"
 phiDate = "PHI-" + dataPulledDate
@@ -323,14 +348,15 @@ def get_bolusDaySummary(bolusData):
     metadata["fileSizeKB"] = fileSize / 1000
     if fileSize > 1000:
         data = td.load.load_json(jsonFileName)
+
         # sort the data by time
         data.sort_values("time", inplace=True)
 
         # flatten the embedded json
-        data = flattenJson(data, 2)
+        data = flattenJson(data)
 
 
-# %% CLEAN DATA
+        # %% CLEAN DATA
         # remove negative durations
         data, nNegativeDurations = removeNegativeDurations(data)
         metadata["nNegativeDurations"] = nNegativeDurations
@@ -467,9 +493,48 @@ def get_bolusDaySummary(bolusData):
                     ctCH.extend(["correctionTargetTime", "correctionTargetLow", "correctionTargetHigh"])
                     correctionTarget = pumpSettings.loc[pumpSettings["correctionTargetLow"].notnull(), ctCH]
 
+                    # SCHEDULED BASAL RATES
+                    sbrCH = commonColumnHeadings.copy()
+                    sbrCH.extend(["time", "rate"])
+                    sbr = pd.DataFrame(columns=sbrCH)
+                    for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                        tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
+                        tempDF["day"] = pumpSettings.loc[p, "day"]
+                        tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
+                        tempDF["age"] = pumpSettings.loc[p, "age"]
+                        tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
+                        tempDF["time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                        sbr = pd.concat([sbr, tempDF[sbrCH]], ignore_index=True)
+
+
+                    # %% ACTUAL BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
+                    if "basal" in data.type.unique():
+                        basal = data[data.type == "basal"].copy().dropna(axis=1, how="all")
+                        basal.sort_values("uploadTime", ascending=False, inplace=True)
+
+                        basal, nBasalDuplicatesRemoved = \
+                            td.clean.remove_duplicates(basal, basal[["deliveryType", "deviceTime", "duration", "rate"]])
+                        metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
 
-# %% BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
+                        # fill NaNs with 0, as it indicates a suspend (temp basal of 0)
+                        basal.rate.fillna(0, inplace=True)
 
+                        # get rid of basals that have durations of 0
+                        nBasalDuration0 = sum(basal.duration > 0)
+                        basal = basal[basal.duration > 0]
+                        metadata["basal.nBasalDuration0"] = nBasalDuration0
+
+                        # get rid of basal durations that are unrealistic
+                        nUnrealisticBasalDuration = ((basal.duration < 0) | (basal.duration > 86400000))
+                        metadata["nUnrealisticBasalDuration"] = sum(nUnrealisticBasalDuration)
+                        basal.loc[nUnrealisticBasalDuration, "duration"] = np.nan
+
+                        # calculate the total amount of insulin delivered (duration * rate)
+                        basal["durationHours"] = basal["duration"] / 1000.0 / 3600.0
+                        basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
+
+                        # get a summary of basals per day
+                        basalDaySummary = get_basalDaySummary(basal)
 
 
 # %% LOOP DATA (BINARY T/F)
@@ -491,6 +556,8 @@ def get_bolusDaySummary(bolusData):
 
 
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
+                    else:
+                        metadata["flags"] = "no basal data"
                 else:
                     metadata["flags"] = "no pump settings"
             else:
@@ -503,6 +570,7 @@ def get_bolusDaySummary(bolusData):
     metadata["flags"] = "file does not exist"
 
 # %% V2 DATA TO GRAB
+# RE-EVALUATE THE WAY EXTENDED BOLUSES ARE BEING ACCOUNTED (ARE THEY ALSO SHOWING UP IN BASAL DATA?)
 # ALERT SETTINGS
 # ESTIMATED LOCAL TIME
 # PUMP AND CGM DEVICE ()

From eb7a77dd080cb5d837489533013f9d4e96e73177 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 20:59:31 -0600
Subject: [PATCH 14/78] changes related to time

---
 .../get-users-settings-and-events.py          | 57 +++++++++++--------
 1 file changed, 34 insertions(+), 23 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 8831f4ab..91e348a1 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -99,24 +99,24 @@ def flattenJson(df):
     # loop through each columnHeading
     newDataFrame = pd.DataFrame()
 
-        for colHead in columnHeadings:
-            if any(isinstance(item, list) for item in df[colHead]):
-                listBlob = df[colHead][df[colHead].astype(str).str[0] == "["]
-                df.loc[listBlob.index, colHead] = df.loc[listBlob.index, colHead].str[0]
+    for colHead in columnHeadings:
+        if any(isinstance(item, list) for item in df[colHead]):
+            listBlob = df[colHead][df[colHead].astype(str).str[0] == "["]
+            df.loc[listBlob.index, colHead] = df.loc[listBlob.index, colHead].str[0]
 
-            # if the df field has embedded json
-            if any(isinstance(item, dict) for item in df[colHead]):
-                # grab the data that is in brackets
-                jsonBlob = df[colHead][df[colHead].astype(str).str[0] == "{"]
+        # if the df field has embedded json
+        if any(isinstance(item, dict) for item in df[colHead]):
+            # grab the data that is in brackets
+            jsonBlob = df[colHead][df[colHead].astype(str).str[0] == "{"]
 
-                # replace those values with nan
-                df.loc[jsonBlob.index, colHead] = np.nan
+            # replace those values with nan
+            df.loc[jsonBlob.index, colHead] = np.nan
 
-                # turn jsonBlob to dataframe
-                newDataFrame = pd.concat([newDataFrame, pd.DataFrame(jsonBlob.tolist(),
-                                         index=jsonBlob.index).add_prefix(colHead + '.')], axis=1)
+            # turn jsonBlob to dataframe
+            newDataFrame = pd.concat([newDataFrame, pd.DataFrame(jsonBlob.tolist(),
+                                     index=jsonBlob.index).add_prefix(colHead + '.')], axis=1)
 
-        df = pd.concat([df, newDataFrame, holdData], axis=1)
+    df = pd.concat([df, newDataFrame, holdData], axis=1)
 
     df.sort_index(axis=1, inplace=True)
 
@@ -334,6 +334,7 @@ def get_basalDaySummary(basal):
 # %% ID, HASHID, AGE, & YLW
 userID = donors.userID[dIndex]
 hashID = donors.hashID[dIndex]
+# round all birthdays and diagnosis dates to the first day of the month (to protect identities)
 bDate = pd.to_datetime(donors.bDay[dIndex][0:7])
 dDate = pd.to_datetime(donors.dDay[dIndex][0:7])
 
@@ -370,7 +371,7 @@ def get_basalDaySummary(basal):
         metadata["nTandemAndPayloadCalReadings"] = nTandemAndPayloadCalReadings
 
 
-# %% ADD UPLOAD DATE
+        # %% ADD UPLOAD DATE
         # attach upload time to each record, for resolving duplicates
         if "upload" in data.type.unique():
             data = addUploadDate(data)
@@ -388,9 +389,10 @@ def get_basalDaySummary(basal):
             data = round_time(data, timeIntervalMinutes=5, timeField="time",
                               roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
                               verbose=False)
+            data.sort_values("uploadTime", ascending=False, inplace=True)
 
 
-# %% ID, HASHID, AGE, & YLW
+            # %% ID, HASHID, AGE, & YLW
             data["userID"] = userID
             data["hashID"] = hashID
             data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
@@ -401,12 +403,13 @@ def get_basalDaySummary(basal):
                                     "ylw"]
 
 
-# %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
+            # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
             bolus = mergeWizardWithBolus(data)
             if len(bolus) > 0:
                 # get rid of duplicates that have the same ["time", "normal"]
+                bolus.sort_values("uploadTime", ascending=False, inplace=True)
                 bolus, nBolusDuplicatesRemoved = \
-                    td.clean.remove_duplicates(bolus, bolus[["time", "normal"]])
+                    td.clean.remove_duplicates(bolus, bolus[["deviceTime", "normal"]])
                 metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
 
                 # get a summary of boluses per day
@@ -438,23 +441,29 @@ def get_basalDaySummary(basal):
                 bolusEvents.loc[bolusEvents["carbInput"] == 0, "eventType"] = "meal"
 
 
-# %% PUMP SETTINGS
+                # %% PUMP SETTINGS
                 if "pumpSettings" in data.type.unique():
                     pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
+                    pumpSettings.sort_values("uploadTime", ascending=False, inplace=True)
+
+                    pumpSettings, nPumpSettingsDuplicatesRemoved = \
+                    td.clean.remove_duplicates(pumpSettings, pumpSettings[["deviceTime"]])
+                    metadata["nPumpSettingsDuplicatesRemoved"] = nPumpSettingsDuplicatesRemoved
 
                     # ISF
                     if "insulinSensitivity.amount" in list(pumpSettings):
                         isfColHead = "insulinSensitivity"
                     else:
                         isfColHead = "insulinSensitivities"
+                        pdb.set_trace()
 
                     pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
                     pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
-                    pumpSettings["isfTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                    pumpSettings["time"] = pd.to_datetime(pumpSettings["day"]) + \
                         pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
 
                     isfCH = commonColumnHeadings.copy()
-                    isfCH.extend(["isfTime", "isf", "isf_mmolL_U"])
+                    isfCH.extend(["time", "isf", "isf_mmolL_U"])
                     isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfCH]
 
                     # CIR
@@ -462,13 +471,14 @@ def get_basalDaySummary(basal):
                         cirColHead = "carbRatio"
                     else:
                         cirColHead = "carbRatios"
+                        pdb.set_trace()
 
                     pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
-                    pumpSettings["cirTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                    pumpSettings["time"] = pd.to_datetime(pumpSettings["day"]) + \
                         pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
 
                     cirCH = commonColumnHeadings.copy()
-                    cirCH.extend(["cirTime", "cir"])
+                    cirCH.extend(["time", "cir"])
                     cir = pumpSettings.loc[pumpSettings["cir"].notnull(), cirCH]
 
 
@@ -477,6 +487,7 @@ def get_basalDaySummary(basal):
                         bgTargetColHead = "bgTarget"
                     else:
                         bgTargetColHead = "bgTargets"
+                        pdb.set_trace()
 
                     pumpSettings["correctionTargetLow_mmolL"] = pumpSettings[bgTargetColHead + ".low"]
                     pumpSettings["correctionTargetLow"] = \

From aa007045d849bbf7d533436ba37dc29d983f8c6c Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 11 Jan 2019 21:48:00 -0600
Subject: [PATCH 15/78] expand correction target cases and auto mode basal
 rates

---
 .../get-users-settings-and-events.py          | 79 ++++++++++++++-----
 1 file changed, 58 insertions(+), 21 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 91e348a1..bba04127 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -329,7 +329,7 @@ def get_basalDaySummary(basal):
 donors = td.load.load_csv(os.path.join(donorPath, donorList))
 
 # this is where the loop will go:
-dIndex = 2379
+dIndex = 2
 
 # %% ID, HASHID, AGE, & YLW
 userID = donors.userID[dIndex]
@@ -459,11 +459,11 @@ def get_basalDaySummary(basal):
 
                     pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
                     pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
-                    pumpSettings["time"] = pd.to_datetime(pumpSettings["day"]) + \
+                    pumpSettings["isfTime"] = pd.to_datetime(pumpSettings["day"]) + \
                         pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
 
                     isfCH = commonColumnHeadings.copy()
-                    isfCH.extend(["time", "isf", "isf_mmolL_U"])
+                    isfCH.extend(["isfTime", "isf", "isf_mmolL_U"])
                     isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfCH]
 
                     # CIR
@@ -474,11 +474,11 @@ def get_basalDaySummary(basal):
                         pdb.set_trace()
 
                     pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
-                    pumpSettings["time"] = pd.to_datetime(pumpSettings["day"]) + \
+                    pumpSettings["cirTime"] = pd.to_datetime(pumpSettings["day"]) + \
                         pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
 
                     cirCH = commonColumnHeadings.copy()
-                    cirCH.extend(["time", "cir"])
+                    cirCH.extend(["cirTime", "cir"])
                     cir = pumpSettings.loc[pumpSettings["cir"].notnull(), cirCH]
 
 
@@ -489,32 +489,72 @@ def get_basalDaySummary(basal):
                         bgTargetColHead = "bgTargets"
                         pdb.set_trace()
 
-                    pumpSettings["correctionTargetLow_mmolL"] = pumpSettings[bgTargetColHead + ".low"]
-                    pumpSettings["correctionTargetLow"] = \
-                        mmolL_to_mgdL(pumpSettings["correctionTargetLow_mmolL"])
+                    # low
+                    if bgTargetColHead + ".low" in list(pumpSettings):
+                        pumpSettings["correctionTargetLow_mmolL"] = pumpSettings[bgTargetColHead + ".low"]
+                        pumpSettings["correctionTargetLow"] = \
+                            mmolL_to_mgdL(pumpSettings["correctionTargetLow_mmolL"])
+                    else:
+                        pumpSettings["correctionTargetLow_mmolL"] = np.nan
+                        pumpSettings["correctionTargetLow"] = np.nan
+
+                    # high
+                    if bgTargetColHead + ".high" in list(pumpSettings):
+                        pumpSettings["correctionTargetHigh_mmolL"] = pumpSettings[bgTargetColHead + ".high"]
+                        pumpSettings["correctionTargetHigh"] = \
+                            mmolL_to_mgdL(pumpSettings["correctionTargetHigh_mmolL"])
+
+                    else:
+                        pumpSettings["correctionTargetHigh_mmolL"] = np.nan
+                        pumpSettings["correctionTargetHigh"] = np.nan
+
+                    # target
+                    if bgTargetColHead + ".target" in list(pumpSettings):
+                        pumpSettings["correctionTarget_mmolL"] = pumpSettings[bgTargetColHead + ".target"]
+                        pumpSettings["correctionTarget"] = \
+                            mmolL_to_mgdL(pumpSettings["correctionTarget_mmolL"])
+
+                    else:
+                        pumpSettings["correctionTarget_mmolL"] = np.nan
+                        pumpSettings["correctionTarget"] = np.nan
 
-                    pumpSettings["correctionTargetHigh_mmolL"] = pumpSettings[bgTargetColHead + ".high"]
-                    pumpSettings["correctionTargetHigh"] = \
-                        mmolL_to_mgdL(pumpSettings["correctionTargetHigh_mmolL"])
+                    # range
+                    if bgTargetColHead + ".range" in list(pumpSettings):
+                        pumpSettings["correctionTargetRange_mmolL"] = pumpSettings[bgTargetColHead + ".range"]
+                        pumpSettings["correctionTargetRange"] = \
+                            mmolL_to_mgdL(pumpSettings["correctionTargetRange_mmolL"])
 
-                    pumpSettings["correctionTargetTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                    else:
+                        pumpSettings["correctionTargetRange_mmolL"] = np.nan
+                        pumpSettings["correctionTargetRange"] =np.nan
+
+                    pumpSettings["ctTime"] = pd.to_datetime(pumpSettings["day"]) + \
                         pd.to_timedelta(pumpSettings[bgTargetColHead + ".start"], unit="ms")
 
                     ctCH = commonColumnHeadings.copy()
-                    ctCH.extend(["correctionTargetTime", "correctionTargetLow", "correctionTargetHigh"])
-                    correctionTarget = pumpSettings.loc[pumpSettings["correctionTargetLow"].notnull(), ctCH]
+                    ctCH.extend(["ctTime", "correctionTargetLow", "correctionTargetHigh",
+                                 "correctionTarget", "correctionTargetRange"])
+                    correctionTarget = pumpSettings.loc[pumpSettings["ctTime"].notnull(), ctCH]
 
                     # SCHEDULED BASAL RATES
                     sbrCH = commonColumnHeadings.copy()
-                    sbrCH.extend(["time", "rate"])
+                    sbrCH.extend(["sbrTime", "rate", "type"])
                     sbr = pd.DataFrame(columns=sbrCH)
                     for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
-                        tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
-                        tempDF["day"] = pumpSettings.loc[p, "day"]
+                        if 'Auto Mode' not in actSched:
+                            tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
+                            tempDF["day"] = pumpSettings.loc[p, "day"]
+                            tempDF["type"] = np.nan
+                            tempDF["sbrTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                        else:
+                            tempDF = pd.DataFrame(index=[0])
+                            tempDF["sbrTime"] = np.nan
+                            tempDF["rate"] = np.nan
+                            tempDF["type"] = "AutoMode"
+
                         tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                         tempDF["age"] = pumpSettings.loc[p, "age"]
                         tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
-                        tempDF["time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                         sbr = pd.concat([sbr, tempDF[sbrCH]], ignore_index=True)
 
 
@@ -551,9 +591,6 @@ def get_basalDaySummary(basal):
 # %% LOOP DATA (BINARY T/F)
 
 
-
-
-
 # %% CGM DATA
 
 

From 88682afe10d885d2a409bb110084a29042cc7da8 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 05:58:25 -0600
Subject: [PATCH 16/78] handle insulinSensitivities and carbRatios schedules

---
 .../get-users-settings-and-events.py          | 521 ++++++++++--------
 1 file changed, 276 insertions(+), 245 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index bba04127..9fb1f148 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -32,6 +32,8 @@ def removeNegativeDurations(df):
         nNegativeDurations = sum(df.duration < 0)
         if nNegativeDurations > 0:
             df = df[~(df.duration < 0)]
+    else:
+        nNegativeDurations = np.nan
 
     return df, nNegativeDurations
 
@@ -329,295 +331,324 @@ def get_basalDaySummary(basal):
 donors = td.load.load_csv(os.path.join(donorPath, donorList))
 
 # this is where the loop will go:
-dIndex = 2
+for dIndex in range(0, len(donors)):
 
-# %% ID, HASHID, AGE, & YLW
-userID = donors.userID[dIndex]
-hashID = donors.hashID[dIndex]
-# round all birthdays and diagnosis dates to the first day of the month (to protect identities)
-bDate = pd.to_datetime(donors.bDay[dIndex][0:7])
-dDate = pd.to_datetime(donors.dDay[dIndex][0:7])
-
-
-# %% LOAD IN DONOR JSON DATA
-metadata = pd.DataFrame(index=[dIndex])
-jsonDataPath = os.path.join(donorPath, phiDate + "-donorJsonData")
-jsonFileName = os.path.join(jsonDataPath, "PHI-" + userID + ".json")
-
-if os.path.exists(jsonFileName):
-    fileSize = os.stat(jsonFileName).st_size
-    metadata["fileSizeKB"] = fileSize / 1000
-    if fileSize > 1000:
-        data = td.load.load_json(jsonFileName)
-
-        # sort the data by time
-        data.sort_values("time", inplace=True)
-
-        # flatten the embedded json
-        data = flattenJson(data)
-
-
-        # %% CLEAN DATA
-        # remove negative durations
-        data, nNegativeDurations = removeNegativeDurations(data)
-        metadata["nNegativeDurations"] = nNegativeDurations
-
-        # get rid of cgm values too low/high (< 38 & > 402 mg/dL)
-        data, nInvalidCgmValues = removeInvalidCgmValues(data)
-        metadata["nInvalidCgmValues"] = nInvalidCgmValues
-
-        # Tslim calibration bug fix
-        data, nTandemAndPayloadCalReadings = tslimCalibrationFix(data)
-        metadata["nTandemAndPayloadCalReadings"] = nTandemAndPayloadCalReadings
-
-
-        # %% ADD UPLOAD DATE
-        # attach upload time to each record, for resolving duplicates
-        if "upload" in data.type.unique():
-            data = addUploadDate(data)
-
-
-# %% TIME (UTC, TIMEZONE, DAY AND EVENTUALLY LOCAL TIME)
-            data["utcTime"] = pd.to_datetime(data["time"])
-            data["timezone"].fillna(method='ffill', inplace=True)
-            data["timezone"].fillna(method='bfill', inplace=True)
-            data["day"] = pd.DatetimeIndex(data["utcTime"]).date
-
-            # round to the nearest 5 minutes
-            # TODO: once roundTime is pushed to tidals repository then this line can be replaced
-            # with td.clean.round_time
-            data = round_time(data, timeIntervalMinutes=5, timeField="time",
-                              roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
-                              verbose=False)
-            data.sort_values("uploadTime", ascending=False, inplace=True)
-
-
-            # %% ID, HASHID, AGE, & YLW
-            data["userID"] = userID
-            data["hashID"] = hashID
-            data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
-            data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
-
-            commonColumnHeadings = ["hashID",
-                                    "age",
-                                    "ylw"]
-
-
-            # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
-            bolus = mergeWizardWithBolus(data)
-            if len(bolus) > 0:
-                # get rid of duplicates that have the same ["time", "normal"]
-                bolus.sort_values("uploadTime", ascending=False, inplace=True)
-                bolus, nBolusDuplicatesRemoved = \
-                    td.clean.remove_duplicates(bolus, bolus[["deviceTime", "normal"]])
-                metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
-
-                # get a summary of boluses per day
-                bolusDaySummary = get_bolusDaySummary(bolus)
-
-                if "extended" not in bolus:
-                    bolus["extended"] = np.nan
-                    bolus["duration"] = np.nan
-
-                # ISF associated with bolus event
-                if "insulinSensitivities" in list(bolus):
-                    pdb.set_trace()
-                if "carbRatios" in list(bolus):
-                    pdb.set_trace()
-
-                bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
-                bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
-
-                bolusCH = commonColumnHeadings.copy()
-                bolusCH.extend(["utcTime", "roundedTime", "normal", "carbInput", "subType",
-                                "insulinOnBoard", "bgInput",
-                                "isf", "isf_mmolL_U", "insulinCarbRatio"])
-                bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
-                bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
-                bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
-                                                          "bgInput": "bg_mmolL"})
-                bolusEvents["bg_mgdL"] = mmolL_to_mgdL(bolusEvents["bg_mmolL"])
-                bolusEvents["eventType"] = "correction"
-                bolusEvents.loc[bolusEvents["carbInput"] == 0, "eventType"] = "meal"
-
-
-                # %% PUMP SETTINGS
-                if "pumpSettings" in data.type.unique():
-                    pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
-                    pumpSettings.sort_values("uploadTime", ascending=False, inplace=True)
-
-                    pumpSettings, nPumpSettingsDuplicatesRemoved = \
-                    td.clean.remove_duplicates(pumpSettings, pumpSettings[["deviceTime"]])
-                    metadata["nPumpSettingsDuplicatesRemoved"] = nPumpSettingsDuplicatesRemoved
-
-                    # ISF
-                    if "insulinSensitivity.amount" in list(pumpSettings):
-                        isfColHead = "insulinSensitivity"
-                    else:
-                        isfColHead = "insulinSensitivities"
-                        pdb.set_trace()
+    # %% ID, HASHID, AGE, & YLW
+    userID = donors.userID[dIndex]
+    hashID = donors.hashID[dIndex]
+    # round all birthdays and diagnosis dates to the first day of the month (to protect identities)
+    bDate = pd.to_datetime(donors.bDay[dIndex][0:7])
+    dDate = pd.to_datetime(donors.dDay[dIndex][0:7])
 
-                    pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
-                    pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
-                    pumpSettings["isfTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                        pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
 
-                    isfCH = commonColumnHeadings.copy()
-                    isfCH.extend(["isfTime", "isf", "isf_mmolL_U"])
-                    isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfCH]
+    # %% LOAD IN DONOR JSON DATA
+    metadata = pd.DataFrame(index=[dIndex])
+    jsonDataPath = os.path.join(donorPath, phiDate + "-donorJsonData")
+    jsonFileName = os.path.join(jsonDataPath, "PHI-" + userID + ".json")
 
-                    # CIR
-                    if "carbRatio.amount" in list(pumpSettings):
-                        cirColHead = "carbRatio"
-                    else:
-                        cirColHead = "carbRatios"
-                        pdb.set_trace()
+    if os.path.exists(jsonFileName):
+        fileSize = os.stat(jsonFileName).st_size
+        metadata["fileSizeKB"] = fileSize / 1000
+        if fileSize > 1000:
+            data = td.load.load_json(jsonFileName)
 
-                    pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
-                    pumpSettings["cirTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                        pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
+            # sort the data by time
+            data.sort_values("time", inplace=True)
 
-                    cirCH = commonColumnHeadings.copy()
-                    cirCH.extend(["cirTime", "cir"])
-                    cir = pumpSettings.loc[pumpSettings["cir"].notnull(), cirCH]
+            # flatten the embedded json
+            data = flattenJson(data)
 
 
-                    # CORRECTION TARGET
-                    if "bgTarget.start" in list(pumpSettings):
-                        bgTargetColHead = "bgTarget"
-                    else:
-                        bgTargetColHead = "bgTargets"
-                        pdb.set_trace()
+            # %% CLEAN DATA
+            # remove negative durations
+            data, nNegativeDurations = removeNegativeDurations(data)
+            metadata["nNegativeDurations"] = nNegativeDurations
 
-                    # low
-                    if bgTargetColHead + ".low" in list(pumpSettings):
-                        pumpSettings["correctionTargetLow_mmolL"] = pumpSettings[bgTargetColHead + ".low"]
-                        pumpSettings["correctionTargetLow"] = \
-                            mmolL_to_mgdL(pumpSettings["correctionTargetLow_mmolL"])
-                    else:
-                        pumpSettings["correctionTargetLow_mmolL"] = np.nan
-                        pumpSettings["correctionTargetLow"] = np.nan
+            # get rid of cgm values too low/high (< 38 & > 402 mg/dL)
+            data, nInvalidCgmValues = removeInvalidCgmValues(data)
+            metadata["nInvalidCgmValues"] = nInvalidCgmValues
 
-                    # high
-                    if bgTargetColHead + ".high" in list(pumpSettings):
-                        pumpSettings["correctionTargetHigh_mmolL"] = pumpSettings[bgTargetColHead + ".high"]
-                        pumpSettings["correctionTargetHigh"] = \
-                            mmolL_to_mgdL(pumpSettings["correctionTargetHigh_mmolL"])
+            # Tslim calibration bug fix
+            data, nTandemAndPayloadCalReadings = tslimCalibrationFix(data)
+            metadata["nTandemAndPayloadCalReadings"] = nTandemAndPayloadCalReadings
 
-                    else:
-                        pumpSettings["correctionTargetHigh_mmolL"] = np.nan
-                        pumpSettings["correctionTargetHigh"] = np.nan
 
-                    # target
-                    if bgTargetColHead + ".target" in list(pumpSettings):
-                        pumpSettings["correctionTarget_mmolL"] = pumpSettings[bgTargetColHead + ".target"]
-                        pumpSettings["correctionTarget"] = \
-                            mmolL_to_mgdL(pumpSettings["correctionTarget_mmolL"])
+            # %% ADD UPLOAD DATE
+            # attach upload time to each record, for resolving duplicates
+            if "upload" in data.type.unique():
+                data = addUploadDate(data)
 
-                    else:
-                        pumpSettings["correctionTarget_mmolL"] = np.nan
-                        pumpSettings["correctionTarget"] = np.nan
 
-                    # range
-                    if bgTargetColHead + ".range" in list(pumpSettings):
-                        pumpSettings["correctionTargetRange_mmolL"] = pumpSettings[bgTargetColHead + ".range"]
-                        pumpSettings["correctionTargetRange"] = \
-                            mmolL_to_mgdL(pumpSettings["correctionTargetRange_mmolL"])
+                # %% TIME (UTC, TIMEZONE, DAY AND EVENTUALLY LOCAL TIME)
+                data["utcTime"] = pd.to_datetime(data["time"])
+                data["timezone"].fillna(method='ffill', inplace=True)
+                data["timezone"].fillna(method='bfill', inplace=True)
+                data["day"] = pd.DatetimeIndex(data["utcTime"]).date
 
-                    else:
-                        pumpSettings["correctionTargetRange_mmolL"] = np.nan
-                        pumpSettings["correctionTargetRange"] =np.nan
-
-                    pumpSettings["ctTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                        pd.to_timedelta(pumpSettings[bgTargetColHead + ".start"], unit="ms")
-
-                    ctCH = commonColumnHeadings.copy()
-                    ctCH.extend(["ctTime", "correctionTargetLow", "correctionTargetHigh",
-                                 "correctionTarget", "correctionTargetRange"])
-                    correctionTarget = pumpSettings.loc[pumpSettings["ctTime"].notnull(), ctCH]
-
-                    # SCHEDULED BASAL RATES
-                    sbrCH = commonColumnHeadings.copy()
-                    sbrCH.extend(["sbrTime", "rate", "type"])
-                    sbr = pd.DataFrame(columns=sbrCH)
-                    for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
-                        if 'Auto Mode' not in actSched:
-                            tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
-                            tempDF["day"] = pumpSettings.loc[p, "day"]
-                            tempDF["type"] = np.nan
-                            tempDF["sbrTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                # round to the nearest 5 minutes
+                # TODO: once roundTime is pushed to tidals repository then this line can be replaced
+                # with td.clean.round_time
+                data = round_time(data, timeIntervalMinutes=5, timeField="time",
+                                  roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
+                                  verbose=False)
+                data.sort_values("uploadTime", ascending=False, inplace=True)
+
+
+                # %% ID, HASHID, AGE, & YLW
+                data["userID"] = userID
+                data["hashID"] = hashID
+                data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
+                data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
+
+                commonColumnHeadings = ["hashID",
+                                        "age",
+                                        "ylw"]
+
+
+                # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
+                bolus = mergeWizardWithBolus(data)
+                if len(bolus) > 0:
+                    # get rid of duplicates that have the same ["time", "normal"]
+                    bolus.sort_values("uploadTime", ascending=False, inplace=True)
+                    bolus, nBolusDuplicatesRemoved = \
+                        td.clean.remove_duplicates(bolus, bolus[["deviceTime", "normal"]])
+                    metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
+
+                    # get a summary of boluses per day
+                    bolusDaySummary = get_bolusDaySummary(bolus)
+
+                    if "extended" not in bolus:
+                        bolus["extended"] = np.nan
+                        bolus["duration"] = np.nan
+
+                    # cir associated with bolus event
+                    if "insulinSensitivities" in list(bolus):
+                        pdb.set_trace()
+                    if "carbRatios" in list(bolus):
+                        pdb.set_trace()
+
+                    bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
+                    bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
+
+                    bolusCH = commonColumnHeadings.copy()
+                    bolusCH.extend(["utcTime", "roundedTime", "normal", "carbInput", "subType",
+                                    "insulinOnBoard", "bgInput",
+                                    "isf", "isf_mmolL_U", "insulinCarbRatio"])
+                    bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
+                    bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
+                    bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
+                                                              "bgInput": "bg_mmolL"})
+                    bolusEvents["bg_mgdL"] = mmolL_to_mgdL(bolusEvents["bg_mmolL"])
+                    bolusEvents["eventType"] = "correction"
+                    bolusEvents.loc[bolusEvents["carbInput"] == 0, "eventType"] = "meal"
+
+
+                    # %% PUMP SETTINGS
+                    if "pumpSettings" in data.type.unique():
+                        pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
+                        pumpSettings.sort_values("uploadTime", ascending=False, inplace=True)
+
+                        pumpSettings, nPumpSettingsDuplicatesRemoved = \
+                        td.clean.remove_duplicates(pumpSettings, pumpSettings[["deviceTime"]])
+                        metadata["nPumpSettingsDuplicatesRemoved"] = nPumpSettingsDuplicatesRemoved
+
+                        # ISF
+                        isfCH = commonColumnHeadings.copy()
+                        isfCH.extend(["isfTime", "isf", "isf_mmolL_U"])
+
+                        if "insulinSensitivity.amount" in list(pumpSettings):
+                            isfColHead = "insulinSensitivity"
+                            pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
+                            pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
+                            pumpSettings["isfTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                                pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
+
+                            isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfCH]
                         else:
-                            tempDF = pd.DataFrame(index=[0])
-                            tempDF["sbrTime"] = np.nan
-                            tempDF["rate"] = np.nan
-                            tempDF["type"] = "AutoMode"
+                            isfColHead = "insulinSensitivities"
+                            isf = pd.DataFrame(columns=isfCH)
+                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
+                                tempDF["day"] = pumpSettings.loc[p, "day"]
+                                tempDF["isfTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
+                                tempDF["age"] = pumpSettings.loc[p, "age"]
+                                tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
+                                tempDF["isf_mmolL_U"] = tempDF["amount"]
+                                tempDF["isf"] = mmolL_to_mgdL(tempDF["isf_mmolL_U"])
+                                isf = pd.concat([isf, tempDF[isfCH]], ignore_index=True)
+
+
+
+                        # CIR
+                        cirCH = commonColumnHeadings.copy()
+                        cirCH.extend(["cirTime", "cir"])
+
+                        if "carbRatio.amount" in list(pumpSettings):
+                            cirColHead = "carbRatio"
+                            pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
+                            pumpSettings["cirTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                                pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
+
+                            cir = pumpSettings.loc[pumpSettings["cir"].notnull(), cirCH]
+                        else:
+                            cirColHead = "carbRatios"
+                            cir = pd.DataFrame(columns=cirCH)
+                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                tempDF = pd.DataFrame(pumpSettings.loc[p, cirColHead + "." + actSched])
+                                tempDF["day"] = pumpSettings.loc[p, "day"]
+                                tempDF["cirTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
+                                tempDF["age"] = pumpSettings.loc[p, "age"]
+                                tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
+                                tempDF["cir"] = tempDF["amount"].astype(float)
+                                cir = pd.concat([cir, tempDF[cirCH]], ignore_index=True)
+
+
+                        # CORRECTION TARGET
+                        ctCH = commonColumnHeadings.copy()
+                        ctCH.extend(["ctTime", "correctionTargetLow", "correctionTargetHigh",
+                                     "correctionTarget", "correctionTargetRange"])
+                        if "bgTarget.start" in list(pumpSettings):
+                            bgTargetColHead = "bgTarget"
+
+                            # low
+                            if bgTargetColHead + ".low" in list(pumpSettings):
+                                pumpSettings["correctionTargetLow_mmolL"] = pumpSettings[bgTargetColHead + ".low"]
+                                pumpSettings["correctionTargetLow"] = \
+                                    mmolL_to_mgdL(pumpSettings["correctionTargetLow_mmolL"])
+                            else:
+                                pumpSettings["correctionTargetLow_mmolL"] = np.nan
+                                pumpSettings["correctionTargetLow"] = np.nan
+
+                            # high
+                            if bgTargetColHead + ".high" in list(pumpSettings):
+                                pumpSettings["correctionTargetHigh_mmolL"] = pumpSettings[bgTargetColHead + ".high"]
+                                pumpSettings["correctionTargetHigh"] = \
+                                    mmolL_to_mgdL(pumpSettings["correctionTargetHigh_mmolL"])
+
+                            else:
+                                pumpSettings["correctionTargetHigh_mmolL"] = np.nan
+                                pumpSettings["correctionTargetHigh"] = np.nan
+
+                            # target
+                            if bgTargetColHead + ".target" in list(pumpSettings):
+                                pumpSettings["correctionTarget_mmolL"] = pumpSettings[bgTargetColHead + ".target"]
+                                pumpSettings["correctionTarget"] = \
+                                    mmolL_to_mgdL(pumpSettings["correctionTarget_mmolL"])
+
+                            else:
+                                pumpSettings["correctionTarget_mmolL"] = np.nan
+                                pumpSettings["correctionTarget"] = np.nan
+
+                            # range
+                            if bgTargetColHead + ".range" in list(pumpSettings):
+                                pumpSettings["correctionTargetRange_mmolL"] = pumpSettings[bgTargetColHead + ".range"]
+                                pumpSettings["correctionTargetRange"] = \
+                                    mmolL_to_mgdL(pumpSettings["correctionTargetRange_mmolL"])
+
+                            else:
+                                pumpSettings["correctionTargetRange_mmolL"] = np.nan
+                                pumpSettings["correctionTargetRange"] =np.nan
+
+                            pumpSettings["ctTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                                pd.to_timedelta(pumpSettings[bgTargetColHead + ".start"], unit="ms")
+
+
+                            correctionTarget = pumpSettings.loc[pumpSettings["ctTime"].notnull(), ctCH]
 
-                        tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
-                        tempDF["age"] = pumpSettings.loc[p, "age"]
-                        tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
-                        sbr = pd.concat([sbr, tempDF[sbrCH]], ignore_index=True)
+                        else:
+                            bgTargetColHead = "bgTargets"
+                            pdb.set_trace()
 
 
-                    # %% ACTUAL BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
-                    if "basal" in data.type.unique():
-                        basal = data[data.type == "basal"].copy().dropna(axis=1, how="all")
-                        basal.sort_values("uploadTime", ascending=False, inplace=True)
 
-                        basal, nBasalDuplicatesRemoved = \
-                            td.clean.remove_duplicates(basal, basal[["deliveryType", "deviceTime", "duration", "rate"]])
-                        metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
+                        # SCHEDULED BASAL RATES
+                        sbrCH = commonColumnHeadings.copy()
+                        sbrCH.extend(["sbrTime", "rate", "type"])
+                        sbr = pd.DataFrame(columns=sbrCH)
+                        for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                            if 'Auto Mode' not in actSched:
+                                tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
+                                tempDF["day"] = pumpSettings.loc[p, "day"]
+                                tempDF["type"] = np.nan
+                                tempDF["sbrTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                            else:
+                                tempDF = pd.DataFrame(index=[0])
+                                tempDF["sbrTime"] = np.nan
+                                tempDF["rate"] = np.nan
+                                tempDF["type"] = "AutoMode"
 
-                        # fill NaNs with 0, as it indicates a suspend (temp basal of 0)
-                        basal.rate.fillna(0, inplace=True)
+                            tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
+                            tempDF["age"] = pumpSettings.loc[p, "age"]
+                            tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
+                            sbr = pd.concat([sbr, tempDF[sbrCH]], ignore_index=True)
 
-                        # get rid of basals that have durations of 0
-                        nBasalDuration0 = sum(basal.duration > 0)
-                        basal = basal[basal.duration > 0]
-                        metadata["basal.nBasalDuration0"] = nBasalDuration0
 
-                        # get rid of basal durations that are unrealistic
-                        nUnrealisticBasalDuration = ((basal.duration < 0) | (basal.duration > 86400000))
-                        metadata["nUnrealisticBasalDuration"] = sum(nUnrealisticBasalDuration)
-                        basal.loc[nUnrealisticBasalDuration, "duration"] = np.nan
+                        # %% ACTUAL BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
+                        if "basal" in data.type.unique():
+                            basal = data[data.type == "basal"].copy().dropna(axis=1, how="all")
+                            basal.sort_values("uploadTime", ascending=False, inplace=True)
 
-                        # calculate the total amount of insulin delivered (duration * rate)
-                        basal["durationHours"] = basal["duration"] / 1000.0 / 3600.0
-                        basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
+                            basal, nBasalDuplicatesRemoved = \
+                                td.clean.remove_duplicates(basal, basal[["deliveryType", "deviceTime", "duration", "rate"]])
+                            metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
 
-                        # get a summary of basals per day
-                        basalDaySummary = get_basalDaySummary(basal)
+                            # fill NaNs with 0, as it indicates a suspend (temp basal of 0)
+                            basal.rate.fillna(0, inplace=True)
 
+                            # keep only basals with duration > 0 (NOTE: nBasalDuration0 counts the rows kept, not the rows removed)
+                            nBasalDuration0 = sum(basal.duration > 0)
+                            basal = basal[basal.duration > 0]
+                            metadata["basal.nBasalDuration0"] = nBasalDuration0
 
-# %% LOOP DATA (BINARY T/F)
+                            # get rid of basal durations that are unrealistic
+                            nUnrealisticBasalDuration = ((basal.duration < 0) | (basal.duration > 86400000))
+                            metadata["nUnrealisticBasalDuration"] = sum(nUnrealisticBasalDuration)
+                            basal.loc[nUnrealisticBasalDuration, "duration"] = np.nan
 
+                            # calculate the total amount of insulin delivered (duration * rate)
+                            basal["durationHours"] = basal["duration"] / 1000.0 / 3600.0
+                            basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
 
-# %% CGM DATA
+                            # get a summary of basals per day
+                            basalDaySummary = get_basalDaySummary(basal)
 
 
-# %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
+    # %% LOOP DATA (BINARY T/F)
 
 
-# %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
+    # %% CGM DATA
 
 
-# %% SAVE RESULTS
+    # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
 
 
-# %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
+    # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
+
+
+    # %% SAVE RESULTS
+
+
+    # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
+                        else:
+                            metadata["flags"] = "no basal data"
                     else:
-                        metadata["flags"] = "no basal data"
+                        metadata["flags"] = "no pump settings"
                 else:
-                    metadata["flags"] = "no pump settings"
+                    metadata["flags"] = "no bolus wizard data"
             else:
-                metadata["flags"] = "no bolus wizard data"
+                metadata["flags"] = "no upload data"
         else:
-            metadata["flags"] = "no upload data"
+            metadata["flags"] = "file contains no data"
     else:
-        metadata["flags"] = "file contains no data"
-else:
-    metadata["flags"] = "file does not exist"
+        metadata["flags"] = "file does not exist"
+
+    print("done with", dIndex)
+
 
 # %% V2 DATA TO GRAB
+# MAX BASAL RATE, MAX BOLUS AMOUNT, AND INSULIN DURATION SET ON SELECT PUMPS
 # RE-EVALUATE THE WAY EXTENDED BOLUSES ARE BEING ACCOUNTED (ARE THEY ALSO SHOWING UP IN BASAL DATA?)
 # ALERT SETTINGS
 # ESTIMATED LOCAL TIME

From 9d1349f2e3e4e993ef115e5b7ced54cbb1aa319a Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 06:38:04 -0600
Subject: [PATCH 17/78] refactor correction target

---
 .../get-users-settings-and-events.py          | 103 ++++++------------
 1 file changed, 36 insertions(+), 67 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 9fb1f148..bda5ea62 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -333,6 +333,9 @@ def get_basalDaySummary(basal):
 # this is where the loop will go:
 for dIndex in range(0, len(donors)):
 
+    # clear output dataframes
+    isf, cir, correctionTarget = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
+
     # %% ID, HASHID, AGE, & YLW
     userID = donors.userID[dIndex]
     hashID = donors.hashID[dIndex]
@@ -453,120 +456,86 @@ def get_basalDaySummary(basal):
                         metadata["nPumpSettingsDuplicatesRemoved"] = nPumpSettingsDuplicatesRemoved
 
                         # ISF
-                        isfCH = commonColumnHeadings.copy()
-                        isfCH.extend(["isfTime", "isf", "isf_mmolL_U"])
+                        isfColHeadings = commonColumnHeadings.copy()
+                        isfColHeadings.extend(["isf.time", "isf", "isf_mmolL_U"])
 
                         if "insulinSensitivity.amount" in list(pumpSettings):
                             isfColHead = "insulinSensitivity"
                             pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
                             pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
-                            pumpSettings["isfTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                            pumpSettings["isf.time"] = pd.to_datetime(pumpSettings["day"]) + \
                                 pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
 
-                            isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfCH]
+                            isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfColHeadings]
                         else:
                             isfColHead = "insulinSensitivities"
-                            isf = pd.DataFrame(columns=isfCH)
+                            isf = pd.DataFrame(columns=isfColHeadings)
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["isfTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["isf.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                 tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                                 tempDF["age"] = pumpSettings.loc[p, "age"]
                                 tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
                                 tempDF["isf_mmolL_U"] = tempDF["amount"]
                                 tempDF["isf"] = mmolL_to_mgdL(tempDF["isf_mmolL_U"])
-                                isf = pd.concat([isf, tempDF[isfCH]], ignore_index=True)
-
-
+                                isf = pd.concat([isf, tempDF[isfColHeadings]], ignore_index=True)
 
                         # CIR
-                        cirCH = commonColumnHeadings.copy()
-                        cirCH.extend(["cirTime", "cir"])
+                        cirColHeadings = commonColumnHeadings.copy()
+                        cirColHeadings.extend(["cir.time", "cir"])
 
                         if "carbRatio.amount" in list(pumpSettings):
                             cirColHead = "carbRatio"
                             pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
-                            pumpSettings["cirTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                            pumpSettings["cir.time"] = pd.to_datetime(pumpSettings["day"]) + \
                                 pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
 
-                            cir = pumpSettings.loc[pumpSettings["cir"].notnull(), cirCH]
+                            cir = pumpSettings.loc[pumpSettings["carbRatio.amount"].notnull(), cirColHeadings]
                         else:
                             cirColHead = "carbRatios"
-                            cir = pd.DataFrame(columns=cirCH)
+                            cir = pd.DataFrame(columns=cirColHeadings)
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, cirColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["cirTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["cir.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                 tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                                 tempDF["age"] = pumpSettings.loc[p, "age"]
                                 tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
                                 tempDF["cir"] = tempDF["amount"].astype(float)
-                                cir = pd.concat([cir, tempDF[cirCH]], ignore_index=True)
+                                cir = pd.concat([cir, tempDF[cirColHeadings]], ignore_index=True)
 
 
                         # CORRECTION TARGET
-                        ctCH = commonColumnHeadings.copy()
-                        ctCH.extend(["ctTime", "correctionTargetLow", "correctionTargetHigh",
-                                     "correctionTarget", "correctionTargetRange"])
+                        ctColHeadings = commonColumnHeadings.copy()
+                        ctColHeadings.extend(["ct.time", "ct.low", "ct.high", "ct.target", "ct.range"])
                         if "bgTarget.start" in list(pumpSettings):
-                            bgTargetColHead = "bgTarget"
-
-                            # low
-                            if bgTargetColHead + ".low" in list(pumpSettings):
-                                pumpSettings["correctionTargetLow_mmolL"] = pumpSettings[bgTargetColHead + ".low"]
-                                pumpSettings["correctionTargetLow"] = \
-                                    mmolL_to_mgdL(pumpSettings["correctionTargetLow_mmolL"])
-                            else:
-                                pumpSettings["correctionTargetLow_mmolL"] = np.nan
-                                pumpSettings["correctionTargetLow"] = np.nan
-
-                            # high
-                            if bgTargetColHead + ".high" in list(pumpSettings):
-                                pumpSettings["correctionTargetHigh_mmolL"] = pumpSettings[bgTargetColHead + ".high"]
-                                pumpSettings["correctionTargetHigh"] = \
-                                    mmolL_to_mgdL(pumpSettings["correctionTargetHigh_mmolL"])
+                            bgTargetColHead = "bgTarget."
 
-                            else:
-                                pumpSettings["correctionTargetHigh_mmolL"] = np.nan
-                                pumpSettings["correctionTargetHigh"] = np.nan
+                            for targetType in ["low", "high", "target", "range"]:
+                                if bgTargetColHead + targetType in list(pumpSettings):
+                                    pumpSettings["ct." + targetType + "_mmolL"] = \
+                                        pumpSettings[bgTargetColHead + targetType]
 
-                            # target
-                            if bgTargetColHead + ".target" in list(pumpSettings):
-                                pumpSettings["correctionTarget_mmolL"] = pumpSettings[bgTargetColHead + ".target"]
-                                pumpSettings["correctionTarget"] = \
-                                    mmolL_to_mgdL(pumpSettings["correctionTarget_mmolL"])
+                                    pumpSettings["ct." + targetType] = \
+                                        mmolL_to_mgdL(pumpSettings["ct." + targetType + "_mmolL"])
+                                else:
+                                    pumpSettings["ct." + targetType + "_mmolL"] = np.nan
+                                    pumpSettings["ct." + targetType]  = np.nan
 
-                            else:
-                                pumpSettings["correctionTarget_mmolL"] = np.nan
-                                pumpSettings["correctionTarget"] = np.nan
+                            pumpSettings["ct.time"] = pd.to_datetime(pumpSettings["day"]) + \
+                                pd.to_timedelta(pumpSettings[bgTargetColHead + "start"], unit="ms")
 
-                            # range
-                            if bgTargetColHead + ".range" in list(pumpSettings):
-                                pumpSettings["correctionTargetRange_mmolL"] = pumpSettings[bgTargetColHead + ".range"]
-                                pumpSettings["correctionTargetRange"] = \
-                                    mmolL_to_mgdL(pumpSettings["correctionTargetRange_mmolL"])
-
-                            else:
-                                pumpSettings["correctionTargetRange_mmolL"] = np.nan
-                                pumpSettings["correctionTargetRange"] =np.nan
-
-                            pumpSettings["ctTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                                pd.to_timedelta(pumpSettings[bgTargetColHead + ".start"], unit="ms")
-
-
-                            correctionTarget = pumpSettings.loc[pumpSettings["ctTime"].notnull(), ctCH]
+                            correctionTarget = pumpSettings.loc[pumpSettings["bgTarget.start"].notnull(), ctColHeadings]
 
                         else:
                             bgTargetColHead = "bgTargets"
                             pdb.set_trace()
 
-
-
                         # SCHEDULED BASAL RATES
-                        sbrCH = commonColumnHeadings.copy()
-                        sbrCH.extend(["sbrTime", "rate", "type"])
-                        sbr = pd.DataFrame(columns=sbrCH)
+                        sbrColHeadings = commonColumnHeadings.copy()
+                        sbrColHeadings.extend(["sbrTime", "rate", "type"])
+                        sbr = pd.DataFrame(columns=sbrColHeadings)
                         for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                             if 'Auto Mode' not in actSched:
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
@@ -582,7 +551,7 @@ def get_basalDaySummary(basal):
                             tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                             tempDF["age"] = pumpSettings.loc[p, "age"]
                             tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
-                            sbr = pd.concat([sbr, tempDF[sbrCH]], ignore_index=True)
+                            sbr = pd.concat([sbr, tempDF[sbrColHeadings]], ignore_index=True)
 
 
                         # %% ACTUAL BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))

From 307bb2efc2dcd11d6f8c9e2691300b50b973e75b Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 07:46:48 -0600
Subject: [PATCH 18/78] handle multiple correctionTargets

---
 .../get-users-settings-and-events.py          | 32 +++++++++++++++----
 1 file changed, 26 insertions(+), 6 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index bda5ea62..75d928f8 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -509,13 +509,14 @@ def get_basalDaySummary(basal):
                         # CORRECTION TARGET
                         ctColHeadings = commonColumnHeadings.copy()
                         ctColHeadings.extend(["ct.time", "ct.low", "ct.high", "ct.target", "ct.range"])
+                        correctionTarget
                         if "bgTarget.start" in list(pumpSettings):
-                            bgTargetColHead = "bgTarget."
+                            ctColHead = "bgTarget."
 
                             for targetType in ["low", "high", "target", "range"]:
-                                if bgTargetColHead + targetType in list(pumpSettings):
+                                if ctColHead + targetType in list(pumpSettings):
                                     pumpSettings["ct." + targetType + "_mmolL"] = \
-                                        pumpSettings[bgTargetColHead + targetType]
+                                        pumpSettings[ctColHead + targetType]
 
                                     pumpSettings["ct." + targetType] = \
                                         mmolL_to_mgdL(pumpSettings["ct." + targetType + "_mmolL"])
@@ -524,13 +525,32 @@ def get_basalDaySummary(basal):
                                     pumpSettings["ct." + targetType]  = np.nan
 
                             pumpSettings["ct.time"] = pd.to_datetime(pumpSettings["day"]) + \
-                                pd.to_timedelta(pumpSettings[bgTargetColHead + "start"], unit="ms")
+                                pd.to_timedelta(pumpSettings[ctColHead + "start"], unit="ms")
 
                             correctionTarget = pumpSettings.loc[pumpSettings["bgTarget.start"].notnull(), ctColHeadings]
 
                         else:
-                            bgTargetColHead = "bgTargets"
-                            pdb.set_trace()
+                            ctColHead = "bgTargets"
+                            correctionTarget = pd.DataFrame(columns=ctColHeadings)
+                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                tempDF = pd.DataFrame(pumpSettings.loc[p, ctColHead + "." + actSched])
+                                tempDF["day"] = pumpSettings.loc[p, "day"]
+                                tempDF["ct.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
+                                tempDF["age"] = pumpSettings.loc[p, "age"]
+                                tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
+                                for targetType in ["low", "high", "target", "range"]:
+                                    if targetType in list(tempDF):
+                                        tempDF["ct." + targetType + "_mmolL"] = \
+                                            tempDF[targetType]
+
+                                        tempDF["ct." + targetType] = \
+                                            mmolL_to_mgdL(tempDF["ct." + targetType + "_mmolL"])
+                                    else:
+                                        tempDF["ct." + targetType + "_mmolL"] = np.nan
+                                        tempDF["ct." + targetType]  = np.nan
+
+                                correctionTarget = pd.concat([correctionTarget, tempDF[ctColHeadings]], ignore_index=True)
 
                         # SCHEDULED BASAL RATES
                         sbrColHeadings = commonColumnHeadings.copy()

From 2190b24705c8109997a377e1512686dfbf0bced6 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 08:58:41 -0600
Subject: [PATCH 19/78] add cgm data

---
 .../get-users-settings-and-events.py          | 166 +++++++++++++++++-
 1 file changed, 160 insertions(+), 6 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 75d928f8..701d2302 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -322,6 +322,97 @@ def get_basalDaySummary(basal):
     return basalDaySummary
 
 
+def filterAndSort(groupedDF, filterByField, sortByField):
+    filterDF = groupedDF.get_group(filterByField).dropna(axis=1, how="all")
+    filterDF = filterDF.sort_values(sortByField)
+    return filterDF
+
+
+def getClosedLoopDays(groupedData, nTempBasalsPerDayIsClosedLoop, metadata):
+    # filter by basal data and sort by time
+    if "basal" in groupedData.type.unique():
+        basalData = filterAndSort(groupedData, "basal", "time")
+
+        # get closed loop days
+        nTB = nTempBasalsPerDayIsClosedLoop
+
+        tbDataFrame = basalData.loc[basalData.deliveryType == "temp", ["time"]]
+        tbDataFrame.index = pd.to_datetime(tbDataFrame["time"])
+        tbDataFrame = tbDataFrame.drop(["time"], axis=1)
+        tbDataFrame["basal.temp.count"] = 1
+        nTempBasalsPerDay = tbDataFrame.resample("D").sum()
+        closedLoopDF = pd.DataFrame(nTempBasalsPerDay,
+                                    index=nTempBasalsPerDay.index.date)
+        closedLoopDF["date"] = nTempBasalsPerDay.index.date
+        closedLoopDF["basal.closedLoopDays"] = \
+            closedLoopDF["basal.temp.count"] >= nTB
+        nClosedLoopDays = closedLoopDF["basal.closedLoopDays"].sum()
+
+        # get the number of days with 670g
+        basalData["date"] = pd.to_datetime(basalData.time).dt.date
+        bdGroup = basalData.groupby("date")
+        topPump = bdGroup.deviceId.describe()["top"]
+        med670g = pd.DataFrame(topPump.str.contains("1780")).rename(columns={"top":"670g"})
+        med670g.reset_index(inplace=True)
+        n670gDays = med670g["670g"].sum()
+
+    else:
+        closedLoopDF = pd.DataFrame(columns=["basal.closedLoopDays", "date"])
+        med670g = pd.DataFrame(columns=["670g", "date"])
+        nClosedLoopDays = 0
+        n670gDays = 0
+
+    metadata["basal.closedLoopDays.count"] = nClosedLoopDays
+    metadata["med670gDays.count"] = n670gDays
+
+    return closedLoopDF, med670g, metadata
+
+
+def removeDuplicates(df, criteriaDF):
+    nBefore = len(df)
+    df = df.loc[~(df[criteriaDF].duplicated())]
+    df = df.reset_index(drop=True)
+    nDuplicatesRemoved = nBefore - len(df)
+
+    return df, nDuplicatesRemoved
+
+
+def removeCgmDuplicates(df, timeCriterion):
+    if timeCriterion in df:
+        df.sort_values(by=[timeCriterion, "uploadTime"],
+                       ascending=[False, False],
+                       inplace=True)
+        dfIsNull = df[df[timeCriterion].isnull()]
+        dfNotNull = df[df[timeCriterion].notnull()]
+        dfNotNull, nDuplicatesRemoved = removeDuplicates(dfNotNull, [timeCriterion, "value"])
+        df = pd.concat([dfIsNull, dfNotNull])
+        df.sort_values(by=[timeCriterion, "uploadTime"],
+                       ascending=[False, False],
+                       inplace=True)
+    else:
+        nDuplicatesRemoved = 0
+
+    return df, nDuplicatesRemoved
+
+
+def getStartAndEndTimes(df, dateTimeField):
+    dfBeginDate = df[dateTimeField].min()
+    dfEndDate = df[dateTimeField].max()
+
+    return dfBeginDate, dfEndDate
+
+
+def getListOfDexcomCGMDays(df):
+    # search for dexcom cgms
+    searchfor = ["Dex", "tan", "IR", "unk"]
+    # create dexcom boolean field
+    if "deviceId" in df.columns.values:
+        totalCgms = len(df.deviceId.notnull())
+        df["dexcomCGM"] = df.deviceId.str.contains("|".join(searchfor))
+        percentDexcomCGM = df.dexcomCGM.sum() / totalCgms * 100
+    return df, percentDexcomCGM
+
+
 # %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
 dataPulledDate = "2018-09-28"
 phiDate = "PHI-" + dataPulledDate
@@ -443,7 +534,12 @@ def get_basalDaySummary(basal):
                                                               "bgInput": "bg_mmolL"})
                     bolusEvents["bg_mgdL"] = mmolL_to_mgdL(bolusEvents["bg_mmolL"])
                     bolusEvents["eventType"] = "correction"
-                    bolusEvents.loc[bolusEvents["carbInput"] == 0, "eventType"] = "meal"
+                    bolusEvents.loc[bolusEvents["carbInput"] > 0, "eventType"] = "meal"
+
+                    # get start and end times
+                    bolusBeginDate, bolusEndDate = getStartAndEndTimes(bolus, "day")
+                    metadata["bolus.beginDate"] = bolusBeginDate
+                    metadata["bolus.endDate"] = bolusEndDate
 
 
                     # %% PUMP SETTINGS
@@ -509,7 +605,7 @@ def get_basalDaySummary(basal):
                         # CORRECTION TARGET
                         ctColHeadings = commonColumnHeadings.copy()
                         ctColHeadings.extend(["ct.time", "ct.low", "ct.high", "ct.target", "ct.range"])
-                        correctionTarget
+
                         if "bgTarget.start" in list(pumpSettings):
                             ctColHead = "bgTarget."
 
@@ -573,6 +669,14 @@ def get_basalDaySummary(basal):
                             tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
                             sbr = pd.concat([sbr, tempDF[sbrColHeadings]], ignore_index=True)
 
+                        # max basal rate, max bolus amount, and insulin duration
+                        if "rateMaximum" in list(data):
+                            pdb.set_trace()
+                        if "amountMaximum" in list(data):
+                            pdb.set_trace()
+                        if "bolus.calculator" in list(data):
+                            pdb.set_trace()
+
 
                         # %% ACTUAL BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
                         if "basal" in data.type.unique():
@@ -604,10 +708,60 @@ def get_basalDaySummary(basal):
                             basalDaySummary = get_basalDaySummary(basal)
 
 
-    # %% LOOP DATA (BINARY T/F)
-
-
-    # %% CGM DATA
+                            # %% GET CLOSED LOOP DAYS WITH TEMP BASAL DATA
+                            # group data by type
+                            groupedData = data.groupby(by="type")
+
+                            isClosedLoopDay, is670g, metadata = \
+                                getClosedLoopDays(groupedData, 30, metadata)
+
+                            # %% CGM DATA
+                            if "cbg" in data.type.unique():
+
+                                # filter by cgm and sort by uploadTime
+                                cgmData = groupedData.get_group("cbg").dropna(axis=1, how="all")
+
+                                # get rid of duplicates that have the same ["deviceTime", "value"]
+                                cgmData, nCgmDuplicatesRemovedDeviceTime = removeCgmDuplicates(cgmData, "deviceTime")
+                                metadata["nCgmDuplicatesRemovedDeviceTime"] = nCgmDuplicatesRemovedDeviceTime
+
+                                # get rid of duplicates that have the same ["time", "value"]
+                                cgmData, nCgmDuplicatesRemovedUtcTime = removeCgmDuplicates(cgmData, "time")
+                                metadata["cnCgmDuplicatesRemovedUtcTime"] = nCgmDuplicatesRemovedUtcTime
+
+                                # get rid of duplicates that have the same "roundedTime"
+                                cgmData, nCgmDuplicatesRemovedRoundedTime = removeDuplicates(cgmData, "roundedTime")
+                                metadata["nCgmDuplicatesRemovedRoundedTime"] = nCgmDuplicatesRemovedRoundedTime
+
+                                # get start and end times
+                                cgmBeginDate, cgmEndDate = getStartAndEndTimes(cgmData, "day")
+                                metadata["cgm.beginDate"] = cgmBeginDate
+                                metadata["cgm.endDate"] = cgmEndDate
+
+                                # get a list of dexcom cgms
+                                cgmData, percentDexcom = getListOfDexcomCGMDays(cgmData)
+                                metadata["cgm.percentDexcomCGM"] = percentDexcom
+
+                                # group by date (day) and get stats
+                                catDF = cgmData.groupby(cgmData["day"])
+                                cgmRecordsPerDay = \
+                                    pd.DataFrame(catDF.value.count()). \
+                                    rename(columns={"value": "cgm.count"})
+                                dayDate = catDF.day.describe()["top"]
+                                dexcomCGM = catDF.dexcomCGM.describe()["top"]
+                                nTypesCGM = catDF.dexcomCGM.describe()["unique"]
+                                cgmRecordsPerDay["cgm.dexcomOnly"] = \
+                                    (dexcomCGM & (nTypesCGM == 1))
+                                cgmRecordsPerDay["date"] = cgmRecordsPerDay.index
+
+                                # filter the cgm data
+                                cgmColHeadings = commonColumnHeadings.copy()
+                                cgmColHeadings.extend(["utcTime", "roundedTime", "value"])
+
+                                # get data in mg/dL units
+                                cgm = cgmData[cgmColHeadings]
+                                cgm = cgm.rename(columns={'value': 'mmol_L'})
+                                cgm["mg_dL"] = mmolL_to_mgdL(cgm["mmol_L"]).astype(int)
 
 
     # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW

From 1050b6338d456ed41f3d0d3eee94bd381e156a7c Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 09:17:18 -0600
Subject: [PATCH 20/78] make an actual basal rate delivered df

---
 .../predict-simulate/get-users-settings-and-events.py     | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 701d2302..53718d25 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -704,6 +704,11 @@ def getListOfDexcomCGMDays(df):
                             basal["durationHours"] = basal["duration"] / 1000.0 / 3600.0
                             basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
 
+                            # actual basal delivered
+                            abrColHeadings = commonColumnHeadings.copy()
+                            abrColHeadings.extend(["utcTime", "roundedTime", "durationHours", "rate"])
+                            abr = basal[abrColHeadings]
+
                             # get a summary of basals per day
                             basalDaySummary = get_basalDaySummary(basal)
 
@@ -764,7 +769,8 @@ def getListOfDexcomCGMDays(df):
                                 cgm["mg_dL"] = mmolL_to_mgdL(cgm["mmol_L"]).astype(int)
 
 
-    # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
+                                # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
+                                pdb.set_trace()
 
 
     # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)

From 53cfe1ff485db617749b93bcba50f8ad8ba299cd Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 10:10:36 -0600
Subject: [PATCH 21/78] add in extended boluses to the actual basals delivered

---
 .../get-users-settings-and-events.py          | 29 ++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 53718d25..1fcb7ac5 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -511,13 +511,10 @@ def getListOfDexcomCGMDays(df):
                     # get a summary of boluses per day
                     bolusDaySummary = get_bolusDaySummary(bolus)
 
-                    if "extended" not in bolus:
-                        bolus["extended"] = np.nan
-                        bolus["duration"] = np.nan
-
-                    # cir associated with bolus event
+                    # isf and cir associated with bolus event
                     if "insulinSensitivities" in list(bolus):
                         pdb.set_trace()
+
                     if "carbRatios" in list(bolus):
                         pdb.set_trace()
 
@@ -536,6 +533,20 @@ def getListOfDexcomCGMDays(df):
                     bolusEvents["eventType"] = "correction"
                     bolusEvents.loc[bolusEvents["carbInput"] > 0, "eventType"] = "meal"
 
+                    if "duration" in list(bolus):
+                        bolus["durationHours"] = bolus["duration"] / 1000.0 / 3600.0
+                        bolus["rate"] = bolus["extended"] / bolus["durationHours"]
+                        bolusExtendedCH = commonColumnHeadings.copy()
+                        bolusExtendedCH.extend(["utcTime", "roundedTime", "durationHours", "rate",  "type"])
+                        bolusExtendedEvents = bolus.loc[
+                                ((bolus["extended"].notnull()) &
+                                 (bolus["duration"] > 0)), bolusExtendedCH]
+
+                    if "extended" not in bolus:
+                        bolus["extended"] = np.nan
+                        bolus["duration"] = np.nan
+
+
                     # get start and end times
                     bolusBeginDate, bolusEndDate = getStartAndEndTimes(bolus, "day")
                     metadata["bolus.beginDate"] = bolusBeginDate
@@ -706,8 +717,12 @@ def getListOfDexcomCGMDays(df):
 
                             # actual basal delivered
                             abrColHeadings = commonColumnHeadings.copy()
-                            abrColHeadings.extend(["utcTime", "roundedTime", "durationHours", "rate"])
+                            abrColHeadings.extend(["utcTime", "roundedTime", "durationHours", "rate", "type"])
                             abr = basal[abrColHeadings]
+                            if "duration" in list(bolus):
+                                abr = pd.concat([abr, bolusExtendedEvents], ignore_index=True)
+                                abr.sort_values("utcTime", inplace=True)
+
 
                             # get a summary of basals per day
                             basalDaySummary = get_basalDaySummary(basal)
@@ -770,7 +785,7 @@ def getListOfDexcomCGMDays(df):
 
 
                                 # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
-                                pdb.set_trace()
+
 
 
     # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)

From 3258b1bda4812d88d671dd697bfb2cb928ac024b Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 11:53:43 -0600
Subject: [PATCH 22/78] get day-level summary stats by age and years living
 with diabetes (ylw)

---
 .../get-users-settings-and-events.py          | 82 +++++++++++++++----
 1 file changed, 67 insertions(+), 15 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 1fcb7ac5..11780289 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -343,22 +343,22 @@ def getClosedLoopDays(groupedData, nTempBasalsPerDayIsClosedLoop, metadata):
         nTempBasalsPerDay = tbDataFrame.resample("D").sum()
         closedLoopDF = pd.DataFrame(nTempBasalsPerDay,
                                     index=nTempBasalsPerDay.index.date)
-        closedLoopDF["date"] = nTempBasalsPerDay.index.date
+        closedLoopDF["day"] = nTempBasalsPerDay.index.date
         closedLoopDF["basal.closedLoopDays"] = \
             closedLoopDF["basal.temp.count"] >= nTB
         nClosedLoopDays = closedLoopDF["basal.closedLoopDays"].sum()
 
         # get the number of days with 670g
-        basalData["date"] = pd.to_datetime(basalData.time).dt.date
-        bdGroup = basalData.groupby("date")
+        basalData["day"] = pd.to_datetime(basalData.time).dt.date
+        bdGroup = basalData.groupby("day")
         topPump = bdGroup.deviceId.describe()["top"]
         med670g = pd.DataFrame(topPump.str.contains("1780")).rename(columns={"top":"670g"})
         med670g.reset_index(inplace=True)
         n670gDays = med670g["670g"].sum()
 
     else:
-        closedLoopDF = pd.DataFrame(columns=["basal.closedLoopDays", "date"])
-        med670g = pd.DataFrame(columns=["670g", "date"])
+        closedLoopDF = pd.DataFrame(columns=["basal.closedLoopDays", "day"])
+        med670g = pd.DataFrame(columns=["670g", "day"])
         nClosedLoopDays = 0
         n670gDays = 0
 
@@ -421,6 +421,8 @@ def getListOfDexcomCGMDays(df):
 donorList = phiDate + "-uniqueDonorList.csv"
 donors = td.load.load_csv(os.path.join(donorPath, donorList))
 
+# %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
+
 # this is where the loop will go:
 for dIndex in range(0, len(donors)):
 
@@ -694,6 +696,10 @@ def getListOfDexcomCGMDays(df):
                             basal = data[data.type == "basal"].copy().dropna(axis=1, how="all")
                             basal.sort_values("uploadTime", ascending=False, inplace=True)
 
+                            basalBeginDate, basalEndDate = getStartAndEndTimes(basal, "day")
+                            metadata["basal.beginDate"] = basalBeginDate
+                            metadata["basal.endDate"] = basalEndDate
+
                             basal, nBasalDuplicatesRemoved = \
                                 td.clean.remove_duplicates(basal, basal[["deliveryType", "deviceTime", "duration", "rate"]])
                             metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
@@ -723,7 +729,6 @@ def getListOfDexcomCGMDays(df):
                                 abr = pd.concat([abr, bolusExtendedEvents], ignore_index=True)
                                 abr.sort_values("utcTime", inplace=True)
 
-
                             # get a summary of basals per day
                             basalDaySummary = get_basalDaySummary(basal)
 
@@ -786,15 +791,62 @@ def getListOfDexcomCGMDays(df):
 
                                 # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
 
+                                # COMBINE DAY SUMMARIES
+                                # group by date (day) and get stats
+                                catDF = data.groupby(data["day"])
+                                dataPerDay = \
+                                    pd.DataFrame(catDF.hashID.describe()["top"]). \
+                                    rename(columns={"top": "hashID"})
+                                dataPerDay["age"] = catDF.age.mean()
+                                dataPerDay["ylw"] = catDF.ylw.mean()
+
+
+                                # calculate all of the data start and end range
+                                # this can be used for looking at settings
+                                dayBeginDate = min(cgmBeginDate, bolusBeginDate, basalBeginDate)
+                                dayEndDate = max(cgmEndDate, bolusEndDate, basalEndDate)
+                                metadata["day.beginDate"] = dayBeginDate
+                                metadata["day.endDate"] = dayEndDate
+                                rng = pd.date_range(dayBeginDate, dayEndDate).date
+                                dayData = pd.DataFrame(rng, columns=["day"])
+                                for dfType in [dataPerDay, basalDaySummary, bolusDaySummary, cgmRecordsPerDay]:
+                                    dayData = pd.merge(dayData, dfType.reset_index(), on="day", how="left")
+                                for dfType in [isClosedLoopDay, is670g]:
+                                    dayData = pd.merge(dayData, dfType, on="day", how="left")
+
+
+                                dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
+                                dayData["validCGMData"] = dayData["cgm.count"] > (288*.75)
+                                # calculate the start and end of contiguous data
+                                # these dates can be used when simulating and predicting, where
+                                # you need both pump and cgm data
+                                contiguousBeginDate = max(cgmBeginDate, bolusBeginDate, basalBeginDate)
+                                contiguousEndDate = min(cgmEndDate, bolusEndDate, basalEndDate)
+                                metadata["contiguous.beginDate"] = contiguousBeginDate
+                                metadata["contiguous.endDate"] = contiguousEndDate
+
+                                # get a summary by age, and ylw
+                                catDF = dayData.groupby("age")
+                                ageSummary = pd.DataFrame(catDF.validPumpData.sum())
+                                ageSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
+                                ageSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
+                                ageSummary["nDaysclosedLopp"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
+                                ageSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+                                ageSummary.reset_index(inplace=True)
+
+                                catDF = dayData.groupby("ylw")
+                                ylwSummary = pd.DataFrame(catDF.validPumpData.sum())
+                                ylwSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
+                                ylwSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
+                                ylwSummary["nDaysclosedLopp"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
+                                ylwSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+                                ylwSummary.reset_index(inplace=True)
+
+                                # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
+
+
+                                # %% SAVE RESULTS
 
-
-    # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
-
-
-    # %% SAVE RESULTS
-
-
-    # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
                         else:
                             metadata["flags"] = "no basal data"
                     else:
@@ -812,8 +864,8 @@ def getListOfDexcomCGMDays(df):
 
 
 # %% V2 DATA TO GRAB
+# FIGURE OUT WHY TEMP BASAL COUNTS ARE DIFFERENT BETWEEN THE TWO DIFFERENT METHODS
 # MAX BASAL RATE, MAX BOLUS AMOUNT, AND INSULIN DURATION SET ON SELECT PUMPS
-# RE-EVALUATE THE WAY EXTENDED BOLUSES ARE BEING ACCOUNTED (ARE THEY ALSO SHOWING UP IN BASAL DATA?)
 # ALERT SETTINGS
 # ESTIMATED LOCAL TIME
 # PUMP AND CGM DEVICE ()

From 5cf9cdcb328fa5c271f85b7272edee8543412257 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 13:59:09 -0600
Subject: [PATCH 23/78] require pump and cgm data for this analysis

---
 .../get-users-settings-and-events.py          | 570 +++++++++---------
 1 file changed, 286 insertions(+), 284 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 11780289..96a2ed0d 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -424,7 +424,7 @@ def getListOfDexcomCGMDays(df):
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
 
 # this is where the loop will go:
-for dIndex in range(0, len(donors)):
+for dIndex in range(140, len(donors)):
 
     # clear output dataframes
     isf, cir, correctionTarget = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -432,131 +432,139 @@ def getListOfDexcomCGMDays(df):
     # %% ID, HASHID, AGE, & YLW
     userID = donors.userID[dIndex]
     hashID = donors.hashID[dIndex]
+    metadata = pd.DataFrame(index=[dIndex])
     # round all birthdays and diagnosis dates to the first day of the month (to protect identities)
-    bDate = pd.to_datetime(donors.bDay[dIndex][0:7])
-    dDate = pd.to_datetime(donors.dDay[dIndex][0:7])
+    if (pd.isnull(donors.bDay[dIndex]) + pd.isnull(donors.dDay[dIndex])) == 0:
 
+        bDate = pd.to_datetime(donors.bDay[dIndex][0:7])
+        dDate = pd.to_datetime(donors.dDay[dIndex][0:7])
 
-    # %% LOAD IN DONOR JSON DATA
-    metadata = pd.DataFrame(index=[dIndex])
-    jsonDataPath = os.path.join(donorPath, phiDate + "-donorJsonData")
-    jsonFileName = os.path.join(jsonDataPath, "PHI-" + userID + ".json")
-
-    if os.path.exists(jsonFileName):
-        fileSize = os.stat(jsonFileName).st_size
-        metadata["fileSizeKB"] = fileSize / 1000
-        if fileSize > 1000:
-            data = td.load.load_json(jsonFileName)
-
-            # sort the data by time
-            data.sort_values("time", inplace=True)
-
-            # flatten the embedded json
-            data = flattenJson(data)
-
-
-            # %% CLEAN DATA
-            # remove negative durations
-            data, nNegativeDurations = removeNegativeDurations(data)
-            metadata["nNegativeDurations"] = nNegativeDurations
-
-            # get rid of cgm values too low/high (< 38 & > 402 mg/dL)
-            data, nInvalidCgmValues = removeInvalidCgmValues(data)
-            metadata["nInvalidCgmValues"] = nInvalidCgmValues
-
-            # Tslim calibration bug fix
-            data, nTandemAndPayloadCalReadings = tslimCalibrationFix(data)
-            metadata["nTandemAndPayloadCalReadings"] = nTandemAndPayloadCalReadings
-
-
-            # %% ADD UPLOAD DATE
-            # attach upload time to each record, for resolving duplicates
-            if "upload" in data.type.unique():
-                data = addUploadDate(data)
-
-
-                # %% TIME (UTC, TIMEZONE, DAY AND EVENTUALLY LOCAL TIME)
-                data["utcTime"] = pd.to_datetime(data["time"])
-                data["timezone"].fillna(method='ffill', inplace=True)
-                data["timezone"].fillna(method='bfill', inplace=True)
-                data["day"] = pd.DatetimeIndex(data["utcTime"]).date
-
-                # round to the nearest 5 minutes
-                # TODO: once roundTime is pushed to tidals repository then this line can be replaced
-                # with td.clean.round_time
-                data = round_time(data, timeIntervalMinutes=5, timeField="time",
-                                  roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
-                                  verbose=False)
-                data.sort_values("uploadTime", ascending=False, inplace=True)
-
-
-                # %% ID, HASHID, AGE, & YLW
-                data["userID"] = userID
-                data["hashID"] = hashID
-                data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
-                data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
-
-                commonColumnHeadings = ["hashID",
-                                        "age",
-                                        "ylw"]
-
-
-                # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
-                bolus = mergeWizardWithBolus(data)
-                if len(bolus) > 0:
-                    # get rid of duplicates that have the same ["time", "normal"]
-                    bolus.sort_values("uploadTime", ascending=False, inplace=True)
-                    bolus, nBolusDuplicatesRemoved = \
-                        td.clean.remove_duplicates(bolus, bolus[["deviceTime", "normal"]])
-                    metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
-
-                    # get a summary of boluses per day
-                    bolusDaySummary = get_bolusDaySummary(bolus)
-
-                    # isf and cir associated with bolus event
-                    if "insulinSensitivities" in list(bolus):
-                        pdb.set_trace()
-
-                    if "carbRatios" in list(bolus):
-                        pdb.set_trace()
-
-                    bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
-                    bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
-
-                    bolusCH = commonColumnHeadings.copy()
-                    bolusCH.extend(["utcTime", "roundedTime", "normal", "carbInput", "subType",
-                                    "insulinOnBoard", "bgInput",
-                                    "isf", "isf_mmolL_U", "insulinCarbRatio"])
-                    bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
-                    bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
-                    bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
-                                                              "bgInput": "bg_mmolL"})
-                    bolusEvents["bg_mgdL"] = mmolL_to_mgdL(bolusEvents["bg_mmolL"])
-                    bolusEvents["eventType"] = "correction"
-                    bolusEvents.loc[bolusEvents["carbInput"] > 0, "eventType"] = "meal"
-
-                    if "duration" in list(bolus):
-                        bolus["durationHours"] = bolus["duration"] / 1000.0 / 3600.0
-                        bolus["rate"] = bolus["extended"] / bolus["durationHours"]
-                        bolusExtendedCH = commonColumnHeadings.copy()
-                        bolusExtendedCH.extend(["utcTime", "roundedTime", "durationHours", "rate",  "type"])
-                        bolusExtendedEvents = bolus.loc[
-                                ((bolus["extended"].notnull()) &
-                                 (bolus["duration"] > 0)), bolusExtendedCH]
-
-                    if "extended" not in bolus:
-                        bolus["extended"] = np.nan
-                        bolus["duration"] = np.nan
 
-
-                    # get start and end times
-                    bolusBeginDate, bolusEndDate = getStartAndEndTimes(bolus, "day")
-                    metadata["bolus.beginDate"] = bolusBeginDate
-                    metadata["bolus.endDate"] = bolusEndDate
-
-
-                    # %% PUMP SETTINGS
-                    if "pumpSettings" in data.type.unique():
+        # %% LOAD IN DONOR JSON DATA
+
+        jsonDataPath = os.path.join(donorPath, phiDate + "-donorJsonData")
+        jsonFileName = os.path.join(jsonDataPath, "PHI-" + userID + ".json")
+
+        if os.path.exists(jsonFileName):
+            fileSize = os.stat(jsonFileName).st_size
+            metadata["fileSizeKB"] = fileSize / 1000
+            if fileSize > 1000:
+                data = td.load.load_json(jsonFileName)
+
+                # sort the data by time
+                data.sort_values("time", inplace=True)
+
+                # flatten the embedded json
+                data = flattenJson(data)
+
+
+                # %% CLEAN DATA
+                # remove negative durations
+                data, nNegativeDurations = removeNegativeDurations(data)
+                metadata["nNegativeDurations"] = nNegativeDurations
+
+                # get rid of cgm values too low/high (< 38 & > 402 mg/dL)
+                data, nInvalidCgmValues = removeInvalidCgmValues(data)
+                metadata["nInvalidCgmValues"] = nInvalidCgmValues
+
+                # Tslim calibration bug fix
+                data, nTandemAndPayloadCalReadings = tslimCalibrationFix(data)
+                metadata["nTandemAndPayloadCalReadings"] = nTandemAndPayloadCalReadings
+
+
+                # %% ADD UPLOAD DATE
+                # attach upload time to each record, for resolving duplicates
+                if (("upload" in data.type.unique()) &
+                    ("basal" in data.type.unique()) &
+                    ("bolus" in data.type.unique()) &
+                    ("cbg" in data.type.unique()) &
+                    ("pumpSettings" in data.type.unique())):
+                    data = addUploadDate(data)
+
+
+                    # %% TIME (UTC, TIMEZONE, DAY AND EVENTUALLY LOCAL TIME)
+                    data["utcTime"] = pd.to_datetime(data["time"])
+                    data["timezone"].fillna(method='ffill', inplace=True)
+                    data["timezone"].fillna(method='bfill', inplace=True)
+                    data["day"] = pd.DatetimeIndex(data["utcTime"]).date
+
+                    # round to the nearest 5 minutes
+                    # TODO: once roundTime is pushed to tidals repository then this line can be replaced
+                    # with td.clean.round_time
+                    data = round_time(data, timeIntervalMinutes=5, timeField="time",
+                                      roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
+                                      verbose=False)
+                    data.sort_values("uploadTime", ascending=False, inplace=True)
+
+
+                    # %% ID, HASHID, AGE, & YLW
+                    data["userID"] = userID
+                    data["hashID"] = hashID
+                    data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
+                    data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
+
+                    commonColumnHeadings = ["hashID",
+                                            "age",
+                                            "ylw"]
+
+
+                    # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUDING: CARBS, EXTENDED, DUAL)
+                    bolus = mergeWizardWithBolus(data)
+                    if len(bolus) > 0:
+                        # get rid of duplicates that have the same ["time", "normal"]
+                        bolus.sort_values("uploadTime", ascending=False, inplace=True)
+                        bolus, nBolusDuplicatesRemoved = \
+                            td.clean.remove_duplicates(bolus, bolus[["deviceTime", "normal"]])
+                        metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
+
+                        # get a summary of boluses per day
+                        bolusDaySummary = get_bolusDaySummary(bolus)
+
+                        # isf and cir associated with bolus event
+                        if "insulinSensitivities" in list(bolus):
+                            pdb.set_trace()
+
+                        if "carbRatios" in list(bolus):
+                            pdb.set_trace()
+
+                        bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
+                        bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
+
+                        bolusCH = commonColumnHeadings.copy()
+                        bolusCH.extend(["utcTime", "roundedTime", "normal", "carbInput", "subType",
+                                        "insulinOnBoard", "bgInput",
+                                        "isf", "isf_mmolL_U", "insulinCarbRatio"])
+                        bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
+                        bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
+                        bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
+                                                                  "bgInput": "bg_mmolL"})
+                        bolusEvents["bg_mgdL"] = mmolL_to_mgdL(bolusEvents["bg_mmolL"])
+                        bolusEvents["eventType"] = "correction"
+                        bolusEvents.loc[bolusEvents["carbInput"] > 0, "eventType"] = "meal"
+
+                        if "duration" in list(bolus):
+                            bolus["duration"].replace(0, np.nan, inplace=True)
+                            bolus["durationHours"] = bolus["duration"] / 1000.0 / 3600.0
+                            bolus["rate"] = bolus["extended"] / bolus["durationHours"]
+                            bolusExtendedCH = commonColumnHeadings.copy()
+                            bolusExtendedCH.extend(["utcTime", "roundedTime", "durationHours", "rate",  "type"])
+                            bolusExtendedEvents = bolus.loc[
+                                    ((bolus["extended"].notnull()) &
+                                     (bolus["duration"] > 0)), bolusExtendedCH]
+
+                        if "extended" not in bolus:
+                            bolus["extended"] = np.nan
+                            bolus["duration"] = np.nan
+
+
+                        # get start and end times
+                        bolusBeginDate, bolusEndDate = getStartAndEndTimes(bolus, "day")
+                        metadata["bolus.beginDate"] = bolusBeginDate
+                        metadata["bolus.endDate"] = bolusEndDate
+
+
+                        # %% PUMP SETTINGS
+
                         pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
                         pumpSettings.sort_values("uploadTime", ascending=False, inplace=True)
 
@@ -692,174 +700,168 @@ def getListOfDexcomCGMDays(df):
 
 
                         # %% ACTUAL BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
-                        if "basal" in data.type.unique():
-                            basal = data[data.type == "basal"].copy().dropna(axis=1, how="all")
-                            basal.sort_values("uploadTime", ascending=False, inplace=True)
-
-                            basalBeginDate, basalEndDate = getStartAndEndTimes(basal, "day")
-                            metadata["basal.beginDate"] = basalBeginDate
-                            metadata["basal.endDate"] = basalEndDate
-
-                            basal, nBasalDuplicatesRemoved = \
-                                td.clean.remove_duplicates(basal, basal[["deliveryType", "deviceTime", "duration", "rate"]])
-                            metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
-
-                            # fill NaNs with 0, as it indicates a suspend (temp basal of 0)
-                            basal.rate.fillna(0, inplace=True)
-
-                            # get rid of basals that have durations of 0
-                            nBasalDuration0 = sum(basal.duration > 0)
-                            basal = basal[basal.duration > 0]
-                            metadata["basal.nBasalDuration0"] = nBasalDuration0
-
-                            # get rid of basal durations that are unrealistic
-                            nUnrealisticBasalDuration = ((basal.duration < 0) | (basal.duration > 86400000))
-                            metadata["nUnrealisticBasalDuration"] = sum(nUnrealisticBasalDuration)
-                            basal.loc[nUnrealisticBasalDuration, "duration"] = np.nan
-
-                            # calculate the total amount of insulin delivered (duration * rate)
-                            basal["durationHours"] = basal["duration"] / 1000.0 / 3600.0
-                            basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
-
-                            # actual basal delivered
-                            abrColHeadings = commonColumnHeadings.copy()
-                            abrColHeadings.extend(["utcTime", "roundedTime", "durationHours", "rate", "type"])
-                            abr = basal[abrColHeadings]
-                            if "duration" in list(bolus):
-                                abr = pd.concat([abr, bolusExtendedEvents], ignore_index=True)
-                                abr.sort_values("utcTime", inplace=True)
-
-                            # get a summary of basals per day
-                            basalDaySummary = get_basalDaySummary(basal)
-
-
-                            # %% GET CLOSED LOOP DAYS WITH TEMP BASAL DATA
-                            # group data by type
-                            groupedData = data.groupby(by="type")
-
-                            isClosedLoopDay, is670g, metadata = \
-                                getClosedLoopDays(groupedData, 30, metadata)
-
-                            # %% CGM DATA
-                            if "cbg" in data.type.unique():
-
-                                # filter by cgm and sort by uploadTime
-                                cgmData = groupedData.get_group("cbg").dropna(axis=1, how="all")
-
-                                # get rid of duplicates that have the same ["deviceTime", "value"]
-                                cgmData, nCgmDuplicatesRemovedDeviceTime = removeCgmDuplicates(cgmData, "deviceTime")
-                                metadata["nCgmDuplicatesRemovedDeviceTime"] = nCgmDuplicatesRemovedDeviceTime
-
-                                # get rid of duplicates that have the same ["time", "value"]
-                                cgmData, nCgmDuplicatesRemovedUtcTime = removeCgmDuplicates(cgmData, "time")
-                                metadata["cnCgmDuplicatesRemovedUtcTime"] = nCgmDuplicatesRemovedUtcTime
-
-                                # get rid of duplicates that have the same "roundedTime"
-                                cgmData, nCgmDuplicatesRemovedRoundedTime = removeDuplicates(cgmData, "roundedTime")
-                                metadata["nCgmDuplicatesRemovedRoundedTime"] = nCgmDuplicatesRemovedRoundedTime
-
-                                # get start and end times
-                                cgmBeginDate, cgmEndDate = getStartAndEndTimes(cgmData, "day")
-                                metadata["cgm.beginDate"] = cgmBeginDate
-                                metadata["cgm.endDate"] = cgmEndDate
-
-                                # get a list of dexcom cgms
-                                cgmData, percentDexcom = getListOfDexcomCGMDays(cgmData)
-                                metadata["cgm.percentDexcomCGM"] = percentDexcom
-
-                                # group by date (day) and get stats
-                                catDF = cgmData.groupby(cgmData["day"])
-                                cgmRecordsPerDay = \
-                                    pd.DataFrame(catDF.value.count()). \
-                                    rename(columns={"value": "cgm.count"})
-                                dayDate = catDF.day.describe()["top"]
-                                dexcomCGM = catDF.dexcomCGM.describe()["top"]
-                                nTypesCGM = catDF.dexcomCGM.describe()["unique"]
-                                cgmRecordsPerDay["cgm.dexcomOnly"] = \
-                                    (dexcomCGM & (nTypesCGM == 1))
-                                cgmRecordsPerDay["date"] = cgmRecordsPerDay.index
-
-                                # filter the cgm data
-                                cgmColHeadings = commonColumnHeadings.copy()
-                                cgmColHeadings.extend(["utcTime", "roundedTime", "value"])
-
-                                # get data in mg/dL units
-                                cgm = cgmData[cgmColHeadings]
-                                cgm = cgm.rename(columns={'value': 'mmol_L'})
-                                cgm["mg_dL"] = mmolL_to_mgdL(cgm["mmol_L"]).astype(int)
-
-
-                                # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
-
-                                # COMBINE DAY SUMMARIES
-                                # group by date (day) and get stats
-                                catDF = data.groupby(data["day"])
-                                dataPerDay = \
-                                    pd.DataFrame(catDF.hashID.describe()["top"]). \
-                                    rename(columns={"top": "hashID"})
-                                dataPerDay["age"] = catDF.age.mean()
-                                dataPerDay["ylw"] = catDF.ylw.mean()
-
-
-                                # calculate all of the data start and end range
-                                # this can be used for looking at settings
-                                dayBeginDate = min(cgmBeginDate, bolusBeginDate, basalBeginDate)
-                                dayEndDate = max(cgmEndDate, bolusEndDate, basalEndDate)
-                                metadata["day.beginDate"] = dayBeginDate
-                                metadata["day.endDate"] = dayEndDate
-                                rng = pd.date_range(dayBeginDate, dayEndDate).date
-                                dayData = pd.DataFrame(rng, columns=["day"])
-                                for dfType in [dataPerDay, basalDaySummary, bolusDaySummary, cgmRecordsPerDay]:
-                                    dayData = pd.merge(dayData, dfType.reset_index(), on="day", how="left")
-                                for dfType in [isClosedLoopDay, is670g]:
-                                    dayData = pd.merge(dayData, dfType, on="day", how="left")
-
-
-                                dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
-                                dayData["validCGMData"] = dayData["cgm.count"] > (288*.75)
-                                # calculate the start and end of contiguous data
-                                # these dates can be used when simulating and predicting, where
-                                # you need both pump and cgm data
-                                contiguousBeginDate = max(cgmBeginDate, bolusBeginDate, basalBeginDate)
-                                contiguousEndDate = min(cgmEndDate, bolusEndDate, basalEndDate)
-                                metadata["contiguous.beginDate"] = contiguousBeginDate
-                                metadata["contiguous.endDate"] = contiguousEndDate
-
-                                # get a summary by age, and ylw
-                                catDF = dayData.groupby("age")
-                                ageSummary = pd.DataFrame(catDF.validPumpData.sum())
-                                ageSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
-                                ageSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
-                                ageSummary["nDaysclosedLopp"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
-                                ageSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
-                                ageSummary.reset_index(inplace=True)
-
-                                catDF = dayData.groupby("ylw")
-                                ylwSummary = pd.DataFrame(catDF.validPumpData.sum())
-                                ylwSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
-                                ylwSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
-                                ylwSummary["nDaysclosedLopp"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
-                                ylwSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
-                                ylwSummary.reset_index(inplace=True)
-
-                                # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
-
-
-                                # %% SAVE RESULTS
+                        basal = data[data.type == "basal"].copy().dropna(axis=1, how="all")
+                        basal.sort_values("uploadTime", ascending=False, inplace=True)
+
+                        basalBeginDate, basalEndDate = getStartAndEndTimes(basal, "day")
+                        metadata["basal.beginDate"] = basalBeginDate
+                        metadata["basal.endDate"] = basalEndDate
+
+                        basal, nBasalDuplicatesRemoved = \
+                            td.clean.remove_duplicates(basal, basal[["deliveryType", "deviceTime", "duration", "rate"]])
+                        metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
+
+                        # fill NaNs with 0, as it indicates a suspend (temp basal of 0)
+                        basal.rate.fillna(0, inplace=True)
+
+                        # get rid of basals that have durations of 0
+                        nBasalDuration0 = sum(basal.duration > 0)
+                        basal = basal[basal.duration > 0]
+                        metadata["basal.nBasalDuration0"] = nBasalDuration0
+
+                        # get rid of basal durations that are unrealistic
+                        nUnrealisticBasalDuration = ((basal.duration < 0) | (basal.duration > 86400000))
+                        metadata["nUnrealisticBasalDuration"] = sum(nUnrealisticBasalDuration)
+                        basal.loc[nUnrealisticBasalDuration, "duration"] = np.nan
+
+                        # calculate the total amount of insulin delivered (duration * rate)
+                        basal["durationHours"] = basal["duration"] / 1000.0 / 3600.0
+                        basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
+
+                        # actual basal delivered
+                        abrColHeadings = commonColumnHeadings.copy()
+                        abrColHeadings.extend(["utcTime", "roundedTime", "durationHours", "rate", "type"])
+                        abr = basal[abrColHeadings]
+                        if "duration" in list(bolus):
+                            abr = pd.concat([abr, bolusExtendedEvents], ignore_index=True)
+                            abr.sort_values("utcTime", inplace=True)
+
+                        # get a summary of basals per day
+                        basalDaySummary = get_basalDaySummary(basal)
+
+
+                        # %% GET CLOSED LOOP DAYS WITH TEMP BASAL DATA
+                        # group data by type
+                        groupedData = data.groupby(by="type")
+
+                        isClosedLoopDay, is670g, metadata = \
+                            getClosedLoopDays(groupedData, 30, metadata)
+
+                        # %% CGM DATA
+                        # select the cbg (cgm) records and drop columns that are entirely empty
+                        cgmData = groupedData.get_group("cbg").dropna(axis=1, how="all")
+
+                        # get rid of duplicates that have the same ["deviceTime", "value"]
+                        cgmData, nCgmDuplicatesRemovedDeviceTime = removeCgmDuplicates(cgmData, "deviceTime")
+                        metadata["nCgmDuplicatesRemovedDeviceTime"] = nCgmDuplicatesRemovedDeviceTime
+
+                        # get rid of duplicates that have the same ["time", "value"]
+                        cgmData, nCgmDuplicatesRemovedUtcTime = removeCgmDuplicates(cgmData, "time")
+                        metadata["cnCgmDuplicatesRemovedUtcTime"] = nCgmDuplicatesRemovedUtcTime
+
+                        # get rid of duplicates that have the same "roundedTime"
+                        cgmData, nCgmDuplicatesRemovedRoundedTime = removeDuplicates(cgmData, "roundedTime")
+                        metadata["nCgmDuplicatesRemovedRoundedTime"] = nCgmDuplicatesRemovedRoundedTime
+
+                        # get start and end times
+                        cgmBeginDate, cgmEndDate = getStartAndEndTimes(cgmData, "day")
+                        metadata["cgm.beginDate"] = cgmBeginDate
+                        metadata["cgm.endDate"] = cgmEndDate
+
+                        # get a list of dexcom cgms
+                        cgmData, percentDexcom = getListOfDexcomCGMDays(cgmData)
+                        metadata["cgm.percentDexcomCGM"] = percentDexcom
+
+                        # group by date (day) and get stats
+                        catDF = cgmData.groupby(cgmData["day"])
+                        cgmRecordsPerDay = \
+                            pd.DataFrame(catDF.value.count()). \
+                            rename(columns={"value": "cgm.count"})
+                        dayDate = catDF.day.describe()["top"]
+                        dexcomCGM = catDF.dexcomCGM.describe()["top"]
+                        nTypesCGM = catDF.dexcomCGM.describe()["unique"]
+                        cgmRecordsPerDay["cgm.dexcomOnly"] = \
+                            (dexcomCGM & (nTypesCGM == 1))
+                        cgmRecordsPerDay["date"] = cgmRecordsPerDay.index
+
+                        # filter the cgm data
+                        cgmColHeadings = commonColumnHeadings.copy()
+                        cgmColHeadings.extend(["utcTime", "roundedTime", "value"])
+
+                        # get data in mg/dL units
+                        cgm = cgmData[cgmColHeadings]
+                        cgm = cgm.rename(columns={'value': 'mmol_L'})
+                        cgm["mg_dL"] = mmolL_to_mgdL(cgm["mmol_L"]).astype(int)
+
+
+                        # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
+
+                        # COMBINE DAY SUMMARIES
+                        # group by date (day) and get stats
+                        catDF = data.groupby(data["day"])
+                        dataPerDay = \
+                            pd.DataFrame(catDF.hashID.describe()["top"]). \
+                            rename(columns={"top": "hashID"})
+                        dataPerDay["age"] = catDF.age.mean()
+                        dataPerDay["ylw"] = catDF.ylw.mean()
+
+
+                        # calculate all of the data start and end range
+                        # this can be used for looking at settings
+                        dayBeginDate = min(cgmBeginDate, bolusBeginDate, basalBeginDate)
+                        dayEndDate = max(cgmEndDate, bolusEndDate, basalEndDate)
+                        metadata["day.beginDate"] = dayBeginDate
+                        metadata["day.endDate"] = dayEndDate
+                        rng = pd.date_range(dayBeginDate, dayEndDate).date
+                        dayData = pd.DataFrame(rng, columns=["day"])
+                        for dfType in [dataPerDay, basalDaySummary, bolusDaySummary, cgmRecordsPerDay]:
+                            dayData = pd.merge(dayData, dfType.reset_index(), on="day", how="left")
+                        for dfType in [isClosedLoopDay, is670g]:
+                            dayData = pd.merge(dayData, dfType, on="day", how="left")
+
+
+                        dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
+                        dayData["validCGMData"] = dayData["cgm.count"] > (288*.75)
+                        # calculate the start and end of contiguous data
+                        # these dates can be used when simulating and predicting, where
+                        # you need both pump and cgm data
+                        contiguousBeginDate = max(cgmBeginDate, bolusBeginDate, basalBeginDate)
+                        contiguousEndDate = min(cgmEndDate, bolusEndDate, basalEndDate)
+                        metadata["contiguous.beginDate"] = contiguousBeginDate
+                        metadata["contiguous.endDate"] = contiguousEndDate
+
+                        # get a summary by age, and ylw
+                        catDF = dayData.groupby("age")
+                        ageSummary = pd.DataFrame(catDF.validPumpData.sum())
+                        ageSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
+                        ageSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
+                        ageSummary["nDaysclosedLopp"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
+                        ageSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+                        ageSummary.reset_index(inplace=True)
+
+                        catDF = dayData.groupby("ylw")
+                        ylwSummary = pd.DataFrame(catDF.validPumpData.sum())
+                        ylwSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
+                        ylwSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
+                        ylwSummary["nDaysclosedLopp"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
+                        ylwSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+                        ylwSummary.reset_index(inplace=True)
+
+                            # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
+
+
+                            # %% SAVE RESULTS
 
-                        else:
-                            metadata["flags"] = "no basal data"
                     else:
-                        metadata["flags"] = "no pump settings"
+                        metadata["flags"] = "no bolus wizard data"
                 else:
-                    metadata["flags"] = "no bolus wizard data"
+                    metadata["flags"] = "missing either pump or cgm  data"
             else:
-                metadata["flags"] = "no upload data"
+                metadata["flags"] = "file contains no data"
         else:
-            metadata["flags"] = "file contains no data"
+            metadata["flags"] = "file does not exist"
     else:
-        metadata["flags"] = "file does not exist"
-
+        metadata["flags"] = "fmissing bDay/dDay"
     print("done with", dIndex)
 
 

From 75b78a2504c01aa0562035a4f07fbdac30db7853 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 14:09:57 -0600
Subject: [PATCH 24/78] remove dependence on tidals

---
 .../get-users-settings-and-events.py          | 26 +++++++++++++------
 1 file changed, 18 insertions(+), 8 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 96a2ed0d..1a016850 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -14,7 +14,6 @@
 # %% REQUIRED LIBRARIES
 import pandas as pd
 import numpy as np
-import tidals as td
 import os
 import pdb
 
@@ -413,18 +412,28 @@ def getListOfDexcomCGMDays(df):
     return df, percentDexcomCGM
 
 
+def load_csv(dataPathAndName):
+    df = pd.read_csv(dataPathAndName, low_memory=False)
+    return df
+
+
+def load_json(dataPathAndName):
+    df = pd.read_json(dataPathAndName, orient="records")
+    return df
+
+
 # %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
 dataPulledDate = "2018-09-28"
 phiDate = "PHI-" + dataPulledDate
 donorPath = os.path.join("..", "bigdata-processing-pipeline", "data", phiDate + "-donor-data")
 
 donorList = phiDate + "-uniqueDonorList.csv"
-donors = td.load.load_csv(os.path.join(donorPath, donorList))
+donors = load_csv(os.path.join(donorPath, donorList))
 
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
 
 # this is where the loop will go:
-for dIndex in range(140, len(donors)):
+for dIndex in range(335, len(donors)):
 
     # clear output dataframes
     isf, cir, correctionTarget = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -449,7 +458,7 @@ def getListOfDexcomCGMDays(df):
             fileSize = os.stat(jsonFileName).st_size
             metadata["fileSizeKB"] = fileSize / 1000
             if fileSize > 1000:
-                data = td.load.load_json(jsonFileName)
+                data = load_json(jsonFileName)
 
                 # sort the data by time
                 data.sort_values("time", inplace=True)
@@ -514,7 +523,7 @@ def getListOfDexcomCGMDays(df):
                         # get rid of duplicates that have the same ["time", "normal"]
                         bolus.sort_values("uploadTime", ascending=False, inplace=True)
                         bolus, nBolusDuplicatesRemoved = \
-                            td.clean.remove_duplicates(bolus, bolus[["deviceTime", "normal"]])
+                            removeDuplicates(bolus, ["deviceTime", "normal"])
                         metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
 
                         # get a summary of boluses per day
@@ -569,7 +578,7 @@ def getListOfDexcomCGMDays(df):
                         pumpSettings.sort_values("uploadTime", ascending=False, inplace=True)
 
                         pumpSettings, nPumpSettingsDuplicatesRemoved = \
-                        td.clean.remove_duplicates(pumpSettings, pumpSettings[["deviceTime"]])
+                        removeDuplicates(pumpSettings, "deviceTime")
                         metadata["nPumpSettingsDuplicatesRemoved"] = nPumpSettingsDuplicatesRemoved
 
                         # ISF
@@ -587,7 +596,8 @@ def getListOfDexcomCGMDays(df):
                         else:
                             isfColHead = "insulinSensitivities"
                             isf = pd.DataFrame(columns=isfColHeadings)
-                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"].astype(str)):
+                                print(p, actSched)
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["isf.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
@@ -708,7 +718,7 @@ def getListOfDexcomCGMDays(df):
                         metadata["basal.endDate"] = basalEndDate
 
                         basal, nBasalDuplicatesRemoved = \
-                            td.clean.remove_duplicates(basal, basal[["deliveryType", "deviceTime", "duration", "rate"]])
+                            removeDuplicates(basal, ["deliveryType", "deviceTime", "duration", "rate"])
                         metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
 
                         # fill NaNs with 0, as it indicates a suspend (temp basal of 0)

From 28a46545e032915945cf7924eceb0cda39a7e089 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 14:18:28 -0600
Subject: [PATCH 25/78] edge case where the active schedule is a float (convert
 back to string)

---
 .../get-users-settings-and-events.py          | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 1a016850..34db1228 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -596,8 +596,15 @@ def load_json(dataPathAndName):
                         else:
                             isfColHead = "insulinSensitivities"
                             isf = pd.DataFrame(columns=isfColHeadings)
-                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"].astype(str)):
-                                print(p, actSched)
+
+                            # edge case where active schedule is a float
+
+
+                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                # edge case where actSchedule is float
+                                if isinstance(actSched, float):
+                                    actSched = str(int(actSched))
+
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["isf.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
@@ -623,6 +630,9 @@ def load_json(dataPathAndName):
                             cirColHead = "carbRatios"
                             cir = pd.DataFrame(columns=cirColHeadings)
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                # edge case where actSchedule is float
+                                if isinstance(actSched, float):
+                                    actSched = str(int(actSched))
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, cirColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["cir.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
@@ -660,6 +670,9 @@ def load_json(dataPathAndName):
                             ctColHead = "bgTargets"
                             correctionTarget = pd.DataFrame(columns=ctColHeadings)
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                # edge case where actSchedule is float
+                                if isinstance(actSched, float):
+                                    actSched = str(int(actSched))
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, ctColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["ct.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
@@ -684,6 +697,9 @@ def load_json(dataPathAndName):
                         sbrColHeadings.extend(["sbrTime", "rate", "type"])
                         sbr = pd.DataFrame(columns=sbrColHeadings)
                         for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                            # edge case where actSchedule is float
+                            if isinstance(actSched, float):
+                                actSched = str(int(actSched))
                             if 'Auto Mode' not in actSched:
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]

From 2ceaf1501a8b462f58ddd78805127cdf96f2aa82 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sat, 12 Jan 2019 15:04:48 -0600
Subject: [PATCH 26/78] save all preprocessed data

---
 .../get-users-settings-and-events.py          | 40 +++++++++++++++++--
 1 file changed, 37 insertions(+), 3 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 34db1228..d6060056 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -427,13 +427,24 @@ def load_json(dataPathAndName):
 phiDate = "PHI-" + dataPulledDate
 donorPath = os.path.join("..", "bigdata-processing-pipeline", "data", phiDate + "-donor-data")
 
+phiOutputPath = os.path.join(donorPath, "PHI-settings-and-events")
+outputPath = os.path.join(donorPath, "settings-and-events")
+
+
+# create the PHI and non-PHI output folders; each one is created on its own
+# so that an already existing folder can neither skip nor break the other
+for newOutputFolder in (phiOutputPath, outputPath):
+    os.makedirs(newOutputFolder, exist_ok=True)
+
 donorList = phiDate + "-uniqueDonorList.csv"
 donors = load_csv(os.path.join(donorPath, donorList))
 
+allMetadata = donors[['hashID', 'diagnosisType']].copy()
+
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
 
 # this is where the loop will go:
-for dIndex in range(335, len(donors)):
+for dIndex in range(0, len(donors)):
 
     # clear output dataframes
     isf, cir, correctionTarget = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -442,6 +453,14 @@ def load_json(dataPathAndName):
     userID = donors.userID[dIndex]
     hashID = donors.hashID[dIndex]
     metadata = pd.DataFrame(index=[dIndex])
+    metadata["hashID"] = hashID
+
+    # make folder to save data
+    processedDataPath = os.path.join(phiOutputPath, "PHI-" + userID)
+    if not os.path.exists(processedDataPath):
+        os.makedirs(processedDataPath)
+
+
     # round all birthdays and diagnosis dates to the first day of the month (to protect identities)
     if (pd.isnull(donors.bDay[dIndex]) + pd.isnull(donors.dDay[dIndex])) == 0:
 
@@ -873,10 +892,20 @@ def load_json(dataPathAndName):
                         ylwSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
                         ylwSummary.reset_index(inplace=True)
 
-                            # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
+                        # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
 
 
-                            # %% SAVE RESULTS
+
+
+
+                        # %% SAVE RESULTS
+
+
+                        # save the processed data
+                        basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
+                        bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
+                        cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
+                        pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
 
                     else:
                         metadata["flags"] = "no bolus wizard data"
@@ -888,6 +917,11 @@ def load_json(dataPathAndName):
             metadata["flags"] = "file does not exist"
     else:
         metadata["flags"] = "fmissing bDay/dDay"
+
+    # write metaData to allMetadata
+    allMetadata = pd.merge(allMetadata, metadata, how="left", on="hashID")
+    allMetadata.to_csv(os.path.join(outputPath, "allMetadata.csv"))
+
     print("done with", dIndex)
 
 

From ba45b20901bc14291a1512069fa806e821752791 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 08:15:26 -0600
Subject: [PATCH 27/78] calc local time and save settings and events

---
 .../get-users-settings-and-events.py          | 237 +++++++++++++++---
 1 file changed, 200 insertions(+), 37 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index d6060056..88c31088 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -14,6 +14,9 @@
 # %% REQUIRED LIBRARIES
 import pandas as pd
 import numpy as np
+from pytz import timezone
+from datetime import timedelta
+import datetime as dt
 import os
 import pdb
 
@@ -422,6 +425,43 @@ def load_json(dataPathAndName):
     return df
 
 
+def getTzoForDateTime(utcTime, currentTimezone):
+    """Return naive utcTime shifted by currentTimezone's UTC offset (i.e. the local time, not the offset)."""
+    tz = timezone(currentTimezone)
+    # utcTime is a UTC instant, so tag it as UTC and convert it to tz;
+    # tz.localize(utcTime) would read it as local wall-clock time and can
+    # pick the wrong offset around DST transitions
+    offset = timezone("UTC").localize(utcTime).astimezone(tz).utcoffset()
+    # utcoffset() is signed, so zones such as -03:30 give -210 minutes
+    tzo = int(offset.total_seconds() / 60)
+    localTime = utcTime + pd.to_timedelta(tzo, unit="m")
+
+    return localTime
+
+
+def getTimezoneOffset(currentDate, currentTimezone):
+    """Return currentTimezone's UTC offset, in whole minutes, one day after the naive currentDate."""
+    tz = timezone(currentTimezone)
+    # here we add 1 day to the current date to account for changes to/from DST
+    localized = tz.localize(currentDate + timedelta(days=1))
+    # utcoffset() is already signed; parsing "%z" as an integer and flooring
+    # the hours turned negative fractional offsets such as -0330 into
+    # -4 h 70 min (-310) instead of -210 minutes
+    tzo = int(localized.utcoffset().total_seconds() / 60)
+
+    return tzo
+
+
+def isDSTChangeDay(currentDate, currentTimezone):
+    """Return True when getTimezoneOffset differs between currentDate and the day before it."""
+    thisDay = pd.to_datetime(currentDate)
+    tzoCurrentDay = getTimezoneOffset(thisDay, currentTimezone)
+    tzoPreviousDay = getTimezoneOffset(thisDay + timedelta(days=-1), currentTimezone)
+
+    return (tzoCurrentDay != tzoPreviousDay)
+
+
+
 # %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
 dataPulledDate = "2018-09-28"
 phiDate = "PHI-" + dataPulledDate
@@ -531,9 +571,9 @@ def load_json(dataPathAndName):
                     data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
                     data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
 
-                    commonColumnHeadings = ["hashID",
-                                            "age",
-                                            "ylw"]
+#                    commonColumnHeadings = ["hashID",
+#                                            "age",
+#                                            "ylw"]
 
 
                     # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
@@ -558,10 +598,10 @@ def load_json(dataPathAndName):
                         bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
                         bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
 
-                        bolusCH = commonColumnHeadings.copy()
-                        bolusCH.extend(["utcTime", "roundedTime", "normal", "carbInput", "subType",
+#                        bolusCH = commonColumnHeadings.copy()
+                        bolusCH = ["utcTime", "timezone", "roundedTime", "normal", "carbInput", "subType",
                                         "insulinOnBoard", "bgInput",
-                                        "isf", "isf_mmolL_U", "insulinCarbRatio"])
+                                        "isf", "isf_mmolL_U", "insulinCarbRatio"]
                         bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
                         bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
                         bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
@@ -574,8 +614,8 @@ def load_json(dataPathAndName):
                             bolus["duration"].replace(0, np.nan, inplace=True)
                             bolus["durationHours"] = bolus["duration"] / 1000.0 / 3600.0
                             bolus["rate"] = bolus["extended"] / bolus["durationHours"]
-                            bolusExtendedCH = commonColumnHeadings.copy()
-                            bolusExtendedCH.extend(["utcTime", "roundedTime", "durationHours", "rate",  "type"])
+#                            bolusExtendedCH = commonColumnHeadings.copy()
+                            bolusExtendedCH = ["utcTime", "timezone", "roundedTime", "durationHours", "rate",  "type"]
                             bolusExtendedEvents = bolus.loc[
                                     ((bolus["extended"].notnull()) &
                                      (bolus["duration"] > 0)), bolusExtendedCH]
@@ -601,14 +641,14 @@ def load_json(dataPathAndName):
                         metadata["nPumpSettingsDuplicatesRemoved"] = nPumpSettingsDuplicatesRemoved
 
                         # ISF
-                        isfColHeadings = commonColumnHeadings.copy()
-                        isfColHeadings.extend(["isf.time", "isf", "isf_mmolL_U"])
+#                        isfColHeadings = commonColumnHeadings.copy()
+                        isfColHeadings = ["localTime", "isf", "isf_mmolL_U"]
 
                         if "insulinSensitivity.amount" in list(pumpSettings):
                             isfColHead = "insulinSensitivity"
                             pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
                             pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
-                            pumpSettings["isf.time"] = pd.to_datetime(pumpSettings["day"]) + \
+                            pumpSettings["localTime"] = pd.to_datetime(pumpSettings["day"]) + \
                                 pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
 
                             isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfColHeadings]
@@ -626,7 +666,7 @@ def load_json(dataPathAndName):
 
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["isf.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                 tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                                 tempDF["age"] = pumpSettings.loc[p, "age"]
                                 tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
@@ -635,13 +675,13 @@ def load_json(dataPathAndName):
                                 isf = pd.concat([isf, tempDF[isfColHeadings]], ignore_index=True)
 
                         # CIR
-                        cirColHeadings = commonColumnHeadings.copy()
-                        cirColHeadings.extend(["cir.time", "cir"])
+#                        cirColHeadings = commonColumnHeadings.copy()
+                        cirColHeadings = ["localTime", "cir"]
 
                         if "carbRatio.amount" in list(pumpSettings):
                             cirColHead = "carbRatio"
                             pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
-                            pumpSettings["cir.time"] = pd.to_datetime(pumpSettings["day"]) + \
+                            pumpSettings["localTime"] = pd.to_datetime(pumpSettings["day"]) + \
                                 pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
 
                             cir = pumpSettings.loc[pumpSettings["carbRatio.amount"].notnull(), cirColHeadings]
@@ -654,7 +694,7 @@ def load_json(dataPathAndName):
                                     actSched = str(int(actSched))
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, cirColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["cir.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                 tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                                 tempDF["age"] = pumpSettings.loc[p, "age"]
                                 tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
@@ -663,8 +703,8 @@ def load_json(dataPathAndName):
 
 
                         # CORRECTION TARGET
-                        ctColHeadings = commonColumnHeadings.copy()
-                        ctColHeadings.extend(["ct.time", "ct.low", "ct.high", "ct.target", "ct.range"])
+#                        ctColHeadings = commonColumnHeadings.copy()
+                        ctColHeadings = ["localTime", "ct.low", "ct.high", "ct.target", "ct.range"]
 
                         if "bgTarget.start" in list(pumpSettings):
                             ctColHead = "bgTarget."
@@ -680,7 +720,7 @@ def load_json(dataPathAndName):
                                     pumpSettings["ct." + targetType + "_mmolL"] = np.nan
                                     pumpSettings["ct." + targetType]  = np.nan
 
-                            pumpSettings["ct.time"] = pd.to_datetime(pumpSettings["day"]) + \
+                            pumpSettings["localTime"] = pd.to_datetime(pumpSettings["day"]) + \
                                 pd.to_timedelta(pumpSettings[ctColHead + "start"], unit="ms")
 
                             correctionTarget = pumpSettings.loc[pumpSettings["bgTarget.start"].notnull(), ctColHeadings]
@@ -694,7 +734,7 @@ def load_json(dataPathAndName):
                                     actSched = str(int(actSched))
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, ctColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["ct.time"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                 tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                                 tempDF["age"] = pumpSettings.loc[p, "age"]
                                 tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
@@ -712,8 +752,8 @@ def load_json(dataPathAndName):
                                 correctionTarget = pd.concat([correctionTarget, tempDF[ctColHeadings]], ignore_index=True)
 
                         # SCHEDULED BASAL RATES
-                        sbrColHeadings = commonColumnHeadings.copy()
-                        sbrColHeadings.extend(["sbrTime", "rate", "type"])
+#                        sbrColHeadings = commonColumnHeadings.copy()
+                        sbrColHeadings = ["localTime", "rate", "type"]
                         sbr = pd.DataFrame(columns=sbrColHeadings)
                         for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                             # edge case where actSchedule is float
@@ -723,10 +763,10 @@ def load_json(dataPathAndName):
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["type"] = np.nan
-                                tempDF["sbrTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                             else:
                                 tempDF = pd.DataFrame(index=[0])
-                                tempDF["sbrTime"] = np.nan
+                                tempDF["localTime"] = np.nan
                                 tempDF["rate"] = np.nan
                                 tempDF["type"] = "AutoMode"
 
@@ -774,13 +814,16 @@ def load_json(dataPathAndName):
                         basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
 
                         # actual basal delivered
-                        abrColHeadings = commonColumnHeadings.copy()
-                        abrColHeadings.extend(["utcTime", "roundedTime", "durationHours", "rate", "type"])
+#                        abrColHeadings = commonColumnHeadings.copy()
+                        abrColHeadings = ["utcTime", "timezone", "roundedTime", "durationHours", "rate", "type"]
                         abr = basal[abrColHeadings]
                         if "duration" in list(bolus):
                             abr = pd.concat([abr, bolusExtendedEvents], ignore_index=True)
                             abr.sort_values("utcTime", inplace=True)
 
+                        abr["timezone"].fillna(method='ffill', inplace=True)
+                        abr["timezone"].fillna(method='bfill', inplace=True)
+
                         # get a summary of basals per day
                         basalDaySummary = get_basalDaySummary(basal)
 
@@ -830,8 +873,8 @@ def load_json(dataPathAndName):
                         cgmRecordsPerDay["date"] = cgmRecordsPerDay.index
 
                         # filter the cgm data
-                        cgmColHeadings = commonColumnHeadings.copy()
-                        cgmColHeadings.extend(["utcTime", "roundedTime", "value"])
+#                        cgmColHeadings = commonColumnHeadings.copy()
+                        cgmColHeadings = ["utcTime", "timezone", "roundedTime", "value"]
 
                         # get data in mg/dL units
                         cgm = cgmData[cgmColHeadings]
@@ -849,6 +892,7 @@ def load_json(dataPathAndName):
                             rename(columns={"top": "hashID"})
                         dataPerDay["age"] = catDF.age.mean()
                         dataPerDay["ylw"] = catDF.ylw.mean()
+                        dataPerDay["timezone"] = catDF.timezone.describe()["top"]
 
 
                         # calculate all of the data start and end range
@@ -867,6 +911,11 @@ def load_json(dataPathAndName):
 
                         dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
                         dayData["validCGMData"] = dayData["cgm.count"] > (288*.75)
+
+                        dayData["isDSTChangeDay"] = dayData[['day', 'timezone']].apply(lambda x: isDSTChangeDay(*x), axis=1)
+                        dayData["date"] = pd.to_datetime(dayData["day"])
+                        dayData["tzo"] = dayData[['date', 'timezone']].apply(lambda x: getTimezoneOffset(*x), axis=1)
+
                         # calculate the start and end of contiguous data
                         # these dates can be used when simulating and predicting, where
                         # you need both pump and cgm data
@@ -880,32 +929,142 @@ def load_json(dataPathAndName):
                         ageSummary = pd.DataFrame(catDF.validPumpData.sum())
                         ageSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
                         ageSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
-                        ageSummary["nDaysclosedLopp"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
+                        ageSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
                         ageSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
                         ageSummary.reset_index(inplace=True)
 
+                        analysisCriterion = ageSummary[((ageSummary["nDaysValidPump"]> 28) &
+                                                        (ageSummary["nDaysValidCgm"]> 28))]
+                        minAge = analysisCriterion["age"].min()
+                        maxAge = analysisCriterion["age"].max()
+                        nDaysClosedLoop = analysisCriterion["nDaysClosedLoop"].sum()
+                        n670gDays = analysisCriterion["n670gDays"].sum()
+                        metadata["minAge"] = minAge
+                        metadata["maxAge"] = maxAge
+                        metadata["nDaysClosedLoop"] = nDaysClosedLoop
+                        metadata["n670gDays"] = n670gDays
+
                         catDF = dayData.groupby("ylw")
                         ylwSummary = pd.DataFrame(catDF.validPumpData.sum())
                         ylwSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
                         ylwSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
-                        ylwSummary["nDaysclosedLopp"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
+                        ylwSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
                         ylwSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
                         ylwSummary.reset_index(inplace=True)
 
-                        # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
+                        analysisCriterion = ylwSummary[((ylwSummary["nDaysValidPump"]> 28) &
+                                                        (ylwSummary["nDaysValidCgm"]> 28))]
+                        minYLW = analysisCriterion["ylw"].min()
+                        maxYLW = analysisCriterion["ylw"].max()
+                        metadata["minYLW"] = minYLW
+                        metadata["maxYLW"] = maxYLW
 
 
 
+                        # %% calculate local time
+                        abr["date"] = pd.to_datetime(abr["utcTime"].dt.date)
+                        abr = pd.merge(abr, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
+                        abr["localTime"] = abr["utcTime"] + pd.to_timedelta(abr["tzo"], unit="m")
 
+                        cgm["date"] = pd.to_datetime(cgm["utcTime"].dt.date)
+                        cgm = pd.merge(cgm, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
+                        cgm["localTime"] = cgm["utcTime"] + pd.to_timedelta(cgm["tzo"], unit="m")
+
+                        bolusEvents["date"] = pd.to_datetime(bolusEvents["utcTime"].dt.date)
+                        bolusEvents = pd.merge(bolusEvents, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
+                        bolusEvents["localTime"] = bolusEvents["utcTime"] + pd.to_timedelta(bolusEvents["tzo"], unit="m")
 
-                        # %% SAVE RESULTS
 
+                        # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
+                        # all settings
+                        allSettings = pd.merge(isf, cir, how="outer", on="localTime")
+                        allSettings = pd.merge(allSettings,
+                                               sbr.rename(columns={"rate": "sbr", "type": "sbr.type"}),
+                                               how="outer", on="localTime")
+                        allSettings = pd.merge(allSettings, correctionTarget, how="outer", on="localTime")
+                        allSettings["hashID"] = hashID
+                        allSettings["age"] = np.floor((allSettings["localTime"] - bDate).dt.days/365.25).astype(int)
+                        allSettings["ylw"] = np.floor((allSettings["localTime"] - dDate).dt.days/365.25).astype(int)
+                        allSettings = round_time(allSettings, timeIntervalMinutes=5,
+                                                 timeField="localTime",
+                                                 roundedTimeFieldName="localRoundedTime",
+                                                 startWithFirstRecord=True, verbose=False)
+
+                        colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
+                                    "isf", "cir", "sbr",
+                                    "ct.low", "ct.high", "ct.target", "ct.range",
+                                    "sbr.type", "isf_mmolL_U"]
+                        allSettings = allSettings[colOrder]
+
+
+                        fieldsToDrop = ["utcTime", "timezone", "roundedTime", "date", "tzo", "isDSTChangeDay"]
+                        pumpEvents = pd.merge(abr.drop(columns=fieldsToDrop),
+                                              bolusEvents.drop(columns=fieldsToDrop),
+                                              how="outer", on="localTime")
+                        pumpEvents["type"].fillna("bolus", inplace=True)
+                        pumpEvents["eventType"].fillna("basal", inplace=True)
+                        pumpEvents["hashID"] = hashID
+                        pumpEvents["age"] = np.floor((pumpEvents["localTime"] - bDate).dt.days/365.25).astype(int)
+                        pumpEvents["ylw"] = np.floor((pumpEvents["localTime"] - dDate).dt.days/365.25).astype(int)
+                        pumpEvents = round_time(pumpEvents, timeIntervalMinutes=5,
+                                                timeField="localTime",
+                                                roundedTimeFieldName="localRoundedTime",
+                                                startWithFirstRecord=True, verbose=False)
+
+
+                        colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
+                                    "rate", "durationHours",
+                                    "unitsInsulin", "carbInput", "type", "eventType", "subType",
+                                    "isf", "isf_mmolL_U", "insulinCarbRatio", "insulinOnBoard",
+                                    "bg_mgdL", "bg_mmolL"]
+
+                        pumpEvents = pumpEvents[colOrder]
+
+                        cgmLite = cgm.drop(columns=fieldsToDrop)
+                        cgmLite["hashID"] = hashID
+                        cgmLite["age"] = np.floor((cgmLite["localTime"] - bDate).dt.days/365.25).astype(int)
+                        cgmLite["ylw"] = np.floor((cgmLite["localTime"] - dDate).dt.days/365.25).astype(int)
+                        cgmLite = round_time(cgmLite, timeIntervalMinutes=5,
+                                             timeField="localTime",
+                                             roundedTimeFieldName="localRoundedTime",
+                                             startWithFirstRecord=True, verbose=False)
+
+                        colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
+                                    "mg_dL", "mmol_L"]
+
+                        cgmLite = cgmLite[colOrder]
 
-                        # save the processed data
-                        basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
-                        bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
-                        cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
-                        pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
+
+                        # %% SAVE RESULTS
+                        outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
+                        outputFormat = (f"{minAge:02d}",
+                                        f"{maxAge:02d}",
+                                        f"{minYLW:02d}",
+                                        f"{maxYLW:02d}",
+                                        f"{nDaysClosedLoop:03d}",
+                                        f"{n670gDays:03d}",
+                                        hashID[0:4])
+                        outputFolderName = outputString % outputFormat
+                        outputFolderName_Path = os.path.join(outputPath,"data", outputFolderName)
+                        if not os.path.exists(outputFolderName_Path):
+                            os.makedirs(outputFolderName_Path)
+
+                        # save data for this person
+                        fName = outputFolderName + "-allSettings.csv"
+                        allSettings.to_csv(os.path.join(outputFolderName_Path, fName))
+                        fName = outputFolderName + "-pumpEvents.csv"
+                        pumpEvents.to_csv(os.path.join(outputFolderName_Path, fName))
+                        fName = outputFolderName + "-cgmLite.csv"
+                        cgmLite.to_csv(os.path.join(outputFolderName_Path, fName))
+
+
+
+                        # %% save the processed data (saving this data will take up a lot of space and time)
+                        #data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
+                        #basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
+                        #bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
+                        #cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
+                        #pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
 
                     else:
                         metadata["flags"] = "no bolus wizard data"
@@ -926,6 +1085,10 @@ def load_json(dataPathAndName):
 
 
 # %% V2 DATA TO GRAB
+# ADD ROUNDEDLOCAL TIME TO THE END RESULTS
+# GET RID OF ROUNDING TIME AT THE BEGINNING
+# EXPAND THE CORRECTION TIME VALUES TO BE UNIFORM ACROSS ALL USERS AND DEVICES
+# FIX DAYLIGHT SAVINGS TIME TIMES
 # FIGURE OUT WHY TEMP BASAL COUNTS ARE DIFFERENT BETWEEN THE TWO DIFFERENT METHODS
 # MAX BASAL RATE, MAX BOLUS AMOUNT, AND INSULIN DURATION SET ON SELECT PUMPS
 # ALERT SETTINGS

From 334f19bc2770bd5c690efa3a87957caec855f336 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 13:28:46 -0600
Subject: [PATCH 28/78] add isf day stats

---
 .../get-users-settings-and-events.py          | 150 +++++++++++++++---
 1 file changed, 126 insertions(+), 24 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 88c31088..df162541 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -461,9 +461,90 @@ def isDSTChangeDay(currentDate, currentTimezone):
     return (tzoCurrentDay != tzoPreviousDay)
 
 
+def get_setting_durations(df, col, dataPulledDF):
+    """Append dataPulledDF, sort by <col>.localTime, ffill, add <col>.durationHours.
+
+    Each row's duration is the hours until the next row (capped at 24; last row 0).
+    """
+    timeCol = col + ".localTime"
+    df = pd.concat([df, dataPulledDF], sort=False).sort_values(timeCol)
+    df = df.reset_index(drop=True).ffill()  # fillna(method=...) is deprecated
+    gaps = (df[timeCol].shift(-1) - df[timeCol]).dt.total_seconds() / 3600
+    df[col + ".durationHours"] = gaps.fillna(0).clip(upper=24)
+
+    return df
+
+
+def get_settingStats(df, col, pumpCol):
+    """Copy pumpCol into col; broadcast its min, duration-weighted mean, and max."""
+    values, hours = df[pumpCol], df[col + ".durationHours"]
+    df[col], df[col + ".min"] = values, values.min()
+    df[col + ".weightedMean"] = np.sum(values * hours) / hours.sum()
+    df[col + ".max"] = values.max()
+    return df
+
+
+def getPumpSettingsStats(df, col, pumpCol):
+    """Add col, col.localTime and col.min/.weightedMean/.max (all equal to col).
+
+    Returns (df, df2); df2 holds those columns for rows with a non-null amount.
+    """
+    amount = df[pumpCol + ".amount"]
+    df[col] = amount
+    startOffset = pd.to_timedelta(df[pumpCol + ".start"], unit="ms")
+    df[col + ".localTime"] = pd.to_datetime(df["day"]) + startOffset
+    for suffix in (".min", ".weightedMean", ".max"):
+        df[col + suffix] = df[col]
+    keep = [col + ".localTime", col, col + ".min", col + ".weightedMean", col + ".max"]
+    return df, df.loc[amount.notnull(), keep]
+
+
+def processBasalSchedule(df, col):
+    """Expand each row's active basal schedule into segment and day-summary rows.
+
+    df needs "day", "activeSchedule" and "basalSchedules.<name>" columns; col is
+    the prefix for the output columns. Returns (dailySchedule, dailySummary).
+    Rows whose activeSchedule contains 'Auto Mode' get type "AutoMode" and NaN
+    rate statistics.
+    """
+    colHeadings = [col + ".localTime", col, col + ".durationHours", col + ".type",
+                   col + ".min", col + ".weightedMean", col + ".max"]
+    summaryColHeadings = ["day", col + ".min", col + ".weightedMean", col + ".max"]
+    dropCols = ["rate", "start", col + ".localTime", col, col + ".durationHours", col + ".type"]
+    dailySchedule = pd.DataFrame(columns=colHeadings)
+    dailySummary = pd.DataFrame(columns=summaryColHeadings)
+
+    for p, actSched in zip(df.index, df["activeSchedule"]):
+        # edge case where actSchedule is float
+        if isinstance(actSched, float):
+            actSched = str(int(actSched))
+        if 'Auto Mode' not in actSched:
+            tempDF = pd.DataFrame(df.loc[p, "basalSchedules." + actSched])
+            tempDF["day"] = df.loc[p, "day"]
+            tempDF[col + ".type"] = np.nan
+            tempDF[col + ".localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+            endOfDay = pd.DataFrame(pd.to_datetime(df.loc[p, "day"] + pd.Timedelta(1, "D")), columns=[col + ".localTime"], index=[0])
+            # the appended end-of-day row only closes the last segment; drop it
+            tempDF = get_setting_durations(tempDF, col, endOfDay)[:-1].copy()
+            tempDF = get_settingStats(tempDF, col, "rate")
+            tempSchedule = tempDF[colHeadings]
+            tempSummary = tempDF.drop(columns=dropCols)[0:1]
+        else:
+            # Auto Mode has no scheduled rates: record the type and an all-NaN summary
+            tempSchedule = pd.DataFrame({col + ".type": ["AutoMode"]})
+            tempSummary = pd.DataFrame({"day": [df.loc[p, "day"]]})
+        dailySchedule = pd.concat([dailySchedule, tempSchedule], ignore_index=True, sort=False)
+        dailySummary = pd.concat([dailySummary, tempSummary], ignore_index=True, sort=False)
+
+    return dailySchedule, dailySummary
+
+
+
 
 # %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
 dataPulledDate = "2018-09-28"
+dataPulledDF = pd.DataFrame(pd.to_datetime(dataPulledDate), columns=["day"], index=[0])
+dataPulledDF["day"] = dataPulledDF["day"].dt.date
 phiDate = "PHI-" + dataPulledDate
 donorPath = os.path.join("..", "bigdata-processing-pipeline", "data", phiDate + "-donor-data")
 
@@ -484,7 +565,7 @@ def isDSTChangeDay(currentDate, currentTimezone):
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
 
 # this is where the loop will go:
-for dIndex in range(0, len(donors)):
+for dIndex in [0]:  #range(0, len(donors)):
 
     # clear output dataframes
     isf, cir, correctionTarget = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -640,18 +721,33 @@ def isDSTChangeDay(currentDate, currentTimezone):
                         removeDuplicates(pumpSettings, "deviceTime")
                         metadata["nPumpSettingsDuplicatesRemoved"] = nPumpSettingsDuplicatesRemoved
 
+                        pumpSettings.sort_values("utcTime", ascending=True, inplace=True)
+                        pumpSettings.reset_index(drop=True, inplace=True)
+
                         # ISF
 #                        isfColHeadings = commonColumnHeadings.copy()
-                        isfColHeadings = ["localTime", "isf", "isf_mmolL_U"]
+                        isfColHeadings = ["isf.localTime", "isf", "isf_mmolL_U"]
 
                         if "insulinSensitivity.amount" in list(pumpSettings):
                             isfColHead = "insulinSensitivity"
                             pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
                             pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
-                            pumpSettings["localTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                            pumpSettings["isf.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
                                 pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
 
                             isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfColHeadings]
+
+                            # add a day summary
+                            isfDaySummary = isf.copy()
+                            isfDaySummary["day"] = isfDaySummary["isf.localTime"].dt.date
+                            isfDaySummary.drop(columns=["isf.localTime"], inplace=True)
+                            isfDaySummary["isf.min"] = isfDaySummary["isf"]
+                            isfDaySummary["isf.weightedMean"] = isfDaySummary["isf"]
+                            isfDaySummary["isf.max"] = isfDaySummary["isf"]
+                            isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
+                            isfDaySummary.reset_index(inplace=True, drop=True)
+                            isfDaySummary.fillna(method='ffill', inplace=True)
+
                         else:
                             isfColHead = "insulinSensitivities"
                             isf = pd.DataFrame(columns=isfColHeadings)
@@ -673,6 +769,7 @@ def isDSTChangeDay(currentDate, currentTimezone):
                                 tempDF["isf_mmolL_U"] = tempDF["amount"]
                                 tempDF["isf"] = mmolL_to_mgdL(tempDF["isf_mmolL_U"])
                                 isf = pd.concat([isf, tempDF[isfColHeadings]], ignore_index=True)
+                                pdb.set_trace()
 
                         # CIR
 #                        cirColHeadings = commonColumnHeadings.copy()
@@ -1035,27 +1132,31 @@ def isDSTChangeDay(currentDate, currentTimezone):
                         cgmLite = cgmLite[colOrder]
 
 
+                        # %% day level stats
+
+
+
                         # %% SAVE RESULTS
-                        outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
-                        outputFormat = (f"{minAge:02d}",
-                                        f"{maxAge:02d}",
-                                        f"{minYLW:02d}",
-                                        f"{maxYLW:02d}",
-                                        f"{nDaysClosedLoop:03d}",
-                                        f"{n670gDays:03d}",
-                                        hashID[0:4])
-                        outputFolderName = outputString % outputFormat
-                        outputFolderName_Path = os.path.join(outputPath,"data", outputFolderName)
-                        if not os.path.exists(outputFolderName_Path):
-                            os.makedirs(outputFolderName_Path)
-
-                        # save data for this person
-                        fName = outputFolderName + "-allSettings.csv"
-                        allSettings.to_csv(os.path.join(outputFolderName_Path, fName))
-                        fName = outputFolderName + "-pumpEvents.csv"
-                        pumpEvents.to_csv(os.path.join(outputFolderName_Path, fName))
-                        fName = outputFolderName + "-cgmLite.csv"
-                        cgmLite.to_csv(os.path.join(outputFolderName_Path, fName))
+#                        outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
+#                        outputFormat = (f"{minAge:02d}",
+#                                        f"{maxAge:02d}",
+#                                        f"{minYLW:02d}",
+#                                        f"{maxYLW:02d}",
+#                                        f"{nDaysClosedLoop:03d}",
+#                                        f"{n670gDays:03d}",
+#                                        hashID[0:4])
+#                        outputFolderName = outputString % outputFormat
+#                        outputFolderName_Path = os.path.join(outputPath,"data", outputFolderName)
+#                        if not os.path.exists(outputFolderName_Path):
+#                            os.makedirs(outputFolderName_Path)
+#
+#                        # save data for this person
+#                        fName = outputFolderName + "-allSettings.csv"
+#                        allSettings.to_csv(os.path.join(outputFolderName_Path, fName))
+#                        fName = outputFolderName + "-pumpEvents.csv"
+#                        pumpEvents.to_csv(os.path.join(outputFolderName_Path, fName))
+#                        fName = outputFolderName + "-cgmLite.csv"
+#                        cgmLite.to_csv(os.path.join(outputFolderName_Path, fName))
 
 
 
@@ -1075,7 +1176,7 @@ def isDSTChangeDay(currentDate, currentTimezone):
         else:
             metadata["flags"] = "file does not exist"
     else:
-        metadata["flags"] = "fmissing bDay/dDay"
+        metadata["flags"] = "missing bDay/dDay"
 
     # write metaData to allMetadata
     allMetadata = pd.merge(allMetadata, metadata, how="left", on="hashID")
@@ -1087,6 +1188,7 @@ def isDSTChangeDay(currentDate, currentTimezone):
 # %% V2 DATA TO GRAB
 # ADD ROUNDEDLOCAL TIME TO THE END RESULTS
 # GET RID OF ROUNDING TIME AT THE BEGINNING
+# DEFINE A DAY BETWEEN 6AM AND 6AM
 # EXPAND THE CORRECTION TIME VALUES TO BE UNIFORM ACROSS ALL USERS AND DEVICES
 # FIX DAYLIGHT SAVINGS TIME TIMES
 # FIGURE OUT WHY TEMP BASAL COUNTS ARE DIFFERENT BETWEEN THE TWO DIFFERENT METHODS

From 7076abab48866549dbcbfaffd97f3f1cfa54c934 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 14:32:57 -0600
Subject: [PATCH 29/78] calculate day summaries for settings (isf, cir, ct, and
 sbr)

---
 .../get-users-settings-and-events.py          | 102 ++++++++++++++----
 1 file changed, 82 insertions(+), 20 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index df162541..a6cb2906 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -752,9 +752,6 @@ def processBasalSchedule(df, col):
                             isfColHead = "insulinSensitivities"
                             isf = pd.DataFrame(columns=isfColHeadings)
 
-                            # edge case where active schedule is a float
-
-
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                                 # edge case where actSchedule is float
                                 if isinstance(actSched, float):
@@ -762,7 +759,7 @@ def processBasalSchedule(df, col):
 
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["isf.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                 tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                                 tempDF["age"] = pumpSettings.loc[p, "age"]
                                 tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
@@ -773,16 +770,29 @@ def processBasalSchedule(df, col):
 
                         # CIR
 #                        cirColHeadings = commonColumnHeadings.copy()
-                        cirColHeadings = ["localTime", "cir"]
+                        cirColHeadings = ["cir.localTime", "cir"]
 
                         if "carbRatio.amount" in list(pumpSettings):
                             cirColHead = "carbRatio"
                             pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
-                            pumpSettings["localTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                            pumpSettings["cir.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
                                 pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
 
                             cir = pumpSettings.loc[pumpSettings["carbRatio.amount"].notnull(), cirColHeadings]
+
+                            # add a day summary
+                            cirDaySummary = cir.copy()
+                            cirDaySummary["day"] = cirDaySummary["cir.localTime"].dt.date
+                            cirDaySummary.drop(columns=["cir.localTime"], inplace=True)
+                            cirDaySummary["cir.min"] = cirDaySummary["cir"]
+                            cirDaySummary["cir.weightedMean"] = cirDaySummary["cir"]
+                            cirDaySummary["cir.max"] = cirDaySummary["cir"]
+                            cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
+                            cirDaySummary.reset_index(inplace=True, drop=True)
+                            cirDaySummary.fillna(method='ffill', inplace=True)
+
                         else:
+
                             cirColHead = "carbRatios"
                             cir = pd.DataFrame(columns=cirColHeadings)
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
@@ -791,17 +801,18 @@ def processBasalSchedule(df, col):
                                     actSched = str(int(actSched))
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, cirColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["cir.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                 tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                                 tempDF["age"] = pumpSettings.loc[p, "age"]
                                 tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
                                 tempDF["cir"] = tempDF["amount"].astype(float)
                                 cir = pd.concat([cir, tempDF[cirColHeadings]], ignore_index=True)
+                                pdb.set_trace()
 
 
                         # CORRECTION TARGET
 #                        ctColHeadings = commonColumnHeadings.copy()
-                        ctColHeadings = ["localTime", "ct.low", "ct.high", "ct.target", "ct.range"]
+                        ctColHeadings = ["ct.localTime", "ct.low", "ct.high", "ct.target", "ct.range"]
 
                         if "bgTarget.start" in list(pumpSettings):
                             ctColHead = "bgTarget."
@@ -817,12 +828,25 @@ def processBasalSchedule(df, col):
                                     pumpSettings["ct." + targetType + "_mmolL"] = np.nan
                                     pumpSettings["ct." + targetType]  = np.nan
 
-                            pumpSettings["localTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                            pumpSettings["ct.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
                                 pd.to_timedelta(pumpSettings[ctColHead + "start"], unit="ms")
 
                             correctionTarget = pumpSettings.loc[pumpSettings["bgTarget.start"].notnull(), ctColHeadings]
 
+
+                            # add a day summary
+                            ctDaySummary = correctionTarget.copy()
+                            ctDaySummary["day"] = ctDaySummary["ct.localTime"].dt.date
+                            ctDaySummary.drop(columns=["ct.localTime"], inplace=True)
+#                            ctDaySummary["ct.min"] = ctDaySummary["ct.target"]
+#                            ctDaySummary["ct.weightedMean"] = ctDaySummary["ct"]
+#                            ctDaySummary["ct.max"] = ctDaySummary["ct"]
+                            ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
+                            ctDaySummary.reset_index(inplace=True, drop=True)
+                            ctDaySummary.fillna(method='ffill', inplace=True)
+
                         else:
+
                             ctColHead = "bgTargets"
                             correctionTarget = pd.DataFrame(columns=ctColHeadings)
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
@@ -831,7 +855,7 @@ def processBasalSchedule(df, col):
                                     actSched = str(int(actSched))
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, ctColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["ct.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                 tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
                                 tempDF["age"] = pumpSettings.loc[p, "age"]
                                 tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
@@ -847,11 +871,14 @@ def processBasalSchedule(df, col):
                                         tempDF["ct." + targetType]  = np.nan
 
                                 correctionTarget = pd.concat([correctionTarget, tempDF[ctColHeadings]], ignore_index=True)
+                                pdb.set_trace()
 
                         # SCHEDULED BASAL RATES
 #                        sbrColHeadings = commonColumnHeadings.copy()
-                        sbrColHeadings = ["localTime", "rate", "type"]
+                        sbrColHeadings = ["sbr.localTime", "rate", "type"]
                         sbr = pd.DataFrame(columns=sbrColHeadings)
+                        sbrDayColHeadings = ['day', 'sbr.min', 'sbr.weightedMean', 'sbr.max', 'type']
+                        sbrDaySummary = pd.DataFrame(columns=sbrDayColHeadings)
                         for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                             # edge case where actSchedule is float
                             if isinstance(actSched, float):
@@ -860,17 +887,39 @@ def processBasalSchedule(df, col):
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["type"] = np.nan
-                                tempDF["localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["sbr.localTime"], index=[0])
+                                tempDF = get_setting_durations(tempDF, "sbr", endOfDay)
+                                tempDF = tempDF[:-1]
+
+                                tempDaySummary = pd.DataFrame(index=[0])
+                                tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
+                                tempDaySummary["sbr.min"] = tempDF["rate"].min()
+                                tempDaySummary["sbr.weightedMean"] = \
+                                    np.sum(tempDF["rate"] * tempDF["sbr.durationHours"]) / tempDF["sbr.durationHours"].sum()
+                                tempDaySummary["sbr.max"] = tempDF["rate"].max()
+                                tempDaySummary["type"] = np.nan
+
                             else:
                                 tempDF = pd.DataFrame(index=[0])
-                                tempDF["localTime"] = np.nan
+                                tempDF["day"] = pumpSettings.loc[p, "day"]
+                                tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"])
                                 tempDF["rate"] = np.nan
                                 tempDF["type"] = "AutoMode"
 
-                            tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
-                            tempDF["age"] = pumpSettings.loc[p, "age"]
-                            tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
+                                tempDaySummary = pd.DataFrame(index=[0])
+                                tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
+                                tempDaySummary["sbr.min"] = np.nan
+                                tempDaySummary["sbr.weightedMean"] = np.nan
+                                tempDaySummary["sbr.max"] = np.nan
+                                tempDaySummary["type"] = "AutoMode"
+
                             sbr = pd.concat([sbr, tempDF[sbrColHeadings]], ignore_index=True)
+                            sbrDaySummary = pd.concat([sbrDaySummary, tempDaySummary], ignore_index=True)
+
+                        sbrDaySummary = pd.concat([sbrDaySummary, dataPulledDF], sort=False)
+                        sbrDaySummary.reset_index(inplace=True, drop=True)
+                        sbrDaySummary.fillna(method='ffill', inplace=True)
 
                         # max basal rate, max bolus amount, and insulin duration
                         if "rateMaximum" in list(data):
@@ -1013,6 +1062,10 @@ def processBasalSchedule(df, col):
                         dayData["date"] = pd.to_datetime(dayData["day"])
                         dayData["tzo"] = dayData[['date', 'timezone']].apply(lambda x: getTimezoneOffset(*x), axis=1)
 
+
+
+
+
                         # calculate the start and end of contiguous data
                         # these dates can be used when simulating and predicting, where
                         # you need both pump and cgm data
@@ -1057,7 +1110,6 @@ def processBasalSchedule(df, col):
                         metadata["maxYLW"] = maxYLW
 
 
-
                         # %% calculate local time
                         abr["date"] = pd.to_datetime(abr["utcTime"].dt.date)
                         abr = pd.merge(abr, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
@@ -1074,11 +1126,18 @@ def processBasalSchedule(df, col):
 
                         # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
                         # all settings
-                        allSettings = pd.merge(isf, cir, how="outer", on="localTime")
+
+                        allSettings = pd.merge(isf.rename(columns={"isf.localTime": "localTime"}),
+                                               cir.rename(columns={"cir.localTime": "localTime"}),
+                                               how="outer", on="localTime")
+                        allSettings = pd.merge(allSettings,
+                                               sbr.rename(columns={"rate": "sbr",
+                                                                   "type": "sbr.type",
+                                                                   "sbr.localTime": "localTime"}),
+                                               how="outer", on="localTime")
                         allSettings = pd.merge(allSettings,
-                                               sbr.rename(columns={"rate": "sbr", "type": "sbr.type"}),
+                                               correctionTarget.rename(columns={"ct.localTime": "localTime"}),
                                                how="outer", on="localTime")
-                        allSettings = pd.merge(allSettings, correctionTarget, how="outer", on="localTime")
                         allSettings["hashID"] = hashID
                         allSettings["age"] = np.floor((allSettings["localTime"] - bDate).dt.days/365.25).astype(int)
                         allSettings["ylw"] = np.floor((allSettings["localTime"] - dDate).dt.days/365.25).astype(int)
@@ -1136,6 +1195,9 @@ def processBasalSchedule(df, col):
 
 
 
+                        # %% age and ylw stats
+
+
                         # %% SAVE RESULTS
 #                        outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
 #                        outputFormat = (f"{minAge:02d}",

From 98d10f6a9cb47ff6fc394d653fa8234ad9f2434b Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 14:41:19 -0600
Subject: [PATCH 30/78] day summaries only include summary stats

---
 .../get-users-settings-and-events.py          | 28 ++++++++-----------
 1 file changed, 12 insertions(+), 16 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index a6cb2906..8923c5ab 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -738,12 +738,11 @@ def processBasalSchedule(df, col):
                             isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfColHeadings]
 
                             # add a day summary
-                            isfDaySummary = isf.copy()
-                            isfDaySummary["day"] = isfDaySummary["isf.localTime"].dt.date
-                            isfDaySummary.drop(columns=["isf.localTime"], inplace=True)
-                            isfDaySummary["isf.min"] = isfDaySummary["isf"]
-                            isfDaySummary["isf.weightedMean"] = isfDaySummary["isf"]
-                            isfDaySummary["isf.max"] = isfDaySummary["isf"]
+                            isfDaySummary = pd.DataFrame()
+                            isfDaySummary["day"] = isf["isf.localTime"].dt.date
+                            isfDaySummary["isf.min"] = isf["isf"]
+                            isfDaySummary["isf.weightedMean"] = isf["isf"]
+                            isfDaySummary["isf.max"] = isf["isf"]
                             isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
                             isfDaySummary.reset_index(inplace=True, drop=True)
                             isfDaySummary.fillna(method='ffill', inplace=True)
@@ -781,12 +780,11 @@ def processBasalSchedule(df, col):
                             cir = pumpSettings.loc[pumpSettings["carbRatio.amount"].notnull(), cirColHeadings]
 
                             # add a day summary
-                            cirDaySummary = cir.copy()
-                            cirDaySummary["day"] = cirDaySummary["cir.localTime"].dt.date
-                            cirDaySummary.drop(columns=["cir.localTime"], inplace=True)
-                            cirDaySummary["cir.min"] = cirDaySummary["cir"]
-                            cirDaySummary["cir.weightedMean"] = cirDaySummary["cir"]
-                            cirDaySummary["cir.max"] = cirDaySummary["cir"]
+                            cirDaySummary = pd.DataFrame()
+                            cirDaySummary["day"] = cir["cir.localTime"].dt.date
+                            cirDaySummary["cir.min"] = cir["cir"]
+                            cirDaySummary["cir.weightedMean"] = cir["cir"]
+                            cirDaySummary["cir.max"] = cir["cir"]
                             cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
                             cirDaySummary.reset_index(inplace=True, drop=True)
                             cirDaySummary.fillna(method='ffill', inplace=True)
@@ -838,9 +836,6 @@ def processBasalSchedule(df, col):
                             ctDaySummary = correctionTarget.copy()
                             ctDaySummary["day"] = ctDaySummary["ct.localTime"].dt.date
                             ctDaySummary.drop(columns=["ct.localTime"], inplace=True)
-#                            ctDaySummary["ct.min"] = ctDaySummary["ct.target"]
-#                            ctDaySummary["ct.weightedMean"] = ctDaySummary["ct"]
-#                            ctDaySummary["ct.max"] = ctDaySummary["ct"]
                             ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
                             ctDaySummary.reset_index(inplace=True, drop=True)
                             ctDaySummary.fillna(method='ffill', inplace=True)
@@ -1062,7 +1057,8 @@ def processBasalSchedule(df, col):
                         dayData["date"] = pd.to_datetime(dayData["day"])
                         dayData["tzo"] = dayData[['date', 'timezone']].apply(lambda x: getTimezoneOffset(*x), axis=1)
 
-
+                        # add settings to the dayData
+                        dayData = pd.merge(dayData, isfDaySummary, on="day", how="left")
 
 
 

From 0dbe2d91914dee3881e6b124a5924344763b7ea2 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 15:25:18 -0600
Subject: [PATCH 31/78] get settings summaries across for each age and ylw

---
 .../get-users-settings-and-events.py          | 96 ++++++++++++++++---
 1 file changed, 81 insertions(+), 15 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 8923c5ab..103747aa 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -565,7 +565,7 @@ def processBasalSchedule(df, col):
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
 
 # this is where the loop will go:
-for dIndex in [0]:  #range(0, len(donors)):
+for dIndex in range(0, len(donors)):
 
     # clear output dataframes
     isf, cir, correctionTarget = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -831,7 +831,6 @@ def processBasalSchedule(df, col):
 
                             correctionTarget = pumpSettings.loc[pumpSettings["bgTarget.start"].notnull(), ctColHeadings]
 
-
                             # add a day summary
                             ctDaySummary = correctionTarget.copy()
                             ctDaySummary["day"] = ctDaySummary["ct.localTime"].dt.date
@@ -870,9 +869,9 @@ def processBasalSchedule(df, col):
 
                         # SCHEDULED BASAL RATES
 #                        sbrColHeadings = commonColumnHeadings.copy()
-                        sbrColHeadings = ["sbr.localTime", "rate", "type"]
+                        sbrColHeadings = ["sbr.localTime", "rate", "sbr.type"]
                         sbr = pd.DataFrame(columns=sbrColHeadings)
-                        sbrDayColHeadings = ['day', 'sbr.min', 'sbr.weightedMean', 'sbr.max', 'type']
+                        sbrDayColHeadings = ['day', 'sbr.min', 'sbr.weightedMean', 'sbr.max', 'sbr.type']
                         sbrDaySummary = pd.DataFrame(columns=sbrDayColHeadings)
                         for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                             # edge case where actSchedule is float
@@ -881,7 +880,7 @@ def processBasalSchedule(df, col):
                             if 'Auto Mode' not in actSched:
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["type"] = np.nan
+                                tempDF["sbr.type"] = np.nan
                                 tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                 endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["sbr.localTime"], index=[0])
                                 tempDF = get_setting_durations(tempDF, "sbr", endOfDay)
@@ -893,21 +892,21 @@ def processBasalSchedule(df, col):
                                 tempDaySummary["sbr.weightedMean"] = \
                                     np.sum(tempDF["rate"] * tempDF["sbr.durationHours"]) / tempDF["sbr.durationHours"].sum()
                                 tempDaySummary["sbr.max"] = tempDF["rate"].max()
-                                tempDaySummary["type"] = np.nan
+                                tempDaySummary["sbr.type"] = np.nan
 
                             else:
                                 tempDF = pd.DataFrame(index=[0])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"])
                                 tempDF["rate"] = np.nan
-                                tempDF["type"] = "AutoMode"
+                                tempDF["sbr.type"] = "AutoMode"
 
                                 tempDaySummary = pd.DataFrame(index=[0])
                                 tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
                                 tempDaySummary["sbr.min"] = np.nan
                                 tempDaySummary["sbr.weightedMean"] = np.nan
                                 tempDaySummary["sbr.max"] = np.nan
-                                tempDaySummary["type"] = "AutoMode"
+                                tempDaySummary["sbr.type"] = "AutoMode"
 
                             sbr = pd.concat([sbr, tempDF[sbrColHeadings]], ignore_index=True)
                             sbrDaySummary = pd.concat([sbrDaySummary, tempDaySummary], ignore_index=True)
@@ -1049,7 +1048,6 @@ def processBasalSchedule(df, col):
                         for dfType in [isClosedLoopDay, is670g]:
                             dayData = pd.merge(dayData, dfType, on="day", how="left")
 
-
                         dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
                         dayData["validCGMData"] = dayData["cgm.count"] > (288*.75)
 
@@ -1059,8 +1057,27 @@ def processBasalSchedule(df, col):
 
                         # add settings to the dayData
                         dayData = pd.merge(dayData, isfDaySummary, on="day", how="left")
-
-
+                        dayData = pd.merge(dayData, cirDaySummary, on="day", how="left")
+                        dayData = pd.merge(dayData, ctDaySummary, on="day", how="left")
+                        dayData = pd.merge(dayData, sbrDaySummary, on="day", how="left")
+
+                        # fill data forward
+                        fillList = ['isf.min',
+                                    'isf.weightedMean',
+                                    'isf.max',
+                                    'cir.min',
+                                    'cir.weightedMean',
+                                    'cir.max',
+                                    'ct.low',
+                                    'ct.high',
+                                    'ct.target',
+                                    'ct.range',
+                                    'sbr.min',
+                                    'sbr.weightedMean',
+                                    'sbr.max',
+                                    'sbr.type']
+                        for fl in fillList:
+                            dayData[fl].fillna(method='ffill', inplace=True)
 
                         # calculate the start and end of contiguous data
                         # these dates can be used when simulating and predicting, where
@@ -1077,6 +1094,33 @@ def processBasalSchedule(df, col):
                         ageSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
                         ageSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
                         ageSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+
+                        # add in isf stats
+                        ageSummary["isf.nDays"] = catDF["isf.min"].count()
+                        ageSummary["isf.min"] = catDF["isf.min"].min()
+                        ageSummary["isf.weightedMean"] = catDF["isf.weightedMean"].sum() / catDF["isf.weightedMean"].count()
+                        ageSummary["isf.max"] = catDF["isf.max"].max()
+
+                        # add cir stats
+                        ageSummary["cir.nDays"] = catDF["cir.min"].count()
+                        ageSummary["cir.min"] = catDF["cir.min"].min()
+                        ageSummary["cir.weightedMean"] = catDF["cir.weightedMean"].sum() / catDF["cir.weightedMean"].count()
+                        ageSummary["cir.max"] = catDF["cir.max"].max()
+
+                        # correctionTarget stats
+                        for ch in ['ct.low','ct.high','ct.target', 'ct.range']:
+                            ageSummary[ch + ".nDays"] = catDF[ch].count()
+                            ageSummary[ch + ".min"] = catDF[ch].min()
+                            ageSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
+                            ageSummary[ch + ".max"] = catDF[ch].max()
+
+                        # add sbr stats
+                        ageSummary["sbr.nDays"] = catDF["sbr.min"].count()
+                        ageSummary["sbr.min"] = catDF["sbr.min"].min()
+                        ageSummary["sbr.weightedMean"] = catDF["sbr.weightedMean"].sum() / catDF["sbr.weightedMean"].count()
+                        ageSummary["sbr.max"] = catDF["sbr.max"].max()
+                        ageSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
+
                         ageSummary.reset_index(inplace=True)
 
                         analysisCriterion = ageSummary[((ageSummary["nDaysValidPump"]> 28) &
@@ -1096,6 +1140,32 @@ def processBasalSchedule(df, col):
                         ylwSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
                         ylwSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
                         ylwSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+
+                        ylwSummary["isf.nDays"] = catDF["isf.min"].count()
+                        ylwSummary["isf.min"] = catDF["isf.min"].min()
+                        ylwSummary["isf.weightedMean"] = catDF["isf.weightedMean"].sum() / catDF["isf.weightedMean"].count()
+                        ylwSummary["isf.max"] = catDF["isf.max"].max()
+
+                        # add cir stats
+                        ylwSummary["cir.nDays"] = catDF["cir.min"].count()
+                        ylwSummary["cir.min"] = catDF["cir.min"].min()
+                        ylwSummary["cir.weightedMean"] = catDF["cir.weightedMean"].sum() / catDF["cir.weightedMean"].count()
+                        ylwSummary["cir.max"] = catDF["cir.max"].max()
+
+                        # correctionTarget stats
+                        for ch in ['ct.low','ct.high','ct.target', 'ct.range']:
+                            ylwSummary[ch + ".nDays"] = catDF[ch].count()
+                            ylwSummary[ch + ".min"] = catDF[ch].min()
+                            ylwSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
+                            ylwSummary[ch + ".max"] = catDF[ch].max()
+
+                        # add sbr stats
+                        ylwSummary["sbr.nDays"] = catDF["sbr.min"].count()
+                        ylwSummary["sbr.min"] = catDF["sbr.min"].min()
+                        ylwSummary["sbr.weightedMean"] = catDF["sbr.weightedMean"].sum() / catDF["sbr.weightedMean"].count()
+                        ylwSummary["sbr.max"] = catDF["sbr.max"].max()
+                        ylwSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
+
                         ylwSummary.reset_index(inplace=True)
 
                         analysisCriterion = ylwSummary[((ylwSummary["nDaysValidPump"]> 28) &
@@ -1187,10 +1257,6 @@ def processBasalSchedule(df, col):
                         cgmLite = cgmLite[colOrder]
 
 
-                        # %% day level stats
-
-
-
                         # %% age and ylw stats
 
 

From fc4a253ff7b05c2ead31f162f11df0c13dada0e9 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 18:34:08 -0600
Subject: [PATCH 32/78] fix edge case 'US/Pacific-New'

---
 projects/predict-simulate/get-users-settings-and-events.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 103747aa..e692481c 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -441,6 +441,10 @@ def getTzoForDateTime(utcTime, currentTimezone):
 
 def getTimezoneOffset(currentDate, currentTimezone):
 
+    # edge case for 'US/Pacific-New'
+    if currentTimezone in 'US/Pacific-New':
+        currentTimezone = 'US/Pacific'
+
     tz = timezone(currentTimezone)
     # here we add 1 day to the current date to account for changes to/from DST
     tzoNum = int(tz.localize(currentDate + timedelta(days=1)).strftime("%z"))

From b5ed0c6e727b257b81174eee7ba16ffd01c46046 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 18:35:14 -0600
Subject: [PATCH 33/78] fix scheduled isf and cir

---
 .../get-users-settings-and-events.py          | 48 +++++++++++++++----
 1 file changed, 38 insertions(+), 10 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index e692481c..50a3d33b 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -754,7 +754,8 @@ def processBasalSchedule(df, col):
                         else:
                             isfColHead = "insulinSensitivities"
                             isf = pd.DataFrame(columns=isfColHeadings)
-
+                            isfDayColHeadings = ['day', 'isf.min', 'isf.weightedMean', 'isf.max']
+                            isfDaySummary = pd.DataFrame(columns=isfDayColHeadings)
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                                 # edge case where actSchedule is float
                                 if isinstance(actSched, float):
@@ -763,13 +764,25 @@ def processBasalSchedule(df, col):
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["isf.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-                                tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
-                                tempDF["age"] = pumpSettings.loc[p, "age"]
-                                tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
                                 tempDF["isf_mmolL_U"] = tempDF["amount"]
                                 tempDF["isf"] = mmolL_to_mgdL(tempDF["isf_mmolL_U"])
+                                endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["isf.localTime"], index=[0])
+                                tempDF = get_setting_durations(tempDF, "isf", endOfDay)
+                                tempDF = tempDF[:-1]
+
+                                tempDaySummary = pd.DataFrame(index=[0])
+                                tempDaySummary["day"] = tempDF["isf.localTime"].dt.date
+                                tempDaySummary["isf.min"] = tempDF["isf"].min()
+                                tempDaySummary["isf.weightedMean"] = \
+                                    np.sum(tempDF["isf"] * tempDF["isf.durationHours"]) / tempDF["isf.durationHours"].sum()
+                                tempDaySummary["isf.max"] = tempDF["isf"].max()
+
                                 isf = pd.concat([isf, tempDF[isfColHeadings]], ignore_index=True)
-                                pdb.set_trace()
+                                isfDaySummary = pd.concat([isfDaySummary, tempDaySummary], ignore_index=True)
+
+                            isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
+                            isfDaySummary.reset_index(inplace=True, drop=True)
+                            isfDaySummary.fillna(method='ffill', inplace=True)
 
                         # CIR
 #                        cirColHeadings = commonColumnHeadings.copy()
@@ -797,19 +810,34 @@ def processBasalSchedule(df, col):
 
                             cirColHead = "carbRatios"
                             cir = pd.DataFrame(columns=cirColHeadings)
+                            cirDayColHeadings = ['day', 'cir.min', 'cir.weightedMean', 'cir.max']
+                            cirDaySummary = pd.DataFrame(columns=cirDayColHeadings)
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                                 # edge case where actSchedule is float
                                 if isinstance(actSched, float):
                                     actSched = str(int(actSched))
+
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, cirColHead + "." + actSched])
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["cir.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-                                tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
-                                tempDF["age"] = pumpSettings.loc[p, "age"]
-                                tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
-                                tempDF["cir"] = tempDF["amount"].astype(float)
+                                tempDF["cir"] = tempDF["amount"]
+                                endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["cir.localTime"], index=[0])
+                                tempDF = get_setting_durations(tempDF, "cir", endOfDay)
+                                tempDF = tempDF[:-1]
+
+                                tempDaySummary = pd.DataFrame(index=[0])
+                                tempDaySummary["day"] = tempDF["cir.localTime"].dt.date
+                                tempDaySummary["cir.min"] = tempDF["cir"].min()
+                                tempDaySummary["cir.weightedMean"] = \
+                                    np.sum(tempDF["cir"] * tempDF["cir.durationHours"]) / tempDF["cir.durationHours"].sum()
+                                tempDaySummary["cir.max"] = tempDF["cir"].max()
+
                                 cir = pd.concat([cir, tempDF[cirColHeadings]], ignore_index=True)
-                                pdb.set_trace()
+                                cirDaySummary = pd.concat([cirDaySummary, tempDaySummary], ignore_index=True)
+
+                            cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
+                            cirDaySummary.reset_index(inplace=True, drop=True)
+                            cirDaySummary.fillna(method='ffill', inplace=True)
 
 
                         # CORRECTION TARGET

From cf7759c392e083a1cbb63906afd1aa451581e661 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 19:57:18 -0600
Subject: [PATCH 34/78] fix correction target age summaries to include min,
 wMean, and max

---
 .../get-users-settings-and-events.py          | 111 +++++++++++-------
 1 file changed, 69 insertions(+), 42 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 50a3d33b..c1631a0e 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -729,7 +729,6 @@ def processBasalSchedule(df, col):
                         pumpSettings.reset_index(drop=True, inplace=True)
 
                         # ISF
-#                        isfColHeadings = commonColumnHeadings.copy()
                         isfColHeadings = ["isf.localTime", "isf", "isf_mmolL_U"]
 
                         if "insulinSensitivity.amount" in list(pumpSettings):
@@ -785,7 +784,6 @@ def processBasalSchedule(df, col):
                             isfDaySummary.fillna(method='ffill', inplace=True)
 
                         # CIR
-#                        cirColHeadings = commonColumnHeadings.copy()
                         cirColHeadings = ["cir.localTime", "cir"]
 
                         if "carbRatio.amount" in list(pumpSettings):
@@ -841,8 +839,12 @@ def processBasalSchedule(df, col):
 
 
                         # CORRECTION TARGET
-#                        ctColHeadings = commonColumnHeadings.copy()
                         ctColHeadings = ["ct.localTime", "ct.low", "ct.high", "ct.target", "ct.range"]
+                        ctDayColHeadings = ['day',
+                                            "ct.low.min", "ct.low.weightedMean", "ct.low.max",
+                                            "ct.high.min", "ct.high.weightedMean", "ct.high.max",
+                                            "ct.target.min", "ct.target.weightedMean", "ct.target.max",
+                                            "ct.range.min", "ct.range.weightedMean", "ct.range.max"]
 
                         if "bgTarget.start" in list(pumpSettings):
                             ctColHead = "bgTarget."
@@ -864,43 +866,63 @@ def processBasalSchedule(df, col):
                             correctionTarget = pumpSettings.loc[pumpSettings["bgTarget.start"].notnull(), ctColHeadings]
 
                             # add a day summary
-                            ctDaySummary = correctionTarget.copy()
-                            ctDaySummary["day"] = ctDaySummary["ct.localTime"].dt.date
-                            ctDaySummary.drop(columns=["ct.localTime"], inplace=True)
+                            ctDaySummary = pd.DataFrame(columns=ctDayColHeadings)
+                            ctDaySummary["day"] = correctionTarget["ct.localTime"].dt.date
+                            # add min, weightedMean, and max
+                            for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
+                                for stat in [".min", ".weightedMean", ".max"]:
+                                    ctDaySummary[targetType + stat] = correctionTarget[targetType]
+
+
                             ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
                             ctDaySummary.reset_index(inplace=True, drop=True)
                             ctDaySummary.fillna(method='ffill', inplace=True)
 
                         else:
-
                             ctColHead = "bgTargets"
                             correctionTarget = pd.DataFrame(columns=ctColHeadings)
+
+                            ctDaySummary = pd.DataFrame(columns=ctDayColHeadings)
                             for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
                                 # edge case where actSchedule is float
                                 if isinstance(actSched, float):
                                     actSched = str(int(actSched))
+
                                 tempDF = pd.DataFrame(pumpSettings.loc[p, ctColHead + "." + actSched])
+                                targetTypes = list(set(list(tempDF)) - set(["start"]))
                                 tempDF["day"] = pumpSettings.loc[p, "day"]
                                 tempDF["ct.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-                                tempDF["hashID"] = pumpSettings.loc[p, "hashID"]
-                                tempDF["age"] = pumpSettings.loc[p, "age"]
-                                tempDF["ylw"] = pumpSettings.loc[p, "ylw"]
-                                for targetType in ["low", "high", "target", "range"]:
-                                    if targetType in list(tempDF):
-                                        tempDF["ct." + targetType + "_mmolL"] = \
-                                            tempDF[targetType]
-
-                                        tempDF["ct." + targetType] = \
-                                            mmolL_to_mgdL(tempDF["ct." + targetType + "_mmolL"])
-                                    else:
-                                        tempDF["ct." + targetType + "_mmolL"] = np.nan
-                                        tempDF["ct." + targetType]  = np.nan
-
-                                correctionTarget = pd.concat([correctionTarget, tempDF[ctColHeadings]], ignore_index=True)
-                                pdb.set_trace()
+                                endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["ct.localTime"], index=[0])
+                                tempDF = get_setting_durations(tempDF, "ct", endOfDay)
+                                tempDF = tempDF[:-1]
+
+                                tempDaySummary = pd.DataFrame(columns=ctDayColHeadings, index=[0])
+                                tempDaySummary["day"] = tempDF["ct.localTime"].dt.date
+
+                                for targetType in targetTypes:
+                                    tempDF["ct." + targetType] = mmolL_to_mgdL(tempDF[targetType])
+
+                                    tempDaySummary["ct." + targetType + ".min"] = tempDF["ct." + targetType].min()
+                                    tempDaySummary["ct." + targetType + ".weightedMean"] = \
+                                        np.sum(tempDF["ct." + targetType] * tempDF["ct.durationHours"]) / tempDF["ct.durationHours"].sum()
+                                    tempDaySummary["ct." + targetType + ".max"] = tempDF["ct." + targetType].max()
+
+                                correctionTarget = \
+                                    pd.concat([correctionTarget,
+                                               tempDF.drop(columns=['start',
+                                                                    'target',
+                                                                    'day',
+                                                                    'ct.durationHours'])],
+                                               ignore_index=True, sort=False)
+                                ctDaySummary = pd.concat([ctDaySummary, tempDaySummary],
+                                                         ignore_index=True, sort=False)
+
+                            ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
+                            ctDaySummary.fillna(method='ffill', inplace=True)
+                            ctDaySummary.drop_duplicates(inplace=True)
+                            ctDaySummary.reset_index(inplace=True, drop=True)
 
                         # SCHEDULED BASAL RATES
-#                        sbrColHeadings = commonColumnHeadings.copy()
                         sbrColHeadings = ["sbr.localTime", "rate", "sbr.type"]
                         sbr = pd.DataFrame(columns=sbrColHeadings)
                         sbrDayColHeadings = ['day', 'sbr.min', 'sbr.weightedMean', 'sbr.max', 'sbr.type']
@@ -1100,10 +1122,10 @@ def processBasalSchedule(df, col):
                                     'cir.min',
                                     'cir.weightedMean',
                                     'cir.max',
-                                    'ct.low',
-                                    'ct.high',
-                                    'ct.target',
-                                    'ct.range',
+                                    'ct.low.min', 'ct.low.weightedMean', 'ct.low.max',
+                                    'ct.high.min', 'ct.high.weightedMean', 'ct.high.max',
+                                    'ct.target.min', 'ct.target.weightedMean', 'ct.target.max',
+                                    'ct.range.min', 'ct.range.weightedMean', 'ct.range.max',
                                     'sbr.min',
                                     'sbr.weightedMean',
                                     'sbr.max',
@@ -1139,13 +1161,6 @@ def processBasalSchedule(df, col):
                         ageSummary["cir.weightedMean"] = catDF["cir.weightedMean"].sum() / catDF["cir.weightedMean"].count()
                         ageSummary["cir.max"] = catDF["cir.max"].max()
 
-                        # correctionTarget stats
-                        for ch in ['ct.low','ct.high','ct.target', 'ct.range']:
-                            ageSummary[ch + ".nDays"] = catDF[ch].count()
-                            ageSummary[ch + ".min"] = catDF[ch].min()
-                            ageSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
-                            ageSummary[ch + ".max"] = catDF[ch].max()
-
                         # add sbr stats
                         ageSummary["sbr.nDays"] = catDF["sbr.min"].count()
                         ageSummary["sbr.min"] = catDF["sbr.min"].min()
@@ -1153,6 +1168,15 @@ def processBasalSchedule(df, col):
                         ageSummary["sbr.max"] = catDF["sbr.max"].max()
                         ageSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
 
+                        # correctionTarget stats
+                        for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
+                            for stat in [".min", ".weightedMean", ".max"]:
+                                ch = targetType + stat
+                                ageSummary[ch + ".nDays"] = catDF[ch].count()
+                                ageSummary[ch + ".min"] = catDF[ch].min()
+                                ageSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
+                                ageSummary[ch + ".max"] = catDF[ch].max()
+
                         ageSummary.reset_index(inplace=True)
 
                         analysisCriterion = ageSummary[((ageSummary["nDaysValidPump"]> 28) &
@@ -1184,13 +1208,6 @@ def processBasalSchedule(df, col):
                         ylwSummary["cir.weightedMean"] = catDF["cir.weightedMean"].sum() / catDF["cir.weightedMean"].count()
                         ylwSummary["cir.max"] = catDF["cir.max"].max()
 
-                        # correctionTarget stats
-                        for ch in ['ct.low','ct.high','ct.target', 'ct.range']:
-                            ylwSummary[ch + ".nDays"] = catDF[ch].count()
-                            ylwSummary[ch + ".min"] = catDF[ch].min()
-                            ylwSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
-                            ylwSummary[ch + ".max"] = catDF[ch].max()
-
                         # add sbr stats
                         ylwSummary["sbr.nDays"] = catDF["sbr.min"].count()
                         ylwSummary["sbr.min"] = catDF["sbr.min"].min()
@@ -1198,6 +1215,15 @@ def processBasalSchedule(df, col):
                         ylwSummary["sbr.max"] = catDF["sbr.max"].max()
                         ylwSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
 
+                        # correctionTarget stats
+                        for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
+                            for stat in [".min", ".weightedMean", ".max"]:
+                                ch = targetType + stat
+                                ylwSummary[ch + ".nDays"] = catDF[ch].count()
+                                ylwSummary[ch + ".min"] = catDF[ch].min()
+                                ylwSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
+                                ylwSummary[ch + ".max"] = catDF[ch].max()
+
                         ylwSummary.reset_index(inplace=True)
 
                         analysisCriterion = ylwSummary[((ylwSummary["nDaysValidPump"]> 28) &
@@ -1343,6 +1369,7 @@ def processBasalSchedule(df, col):
 
 # %% V2 DATA TO GRAB
 # ADD ROUNDEDLOCAL TIME TO THE END RESULTS
+# CALCULATE MMOL SUMMARIES
 # GET RID OF ROUNDING TIME AT THE BEGINNING
 # DEFINE A DAY BETWEEN 6AM AND 6AM
 # EXPAND THE CORRECTION TIME VALUES TO BE UNIFORM ACROSS ALL USERS AND DEVICES

From 72e568bc686c9d9f5df1ee00eef687aefe98c746 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 22:07:55 -0600
Subject: [PATCH 35/78] add insulin/carb events and basic cgm stats

---
 .../get-users-settings-and-events.py          | 93 ++++++++++++++++---
 1 file changed, 82 insertions(+), 11 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index c1631a0e..479e35ec 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -442,7 +442,7 @@ def getTzoForDateTime(utcTime, currentTimezone):
 def getTimezoneOffset(currentDate, currentTimezone):
 
     # edge case for 'US/Pacific-New'
-    if currentTimezone in 'US/Pacific-New':
+    if currentTimezone == 'US/Pacific-New':
         currentTimezone = 'US/Pacific'
 
     tz = timezone(currentTimezone)
@@ -457,6 +457,8 @@ def getTimezoneOffset(currentDate, currentTimezone):
 
 
 def isDSTChangeDay(currentDate, currentTimezone):
+    if currentTimezone == 'US/Pacific-New':
+        currentTimezone = 'US/Pacific'
     tzoCurrentDay = getTimezoneOffset(pd.to_datetime(currentDate),
                                       currentTimezone)
     tzoPreviousDay = getTimezoneOffset(pd.to_datetime(currentDate) +
@@ -543,8 +545,6 @@ def processBasalSchedule(df, col):
     return dailySchedule, dailySummary
 
 
-
-
 # %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
 dataPulledDate = "2018-09-28"
 dataPulledDF = pd.DataFrame(pd.to_datetime(dataPulledDate), columns=["day"], index=[0])
@@ -564,12 +564,17 @@ def processBasalSchedule(df, col):
 donorList = phiDate + "-uniqueDonorList.csv"
 donors = load_csv(os.path.join(donorPath, donorList))
 
-allMetadata = donors[['hashID', 'diagnosisType']].copy()
+allMetadata = pd.DataFrame()
+allAgeSummaries = pd.DataFrame()
+allYlwSummaries = pd.DataFrame()
+
 
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
 
 # this is where the loop will go:
-for dIndex in range(0, len(donors)):
+startIndex = 0
+endIndex = len(donors)
+for dIndex in range(startIndex, endIndex):
 
     # clear output dataframes
     isf, cir, correctionTarget = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
@@ -1088,7 +1093,6 @@ def processBasalSchedule(df, col):
                         dataPerDay["ylw"] = catDF.ylw.mean()
                         dataPerDay["timezone"] = catDF.timezone.describe()["top"]
 
-
                         # calculate all of the data start and end range
                         # this can be used for looking at settings
                         dayBeginDate = min(cgmBeginDate, bolusBeginDate, basalBeginDate)
@@ -1105,6 +1109,9 @@ def processBasalSchedule(df, col):
                         dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
                         dayData["validCGMData"] = dayData["cgm.count"] > (288*.75)
 
+                        dayData["timezone"].fillna(method='ffill', inplace=True)
+                        dayData["timezone"].fillna(method='bfill', inplace=True)
+
                         dayData["isDSTChangeDay"] = dayData[['day', 'timezone']].apply(lambda x: isDSTChangeDay(*x), axis=1)
                         dayData["date"] = pd.to_datetime(dayData["day"])
                         dayData["tzo"] = dayData[['date', 'timezone']].apply(lambda x: getTimezoneOffset(*x), axis=1)
@@ -1315,10 +1322,73 @@ def processBasalSchedule(df, col):
                         cgmLite = cgmLite[colOrder]
 
 
-                        # %% age and ylw stats
+                        # %% SAVE RESULTS
 
+                        # age and ylw stats
+                        pumpEvents["rateTimesDurationHours"] = pumpEvents["rate"] * pumpEvents["durationHours"]
+                        pumpEvents.rename(columns={"rate":"basalRate"}, inplace=True)
+                        catDF = pumpEvents.groupby("age")
 
-                        # %% SAVE RESULTS
+                        # actual basal rates
+                        agePump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
+                        agePump["basalRate.min"] = catDF["basalRate"].min()
+                        agePump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
+                        agePump["basalRate.max"] = catDF["basalRate"].max()
+
+                        # insulin events
+                        insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
+                        agePump = pd.concat([agePump, insulinEvents], axis=1)
+
+                        # carbs entered in bolus calculator
+                        carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
+                        agePump = pd.concat([agePump, carbEvents], axis=1)
+
+                        # very low level cgm stats per age
+                        catDF = cgmLite.groupby("age")
+                        cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
+                        agePumpCgm = pd.concat([agePump, cgmStats], axis=1)
+
+                        agePumpCgm.reset_index(inplace=True)
+
+                        ageSummary = pd.merge(ageSummary, agePumpCgm, on="age", how="left")
+                        ageSummary["hashID"] = hashID
+                        allAgeSummaries = pd.concat([allAgeSummaries, ageSummary], ignore_index=True)
+
+                        allAgeSummaries.to_csv(os.path.join(outputPath,
+                            "allAgeSummaries-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
+
+                        # repoeat for years living with
+                        catDF = pumpEvents.groupby("ylw")
+                        # actual basal rates
+                        ylwPump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
+                        ylwPump["basalRate.min"] = catDF["basalRate"].min()
+                        ylwPump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
+                        ylwPump["basalRate.max"] = catDF["basalRate"].max()
+
+                        # insulin events
+                        insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
+                        ylwPump = pd.concat([ylwPump, insulinEvents], axis=1)
+
+                        # carbs entered in bolus calculator
+                        carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
+                        ylwPump = pd.concat([ylwPump, carbEvents], axis=1)
+
+                        # very low level cgm stats per age
+                        catDF = cgmLite.groupby("ylw")
+                        cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
+                        ylwPumpCgm = pd.concat([ylwPump, cgmStats], axis=1)
+
+                        ylwPumpCgm.reset_index(inplace=True)
+
+                        ylwSummary = pd.merge(ylwSummary, ylwPumpCgm, on="ylw", how="left")
+
+                        ylwSummary["hashID"] = hashID
+                        allYlwSummaries = pd.concat([allYlwSummaries, ylwSummary], ignore_index=True)
+
+                        allYlwSummaries.to_csv(os.path.join(outputPath,
+                            "allYlwSummaries-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
+
+                         # %% save data for this person
 #                        outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
 #                        outputFormat = (f"{minAge:02d}",
 #                                        f"{maxAge:02d}",
@@ -1332,7 +1402,6 @@ def processBasalSchedule(df, col):
 #                        if not os.path.exists(outputFolderName_Path):
 #                            os.makedirs(outputFolderName_Path)
 #
-#                        # save data for this person
 #                        fName = outputFolderName + "-allSettings.csv"
 #                        allSettings.to_csv(os.path.join(outputFolderName_Path, fName))
 #                        fName = outputFolderName + "-pumpEvents.csv"
@@ -1361,13 +1430,15 @@ def processBasalSchedule(df, col):
         metadata["flags"] = "missing bDay/dDay"
 
     # write metaData to allMetadata
-    allMetadata = pd.merge(allMetadata, metadata, how="left", on="hashID")
-    allMetadata.to_csv(os.path.join(outputPath, "allMetadata.csv"))
+    allMetadata = pd.concat([allMetadata, metadata], axis=0, sort=True)
+    allMetadata.to_csv(os.path.join(outputPath,
+        "allMetadata-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
 
     print("done with", dIndex)
 
 
 # %% V2 DATA TO GRAB
+# THERE IS AN ISSUE WITH COUNTING 670G SETTINGS
 # ADD ROUNDEDLOCAL TIME TO THE END RESULTS
 # CALCULATE MMOL SUMMARIES
 # GET RID OF ROUNDING TIME AT THE BEGINNING

From 42c86bf1fdc1f0b9b1dddb7ecb966f3b8f51c0d1 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Sun, 13 Jan 2019 23:12:45 -0600
Subject: [PATCH 36/78] add argparse to run from the command line

---
 .../get-users-settings-and-events.py          | 122 +++++++++---------
 1 file changed, 64 insertions(+), 58 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 479e35ec..47780ce0 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -18,15 +18,51 @@
 from datetime import timedelta
 import datetime as dt
 import os
+import argparse
 import pdb
 
 
 # %% USER INPUTS (ADD THIS IN LATER)
-#codeDescription = "Get user's settings and events"
-#parser = argparse.ArgumentParser(description=codeDescription)
-
-
+codeDescription = "Get user's settings and events"
+parser = argparse.ArgumentParser(description=codeDescription)
+
+parser.add_argument("-d",
+                    "--date-stamp",
+                    dest="dateStamp",
+                    default="2019-01-10",
+                    help="date in '%Y-%m-%d' format of unique donor list" +
+                    "(e.g., PHI-2018-03-02-uniqueDonorList)")
+
+parser.add_argument("-s",
+                    "--start-index",
+                    dest="startIndex",
+                    default=0,
+                    help="donor index (integer) to start at")
+
+parser.add_argument("-e",
+                    "--end-index",
+                    dest="endIndex",
+                    default=-1,
+                    help="donor index (integer) to end at," +
+                    "-1 will result in 1 file if startIndex != 0," +
+                    "and will default to number of unique donors" +
+                    "if startIndex = 0, or endIndex = -2")
+
+
+args = parser.parse_args()
 # %% FUNCTIONS
+def defineStartAndEndIndex(args, nDonors):
+    startIndex = int(args.startIndex)
+    endIndex = int(args.endIndex)
+    if endIndex == -1:
+        if startIndex == 0:
+            endIndex = nDonors
+        else:
+            endIndex = startIndex + 1
+    if endIndex == -2:
+        endIndex = nDonors
+    return startIndex, endIndex
+
 
 # CLEAN DATA FUNCTIONS
 def removeNegativeDurations(df):
@@ -412,6 +448,8 @@ def getListOfDexcomCGMDays(df):
         totalCgms = len(df.deviceId.notnull())
         df["dexcomCGM"] = df.deviceId.str.contains("|".join(searchfor))
         percentDexcomCGM = df.dexcomCGM.sum() / totalCgms * 100
+    else:
+        percentDexcomCGM = np.nan
     return df, percentDexcomCGM
 
 
@@ -505,48 +543,14 @@ def getPumpSettingsStats(df, col, pumpCol):
     return df, df2
 
 
-def processBasalSchedule(df, col):
-    colHeadings = [col + ".localTime", col, col + ".durationHours", col + ".type",
-                   col + ".min", col + ".weightedMean", col + ".max"]
-    summaryColHeadings = ["day", col + ".min", col + ".weightedMean", col + ".max"]
-    dropCols = ["rate", "start", col + ".localTime", col, col + ".durationHours", col + ".type"]
-
-    dailySchedule = pd.DataFrame(columns=colHeadings)
-    dailySummary = pd.DataFrame(columns=summaryColHeadings)
-
-    for p, actSched in zip(df.index, df["activeSchedule"]):
-        # edge case where actSchedule is float
-        if isinstance(actSched, float):
-            actSched = str(int(actSched))
-        if 'Auto Mode' not in actSched:
-            tempDF = pd.DataFrame(df.loc[p, "basalSchedules." + actSched])
-            tempDF["day"] = df.loc[p, "day"]
-            tempDF[col + ".type"] = np.nan
-            tempDF[col + ".localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-            endOfDay = pd.DataFrame(pd.to_datetime(df.loc[p, "day"] + pd.Timedelta(1, "D")), columns=[col + ".localTime"], index=[0])
-            tempDF = get_setting_durations(tempDF, col, endOfDay)
-            tempDF = tempDF[:-1]
-            tempDF = get_settingStats(tempDF, col, "rate")
-            dailySchedule = pd.concat([dailySchedule, tempDF[colHeadings]], ignore_index=True, sort=False)
-            tempSummary = tempDF.drop(columns=dropCols)
-            tempSummary["day"] = df.loc[p, "day"]
-            tempSummary = tempSummary[0:1]
-            dailySummary = pd.concat([dailySummary, tempSummary], ignore_index=True, sort=False)
-
-        else:
-            pdb.set_trace()
-            tempDF = pd.DataFrame(index=[0])
-            tempDF[col + ".type"] = "AutoMode"
-            dailySchedule = pd.concat([dailySchedule, tempDF], ignore_index=True, sort=False)
-            tempSummary["day"] = df.loc[p, "day"]
-            tempSummary = tempSummary[0:1]
-            dailySummary = pd.concat([dailySummary, tempSummary], ignore_index=True, sort=False)
 
-    return dailySchedule, dailySummary
 
 
 # %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
-dataPulledDate = "2018-09-28"
+
+
+
+dataPulledDate = args.dateStamp
 dataPulledDF = pd.DataFrame(pd.to_datetime(dataPulledDate), columns=["day"], index=[0])
 dataPulledDF["day"] = dataPulledDF["day"].dt.date
 phiDate = "PHI-" + dataPulledDate
@@ -571,9 +575,11 @@ def processBasalSchedule(df, col):
 
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
 
-# this is where the loop will go:
-startIndex = 0
-endIndex = len(donors)
+nUniqueDonors = len(donors)
+
+# define start and end index
+startIndex, endIndex = defineStartAndEndIndex(args, nUniqueDonors)
+
 for dIndex in range(startIndex, endIndex):
 
     # clear output dataframes
@@ -678,12 +684,12 @@ def processBasalSchedule(df, col):
                         # get a summary of boluses per day
                         bolusDaySummary = get_bolusDaySummary(bolus)
 
-                        # isf and cir associated with bolus event
-                        if "insulinSensitivities" in list(bolus):
-                            pdb.set_trace()
-
-                        if "carbRatios" in list(bolus):
-                            pdb.set_trace()
+#                        # isf and cir associated with bolus event
+#                        if "insulinSensitivities" in list(bolus):
+#                            pdb.set_trace()
+#
+#                        if "carbRatios" in list(bolus):
+#                            pdb.set_trace()
 
                         bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
                         bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
@@ -974,13 +980,13 @@ def processBasalSchedule(df, col):
                         sbrDaySummary.reset_index(inplace=True, drop=True)
                         sbrDaySummary.fillna(method='ffill', inplace=True)
 
-                        # max basal rate, max bolus amount, and insulin duration
-                        if "rateMaximum" in list(data):
-                            pdb.set_trace()
-                        if "amountMaximum" in list(data):
-                            pdb.set_trace()
-                        if "bolus.calculator" in list(data):
-                            pdb.set_trace()
+#                        # max basal rate, max bolus amount, and insulin duration
+#                        if "rateMaximum" in list(data):
+#                            pdb.set_trace()
+#                        if "amountMaximum" in list(data):
+#                            pdb.set_trace()
+#                        if "bolus.calculator" in list(data):
+#                            pdb.set_trace()
 
 
                         # %% ACTUAL BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))

From 640eaf61953861d63a6648819d18cdac7bef43e0 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Mon, 14 Jan 2019 00:04:57 -0600
Subject: [PATCH 37/78] add try/except to help batch process

---
 .../get-users-settings-and-events.py          | 1622 ++++++++---------
 1 file changed, 811 insertions(+), 811 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 47780ce0..1c979d2f 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -29,7 +29,7 @@
 parser.add_argument("-d",
                     "--date-stamp",
                     dest="dateStamp",
-                    default="2019-01-10",
+                    default="2018-09-28",
                     help="date in '%Y-%m-%d' format of unique donor list" +
                     "(e.g., PHI-2018-03-02-uniqueDonorList)")
 
@@ -574,866 +574,866 @@ def getPumpSettingsStats(df, col, pumpCol):
 
 
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
-
 nUniqueDonors = len(donors)
 
 # define start and end index
 startIndex, endIndex = defineStartAndEndIndex(args, nUniqueDonors)
 
 for dIndex in range(startIndex, endIndex):
-
-    # clear output dataframes
-    isf, cir, correctionTarget = pd.DataFrame(), pd.DataFrame(), pd.DataFrame()
-
-    # %% ID, HASHID, AGE, & YLW
+    # % ID, HASHID, AGE, & YLW
     userID = donors.userID[dIndex]
     hashID = donors.hashID[dIndex]
     metadata = pd.DataFrame(index=[dIndex])
     metadata["hashID"] = hashID
 
-    # make folder to save data
-    processedDataPath = os.path.join(phiOutputPath, "PHI-" + userID)
-    if not os.path.exists(processedDataPath):
-        os.makedirs(processedDataPath)
-
-
-    # round all birthdays and diagnosis dates to the first day of the month (to protect identities)
-    if (pd.isnull(donors.bDay[dIndex]) + pd.isnull(donors.dDay[dIndex])) == 0:
-
-        bDate = pd.to_datetime(donors.bDay[dIndex][0:7])
-        dDate = pd.to_datetime(donors.dDay[dIndex][0:7])
+    try:
+        # make folder to save data
+        processedDataPath = os.path.join(phiOutputPath, "PHI-" + userID)
+        if not os.path.exists(processedDataPath):
+            os.makedirs(processedDataPath)
 
+        # round all birthdays and diagnosis dates to the first day of the month (to protect identities)
+        if (pd.isnull(donors.bDay[dIndex]) + pd.isnull(donors.dDay[dIndex])) == 0:
+
+            bDate = pd.to_datetime(donors.bDay[dIndex][0:7])
+            dDate = pd.to_datetime(donors.dDay[dIndex][0:7])
+
+
+            # %% LOAD IN DONOR JSON DATA
+
+            jsonDataPath = os.path.join(donorPath, phiDate + "-donorJsonData")
+            jsonFileName = os.path.join(jsonDataPath, "PHI-" + userID + ".json")
+
+            if os.path.exists(jsonFileName):
+                fileSize = os.stat(jsonFileName).st_size
+                metadata["fileSizeKB"] = fileSize / 1000
+                if fileSize > 1000:
+                    data = load_json(jsonFileName)
+
+                    # sort the data by time
+                    data.sort_values("time", inplace=True)
+
+                    # flatten the embedded json
+                    data = flattenJson(data)
+
+
+                    # %% CLEAN DATA
+                    # remove negative durations
+                    data, nNegativeDurations = removeNegativeDurations(data)
+                    metadata["nNegativeDurations"] = nNegativeDurations
+
+                    # get rid of cgm values too low/high (< 38 & > 402 mg/dL)
+                    data, nInvalidCgmValues = removeInvalidCgmValues(data)
+                    metadata["nInvalidCgmValues"] = nInvalidCgmValues
+
+                    # Tslim calibration bug fix
+                    data, nTandemAndPayloadCalReadings = tslimCalibrationFix(data)
+                    metadata["nTandemAndPayloadCalReadings"] = nTandemAndPayloadCalReadings
 
-        # %% LOAD IN DONOR JSON DATA
 
-        jsonDataPath = os.path.join(donorPath, phiDate + "-donorJsonData")
-        jsonFileName = os.path.join(jsonDataPath, "PHI-" + userID + ".json")
+                    # %% ADD UPLOAD DATE
+                    # attach upload time to each record, for resolving duplicates
+                    if (("upload" in data.type.unique()) &
+                        ("basal" in data.type.unique()) &
+                        ("bolus" in data.type.unique()) &
+                        ("cbg" in data.type.unique()) &
+                        ("pumpSettings" in data.type.unique())):
+                        data = addUploadDate(data)
 
-        if os.path.exists(jsonFileName):
-            fileSize = os.stat(jsonFileName).st_size
-            metadata["fileSizeKB"] = fileSize / 1000
-            if fileSize > 1000:
-                data = load_json(jsonFileName)
-
-                # sort the data by time
-                data.sort_values("time", inplace=True)
-
-                # flatten the embedded json
-                data = flattenJson(data)
 
+                        # %% TIME (UTC, TIMEZONE, DAY AND EVENTUALLY LOCAL TIME)
+                        data["utcTime"] = pd.to_datetime(data["time"])
+                        data["timezone"].fillna(method='ffill', inplace=True)
+                        data["timezone"].fillna(method='bfill', inplace=True)
+                        data["day"] = pd.DatetimeIndex(data["utcTime"]).date
+
+                        # round to the nearest 5 minutes
+                        # TODO: once roundTime is pushed to tidals repository then this line can be replaced
+                        # with td.clean.round_time
+                        data = round_time(data, timeIntervalMinutes=5, timeField="time",
+                                          roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
+                                          verbose=False)
+                        data.sort_values("uploadTime", ascending=False, inplace=True)
+
+
+                        # %% ID, HASHID, AGE, & YLW
+                        data["userID"] = userID
+                        data["hashID"] = hashID
+                        data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
+                        data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
+
+    #                    commonColumnHeadings = ["hashID",
+    #                                            "age",
+    #                                            "ylw"]
+
+
+                        # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
+                        bolus = mergeWizardWithBolus(data)
+                        if len(bolus) > 0:
+                            # get rid of duplicates that have the same ["time", "normal"]
+                            bolus.sort_values("uploadTime", ascending=False, inplace=True)
+                            bolus, nBolusDuplicatesRemoved = \
+                                removeDuplicates(bolus, ["deviceTime", "normal"])
+                            metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
+
+                            # get a summary of boluses per day
+                            bolusDaySummary = get_bolusDaySummary(bolus)
+
+    #                        # isf and cir associated with bolus event
+    #                        if "insulinSensitivities" in list(bolus):
+    #                            pdb.set_trace()
+    #
+    #                        if "carbRatios" in list(bolus):
+    #                            pdb.set_trace()
+
+                            bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
+                            bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
+
+    #                        bolusCH = commonColumnHeadings.copy()
+                            bolusCH = ["utcTime", "timezone", "roundedTime", "normal", "carbInput", "subType",
+                                            "insulinOnBoard", "bgInput",
+                                            "isf", "isf_mmolL_U", "insulinCarbRatio"]
+                            bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
+                            bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
+                            bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
+                                                                      "bgInput": "bg_mmolL"})
+                            bolusEvents["bg_mgdL"] = mmolL_to_mgdL(bolusEvents["bg_mmolL"])
+                            bolusEvents["eventType"] = "correction"
+                            bolusEvents.loc[bolusEvents["carbInput"] > 0, "eventType"] = "meal"
+
+                            if "duration" in list(bolus):
+                                bolus["duration"].replace(0, np.nan, inplace=True)
+                                bolus["durationHours"] = bolus["duration"] / 1000.0 / 3600.0
+                                bolus["rate"] = bolus["extended"] / bolus["durationHours"]
+    #                            bolusExtendedCH = commonColumnHeadings.copy()
+                                bolusExtendedCH = ["utcTime", "timezone", "roundedTime", "durationHours", "rate",  "type"]
+                                bolusExtendedEvents = bolus.loc[
+                                        ((bolus["extended"].notnull()) &
+                                         (bolus["duration"] > 0)), bolusExtendedCH]
+
+                            if "extended" not in bolus:
+                                bolus["extended"] = np.nan
+                                bolus["duration"] = np.nan
+
+
+                            # get start and end times
+                            bolusBeginDate, bolusEndDate = getStartAndEndTimes(bolus, "day")
+                            metadata["bolus.beginDate"] = bolusBeginDate
+                            metadata["bolus.endDate"] = bolusEndDate
+
+
+                            # %% PUMP SETTINGS
+
+                            pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
+                            pumpSettings.sort_values("uploadTime", ascending=False, inplace=True)
+
+                            pumpSettings, nPumpSettingsDuplicatesRemoved = \
+                            removeDuplicates(pumpSettings, "deviceTime")
+                            metadata["nPumpSettingsDuplicatesRemoved"] = nPumpSettingsDuplicatesRemoved
+
+                            pumpSettings.sort_values("utcTime", ascending=True, inplace=True)
+                            pumpSettings.reset_index(drop=True, inplace=True)
+
+                            # ISF
+                            isfColHeadings = ["isf.localTime", "isf", "isf_mmolL_U"]
+
+                            if "insulinSensitivity.amount" in list(pumpSettings):
+                                isfColHead = "insulinSensitivity"
+                                pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
+                                pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
+                                pumpSettings["isf.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                                    pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
+
+                                isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfColHeadings]
+
+                                # add a day summary
+                                isfDaySummary = pd.DataFrame()
+                                isfDaySummary["day"] = isf["isf.localTime"].dt.date
+                                isfDaySummary["isf.min"] = isf["isf"]
+                                isfDaySummary["isf.weightedMean"] = isf["isf"]
+                                isfDaySummary["isf.max"] = isf["isf"]
+                                isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
+                                isfDaySummary.reset_index(inplace=True, drop=True)
+                                isfDaySummary.fillna(method='ffill', inplace=True)
 
-                # %% CLEAN DATA
-                # remove negative durations
-                data, nNegativeDurations = removeNegativeDurations(data)
-                metadata["nNegativeDurations"] = nNegativeDurations
-
-                # get rid of cgm values too low/high (< 38 & > 402 mg/dL)
-                data, nInvalidCgmValues = removeInvalidCgmValues(data)
-                metadata["nInvalidCgmValues"] = nInvalidCgmValues
-
-                # Tslim calibration bug fix
-                data, nTandemAndPayloadCalReadings = tslimCalibrationFix(data)
-                metadata["nTandemAndPayloadCalReadings"] = nTandemAndPayloadCalReadings
+                            else:
+                                isfColHead = "insulinSensitivities"
+                                isf = pd.DataFrame(columns=isfColHeadings)
+                                isfDayColHeadings = ['day', 'isf.min', 'isf.weightedMean', 'isf.max']
+                                isfDaySummary = pd.DataFrame(columns=isfDayColHeadings)
+                                for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                    # edge case where actSchedule is float
+                                    if isinstance(actSched, float):
+                                        actSched = str(int(actSched))
+
+                                    tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
+                                    tempDF["day"] = pumpSettings.loc[p, "day"]
+                                    tempDF["isf.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                    tempDF["isf_mmolL_U"] = tempDF["amount"]
+                                    tempDF["isf"] = mmolL_to_mgdL(tempDF["isf_mmolL_U"])
+                                    endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["isf.localTime"], index=[0])
+                                    tempDF = get_setting_durations(tempDF, "isf", endOfDay)
+                                    tempDF = tempDF[:-1]
+
+                                    tempDaySummary = pd.DataFrame(index=[0])
+                                    tempDaySummary["day"] = tempDF["isf.localTime"].dt.date
+                                    tempDaySummary["isf.min"] = tempDF["isf"].min()
+                                    tempDaySummary["isf.weightedMean"] = \
+                                        np.sum(tempDF["isf"] * tempDF["isf.durationHours"]) / tempDF["isf.durationHours"].sum()
+                                    tempDaySummary["isf.max"] = tempDF["isf"].max()
+
+                                    isf = pd.concat([isf, tempDF[isfColHeadings]], ignore_index=True)
+                                    isfDaySummary = pd.concat([isfDaySummary, tempDaySummary], ignore_index=True)
+
+                                isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
+                                isfDaySummary.reset_index(inplace=True, drop=True)
+                                isfDaySummary.fillna(method='ffill', inplace=True)
+
+                            # CIR
+                            cirColHeadings = ["cir.localTime", "cir"]
+
+                            if "carbRatio.amount" in list(pumpSettings):
+                                cirColHead = "carbRatio"
+                                pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
+                                pumpSettings["cir.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                                    pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
+
+                                cir = pumpSettings.loc[pumpSettings["carbRatio.amount"].notnull(), cirColHeadings]
+
+                                # add a day summary
+                                cirDaySummary = pd.DataFrame()
+                                cirDaySummary["day"] = cir["cir.localTime"].dt.date
+                                cirDaySummary["cir.min"] = cir["cir"]
+                                cirDaySummary["cir.weightedMean"] = cir["cir"]
+                                cirDaySummary["cir.max"] = cir["cir"]
+                                cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
+                                cirDaySummary.reset_index(inplace=True, drop=True)
+                                cirDaySummary.fillna(method='ffill', inplace=True)
 
+                            else:
 
-                # %% ADD UPLOAD DATE
-                # attach upload time to each record, for resolving duplicates
-                if (("upload" in data.type.unique()) &
-                    ("basal" in data.type.unique()) &
-                    ("bolus" in data.type.unique()) &
-                    ("cbg" in data.type.unique()) &
-                    ("pumpSettings" in data.type.unique())):
-                    data = addUploadDate(data)
+                                cirColHead = "carbRatios"
+                                cir = pd.DataFrame(columns=cirColHeadings)
+                                cirDayColHeadings = ['day', 'cir.min', 'cir.weightedMean', 'cir.max']
+                                cirDaySummary = pd.DataFrame(columns=cirDayColHeadings)
+                                for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                    # edge case where actSchedule is float
+                                    if isinstance(actSched, float):
+                                        actSched = str(int(actSched))
+
+                                    tempDF = pd.DataFrame(pumpSettings.loc[p, cirColHead + "." + actSched])
+                                    tempDF["day"] = pumpSettings.loc[p, "day"]
+                                    tempDF["cir.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                    tempDF["cir"] = tempDF["amount"]
+                                    endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["cir.localTime"], index=[0])
+                                    tempDF = get_setting_durations(tempDF, "cir", endOfDay)
+                                    tempDF = tempDF[:-1]
+
+                                    tempDaySummary = pd.DataFrame(index=[0])
+                                    tempDaySummary["day"] = tempDF["cir.localTime"].dt.date
+                                    tempDaySummary["cir.min"] = tempDF["cir"].min()
+                                    tempDaySummary["cir.weightedMean"] = \
+                                        np.sum(tempDF["cir"] * tempDF["cir.durationHours"]) / tempDF["cir.durationHours"].sum()
+                                    tempDaySummary["cir.max"] = tempDF["cir"].max()
+
+                                    cir = pd.concat([cir, tempDF[cirColHeadings]], ignore_index=True)
+                                    cirDaySummary = pd.concat([cirDaySummary, tempDaySummary], ignore_index=True)
+
+                                cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
+                                cirDaySummary.reset_index(inplace=True, drop=True)
+                                cirDaySummary.fillna(method='ffill', inplace=True)
+
+
+                            # CORRECTION TARGET
+                            ctColHeadings = ["ct.localTime", "ct.low", "ct.high", "ct.target", "ct.range"]
+                            ctDayColHeadings = ['day',
+                                                "ct.low.min", "ct.low.weightedMean", "ct.low.max",
+                                                "ct.high.min", "ct.high.weightedMean", "ct.high.max",
+                                                "ct.target.min", "ct.target.weightedMean", "ct.target.max",
+                                                "ct.range.min", "ct.range.weightedMean", "ct.range.max"]
+
+                            if "bgTarget.start" in list(pumpSettings):
+                                ctColHead = "bgTarget."
+
+                                for targetType in ["low", "high", "target", "range"]:
+                                    if ctColHead + targetType in list(pumpSettings):
+                                        pumpSettings["ct." + targetType + "_mmolL"] = \
+                                            pumpSettings[ctColHead + targetType]
+
+                                        pumpSettings["ct." + targetType] = \
+                                            mmolL_to_mgdL(pumpSettings["ct." + targetType + "_mmolL"])
+                                    else:
+                                        pumpSettings["ct." + targetType + "_mmolL"] = np.nan
+                                        pumpSettings["ct." + targetType]  = np.nan
+
+                                pumpSettings["ct.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
+                                    pd.to_timedelta(pumpSettings[ctColHead + "start"], unit="ms")
+
+                                correctionTarget = pumpSettings.loc[pumpSettings["bgTarget.start"].notnull(), ctColHeadings]
+
+                                # add a day summary
+                                ctDaySummary = pd.DataFrame(columns=ctDayColHeadings)
+                                ctDaySummary["day"] = correctionTarget["ct.localTime"].dt.date
+                                # add min, weightedMean, and max
+                                for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
+                                    for stat in [".min", ".weightedMean", ".max"]:
+                                        ctDaySummary[targetType + stat] = correctionTarget[targetType]
+
+
+                                ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
+                                ctDaySummary.reset_index(inplace=True, drop=True)
+                                ctDaySummary.fillna(method='ffill', inplace=True)
 
+                            else:
+                                ctColHead = "bgTargets"
+                                correctionTarget = pd.DataFrame(columns=ctColHeadings)
+
+                                ctDaySummary = pd.DataFrame(columns=ctDayColHeadings)
+                                for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                    # edge case where actSchedule is float
+                                    if isinstance(actSched, float):
+                                        actSched = str(int(actSched))
+
+                                    tempDF = pd.DataFrame(pumpSettings.loc[p, ctColHead + "." + actSched])
+                                    targetTypes = list(set(list(tempDF)) - set(["start"]))
+                                    tempDF["day"] = pumpSettings.loc[p, "day"]
+                                    tempDF["ct.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                    endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["ct.localTime"], index=[0])
+                                    tempDF = get_setting_durations(tempDF, "ct", endOfDay)
+                                    tempDF = tempDF[:-1]
+
+                                    tempDaySummary = pd.DataFrame(columns=ctDayColHeadings, index=[0])
+                                    tempDaySummary["day"] = tempDF["ct.localTime"].dt.date
+
+                                    for targetType in targetTypes:
+                                        tempDF["ct." + targetType] = mmolL_to_mgdL(tempDF[targetType])
+
+                                        tempDaySummary["ct." + targetType + ".min"] = tempDF["ct." + targetType].min()
+                                        tempDaySummary["ct." + targetType + ".weightedMean"] = \
+                                            np.sum(tempDF["ct." + targetType] * tempDF["ct.durationHours"]) / tempDF["ct.durationHours"].sum()
+                                        tempDaySummary["ct." + targetType + ".max"] = tempDF["ct." + targetType].max()
+
+                                    correctionTarget = \
+                                        pd.concat([correctionTarget,
+                                                   tempDF.drop(columns=['start',
+                                                                        'target',
+                                                                        'day',
+                                                                        'ct.durationHours'])],
+                                                   ignore_index=True, sort=False)
+                                    ctDaySummary = pd.concat([ctDaySummary, tempDaySummary],
+                                                             ignore_index=True, sort=False)
+
+                                ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
+                                ctDaySummary.fillna(method='ffill', inplace=True)
+                                ctDaySummary.drop_duplicates(inplace=True)
+                                ctDaySummary.reset_index(inplace=True, drop=True)
+
+                            # SCHEDULED BASAL RATES
+                            sbrColHeadings = ["sbr.localTime", "rate", "sbr.type"]
+                            sbr = pd.DataFrame(columns=sbrColHeadings)
+                            sbrDayColHeadings = ['day', 'sbr.min', 'sbr.weightedMean', 'sbr.max', 'sbr.type']
+                            sbrDaySummary = pd.DataFrame(columns=sbrDayColHeadings)
+                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
+                                # edge case where actSchedule is float
+                                if isinstance(actSched, float):
+                                    actSched = str(int(actSched))
+                                if 'Auto Mode' not in actSched:
+                                    tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
+                                    tempDF["day"] = pumpSettings.loc[p, "day"]
+                                    tempDF["sbr.type"] = np.nan
+                                    tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                    endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["sbr.localTime"], index=[0])
+                                    tempDF = get_setting_durations(tempDF, "sbr", endOfDay)
+                                    tempDF = tempDF[:-1]
+
+                                    tempDaySummary = pd.DataFrame(index=[0])
+                                    tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
+                                    tempDaySummary["sbr.min"] = tempDF["rate"].min()
+                                    tempDaySummary["sbr.weightedMean"] = \
+                                        np.sum(tempDF["rate"] * tempDF["sbr.durationHours"]) / tempDF["sbr.durationHours"].sum()
+                                    tempDaySummary["sbr.max"] = tempDF["rate"].max()
+                                    tempDaySummary["sbr.type"] = np.nan
 
-                    # %% TIME (UTC, TIMEZONE, DAY AND EVENTUALLY LOCAL TIME)
-                    data["utcTime"] = pd.to_datetime(data["time"])
-                    data["timezone"].fillna(method='ffill', inplace=True)
-                    data["timezone"].fillna(method='bfill', inplace=True)
-                    data["day"] = pd.DatetimeIndex(data["utcTime"]).date
-
-                    # round to the nearest 5 minutes
-                    # TODO: once roundTime is pushed to tidals repository then this line can be replaced
-                    # with td.clean.round_time
-                    data = round_time(data, timeIntervalMinutes=5, timeField="time",
-                                      roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
-                                      verbose=False)
-                    data.sort_values("uploadTime", ascending=False, inplace=True)
+                                else:
+                                    tempDF = pd.DataFrame(index=[0])
+                                    tempDF["day"] = pumpSettings.loc[p, "day"]
+                                    tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"])
+                                    tempDF["rate"] = np.nan
+                                    tempDF["sbr.type"] = "AutoMode"
+
+                                    tempDaySummary = pd.DataFrame(index=[0])
+                                    tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
+                                    tempDaySummary["sbr.min"] = np.nan
+                                    tempDaySummary["sbr.weightedMean"] = np.nan
+                                    tempDaySummary["sbr.max"] = np.nan
+                                    tempDaySummary["sbr.type"] = "AutoMode"
+
+                                sbr = pd.concat([sbr, tempDF[sbrColHeadings]], ignore_index=True)
+                                sbrDaySummary = pd.concat([sbrDaySummary, tempDaySummary], ignore_index=True)
+
+                            sbrDaySummary = pd.concat([sbrDaySummary, dataPulledDF], sort=False)
+                            sbrDaySummary.reset_index(inplace=True, drop=True)
+                            sbrDaySummary.fillna(method='ffill', inplace=True)
+
+    #                        # max basal rate, max bolus amount, and insulin duration
+    #                        if "rateMaximum" in list(data):
+    #                            pdb.set_trace()
+    #                        if "amountMaximum" in list(data):
+    #                            pdb.set_trace()
+    #                        if "bolus.calculator" in list(data):
+    #                            pdb.set_trace()
+
+
+                            # %% ACTUAL BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
+                            basal = data[data.type == "basal"].copy().dropna(axis=1, how="all")
+                            basal.sort_values("uploadTime", ascending=False, inplace=True)
+
+                            basalBeginDate, basalEndDate = getStartAndEndTimes(basal, "day")
+                            metadata["basal.beginDate"] = basalBeginDate
+                            metadata["basal.endDate"] = basalEndDate
+
+                            basal, nBasalDuplicatesRemoved = \
+                                removeDuplicates(basal, ["deliveryType", "deviceTime", "duration", "rate"])
+                            metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
+
+                            # fill NaNs with 0, as it indicates a suspend (temp basal of 0)
+                            basal.rate.fillna(0, inplace=True)
+
+                            # drop basals without a positive duration; record how many had a duration of exactly 0
+                            nBasalDuration0 = sum(basal.duration == 0)
+                            basal = basal[basal.duration > 0]
+                            metadata["basal.nBasalDuration0"] = nBasalDuration0
+
+                            # get rid of basal durations that are unrealistic
+                            nUnrealisticBasalDuration = ((basal.duration < 0) | (basal.duration > 86400000))
+                            metadata["nUnrealisticBasalDuration"] = sum(nUnrealisticBasalDuration)
+                            basal.loc[nUnrealisticBasalDuration, "duration"] = np.nan
+
+                            # calculate the total amount of insulin delivered (duration * rate)
+                            basal["durationHours"] = basal["duration"] / 1000.0 / 3600.0
+                            basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
+
+                            # actual basal delivered; explicit copy because columns are assigned on abr below
+    #                        abrColHeadings = commonColumnHeadings.copy()
+                            abrColHeadings = ["utcTime", "timezone", "roundedTime", "durationHours", "rate", "type"]
+                            abr = basal[abrColHeadings].copy()
+                            if "duration" in list(bolus):
+                                abr = pd.concat([abr, bolusExtendedEvents], ignore_index=True)
+                                abr.sort_values("utcTime", inplace=True)
+                            # assign the filled column back: an in-place fillna on abr["timezone"] only edits a temporary
+                            abr["timezone"] = abr["timezone"].ffill()
+                            abr["timezone"] = abr["timezone"].bfill()
+
+                            # get a summary of basals per day
+                            basalDaySummary = get_basalDaySummary(basal)
+
+
+                            # %% GET CLOSED LOOP DAYS WITH TEMP BASAL DATA
+                            # group data by type
+                            groupedData = data.groupby(by="type")
+
+                            isClosedLoopDay, is670g, metadata = \
+                                getClosedLoopDays(groupedData, 30, metadata)
+
+                            # %% CGM DATA
+                            # filter by cgm and sort by uploadTime
+                            cgmData = groupedData.get_group("cbg").dropna(axis=1, how="all")
+
+                            # get rid of duplicates that have the same ["deviceTime", "value"]
+                            cgmData, nCgmDuplicatesRemovedDeviceTime = removeCgmDuplicates(cgmData, "deviceTime")
+                            metadata["nCgmDuplicatesRemovedDeviceTime"] = nCgmDuplicatesRemovedDeviceTime
+
+                            # get rid of duplicates that have the same ["time", "value"]; NOTE(review): the key below starts with "cn" while its sibling keys start with "n" - confirm
+                            cgmData, nCgmDuplicatesRemovedUtcTime = removeCgmDuplicates(cgmData, "time")
+                            metadata["cnCgmDuplicatesRemovedUtcTime"] = nCgmDuplicatesRemovedUtcTime
+
+                            # get rid of duplicates that have the same "roundedTime"
+                            cgmData, nCgmDuplicatesRemovedRoundedTime = removeDuplicates(cgmData, "roundedTime")
+                            metadata["nCgmDuplicatesRemovedRoundedTime"] = nCgmDuplicatesRemovedRoundedTime
+
+                            # get start and end times
+                            cgmBeginDate, cgmEndDate = getStartAndEndTimes(cgmData, "day")
+                            metadata["cgm.beginDate"] = cgmBeginDate
+                            metadata["cgm.endDate"] = cgmEndDate
+
+                            # get a list of dexcom cgms
+                            cgmData, percentDexcom = getListOfDexcomCGMDays(cgmData)
+                            metadata["cgm.percentDexcomCGM"] = percentDexcom
+
+                            # group by date (day) and get stats
+                            catDF = cgmData.groupby(cgmData["day"])
+                            cgmRecordsPerDay = \
+                                pd.DataFrame(catDF.value.count()). \
+                                rename(columns={"value": "cgm.count"})
+                            dayDate = catDF.day.describe()["top"]
+                            dexcomCGM = catDF.dexcomCGM.describe()["top"]
+                            nTypesCGM = catDF.dexcomCGM.describe()["unique"]
+                            cgmRecordsPerDay["cgm.dexcomOnly"] = \
+                                (dexcomCGM & (nTypesCGM == 1))
+                            cgmRecordsPerDay["date"] = cgmRecordsPerDay.index
+
+                            # filter the cgm data
+    #                        cgmColHeadings = commonColumnHeadings.copy()
+                            cgmColHeadings = ["utcTime", "timezone", "roundedTime", "value"]
+
+                            # get data in mg/dL units
+                            cgm = cgmData[cgmColHeadings]
+                            cgm = cgm.rename(columns={'value': 'mmol_L'})
+                            cgm["mg_dL"] = mmolL_to_mgdL(cgm["mmol_L"]).astype(int)
+
+
+                            # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
+
+                            # COMBINE DAY SUMMARIES
+                            # group by date (day) and get stats
+                            catDF = data.groupby(data["day"])
+                            dataPerDay = \
+                                pd.DataFrame(catDF.hashID.describe()["top"]). \
+                                rename(columns={"top": "hashID"})
+                            dataPerDay["age"] = catDF.age.mean()
+                            dataPerDay["ylw"] = catDF.ylw.mean()
+                            dataPerDay["timezone"] = catDF.timezone.describe()["top"]
+
+                            # calculate the overall date range of the data (earliest begin date and latest
+                            # end date across cgm, bolus, and basal); this can be used for looking at settings
+                            dayBeginDate = min(cgmBeginDate, bolusBeginDate, basalBeginDate)
+                            dayEndDate = max(cgmEndDate, bolusEndDate, basalEndDate)
+                            metadata["day.beginDate"] = dayBeginDate
+                            metadata["day.endDate"] = dayEndDate
+                            rng = pd.date_range(dayBeginDate, dayEndDate).date
+                            dayData = pd.DataFrame(rng, columns=["day"])
+                            for dfType in [dataPerDay, basalDaySummary, bolusDaySummary, cgmRecordsPerDay]:
+                                dayData = pd.merge(dayData, dfType.reset_index(), on="day", how="left")
+                            for dfType in [isClosedLoopDay, is670g]:
+                                dayData = pd.merge(dayData, dfType, on="day", how="left")
+
+                            dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
+                            dayData["validCGMData"] = dayData["cgm.count"] > (288*.75)
+                            # fill timezone gaps forward then backward, assigning back rather than filling a temporary in place
+                            dayData["timezone"] = dayData["timezone"].ffill()
+                            dayData["timezone"] = dayData["timezone"].bfill()
+
+                            dayData["isDSTChangeDay"] = dayData[['day', 'timezone']].apply(lambda x: isDSTChangeDay(*x), axis=1)
+                            dayData["date"] = pd.to_datetime(dayData["day"])
+                            dayData["tzo"] = dayData[['date', 'timezone']].apply(lambda x: getTimezoneOffset(*x), axis=1)
+
+                            # add settings to the dayData
+                            dayData = pd.merge(dayData, isfDaySummary, on="day", how="left")
+                            dayData = pd.merge(dayData, cirDaySummary, on="day", how="left")
+                            dayData = pd.merge(dayData, ctDaySummary, on="day", how="left")
+                            dayData = pd.merge(dayData, sbrDaySummary, on="day", how="left")
+
+                            # carry each setting forward over days with no value (assigned back, not filled in place)
+                            fillList = ['isf.min',
+                                        'isf.weightedMean',
+                                        'isf.max',
+                                        'cir.min',
+                                        'cir.weightedMean',
+                                        'cir.max',
+                                        'ct.low.min', 'ct.low.weightedMean', 'ct.low.max',
+                                        'ct.high.min', 'ct.high.weightedMean', 'ct.high.max',
+                                        'ct.target.min', 'ct.target.weightedMean', 'ct.target.max',
+                                        'ct.range.min', 'ct.range.weightedMean', 'ct.range.max',
+                                        'sbr.min',
+                                        'sbr.weightedMean',
+                                        'sbr.max',
+                                        'sbr.type']
+                            for fl in fillList:
+                                dayData[fl] = dayData[fl].ffill()
+
+                            # calculate the start and end of contiguous data
+                            # these dates can be used when simulating and predicting, where
+                            # you need both pump and cgm data
+                            contiguousBeginDate = max(cgmBeginDate, bolusBeginDate, basalBeginDate)
+                            contiguousEndDate = min(cgmEndDate, bolusEndDate, basalEndDate)
+                            metadata["contiguous.beginDate"] = contiguousBeginDate
+                            metadata["contiguous.endDate"] = contiguousEndDate
+
+                            # get a summary by age, and ylw
+                            catDF = dayData.groupby("age")
+                            ageSummary = pd.DataFrame(catDF.validPumpData.sum())
+                            ageSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
+                            ageSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
+                            ageSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
+                            ageSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+
+                            # add in isf stats
+                            ageSummary["isf.nDays"] = catDF["isf.min"].count()
+                            ageSummary["isf.min"] = catDF["isf.min"].min()
+                            ageSummary["isf.weightedMean"] = catDF["isf.weightedMean"].sum() / catDF["isf.weightedMean"].count()
+                            ageSummary["isf.max"] = catDF["isf.max"].max()
+
+                            # add cir stats
+                            ageSummary["cir.nDays"] = catDF["cir.min"].count()
+                            ageSummary["cir.min"] = catDF["cir.min"].min()
+                            ageSummary["cir.weightedMean"] = catDF["cir.weightedMean"].sum() / catDF["cir.weightedMean"].count()
+                            ageSummary["cir.max"] = catDF["cir.max"].max()
+
+                            # add sbr stats
+                            ageSummary["sbr.nDays"] = catDF["sbr.min"].count()
+                            ageSummary["sbr.min"] = catDF["sbr.min"].min()
+                            ageSummary["sbr.weightedMean"] = catDF["sbr.weightedMean"].sum() / catDF["sbr.weightedMean"].count()
+                            ageSummary["sbr.max"] = catDF["sbr.max"].max()
+                            ageSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
+
+                            # correctionTarget stats
+                            for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
+                                for stat in [".min", ".weightedMean", ".max"]:
+                                    ch = targetType + stat
+                                    ageSummary[ch + ".nDays"] = catDF[ch].count()
+                                    ageSummary[ch + ".min"] = catDF[ch].min()
+                                    ageSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
+                                    ageSummary[ch + ".max"] = catDF[ch].max()
+
+                            ageSummary.reset_index(inplace=True)
+
+                            analysisCriterion = ageSummary[((ageSummary["nDaysValidPump"]> 28) &
+                                                            (ageSummary["nDaysValidCgm"]> 28))]
+                            minAge = analysisCriterion["age"].min()
+                            maxAge = analysisCriterion["age"].max()
+                            nDaysClosedLoop = analysisCriterion["nDaysClosedLoop"].sum()
+                            n670gDays = analysisCriterion["n670gDays"].sum()
+                            metadata["minAge"] = minAge
+                            metadata["maxAge"] = maxAge
+                            metadata["nDaysClosedLoop"] = nDaysClosedLoop
+                            metadata["n670gDays"] = n670gDays
+
+                            catDF = dayData.groupby("ylw")
+                            ylwSummary = pd.DataFrame(catDF.validPumpData.sum())
+                            ylwSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
+                            ylwSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
+                            ylwSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
+                            ylwSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+
+                            ylwSummary["isf.nDays"] = catDF["isf.min"].count()
+                            ylwSummary["isf.min"] = catDF["isf.min"].min()
+                            ylwSummary["isf.weightedMean"] = catDF["isf.weightedMean"].sum() / catDF["isf.weightedMean"].count()
+                            ylwSummary["isf.max"] = catDF["isf.max"].max()
+
+                            # add cir stats
+                            ylwSummary["cir.nDays"] = catDF["cir.min"].count()
+                            ylwSummary["cir.min"] = catDF["cir.min"].min()
+                            ylwSummary["cir.weightedMean"] = catDF["cir.weightedMean"].sum() / catDF["cir.weightedMean"].count()
+                            ylwSummary["cir.max"] = catDF["cir.max"].max()
+
+                            # add sbr stats
+                            ylwSummary["sbr.nDays"] = catDF["sbr.min"].count()
+                            ylwSummary["sbr.min"] = catDF["sbr.min"].min()
+                            ylwSummary["sbr.weightedMean"] = catDF["sbr.weightedMean"].sum() / catDF["sbr.weightedMean"].count()
+                            ylwSummary["sbr.max"] = catDF["sbr.max"].max()
+                            ylwSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
+
+                            # correctionTarget stats
+                            for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
+                                for stat in [".min", ".weightedMean", ".max"]:
+                                    ch = targetType + stat
+                                    ylwSummary[ch + ".nDays"] = catDF[ch].count()
+                                    ylwSummary[ch + ".min"] = catDF[ch].min()
+                                    ylwSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
+                                    ylwSummary[ch + ".max"] = catDF[ch].max()
+
+                            ylwSummary.reset_index(inplace=True)
+
+                            analysisCriterion = ylwSummary[((ylwSummary["nDaysValidPump"]> 28) &
+                                                            (ylwSummary["nDaysValidCgm"]> 28))]
+                            minYLW = analysisCriterion["ylw"].min()
+                            maxYLW = analysisCriterion["ylw"].max()
+                            metadata["minYLW"] = minYLW
+                            metadata["maxYLW"] = maxYLW
+
+
+                            # %% calculate local time
+                            abr["date"] = pd.to_datetime(abr["utcTime"].dt.date)
+                            abr = pd.merge(abr, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
+                            abr["localTime"] = abr["utcTime"] + pd.to_timedelta(abr["tzo"], unit="m")
+
+                            cgm["date"] = pd.to_datetime(cgm["utcTime"].dt.date)
+                            cgm = pd.merge(cgm, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
+                            cgm["localTime"] = cgm["utcTime"] + pd.to_timedelta(cgm["tzo"], unit="m")
+
+                            bolusEvents["date"] = pd.to_datetime(bolusEvents["utcTime"].dt.date)
+                            bolusEvents = pd.merge(bolusEvents, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
+                            bolusEvents["localTime"] = bolusEvents["utcTime"] + pd.to_timedelta(bolusEvents["tzo"], unit="m")
+
+
+                            # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
+                            # all settings
+
+                            allSettings = pd.merge(isf.rename(columns={"isf.localTime": "localTime"}),
+                                                   cir.rename(columns={"cir.localTime": "localTime"}),
+                                                   how="outer", on="localTime")
+                            allSettings = pd.merge(allSettings,
+                                                   sbr.rename(columns={"rate": "sbr",
+                                                                       "type": "sbr.type",
+                                                                       "sbr.localTime": "localTime"}),
+                                                   how="outer", on="localTime")
+                            allSettings = pd.merge(allSettings,
+                                                   correctionTarget.rename(columns={"ct.localTime": "localTime"}),
+                                                   how="outer", on="localTime")
+                            allSettings["hashID"] = hashID
+                            allSettings["age"] = np.floor((allSettings["localTime"] - bDate).dt.days/365.25).astype(int)
+                            allSettings["ylw"] = np.floor((allSettings["localTime"] - dDate).dt.days/365.25).astype(int)
+                            allSettings = round_time(allSettings, timeIntervalMinutes=5,
+                                                     timeField="localTime",
+                                                     roundedTimeFieldName="localRoundedTime",
+                                                     startWithFirstRecord=True, verbose=False)
+
+                            colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
+                                        "isf", "cir", "sbr",
+                                        "ct.low", "ct.high", "ct.target", "ct.range",
+                                        "sbr.type", "isf_mmolL_U"]
+                            allSettings = allSettings[colOrder]
+
+
+                            fieldsToDrop = ["utcTime", "timezone", "roundedTime", "date", "tzo", "isDSTChangeDay"]
+                            pumpEvents = pd.merge(abr.drop(columns=fieldsToDrop),
+                                                  bolusEvents.drop(columns=fieldsToDrop),
+                                                  how="outer", on="localTime")
+                            pumpEvents["type"].fillna("bolus", inplace=True)
+                            pumpEvents["eventType"].fillna("basal", inplace=True)
+                            pumpEvents["hashID"] = hashID
+                            pumpEvents["age"] = np.floor((pumpEvents["localTime"] - bDate).dt.days/365.25).astype(int)
+                            pumpEvents["ylw"] = np.floor((pumpEvents["localTime"] - dDate).dt.days/365.25).astype(int)
+                            pumpEvents = round_time(pumpEvents, timeIntervalMinutes=5,
+                                                    timeField="localTime",
+                                                    roundedTimeFieldName="localRoundedTime",
+                                                    startWithFirstRecord=True, verbose=False)
+
+
+                            colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
+                                        "rate", "durationHours",
+                                        "unitsInsulin", "carbInput", "type", "eventType", "subType",
+                                        "isf", "isf_mmolL_U", "insulinCarbRatio", "insulinOnBoard",
+                                        "bg_mgdL", "bg_mmolL"]
+
+                            pumpEvents = pumpEvents[colOrder]
+
+                            cgmLite = cgm.drop(columns=fieldsToDrop)
+                            cgmLite["hashID"] = hashID
+                            cgmLite["age"] = np.floor((cgmLite["localTime"] - bDate).dt.days/365.25).astype(int)
+                            cgmLite["ylw"] = np.floor((cgmLite["localTime"] - dDate).dt.days/365.25).astype(int)
+                            cgmLite = round_time(cgmLite, timeIntervalMinutes=5,
+                                                 timeField="localTime",
+                                                 roundedTimeFieldName="localRoundedTime",
+                                                 startWithFirstRecord=True, verbose=False)
 
+                            colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
+                                        "mg_dL", "mmol_L"]
 
-                    # %% ID, HASHID, AGE, & YLW
-                    data["userID"] = userID
-                    data["hashID"] = hashID
-                    data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
-                    data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
+                            cgmLite = cgmLite[colOrder]
 
-#                    commonColumnHeadings = ["hashID",
-#                                            "age",
-#                                            "ylw"]
 
+                            # %% SAVE RESULTS
 
-                    # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
-                    bolus = mergeWizardWithBolus(data)
-                    if len(bolus) > 0:
-                        # get rid of duplicates that have the same ["time", "normal"]
-                        bolus.sort_values("uploadTime", ascending=False, inplace=True)
-                        bolus, nBolusDuplicatesRemoved = \
-                            removeDuplicates(bolus, ["deviceTime", "normal"])
-                        metadata["nBolusDuplicatesRemoved"] = nBolusDuplicatesRemoved
+                            # age and ylw stats
+                            pumpEvents["rateTimesDurationHours"] = pumpEvents["rate"] * pumpEvents["durationHours"]
+                            pumpEvents.rename(columns={"rate":"basalRate"}, inplace=True)
+                            catDF = pumpEvents.groupby("age")
 
-                        # get a summary of boluses per day
-                        bolusDaySummary = get_bolusDaySummary(bolus)
+                            # actual basal rates
+                            agePump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
+                            agePump["basalRate.min"] = catDF["basalRate"].min()
+                            agePump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
+                            agePump["basalRate.max"] = catDF["basalRate"].max()
 
-#                        # isf and cir associated with bolus event
-#                        if "insulinSensitivities" in list(bolus):
-#                            pdb.set_trace()
-#
-#                        if "carbRatios" in list(bolus):
-#                            pdb.set_trace()
+                            # insulin events
+                            insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
+                            agePump = pd.concat([agePump, insulinEvents], axis=1)
 
-                        bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
-                        bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
-
-#                        bolusCH = commonColumnHeadings.copy()
-                        bolusCH = ["utcTime", "timezone", "roundedTime", "normal", "carbInput", "subType",
-                                        "insulinOnBoard", "bgInput",
-                                        "isf", "isf_mmolL_U", "insulinCarbRatio"]
-                        bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
-                        bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
-                        bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
-                                                                  "bgInput": "bg_mmolL"})
-                        bolusEvents["bg_mgdL"] = mmolL_to_mgdL(bolusEvents["bg_mmolL"])
-                        bolusEvents["eventType"] = "correction"
-                        bolusEvents.loc[bolusEvents["carbInput"] > 0, "eventType"] = "meal"
-
-                        if "duration" in list(bolus):
-                            bolus["duration"].replace(0, np.nan, inplace=True)
-                            bolus["durationHours"] = bolus["duration"] / 1000.0 / 3600.0
-                            bolus["rate"] = bolus["extended"] / bolus["durationHours"]
-#                            bolusExtendedCH = commonColumnHeadings.copy()
-                            bolusExtendedCH = ["utcTime", "timezone", "roundedTime", "durationHours", "rate",  "type"]
-                            bolusExtendedEvents = bolus.loc[
-                                    ((bolus["extended"].notnull()) &
-                                     (bolus["duration"] > 0)), bolusExtendedCH]
-
-                        if "extended" not in bolus:
-                            bolus["extended"] = np.nan
-                            bolus["duration"] = np.nan
-
-
-                        # get start and end times
-                        bolusBeginDate, bolusEndDate = getStartAndEndTimes(bolus, "day")
-                        metadata["bolus.beginDate"] = bolusBeginDate
-                        metadata["bolus.endDate"] = bolusEndDate
-
-
-                        # %% PUMP SETTINGS
-
-                        pumpSettings = data[data.type == "pumpSettings"].copy().dropna(axis=1, how="all")
-                        pumpSettings.sort_values("uploadTime", ascending=False, inplace=True)
-
-                        pumpSettings, nPumpSettingsDuplicatesRemoved = \
-                        removeDuplicates(pumpSettings, "deviceTime")
-                        metadata["nPumpSettingsDuplicatesRemoved"] = nPumpSettingsDuplicatesRemoved
-
-                        pumpSettings.sort_values("utcTime", ascending=True, inplace=True)
-                        pumpSettings.reset_index(drop=True, inplace=True)
-
-                        # ISF
-                        isfColHeadings = ["isf.localTime", "isf", "isf_mmolL_U"]
-
-                        if "insulinSensitivity.amount" in list(pumpSettings):
-                            isfColHead = "insulinSensitivity"
-                            pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
-                            pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
-                            pumpSettings["isf.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                                pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
-
-                            isf = pumpSettings.loc[pumpSettings["isf"].notnull(), isfColHeadings]
-
-                            # add a day summary
-                            isfDaySummary = pd.DataFrame()
-                            isfDaySummary["day"] = isf["isf.localTime"].dt.date
-                            isfDaySummary["isf.min"] = isf["isf"]
-                            isfDaySummary["isf.weightedMean"] = isf["isf"]
-                            isfDaySummary["isf.max"] = isf["isf"]
-                            isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
-                            isfDaySummary.reset_index(inplace=True, drop=True)
-                            isfDaySummary.fillna(method='ffill', inplace=True)
+                            # carbs entered in bolus calculator
+                            carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
+                            agePump = pd.concat([agePump, carbEvents], axis=1)
 
-                        else:
-                            isfColHead = "insulinSensitivities"
-                            isf = pd.DataFrame(columns=isfColHeadings)
-                            isfDayColHeadings = ['day', 'isf.min', 'isf.weightedMean', 'isf.max']
-                            isfDaySummary = pd.DataFrame(columns=isfDayColHeadings)
-                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
-                                # edge case where actSchedule is float
-                                if isinstance(actSched, float):
-                                    actSched = str(int(actSched))
+                            # very low level cgm stats per age
+                            catDF = cgmLite.groupby("age")
+                            cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
+                            agePumpCgm = pd.concat([agePump, cgmStats], axis=1)
 
-                                tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
-                                tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["isf.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-                                tempDF["isf_mmolL_U"] = tempDF["amount"]
-                                tempDF["isf"] = mmolL_to_mgdL(tempDF["isf_mmolL_U"])
-                                endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["isf.localTime"], index=[0])
-                                tempDF = get_setting_durations(tempDF, "isf", endOfDay)
-                                tempDF = tempDF[:-1]
-
-                                tempDaySummary = pd.DataFrame(index=[0])
-                                tempDaySummary["day"] = tempDF["isf.localTime"].dt.date
-                                tempDaySummary["isf.min"] = tempDF["isf"].min()
-                                tempDaySummary["isf.weightedMean"] = \
-                                    np.sum(tempDF["isf"] * tempDF["isf.durationHours"]) / tempDF["isf.durationHours"].sum()
-                                tempDaySummary["isf.max"] = tempDF["isf"].max()
-
-                                isf = pd.concat([isf, tempDF[isfColHeadings]], ignore_index=True)
-                                isfDaySummary = pd.concat([isfDaySummary, tempDaySummary], ignore_index=True)
-
-                            isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
-                            isfDaySummary.reset_index(inplace=True, drop=True)
-                            isfDaySummary.fillna(method='ffill', inplace=True)
-
-                        # CIR
-                        cirColHeadings = ["cir.localTime", "cir"]
-
-                        if "carbRatio.amount" in list(pumpSettings):
-                            cirColHead = "carbRatio"
-                            pumpSettings["cir"] = pumpSettings[cirColHead + ".amount"]
-                            pumpSettings["cir.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                                pd.to_timedelta(pumpSettings[cirColHead + ".start"], unit="ms")
-
-                            cir = pumpSettings.loc[pumpSettings["carbRatio.amount"].notnull(), cirColHeadings]
-
-                            # add a day summary
-                            cirDaySummary = pd.DataFrame()
-                            cirDaySummary["day"] = cir["cir.localTime"].dt.date
-                            cirDaySummary["cir.min"] = cir["cir"]
-                            cirDaySummary["cir.weightedMean"] = cir["cir"]
-                            cirDaySummary["cir.max"] = cir["cir"]
-                            cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
-                            cirDaySummary.reset_index(inplace=True, drop=True)
-                            cirDaySummary.fillna(method='ffill', inplace=True)
+                            agePumpCgm.reset_index(inplace=True)
 
-                        else:
+                            ageSummary = pd.merge(ageSummary, agePumpCgm, on="age", how="left")
+                            ageSummary["hashID"] = hashID
+                            allAgeSummaries = pd.concat([allAgeSummaries, ageSummary], ignore_index=True)
 
-                            cirColHead = "carbRatios"
-                            cir = pd.DataFrame(columns=cirColHeadings)
-                            cirDayColHeadings = ['day', 'cir.min', 'cir.weightedMean', 'cir.max']
-                            cirDaySummary = pd.DataFrame(columns=cirDayColHeadings)
-                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
-                                # edge case where actSchedule is float
-                                if isinstance(actSched, float):
-                                    actSched = str(int(actSched))
+                            allAgeSummaries.to_csv(os.path.join(outputPath,
+                                "allAgeSummaries-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
 
-                                tempDF = pd.DataFrame(pumpSettings.loc[p, cirColHead + "." + actSched])
-                                tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["cir.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-                                tempDF["cir"] = tempDF["amount"]
-                                endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["cir.localTime"], index=[0])
-                                tempDF = get_setting_durations(tempDF, "cir", endOfDay)
-                                tempDF = tempDF[:-1]
-
-                                tempDaySummary = pd.DataFrame(index=[0])
-                                tempDaySummary["day"] = tempDF["cir.localTime"].dt.date
-                                tempDaySummary["cir.min"] = tempDF["cir"].min()
-                                tempDaySummary["cir.weightedMean"] = \
-                                    np.sum(tempDF["cir"] * tempDF["cir.durationHours"]) / tempDF["cir.durationHours"].sum()
-                                tempDaySummary["cir.max"] = tempDF["cir"].max()
-
-                                cir = pd.concat([cir, tempDF[cirColHeadings]], ignore_index=True)
-                                cirDaySummary = pd.concat([cirDaySummary, tempDaySummary], ignore_index=True)
-
-                            cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
-                            cirDaySummary.reset_index(inplace=True, drop=True)
-                            cirDaySummary.fillna(method='ffill', inplace=True)
-
-
-                        # CORRECTION TARGET
-                        ctColHeadings = ["ct.localTime", "ct.low", "ct.high", "ct.target", "ct.range"]
-                        ctDayColHeadings = ['day',
-                                            "ct.low.min", "ct.low.weightedMean", "ct.low.max",
-                                            "ct.high.min", "ct.high.weightedMean", "ct.high.max",
-                                            "ct.target.min", "ct.target.weightedMean", "ct.target.max",
-                                            "ct.range.min", "ct.range.weightedMean", "ct.range.max"]
-
-                        if "bgTarget.start" in list(pumpSettings):
-                            ctColHead = "bgTarget."
-
-                            for targetType in ["low", "high", "target", "range"]:
-                                if ctColHead + targetType in list(pumpSettings):
-                                    pumpSettings["ct." + targetType + "_mmolL"] = \
-                                        pumpSettings[ctColHead + targetType]
-
-                                    pumpSettings["ct." + targetType] = \
-                                        mmolL_to_mgdL(pumpSettings["ct." + targetType + "_mmolL"])
-                                else:
-                                    pumpSettings["ct." + targetType + "_mmolL"] = np.nan
-                                    pumpSettings["ct." + targetType]  = np.nan
+                            # repeat for years living with
+                            catDF = pumpEvents.groupby("ylw")
+                            # actual basal rates
+                            ylwPump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
+                            ylwPump["basalRate.min"] = catDF["basalRate"].min()
+                            ylwPump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
+                            ylwPump["basalRate.max"] = catDF["basalRate"].max()
 
-                            pumpSettings["ct.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
-                                pd.to_timedelta(pumpSettings[ctColHead + "start"], unit="ms")
+                            # insulin events
+                            insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
+                            ylwPump = pd.concat([ylwPump, insulinEvents], axis=1)
 
-                            correctionTarget = pumpSettings.loc[pumpSettings["bgTarget.start"].notnull(), ctColHeadings]
+                            # carbs entered in bolus calculator
+                            carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
+                            ylwPump = pd.concat([ylwPump, carbEvents], axis=1)
 
-                            # add a day summary
-                            ctDaySummary = pd.DataFrame(columns=ctDayColHeadings)
-                            ctDaySummary["day"] = correctionTarget["ct.localTime"].dt.date
-                            # add min, weightedMean, and max
-                            for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
-                                for stat in [".min", ".weightedMean", ".max"]:
-                                    ctDaySummary[targetType + stat] = correctionTarget[targetType]
+                            # very low level cgm stats per ylw
+                            catDF = cgmLite.groupby("ylw")
+                            cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
+                            ylwPumpCgm = pd.concat([ylwPump, cgmStats], axis=1)
 
+                            ylwPumpCgm.reset_index(inplace=True)
 
-                            ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
-                            ctDaySummary.reset_index(inplace=True, drop=True)
-                            ctDaySummary.fillna(method='ffill', inplace=True)
+                            ylwSummary = pd.merge(ylwSummary, ylwPumpCgm, on="ylw", how="left")
 
-                        else:
-                            ctColHead = "bgTargets"
-                            correctionTarget = pd.DataFrame(columns=ctColHeadings)
+                            ylwSummary["hashID"] = hashID
+                            allYlwSummaries = pd.concat([allYlwSummaries, ylwSummary], ignore_index=True)
 
-                            ctDaySummary = pd.DataFrame(columns=ctDayColHeadings)
-                            for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
-                                # edge case where actSchedule is float
-                                if isinstance(actSched, float):
-                                    actSched = str(int(actSched))
+                            allYlwSummaries.to_csv(os.path.join(outputPath,
+                                "allYlwSummaries-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
 
-                                tempDF = pd.DataFrame(pumpSettings.loc[p, ctColHead + "." + actSched])
-                                targetTypes = list(set(list(tempDF)) - set(["start"]))
-                                tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["ct.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-                                endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["ct.localTime"], index=[0])
-                                tempDF = get_setting_durations(tempDF, "ct", endOfDay)
-                                tempDF = tempDF[:-1]
-
-                                tempDaySummary = pd.DataFrame(columns=ctDayColHeadings, index=[0])
-                                tempDaySummary["day"] = tempDF["ct.localTime"].dt.date
-
-                                for targetType in targetTypes:
-                                    tempDF["ct." + targetType] = mmolL_to_mgdL(tempDF[targetType])
-
-                                    tempDaySummary["ct." + targetType + ".min"] = tempDF["ct." + targetType].min()
-                                    tempDaySummary["ct." + targetType + ".weightedMean"] = \
-                                        np.sum(tempDF["ct." + targetType] * tempDF["ct.durationHours"]) / tempDF["ct.durationHours"].sum()
-                                    tempDaySummary["ct." + targetType + ".max"] = tempDF["ct." + targetType].max()
-
-                                correctionTarget = \
-                                    pd.concat([correctionTarget,
-                                               tempDF.drop(columns=['start',
-                                                                    'target',
-                                                                    'day',
-                                                                    'ct.durationHours'])],
-                                               ignore_index=True, sort=False)
-                                ctDaySummary = pd.concat([ctDaySummary, tempDaySummary],
-                                                         ignore_index=True, sort=False)
-
-                            ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
-                            ctDaySummary.fillna(method='ffill', inplace=True)
-                            ctDaySummary.drop_duplicates(inplace=True)
-                            ctDaySummary.reset_index(inplace=True, drop=True)
-
-                        # SCHEDULED BASAL RATES
-                        sbrColHeadings = ["sbr.localTime", "rate", "sbr.type"]
-                        sbr = pd.DataFrame(columns=sbrColHeadings)
-                        sbrDayColHeadings = ['day', 'sbr.min', 'sbr.weightedMean', 'sbr.max', 'sbr.type']
-                        sbrDaySummary = pd.DataFrame(columns=sbrDayColHeadings)
-                        for p, actSched in zip(pumpSettings.index, pumpSettings["activeSchedule"]):
-                            # edge case where actSchedule is float
-                            if isinstance(actSched, float):
-                                actSched = str(int(actSched))
-                            if 'Auto Mode' not in actSched:
-                                tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
-                                tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["sbr.type"] = np.nan
-                                tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-                                endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["sbr.localTime"], index=[0])
-                                tempDF = get_setting_durations(tempDF, "sbr", endOfDay)
-                                tempDF = tempDF[:-1]
-
-                                tempDaySummary = pd.DataFrame(index=[0])
-                                tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
-                                tempDaySummary["sbr.min"] = tempDF["rate"].min()
-                                tempDaySummary["sbr.weightedMean"] = \
-                                    np.sum(tempDF["rate"] * tempDF["sbr.durationHours"]) / tempDF["sbr.durationHours"].sum()
-                                tempDaySummary["sbr.max"] = tempDF["rate"].max()
-                                tempDaySummary["sbr.type"] = np.nan
+                             # %% save data for this person
+    #                        outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
+    #                        outputFormat = (f"{minAge:02d}",
+    #                                        f"{maxAge:02d}",
+    #                                        f"{minYLW:02d}",
+    #                                        f"{maxYLW:02d}",
+    #                                        f"{nDaysClosedLoop:03d}",
+    #                                        f"{n670gDays:03d}",
+    #                                        hashID[0:4])
+    #                        outputFolderName = outputString % outputFormat
+    #                        outputFolderName_Path = os.path.join(outputPath,"data", outputFolderName)
+    #                        if not os.path.exists(outputFolderName_Path):
+    #                            os.makedirs(outputFolderName_Path)
+    #
+    #                        fName = outputFolderName + "-allSettings.csv"
+    #                        allSettings.to_csv(os.path.join(outputFolderName_Path, fName))
+    #                        fName = outputFolderName + "-pumpEvents.csv"
+    #                        pumpEvents.to_csv(os.path.join(outputFolderName_Path, fName))
+    #                        fName = outputFolderName + "-cgmLite.csv"
+    #                        cgmLite.to_csv(os.path.join(outputFolderName_Path, fName))
 
-                            else:
-                                tempDF = pd.DataFrame(index=[0])
-                                tempDF["day"] = pumpSettings.loc[p, "day"]
-                                tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"])
-                                tempDF["rate"] = np.nan
-                                tempDF["sbr.type"] = "AutoMode"
-
-                                tempDaySummary = pd.DataFrame(index=[0])
-                                tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
-                                tempDaySummary["sbr.min"] = np.nan
-                                tempDaySummary["sbr.weightedMean"] = np.nan
-                                tempDaySummary["sbr.max"] = np.nan
-                                tempDaySummary["sbr.type"] = "AutoMode"
-
-                            sbr = pd.concat([sbr, tempDF[sbrColHeadings]], ignore_index=True)
-                            sbrDaySummary = pd.concat([sbrDaySummary, tempDaySummary], ignore_index=True)
-
-                        sbrDaySummary = pd.concat([sbrDaySummary, dataPulledDF], sort=False)
-                        sbrDaySummary.reset_index(inplace=True, drop=True)
-                        sbrDaySummary.fillna(method='ffill', inplace=True)
-
-#                        # max basal rate, max bolus amount, and insulin duration
-#                        if "rateMaximum" in list(data):
-#                            pdb.set_trace()
-#                        if "amountMaximum" in list(data):
-#                            pdb.set_trace()
-#                        if "bolus.calculator" in list(data):
-#                            pdb.set_trace()
-
-
-                        # %% ACTUAL BASAL RATES (TIME, VALUE, DURATION, TYPE (SCHEDULED, TEMP, SUSPEND))
-                        basal = data[data.type == "basal"].copy().dropna(axis=1, how="all")
-                        basal.sort_values("uploadTime", ascending=False, inplace=True)
-
-                        basalBeginDate, basalEndDate = getStartAndEndTimes(basal, "day")
-                        metadata["basal.beginDate"] = basalBeginDate
-                        metadata["basal.endDate"] = basalEndDate
-
-                        basal, nBasalDuplicatesRemoved = \
-                            removeDuplicates(basal, ["deliveryType", "deviceTime", "duration", "rate"])
-                        metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
-
-                        # fill NaNs with 0, as it indicates a suspend (temp basal of 0)
-                        basal.rate.fillna(0, inplace=True)
-
-                        # get rid of basals that have durations of 0
-                        nBasalDuration0 = sum(basal.duration > 0)
-                        basal = basal[basal.duration > 0]
-                        metadata["basal.nBasalDuration0"] = nBasalDuration0
-
-                        # get rid of basal durations that are unrealistic
-                        nUnrealisticBasalDuration = ((basal.duration < 0) | (basal.duration > 86400000))
-                        metadata["nUnrealisticBasalDuration"] = sum(nUnrealisticBasalDuration)
-                        basal.loc[nUnrealisticBasalDuration, "duration"] = np.nan
-
-                        # calculate the total amount of insulin delivered (duration * rate)
-                        basal["durationHours"] = basal["duration"] / 1000.0 / 3600.0
-                        basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
-
-                        # actual basal delivered
-#                        abrColHeadings = commonColumnHeadings.copy()
-                        abrColHeadings = ["utcTime", "timezone", "roundedTime", "durationHours", "rate", "type"]
-                        abr = basal[abrColHeadings]
-                        if "duration" in list(bolus):
-                            abr = pd.concat([abr, bolusExtendedEvents], ignore_index=True)
-                            abr.sort_values("utcTime", inplace=True)
-
-                        abr["timezone"].fillna(method='ffill', inplace=True)
-                        abr["timezone"].fillna(method='bfill', inplace=True)
-
-                        # get a summary of basals per day
-                        basalDaySummary = get_basalDaySummary(basal)
-
-
-                        # %% GET CLOSED LOOP DAYS WITH TEMP BASAL DATA
-                        # group data by type
-                        groupedData = data.groupby(by="type")
-
-                        isClosedLoopDay, is670g, metadata = \
-                            getClosedLoopDays(groupedData, 30, metadata)
-
-                        # %% CGM DATA
-                        # filter by cgm and sort by uploadTime
-                        cgmData = groupedData.get_group("cbg").dropna(axis=1, how="all")
-
-                        # get rid of duplicates that have the same ["deviceTime", "value"]
-                        cgmData, nCgmDuplicatesRemovedDeviceTime = removeCgmDuplicates(cgmData, "deviceTime")
-                        metadata["nCgmDuplicatesRemovedDeviceTime"] = nCgmDuplicatesRemovedDeviceTime
-
-                        # get rid of duplicates that have the same ["time", "value"]
-                        cgmData, nCgmDuplicatesRemovedUtcTime = removeCgmDuplicates(cgmData, "time")
-                        metadata["cnCgmDuplicatesRemovedUtcTime"] = nCgmDuplicatesRemovedUtcTime
-
-                        # get rid of duplicates that have the same "roundedTime"
-                        cgmData, nCgmDuplicatesRemovedRoundedTime = removeDuplicates(cgmData, "roundedTime")
-                        metadata["nCgmDuplicatesRemovedRoundedTime"] = nCgmDuplicatesRemovedRoundedTime
-
-                        # get start and end times
-                        cgmBeginDate, cgmEndDate = getStartAndEndTimes(cgmData, "day")
-                        metadata["cgm.beginDate"] = cgmBeginDate
-                        metadata["cgm.endDate"] = cgmEndDate
-
-                        # get a list of dexcom cgms
-                        cgmData, percentDexcom = getListOfDexcomCGMDays(cgmData)
-                        metadata["cgm.percentDexcomCGM"] = percentDexcom
-
-                        # group by date (day) and get stats
-                        catDF = cgmData.groupby(cgmData["day"])
-                        cgmRecordsPerDay = \
-                            pd.DataFrame(catDF.value.count()). \
-                            rename(columns={"value": "cgm.count"})
-                        dayDate = catDF.day.describe()["top"]
-                        dexcomCGM = catDF.dexcomCGM.describe()["top"]
-                        nTypesCGM = catDF.dexcomCGM.describe()["unique"]
-                        cgmRecordsPerDay["cgm.dexcomOnly"] = \
-                            (dexcomCGM & (nTypesCGM == 1))
-                        cgmRecordsPerDay["date"] = cgmRecordsPerDay.index
-
-                        # filter the cgm data
-#                        cgmColHeadings = commonColumnHeadings.copy()
-                        cgmColHeadings = ["utcTime", "timezone", "roundedTime", "value"]
-
-                        # get data in mg/dL units
-                        cgm = cgmData[cgmColHeadings]
-                        cgm = cgm.rename(columns={'value': 'mmol_L'})
-                        cgm["mg_dL"] = mmolL_to_mgdL(cgm["mmol_L"]).astype(int)
-
-
-                        # %% NUMBER OF DAYS OF PUMP AND CGM DATA, OVERALL AND PER EACH AGE & YLW
-
-                        # COMBINE DAY SUMMARIES
-                        # group by date (day) and get stats
-                        catDF = data.groupby(data["day"])
-                        dataPerDay = \
-                            pd.DataFrame(catDF.hashID.describe()["top"]). \
-                            rename(columns={"top": "hashID"})
-                        dataPerDay["age"] = catDF.age.mean()
-                        dataPerDay["ylw"] = catDF.ylw.mean()
-                        dataPerDay["timezone"] = catDF.timezone.describe()["top"]
-
-                        # calculate all of the data start and end range
-                        # this can be used for looking at settings
-                        dayBeginDate = min(cgmBeginDate, bolusBeginDate, basalBeginDate)
-                        dayEndDate = max(cgmEndDate, bolusEndDate, basalEndDate)
-                        metadata["day.beginDate"] = dayBeginDate
-                        metadata["day.endDate"] = dayEndDate
-                        rng = pd.date_range(dayBeginDate, dayEndDate).date
-                        dayData = pd.DataFrame(rng, columns=["day"])
-                        for dfType in [dataPerDay, basalDaySummary, bolusDaySummary, cgmRecordsPerDay]:
-                            dayData = pd.merge(dayData, dfType.reset_index(), on="day", how="left")
-                        for dfType in [isClosedLoopDay, is670g]:
-                            dayData = pd.merge(dayData, dfType, on="day", how="left")
-
-                        dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
-                        dayData["validCGMData"] = dayData["cgm.count"] > (288*.75)
-
-                        dayData["timezone"].fillna(method='ffill', inplace=True)
-                        dayData["timezone"].fillna(method='bfill', inplace=True)
-
-                        dayData["isDSTChangeDay"] = dayData[['day', 'timezone']].apply(lambda x: isDSTChangeDay(*x), axis=1)
-                        dayData["date"] = pd.to_datetime(dayData["day"])
-                        dayData["tzo"] = dayData[['date', 'timezone']].apply(lambda x: getTimezoneOffset(*x), axis=1)
-
-                        # add settings to the dayData
-                        dayData = pd.merge(dayData, isfDaySummary, on="day", how="left")
-                        dayData = pd.merge(dayData, cirDaySummary, on="day", how="left")
-                        dayData = pd.merge(dayData, ctDaySummary, on="day", how="left")
-                        dayData = pd.merge(dayData, sbrDaySummary, on="day", how="left")
-
-                        # fill data forward
-                        fillList = ['isf.min',
-                                    'isf.weightedMean',
-                                    'isf.max',
-                                    'cir.min',
-                                    'cir.weightedMean',
-                                    'cir.max',
-                                    'ct.low.min', 'ct.low.weightedMean', 'ct.low.max',
-                                    'ct.high.min', 'ct.high.weightedMean', 'ct.high.max',
-                                    'ct.target.min', 'ct.target.weightedMean', 'ct.target.max',
-                                    'ct.range.min', 'ct.range.weightedMean', 'ct.range.max',
-                                    'sbr.min',
-                                    'sbr.weightedMean',
-                                    'sbr.max',
-                                    'sbr.type']
-                        for fl in fillList:
-                            dayData[fl].fillna(method='ffill', inplace=True)
-
-                        # calculate the start and end of contiguous data
-                        # these dates can be used when simulating and predicting, where
-                        # you need both pump and cgm data
-                        contiguousBeginDate = max(cgmBeginDate, bolusBeginDate, basalBeginDate)
-                        contiguousEndDate = min(cgmEndDate, bolusEndDate, basalEndDate)
-                        metadata["contiguous.beginDate"] = contiguousBeginDate
-                        metadata["contiguous.endDate"] = contiguousEndDate
-
-                        # get a summary by age, and ylw
-                        catDF = dayData.groupby("age")
-                        ageSummary = pd.DataFrame(catDF.validPumpData.sum())
-                        ageSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
-                        ageSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
-                        ageSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
-                        ageSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
-
-                        # add in isf stats
-                        ageSummary["isf.nDays"] = catDF["isf.min"].count()
-                        ageSummary["isf.min"] = catDF["isf.min"].min()
-                        ageSummary["isf.weightedMean"] = catDF["isf.weightedMean"].sum() / catDF["isf.weightedMean"].count()
-                        ageSummary["isf.max"] = catDF["isf.max"].max()
-
-                        # add cir stats
-                        ageSummary["cir.nDays"] = catDF["cir.min"].count()
-                        ageSummary["cir.min"] = catDF["cir.min"].min()
-                        ageSummary["cir.weightedMean"] = catDF["cir.weightedMean"].sum() / catDF["cir.weightedMean"].count()
-                        ageSummary["cir.max"] = catDF["cir.max"].max()
-
-                        # add sbr stats
-                        ageSummary["sbr.nDays"] = catDF["sbr.min"].count()
-                        ageSummary["sbr.min"] = catDF["sbr.min"].min()
-                        ageSummary["sbr.weightedMean"] = catDF["sbr.weightedMean"].sum() / catDF["sbr.weightedMean"].count()
-                        ageSummary["sbr.max"] = catDF["sbr.max"].max()
-                        ageSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
-
-                        # correctionTarget stats
-                        for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
-                            for stat in [".min", ".weightedMean", ".max"]:
-                                ch = targetType + stat
-                                ageSummary[ch + ".nDays"] = catDF[ch].count()
-                                ageSummary[ch + ".min"] = catDF[ch].min()
-                                ageSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
-                                ageSummary[ch + ".max"] = catDF[ch].max()
-
-                        ageSummary.reset_index(inplace=True)
-
-                        analysisCriterion = ageSummary[((ageSummary["nDaysValidPump"]> 28) &
-                                                        (ageSummary["nDaysValidCgm"]> 28))]
-                        minAge = analysisCriterion["age"].min()
-                        maxAge = analysisCriterion["age"].max()
-                        nDaysClosedLoop = analysisCriterion["nDaysClosedLoop"].sum()
-                        n670gDays = analysisCriterion["n670gDays"].sum()
-                        metadata["minAge"] = minAge
-                        metadata["maxAge"] = maxAge
-                        metadata["nDaysClosedLoop"] = nDaysClosedLoop
-                        metadata["n670gDays"] = n670gDays
-
-                        catDF = dayData.groupby("ylw")
-                        ylwSummary = pd.DataFrame(catDF.validPumpData.sum())
-                        ylwSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
-                        ylwSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
-                        ylwSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
-                        ylwSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
-
-                        ylwSummary["isf.nDays"] = catDF["isf.min"].count()
-                        ylwSummary["isf.min"] = catDF["isf.min"].min()
-                        ylwSummary["isf.weightedMean"] = catDF["isf.weightedMean"].sum() / catDF["isf.weightedMean"].count()
-                        ylwSummary["isf.max"] = catDF["isf.max"].max()
-
-                        # add cir stats
-                        ylwSummary["cir.nDays"] = catDF["cir.min"].count()
-                        ylwSummary["cir.min"] = catDF["cir.min"].min()
-                        ylwSummary["cir.weightedMean"] = catDF["cir.weightedMean"].sum() / catDF["cir.weightedMean"].count()
-                        ylwSummary["cir.max"] = catDF["cir.max"].max()
-
-                        # add sbr stats
-                        ylwSummary["sbr.nDays"] = catDF["sbr.min"].count()
-                        ylwSummary["sbr.min"] = catDF["sbr.min"].min()
-                        ylwSummary["sbr.weightedMean"] = catDF["sbr.weightedMean"].sum() / catDF["sbr.weightedMean"].count()
-                        ylwSummary["sbr.max"] = catDF["sbr.max"].max()
-                        ylwSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
-
-                        # correctionTarget stats
-                        for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
-                            for stat in [".min", ".weightedMean", ".max"]:
-                                ch = targetType + stat
-                                ylwSummary[ch + ".nDays"] = catDF[ch].count()
-                                ylwSummary[ch + ".min"] = catDF[ch].min()
-                                ylwSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
-                                ylwSummary[ch + ".max"] = catDF[ch].max()
-
-                        ylwSummary.reset_index(inplace=True)
-
-                        analysisCriterion = ylwSummary[((ylwSummary["nDaysValidPump"]> 28) &
-                                                        (ylwSummary["nDaysValidCgm"]> 28))]
-                        minYLW = analysisCriterion["ylw"].min()
-                        maxYLW = analysisCriterion["ylw"].max()
-                        metadata["minYLW"] = minYLW
-                        metadata["maxYLW"] = maxYLW
-
-
-                        # %% calculate local time
-                        abr["date"] = pd.to_datetime(abr["utcTime"].dt.date)
-                        abr = pd.merge(abr, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
-                        abr["localTime"] = abr["utcTime"] + pd.to_timedelta(abr["tzo"], unit="m")
-
-                        cgm["date"] = pd.to_datetime(cgm["utcTime"].dt.date)
-                        cgm = pd.merge(cgm, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
-                        cgm["localTime"] = cgm["utcTime"] + pd.to_timedelta(cgm["tzo"], unit="m")
-
-                        bolusEvents["date"] = pd.to_datetime(bolusEvents["utcTime"].dt.date)
-                        bolusEvents = pd.merge(bolusEvents, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
-                        bolusEvents["localTime"] = bolusEvents["utcTime"] + pd.to_timedelta(bolusEvents["tzo"], unit="m")
-
-
-                        # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
-                        # all settings
-
-                        allSettings = pd.merge(isf.rename(columns={"isf.localTime": "localTime"}),
-                                               cir.rename(columns={"cir.localTime": "localTime"}),
-                                               how="outer", on="localTime")
-                        allSettings = pd.merge(allSettings,
-                                               sbr.rename(columns={"rate": "sbr",
-                                                                   "type": "sbr.type",
-                                                                   "sbr.localTime": "localTime"}),
-                                               how="outer", on="localTime")
-                        allSettings = pd.merge(allSettings,
-                                               correctionTarget.rename(columns={"ct.localTime": "localTime"}),
-                                               how="outer", on="localTime")
-                        allSettings["hashID"] = hashID
-                        allSettings["age"] = np.floor((allSettings["localTime"] - bDate).dt.days/365.25).astype(int)
-                        allSettings["ylw"] = np.floor((allSettings["localTime"] - dDate).dt.days/365.25).astype(int)
-                        allSettings = round_time(allSettings, timeIntervalMinutes=5,
-                                                 timeField="localTime",
-                                                 roundedTimeFieldName="localRoundedTime",
-                                                 startWithFirstRecord=True, verbose=False)
 
-                        colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
-                                    "isf", "cir", "sbr",
-                                    "ct.low", "ct.high", "ct.target", "ct.range",
-                                    "sbr.type", "isf_mmolL_U"]
-                        allSettings = allSettings[colOrder]
 
+                            # %% save the processed data (saving this data will take up a lot of space and time)
+                            #data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
+                            #basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
+                            #bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
+                            #cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
+                            #pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
 
-                        fieldsToDrop = ["utcTime", "timezone", "roundedTime", "date", "tzo", "isDSTChangeDay"]
-                        pumpEvents = pd.merge(abr.drop(columns=fieldsToDrop),
-                                              bolusEvents.drop(columns=fieldsToDrop),
-                                              how="outer", on="localTime")
-                        pumpEvents["type"].fillna("bolus", inplace=True)
-                        pumpEvents["eventType"].fillna("basal", inplace=True)
-                        pumpEvents["hashID"] = hashID
-                        pumpEvents["age"] = np.floor((pumpEvents["localTime"] - bDate).dt.days/365.25).astype(int)
-                        pumpEvents["ylw"] = np.floor((pumpEvents["localTime"] - dDate).dt.days/365.25).astype(int)
-                        pumpEvents = round_time(pumpEvents, timeIntervalMinutes=5,
-                                                timeField="localTime",
-                                                roundedTimeFieldName="localRoundedTime",
-                                                startWithFirstRecord=True, verbose=False)
-
-
-                        colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
-                                    "rate", "durationHours",
-                                    "unitsInsulin", "carbInput", "type", "eventType", "subType",
-                                    "isf", "isf_mmolL_U", "insulinCarbRatio", "insulinOnBoard",
-                                    "bg_mgdL", "bg_mmolL"]
-
-                        pumpEvents = pumpEvents[colOrder]
-
-                        cgmLite = cgm.drop(columns=fieldsToDrop)
-                        cgmLite["hashID"] = hashID
-                        cgmLite["age"] = np.floor((cgmLite["localTime"] - bDate).dt.days/365.25).astype(int)
-                        cgmLite["ylw"] = np.floor((cgmLite["localTime"] - dDate).dt.days/365.25).astype(int)
-                        cgmLite = round_time(cgmLite, timeIntervalMinutes=5,
-                                             timeField="localTime",
-                                             roundedTimeFieldName="localRoundedTime",
-                                             startWithFirstRecord=True, verbose=False)
-
-                        colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
-                                    "mg_dL", "mmol_L"]
-
-                        cgmLite = cgmLite[colOrder]
-
-
-                        # %% SAVE RESULTS
-
-                        # age and ylw stats
-                        pumpEvents["rateTimesDurationHours"] = pumpEvents["rate"] * pumpEvents["durationHours"]
-                        pumpEvents.rename(columns={"rate":"basalRate"}, inplace=True)
-                        catDF = pumpEvents.groupby("age")
-
-                        # actual basal rates
-                        agePump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
-                        agePump["basalRate.min"] = catDF["basalRate"].min()
-                        agePump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
-                        agePump["basalRate.max"] = catDF["basalRate"].max()
-
-                        # insulin events
-                        insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
-                        agePump = pd.concat([agePump, insulinEvents], axis=1)
-
-                        # carbs entered in bolus calculator
-                        carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
-                        agePump = pd.concat([agePump, carbEvents], axis=1)
-
-                        # very low level cgm stats per age
-                        catDF = cgmLite.groupby("age")
-                        cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
-                        agePumpCgm = pd.concat([agePump, cgmStats], axis=1)
-
-                        agePumpCgm.reset_index(inplace=True)
-
-                        ageSummary = pd.merge(ageSummary, agePumpCgm, on="age", how="left")
-                        ageSummary["hashID"] = hashID
-                        allAgeSummaries = pd.concat([allAgeSummaries, ageSummary], ignore_index=True)
-
-                        allAgeSummaries.to_csv(os.path.join(outputPath,
-                            "allAgeSummaries-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
-
-                        # repoeat for years living with
-                        catDF = pumpEvents.groupby("ylw")
-                        # actual basal rates
-                        ylwPump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
-                        ylwPump["basalRate.min"] = catDF["basalRate"].min()
-                        ylwPump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
-                        ylwPump["basalRate.max"] = catDF["basalRate"].max()
-
-                        # insulin events
-                        insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
-                        ylwPump = pd.concat([ylwPump, insulinEvents], axis=1)
-
-                        # carbs entered in bolus calculator
-                        carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
-                        ylwPump = pd.concat([ylwPump, carbEvents], axis=1)
-
-                        # very low level cgm stats per age
-                        catDF = cgmLite.groupby("ylw")
-                        cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
-                        ylwPumpCgm = pd.concat([ylwPump, cgmStats], axis=1)
-
-                        ylwPumpCgm.reset_index(inplace=True)
-
-                        ylwSummary = pd.merge(ylwSummary, ylwPumpCgm, on="ylw", how="left")
-
-                        ylwSummary["hashID"] = hashID
-                        allYlwSummaries = pd.concat([allYlwSummaries, ylwSummary], ignore_index=True)
-
-                        allYlwSummaries.to_csv(os.path.join(outputPath,
-                            "allYlwSummaries-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
-
-                         # %% save data for this person
-#                        outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
-#                        outputFormat = (f"{minAge:02d}",
-#                                        f"{maxAge:02d}",
-#                                        f"{minYLW:02d}",
-#                                        f"{maxYLW:02d}",
-#                                        f"{nDaysClosedLoop:03d}",
-#                                        f"{n670gDays:03d}",
-#                                        hashID[0:4])
-#                        outputFolderName = outputString % outputFormat
-#                        outputFolderName_Path = os.path.join(outputPath,"data", outputFolderName)
-#                        if not os.path.exists(outputFolderName_Path):
-#                            os.makedirs(outputFolderName_Path)
-#
-#                        fName = outputFolderName + "-allSettings.csv"
-#                        allSettings.to_csv(os.path.join(outputFolderName_Path, fName))
-#                        fName = outputFolderName + "-pumpEvents.csv"
-#                        pumpEvents.to_csv(os.path.join(outputFolderName_Path, fName))
-#                        fName = outputFolderName + "-cgmLite.csv"
-#                        cgmLite.to_csv(os.path.join(outputFolderName_Path, fName))
-
-
-
-                        # %% save the processed data (saving this data will take up a lot of space and time)
-                        #data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
-                        #basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
-                        #bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
-                        #cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
-                        #pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
-
+                        else:
+                            metadata["flags"] = "no bolus wizard data"
                     else:
-                        metadata["flags"] = "no bolus wizard data"
+                        metadata["flags"] = "missing either pump or cgm  data"
                 else:
-                    metadata["flags"] = "missing either pump or cgm  data"
+                    metadata["flags"] = "file contains no data"
             else:
-                metadata["flags"] = "file contains no data"
+                metadata["flags"] = "file does not exist"
         else:
-            metadata["flags"] = "file does not exist"
-    else:
-        metadata["flags"] = "missing bDay/dDay"
+            metadata["flags"] = "missing bDay/dDay"
+
+    except:
+        print("something is broke dIndex=", dIndex)
+        metadata["flags"] = "something is broke"
+
 
     # write metaData to allMetadata
     allMetadata = pd.concat([allMetadata, metadata], axis=0, sort=True)

From ef2f955d29dc0de91d8dcd3ee2d12d8ea0ef3c58 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 15 Jan 2019 18:52:14 -0600
Subject: [PATCH 38/78] update flatten_json to include a list of fields to NOT
 flatten

---
 .../predict-simulate/get-users-settings-and-events.py | 11 ++++-------
 1 file changed, 4 insertions(+), 7 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 1c979d2f..6de58293 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -116,10 +116,8 @@ def tslimCalibrationFix(df):
 
 
 # OTHER
-def tempRemoveFields(df):
-    removeFields = ["suppressed",
-                    "recommended",
-                    "payload"]
+def tempRemoveFields(df, removeFields):
+
 
     tempRemoveFields = list(set(df) & set(removeFields))
     tempDf = df[tempRemoveFields]
@@ -128,10 +126,9 @@ def tempRemoveFields(df):
     return df, tempDf
 
 
-def flattenJson(df):
-
+def flattenJson(df, doNotFlattenList):
     # remove fields that we don't want to flatten
-    df, holdData = tempRemoveFields(df)
+    df, holdData = tempRemoveFields(df, doNotFlattenList)
 
     # get a list of data types of column headings
     columnHeadings = list(df)

From c10f4c6183bbff15b91d86091d040756becfc001 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 17 Jan 2019 04:43:49 -0600
Subject: [PATCH 39/78] syntax of new flatten_json function

---
 projects/predict-simulate/get-users-settings-and-events.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 6de58293..fe4c00ec 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -611,7 +611,10 @@ def getPumpSettingsStats(df, col, pumpCol):
                     data.sort_values("time", inplace=True)
 
                     # flatten the embedded json
-                    data = flattenJson(data)
+                    doNotFlattenList = ["suppressed", "recommended", "payload"]
+                    data = flattenJson(data, doNotFlattenList)
+
+                    pdb.set_trace()
 
 
                     # %% CLEAN DATA

From 7750289c40d42f10d8cef06d522b2e5351ab6fc2 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 18 Jan 2019 05:44:55 -0600
Subject: [PATCH 40/78] setting summaries per day should only have one entry
 per day

---
 .../get-users-settings-and-events.py          | 78 +++++++++----------
 1 file changed, 36 insertions(+), 42 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index fe4c00ec..693032a3 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -540,13 +540,7 @@ def getPumpSettingsStats(df, col, pumpCol):
     return df, df2
 
 
-
-
-
-# %% LOAD IN ONE FILE, BUT EVENTUALLY THIS WILL LOOOP THROUGH ALL USER'S
-
-
-
+# %% START OF CODE
 dataPulledDate = args.dateStamp
 dataPulledDF = pd.DataFrame(pd.to_datetime(dataPulledDate), columns=["day"], index=[0])
 dataPulledDF["day"] = dataPulledDF["day"].dt.date
@@ -556,7 +550,6 @@ def getPumpSettingsStats(df, col, pumpCol):
 phiOutputPath = os.path.join(donorPath, "PHI-settings-and-events")
 outputPath = os.path.join(donorPath, "settings-and-events")
 
-
 # create anonExportDataPath folders
 if not os.path.exists(phiOutputPath):
     os.makedirs(phiOutputPath)
@@ -614,8 +607,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                     doNotFlattenList = ["suppressed", "recommended", "payload"]
                     data = flattenJson(data, doNotFlattenList)
 
-                    pdb.set_trace()
-
 
                     # %% CLEAN DATA
                     # remove negative durations
@@ -662,10 +653,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                         data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
                         data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
 
-    #                    commonColumnHeadings = ["hashID",
-    #                                            "age",
-    #                                            "ylw"]
-
 
                         # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)
                         bolus = mergeWizardWithBolus(data)
@@ -689,7 +676,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                             bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
                             bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
 
-    #                        bolusCH = commonColumnHeadings.copy()
                             bolusCH = ["utcTime", "timezone", "roundedTime", "normal", "carbInput", "subType",
                                             "insulinOnBoard", "bgInput",
                                             "isf", "isf_mmolL_U", "insulinCarbRatio"]
@@ -752,9 +738,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                                 isfDaySummary["isf.min"] = isf["isf"]
                                 isfDaySummary["isf.weightedMean"] = isf["isf"]
                                 isfDaySummary["isf.max"] = isf["isf"]
-                                isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
-                                isfDaySummary.reset_index(inplace=True, drop=True)
-                                isfDaySummary.fillna(method='ffill', inplace=True)
 
                             else:
                                 isfColHead = "insulinSensitivities"
@@ -785,9 +768,14 @@ def getPumpSettingsStats(df, col, pumpCol):
                                     isf = pd.concat([isf, tempDF[isfColHeadings]], ignore_index=True)
                                     isfDaySummary = pd.concat([isfDaySummary, tempDaySummary], ignore_index=True)
 
-                                isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
-                                isfDaySummary.reset_index(inplace=True, drop=True)
-                                isfDaySummary.fillna(method='ffill', inplace=True)
+                            isfDaySummary = pd.concat([isfDaySummary, dataPulledDF], sort=False)
+                            isfDaySummary.reset_index(inplace=True, drop=True)
+                            isfDaySummary.fillna(method='ffill', inplace=True)
+                            # it is possible for someone to someone to change their schedule
+                            # in the middle of the day, take the latest change as the schedule
+                            # for that day.
+                            isfDaySummary.drop_duplicates(subset="day", keep="last", inplace=True)
+                            isfDaySummary.reset_index(inplace=True, drop=True)
 
                             # CIR
                             cirColHeadings = ["cir.localTime", "cir"]
@@ -806,9 +794,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                                 cirDaySummary["cir.min"] = cir["cir"]
                                 cirDaySummary["cir.weightedMean"] = cir["cir"]
                                 cirDaySummary["cir.max"] = cir["cir"]
-                                cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
-                                cirDaySummary.reset_index(inplace=True, drop=True)
-                                cirDaySummary.fillna(method='ffill', inplace=True)
 
                             else:
 
@@ -839,9 +824,13 @@ def getPumpSettingsStats(df, col, pumpCol):
                                     cir = pd.concat([cir, tempDF[cirColHeadings]], ignore_index=True)
                                     cirDaySummary = pd.concat([cirDaySummary, tempDaySummary], ignore_index=True)
 
-                                cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
-                                cirDaySummary.reset_index(inplace=True, drop=True)
-                                cirDaySummary.fillna(method='ffill', inplace=True)
+                            cirDaySummary = pd.concat([cirDaySummary, dataPulledDF], sort=False)
+                            cirDaySummary.fillna(method='ffill', inplace=True)
+                            # it is possible for someone to someone to change their schedule
+                            # in the middle of the day, take the latest change as the schedule
+                            # for that day.
+                            cirDaySummary.drop_duplicates(subset="day", keep="last", inplace=True)
+                            cirDaySummary.reset_index(inplace=True, drop=True)
 
 
                             # CORRECTION TARGET
@@ -880,10 +869,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                                         ctDaySummary[targetType + stat] = correctionTarget[targetType]
 
 
-                                ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
-                                ctDaySummary.reset_index(inplace=True, drop=True)
-                                ctDaySummary.fillna(method='ffill', inplace=True)
-
                             else:
                                 ctColHead = "bgTargets"
                                 correctionTarget = pd.DataFrame(columns=ctColHeadings)
@@ -923,10 +908,13 @@ def getPumpSettingsStats(df, col, pumpCol):
                                     ctDaySummary = pd.concat([ctDaySummary, tempDaySummary],
                                                              ignore_index=True, sort=False)
 
-                                ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
-                                ctDaySummary.fillna(method='ffill', inplace=True)
-                                ctDaySummary.drop_duplicates(inplace=True)
-                                ctDaySummary.reset_index(inplace=True, drop=True)
+                            ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
+                            ctDaySummary.fillna(method='ffill', inplace=True)
+                            # it is possible for someone to someone to change their schedule
+                            # in the middle of the day, take the latest change as the schedule
+                            # for that day.
+                            ctDaySummary.drop_duplicates(subset="day", keep="last", inplace=True)
+                            ctDaySummary.reset_index(inplace=True, drop=True)
 
                             # SCHEDULED BASAL RATES
                             sbrColHeadings = ["sbr.localTime", "rate", "sbr.type"]
@@ -940,7 +928,7 @@ def getPumpSettingsStats(df, col, pumpCol):
                                 if 'Auto Mode' not in actSched:
                                     tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
                                     tempDF["day"] = pumpSettings.loc[p, "day"]
-                                    tempDF["sbr.type"] = np.nan
+                                    tempDF["sbr.type"] = "regular"
                                     tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                     endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["sbr.localTime"], index=[0])
                                     tempDF = get_setting_durations(tempDF, "sbr", endOfDay)
@@ -952,7 +940,7 @@ def getPumpSettingsStats(df, col, pumpCol):
                                     tempDaySummary["sbr.weightedMean"] = \
                                         np.sum(tempDF["rate"] * tempDF["sbr.durationHours"]) / tempDF["sbr.durationHours"].sum()
                                     tempDaySummary["sbr.max"] = tempDF["rate"].max()
-                                    tempDaySummary["sbr.type"] = np.nan
+                                    tempDaySummary["sbr.type"] = "regular"
 
                                 else:
                                     tempDF = pd.DataFrame(index=[0])
@@ -972,9 +960,15 @@ def getPumpSettingsStats(df, col, pumpCol):
                                 sbrDaySummary = pd.concat([sbrDaySummary, tempDaySummary], ignore_index=True)
 
                             sbrDaySummary = pd.concat([sbrDaySummary, dataPulledDF], sort=False)
-                            sbrDaySummary.reset_index(inplace=True, drop=True)
                             sbrDaySummary.fillna(method='ffill', inplace=True)
+                            # it is possible for someone to someone to change their schedule
+                            # in the middle of the day, take the latest change as the schedule
+                            # for that day.
+                            sbrDaySummary.drop_duplicates(subset="day", keep="last", inplace=True)
+                            sbrDaySummary.reset_index(inplace=True, drop=True)
+
 
+                            # %% test this later
     #                        # max basal rate, max bolus amount, and insulin duration
     #                        if "rateMaximum" in list(data):
     #                            pdb.set_trace()
@@ -1014,7 +1008,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                             basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
 
                             # actual basal delivered
-    #                        abrColHeadings = commonColumnHeadings.copy()
                             abrColHeadings = ["utcTime", "timezone", "roundedTime", "durationHours", "rate", "type"]
                             abr = basal[abrColHeadings]
                             if "duration" in list(bolus):
@@ -1073,7 +1066,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                             cgmRecordsPerDay["date"] = cgmRecordsPerDay.index
 
                             # filter the cgm data
-    #                        cgmColHeadings = commonColumnHeadings.copy()
                             cgmColHeadings = ["utcTime", "timezone", "roundedTime", "value"]
 
                             # get data in mg/dL units
@@ -1156,6 +1148,7 @@ def getPumpSettingsStats(df, col, pumpCol):
                             ageSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
                             ageSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
                             ageSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+                            pdb.set_trace()
 
                             # add in isf stats
                             ageSummary["isf.nDays"] = catDF["isf.min"].count()
@@ -1356,7 +1349,7 @@ def getPumpSettingsStats(df, col, pumpCol):
                             allAgeSummaries = pd.concat([allAgeSummaries, ageSummary], ignore_index=True)
 
                             allAgeSummaries.to_csv(os.path.join(outputPath,
-                                "allAgeSummaries-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
+                                "allAgeSummaries-dIndex-" + str(startIndex) + ".csv"))
 
                             # repoeat for years living with
                             catDF = pumpEvents.groupby("ylw")
@@ -1387,7 +1380,8 @@ def getPumpSettingsStats(df, col, pumpCol):
                             allYlwSummaries = pd.concat([allYlwSummaries, ylwSummary], ignore_index=True)
 
                             allYlwSummaries.to_csv(os.path.join(outputPath,
-                                "allYlwSummaries-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
+                                "allYlwSummaries-dIndex-" + str(startIndex) + ".csv"))
+
 
                              # %% save data for this person
     #                        outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"

From a2c02e423a8f12b095a7ff1a88974719b477af6a Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 18 Jan 2019 05:45:50 -0600
Subject: [PATCH 41/78] remove leftover pdb.set_trace() breakpoint

---
 projects/predict-simulate/get-users-settings-and-events.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 693032a3..ff850b21 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -1148,7 +1148,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                             ageSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
                             ageSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
                             ageSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
-                            pdb.set_trace()
 
                             # add in isf stats
                             ageSummary["isf.nDays"] = catDF["isf.min"].count()

From 7ce58a93d147e8a8bb543c47582f7988610cc252 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 18 Jan 2019 06:30:09 -0600
Subject: [PATCH 42/78] add deviceId to correction target data

---
 .../predict-simulate/get-users-settings-and-events.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index ff850b21..b09ca4e6 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -834,8 +834,8 @@ def getPumpSettingsStats(df, col, pumpCol):
 
 
                             # CORRECTION TARGET
-                            ctColHeadings = ["ct.localTime", "ct.low", "ct.high", "ct.target", "ct.range"]
-                            ctDayColHeadings = ['day',
+                            ctColHeadings = ['deviceId', "ct.localTime", "ct.low", "ct.high", "ct.target", "ct.range"]
+                            ctDayColHeadings = ['day', 'deviceId',
                                                 "ct.low.min", "ct.low.weightedMean", "ct.low.max",
                                                 "ct.high.min", "ct.high.weightedMean", "ct.high.max",
                                                 "ct.target.min", "ct.target.weightedMean", "ct.target.max",
@@ -863,6 +863,7 @@ def getPumpSettingsStats(df, col, pumpCol):
                                 # add a day summary
                                 ctDaySummary = pd.DataFrame(columns=ctDayColHeadings)
                                 ctDaySummary["day"] = correctionTarget["ct.localTime"].dt.date
+                                ctDaySummary["deviceId"] = correctionTarget["deviceId"]
                                 # add min, weightedMean, and max
                                 for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
                                     for stat in [".min", ".weightedMean", ".max"]:
@@ -882,6 +883,7 @@ def getPumpSettingsStats(df, col, pumpCol):
                                     tempDF = pd.DataFrame(pumpSettings.loc[p, ctColHead + "." + actSched])
                                     targetTypes = list(set(list(tempDF)) - set(["start"]))
                                     tempDF["day"] = pumpSettings.loc[p, "day"]
+                                    tempDF["deviceId"] = pumpSettings.loc[p, "deviceId"]
                                     tempDF["ct.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                     endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["ct.localTime"], index=[0])
                                     tempDF = get_setting_durations(tempDF, "ct", endOfDay)
@@ -889,6 +891,7 @@ def getPumpSettingsStats(df, col, pumpCol):
 
                                     tempDaySummary = pd.DataFrame(columns=ctDayColHeadings, index=[0])
                                     tempDaySummary["day"] = tempDF["ct.localTime"].dt.date
+                                    tempDaySummary["deviceId"] = tempDF["deviceId"]
 
                                     for targetType in targetTypes:
                                         tempDF["ct." + targetType] = mmolL_to_mgdL(tempDF[targetType])
@@ -1177,6 +1180,8 @@ def getPumpSettingsStats(df, col, pumpCol):
                                     ageSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
                                     ageSummary[ch + ".max"] = catDF[ch].max()
 
+
+
                             ageSummary.reset_index(inplace=True)
 
                             analysisCriterion = ageSummary[((ageSummary["nDaysValidPump"]> 28) &
@@ -1271,7 +1276,7 @@ def getPumpSettingsStats(df, col, pumpCol):
                                                      startWithFirstRecord=True, verbose=False)
 
                             colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
-                                        "isf", "cir", "sbr",
+                                        "isf", "cir", "sbr", "deviceId",
                                         "ct.low", "ct.high", "ct.target", "ct.range",
                                         "sbr.type", "isf_mmolL_U"]
                             allSettings = allSettings[colOrder]

From d576b42ec8fa0eb790868a1c9511d1cb097a6f13 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 18 Jan 2019 08:46:39 -0600
Subject: [PATCH 43/78] fix correction target to match how pumps set correction
 target

Medtronic pumps use the upper limit of the range (target high) as the correction target.
---
 .../get-users-settings-and-events.py          | 111 +++++++++++-------
 1 file changed, 67 insertions(+), 44 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index b09ca4e6..74a142a2 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -835,11 +835,8 @@ def getPumpSettingsStats(df, col, pumpCol):
 
                             # CORRECTION TARGET
                             ctColHeadings = ['deviceId', "ct.localTime", "ct.low", "ct.high", "ct.target", "ct.range"]
-                            ctDayColHeadings = ['day', 'deviceId',
-                                                "ct.low.min", "ct.low.weightedMean", "ct.low.max",
-                                                "ct.high.min", "ct.high.weightedMean", "ct.high.max",
-                                                "ct.target.min", "ct.target.weightedMean", "ct.target.max",
-                                                "ct.range.min", "ct.range.weightedMean", "ct.range.max"]
+                            ctDayColHeadings = ['day', 'deviceId', "ct.low", "ct.high", "ct.target", "ct.range",
+                                                "ct.target.min", "ct.target.weightedMean", "ct.target.max"]
 
                             if "bgTarget.start" in list(pumpSettings):
                                 ctColHead = "bgTarget."
@@ -864,13 +861,26 @@ def getPumpSettingsStats(df, col, pumpCol):
                                 ctDaySummary = pd.DataFrame(columns=ctDayColHeadings)
                                 ctDaySummary["day"] = correctionTarget["ct.localTime"].dt.date
                                 ctDaySummary["deviceId"] = correctionTarget["deviceId"]
-                                # add min, weightedMean, and max
+
+                                # medtronic pumps use the target high as the correction target
+                                if sum(correctionTarget.deviceId.str.contains("ed")) > 0:
+                                    correctionTarget.loc[correctionTarget.deviceId.str.contains("ed"), "ct.target"] = \
+                                        correctionTarget.loc[correctionTarget.deviceId.str.contains("ed"), 'ct.high']
+
+                                if sum(correctionTarget.deviceId.str.contains("MMT")) > 0:
+                                    correctionTarget.loc[correctionTarget.deviceId.str.contains("MMT"), "ct.target"] = \
+                                        correctionTarget.loc[correctionTarget.deviceId.str.contains("MMT"), 'ct.high']
+
                                 for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
-                                    for stat in [".min", ".weightedMean", ".max"]:
-                                        ctDaySummary[targetType + stat] = correctionTarget[targetType]
+                                    ctDaySummary[targetType] = correctionTarget[targetType]
+
+                                ctDaySummary["ct.target.min"] = correctionTarget["ct.target"]
+                                ctDaySummary["ct.target.weightedMean"] = correctionTarget["ct.target"]
+                                ctDaySummary["ct.target.max"] = correctionTarget["ct.target"]
 
 
                             else:
+
                                 ctColHead = "bgTargets"
                                 correctionTarget = pd.DataFrame(columns=ctColHeadings)
 
@@ -884,32 +894,46 @@ def getPumpSettingsStats(df, col, pumpCol):
                                     targetTypes = list(set(list(tempDF)) - set(["start"]))
                                     tempDF["day"] = pumpSettings.loc[p, "day"]
                                     tempDF["deviceId"] = pumpSettings.loc[p, "deviceId"]
+
+                                    for targetType in ["low", "high", "target", "range"]:
+                                        if targetType in list(tempDF):
+                                            tempDF["ct." + targetType + "_mmolL"] = \
+                                                tempDF[targetType]
+
+                                            tempDF["ct." + targetType] = \
+                                                mmolL_to_mgdL(tempDF["ct." + targetType + "_mmolL"])
+                                        else:
+                                            tempDF["ct." + targetType + "_mmolL"] = np.nan
+                                            tempDF["ct." + targetType]  = np.nan
+
                                     tempDF["ct.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                     endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["ct.localTime"], index=[0])
                                     tempDF = get_setting_durations(tempDF, "ct", endOfDay)
                                     tempDF = tempDF[:-1]
 
-                                    tempDaySummary = pd.DataFrame(columns=ctDayColHeadings, index=[0])
+                                    # medtronic pumps use the target high as the correction target
+                                    if sum(tempDF.deviceId.str.contains("ed")) > 0:
+                                        tempDF.loc[tempDF.deviceId.str.contains("ed"), "ct.target"] = \
+                                            tempDF.loc[tempDF.deviceId.str.contains("ed"), 'ct.high']
+
+                                    if sum(tempDF.deviceId.str.contains("MMT")) > 0:
+                                        tempDF.loc[tempDF.deviceId.str.contains("MMT"), "ct.target"] = \
+                                            tempDF.loc[tempDF.deviceId.str.contains("MMT"), 'ct.high']
+
+                                    tempDaySummary = pd.DataFrame(index=[0], columns=ctDayColHeadings)
                                     tempDaySummary["day"] = tempDF["ct.localTime"].dt.date
                                     tempDaySummary["deviceId"] = tempDF["deviceId"]
+                                    tempDaySummary["ct.target.min"] = tempDF["ct.target"].min()
+                                    tempDaySummary["ct.target.weightedMean"] = \
+                                        np.sum(tempDF["ct.target"] * tempDF["ct.durationHours"]) / tempDF["ct.durationHours"].sum()
+                                    tempDaySummary["ct.target.max"] = tempDF["ct.target"].max()
 
-                                    for targetType in targetTypes:
-                                        tempDF["ct." + targetType] = mmolL_to_mgdL(tempDF[targetType])
+                                    for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
+                                        tempDaySummary[targetType] = tempDF[targetType]
 
-                                        tempDaySummary["ct." + targetType + ".min"] = tempDF["ct." + targetType].min()
-                                        tempDaySummary["ct." + targetType + ".weightedMean"] = \
-                                            np.sum(tempDF["ct." + targetType] * tempDF["ct.durationHours"]) / tempDF["ct.durationHours"].sum()
-                                        tempDaySummary["ct." + targetType + ".max"] = tempDF["ct." + targetType].max()
 
-                                    correctionTarget = \
-                                        pd.concat([correctionTarget,
-                                                   tempDF.drop(columns=['start',
-                                                                        'target',
-                                                                        'day',
-                                                                        'ct.durationHours'])],
-                                                   ignore_index=True, sort=False)
-                                    ctDaySummary = pd.concat([ctDaySummary, tempDaySummary],
-                                                             ignore_index=True, sort=False)
+                                    correctionTarget = pd.concat([correctionTarget, tempDF[ctColHeadings]], ignore_index=True)
+                                    ctDaySummary = pd.concat([ctDaySummary, tempDaySummary[ctDayColHeadings]], ignore_index=True)
 
                             ctDaySummary = pd.concat([ctDaySummary, dataPulledDF], sort=False)
                             ctDaySummary.fillna(method='ffill', inplace=True)
@@ -919,6 +943,10 @@ def getPumpSettingsStats(df, col, pumpCol):
                             ctDaySummary.drop_duplicates(subset="day", keep="last", inplace=True)
                             ctDaySummary.reset_index(inplace=True, drop=True)
 
+
+                            print(correctionTarget)
+                            print(ctDaySummary)
+
                             # SCHEDULED BASAL RATES
                             sbrColHeadings = ["sbr.localTime", "rate", "sbr.type"]
                             sbr = pd.DataFrame(columns=sbrColHeadings)
@@ -1125,10 +1153,13 @@ def getPumpSettingsStats(df, col, pumpCol):
                                         'cir.min',
                                         'cir.weightedMean',
                                         'cir.max',
-                                        'ct.low.min', 'ct.low.weightedMean', 'ct.low.max',
-                                        'ct.high.min', 'ct.high.weightedMean', 'ct.high.max',
-                                        'ct.target.min', 'ct.target.weightedMean', 'ct.target.max',
-                                        'ct.range.min', 'ct.range.weightedMean', 'ct.range.max',
+                                        'ct.low',
+                                        'ct.high',
+                                        'ct.target',
+                                        'ct.range',
+                                        'ct.target.min',
+                                        'ct.target.weightedMean',
+                                        'ct.target.max',
                                         'sbr.min',
                                         'sbr.weightedMean',
                                         'sbr.max',
@@ -1172,15 +1203,10 @@ def getPumpSettingsStats(df, col, pumpCol):
                             ageSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
 
                             # correctionTarget stats
-                            for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
-                                for stat in [".min", ".weightedMean", ".max"]:
-                                    ch = targetType + stat
-                                    ageSummary[ch + ".nDays"] = catDF[ch].count()
-                                    ageSummary[ch + ".min"] = catDF[ch].min()
-                                    ageSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
-                                    ageSummary[ch + ".max"] = catDF[ch].max()
-
-
+                            ageSummary["ct.nDays"] = catDF["ct.target.min"].count()
+                            ageSummary["ct.target.min"] = catDF["ct.target.min"].min()
+                            ageSummary["ct.target.weightedMean"] = catDF["ct.target.weightedMean"].sum() / catDF["ct.target.weightedMean"].count()
+                            ageSummary["ct.target.max"] = catDF["ct.target.max"].max()
 
                             ageSummary.reset_index(inplace=True)
 
@@ -1221,13 +1247,10 @@ def getPumpSettingsStats(df, col, pumpCol):
                             ylwSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
 
                             # correctionTarget stats
-                            for targetType in ["ct.low", "ct.high", "ct.target", "ct.range"]:
-                                for stat in [".min", ".weightedMean", ".max"]:
-                                    ch = targetType + stat
-                                    ylwSummary[ch + ".nDays"] = catDF[ch].count()
-                                    ylwSummary[ch + ".min"] = catDF[ch].min()
-                                    ylwSummary[ch + ".weightedMean"] = catDF[ch].sum() / catDF[ch].count()
-                                    ylwSummary[ch + ".max"] = catDF[ch].max()
+                            ylwSummary["ct.nDays"] = catDF["ct.target.min"].count()
+                            ylwSummary["ct.target.min"] = catDF["ct.target.min"].min()
+                            ylwSummary["ct.target.weightedMean"] = catDF["ct.target.weightedMean"].sum() / catDF["ct.target.weightedMean"].count()
+                            ylwSummary["ct.target.max"] = catDF["ct.target.max"].max()
 
                             ylwSummary.reset_index(inplace=True)
 

From 5ed36901549819d997e275eeb646b2955d880968 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 18 Jan 2019 09:07:26 -0600
Subject: [PATCH 44/78] only check deviceId if payload exists

---
 .../predict-simulate/get-users-settings-and-events.py  | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 74a142a2..4c8cff6f 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -90,11 +90,15 @@ def removeInvalidCgmValues(df):
 
 
 def tslimCalibrationFix(df):
-    searchfor = ['tan']
-    tandemDataIndex = ((df.deviceId.str.contains('|'.join(searchfor))) &
-                       (df.type == "deviceEvent"))
+
 
     if "payload.calibration_reading" in list(df):
+
+        searchfor = ['tan']
+        tandemDataIndex = ((df.deviceId.str.contains('|'.join(searchfor))) &
+                           (df.type == "deviceEvent"))
+
+
         payloadCalReadingIndex = df["payload.calibration_reading"].notnull()
 
         nTandemAndPayloadCalReadings = sum(tandemDataIndex &

From 39eee21e18ef4a37ad5ad2162cd5addebdb145b2 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 18 Jan 2019 09:14:32 -0600
Subject: [PATCH 45/78] add age and years with summaries, and save all data

---
 .../get-users-settings-and-events.py          | 123 ++++++++++++++----
 1 file changed, 97 insertions(+), 26 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 4c8cff6f..d27ef501 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -543,6 +543,9 @@ def getPumpSettingsStats(df, col, pumpCol):
 
     return df, df2
 
+# %% DELELET LATER
+#args.startIndex = 2
+
 
 # %% START OF CODE
 dataPulledDate = args.dateStamp
@@ -565,7 +568,7 @@ def getPumpSettingsStats(df, col, pumpCol):
 allMetadata = pd.DataFrame()
 allAgeSummaries = pd.DataFrame()
 allYlwSummaries = pd.DataFrame()
-
+allAgeANDylwSummaries = pd.DataFrame()
 
 # %% MAKE THIS A FUNCTION SO THAT IT CAN BE RUN PER EACH INDIVIDUAL
 nUniqueDonors = len(donors)
@@ -1265,6 +1268,41 @@ def getPumpSettingsStats(df, col, pumpCol):
                             metadata["minYLW"] = minYLW
                             metadata["maxYLW"] = maxYLW
 
+                            # age and ylw
+                            catDF = dayData.groupby(["age", "ylw"])
+                            ageANDylwSummary = pd.DataFrame(catDF.validPumpData.sum())
+                            ageANDylwSummary.rename(columns={"validPumpData": "nDaysValidPump"}, inplace=True)
+                            ageANDylwSummary["nDaysValidCgm"] = pd.DataFrame(catDF.validCGMData.sum())
+                            ageANDylwSummary["nDaysClosedLoop"] = pd.DataFrame(catDF["basal.closedLoopDays"].sum())
+                            ageANDylwSummary["n670gDays"] = pd.DataFrame(catDF["670g"].sum())
+
+                            ageANDylwSummary["isf.nDays"] = catDF["isf.min"].count()
+                            ageANDylwSummary["isf.min"] = catDF["isf.min"].min()
+                            ageANDylwSummary["isf.weightedMean"] = catDF["isf.weightedMean"].sum() / catDF["isf.weightedMean"].count()
+                            ageANDylwSummary["isf.max"] = catDF["isf.max"].max()
+
+                            # add cir stats
+                            ageANDylwSummary["cir.nDays"] = catDF["cir.min"].count()
+                            ageANDylwSummary["cir.min"] = catDF["cir.min"].min()
+                            ageANDylwSummary["cir.weightedMean"] = catDF["cir.weightedMean"].sum() / catDF["cir.weightedMean"].count()
+                            ageANDylwSummary["cir.max"] = catDF["cir.max"].max()
+
+                            # add sbr stats
+                            ageANDylwSummary["sbr.nDays"] = catDF["sbr.min"].count()
+                            ageANDylwSummary["sbr.min"] = catDF["sbr.min"].min()
+                            ageANDylwSummary["sbr.weightedMean"] = catDF["sbr.weightedMean"].sum() / catDF["sbr.weightedMean"].count()
+                            ageANDylwSummary["sbr.max"] = catDF["sbr.max"].max()
+                            ageANDylwSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
+
+                            # correctionTarget stats
+                            ageANDylwSummary["ct.nDays"] = catDF["ct.target.min"].count()
+                            ageANDylwSummary["ct.target.min"] = catDF["ct.target.min"].min()
+                            ageANDylwSummary["ct.target.weightedMean"] = catDF["ct.target.weightedMean"].sum() / catDF["ct.target.weightedMean"].count()
+                            ageANDylwSummary["ct.target.max"] = catDF["ct.target.max"].max()
+
+#                            analysisCriterion = ageANDylwSummary[((ageANDylwSummary["nDaysValidPump"]> 28) &
+#                                                            (ageANDylwSummary["nDaysValidCgm"]> 28))]
+
 
                             # %% calculate local time
                             abr["date"] = pd.to_datetime(abr["utcTime"].dt.date)
@@ -1414,35 +1452,68 @@ def getPumpSettingsStats(df, col, pumpCol):
                                 "allYlwSummaries-dIndex-" + str(startIndex) + ".csv"))
 
 
+                            # repeat for age AND years living with
+                            catDF = pumpEvents.groupby(["age", "ylw"])
+                            # actual basal rates
+                            ageANDylwPump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
+                            ageANDylwPump["basalRate.min"] = catDF["basalRate"].min()
+                            ageANDylwPump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
+                            ageANDylwPump["basalRate.max"] = catDF["basalRate"].max()
+
+                            # insulin events
+                            insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
+                            ageANDylwPump = pd.concat([ageANDylwPump, insulinEvents], axis=1)
+
+                            # carbs entered in bolus calculator
+                            carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
+                            ageANDylwPump = pd.concat([ageANDylwPump, carbEvents], axis=1)
+
+                            # very low level cgm stats per age and years living with
+                            catDF = cgmLite.groupby(["age", "ylw"])
+                            cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
+                            ageANDylwPumpCgm = pd.concat([ageANDylwPump, cgmStats], axis=1)
+
+                            ageANDylwSummary = ageANDylwSummary.join(ageANDylwPumpCgm, how="left")
+
+                            ageANDylwPumpCgm.reset_index(inplace=True)
+                            ageANDylwSummary.reset_index(inplace=True)
+
+                            ageANDylwSummary["hashID"] = hashID
+                            allAgeANDylwSummaries = pd.concat([allAgeANDylwSummaries, ageANDylwSummary], ignore_index=True)
+
+                            allAgeANDylwSummaries.to_csv(os.path.join(outputPath,
+                                "allAgeANDylwSummaries-dIndex-" + str(startIndex) + ".csv"))
+
+
                              # %% save data for this person
-    #                        outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
-    #                        outputFormat = (f"{minAge:02d}",
-    #                                        f"{maxAge:02d}",
-    #                                        f"{minYLW:02d}",
-    #                                        f"{maxYLW:02d}",
-    #                                        f"{nDaysClosedLoop:03d}",
-    #                                        f"{n670gDays:03d}",
-    #                                        hashID[0:4])
-    #                        outputFolderName = outputString % outputFormat
-    #                        outputFolderName_Path = os.path.join(outputPath,"data", outputFolderName)
-    #                        if not os.path.exists(outputFolderName_Path):
-    #                            os.makedirs(outputFolderName_Path)
-    #
-    #                        fName = outputFolderName + "-allSettings.csv"
-    #                        allSettings.to_csv(os.path.join(outputFolderName_Path, fName))
-    #                        fName = outputFolderName + "-pumpEvents.csv"
-    #                        pumpEvents.to_csv(os.path.join(outputFolderName_Path, fName))
-    #                        fName = outputFolderName + "-cgmLite.csv"
-    #                        cgmLite.to_csv(os.path.join(outputFolderName_Path, fName))
+                            outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
+                            outputFormat = (f"{minAge:02d}",
+                                            f"{maxAge:02d}",
+                                            f"{minYLW:02d}",
+                                            f"{maxYLW:02d}",
+                                            f"{int(nDaysClosedLoop):03d}",
+                                            f"{int(n670gDays):03d}",
+                                            hashID[0:4])
+                            outputFolderName = outputString % outputFormat
+                            outputFolderName_Path = os.path.join(outputPath, "data", outputFolderName)
+                            if not os.path.exists(outputFolderName_Path):
+                                os.makedirs(outputFolderName_Path)
+
+                            fName = outputFolderName + "-allSettings.csv"
+                            allSettings.to_csv(os.path.join(outputFolderName_Path, fName))
+                            fName = outputFolderName + "-pumpEvents.csv"
+                            pumpEvents.to_csv(os.path.join(outputFolderName_Path, fName))
+                            fName = outputFolderName + "-cgmLite.csv"
+                            cgmLite.to_csv(os.path.join(outputFolderName_Path, fName))
 
 
 
                             # %% save the processed data (saving this data will take up a lot of space and time)
-                            #data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
-                            #basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
-                            #bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
-                            #cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
-                            #pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
+                            data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
+                            basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
+                            bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
+                            cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
+                            pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
 
                         else:
                             metadata["flags"] = "no bolus wizard data"
@@ -1463,7 +1534,7 @@ def getPumpSettingsStats(df, col, pumpCol):
     # write metaData to allMetadata
     allMetadata = pd.concat([allMetadata, metadata], axis=0, sort=True)
     allMetadata.to_csv(os.path.join(outputPath,
-        "allMetadata-dIndex-" + str(startIndex) + "-" + str(dIndex) + ".csv"))
+        "allMetadata-dIndex-" + str(startIndex) + ".csv"))
 
     print("done with", dIndex)
 

From 4833b06dbc22931e19a3ebb4e1e4b21cdbd89c6e Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 22 Jan 2019 08:41:21 -0600
Subject: [PATCH 46/78] get rid of print correction target

---
 projects/predict-simulate/get-users-settings-and-events.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index d27ef501..a0a3328b 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -951,9 +951,6 @@ def getPumpSettingsStats(df, col, pumpCol):
                             ctDaySummary.reset_index(inplace=True, drop=True)
 
 
-                            print(correctionTarget)
-                            print(ctDaySummary)
-
                             # SCHEDULED BASAL RATES
                             sbrColHeadings = ["sbr.localTime", "rate", "sbr.type"]
                             sbr = pd.DataFrame(columns=sbrColHeadings)

From 47025c6935b74b87a61fe20330f4d351e0b8b926 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 22 Jan 2019 14:39:20 -0600
Subject: [PATCH 47/78] ignore copy of slice warning

---
 projects/predict-simulate/get-users-settings-and-events.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index a0a3328b..5cab1f4d 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -20,7 +20,7 @@
 import os
 import argparse
 import pdb
-
+pd.options.mode.chained_assignment = None  # default='warn'
 
 # %% USER INPUTS (ADD THIS IN LATER)
 codeDescription = "Get user's settings and events"

From ca91f4e1dbee22b1d4fb9f35a04183bc0b103ed0 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 22 Jan 2019 14:40:37 -0600
Subject: [PATCH 48/78] sense units of isf

---
 .../get-users-settings-and-events.py          | 77 +++++++++++++++----
 1 file changed, 64 insertions(+), 13 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 5cab1f4d..2ba00129 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -218,6 +218,10 @@ def mmolL_to_mgdL(mmolL):
     return mmolL * 18.01559
 
 
+def mgdL_to_mmolL(mgdL):
+    return mgdL / 18.01559  # inverse of mmolL_to_mgdL (same 18.01559 factor)
+
+
 def round_time(df, timeIntervalMinutes=5, timeField="time",
                roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
                verbose=False):
@@ -543,8 +547,23 @@ def getPumpSettingsStats(df, col, pumpCol):
 
     return df, df2
 
+
+def isf_likely_units(df, columnHeading):
+    """Guess the units ("mg/dL" or "mmol/L") of the ISF values in a column.
+
+    "mg/dL" needs whole-number min AND max (non-null) and a max above 22.1.
+    """
+    isfNotNull = df[df[columnHeading].notnull()][columnHeading]
+    minVal, maxVal = np.min(isfNotNull), np.max(isfNotNull)
+    minDiff = np.abs(minVal - np.round(minVal))
+    maxDiff = np.abs(maxVal - np.round(maxVal))
+    # check minDiff as well (maxDiff used to be tested twice by mistake)
+    return "mg/dL" if ((minDiff == 0) & (maxDiff == 0) & (maxVal > 22.1)) else "mmol/L"
+
+
+
 # %% DELELET LATER
-#args.startIndex = 2
+args.startIndex = 96
 
 
 # %% START OF CODE
@@ -673,15 +692,21 @@ def getPumpSettingsStats(df, col, pumpCol):
                             # get a summary of boluses per day
                             bolusDaySummary = get_bolusDaySummary(bolus)
 
-    #                        # isf and cir associated with bolus event
-    #                        if "insulinSensitivities" in list(bolus):
-    #                            pdb.set_trace()
-    #
-    #                        if "carbRatios" in list(bolus):
-    #                            pdb.set_trace()
+                            # figure out likely isf units
+                            isfUnits = isf_likely_units(bolus, "insulinSensitivity")
+                            metadata["bolus.isfLikelyUnits"] = isfUnits
+
+                            if isfUnits in "mmol/L":
+
+                                bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
+                                bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
+
+                            else:
+                                # I am pretty sure this case does NOT exist
+                                pdb.set_trace()
+                                bolus["isf"] = bolus["insulinSensitivity"]
+                                bolus["isf_mmolL_U"]  = mgdL_to_mmolL(bolus["isf"])
 
-                            bolus["isf_mmolL_U"] = bolus["insulinSensitivity"]
-                            bolus["isf"] = mmolL_to_mgdL(bolus["isf_mmolL_U"])
 
                             bolusCH = ["utcTime", "timezone", "roundedTime", "normal", "carbInput", "subType",
                                             "insulinOnBoard", "bgInput",
@@ -732,8 +757,21 @@ def getPumpSettingsStats(df, col, pumpCol):
 
                             if "insulinSensitivity.amount" in list(pumpSettings):
                                 isfColHead = "insulinSensitivity"
-                                pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
-                                pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
+
+                                # figure out likely isf units
+                                isfUnits = isf_likely_units(pumpSettings, "insulinSensitivity.amount")
+                                metadata["pumpSettings.isfLikelyUnits"] = isfUnits
+
+                                if isfUnits in "mmol/L":
+
+                                    pumpSettings["isf_mmolL_U"] = pumpSettings[isfColHead + ".amount"]
+                                    pumpSettings["isf"] = mmolL_to_mgdL(pumpSettings["isf_mmolL_U"])
+
+                                else:
+
+                                    pumpSettings["isf"] = pumpSettings[isfColHead + ".amount"]
+                                    pumpSettings["isf_mmolL_U"] = mgdL_to_mmolL(pumpSettings["isf"])
+
                                 pumpSettings["isf.localTime"] = pd.to_datetime(pumpSettings["day"]) + \
                                     pd.to_timedelta(pumpSettings[isfColHead + ".start"], unit="ms")
 
@@ -759,8 +797,21 @@ def getPumpSettingsStats(df, col, pumpCol):
                                     tempDF = pd.DataFrame(pumpSettings.loc[p, isfColHead + "." + actSched])
                                     tempDF["day"] = pumpSettings.loc[p, "day"]
                                     tempDF["isf.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-                                    tempDF["isf_mmolL_U"] = tempDF["amount"]
-                                    tempDF["isf"] = mmolL_to_mgdL(tempDF["isf_mmolL_U"])
+
+                                    # figure out likely isf units
+                                    isfUnits = isf_likely_units(tempDF, "amount")
+                                    metadata["tempDF.isfLikelyUnits"] = isfUnits
+
+                                    if isfUnits in "mmol/L":
+
+                                        tempDF["isf_mmolL_U"] = tempDF["amount"]
+                                        tempDF["isf"] = mmolL_to_mgdL(tempDF["isf_mmolL_U"])
+
+                                    else:
+
+                                        tempDF["isf"] = tempDF["amount"]
+                                        tempDF["isf_mmolL_U"] = mgdL_to_mmolL(tempDF["isf"])
+
                                     endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["isf.localTime"], index=[0])
                                     tempDF = get_setting_durations(tempDF, "isf", endOfDay)
                                     tempDF = tempDF[:-1]

From 691b0e0d589ddd84942044be1760db5ae4a8501c Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 22 Jan 2019 14:41:03 -0600
Subject: [PATCH 49/78] make sure n670g days returns 0 instead of False when no
 data

---
 projects/predict-simulate/get-users-settings-and-events.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 2ba00129..dfbc6f1a 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -398,6 +398,9 @@ def getClosedLoopDays(groupedData, nTempBasalsPerDayIsClosedLoop, metadata):
         med670g = pd.DataFrame(topPump.str.contains("1780")).rename(columns={"top":"670g"})
         med670g.reset_index(inplace=True)
         n670gDays = med670g["670g"].sum()
+        if n670gDays == 0:
+            med670g = pd.DataFrame(columns=["670g", "day"])
+
 
     else:
         closedLoopDF = pd.DataFrame(columns=["basal.closedLoopDays", "day"])

From a7469788bbb5707cfc1752b8e50e5fd4903be9d1 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 22 Jan 2019 14:42:24 -0600
Subject: [PATCH 50/78] allow for valid cgm to account for free style 15 minute
 data interval

and a few other small refactors
---
 .../get-users-settings-and-events.py          | 39 ++++++++++++-------
 1 file changed, 24 insertions(+), 15 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index dfbc6f1a..81a4ee66 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -726,7 +726,6 @@ def isf_likely_units(df, columnHeading):
                                 bolus["duration"].replace(0, np.nan, inplace=True)
                                 bolus["durationHours"] = bolus["duration"] / 1000.0 / 3600.0
                                 bolus["rate"] = bolus["extended"] / bolus["durationHours"]
-    #                            bolusExtendedCH = commonColumnHeadings.copy()
                                 bolusExtendedCH = ["utcTime", "timezone", "roundedTime", "durationHours", "rate",  "type"]
                                 bolusExtendedEvents = bolus.loc[
                                         ((bolus["extended"].notnull()) &
@@ -1071,6 +1070,8 @@ def isf_likely_units(df, columnHeading):
                             basal = data[data.type == "basal"].copy().dropna(axis=1, how="all")
                             basal.sort_values("uploadTime", ascending=False, inplace=True)
 
+                            metadata["pump.top"] = basal.deviceId.describe()["top"]
+
                             basalBeginDate, basalEndDate = getStartAndEndTimes(basal, "day")
                             metadata["basal.beginDate"] = basalBeginDate
                             metadata["basal.endDate"] = basalEndDate
@@ -1142,6 +1143,10 @@ def isf_likely_units(df, columnHeading):
                             cgmData, percentDexcom = getListOfDexcomCGMDays(cgmData)
                             metadata["cgm.percentDexcomCGM"] = percentDexcom
 
+                            # see if cgm is freestyle
+                            cgmData["isFreeStyle"] = cgmData["deviceId"].str.contains("Free")
+                            metadata["cgm.top"] = cgmData.deviceId.describe()["top"]
+
                             # group by date (day) and get stats
                             catDF = cgmData.groupby(cgmData["day"])
                             cgmRecordsPerDay = \
@@ -1149,9 +1154,12 @@ def isf_likely_units(df, columnHeading):
                                 rename(columns={"value": "cgm.count"})
                             dayDate = catDF.day.describe()["top"]
                             dexcomCGM = catDF.dexcomCGM.describe()["top"]
-                            nTypesCGM = catDF.dexcomCGM.describe()["unique"]
+                            freeStyleCGM = catDF.isFreeStyle.describe()["top"]
+#                            nTypesCGM = catDF.dexcomCGM.describe()["unique"]
                             cgmRecordsPerDay["cgm.dexcomOnly"] = \
-                                (dexcomCGM & (nTypesCGM == 1))
+                                (dexcomCGM & (catDF.dexcomCGM.describe()["unique"] == 1))
+                            cgmRecordsPerDay["cgm.freeStyleOnly"] = \
+                                (freeStyleCGM & (catDF.isFreeStyle.describe()["unique"] == 1))
                             cgmRecordsPerDay["date"] = cgmRecordsPerDay.index
 
                             # filter the cgm data
@@ -1189,7 +1197,10 @@ def isf_likely_units(df, columnHeading):
                                 dayData = pd.merge(dayData, dfType, on="day", how="left")
 
                             dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
-                            dayData["validCGMData"] = dayData["cgm.count"] > (288*.75)
+
+                            dayData["validCGMData"] = \
+                                ((dayData["cgm.count"] > (288*.75)) |
+                                 (dayData["cgm.count"] > (96*.75)) & (dayData["cgm.freeStyleOnly"]))
 
                             dayData["timezone"].fillna(method='ffill', inplace=True)
                             dayData["timezone"].fillna(method='bfill', inplace=True)
@@ -1268,8 +1279,8 @@ def isf_likely_units(df, columnHeading):
 
                             ageSummary.reset_index(inplace=True)
 
-                            analysisCriterion = ageSummary[((ageSummary["nDaysValidPump"]> 28) &
-                                                            (ageSummary["nDaysValidCgm"]> 28))]
+                            analysisCriterion = ageSummary[((ageSummary["nDaysValidPump"]> 0) &
+                                                            (ageSummary["nDaysValidCgm"]> 0))]
                             minAge = analysisCriterion["age"].min()
                             maxAge = analysisCriterion["age"].max()
                             nDaysClosedLoop = analysisCriterion["nDaysClosedLoop"].sum()
@@ -1312,8 +1323,8 @@ def isf_likely_units(df, columnHeading):
 
                             ylwSummary.reset_index(inplace=True)
 
-                            analysisCriterion = ylwSummary[((ylwSummary["nDaysValidPump"]> 28) &
-                                                            (ylwSummary["nDaysValidCgm"]> 28))]
+                            analysisCriterion = ylwSummary[((ylwSummary["nDaysValidPump"]> 0) &
+                                                            (ylwSummary["nDaysValidCgm"]> 0))]
                             minYLW = analysisCriterion["ylw"].min()
                             maxYLW = analysisCriterion["ylw"].max()
                             metadata["minYLW"] = minYLW
@@ -1351,12 +1362,10 @@ def isf_likely_units(df, columnHeading):
                             ageANDylwSummary["ct.target.weightedMean"] = catDF["ct.target.weightedMean"].sum() / catDF["ct.target.weightedMean"].count()
                             ageANDylwSummary["ct.target.max"] = catDF["ct.target.max"].max()
 
-#                            analysisCriterion = ageANDylwSummary[((ageANDylwSummary["nDaysValidPump"]> 28) &
-#                                                            (ageANDylwSummary["nDaysValidCgm"]> 28))]
-
 
                             # %% calculate local time
                             abr["date"] = pd.to_datetime(abr["utcTime"].dt.date)
+
                             abr = pd.merge(abr, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
                             abr["localTime"] = abr["utcTime"] + pd.to_timedelta(abr["tzo"], unit="m")
 
@@ -1538,10 +1547,10 @@ def isf_likely_units(df, columnHeading):
 
                              # %% save data for this person
                             outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
-                            outputFormat = (f"{minAge:02d}",
-                                            f"{maxAge:02d}",
-                                            f"{minYLW:02d}",
-                                            f"{maxYLW:02d}",
+                            outputFormat = (f"{int(minAge):02d}",
+                                            f"{int(maxAge):02d}",
+                                            f"{int(minYLW):02d}",
+                                            f"{int(maxYLW):02d}",
                                             f"{int(nDaysClosedLoop):03d}",
                                             f"{int(n670gDays):03d}",
                                             hashID[0:4])

From d92521401d22789ef70632d9ca04699c5eeb3b60 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 22 Jan 2019 14:45:31 -0600
Subject: [PATCH 51/78] comment out breakpoints (for now while developing)

---
 projects/predict-simulate/get-users-settings-and-events.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 81a4ee66..05fd87b8 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -566,7 +566,7 @@ def isf_likely_units(df, columnHeading):
 
 
 # %% DELELET LATER
-args.startIndex = 96
+#args.startIndex = 96
 
 
 # %% START OF CODE
@@ -706,7 +706,7 @@ def isf_likely_units(df, columnHeading):
 
                             else:
                                 # I am pretty sure this case does NOT exist
-                                pdb.set_trace()
+#                                pdb.set_trace()
                                 bolus["isf"] = bolus["insulinSensitivity"]
                                 bolus["isf_mmolL_U"]  = mgdL_to_mmolL(bolus["isf"])
 

From 62202bd54d1cb81a1be8b1a305f0fd6160cebb2f Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 22 Jan 2019 14:51:59 -0600
Subject: [PATCH 52/78] make sure nDays with closed loop data is 0 and not
 False

---
 projects/predict-simulate/get-users-settings-and-events.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 05fd87b8..59f0f0f4 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -391,6 +391,9 @@ def getClosedLoopDays(groupedData, nTempBasalsPerDayIsClosedLoop, metadata):
             closedLoopDF["basal.temp.count"] >= nTB
         nClosedLoopDays = closedLoopDF["basal.closedLoopDays"].sum()
 
+        if nClosedLoopDays == 0:
+            closedLoopDF = pd.DataFrame(columns=["basal.closedLoopDays", "day"])
+
         # get the number of days with 670g
         basalData["day"] = pd.to_datetime(basalData.time).dt.date
         bdGroup = basalData.groupby("day")

From 748d375e53ca8863f0f67ee2db96f1cb80c50378 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 22 Jan 2019 20:52:56 -0600
Subject: [PATCH 53/78] deal with edge case where schedule has no information

---
 projects/predict-simulate/get-users-settings-and-events.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 59f0f0f4..fcba153d 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -1018,6 +1018,10 @@ def isf_likely_units(df, columnHeading):
                                     actSched = str(int(actSched))
                                 if 'Auto Mode' not in actSched:
                                     tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
+                                    if len(tempDF) == 0:
+                                        tempDF.loc[0, "start"] = 0
+                                        tempDF.loc[0, "rate"] = 0
+
                                     tempDF["day"] = pumpSettings.loc[p, "day"]
                                     tempDF["sbr.type"] = "regular"
                                     tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")

From 76b83cf7fc989078af49ffc13c944e383efae3a5 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 22 Jan 2019 21:58:03 -0600
Subject: [PATCH 54/78] deal with edge case where there is not enough pump
 and/or cgm data

---
 .../get-users-settings-and-events.py          | 20 +++++++++++--------
 1 file changed, 12 insertions(+), 8 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index fcba153d..47c1b7ab 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -1037,6 +1037,7 @@ def isf_likely_units(df, columnHeading):
                                     tempDaySummary["sbr.max"] = tempDF["rate"].max()
                                     tempDaySummary["sbr.type"] = "regular"
 
+
                                 else:
                                     tempDF = pd.DataFrame(index=[0])
                                     tempDF["day"] = pumpSettings.loc[p, "day"]
@@ -1552,16 +1553,20 @@ def isf_likely_units(df, columnHeading):
                                 "allAgeANDylwSummaries-dIndex-" + str(startIndex) + ".csv"))
 
 
-                             # %% save data for this person
-                            outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
-                            outputFormat = (f"{int(minAge):02d}",
-                                            f"{int(maxAge):02d}",
+                            # %% save data for this person
+                            if ((pd.notna(minAge)) & (pd.notna(minYLW))):
+                                outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
+                                outputFormat = (f"{int(minAge):02d}",
+                                                f"{int(maxAge):02d}",
                                             f"{int(minYLW):02d}",
                                             f"{int(maxYLW):02d}",
                                             f"{int(nDaysClosedLoop):03d}",
-                                            f"{int(n670gDays):03d}",
-                                            hashID[0:4])
-                            outputFolderName = outputString % outputFormat
+                                                f"{int(n670gDays):03d}",
+                                                hashID[0:4])
+                                outputFolderName = outputString % outputFormat
+                            else:
+                                outputFolderName = "dIndex-" + str(dIndex) + "-investigate-" + str(hashID[0:4])
+
                             outputFolderName_Path = os.path.join(outputPath, "data", outputFolderName)
                             if not os.path.exists(outputFolderName_Path):
                                 os.makedirs(outputFolderName_Path)
@@ -1574,7 +1579,6 @@ def isf_likely_units(df, columnHeading):
                             cgmLite.to_csv(os.path.join(outputFolderName_Path, fName))
 
 
-
                             # %% save the processed data (saving this data will take up a lot of space and time)
                             data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
                             basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))

From f4fdd04d50c09f2b749dd879dfd8523f52092e33 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Wed, 23 Jan 2019 05:36:29 -0600
Subject: [PATCH 55/78] edge case where active schedule is null

---
 .../get-users-settings-and-events.py          | 39 ++++++++++++-------
 1 file changed, 26 insertions(+), 13 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 47c1b7ab..6040af43 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -1017,14 +1017,16 @@ def isf_likely_units(df, columnHeading):
                                 if isinstance(actSched, float):
                                     actSched = str(int(actSched))
                                 if 'Auto Mode' not in actSched:
-                                    tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
-                                    if len(tempDF) == 0:
-                                        tempDF.loc[0, "start"] = 0
-                                        tempDF.loc[0, "rate"] = 0
-
-                                    tempDF["day"] = pumpSettings.loc[p, "day"]
-                                    tempDF["sbr.type"] = "regular"
-                                    tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
+                                    # edge case where a active schedule is nan
+                                    try:
+                                        tempDF = pd.DataFrame(pumpSettings.loc[p, "basalSchedules." + actSched])
+                                    except:
+                                        tempDF = pd.DataFrame()
+                                        metadata["issueWithBasalSchedule"] = True
+                                    if len(tempDF) > 0:
+                                        tempDF["day"] = pumpSettings.loc[p, "day"]
+                                        tempDF["sbr.type"] = "regular"
+                                        tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
                                     endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["sbr.localTime"], index=[0])
                                     tempDF = get_setting_durations(tempDF, "sbr", endOfDay)
                                     tempDF = tempDF[:-1]
@@ -1033,11 +1035,22 @@ def isf_likely_units(df, columnHeading):
                                     tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
                                     tempDaySummary["sbr.min"] = tempDF["rate"].min()
                                     tempDaySummary["sbr.weightedMean"] = \
-                                        np.sum(tempDF["rate"] * tempDF["sbr.durationHours"]) / tempDF["sbr.durationHours"].sum()
-                                    tempDaySummary["sbr.max"] = tempDF["rate"].max()
-                                    tempDaySummary["sbr.type"] = "regular"
-
-
+                                            np.sum(tempDF["rate"] * tempDF["sbr.durationHours"]) / tempDF["sbr.durationHours"].sum()
+                                        tempDaySummary["sbr.max"] = tempDF["rate"].max()
+                                        tempDaySummary["sbr.type"] = "regular"
+                                    else:
+                                        tempDF = pd.DataFrame(index=[0])
+                                        tempDF["day"] = pumpSettings.loc[p, "day"]
+                                        tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"])
+                                        tempDF["rate"] = np.nan
+                                        tempDF["sbr.type"] = "AutoMode"
+
+                                        tempDaySummary = pd.DataFrame(index=[0])
+                                        tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
+                                        tempDaySummary["sbr.min"] = np.nan
+                                        tempDaySummary["sbr.weightedMean"] = np.nan
+                                        tempDaySummary["sbr.max"] = np.nan
+                                        tempDaySummary["sbr.type"] = "missingNullOrIssue"
                                 else:
                                     tempDF = pd.DataFrame(index=[0])
                                     tempDF["day"] = pumpSettings.loc[p, "day"]

From 61c97b8b1ebfda44ed9f1b212c2e4b15183e843a Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 24 Jan 2019 05:18:06 -0600
Subject: [PATCH 56/78] rename schedule basal rate summary data columns

---
 projects/predict-simulate/get-users-settings-and-events.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 6040af43..4cb374b8 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -1290,7 +1290,8 @@ def isf_likely_units(df, columnHeading):
                             ageSummary["sbr.min"] = catDF["sbr.min"].min()
                             ageSummary["sbr.weightedMean"] = catDF["sbr.weightedMean"].sum() / catDF["sbr.weightedMean"].count()
                             ageSummary["sbr.max"] = catDF["sbr.max"].max()
-                            ageSummary["sbr.nAutoMode"] = catDF["sbr.type"].count()
+                            ageSummary["sbr.typeTop"] = catDF["sbr.type"].describe()["top"]
+                            ageSummary["sbr.typeCount"] = catDF["sbr.type"].count()
 
                             # correctionTarget stats
                             ageSummary["ct.nDays"] = catDF["ct.target.min"].count()

From 9961c0c02b0cb9f7f73fa3db37b229c1a9c83fae Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 24 Jan 2019 05:33:19 -0600
Subject: [PATCH 57/78] update to the todo list at end of file

---
 projects/predict-simulate/get-users-settings-and-events.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 4cb374b8..a19dd5a4 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -1625,18 +1625,15 @@ def isf_likely_units(df, columnHeading):
 
 
 # %% V2 DATA TO GRAB
-# THERE IS AN ISSUE WITH COUNTING 670G SETTINGS
+# INVESTIGATE SETTINGS OUTLIERS
 # ADD ROUNDEDLOCAL TIME TO THE END RESULTS
 # CALCULATE MMOL SUMMARIES
-# GET RID OF ROUNDING TIME AT THE BEGINNING
 # DEFINE A DAY BETWEEN 6AM AND 6AM
-# EXPAND THE CORRECTION TIME VALUES TO BE UNIFORM ACROSS ALL USERS AND DEVICES
 # FIX DAYLIGHT SAVINGS TIME TIMES
 # FIGURE OUT WHY TEMP BASAL COUNTS ARE DIFFERENT BETWEEN THE TWO DIFFERENT METHODS
 # MAX BASAL RATE, MAX BOLUS AMOUNT, AND INSULIN DURATION SET ON SELECT PUMPS
 # ALERT SETTINGS
 # ESTIMATED LOCAL TIME
-# PUMP AND CGM DEVICE ()
 # GLYCEMIC OUTCOMES
 # DO NOT ROUND DATA
 # INFUSION SITE CHANGES

From c43efc51715ee3202f9bf0f811a5fd3de0677132 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 24 Jan 2019 09:11:46 -0600
Subject: [PATCH 58/78] add to list of issues to investigate

---
 projects/predict-simulate/get-users-settings-and-events.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index a19dd5a4..021f5f5e 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -1625,7 +1625,7 @@ def isf_likely_units(df, columnHeading):
 
 
 # %% V2 DATA TO GRAB
-# INVESTIGATE SETTINGS OUTLIERS
+# INVESTIGATE SETTINGS OUTLIERS (Paradigm Veo pumps have unrealistic high ISF)
 # ADD ROUNDEDLOCAL TIME TO THE END RESULTS
 # CALCULATE MMOL SUMMARIES
 # DEFINE A DAY BETWEEN 6AM AND 6AM

From 9d69760ef3234273ec8af478096086763e1955dd Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 24 Jan 2019 12:33:08 -0600
Subject: [PATCH 59/78] adding to list of potential issues to examine

---
 projects/predict-simulate/get-users-settings-and-events.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 021f5f5e..b393d953 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -1625,7 +1625,7 @@ def isf_likely_units(df, columnHeading):
 
 
 # %% V2 DATA TO GRAB
-# INVESTIGATE SETTINGS OUTLIERS (Paradigm Veo pumps have unrealistic high ISF)
+# INVESTIGATE SETTINGS OUTLIERS (Paradigm Veo pumps have unrealistic high ISF, ommipod with likely mg/dL have wrong correction target)
 # ADD ROUNDEDLOCAL TIME TO THE END RESULTS
 # CALCULATE MMOL SUMMARIES
 # DEFINE A DAY BETWEEN 6AM AND 6AM

From b015a1e2191197a049e4791d144c65b9c156992b Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 29 Jan 2019 04:55:44 -0600
Subject: [PATCH 60/78] changing a day to reflect local time

---
 .../get-users-settings-and-events.py                 | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index b393d953..e90f12f2 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -668,7 +668,11 @@ def isf_likely_units(df, columnHeading):
                         data["utcTime"] = pd.to_datetime(data["time"])
                         data["timezone"].fillna(method='ffill', inplace=True)
                         data["timezone"].fillna(method='bfill', inplace=True)
-                        data["day"] = pd.DatetimeIndex(data["utcTime"]).date
+
+                        # estimate local time (simple method)
+                        data["tzo"] = data[['utcTime', 'timezone']].apply(lambda x: getTimezoneOffset(*x), axis=1)
+                        data["localTime"] = data["utcTime"] + pd.to_timedelta(data["tzo"], unit="m")
+                        data["day"] = pd.DatetimeIndex(data["localTime"]).date
 
                         # round to the nearest 5 minutes
                         # TODO: once roundTime is pushed to tidals repository then this line can be replaced
@@ -676,14 +680,16 @@ def isf_likely_units(df, columnHeading):
                         data = round_time(data, timeIntervalMinutes=5, timeField="time",
                                           roundedTimeFieldName="roundedTime", startWithFirstRecord=True,
                                           verbose=False)
+
+                        data["roundedLocalTime"] = data["roundedTime"] + pd.to_timedelta(data["tzo"], unit="m")
                         data.sort_values("uploadTime", ascending=False, inplace=True)
 
 
                         # %% ID, HASHID, AGE, & YLW
                         data["userID"] = userID
                         data["hashID"] = hashID
-                        data["age"] = np.floor((data["utcTime"] - bDate).dt.days/365.25).astype(int)
-                        data["ylw"] = np.floor((data["utcTime"] - dDate).dt.days/365.25).astype(int)
+                        data["age"] = np.floor((data["localTime"] - bDate).dt.days/365.25).astype(int)
+                        data["ylw"] = np.floor((data["localTime"] - dDate).dt.days/365.25).astype(int)
 
 
                         # %% BOLUS EVENTS (CORRECTION, AND MEAL INCLUING: CARBS, EXTENDED, DUAL)

From 5597a85b4ba30cc68f6b1c291d5b0e9ef17acccd Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 29 Jan 2019 12:15:01 -0600
Subject: [PATCH 61/78] add total daily dose and correct basals that extend
 past midnight

---
 .../get-users-settings-and-events.py          | 412 ++++++++++--------
 1 file changed, 227 insertions(+), 185 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index e90f12f2..21e616b2 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -339,15 +339,18 @@ def get_bolusDaySummary(bolusData):
     return bolusDaySummary
 
 
-def get_basalDaySummary(basal):
+def get_basalDaySummary(df):
     # group data by day
-    basalByDay = basal.groupby(basal["day"])
+    basalByDay = df.groupby(df["day"])
 
     # total basal insulin per day
     basalDaySummary = pd.DataFrame(basalByDay.totalAmountOfBasalInsulin.sum())
 
+    # total duration per each day (this should add up to 24 hours)
+    basalDaySummary["totalBasalDuration"] = basalByDay.durationHours.sum()
+
     # total number of basals types per day
-    basalTypePerDay = basal.groupby(["day", "deliveryType"]).size().unstack()
+    basalTypePerDay = df.groupby(["day", "deliveryType"]).size().unstack()
 
     basalDaySummary["numberOfScheduledBasals"] = basalTypePerDay["scheduled"].fillna(0)
     if "suspend" not in list(basalTypePerDay):
@@ -567,9 +570,96 @@ def isf_likely_units(df, columnHeading):
     return likelyUnits
 
 
+def correct_basal_extends_past_midnight(df, timeCol, dayCol):
+    # deal with case when basal extends past midnight due to utcTime and localTime difference
+    df.sort_values(timeCol, inplace=True)
+    uniqueDays = pd.DatetimeIndex(df[dayCol].unique())
+    midnightsNotInBasalData = uniqueDays[~uniqueDays.isin(df[timeCol])]
+    for midnight in midnightsNotInBasalData:
+        # find the last basal prior to midnight
+        dayBefore = midnight - pd.Timedelta(24, unit="h")
+        dataDayBefore = df[(df[timeCol] < midnight) & (df[timeCol] > dayBefore)]
+
+        if len(dataDayBefore) > 0:
+
+            basalPriorToMidnight = dataDayBefore[dataDayBefore[timeCol] == dataDayBefore[timeCol].max()]
+            indexToDrop = basalPriorToMidnight.index.values[0]
+            oldDuration = basalPriorToMidnight.loc[indexToDrop, "duration"]
+            newDuration = (midnight - basalPriorToMidnight.loc[indexToDrop, timeCol]).seconds * 1000.0
+            newMidnightDuration = oldDuration - newDuration
+
+            newBasalPriorToMidnight = df.copy().drop(index=df.index)
+            newBasalPriorToMidnight.loc[0,:] = basalPriorToMidnight.loc[indexToDrop,:]
+            newBasalPriorToMidnight["duration"] = newDuration
+
+            # new basal at midnight
+            newBasalAtMidnight = df.copy().drop(index=df.index)
+            newBasalAtMidnight.loc[1,:] = basalPriorToMidnight.loc[indexToDrop,:]
+            newBasalAtMidnight["duration"] = newMidnightDuration
+            newBasalAtMidnight[timeCol] = midnight.to_pydatetime()
+            newBasalAtMidnight[dayCol] = newBasalAtMidnight[timeCol].dt.date
+
+            # add data back to the basal data frame
+            newRowsToAdd = pd.concat([newBasalPriorToMidnight, newBasalAtMidnight], ignore_index = True)
+            newRowsToAdd = newRowsToAdd.astype({"rate": "float64",
+                                                "duration": "float64"})
+            df = df.drop(indexToDrop)
+            df = pd.concat([df, newRowsToAdd], ignore_index=True)
+
+    return df
+
+
+def get_basalEvent_summary(df, categories):
+    catDF = df[df["type"] == "basal"].groupby(categories)
+    summaryDF = pd.DataFrame(catDF["rate"].count()).add_suffix(".count")
+    summaryDF["basalRate.min"] = catDF["rate"].min()
+    summaryDF["basalRate.weightedMean"] = catDF["totalAmountOfBasalInsulin"].sum() / catDF["durationHours"].sum()
+    summaryDF["basalRate.max"] = catDF["rate"].max()
+
+    # max basal rate including extended boluses
+    catDF = df.groupby(categories)
+    summaryDF["basalRateIncludingExtendedBoluses.count"] = catDF["rate"].count()
+    summaryDF["basalRateIncludingExtendedBoluses.max"] = catDF["rate"].max()
+
+    return summaryDF
+
+
+def get_bolusEvent_summary(df, categories):
+
+    catDF = df.groupby(categories)
+    summaryDF = pd.DataFrame(catDF["unitsInsulin"].describe().add_prefix("insulin."))
+
+    # carbs entered in bolus calculator
+    carbEvents = catDF["carbInput"].describe().add_prefix("carbsPerMeal.")
+    summaryDF = pd.concat([summaryDF, carbEvents], axis=1)
+
+    return summaryDF
+
+
+def get_dayData_summary(df, categories):
+
+    catDF = df[df["validPumpData"]].groupby(categories)
+    summaryDF = pd.DataFrame(catDF["totalAmountOfInsulin"].describe().add_prefix("totalDailyDose."))
+    totalDailyCarbs = catDF["totalDailyCarbs"].describe().add_prefix("totalDailyCarbs.")
+    percentBasal = catDF["percentBasal"].describe().add_prefix("percentBasal.")
+    percentBolus = catDF["percentBolus"].describe().add_prefix("percentBolus.")
+    summaryDF = pd.concat([summaryDF, totalDailyCarbs, percentBasal, percentBolus], axis=1)
+
+    return summaryDF
+
+
+def get_pumpSummary(basalEventsDF, bolusEventsDF, dayDataDF, categories):
+    basalEventSummary = get_basalEvent_summary(basalEventsDF, categories)
+    bolusEventSummary = get_bolusEvent_summary(bolusEventsDF, categories)
+    dailySummary = get_dayData_summary(dayDataDF, categories)
+    pumpSummaryDF = pd.concat([basalEventSummary, bolusEventSummary, dailySummary], axis=1)
+
+    return pumpSummaryDF
+
 
 # %% DELELET LATER
-#args.startIndex = 96
+args.startIndex = 0
+args.endIndex = 4226
 
 
 # %% START OF CODE
@@ -608,7 +698,7 @@ def isf_likely_units(df, columnHeading):
     metadata = pd.DataFrame(index=[dIndex])
     metadata["hashID"] = hashID
 
-    try:
+    if 1 == 1:  # try:
         # make folder to save data
         processedDataPath = os.path.join(phiOutputPath, "PHI-" + userID)
         if not os.path.exists(processedDataPath):
@@ -720,9 +810,12 @@ def isf_likely_units(df, columnHeading):
                                 bolus["isf_mmolL_U"]  = mgdL_to_mmolL(bolus["isf"])
 
 
-                            bolusCH = ["utcTime", "timezone", "roundedTime", "normal", "carbInput", "subType",
-                                            "insulinOnBoard", "bgInput",
-                                            "isf", "isf_mmolL_U", "insulinCarbRatio"]
+                            bolusCH = ["hashID", "age", "ylw", "day",
+                                       "utcTime", "localTime", "timezone", "tzo",
+                                       "roundedTime", "roundedLocalTime",
+                                       "normal", "carbInput", "subType",
+                                       "insulinOnBoard", "bgInput",
+                                       "isf", "isf_mmolL_U", "insulinCarbRatio"]
                             bolusEvents = bolus.loc[bolus["normal"].notnull(), bolusCH]
                             bolusEvents.loc[bolusEvents["bgInput"] == 0, "bgInput"] = np.nan
                             bolusEvents = bolusEvents.rename(columns={"normal": "unitsInsulin",
@@ -735,7 +828,12 @@ def isf_likely_units(df, columnHeading):
                                 bolus["duration"].replace(0, np.nan, inplace=True)
                                 bolus["durationHours"] = bolus["duration"] / 1000.0 / 3600.0
                                 bolus["rate"] = bolus["extended"] / bolus["durationHours"]
-                                bolusExtendedCH = ["utcTime", "timezone", "roundedTime", "durationHours", "rate",  "type"]
+#                                bolusExtendedCH = ["localTime", "timezone", "roundedTime", "roundedLocalTime",
+#                                                   "durationHours", "rate",  "type"]
+                                bolusExtendedCH = ["hashID", "age", "ylw", "day",
+                                                   "utcTime", "localTime", "timezone", "tzo",
+                                                   "roundedTime", "roundedLocalTime",
+                                                   "durationHours", "rate", "type"]
                                 bolusExtendedEvents = bolus.loc[
                                         ((bolus["extended"].notnull()) &
                                          (bolus["duration"] > 0)), bolusExtendedCH]
@@ -1033,14 +1131,14 @@ def isf_likely_units(df, columnHeading):
                                         tempDF["day"] = pumpSettings.loc[p, "day"]
                                         tempDF["sbr.type"] = "regular"
                                         tempDF["sbr.localTime"] = pd.to_datetime(tempDF["day"]) + pd.to_timedelta(tempDF["start"], unit="ms")
-                                    endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["sbr.localTime"], index=[0])
-                                    tempDF = get_setting_durations(tempDF, "sbr", endOfDay)
-                                    tempDF = tempDF[:-1]
+                                        endOfDay = pd.DataFrame(pd.to_datetime(pumpSettings.loc[p, "day"] + pd.Timedelta(1, "D")), columns=["sbr.localTime"], index=[0])
+                                        tempDF = get_setting_durations(tempDF, "sbr", endOfDay)
+                                        tempDF = tempDF[:-1]
 
-                                    tempDaySummary = pd.DataFrame(index=[0])
-                                    tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
-                                    tempDaySummary["sbr.min"] = tempDF["rate"].min()
-                                    tempDaySummary["sbr.weightedMean"] = \
+                                        tempDaySummary = pd.DataFrame(index=[0])
+                                        tempDaySummary["day"] = tempDF["sbr.localTime"].dt.date
+                                        tempDaySummary["sbr.min"] = tempDF["rate"].min()
+                                        tempDaySummary["sbr.weightedMean"] = \
                                             np.sum(tempDF["rate"] * tempDF["sbr.durationHours"]) / tempDF["sbr.durationHours"].sum()
                                         tempDaySummary["sbr.max"] = tempDF["rate"].max()
                                         tempDaySummary["sbr.type"] = "regular"
@@ -1107,6 +1205,9 @@ def isf_likely_units(df, columnHeading):
                                 removeDuplicates(basal, ["deliveryType", "deviceTime", "duration", "rate"])
                             metadata["basal.nBasalDuplicatesRemoved"] = nBasalDuplicatesRemoved
 
+                            # deal with case when basal extends past midnight due to utcTime and localTime difference
+                            basal = correct_basal_extends_past_midnight(basal, "localTime", "day")
+
                             # fill NaNs with 0, as it indicates a suspend (temp basal of 0)
                             basal.rate.fillna(0, inplace=True)
 
@@ -1125,14 +1226,18 @@ def isf_likely_units(df, columnHeading):
                             basal["totalAmountOfBasalInsulin"] = basal["durationHours"] * basal["rate"]
 
                             # actual basal delivered
-                            abrColHeadings = ["utcTime", "timezone", "roundedTime", "durationHours", "rate", "type"]
-                            abr = basal[abrColHeadings]
+                            basalEventsColHeadings = ["hashID", "age", "ylw", "day",
+                                                      "utcTime", "localTime", "timezone", "tzo",
+                                                      "roundedTime", "roundedLocalTime",
+                                                      "durationHours", "rate", "type"]
+                            basalEvents = basal[basalEventsColHeadings]
                             if "duration" in list(bolus):
-                                abr = pd.concat([abr, bolusExtendedEvents], ignore_index=True)
-                                abr.sort_values("utcTime", inplace=True)
+                                basalEvents = pd.concat([basalEvents, bolusExtendedEvents], ignore_index=True)
+                                basalEvents.sort_values("localTime", inplace=True)
 
-                            abr["timezone"].fillna(method='ffill', inplace=True)
-                            abr["timezone"].fillna(method='bfill', inplace=True)
+                            basalEvents["timezone"].fillna(method='ffill', inplace=True)
+                            basalEvents["timezone"].fillna(method='bfill', inplace=True)
+                            basalEvents["totalAmountOfBasalInsulin"] = basalEvents["rate"] * basalEvents["durationHours"]
 
                             # get a summary of basals per day
                             basalDaySummary = get_basalDaySummary(basal)
@@ -1145,6 +1250,7 @@ def isf_likely_units(df, columnHeading):
                             isClosedLoopDay, is670g, metadata = \
                                 getClosedLoopDays(groupedData, 30, metadata)
 
+
                             # %% CGM DATA
                             # filter by cgm and sort by uploadTime
                             cgmData = groupedData.get_group("cbg").dropna(axis=1, how="all")
@@ -1190,7 +1296,10 @@ def isf_likely_units(df, columnHeading):
                             cgmRecordsPerDay["date"] = cgmRecordsPerDay.index
 
                             # filter the cgm data
-                            cgmColHeadings = ["utcTime", "timezone", "roundedTime", "value"]
+                            cgmColHeadings = ["hashID", "age", "ylw", "day",
+                                              "utcTime", "localTime",
+                                              "timezone", "tzo",
+                                              "roundedTime", "roundedLocalTime", "value"]
 
                             # get data in mg/dL units
                             cgm = cgmData[cgmColHeadings]
@@ -1218,12 +1327,39 @@ def isf_likely_units(df, columnHeading):
                             metadata["day.endDate"] = dayEndDate
                             rng = pd.date_range(dayBeginDate, dayEndDate).date
                             dayData = pd.DataFrame(rng, columns=["day"])
+
                             for dfType in [dataPerDay, basalDaySummary, bolusDaySummary, cgmRecordsPerDay]:
                                 dayData = pd.merge(dayData, dfType.reset_index(), on="day", how="left")
+
                             for dfType in [isClosedLoopDay, is670g]:
                                 dayData = pd.merge(dayData, dfType, on="day", how="left")
 
-                            dayData["validPumpData"] = dayData["numberOfNormalBoluses"] > 3
+                            # calculate the total amount of daily insulin
+                            dayData["totalAmountOfInsulin"] = (
+                                    dayData["totalAmountOfBasalInsulin"] +
+                                    dayData["totalAmountOfBolusInsulin"]
+                                    )
+
+                            # calculate the percent bolus and percent basal
+                            dayData["percentBasal"] = (
+                                    dayData["totalAmountOfBasalInsulin"] /
+                                    dayData["totalAmountOfInsulin"]
+                                    )
+
+                            dayData["percentBolus"] = (
+                                    dayData["totalAmountOfBolusInsulin"] /
+                                    dayData["totalAmountOfInsulin"]
+                                    )
+
+                            # total daily carbs
+                            totalDailyCarbs = pd.DataFrame(bolusEvents.groupby("day").carbInput.sum())
+                            totalDailyCarbs.reset_index(inplace=True)
+                            totalDailyCarbs.rename(columns={"carbInput": "totalDailyCarbs"}, inplace=True)
+                            dayData = pd.merge(dayData, totalDailyCarbs, how="left", on="day")
+
+                            # valid pump should be having exactly 24 hours of basal rate
+                            dayData["validPumpData"] = dayData["totalBasalDuration"] == 24
+                            dayData["atLeast3Boluses"] = dayData["numberOfNormalBoluses"] >= 3
 
                             dayData["validCGMData"] = \
                                 ((dayData["cgm.count"] > (288*.75)) |
@@ -1392,18 +1528,14 @@ def isf_likely_units(df, columnHeading):
 
 
                             # %% calculate local time
-                            abr["date"] = pd.to_datetime(abr["utcTime"].dt.date)
-
-                            abr = pd.merge(abr, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
-                            abr["localTime"] = abr["utcTime"] + pd.to_timedelta(abr["tzo"], unit="m")
+                            basalEvents["day"] = basalEvents["localTime"].dt.date
+                            basalEvents = pd.merge(basalEvents, dayData[["day", "isDSTChangeDay"]], how="left", on="day")
 
-                            cgm["date"] = pd.to_datetime(cgm["utcTime"].dt.date)
-                            cgm = pd.merge(cgm, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
-                            cgm["localTime"] = cgm["utcTime"] + pd.to_timedelta(cgm["tzo"], unit="m")
+                            cgm["day"] = cgm["localTime"].dt.date
+                            cgm = pd.merge(cgm, dayData[["day", "isDSTChangeDay"]], how="left", on="day")
 
-                            bolusEvents["date"] = pd.to_datetime(bolusEvents["utcTime"].dt.date)
-                            bolusEvents = pd.merge(bolusEvents, dayData[["date", "tzo", "isDSTChangeDay"]], how="left", on="date")
-                            bolusEvents["localTime"] = bolusEvents["utcTime"] + pd.to_timedelta(bolusEvents["tzo"], unit="m")
+                            bolusEvents["day"] = bolusEvents["localTime"].dt.date
+                            bolusEvents = pd.merge(bolusEvents, dayData[["day", "isDSTChangeDay"]], how="left", on="day")
 
 
                             # %% STATS PER EACH TYPE, OVERALL AND PER EACH AGE & YLW (MIN, PERCENTILES, MAX, MEAN, SD, IQR, COV)
@@ -1428,149 +1560,51 @@ def isf_likely_units(df, columnHeading):
                                                      roundedTimeFieldName="localRoundedTime",
                                                      startWithFirstRecord=True, verbose=False)
 
-                            colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
+                            allSettings["day"] = allSettings["localTime"].dt.date
+                            allSettings = pd.merge(allSettings, dayData[["day", "isDSTChangeDay"]], how="left", on="day")
+
+
+                            colOrder = ["hashID", "age", "ylw", "day", "isDSTChangeDay",
+                                        "localTime", "localRoundedTime",
                                         "isf", "cir", "sbr", "deviceId",
                                         "ct.low", "ct.high", "ct.target", "ct.range",
                                         "sbr.type", "isf_mmolL_U"]
                             allSettings = allSettings[colOrder]
 
 
-                            fieldsToDrop = ["utcTime", "timezone", "roundedTime", "date", "tzo", "isDSTChangeDay"]
-                            pumpEvents = pd.merge(abr.drop(columns=fieldsToDrop),
-                                                  bolusEvents.drop(columns=fieldsToDrop),
-                                                  how="outer", on="localTime")
-                            pumpEvents["type"].fillna("bolus", inplace=True)
-                            pumpEvents["eventType"].fillna("basal", inplace=True)
-                            pumpEvents["hashID"] = hashID
-                            pumpEvents["age"] = np.floor((pumpEvents["localTime"] - bDate).dt.days/365.25).astype(int)
-                            pumpEvents["ylw"] = np.floor((pumpEvents["localTime"] - dDate).dt.days/365.25).astype(int)
-                            pumpEvents = round_time(pumpEvents, timeIntervalMinutes=5,
-                                                    timeField="localTime",
-                                                    roundedTimeFieldName="localRoundedTime",
-                                                    startWithFirstRecord=True, verbose=False)
-
-
-                            colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
-                                        "rate", "durationHours",
-                                        "unitsInsulin", "carbInput", "type", "eventType", "subType",
-                                        "isf", "isf_mmolL_U", "insulinCarbRatio", "insulinOnBoard",
-                                        "bg_mgdL", "bg_mmolL"]
-
-                            pumpEvents = pumpEvents[colOrder]
-
-                            cgmLite = cgm.drop(columns=fieldsToDrop)
-                            cgmLite["hashID"] = hashID
-                            cgmLite["age"] = np.floor((cgmLite["localTime"] - bDate).dt.days/365.25).astype(int)
-                            cgmLite["ylw"] = np.floor((cgmLite["localTime"] - dDate).dt.days/365.25).astype(int)
-                            cgmLite = round_time(cgmLite, timeIntervalMinutes=5,
-                                                 timeField="localTime",
-                                                 roundedTimeFieldName="localRoundedTime",
-                                                 startWithFirstRecord=True, verbose=False)
-
-                            colOrder = ["hashID", "age", "ylw", "localTime", "localRoundedTime",
-                                        "mg_dL", "mmol_L"]
-
-                            cgmLite = cgmLite[colOrder]
-
-
-                            # %% SAVE RESULTS
-
-                            # age and ylw stats
-                            pumpEvents["rateTimesDurationHours"] = pumpEvents["rate"] * pumpEvents["durationHours"]
-                            pumpEvents.rename(columns={"rate":"basalRate"}, inplace=True)
-                            catDF = pumpEvents.groupby("age")
-
-                            # actual basal rates
-                            agePump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
-                            agePump["basalRate.min"] = catDF["basalRate"].min()
-                            agePump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
-                            agePump["basalRate.max"] = catDF["basalRate"].max()
-
-                            # insulin events
-                            insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
-                            agePump = pd.concat([agePump, insulinEvents], axis=1)
-
-                            # carbs entered in bolus calculator
-                            carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
-                            agePump = pd.concat([agePump, carbEvents], axis=1)
-
-                            # very low level cgm stats per age
-                            catDF = cgmLite.groupby("age")
-                            cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
-                            agePumpCgm = pd.concat([agePump, cgmStats], axis=1)
-
-                            agePumpCgm.reset_index(inplace=True)
-
-                            ageSummary = pd.merge(ageSummary, agePumpCgm, on="age", how="left")
-                            ageSummary["hashID"] = hashID
-                            allAgeSummaries = pd.concat([allAgeSummaries, ageSummary], ignore_index=True)
-
-                            allAgeSummaries.to_csv(os.path.join(outputPath,
-                                "allAgeSummaries-dIndex-" + str(startIndex) + ".csv"))
-
-                            # repoeat for years living with
-                            catDF = pumpEvents.groupby("ylw")
-                            # actual basal rates
-                            ylwPump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
-                            ylwPump["basalRate.min"] = catDF["basalRate"].min()
-                            ylwPump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
-                            ylwPump["basalRate.max"] = catDF["basalRate"].max()
-
-                            # insulin events
-                            insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
-                            ylwPump = pd.concat([ylwPump, insulinEvents], axis=1)
-
-                            # carbs entered in bolus calculator
-                            carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
-                            ylwPump = pd.concat([ylwPump, carbEvents], axis=1)
-
-                            # very low level cgm stats per age
-                            catDF = cgmLite.groupby("ylw")
-                            cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
-                            ylwPumpCgm = pd.concat([ylwPump, cgmStats], axis=1)
-
-                            ylwPumpCgm.reset_index(inplace=True)
-
-                            ylwSummary = pd.merge(ylwSummary, ylwPumpCgm, on="ylw", how="left")
-
-                            ylwSummary["hashID"] = hashID
-                            allYlwSummaries = pd.concat([allYlwSummaries, ylwSummary], ignore_index=True)
-
-                            allYlwSummaries.to_csv(os.path.join(outputPath,
-                                "allYlwSummaries-dIndex-" + str(startIndex) + ".csv"))
-
-
-                            # repoeat for agne AND years living with
-                            catDF = pumpEvents.groupby(["age", "ylw"])
-                            # actual basal rates
-                            ageANDylwPump = pd.DataFrame(catDF["basalRate"].count()).add_suffix(".count")
-                            ageANDylwPump["basalRate.min"] = catDF["basalRate"].min()
-                            ageANDylwPump["basalRate.weightedMean"] = catDF["rateTimesDurationHours"].sum() / catDF["durationHours"].sum()
-                            ageANDylwPump["basalRate.max"] = catDF["basalRate"].max()
-
-                            # insulin events
-                            insulinEvents = catDF["unitsInsulin"].describe().add_prefix("insulin.")
-                            ageANDylwPump = pd.concat([ageANDylwPump, insulinEvents], axis=1)
-
-                            # carbs entered in bolus calculator
-                            carbEvents = catDF["carbInput"].describe().add_prefix("carb.")
-                            ageANDylwPump = pd.concat([ageANDylwPump, carbEvents], axis=1)
-
-                            # very low level cgm stats per age
-                            catDF = cgmLite.groupby(["age", "ylw"])
-                            cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
-                            ageANDylwPumpCgm = pd.concat([ageANDylwPump, cgmStats], axis=1)
-
-                            ageANDylwSummary = ageANDylwSummary.join(ageANDylwPumpCgm, how="left")
-
-                            ageANDylwPumpCgm.reset_index(inplace=True)
-                            ageANDylwSummary.reset_index(inplace=True)
+                            # %% GET AND SAVE RESULTS BY AGE AND YLW
+                            for category in ["age", "ylw", ["age", "ylw"]]:
+                                pumpSummary = get_pumpSummary(basalEvents, bolusEvents, dayData, category)
+
+                                # very low level cgm stats per age
+                                catDF = cgm.groupby(category)
+                                cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
+                                pumpCgmSummary = pd.concat([pumpSummary, cgmStats], axis=1)
+
+                                if category == "age":
+                                    pumpCgmSummary.reset_index(inplace=True)
+                                    ageSummary = pd.merge(ageSummary, pumpCgmSummary, on=category, how="left")
+                                    ageSummary["hashID"] = hashID
+                                    allAgeSummaries = pd.concat([allAgeSummaries, ageSummary], ignore_index=True)
+                                    allAgeSummaries.to_csv(os.path.join(outputPath,
+                                        "allAgeSummaries-dIndex-" + str(startIndex) + ".csv"))
+                                elif category == "ylw":
+                                    pumpCgmSummary.reset_index(inplace=True)
+                                    ylwSummary = pd.merge(ylwSummary, pumpCgmSummary, on=category, how="left")
+                                    ylwSummary["hashID"] = hashID
+                                    allYlwSummaries = pd.concat([allYlwSummaries, ylwSummary], ignore_index=True)
+                                    allYlwSummaries.to_csv(os.path.join(outputPath,
+                                        "allYlwSummaries-dIndex-" + str(startIndex) + ".csv"))
+                                else:
 
-                            ageANDylwSummary["hashID"] = hashID
-                            allAgeANDylwSummaries = pd.concat([allAgeANDylwSummaries, ageANDylwSummary], ignore_index=True)
+                                    ageANDylwSummary = ageANDylwSummary.join(pumpCgmSummary, how="left")
+                                    pumpCgmSummary.reset_index(inplace=True)
+                                    pumpCgmSummary.reset_index(inplace=True)
+                                    pumpCgmSummary["hashID"] = hashID
+                                    allAgeANDylwSummaries = pd.concat([allAgeANDylwSummaries, pumpCgmSummary], ignore_index=True)
 
-                            allAgeANDylwSummaries.to_csv(os.path.join(outputPath,
-                                "allAgeANDylwSummaries-dIndex-" + str(startIndex) + ".csv"))
+                                    allAgeANDylwSummaries.to_csv(os.path.join(outputPath,
+                                        "allAgeANDylwSummaries-dIndex-" + str(startIndex) + ".csv"))
 
 
                             # %% save data for this person
@@ -1578,9 +1612,9 @@ def isf_likely_units(df, columnHeading):
                                 outputString = "age-%s-%s-ylw-%s-%s-lp-%s-670g-%s-id-%s"
                                 outputFormat = (f"{int(minAge):02d}",
                                                 f"{int(maxAge):02d}",
-                                            f"{int(minYLW):02d}",
-                                            f"{int(maxYLW):02d}",
-                                            f"{int(nDaysClosedLoop):03d}",
+                                                f"{int(minYLW):02d}",
+                                                f"{int(maxYLW):02d}",
+                                                f"{int(nDaysClosedLoop):03d}",
                                                 f"{int(n670gDays):03d}",
                                                 hashID[0:4])
                                 outputFolderName = outputString % outputFormat
@@ -1592,19 +1626,27 @@ def isf_likely_units(df, columnHeading):
                                 os.makedirs(outputFolderName_Path)
 
                             fName = outputFolderName + "-allSettings.csv"
-                            allSettings.to_csv(os.path.join(outputFolderName_Path, fName))
-                            fName = outputFolderName + "-pumpEvents.csv"
-                            pumpEvents.to_csv(os.path.join(outputFolderName_Path, fName))
-                            fName = outputFolderName + "-cgmLite.csv"
-                            cgmLite.to_csv(os.path.join(outputFolderName_Path, fName))
+                            allSettingsMinusPumpSerial = allSettings.copy().drop(columns=["deviceId"])
+                            allSettingsMinusPumpSerial.to_csv(os.path.join(outputFolderName_Path, fName))
+                            fName = outputFolderName + "-dayData.csv"
+                            dayDataMinusPumpSerial = dayData.copy().drop(columns=["deviceId"])
+                            dayDataMinusPumpSerial.to_csv(os.path.join(outputFolderName_Path, fName))
+                            fName = outputFolderName + "-basalEvents.csv"
+                            basalEvents.to_csv(os.path.join(outputFolderName_Path, fName))
+                            fName = outputFolderName + "-bolusEvents.csv"
+                            bolusEvents.to_csv(os.path.join(outputFolderName_Path, fName))
+                            fName = outputFolderName + "-cgm.csv"
+                            cgm.to_csv(os.path.join(outputFolderName_Path, fName))
 
 
                             # %% save the processed data (saving this data will take up a lot of space and time)
-                            data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
-                            basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
-                            bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
-                            cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
-                            pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
+#                            data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
+#                            basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
+#                            bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
+#                            cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
+#                            pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
+#                            allSettings.to_csv(os.path.join(processedDataPath, "allSettings-PHI-" + userID + ".csv"))
+#                            dayData.to_csv(os.path.join(processedDataPath, "dayData-PHI-" + userID + ".csv"))
 
                         else:
                             metadata["flags"] = "no bolus wizard data"
@@ -1617,9 +1659,9 @@ def isf_likely_units(df, columnHeading):
         else:
             metadata["flags"] = "missing bDay/dDay"
 
-    except:
-        print("something is broke dIndex=", dIndex)
-        metadata["flags"] = "something is broke"
+#    except:
+#        print("something is broke dIndex=", dIndex)
+#        metadata["flags"] = "something is broke"
 
 
     # write metaData to allMetadata

From 99c0de78c890cc9c1c45079488989707c70e8230 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 29 Jan 2019 18:24:26 -0600
Subject: [PATCH 62/78] add cgm and episode stats

---
 .../get-users-settings-and-events.py          | 193 ++++++++++++++++--
 1 file changed, 178 insertions(+), 15 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 21e616b2..fb523d18 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -657,8 +657,130 @@ def get_pumpSummary(basalEventsDF, bolusEventsDF, dayDataDF, categories):
     return pumpSummaryDF
 
 
+def get_episodes(df):
+    """Find hypo- and hyperglycemic episodes in a cgm DataFrame.
+
+    df must have "localTime", "roundedLocalTime" and "mg_dL" columns. For each
+    criterion an episode runs from the reading that crosses the threshold to
+    the reading that crosses back. It is kept when it lasts at least
+    "duration" minutes and holds at least "percentReadings" percent of the
+    readings expected at one reading every cgmFrequency minutes.
+    Returns one row per episode; raises ValueError on unpairable cross points.
+    """
+    df = df.copy().sort_values("localTime").reset_index(drop=True)
+    cgmFrequency = 5.0  # minutes between readings, hard-coded
+    episodeCriteria = pd.DataFrame({"threshold": [54, 70, 180, 250],
+                                    "duration": [15, 60, 120, 120],
+                                    "percentReadings": [75, 75, 75, 75],
+                                    "episodeName": ["extreme-hypo", "hypo",
+                                                    "hyper", "extreme-hyper"]})
+    allEpisodes = pd.DataFrame()
+    for episodeType in range(0, len(episodeCriteria)):
+        episodeThreshold = episodeCriteria.loc[episodeType, "threshold"]
+        episodeDurationRequirement = episodeCriteria.loc[episodeType, "duration"]
+        episodePercentOfReadings = episodeCriteria.loc[episodeType, "percentReadings"]
+        episodeName = episodeCriteria.loc[episodeType, "episodeName"]
+
+        # first find all of the cross points: thresholds above 110 start an
+        # episode when mg_dL rises above them, the others when it drops below
+        previousValue = df.mg_dL.shift(1)
+        if episodeThreshold > 110:
+            df["startCrossPoint"] = ((previousValue <= episodeThreshold) &
+                                     (df.mg_dL > episodeThreshold))
+            df["endCrossPoint"] = ((previousValue > episodeThreshold) &
+                                   (df.mg_dL <= episodeThreshold))
+        else:
+            df["startCrossPoint"] = ((previousValue >= episodeThreshold) &
+                                     (df.mg_dL < episodeThreshold))
+            df["endCrossPoint"] = ((previousValue < episodeThreshold) &
+                                   (df.mg_dL >= episodeThreshold))
+
+        startList = pd.DataFrame(df[df.startCrossPoint].roundedLocalTime)
+        endList = pd.DataFrame(df[df.endCrossPoint].roundedLocalTime)
+
+        # an unmatched start is closed at the latest reading and an unmatched
+        # end is opened at the earliest one (pd.concat replaces the deprecated
+        # DataFrame.append)
+        if len(startList) > len(endList):
+            isLast = df.roundedLocalTime == df.roundedLocalTime.max()
+            endList = pd.concat([endList, df.loc[isLast, ["roundedLocalTime"]]])
+        elif len(startList) < len(endList):
+            isFirst = df.roundedLocalTime == df.roundedLocalTime.min()
+            startList = pd.concat([startList, df.loc[isFirst, ["roundedLocalTime"]]]).sort_index()
+
+        if len(startList) != len(endList):
+            raise ValueError("cannot pair start and end cross points for " + episodeName)
+
+        episodes = pd.concat([startList.reset_index().add_prefix("start."),
+                              endList.reset_index().add_prefix("end.")], axis=1)
+        # total_seconds counts whole days too; .dt.seconds wraps at 24 hours
+        episodes["durationMinutes"] = \
+            (episodes["end.roundedLocalTime"] - episodes["start.roundedLocalTime"]).dt.total_seconds() / 60
+        episodes["totalPoints"] = episodes["end.index"] - episodes["start.index"]
+        episodes["totalPossiblePoints"] = episodes["durationMinutes"] / cgmFrequency
+        episodes["percentOfReadings"] = episodes["totalPoints"] / episodes["totalPossiblePoints"] * 100
+
+        episodes = episodes[(episodes.durationMinutes >= episodeDurationRequirement) &
+                            (episodes.percentOfReadings >= episodePercentOfReadings)].reset_index(drop=True)
+        episodes["criterion.name"] = episodeName
+        episodes["criterion.threshold"] = episodeThreshold
+        episodes["criterion.duration"] = episodeDurationRequirement
+        episodes["criterion.percentOfReadings"] = episodePercentOfReadings
+
+        allEpisodes = pd.concat([allEpisodes, episodes]).reset_index(drop=True)
+    # return only after every criterion has been processed
+    return allEpisodes
+
+
+def get_cgmStats(df):
+    """Summarize cgm readings in df (columns "mg_dL", "roundedLocalTime").
+
+    Returns a Series: describe() of mg_dL, counts and fractions of readings
+    per glucose range, first/last reading time and data completeness.
+    """
+    mgdL = df.mg_dL
+    statDF = pd.Series(mgdL.describe())
+    statDF.rename(index={"count": "totalNumberCBGValues"}, inplace=True)
+    nValues = statDF["totalNumberCBGValues"]
+
+    statDF["mean_mgdL"] = mgdL.mean()
+    statDF["std_mgdL"] = mgdL.std()
+    statDF["cov_mgdL"] = statDF["std_mgdL"] / statDF["mean_mgdL"]
+
+    statDF["totalBelow54"] = (mgdL < 54).sum()
+    statDF["totalBelow70"] = (mgdL < 70).sum()
+    statDF["total54to70"] = ((mgdL >= 54) & (mgdL < 70)).sum()
+    statDF["total70to140"] = ((mgdL >= 70) & (mgdL <= 140)).sum()
+    statDF["total70to180"] = ((mgdL >= 70) & (mgdL <= 180)).sum()
+    statDF["total180to250"] = ((mgdL > 180) & (mgdL <= 250)).sum()
+    statDF["totalAbove180"] = (mgdL > 180).sum()
+    statDF["totalAbove250"] = (mgdL > 250).sum()
+
+    # the "percent" entries are fractions of the readings (0 to 1)
+    for rangeName in ["Below54", "Below70", "70to140", "70to180", "Above180", "Above250"]:
+        statDF["percent" + rangeName] = statDF["total" + rangeName] / nValues
+
+    statDF["min_mgdL"] = mgdL.min()
+    statDF["median_mgdL"] = mgdL.median()
+    statDF["max_mgdL"] = mgdL.max()
+
+    # start and end time of the cbg data
+    startTime = df["roundedLocalTime"].min()
+    endTime = df["roundedLocalTime"].max()
+    statDF["startTime"] = startTime
+    statDF["endTime"] = endTime
+
+    # sense whether cgm data comes in 5 minute or 15 minute intervals;
+    # total_seconds keeps gaps of a day or more from wrapping to under 24 hours
+    cgmFrequency = np.nanmedian(df["roundedLocalTime"].diff().dt.total_seconds() / 60)
+    statDF["cgmFrequency"] = cgmFrequency
+    statDF["totalNumberPossibleCBGvalues"] = len(pd.date_range(startTime, endTime, freq=str(int(cgmFrequency)) + "min"))
+    statDF["percentCgmValues"] = nValues / statDF["totalNumberPossibleCBGvalues"]
+    return statDF
+
+
 # %% DELELET LATER
-args.startIndex = 0
+args.startIndex = 46
 args.endIndex = 4226
 
 
@@ -1357,7 +1479,34 @@ def get_pumpSummary(basalEventsDF, bolusEventsDF, dayDataDF, categories):
                             totalDailyCarbs.rename(columns={"carbInput": "totalDailyCarbs"}, inplace=True)
                             dayData = pd.merge(dayData, totalDailyCarbs, how="left", on="day")
 
-                            # valid pump should be having exactly 24 hours of basal rate
+                            # get daily cgm stats
+                            cgm.sort_values("localTime", inplace=True)
+                            cgmCountsPerDay = cgm.groupby("day")["mg_dL"].count().reset_index()
+                            cgmCountsPerDay.rename(columns={"mg_dL":"cgmCountsPerDay"}, inplace=True)
+                            cgm = pd.merge(cgm, cgmCountsPerDay, how="left", on="day")
+
+                            cgmStats = cgm[cgm["cgmCountsPerDay"] > 1].groupby("day").apply(get_cgmStats)
+                            # fix start and end times (not sure why the get transformed to ints)
+                            cgmStats["startTime"] = pd.to_datetime(cgmStats["startTime"])
+                            cgmStats["endTime"] = pd.to_datetime(cgmStats["endTime"])
+
+                            cgmStats = cgmStats.add_prefix("cgm.")
+                            cgmStats.reset_index(inplace=True)
+                            dayData = pd.merge(dayData, cgmStats, how="left", on="day")
+
+                            # %% get all episodes
+                            allEpisodes = get_episodes(cgm)
+                            allEpisodes["day"] = allEpisodes["start.roundedLocalTime"].dt.date
+                            allEpisodes = pd.merge(allEpisodes, dayData[["age", "ylw", "day"]], how="left", on="day")
+
+                            for episodeType in allEpisodes["criterion.name"].unique():
+                                episodeGroup = allEpisodes[allEpisodes["criterion.name"] == episodeType].groupby(["day"])
+                                episodeDaySummary = episodeGroup["durationMinutes"].describe().add_prefix(episodeType + "-durationMinutes.")
+                                episodeDaySummary.rename(columns={episodeType + "-durationMinutes.count": episodeType + ".count"}, inplace=True)
+                                episodeDaySummary.reset_index(inplace=True)
+                                dayData = pd.merge(dayData, episodeDaySummary, how="left", on="day")
+
+                            # %% valid pump should be having exactly 24 hours of basal rate
                             dayData["validPumpData"] = dayData["totalBasalDuration"] == 24
                             dayData["atLeast3Boluses"] = dayData["numberOfNormalBoluses"] >= 3
 
@@ -1576,23 +1725,35 @@ def get_pumpSummary(basalEventsDF, bolusEventsDF, dayDataDF, categories):
                             for category in ["age", "ylw", ["age", "ylw"]]:
                                 pumpSummary = get_pumpSummary(basalEvents, bolusEvents, dayData, category)
 
-                                # very low level cgm stats per age
-                                catDF = cgm.groupby(category)
-                                cgmStats = catDF["mg_dL"].describe().add_prefix("cgm.")
+                                # cgm stats per category
+                                catDF = cgm[cgm["cgmCountsPerDay"] > 1].groupby(category)
+                                cgmStats = catDF.apply(get_cgmStats)
+                                # fix start and end times (not sure why the get transformed to ints)
+                                cgmStats["startTime"] = pd.to_datetime(cgmStats["startTime"])
+                                cgmStats["endTime"] = pd.to_datetime(cgmStats["endTime"])
+
+                                cgmStats = cgmStats.add_prefix("cgm.")
                                 pumpCgmSummary = pd.concat([pumpSummary, cgmStats], axis=1)
 
+                                # get all episodes
+                                for episodeType in allEpisodes["criterion.name"].unique():
+                                    episodeGroup = allEpisodes[allEpisodes["criterion.name"] == episodeType].groupby(category)
+                                    episodeDaySummary = episodeGroup["durationMinutes"].describe().add_prefix(episodeType + "-durationMinutes.")
+                                    episodeDaySummary.rename(columns={episodeType + "-durationMinutes.count": episodeType + ".count"}, inplace=True)
+                                    pumpCgmSummary = pd.concat([pumpCgmSummary, episodeDaySummary], axis=1)
+
                                 if category == "age":
                                     pumpCgmSummary.reset_index(inplace=True)
                                     ageSummary = pd.merge(ageSummary, pumpCgmSummary, on=category, how="left")
                                     ageSummary["hashID"] = hashID
-                                    allAgeSummaries = pd.concat([allAgeSummaries, ageSummary], ignore_index=True)
+                                    allAgeSummaries = pd.concat([allAgeSummaries, ageSummary], ignore_index=True, sort=False)
                                     allAgeSummaries.to_csv(os.path.join(outputPath,
                                         "allAgeSummaries-dIndex-" + str(startIndex) + ".csv"))
                                 elif category == "ylw":
                                     pumpCgmSummary.reset_index(inplace=True)
                                     ylwSummary = pd.merge(ylwSummary, pumpCgmSummary, on=category, how="left")
                                     ylwSummary["hashID"] = hashID
-                                    allYlwSummaries = pd.concat([allYlwSummaries, ylwSummary], ignore_index=True)
+                                    allYlwSummaries = pd.concat([allYlwSummaries, ylwSummary], ignore_index=True, sort=False)
                                     allYlwSummaries.to_csv(os.path.join(outputPath,
                                         "allYlwSummaries-dIndex-" + str(startIndex) + ".csv"))
                                 else:
@@ -1601,7 +1762,7 @@ def get_pumpSummary(basalEventsDF, bolusEventsDF, dayDataDF, categories):
                                     pumpCgmSummary.reset_index(inplace=True)
                                     pumpCgmSummary.reset_index(inplace=True)
                                     pumpCgmSummary["hashID"] = hashID
-                                    allAgeANDylwSummaries = pd.concat([allAgeANDylwSummaries, pumpCgmSummary], ignore_index=True)
+                                    allAgeANDylwSummaries = pd.concat([allAgeANDylwSummaries, pumpCgmSummary], ignore_index=True, sort=False)
 
                                     allAgeANDylwSummaries.to_csv(os.path.join(outputPath,
                                         "allAgeANDylwSummaries-dIndex-" + str(startIndex) + ".csv"))
@@ -1637,16 +1798,18 @@ def get_pumpSummary(basalEventsDF, bolusEventsDF, dayDataDF, categories):
                             bolusEvents.to_csv(os.path.join(outputFolderName_Path, fName))
                             fName = outputFolderName + "-cgm.csv"
                             cgm.to_csv(os.path.join(outputFolderName_Path, fName))
+                            fName = outputFolderName + "-allEpisodes.csv"
+                            allEpisodes.to_csv(os.path.join(outputFolderName_Path, fName))
 
 
                             # %% save the processed data (saving this data will take up a lot of space and time)
-#                            data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
-#                            basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
-#                            bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
-#                            cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
-#                            pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
-#                            allSettings.to_csv(os.path.join(processedDataPath, "allSettings-PHI-" + userID + ".csv"))
-#                            dayData.to_csv(os.path.join(processedDataPath, "dayData-PHI-" + userID + ".csv"))
+                            data.to_csv(os.path.join(processedDataPath, "allDataCleaned-PHI-" + userID + ".csv"))
+                            basal.to_csv(os.path.join(processedDataPath, "basal-PHI-" + userID + ".csv"))
+                            bolus.to_csv(os.path.join(processedDataPath, "bolus-PHI-" + userID + ".csv"))
+                            cgmData.to_csv(os.path.join(processedDataPath, "cgm-PHI-" + userID + ".csv"))
+                            pumpSettings.to_csv(os.path.join(processedDataPath, "pumpSettings-PHI-" + userID + ".csv"))
+                            allSettings.to_csv(os.path.join(processedDataPath, "allSettings-PHI-" + userID + ".csv"))
+                            dayData.to_csv(os.path.join(processedDataPath, "dayData-PHI-" + userID + ".csv"))
 
                         else:
                             metadata["flags"] = "no bolus wizard data"

From 6423aeef340f644e6fd165c62adc275a01b08f3a Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 29 Jan 2019 18:46:08 -0600
Subject: [PATCH 63/78] fix return typo

---
 projects/predict-simulate/get-users-settings-and-events.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index fb523d18..357c7180 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -729,7 +729,7 @@ def get_episodes(df):
 
         allEpisodes = pd.concat([allEpisodes, episodes]).reset_index(drop=True)
 
-        return allEpisodes
+    return allEpisodes
 
 
 def get_cgmStats(df):
@@ -780,7 +780,7 @@ def get_cgmStats(df):
 
 
 # %% DELELET LATER
-args.startIndex = 46
+args.startIndex = 0
 args.endIndex = 4226
 
 

From 6934f7c3f085f3ebfeabc7decb76bf56df5df5af Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 29 Jan 2019 19:26:58 -0600
Subject: [PATCH 64/78] turn the try-except back on for run on AWS

---
 .../get-users-settings-and-events.py                 | 12 +++++-------
 1 file changed, 5 insertions(+), 7 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 357c7180..892d0e37 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -820,7 +820,7 @@ def get_cgmStats(df):
     metadata = pd.DataFrame(index=[dIndex])
     metadata["hashID"] = hashID
 
-    if 1 == 1:  # try:
+    try:
         # make folder to save data
         processedDataPath = os.path.join(phiOutputPath, "PHI-" + userID)
         if not os.path.exists(processedDataPath):
@@ -834,7 +834,6 @@ def get_cgmStats(df):
 
 
             # %% LOAD IN DONOR JSON DATA
-
             jsonDataPath = os.path.join(donorPath, phiDate + "-donorJsonData")
             jsonFileName = os.path.join(jsonDataPath, "PHI-" + userID + ".json")
 
@@ -950,8 +949,7 @@ def get_cgmStats(df):
                                 bolus["duration"].replace(0, np.nan, inplace=True)
                                 bolus["durationHours"] = bolus["duration"] / 1000.0 / 3600.0
                                 bolus["rate"] = bolus["extended"] / bolus["durationHours"]
-#                                bolusExtendedCH = ["localTime", "timezone", "roundedTime", "roundedLocalTime",
-#                                                   "durationHours", "rate",  "type"]
+
                                 bolusExtendedCH = ["hashID", "age", "ylw", "day",
                                                    "utcTime", "localTime", "timezone", "tzo",
                                                    "roundedTime", "roundedLocalTime",
@@ -1822,9 +1820,9 @@ def get_cgmStats(df):
         else:
             metadata["flags"] = "missing bDay/dDay"
 
-#    except:
-#        print("something is broke dIndex=", dIndex)
-#        metadata["flags"] = "something is broke"
+    except:
+        print("something is broke dIndex=", dIndex)
+        metadata["flags"] = "something is broke"
 
 
     # write metaData to allMetadata

From 0f8dd254793012cbad1f33414dc4db9bbcb31b25 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 29 Jan 2019 20:08:24 -0600
Subject: [PATCH 65/78] remove input argument bypass

---
 projects/predict-simulate/get-users-settings-and-events.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/predict-simulate/get-users-settings-and-events.py
index 892d0e37..10b2b7c6 100644
--- a/projects/predict-simulate/get-users-settings-and-events.py
+++ b/projects/predict-simulate/get-users-settings-and-events.py
@@ -779,11 +779,6 @@ def get_cgmStats(df):
     return statDF
 
 
-# %% DELELET LATER
-args.startIndex = 0
-args.endIndex = 4226
-
-
 # %% START OF CODE
 dataPulledDate = args.dateStamp
 dataPulledDF = pd.DataFrame(pd.to_datetime(dataPulledDate), columns=["day"], index=[0])

From 7d183b047278b13d387401242d02a8636b1a766a Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 31 Jan 2019 07:14:41 -0600
Subject: [PATCH 66/78] gather and combine files for analysis

---
 projects/predict-simulate/gather-data.py | 71 ++++++++++++++++++++++++
 1 file changed, 71 insertions(+)
 create mode 100644 projects/predict-simulate/gather-data.py

diff --git a/projects/predict-simulate/gather-data.py b/projects/predict-simulate/gather-data.py
new file mode 100644
index 00000000..b77bcafc
--- /dev/null
+++ b/projects/predict-simulate/gather-data.py
@@ -0,0 +1,71 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+description: gather the output from get users settings and events
+version: 0.0.1
+created: 2019-01-30
+author: Ed Nykaza
+dependencies:
+    *
+license: BSD-2-Clause
+"""
+
+# %% REQUIRED LIBRARIES
+import pandas as pd
+import datetime as dt
+import os
+import argparse
+import glob
+
+
+# %% USER INPUTS (ADD THIS IN LATER)
+codeDescription = "Get user's settings and events"
+parser = argparse.ArgumentParser(description=codeDescription)
+# NOTE: argparse %-formats help text, so a literal "%" must be written "%%"
+parser.add_argument("-d",
+                    "--dataPulledDate",
+                    dest="dataPulledDate",
+                    default="2018-09-28",
+                    help="date in '%%Y-%%m-%%d' format of unique donor list " +
+                    "(e.g., PHI-2018-03-02-uniqueDonorList)")
+
+parser.add_argument("-p",
+                    "--dataProcessedDate",
+                    dest="dataProcessedDate",
+                    default="2019-01-21",
+                    help="date in '%%Y-%%m-%%d' format")
+
+args = parser.parse_args()
+
+
+# %% START OF CODE
+dataPulledDate = args.dataPulledDate
+dataProcessedDate = pd.to_datetime(args.dataProcessedDate)
+
+phiDate = "PHI-" + dataPulledDate
+donorPath = os.path.join(
+        "..", "bigdata-processing-pipeline",
+        "data", phiDate + "-donor-data")
+
+outputPath = os.path.join(donorPath, "settings-and-events")
+
+for name in ["allMetadata", "allAgeANDylwSummaries",
+             "allAgeSummaries", "allYlwSummaries",
+             "dayData", "pumpEvents"]:
+    allDF = pd.DataFrame()
+    if name.startswith("all"):
+        files = glob.glob(os.path.join(outputPath, name + '*'))
+    else:
+        files = glob.glob(
+                os.path.join(outputPath, "data", "**", "*-" + name + ".csv"))
+    for f in files:
+        dateModified = \
+            pd.to_datetime(dt.datetime.fromtimestamp(os.path.getmtime(f)))
+        if dateModified > dataProcessedDate:
+            tempDF = pd.read_csv(f, low_memory=False)
+            tempDF.rename(
+                    columns={'Unnamed: 0': 'originalIndex'}, inplace=True)
+            tempDF["from"] = f
+            allDF = pd.concat([allDF, tempDF], ignore_index=True, sort=False)
+    allDF.to_csv(os.path.join(outputPath, "combined-" + name + ".csv"))
+    print("completed " + name)

From f78f128611afd2db1ed7386c8bc7ba1244be2e09 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 31 Jan 2019 08:10:53 -0600
Subject: [PATCH 67/78] pumpEvents no longer exists, change to basal and bolus
 Events

---
 projects/predict-simulate/gather-data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/predict-simulate/gather-data.py b/projects/predict-simulate/gather-data.py
index b77bcafc..b3c32465 100644
--- a/projects/predict-simulate/gather-data.py
+++ b/projects/predict-simulate/gather-data.py
@@ -51,7 +51,7 @@
 
 for name in ["allMetadata", "allAgeANDylwSummaries",
              "allAgeSummaries", "allYlwSummaries",
-             "dayData", "pumpEvents"]:
+             "basalEvents", "bolusEvents"]:
     allDF = pd.DataFrame()
     if name.startswith("all"):
         files = glob.glob(os.path.join(outputPath, name + '*'))

From ec8b7cb5f01c3b6aa6f82f1bd527f4c3505f5eb0 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 31 Jan 2019 08:14:21 -0600
Subject: [PATCH 68/78] add dayData back to output

---
 projects/predict-simulate/gather-data.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/projects/predict-simulate/gather-data.py b/projects/predict-simulate/gather-data.py
index b3c32465..d2093b22 100644
--- a/projects/predict-simulate/gather-data.py
+++ b/projects/predict-simulate/gather-data.py
@@ -51,7 +51,7 @@
 
 for name in ["allMetadata", "allAgeANDylwSummaries",
              "allAgeSummaries", "allYlwSummaries",
-             "basalEvents", "bolusEvents"]:
+             "dayData", "basalEvents", "bolusEvents"]:
     allDF = pd.DataFrame()
     if name.startswith("all"):
         files = glob.glob(os.path.join(outputPath, name + '*'))

From 500c124531879c93fee23852aa797e6439d6452a Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Fri, 8 Feb 2019 07:59:56 -0600
Subject: [PATCH 69/78] update packages to include allow static figure recreate
 with plotly

---
 environment.yml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/environment.yml b/environment.yml
index 7061885d..043d73c7 100644
--- a/environment.yml
+++ b/environment.yml
@@ -12,6 +12,9 @@ dependencies:
 - matplotlib
 - scikit-learn
 - plotly
+- plotly::plotly-orca
+- poppler
+- psutil
 - r
 - r-essentials
 - pytest

From f5f146ecceda14f580b1bb1ad987f3c7e5131bce Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Wed, 13 Feb 2019 09:04:14 -0600
Subject: [PATCH 70/78] add a work in progress (wip) to gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index ba5690ed..8492155a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,6 +24,7 @@ data
 figures
 isf-basal-figures
 fonts
+wip
 
 # Test
 htmlcov

From effb6ad73ddeaa32213903998013dcea72ed8bc5 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Mon, 20 May 2019 10:34:13 -0500
Subject: [PATCH 71/78] code used to generate summary tables

---
 .../gather-data.py                            |    0
 .../get-users-settings-and-events.py          |    0
 .../visualize-users-settings-and-events-v3.py | 1273 +++++++++++++++++
 3 files changed, 1273 insertions(+)
 rename projects/{predict-simulate => get-donors-pump-settings}/gather-data.py (100%)
 rename projects/{predict-simulate => get-donors-pump-settings}/get-users-settings-and-events.py (100%)
 create mode 100644 projects/get-donors-pump-settings/visualize-users-settings-and-events-v3.py

diff --git a/projects/predict-simulate/gather-data.py b/projects/get-donors-pump-settings/gather-data.py
similarity index 100%
rename from projects/predict-simulate/gather-data.py
rename to projects/get-donors-pump-settings/gather-data.py
diff --git a/projects/predict-simulate/get-users-settings-and-events.py b/projects/get-donors-pump-settings/get-users-settings-and-events.py
similarity index 100%
rename from projects/predict-simulate/get-users-settings-and-events.py
rename to projects/get-donors-pump-settings/get-users-settings-and-events.py
diff --git a/projects/get-donors-pump-settings/visualize-users-settings-and-events-v3.py b/projects/get-donors-pump-settings/visualize-users-settings-and-events-v3.py
new file mode 100644
index 00000000..7f4c38e4
--- /dev/null
+++ b/projects/get-donors-pump-settings/visualize-users-settings-and-events-v3.py
@@ -0,0 +1,1273 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jan 22 06:46:33 2019
+
+@author: ed
+"""
+
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+description: visualize users settings and events
+version: 0.0.1
+created: 2019-01-11
+author: Ed Nykaza
+dependencies:
+    *
+license: BSD-2-Clause
+"""
+
+
+# %% REQUIRED LIBRARIES
+import pandas as pd
+import numpy as np
+from pytz import timezone
+from datetime import timedelta
+import datetime as dt
+import os
+import argparse
+import pdb
+import matplotlib.pyplot as plt
+import plotly
+import plotly.plotly as py
+import plotly.graph_objs as go
+from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
+import plotly.io as pio
+
+
+# %% FUNCTIONS
+def make_bold(val_list):
+    bold_list = []
+    for val in val_list:
+        bold_list.append('<b>' + str(val) + '</b>')
+    return bold_list
+
+def make_bold_and_round(val_list, nDecimalPlaces):
+    bold_list = []
+    for val in val_list:
+        if nDecimalPlaces == 0:
+            bold_list.append('<b>' + str(int(np.round(val, nDecimalPlaces))) + '</b>')
+
+        else:
+            bold_list.append('<b>' + str(np.round(val, nDecimalPlaces)) + '</b>')
+    return bold_list
+
+
+def save_fig(fig, plot_name, width, height, scale):
+    pio.write_image(
+    fig,
+    os.path.join(
+        "..",
+        "figures",
+        plot_name + ".png"
+    ),
+    width=width,
+    height=height,
+    scale=scale)
+
+    return
+
+
+def make_static_plot(field, yLabel, figName, df, yMin, yMax):
+
+    df.sort_values("categories", inplace=True)
+
+    traces = []
+    for yd in df.categories.unique():
+        traces.append(go.Box(
+            y=df.loc[df["categories"] == yd, field].values,
+            x=df.loc[df["categories"] == yd, "categories"].values,
+            name=yd,
+            boxpoints="all",
+            notched=True,
+            hoverlabel=dict(font=dict(size=22)),
+            marker=dict(
+                color=df.loc[df["categories"] == yd, "allColors"].describe()["top"],
+                opacity=0,
+            ),
+        ))
+
+    layout = go.Layout(
+        font=dict(
+            size=22
+        ),
+        xaxis=dict(
+            tickangle=52.5
+        ),
+        yaxis=dict(
+            title=yLabel,
+            range=[yMin, yMax],
+            showgrid=True,
+            gridcolor='#f1f3f4',
+            gridwidth=2,
+            zeroline=True,
+            zerolinecolor='#f1f3f4',
+            zerolinewidth=2,
+        ),
+        margin=dict(
+            l=100,
+            r=200,
+            b=250,
+            t=50,
+        ),
+
+        boxmode='group',
+        showlegend=False,
+        legend=dict(font=dict(size=14))
+    )
+
+    fig = go.Figure(data=traces, layout=layout)
+
+    save_fig(fig, figName + "-boxplot-lowRes", 1800, 1200, 1)
+    save_fig(fig, figName + "-boxplot-highRes", 1800, 1200, 4)
+
+def make_static_table(field, figName, filteredDF, nDecimals, return_summaryTable=False):
+
+    # first make an overall table
+    allCounts = filteredDF.groupby(["hashID"])[field].describe()
+    allAgeTable = pd.DataFrame(index=[field])
+    allAgeTable["min"] = allCounts["min"].min()
+    allAgeTable["max"] = allCounts["max"].max()
+    allAgeTable["U"] = len(allCounts)
+    allAgeTable["N"] = allCounts["count"].sum()
+
+    # then make summary per categories
+    uniqueCounts = filteredDF.groupby(["categories"])["hashID"].describe()
+    uniqueCounts.reset_index(inplace=True)
+    summaryTable = filteredDF.groupby("categories")[field].describe()
+    summaryTable.reset_index(inplace=True)
+    summaryTable = pd.merge(summaryTable, uniqueCounts[["categories", "unique"]], how="left", on="categories")
+    summaryTable = pd.merge(summaryTable, catColorDF, how="left", on="categories")
+    summaryTable["unique"] = summaryTable["unique"].astype(float)
+
+    # add in interquartile range
+    summaryTable["IQR"] = summaryTable["75%"] - summaryTable["25%"]
+
+    col_headings = make_bold(["Group", "N", "U", "Average", "Stdev", "Min", "Q1", "Median", "Q3", "Max"])
+
+    trace = go.Table(
+        header=dict(values=col_headings,
+                    fill = dict(color='white'),
+                    align = ['center', 'center', 'center'],
+                    font = dict(color = 'black', size=12)),
+        columnwidth=[1.5, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+        cells=dict(values=[make_bold(summaryTable["categories"]),
+                           make_bold_and_round(summaryTable["count"], 0),
+                           make_bold_and_round(summaryTable["unique"], 0),
+                           make_bold_and_round(summaryTable["mean"], nDecimals),
+                           make_bold_and_round(summaryTable["std"], nDecimals),
+                           make_bold_and_round(summaryTable["min"], nDecimals),
+                           make_bold_and_round(summaryTable["25%"], nDecimals),
+                           make_bold_and_round(summaryTable["50%"], nDecimals),
+                           make_bold_and_round(summaryTable["75%"], nDecimals),
+                           make_bold_and_round(summaryTable["max"], nDecimals)],
+                   fill = dict(color = [summaryTable["allColors"]]),
+                   align = ['center', 'center', 'center'],
+                   font = dict(color = 'black', size=10),
+                   height = 20)
+        )
+
+    fig = go.Figure()
+    fig.add_trace(trace)
+
+    pio.write_image(
+        fig,
+        os.path.join(
+            "..",
+            "figures",
+            figName + "-table-highRes.png"
+        ),
+        width=1200,
+        height=1200,
+        scale=4)
+
+    pio.write_image(
+        fig,
+        os.path.join(
+            "..",
+            "figures",
+            figName + "-table-lowRes.png"
+        ),
+        width=1200,
+        height=1200,
+        scale=1)
+
+    summaryTable.to_csv(
+        os.path.join(
+            "..",
+            "figures",
+            figName + "-table.csv"
+        )
+    )
+    allAgeTable.to_csv(
+        os.path.join(
+            "..",
+            "figures",
+            figName + "-all-age-table.csv"
+        )
+    )
+
+    if return_summaryTable:
+        return summaryTable, allAgeTable
+    else:
+        return
+
+
+def make_lite_interactive_boxplot(field, yLabel, df, yMin, yMax):
+    df.sort_values("categories", inplace=True)
+
+    traces = []
+    for yd in df.categories.unique():
+        yValues = df.loc[df["categories"] == yd, field]
+        yStats = yValues.describe()
+        yMinimum = yStats["min"]
+        yQ1 = yStats["25%"]
+        yQ2 = yStats["50%"]
+        yQ3 = yStats["75%"]
+        yMaximum = yStats["max"]
+        yIQR = yQ3 - yQ1
+        maxWhisker = yIQR * 1.5
+        lowWhiskerBound = yQ1 - maxWhisker
+        highWhiskerBound = yQ3 + maxWhisker
+        yLowerFence = yValues[yValues >= lowWhiskerBound].min()
+        yUpperFence = yValues[yValues <= highWhiskerBound].max()
+        yBoxData = [yMinimum, yLowerFence, yQ1, yQ1, yQ1, yQ1, yQ1,
+                    yQ2, yQ3, yQ3, yQ3, yQ3, yQ3,
+                    yUpperFence, yMaximum]
+
+        # get N and U
+        nDays = df.loc[df["categories"] == yd, "count"].median().astype(int)
+        uniqueDonors = df.loc[df["categories"] == yd, "unique"].median().astype(int)
+
+        traces.append(go.Box(
+            y=yBoxData,
+            jitter=0,
+            pointpos=0,
+            text=list(np.repeat("N=%s, U=%s" % (nDays, uniqueDonors), len(yBoxData))),
+            hoverinfo="y+text",
+            name=yd,
+            boxpoints="all",
+            notched=False,
+            marker=dict(
+                color=df.loc[df["categories"] == yd, "allColors"].describe()["top"],
+                opacity=0,
+            ),
+        ))
+
+    layout = go.Layout(
+        yaxis=dict(
+            title=yLabel,
+            range=[yMin, yMax],
+            showgrid=True,
+            gridcolor='#f1f3f4',
+            gridwidth=2,
+            zeroline=True,
+            zerolinecolor='#f1f3f4',
+            zerolinewidth=2,
+        ),
+        showlegend=True
+    )
+
+    fig = go.Figure(data=traces, layout=layout)
+    plot_url = py.plot(fig, filename="Distribution of " + figName, auto_open=False)
+    print(figName, plot_url)
+
+    return
+
+
+def filter_data(df, min_days_criteria=7):
+
+    # keep all type1 and null diagnosis data (not specified)
+    df = df[((df.diagnosisType.isnull()) | (df.diagnosisType == "type1"))]
+
+    # filter out invalid ages and ylw
+    df = df[((df.age.astype(float) >= 0) & (df.age.astype(float) <= 90))]
+    df = df[((df.ylw.astype(float) >= 0) & (df.ylw.astype(float) <= 80))]
+
+    # filter out invalid pump and cgm days
+    df = df[((df["validPumpData"]) & (df["validCGMData"]))]
+
+    # filter out Paradigm Veo Pumps
+    df = df[~df["pump.top"].str.contains("Paradigm Veo")]
+
+    # filter out omnipod with mg/dL likely settings
+    df = df[~((df["pump.top"].str.contains("InsOmn-130")) &
+              (df['pumpSettings.isfLikelyUnits'] == "mg/dL"))]
+
+    # require a minimum number of days of data
+    dayGroups = pd.DataFrame(df.groupby(["hashID", "age", "ylw"]).day.count()).reset_index()
+    dayGroups.rename(columns={"day": "nDays"}, inplace=True)
+    df = pd.merge(df, dayGroups, how="left", on=["hashID", "age", "ylw"])
+
+    df = df[df["nDays"] >= min_days_criteria]
+
+    return df
+
+
+def merge_dayData(df, dayDF):
+
+    df = pd.merge(
+        df,
+        dayDF[[
+            "hashID",
+            "day",
+            "validPumpData",
+            "atLeast3Boluses",
+            "validCGMData",
+            "diagnosisType",
+            "pump.top",
+            "pumpSettings.isfLikelyUnits"
+        ]],
+        how="left",
+        on=["hashID", "day"]
+    )
+
+    return df
+
+
+def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min_unique_donors=10):
+
+    # bin data (defined above)
+    df["ageBins"] = pd.cut(df["age"], ageBins, labels=ageGroupNames)
+    df["ylwBins"] = pd.cut(df["ylw"], ylwBins, labels=ylwGroupNames)
+    df["ageCategories"] = df["ageBins"].astype(str)
+    df["ylwCategories"] = df["ylwBins"].astype(str)
+    df["categories"] = "age " + df["ageBins"].astype(str) + " ylw " + df["ylwBins"].astype(str)
+
+    # attach bin colors (defined above)
+    df = pd.merge(df, catColorDF, how="left", on="categories")
+    df["categories"].astype("category", inplace=True)
+
+    # attach counts per group
+    dGroups = df.groupby("categories")
+    groupDF = dGroups["hashID"].describe()
+    groupDF["ageCategories"] = dGroups["ageCategories"].describe()["top"]
+    groupDF["ylwCategories"] = dGroups["ylwCategories"].describe()["top"]
+    #groupDF["ylwAlpha"] = dGroups["ylwAlpha"].mean()
+    groupDF["allColors"] = dGroups["allColors"].describe()["top"]
+    groupDF.reset_index(inplace=True)
+
+    # attach group counts to the main dataframe
+    df = pd.merge(df, groupDF[["categories", "count", "unique"]], how="left", on="categories")
+
+    # drop categories without more than min_unique_donors unique people (strict >)
+    df = df[df["unique"] > min_unique_donors]
+    groupDF = groupDF[groupDF["unique"] > min_unique_donors]
+
+    # attach N and U to the categories
+    df["categoriesFull"] = (
+        df["categories"].astype(str) +
+        " (N=" + df["count"].astype(str) +
+        ", U=" + df["unique"].astype(str) +  ")"
+    )
+
+    return df, groupDF
+
+
+# %% define age and years living with bins
+group_title = "-withYlw0"
+
+
+# next bin the data by age-ylw groups
+dataGroupName = "age-ylw-groups"
+ageBins = np.array([0,5,8,12,17,24,85])
+ylwBins = np.array([-1,0,1,2,5,10,25,75])
+
+# bin by age
+ageGroupNames = []
+for x, y in zip(ageBins[:-1]+1, ageBins[1:]):
+    ageGroupNames.append("%s-%s"%(f"{x:02d}", f"{y:02d}"))
+
+ylwGroupNames = []
+for x, y in zip(ylwBins[:-1]+1, ylwBins[1:]):
+    if x == y:
+        ylwGroupNames.append("%s"%(f"{x:02d}"))
+    else:
+        ylwGroupNames.append("%s-%s"%(f"{x:02d}", f"{y:02d}"))
+
+catColors = [
+    '#f0d8e5','#f4bdd8','#f7a0cc','#f781bf',
+    '#ebc3c1','#f1a095','#f17d6c','#ec5644','#e41a1c',
+    '#f2d8c3','#fbc299','#ffac6f','#ff9746','#ff7f00',
+    '#d0e1cc','#b8d8b2','#9fcd97','#86c37e','#6cb964','#4daf4a',
+    '#c9d6e3','#afc4da','#95b1d2','#7aa0c9','#5b8fc1','#377eb8',
+    '#dacbde','#d0b6d4','#c5a1ca','#ba8dc0','#af78b7','#a464ad','#984ea3'
+]
+
+finalCategories = [
+        'age 01-05 ylw 00', 'age 01-05 ylw 01', 'age 01-05 ylw 02',
+        'age 01-05 ylw 03-05', 'age 06-08 ylw 00', 'age 06-08 ylw 01',
+        'age 06-08 ylw 02', 'age 06-08 ylw 03-05', 'age 06-08 ylw 06-10',
+        'age 09-12 ylw 00', 'age 09-12 ylw 01', 'age 09-12 ylw 02',
+        'age 09-12 ylw 03-05', 'age 09-12 ylw 06-10', 'age 13-17 ylw 00',
+        'age 13-17 ylw 01', 'age 13-17 ylw 02', 'age 13-17 ylw 03-05',
+        'age 13-17 ylw 06-10', 'age 13-17 ylw 11-25', 'age 18-24 ylw 00',
+        'age 18-24 ylw 01', 'age 18-24 ylw 02', 'age 18-24 ylw 03-05',
+        'age 18-24 ylw 06-10', 'age 18-24 ylw 11-25', 'age 25-85 ylw 00',
+        'age 25-85 ylw 01', 'age 25-85 ylw 02', 'age 25-85 ylw 03-05',
+        'age 25-85 ylw 06-10', 'age 25-85 ylw 11-25',
+        'age 25-85 ylw 26-75'
+]
+
+catColorDF = pd.DataFrame(data=[finalCategories, catColors], index=["categories", "allColors"]).T
+
+
+# %% load in summary donor data
+dataPulledDate = "2019-01-10"
+dataProcessedDate = "2019-01-22"
+
+phiDate = "PHI-" + dataPulledDate
+donorPath = os.path.join("..", "..", "bigdata-processing-pipeline", "data", phiDate + "-donor-data")
+donorList = phiDate + "-uniqueDonorList"
+donors = pd.read_csv(os.path.join(donorPath, donorList + ".csv"), low_memory=False)
+
+
+# %% all-donors summary
+allAgeSummary = pd.DataFrame()
+dataPath = os.path.join(donorPath, "settings-and-events")
+d = pd.read_csv(os.path.join(dataPath, "combined-allMetadata.csv"), low_memory=False)
+
+# attach the donor-level data to the combined metadata
+allMetadata = pd.merge(
+    d,
+    donors[[
+        "hashID",
+        "userID",
+        "diagnosisType",
+        "targetDevices",
+        "targetTimezone",
+        "termsAccepted"
+    ]],
+    how="left",
+    on="hashID"
+)
+allMetadata.to_csv(os.path.join(donorPath, donorList + "-w-metaData.csv"))
+
+
+# %% load data
+dayData = pd.read_csv(os.path.join(dataPath, "combined-dayData.csv"), low_memory=False)
+bolusData = pd.read_csv(os.path.join(dataPath, "combined-bolusEvents.csv"), low_memory=False)
+basalData = pd.read_csv(os.path.join(dataPath, "combined-basalEvents.csv"), low_memory=False)
+
+# %% attach the diagnosis type to the day data
+dayDF = pd.merge(
+    dayData,
+    allMetadata[[
+        "hashID",
+        "diagnosisType",
+        "pump.top",
+        "pumpSettings.isfLikelyUnits"
+    ]],
+    how="left",
+    on="hashID"
+)
+
+dayDF = filter_data(dayDF, min_days_criteria=7)
+dayDF, dayDFGroupSummary = (
+    bin_data(
+        dayDF,
+        ageBins,
+        ageGroupNames,
+        ylwBins,
+        ylwGroupNames,
+        catColorDF,
+        min_unique_donors=10
+    )
+)
+
+
+# %% all-event level summary (max basal and max bolus)
+# attach the day to bolus data and filter data by analysis criteria
+# NOTE: see the filter_data function for details
+bolus = merge_dayData(bolusData, dayDF)
+bolus = filter_data(bolus, min_days_criteria=7)
+bolus, bolusGroupSummary = (
+    bin_data(
+        bolus,
+        ageBins,
+        ageGroupNames,
+        ylwBins,
+        ylwGroupNames,
+        catColorDF,
+        min_unique_donors=10
+    )
+)
+
+# %% overview of bolus data table
+figName = "overviewTable-bolus-events"
+figName = figName + group_title
+trace = go.Table(
+    header=dict(
+        values=make_bold(["AGE-YLW Group",
+                          "Age",
+                          "Years Living with T1D",
+                          "N (Bolus Events)",
+                          "U (Unique Donors)"]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=14)
+    ),
+    cells=dict(
+        values=[make_bold(bolusGroupSummary['categories']),
+                make_bold(bolusGroupSummary['ageCategories']),
+                make_bold(bolusGroupSummary['ylwCategories']),
+                make_bold(bolusGroupSummary['count']),
+                make_bold(bolusGroupSummary['unique'])],
+        fill = dict(color = [bolusGroupSummary["allColors"]]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=11),
+        height = 22
+    ),
+)
+
+fig = go.Figure()
+fig.add_trace(trace)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        "..",
+        "figures",
+        figName + "-highRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=4)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        "..",
+        "figures",
+        figName + "-lowRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=1)
+
+
+# %% max bolus amount ()
+maxBolus = pd.DataFrame(bolus.groupby(["hashID", "day"])["unitsInsulin"].max()).reset_index()
+maxBolus.rename(columns={"unitsInsulin":"maxBolusPerDay"}, inplace=True)
+
+maxBolus = pd.merge(
+    maxBolus,
+    dayDF[[
+        "hashID",
+        "day",
+        "categories",
+        "allColors"
+    ]],
+    how="left",
+    on=["hashID", "day"]
+)
+
+# remove nans in category as they represent data from days that did not meet the
+# acceptable day standard
+maxBolus = maxBolus[maxBolus["categories"].notnull()]
+
+field = 'maxBolusPerDay'
+yLabel = "Max Bolus Per Day (U)"
+figName = "Max Bolus"
+yMin = 0
+yMax = 21
+filteredDF = maxBolus[maxBolus[field] > 0].copy()
+
+## make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+# add N events and n unique donors
+filteredDF = pd.merge(
+    filteredDF,
+    summaryTable[[
+        "categories",
+        "count",
+        "unique"
+    ]],
+    how="left",
+    on="categories"
+)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% basal data
+basal = merge_dayData(basalData, dayDF)
+basal = filter_data(basal, min_days_criteria=7)
+basal, basalGroupSummary = (
+    bin_data(
+        basal,
+        ageBins,
+        ageGroupNames,
+        ylwBins,
+        ylwGroupNames,
+        catColorDF,
+        min_unique_donors=10
+    )
+)
+
+
+# %% overview of basal data table
+figName = "overviewTable-basal-events"
+figName = figName + group_title
+
+trace = go.Table(
+    header=dict(
+        values=make_bold(["AGE-YLW Group",
+                          "Age",
+                          "Years Living with T1D",
+                          "N (Basal Events)",
+                          "U (Unique Donors)"]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=14)
+    ),
+    cells=dict(
+        values=[make_bold(basalGroupSummary['categories']),
+                make_bold(basalGroupSummary['ageCategories']),
+                make_bold(basalGroupSummary['ylwCategories']),
+                make_bold(basalGroupSummary['count']),
+                make_bold(basalGroupSummary['unique'])],
+        fill = dict(color = [basalGroupSummary["allColors"]]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=11),
+        height = 22
+    ),
+)
+
+fig = go.Figure()
+fig.add_trace(trace)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        "..",
+        "figures",
+        figName + "-highRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=4)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        "..",
+        "figures",
+        figName + "-lowRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=1)
+
+
+# %% max basal rate
+maxBasal = pd.DataFrame(basal[basal["type"]=="basal"].groupby(["hashID", "day"])["rate"].max()).reset_index()
+
+maxBasal.rename(columns={"rate":"maxBasalRatePerDay"}, inplace=True)
+
+maxBasal = pd.merge(
+    maxBasal,
+    dayDF[[
+        "hashID",
+        "day",
+        "categories",
+        "allColors"
+    ]],
+    how="left",
+    on=["hashID", "day"]
+)
+
+# remove nans in category as they represent data from days that did not meet the
+# acceptable day standard
+maxBasal = maxBasal[maxBasal["categories"].notnull()]
+
+field = 'maxBasalRatePerDay'
+yLabel = "Max Basal Per Day (U/hr)"
+figName = "Max Basal"
+yMin = 0
+yMax = 3.25
+filteredDF = maxBasal[maxBasal[field] > 0].copy()
+
+## make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+# add N events and n unique donors
+filteredDF = pd.merge(
+    filteredDF,
+    summaryTable[[
+        "categories",
+        "count",
+        "unique"
+    ]],
+    how="left",
+    on="categories"
+)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% overview of day level data table
+figName = "overviewTable-day-data"
+figName = figName + group_title
+
+trace = go.Table(
+    header=dict(
+        values=make_bold(["AGE-YLW Group",
+                          "Age",
+                          "Years Living with T1D",
+                          "N (Days)",
+                          "U (Unique Donors)"]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=14)
+    ),
+    cells=dict(
+        values=[make_bold(dayDFGroupSummary['categories']),
+                make_bold(dayDFGroupSummary['ageCategories']),
+                make_bold(dayDFGroupSummary['ylwCategories']),
+                make_bold(dayDFGroupSummary['count']),
+                make_bold(dayDFGroupSummary['unique'])],
+        fill = dict(color = [dayDFGroupSummary["allColors"]]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=11),
+        height = 22
+    ),
+)
+
+fig = go.Figure()
+fig.add_trace(trace)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        "..",
+        "figures",
+        figName + "-highRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=4)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        "..",
+        "figures",
+        figName + "-lowRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=1)
+
+
+# %% Average ISF per day
+dayDF["isfRounded"] = dayDF['isf.weightedMean'].round(1)
+field = 'isfRounded'
+yLabel = "Insulin Sensitivity Factor (mg/dL/U)"
+figName = "Insulin Sensitivity Factor"
+yMin = 0
+yMax = 400
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average CIR per day
+field = 'cir.weightedMean'
+yLabel = "Carb to Insulin Ratio (g/U)"
+figName = "Carb to Insulin Ratio"
+yMin = 0
+yMax = 70
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average Correction Target per day
+field = 'ct.target.weightedMean'
+yLabel = "Correction Target (mg/dL)"
+figName = "Correction Target"
+yMin = 70
+yMax = 180
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average Basal Rate per day
+field = 'sbr.weightedMean'
+yLabel = "Scheduled Basal Rate (U/hr)"
+figName = "Scheduled Basal Rate"
+yMin = 0
+yMax = 2.5
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 3
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Total Daily Dose
+field = "totalAmountOfInsulin"
+yLabel = "Total Daily Dose (U)"
+figName = "Total Daily Dose"
+yMin = 0
+yMax = 125
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Percent Basal
+dayDF["perecentBasalInPercent"] = dayDF["percentBasal"] * 100
+field = "perecentBasalInPercent"
+yLabel = "Basal Proportion of Total Daily Dose (%)"
+figName = "Basal Proportion of Total Daily Dose"
+yMin = 0
+yMax = 100
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Total Daily Carbs
+field = "totalDailyCarbs"
+yLabel = "Total Daily Carbs (g)"
+figName = "Total Daily Carbs"
+yMin = 0
+yMax = 600
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Daily Time in Range (70-180 mg/dL)
+dayDF["perecentInRange"] = dayDF["cgm.percent70to180"] * 100  # scale by 100 for the "%" axis
+field = "perecentInRange"
+yLabel = "Percent of Day in Target Range (70-180 mg/dL, %)"
+figName = "Percent of Day in Targe Range 70-180"  # spelling kept: figName is used in output file names
+yMin = 0
+yMax = 100
+filteredDF = dayDF[dayDF[field] >= 0].copy()  # NaN rows fail the comparison and are dropped
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Mean CGM (mg/dL)
+field = "cgm.mean_mgdL"
+yLabel = "Daily Average CGM Level (mg/dL)"
+figName = "Daily Average CGM Level"
+yMin = 50
+yMax = 300
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Cov CGM (mg/dL)
+dayDF["covPercent"] = dayDF["cgm.cov_mgdL"] * 100  # scale by 100 for the "%" axis
+field = "covPercent"
+yLabel = "Coefficient of Variation (%)"
+figName = "Coeffient of Variation"  # spelling kept: figName is used in output file names
+yMin = 6
+yMax = 62
+filteredDF = dayDF[dayDF[field] >= 0].copy()  # NaN rows fail the comparison and are dropped
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Daily Time Below 54 (Percentage)
+dayDF["perecentBelow54mgdL"] = dayDF["cgm.percentBelow54"] * 100
+field = "perecentBelow54mgdL"
+yLabel = "Percent of Day Below 54 mg/dL (%)"
+figName = "Percent of Day in Extreme Hypo Below 54 mgdL"
+yMin = 0
+yMax = 5
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 2
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Number of Below 54 mg/dL Episodes per Day
+field = "extreme-hypo.count"
+dayDF[field] = dayDF[field].fillna(0)  # assign back: chained inplace fillna may not update dayDF
+yLabel = "Number of Extreme Hypo Episodes (Below 54 mg/dL) per Day"
+figName =  "Number of Extreme Hypo Episodes per Day"
+yMin = 0
+yMax = 2
+filteredDF = dayDF[dayDF[field] >= 0].copy()  # after the fill, missing counts are kept as 0
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average Duration of each Episode Below 54 mg/dL
+field = "extreme-hypo-durationMinutes.mean"
+yLabel = "Average Duration of each Extreme Hypo Episode (minutes)"
+figName =  "Average Duration of each Extreme Hypo Episode"
+yMin = 15
+yMax = 120
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Daily Time Above 250 (Percentage)
+dayDF["perecentAbove250mgdL"] = dayDF["cgm.percentAbove250"] * 100
+field = "perecentAbove250mgdL"
+yLabel = "Percent of Day Above 250 mg/dL (%)"
+figName = "Percent of Day in Extreme Hyper Above 250 mgdL"
+yMin = 0
+yMax = 75
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Number of Above 250 mg/dL Episodes per Day
+field = "extreme-hyper.count"
+dayDF[field] = dayDF[field].fillna(0)  # assign back: chained inplace fillna may not update dayDF
+yLabel = "Number of Extreme Hyper Episodes (Above 250 mg/dL) per Day"
+figName =  "Number of Extreme Hyper Episodes per Day"
+yMin = 0
+yMax = 2
+filteredDF = dayDF[dayDF[field] >= 0].copy()  # after the fill, missing counts are kept as 0
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average Duration of each Episode Above 250 mg/dL
+dayDF["avgExtremeHyperHours"] = dayDF["extreme-hyper-durationMinutes.mean"] / 60
+field = "avgExtremeHyperHours"
+yLabel = "Average Duration of each Extreme Hyper Episode (hours)"
+figName =  "Average Duration of each Extreme Hyper Episode"
+yMin = 2
+yMax = 10
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% save the all age summaries
+figName = "allAgeSettingSummary" + group_title
+allAgeSummary.to_csv(
+    os.path.join(
+        "..",
+        "figures",
+        figName + "-all-age-table.csv"
+    )
+)

From 1c9f0fc69e02577d2aee6d84db5e769bd0d56ab0 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Mon, 20 May 2019 10:42:38 -0500
Subject: [PATCH 72/78] exclude parser output in gitignore

---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index 8492155a..1c0f392a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,6 +25,7 @@ figures
 isf-basal-figures
 fonts
 wip
+projects/parsers/output/
 
 # Test
 htmlcov

From 7542bf13e1252dbfdfb64bd73256a6ad075a02a6 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 23 May 2019 11:24:23 -0500
Subject: [PATCH 73/78] change location of data path and file name

---
 ...-and-events-v3.py => visualize-users-settings-and-events.py} | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
 rename projects/get-donors-pump-settings/{visualize-users-settings-and-events-v3.py => visualize-users-settings-and-events.py} (99%)

diff --git a/projects/get-donors-pump-settings/visualize-users-settings-and-events-v3.py b/projects/get-donors-pump-settings/visualize-users-settings-and-events.py
similarity index 99%
rename from projects/get-donors-pump-settings/visualize-users-settings-and-events-v3.py
rename to projects/get-donors-pump-settings/visualize-users-settings-and-events.py
index 7f4c38e4..502a41c2 100644
--- a/projects/get-donors-pump-settings/visualize-users-settings-and-events-v3.py
+++ b/projects/get-donors-pump-settings/visualize-users-settings-and-events.py
@@ -418,7 +418,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
 dataProcessedDate = "2019-01-22"
 
 phiDate = "PHI-" + dataPulledDate
-donorPath = os.path.join("..", "..", "bigdata-processing-pipeline", "data", phiDate + "-donor-data")
+donorPath = os.path.join("..", "bigdata-processing-pipeline", "data", phiDate + "-donor-data")
 donorList = phiDate + "-uniqueDonorList"
 donors = pd.read_csv(os.path.join(donorPath, donorList + ".csv"), low_memory=False)
 

From 07ccf9418823d8ba5a1f93ffdb5f85ff9cdaf5b3 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 23 May 2019 11:52:35 -0500
Subject: [PATCH 74/78] change figure output path

---
 .../visualize-users-settings-and-events.py    | 39 +++++++------------
 1 file changed, 14 insertions(+), 25 deletions(-)

diff --git a/projects/get-donors-pump-settings/visualize-users-settings-and-events.py b/projects/get-donors-pump-settings/visualize-users-settings-and-events.py
index 502a41c2..652236fc 100644
--- a/projects/get-donors-pump-settings/visualize-users-settings-and-events.py
+++ b/projects/get-donors-pump-settings/visualize-users-settings-and-events.py
@@ -58,8 +58,7 @@ def save_fig(fig, plot_name, width, height, scale):
     pio.write_image(
     fig,
     os.path.join(
-        "..",
-        "figures",
+        figure_path,
         plot_name + ".png"
     ),
     width=width,
@@ -174,8 +173,7 @@ def make_static_table(field, figName, filteredDF, nDecimals, return_summaryTable
     pio.write_image(
         fig,
         os.path.join(
-            "..",
-            "figures",
+            figure_path,
             figName + "-table-highRes.png"
         ),
         width=1200,
@@ -185,8 +183,7 @@ def make_static_table(field, figName, filteredDF, nDecimals, return_summaryTable
     pio.write_image(
         fig,
         os.path.join(
-            "..",
-            "figures",
+            figure_path,
             figName + "-table-lowRes.png"
         ),
         width=1200,
@@ -195,15 +192,13 @@ def make_static_table(field, figName, filteredDF, nDecimals, return_summaryTable
 
     summaryTable.to_csv(
         os.path.join(
-            "..",
-            "figures",
+            figure_path,
             figName + "-table.csv"
         )
     )
     allAgeTable.to_csv(
         os.path.join(
-            "..",
-            "figures",
+            figure_path,
             figName + "-all-age-table.csv"
         )
     )
@@ -367,7 +362,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
 
 # %% define age and years living with bins
 group_title = "-withYlw0"
-
+figure_path = os.path.join(".", "figures")
 
 # next bin the data by age-ylw groups
 dataGroupName = "age-ylw-groups"
@@ -494,6 +489,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
     )
 )
 
+
 # %% overview of bolus data table
 figName = "overviewTable-bolus-events"
 figName = figName + group_title
@@ -526,8 +522,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
 pio.write_image(
     fig,
     os.path.join(
-        "..",
-        "figures",
+        figure_path,
         figName + "-highRes.png"
     ),
     width=1200,
@@ -537,8 +532,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
 pio.write_image(
     fig,
     os.path.join(
-        "..",
-        "figures",
+        figure_path,
         figName + "-lowRes.png"
     ),
     width=1200,
@@ -655,8 +649,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
 pio.write_image(
     fig,
     os.path.join(
-        "..",
-        "figures",
+        figure_path,
         figName + "-highRes.png"
     ),
     width=1200,
@@ -666,8 +659,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
 pio.write_image(
     fig,
     os.path.join(
-        "..",
-        "figures",
+        figure_path,
         figName + "-lowRes.png"
     ),
     width=1200,
@@ -769,8 +761,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
 pio.write_image(
     fig,
     os.path.join(
-        "..",
-        "figures",
+        figure_path,
         figName + "-highRes.png"
     ),
     width=1200,
@@ -780,8 +771,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
 pio.write_image(
     fig,
     os.path.join(
-        "..",
-        "figures",
+        figure_path,
         figName + "-lowRes.png"
     ),
     width=1200,
@@ -1266,8 +1256,7 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
 figName = "allAgeSettingSummary" + group_title
 allAgeSummary.to_csv(
     os.path.join(
-        "..",
-        "figures",
+        figure_path,
         figName + "-all-age-table.csv"
     )
 )

From a8048ca0a63a26553ccea3a2d8ead57bcef18b79 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 23 May 2019 21:35:32 -0500
Subject: [PATCH 75/78] ignore local plotly plot

---
 .gitignore | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.gitignore b/.gitignore
index 1c0f392a..657b5eef 100644
--- a/.gitignore
+++ b/.gitignore
@@ -32,3 +32,5 @@ htmlcov
 .pytest_cache
 
 
+
+projects/get-donors-pump-settings/temp-plot\.html

From 795d642cc4658d7b26a48aa2c9c128211c754191 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Thu, 23 May 2019 21:36:29 -0500
Subject: [PATCH 76/78] plots of isf and tdd (local only)

needs cleaning before pushing to repository
---
 .../visualize-users-settings-and-events.py    | 267 ++++++++++++++++++
 1 file changed, 267 insertions(+)

diff --git a/projects/get-donors-pump-settings/visualize-users-settings-and-events.py b/projects/get-donors-pump-settings/visualize-users-settings-and-events.py
index 652236fc..cd392754 100644
--- a/projects/get-donors-pump-settings/visualize-users-settings-and-events.py
+++ b/projects/get-donors-pump-settings/visualize-users-settings-and-events.py
@@ -390,6 +390,8 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
     '#dacbde','#d0b6d4','#c5a1ca','#ba8dc0','#af78b7','#a464ad','#984ea3'
 ]
 
+
+
 finalCategories = [
         'age 01-05 ylw 00', 'age 01-05 ylw 01', 'age 01-05 ylw 02',
         'age 01-05 ylw 03-05', 'age 06-08 ylw 00', 'age 06-08 ylw 01',
@@ -1260,3 +1262,268 @@ def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min
         figName + "-all-age-table.csv"
     )
 )
+
+# %% make a plot of TDD by ISF
+
+# Average ISF per day
+dayDF["isfRounded"] = dayDF['isf.weightedMean'].round(1)
+#field = 'isfRounded'
+#yLabel = "Insulin Sensitivity Factor (mg/dL/U)"
+#figName = "Insulin Sensitivity Factor"
+#yMin = 0
+#yMax = 400
+
+## Total Daily Dose
+#field = "totalAmountOfInsulin"
+#yLabel = "Total Daily Dose (U)"
+#figName = "Total Daily Dose"
+#yMin = 0
+#yMax = 125
+#filteredDF = dayDF[dayDF[field] > 0].copy()
+
+filteredDF = dayDF[((dayDF['isfRounded'] > 0) &
+                    (dayDF['totalAmountOfInsulin'] > 0))].copy()
+
+ylwColors = ["#ffffb2", '#fecc5c', '#fd8d3c', '#f03b20', '#bd0026']
+for f in filteredDF["ylwCategories"].unique():
+    if f == '00':
+        colorCode = 0
+    elif f == '01':
+        colorCode = 1
+    elif f == '02':
+        colorCode = 2
+    elif f == '03-05':
+        colorCode = 3
+    else:  # any other ylw group falls through to the last color
+        colorCode = 4
+    # elif chain (not independent ifs) so '00'/'01'/'02' are not overwritten by the final else
+    filteredDF.loc[filteredDF["ylwCategories"] == f, "ylwColor"] = ylwColors[colorCode]
+
+
+
+from scipy.optimize import curve_fit
+def func(x, a, b, c):
+    return (a * x + b) / (x - 10)
+
+import statsmodels.api as sm
+lowess = sm.nonparametric.lowess
+#a * np.exp(-b*x) + c * np.exp(-d * x)
+#y = a * np.exp(-b * x) + c
+#y = a * np.exp(b*x) + c * np.exp(d * x)
+
+xdata = filteredDF['totalAmountOfInsulin'].round()
+ydata = filteredDF['isfRounded']
+popt, pcov = curve_fit(func, xdata, ydata)
+
+
+x = np.arange(1, 500)
+c = pd.DataFrame(columns=["ISF", "TDD"])
+for xi in x:
+    if sum(filteredDF['isfRounded'] == xi) > 3:
+        c.loc[xi, "ISF"] = xi
+        c.loc[xi, "TDD"] = filteredDF.loc[
+            filteredDF['isfRounded'] == xi,
+            "totalAmountOfInsulin"].median()
+
+asdf2 = c.rolling(25, center=True).mean()
+plt.plot(asdf2["TDD"], asdf2["ISF"])
+
+x = np.arange(1, 300)
+d = pd.DataFrame(columns=["TDD", "ISF"])
+for xi in x:
+    if sum(filteredDF['totalAmountOfInsulin'].round() == xi) > 3:
+        d.loc[xi, "TDD"] = xi
+        d.loc[xi, "ISF"] = filteredDF.loc[
+            filteredDF['totalAmountOfInsulin'].round() == xi,
+            "isfRounded"].median()
+
+# then smooth out the medians
+asdf = d.rolling(10, center=True).mean()
+plt.plot(asdf["TDD"], asdf["ISF"])
+
+
+# try a different approach where we just do a smoothed line
+
+
+z = lowess(ydata, xdata)
+#>>> w = lowess(y, x, frac=1./3)
+plt.plot(z[:,0], z[:,1])
+
+plt.plot(
+    x,
+    func(x, *popt),
+    'r-',
+#    label='fit: a=%5.3f, b=%5.3f, c=%5.3f' % tuple(popt)
+)
+
+#df.sort_values("categories", inplace=True)
+
+traces = []
+traces.append(go.Scatter(
+        y=ydata,
+        x=xdata,
+        name="Scatter",
+        mode='markers',
+        marker=dict(
+            color=filteredDF["allColors"],
+            opacity=0.125,
+        ),
+))
+
+#traces.append(go.Scatter(
+#        y=z2[:,0],
+#        x=z2[:,1],
+#        mode='lines',
+#))
+#
+#traces.append(go.Scatter(
+#        y=z[:,1],
+#        x=z[:,0],
+#        mode='lines',
+#        line=dict(
+#            color="black",
+#        ),
+#))
+
+traces.append(go.Scatter(
+        y=asdf["ISF"],
+        x=asdf["TDD"],
+        mode='lines',
+        name="Trend by TDD",
+        line=dict(
+            color="black",
+            dash="dot",
+        ),
+))
+
+traces.append(go.Scatter(
+        y=asdf2["ISF"],
+        x=asdf2["TDD"],
+        mode='lines',
+        name="Trend by ISF",
+        line=dict(
+            color="black",
+            dash="dash",
+        ),
+))
+
+layout = go.Layout(
+    font=dict(
+        size=18
+    ),
+    xaxis=dict(
+        title="TDD",
+        dtick=20,
+        range=[0, 300],
+        showgrid=True,
+        gridcolor='#f1f3f4',
+        gridwidth=2,
+        zeroline=True,
+        zerolinecolor='#f1f3f4',
+        zerolinewidth=2,
+    ),
+    yaxis=dict(
+        title="ISF",
+        dtick=20,
+        range=[0, 500],
+        showgrid=True,
+        gridcolor='#f1f3f4',
+        gridwidth=2,
+        zeroline=True,
+        zerolinecolor='#f1f3f4',
+        zerolinewidth=2,
+    )
+)
+
+fig = go.Figure(data=traces, layout=layout)
+plot(fig)
+
+for yd in df.categories.unique():
+    traces.append(go.Box(
+        y=df.loc[df["categories"] == yd, field].values,
+        x=df.loc[df["categories"] == yd, "categories"].values,
+        name=yd,
+        boxpoints="all",
+        notched=True,
+        hoverlabel=dict(font=dict(size=22)),
+        marker=dict(
+            color=df.loc[df["categories"] == yd, "allColors"].describe()["top"],
+            opacity=0,
+        ),
+    ))
+
+layout = go.Layout(
+    font=dict(
+        size=22
+    ),
+    xaxis=dict(
+        tickangle=52.5
+    ),
+    yaxis=dict(
+        title=yLabel,
+        range=[yMin, yMax],
+        showgrid=True,
+        gridcolor='#f1f3f4',
+        gridwidth=2,
+        zeroline=True,
+        zerolinecolor='#f1f3f4',
+        zerolinewidth=2,
+    ),
+    margin=dict(
+        l=100,
+        r=200,
+        b=250,
+        t=50,
+    ),
+
+    boxmode='group',
+    showlegend=False,
+    legend=dict(font=dict(size=14))
+)
+
+fig = go.Figure(data=traces, layout=layout)
+
+save_fig(fig, figName + "-boxplot-lowRes", 1800, 1200, 1)
+save_fig(fig, figName + "-boxplot-highRes", 1800, 1200, 4)
+
+
+
+
+
+
+
+
+#filteredDF.plot.scatter(y="isfRounded", x="totalAmountOfInsulin", alpha=0.025)
+
+# %% make a plot of TDD by max temp basal rate
+maxBasal = pd.DataFrame(basal[basal["type"]=="basal"].groupby(["hashID", "day"])["rate"].max()).reset_index()
+
+maxBasal.rename(columns={"rate":"maxBasalRatePerDay"}, inplace=True)
+
+maxBasal = pd.merge(
+    maxBasal,
+    dayDF[[
+        "hashID",
+        "day",
+        "categories",
+        "allColors",
+        "totalAmountOfInsulin",
+        'basal.closedLoopDays'
+    ]],
+    how="left",
+    on=["hashID", "day"]
+)
+
+# remove nans in category as they represent data from days that did not meet the
+# acceptable day standard
+#maxBasal = maxBasal[maxBasal["categories"].notnull()]
+
+
+
+
+filteredDF = maxBasal[((maxBasal['totalAmountOfInsulin'] > 0) &
+                    (maxBasal['maxBasalRatePerDay'] > 0))].copy()
+
+
+filteredDF.plot.scatter(y="maxBasalRatePerDay", x="totalAmountOfInsulin", alpha=0.125)
+

From c709afdc41f4183d8cf838860256652e56809af9 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 28 May 2019 13:51:17 -0500
Subject: [PATCH 77/78] update environment

---
 environment.yml | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/environment.yml b/environment.yml
index 043d73c7..97b1f82d 100644
--- a/environment.yml
+++ b/environment.yml
@@ -11,15 +11,13 @@ dependencies:
 - xlsxwriter
 - matplotlib
 - scikit-learn
+- pip
 - plotly
 - plotly::plotly-orca
 - poppler
 - psutil
-- r
-- r-essentials
 - pytest
 - pytest-cov
 
 - pip:
-  - python-dotenv
-  - -e git+https://github.com/tidepool-org/data-analytics#egg=tidals\&subdirectory=tidepool-analysis-tools
+  - python-dotenv
\ No newline at end of file

From 7b4d0f9b80e762792e9dfad230e96255f0e2a554 Mon Sep 17 00:00:00 2001
From: Ed Nykaza <edward.t.nykaza@gmail.com>
Date: Tue, 28 May 2019 19:02:38 -0500
Subject: [PATCH 78/78] plots using jos age and ylw groups

---
 ...ize-users-settings-and-events-jaeb-ages.py | 1396 +++++++++++++++++
 1 file changed, 1396 insertions(+)
 create mode 100644 projects/get-donors-pump-settings/visualize-users-settings-and-events-jaeb-ages.py

diff --git a/projects/get-donors-pump-settings/visualize-users-settings-and-events-jaeb-ages.py b/projects/get-donors-pump-settings/visualize-users-settings-and-events-jaeb-ages.py
new file mode 100644
index 00000000..62eb6969
--- /dev/null
+++ b/projects/get-donors-pump-settings/visualize-users-settings-and-events-jaeb-ages.py
@@ -0,0 +1,1396 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+Created on Tue Jan 22 06:46:33 2019
+
+@author: ed
+"""
+
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+"""
+description: visualize users settings and events
+version: 0.0.1
+created: 2019-01-11
+author: Ed Nykaza
+dependencies:
+    *
+license: BSD-2-Clause
+"""
+
+
+# %% REQUIRED LIBRARIES
+import pandas as pd
+import numpy as np
+from pytz import timezone
+from datetime import timedelta
+import datetime as dt
+import os
+import argparse
+import pdb
+import matplotlib.pyplot as plt
+import plotly
+import plotly.plotly as py
+import plotly.graph_objs as go
+from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot
+import plotly.io as pio
+
+
+
+# %% FUNCTIONS
+def make_bold(val_list):
+    """Return a list holding each value of val_list as a '<b>...</b>' string."""
+    # str() is applied first so that non-string values can be concatenated
+    wrapped = ['<b>' + str(item) + '</b>' for item in val_list]
+    return wrapped
+
+def make_bold_and_round(val_list, nDecimalPlaces):
+    """Round each value and wrap it in <b> tags; NaN/inf stay unconverted at 0 decimals."""
+    bold_list = []
+    for val in val_list:
+        rounded = np.round(val, nDecimalPlaces)
+        if nDecimalPlaces == 0 and np.isfinite(rounded):
+            rounded = int(rounded)  # whole numbers print without a trailing ".0"
+        bold_list.append('<b>' + str(rounded) + '</b>')
+    return bold_list
+
+
+def save_fig(fig, plot_name, width, height, scale):
+    """Write fig as '<plot_name>.png' inside the module-level figure_path folder.
+
+    width, height and scale are forwarded unchanged to plotly.io.write_image.
+    """
+    png_path = os.path.join(figure_path, plot_name + ".png")
+    pio.write_image(
+        fig, png_path,
+        width=width, height=height, scale=scale,
+    )
+
+    return
+
+
+def make_static_plot(field, yLabel, figName, df, yMin, yMax):
+    """Save low- and high-resolution box plots of df[field], one box per category.
+
+    df is sorted by "categories" in place. Each box takes the most frequent
+    "allColors" value of its category as marker color. The images are written
+    through save_fig as figName + "-boxplot-lowRes" / "-boxplot-highRes".
+    """
+    df.sort_values("categories", inplace=True)
+
+    traces = []
+    for category in df.categories.unique():
+        in_category = df["categories"] == category
+        box_color = df.loc[in_category, "allColors"].describe()["top"]
+
+        box = go.Box(
+            y=df.loc[in_category, field].values,
+            x=df.loc[in_category, "categories"].values,
+            name=category,
+            boxpoints="all",
+            notched=True,
+            hoverlabel=dict(font=dict(size=22)),
+            marker=dict(color=box_color, opacity=0),
+        )
+        traces.append(box)
+
+    # the grid lines and the zero line share one light grey
+    grid_grey = '#f1f3f4'
+    y_axis = dict(
+        title=yLabel,
+        range=[yMin, yMax],
+        showgrid=True,
+        gridcolor=grid_grey,
+        gridwidth=2,
+        zeroline=True,
+        zerolinecolor=grid_grey,
+        zerolinewidth=2,
+    )
+
+    layout = go.Layout(
+        font=dict(size=22),
+        xaxis=dict(tickangle=52.5),
+        yaxis=y_axis,
+        margin=dict(l=100, r=200, b=250, t=50),
+        showlegend=False,
+        legend=dict(font=dict(size=14)),
+    )
+
+    fig = go.Figure(data=traces, layout=layout)
+
+    # one figure, exported at scale 1 (low res) and scale 4 (high res)
+    for suffix, scale in (("-boxplot-lowRes", 1), ("-boxplot-highRes", 4)):
+        save_fig(fig, figName + suffix, 1800, 1200, scale)
+
+def make_static_table(field, figName, filteredDF, nDecimals, return_summaryTable=False):
+    """Summarize filteredDF[field] overall and per category; save a table image and CSVs.
+
+    Written to the module-level figure_path:
+      * figName + "-table-highRes.png" and "-table-lowRes.png" (plotly table)
+      * figName + "-table.csv" (per-category describe() output, unique hashIDs,
+        colors and IQR)
+      * figName + "-all-age-table.csv" (overall min, max, U and N)
+
+    Per-category colors come from the module-level catColorDF.
+
+    Returns (summaryTable, allAgeTable) when return_summaryTable is True,
+    otherwise None.
+    """
+
+    # overall table: one describe() row per hashID, collapsed to a single row
+    perDonorStats = filteredDF.groupby(["hashID"])[field].describe()
+    allAgeTable = pd.DataFrame(index=[field])
+    allAgeTable["min"] = perDonorStats["min"].min()
+    allAgeTable["max"] = perDonorStats["max"].max()
+    allAgeTable["U"] = len(perDonorStats)  # number of distinct hashIDs
+    allAgeTable["N"] = perDonorStats["count"].sum()  # number of non-null values
+
+    # per-category summary
+    byCategory = filteredDF.groupby("categories")
+    donorCounts = byCategory["hashID"].describe().reset_index()
+    summaryTable = byCategory[field].describe().reset_index()
+    summaryTable = pd.merge(
+        summaryTable,
+        donorCounts[["categories", "unique"]],
+        how="left",
+        on="categories",
+    )
+    summaryTable = pd.merge(summaryTable, catColorDF, how="left", on="categories")
+    summaryTable["unique"] = summaryTable["unique"].astype(float)
+
+    # interquartile range (saved in the csv, not shown in the image)
+    summaryTable["IQR"] = summaryTable["75%"] - summaryTable["25%"]
+
+    headings = ["Group", "N", "U", "Average", "Stdev", "Min", "Q1", "Median", "Q3", "Max"]
+    # N and U are shown as whole numbers, the statistics with nDecimals decimals
+    cellValues = [
+        make_bold(summaryTable["categories"]),
+        make_bold_and_round(summaryTable["count"], 0),
+        make_bold_and_round(summaryTable["unique"], 0),
+    ]
+    for statistic in ["mean", "std", "min", "25%", "50%", "75%", "max"]:
+        cellValues.append(make_bold_and_round(summaryTable[statistic], nDecimals))
+
+    centered = ['center', 'center', 'center']
+    trace = go.Table(
+        header=dict(
+            values=make_bold(headings),
+            fill=dict(color='white'),
+            align=centered,
+            font=dict(color='black', size=12),
+        ),
+        columnwidth=[1.5, 1, 1, 1, 1, 1, 1, 1, 1, 1],
+        cells=dict(
+            values=cellValues,
+            fill=dict(color=[summaryTable["allColors"]]),
+            align=centered,
+            font=dict(color='black', size=10),
+            height=20,
+        ),
+    )
+
+    fig = go.Figure()
+    fig.add_trace(trace)
+
+    # high resolution first, then low resolution (both 1200 x 1200)
+    for resolution, scale in (("highRes", 4), ("lowRes", 1)):
+        pio.write_image(
+            fig,
+            os.path.join(figure_path, figName + "-table-" + resolution + ".png"),
+            width=1200,
+            height=1200,
+            scale=scale,
+        )
+
+    summaryTable.to_csv(os.path.join(figure_path, figName + "-table.csv"))
+    allAgeTable.to_csv(os.path.join(figure_path, figName + "-all-age-table.csv"))
+
+    if return_summaryTable:
+        return summaryTable, allAgeTable
+    return
+
+
+def make_lite_interactive_boxplot(field, yLabel, df, yMin, yMax):
+    """Publish a lightweight box plot of df[field] per category via plotly.plotly.
+
+    Each box is drawn from 15 summary values (min, lower fence, Q1 x 5, median,
+    Q3 x 5, upper fence, max) rather than from the raw data. The hover text
+    shows N and U, the medians of the "count" and "unique" columns.
+
+    NOTE: the plot name comes from the module-level figName; it is not a
+    parameter of this function. df is sorted by "categories" in place.
+    """
+    df.sort_values("categories", inplace=True)
+
+    traces = []
+    for category in df.categories.unique():
+        rows = df["categories"] == category
+        values = df.loc[rows, field]
+        stats = values.describe()
+        q1, q2, q3 = stats["25%"], stats["50%"], stats["75%"]
+
+        # fences: most extreme values lying within 1.5 * IQR of Q1 and Q3
+        whiskerLength = (q3 - q1) * 1.5
+        lowerFence = values[values >= q1 - whiskerLength].min()
+        upperFence = values[values <= q3 + whiskerLength].max()
+
+        boxData = [stats["min"], lowerFence] + [q1] * 5 + [q2] + [q3] * 5
+        boxData = boxData + [upperFence, stats["max"]]
+
+        # get N and U
+        nDays = df.loc[rows, "count"].median().astype(int)
+        uniqueDonors = df.loc[rows, "unique"].median().astype(int)
+        hoverText = "N=%s, U=%s" % (nDays, uniqueDonors)
+        boxColor = df.loc[rows, "allColors"].describe()["top"]
+
+        traces.append(go.Box(
+            y=boxData,
+            jitter=0,
+            pointpos=0,
+            text=list(np.repeat(hoverText, len(boxData))),
+            hoverinfo="y+text",
+            name=category,
+            boxpoints="all",
+            notched=False,
+            marker=dict(color=boxColor, opacity=0),
+        ))
+
+    gridGrey = '#f1f3f4'
+    layout = go.Layout(
+        yaxis=dict(
+            title=yLabel, range=[yMin, yMax],
+            showgrid=True, gridcolor=gridGrey, gridwidth=2,
+            zeroline=True, zerolinecolor=gridGrey, zerolinewidth=2,
+        ),
+        showlegend=True,
+    )
+
+    fig = go.Figure(data=traces, layout=layout)
+    plot_url = py.plot(fig, filename="Distribution of " + figName, auto_open=False)
+    print(figName, plot_url)
+
+    return
+
+
+def filter_data(df, min_days_criteria=7):
+    """Return the rows of df that pass the diagnosis, age/ylw, device and nDays filters."""
+    # keep all type1 and null diagnosis data (not specified)
+    df = df[((df.diagnosisType.isnull()) | (df.diagnosisType == "type1"))]
+
+    # filter out invalid ages and ylw
+    df = df[((df.age.astype(float) >= 0) & (df.age.astype(float) <= 90))]
+    df = df[((df.ylw.astype(float) >= 0) & (df.ylw.astype(float) <= 80))]
+
+    # filter out invalid pump and cgm days
+    df = df[((df["validPumpData"]) & (df["validCGMData"]))]
+    # na=False: a missing pump name counts as "no match", so `~` always gets a
+    # boolean mask; filter out Paradigm Veo Pumps
+    df = df[~df["pump.top"].str.contains("Paradigm Veo", na=False)]
+
+    # filter out omnipod with mg/dL likely settings
+    df = df[~((df["pump.top"].str.contains("InsOmn-130", na=False)) &
+              (df['pumpSettings.isfLikelyUnits'] == "mg/dL"))]
+
+    # require a minimum number of rows (nDays) per hashID/age/ylw combination
+    dayGroups = pd.DataFrame(df.groupby(["hashID", "age", "ylw"]).day.count()).reset_index()
+    dayGroups.rename(columns={"day": "nDays"}, inplace=True)
+    df = pd.merge(df, dayGroups, how="left", on=["hashID", "age", "ylw"])
+
+    df = df[df["nDays"] >= min_days_criteria]
+
+    return df
+
+
+def merge_dayData(df, dayDF):
+    """Left-join the day-level quality and device columns of dayDF onto df.
+
+    Rows are matched on "hashID" and "day"; every row of df is kept.
+    """
+
+    merge_keys = ["hashID", "day"]
+    # only these day-level columns are carried over
+    day_columns = merge_keys + [
+        "validPumpData",
+        "atLeast3Boluses",
+        "validCGMData",
+        "diagnosisType",
+        "pump.top",
+        "pumpSettings.isfLikelyUnits",
+    ]
+    merged = pd.merge(df, dayDF[day_columns], how="left", on=merge_keys)
+
+    return merged
+
+
+def bin_data(df, ageBins, ageGroupNames, ylwBins, ylwGroupNames, catColorDF, min_unique_donors=10):
+    """Label rows with an age-ylw category; return (df, groupDF) for populated categories."""
+    # bin data (defined above)
+    df["ageBins"] = pd.cut(df["age"], ageBins, labels=ageGroupNames)
+    df["ylwBins"] = pd.cut(df["ylw"], ylwBins, labels=ylwGroupNames)
+    df["ageCategories"] = df["ageBins"].astype(str)
+    df["ylwCategories"] = df["ylwBins"].astype(str)
+    df["categories"] = "age " + df["ageBins"].astype(str) + " ylw " + df["ylwBins"].astype(str)
+
+    # attach bin colors (defined above)
+    df = pd.merge(df, catColorDF, how="left", on="categories")
+    # "categories" stays a string column: astype() returns a copy and has no inplace option
+
+    # attach counts per group
+    dGroups = df.groupby("categories")
+    groupDF = dGroups["hashID"].describe()
+    groupDF["ageCategories"] = dGroups["ageCategories"].describe()["top"]
+    groupDF["ylwCategories"] = dGroups["ylwCategories"].describe()["top"]
+    # describe() gives "count" (non-null hashIDs) and "unique" (distinct hashIDs)
+    groupDF["allColors"] = dGroups["allColors"].describe()["top"]
+    groupDF.reset_index(inplace=True)
+
+    # attach group counts to the main dataframe
+    df = pd.merge(df, groupDF[["categories", "count", "unique"]], how="left", on="categories")
+
+    # keep only the categories that have at least min_unique_donors unique people
+    df = df[df["unique"] >= min_unique_donors]
+    groupDF = groupDF[groupDF["unique"] >= min_unique_donors]
+
+    # attach N and U to the categories
+    df["categoriesFull"] = (
+        df["categories"].astype(str) +
+        " (N=" + df["count"].astype(str) +
+        ", U=" + df["unique"].astype(str) +  ")"
+    )
+
+    return df, groupDF
+
+
+# %% define age and years living with bins
+group_title = "-jos-groups"
+figure_path = os.path.join(".", "figures")
+
+# next bin the data by age-ylw groups
+dataGroupName = "age-ylw-groups"
+
+# original age and ylw bins
+#ageBins = np.array([0,5,8,12,17,24,85])
+#ylwBins = np.array([-1,0,1,2,5,10,25,75])
+
+#catColors = [
+#    '#f0d8e5','#f4bdd8','#f7a0cc','#f781bf',
+#    '#ebc3c1','#f1a095','#f17d6c','#ec5644','#e41a1c',
+#    '#f2d8c3','#fbc299','#ffac6f','#ff9746','#ff7f00',
+#    '#d0e1cc','#b8d8b2','#9fcd97','#86c37e','#6cb964','#4daf4a',
+#    '#c9d6e3','#afc4da','#95b1d2','#7aa0c9','#5b8fc1','#377eb8',
+#    '#dacbde','#d0b6d4','#c5a1ca','#ba8dc0','#af78b7','#a464ad','#984ea3'
+#]
+#
+#finalCategories = [
+#        'age 01-05 ylw 00', 'age 01-05 ylw 01', 'age 01-05 ylw 02',
+#        'age 01-05 ylw 03-05', 'age 06-08 ylw 00', 'age 06-08 ylw 01',
+#        'age 06-08 ylw 02', 'age 06-08 ylw 03-05', 'age 06-08 ylw 06-10',
+#        'age 09-12 ylw 00', 'age 09-12 ylw 01', 'age 09-12 ylw 02',
+#        'age 09-12 ylw 03-05', 'age 09-12 ylw 06-10', 'age 13-17 ylw 00',
+#        'age 13-17 ylw 01', 'age 13-17 ylw 02', 'age 13-17 ylw 03-05',
+#        'age 13-17 ylw 06-10', 'age 13-17 ylw 11-25', 'age 18-24 ylw 00',
+#        'age 18-24 ylw 01', 'age 18-24 ylw 02', 'age 18-24 ylw 03-05',
+#        'age 18-24 ylw 06-10', 'age 18-24 ylw 11-25', 'age 25-85 ylw 00',
+#        'age 25-85 ylw 01', 'age 25-85 ylw 02', 'age 25-85 ylw 03-05',
+#        'age 25-85 ylw 06-10', 'age 25-85 ylw 11-25',
+#        'age 25-85 ylw 26-75'
+#]
+
+# jaeb obs study bins
+ageBins = np.array([-1,6,13,25,85])
+ylwBins = np.array([-1,1,5,75])
+
+# bin by age
+ageGroupNames = []
+for x, y in zip(ageBins[:-1]+1, ageBins[1:]):
+    ageGroupNames.append("%s-%s"%(f"{x:02d}", f"{y:02d}"))
+
+ylwGroupNames = []
+for x, y in zip(ylwBins[:-1]+1, ylwBins[1:]):
+    if x == y:
+        ylwGroupNames.append("%s"%(f"{x:02d}"))
+    else:
+        ylwGroupNames.append("%s-%s"%(f"{x:02d}", f"{y:02d}"))
+
+## 7 colors in each
+#oranges = ['#fdd0a2','#fdae6b','#fd8d3c','#f16913','#d94801','#a63603','#7f2704']
+#reds = ['#fcbba1','#fc9272','#fb6a4a','#ef3b2c','#cb181d','#a50f15','#67000d']
+#greens = ['#c7e9c0','#a1d99b','#74c476','#41ab5d','#238b45','#006d2c','#00441b']
+#blues = ['#c6dbef','#9ecae1','#6baed6','#4292c6','#2171b5','#08519c','#08306b']
+#purples = ['#dadaeb','#bcbddc','#9e9ac8','#807dba','#6a51a3','#54278f','#3f007d']
+#greys = ['#d9d9d9','#bdbdbd','#969696','#737373','#525252','#252525','#000000']
+
+# 3 colors in each
+#reds = ['#fcae91','#fb6a4a','#cb181d']
+oranges = ['#fdbe85','#fd8d3c','#d94701']
+greens = ['#bae4b3','#74c476','#238b45']
+blues = ['#bdd7e7','#6baed6','#2171b5']
+purples = ['#cbc9e2','#9e9ac8','#6a51a3']
+#greys = ['#cccccc','#969696','#525252']
+
+
+color_matrix = pd.DataFrame([oranges, greens, blues, purples])
+
+all_colors = np.reshape(color_matrix.values, -1)
+
+i = 0
+catColorDF = pd.DataFrame()
+for ai in range(0, len(ageGroupNames)):
+    for yi in range(0, len(ylwGroupNames)):
+        catColorDF.loc[i, "categories"] = "age %s ylw %s" %(ageGroupNames[ai], ylwGroupNames[yi])
+        catColorDF.loc[i, "allColors"] = all_colors[i]
+        i = i + 1
+
+
+
+# %% load in summary donor data
+dataPulledDate = "2019-01-10"
+dataProcessedDate = "2019-01-22"
+
+phiDate = "PHI-" + dataPulledDate
+donorPath = os.path.join("..", "bigdata-processing-pipeline", "data", phiDate + "-donor-data")
+donorList = phiDate + "-uniqueDonorList"
+donors = pd.read_csv(os.path.join(donorPath, donorList + ".csv"), low_memory=False)
+
+
+# %% all-donors summary
+allAgeSummary = pd.DataFrame()
+dataPath = os.path.join(donorPath, "settings-and-events")
+d = pd.read_csv(os.path.join(dataPath, "combined-allMetadata.csv"), low_memory=False)
+
+# attach the donor level data to the combined metadata
+allMetadata = pd.merge(
+    d,
+    donors[[
+        "hashID",
+        "userID",
+        "diagnosisType",
+        "targetDevices",
+        "targetTimezone",
+        "termsAccepted"
+    ]],
+    how="left",
+    on="hashID"
+)
+allMetadata.to_csv(os.path.join(donorPath, donorList + "-w-metaData.csv"))
+
+
+# %% load data
+dayData = pd.read_csv(os.path.join(dataPath, "combined-dayData.csv"), low_memory=False)
+bolusData = pd.read_csv(os.path.join(dataPath, "combined-bolusEvents.csv"), low_memory=False)
+basalData = pd.read_csv(os.path.join(dataPath, "combined-basalEvents.csv"), low_memory=False)
+
+# %% attach the diagnosis type to the day data
+dayDF = pd.merge(
+    dayData,
+    allMetadata[[
+        "hashID",
+        "diagnosisType",
+        "pump.top",
+        "pumpSettings.isfLikelyUnits"
+    ]],
+    how="left",
+    on="hashID"
+)
+
+dayDF = filter_data(dayDF, min_days_criteria=7)
+dayDF, dayDFGroupSummary = (
+    bin_data(
+        dayDF,
+        ageBins,
+        ageGroupNames,
+        ylwBins,
+        ylwGroupNames,
+        catColorDF,
+        min_unique_donors=10
+    )
+)
+
+
+# %% all-event level summary (max basal and max bolus)
+# attach the day to bolus data and filter data by analysis criteria
+# NOTE: see the filter_data function for details
+bolus = merge_dayData(bolusData, dayDF)
+bolus = filter_data(bolus, min_days_criteria=7)
+bolus, bolusGroupSummary = (
+    bin_data(
+        bolus,
+        ageBins,
+        ageGroupNames,
+        ylwBins,
+        ylwGroupNames,
+        catColorDF,
+        min_unique_donors=10
+    )
+)
+
+
+# %% overview of bolus data table
+figName = "overviewTable-bolus-events"
+figName = figName + group_title
+trace = go.Table(
+    header=dict(
+        values=make_bold(["AGE-YLW Group",
+                          "Age",
+                          "Years Living with T1D",
+                          "N (Bolus Events)",
+                          "U (Unique Donors)"]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=14)
+    ),
+    cells=dict(
+        values=[make_bold(bolusGroupSummary['categories']),
+                make_bold(bolusGroupSummary['ageCategories']),
+                make_bold(bolusGroupSummary['ylwCategories']),
+                make_bold(bolusGroupSummary['count']),
+                make_bold(bolusGroupSummary['unique'])],
+        fill = dict(color = [bolusGroupSummary["allColors"]]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=11),
+        height = 22
+    ),
+)
+
+fig = go.Figure()
+fig.add_trace(trace)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        figure_path,
+        figName + "-highRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=4)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        figure_path,
+        figName + "-lowRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=1)
+
+
+# %% max bolus amount ()
+maxBolus = pd.DataFrame(bolus.groupby(["hashID", "day"])["unitsInsulin"].max()).reset_index()
+maxBolus.rename(columns={"unitsInsulin":"maxBolusPerDay"}, inplace=True)
+
+maxBolus = pd.merge(
+    maxBolus,
+    dayDF[[
+        "hashID",
+        "day",
+        "categories",
+        "allColors"
+    ]],
+    how="left",
+    on=["hashID", "day"]
+)
+
+# remove nans in category as they represent data from days that did not meet the
+# acceptable day standard
+maxBolus = maxBolus[maxBolus["categories"].notnull()]
+
+field = 'maxBolusPerDay'
+yLabel = "Max Bolus Per Day (U)"
+figName = "Max Bolus"
+yMin = 0
+yMax = 21
+filteredDF = maxBolus[maxBolus[field] > 0].copy()
+
+## make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+# add N events and n unique donors
+filteredDF = pd.merge(
+    filteredDF,
+    summaryTable[[
+        "categories",
+        "count",
+        "unique"
+    ]],
+    how="left",
+    on="categories"
+)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% basal data
+basal = merge_dayData(basalData, dayDF)
+basal = filter_data(basal, min_days_criteria=7)
+basal, basalGroupSummary = (
+    bin_data(
+        basal,
+        ageBins,
+        ageGroupNames,
+        ylwBins,
+        ylwGroupNames,
+        catColorDF,
+        min_unique_donors=10
+    )
+)
+
+
+# %% overview of basal data table
+figName = "overviewTable-basal-events"
+figName = figName + group_title
+
+trace = go.Table(
+    header=dict(
+        values=make_bold(["AGE-YLW Group",
+                          "Age",
+                          "Years Living with T1D",
+                          "N (Basal Events)",
+                          "U (Unique Donors)"]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=14)
+    ),
+    cells=dict(
+        values=[make_bold(basalGroupSummary['categories']),
+                make_bold(basalGroupSummary['ageCategories']),
+                make_bold(basalGroupSummary['ylwCategories']),
+                make_bold(basalGroupSummary['count']),
+                make_bold(basalGroupSummary['unique'])],
+        fill = dict(color = [basalGroupSummary["allColors"]]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=11),
+        height = 22
+    ),
+)
+
+fig = go.Figure()
+fig.add_trace(trace)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        figure_path,
+        figName + "-highRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=4)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        figure_path,
+        figName + "-lowRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=1)
+
+
+# %% max basal rate
+maxBasal = pd.DataFrame(basal[basal["type"]=="basal"].groupby(["hashID", "day"])["rate"].max()).reset_index()
+
+maxBasal.rename(columns={"rate":"maxBasalRatePerDay"}, inplace=True)
+
+maxBasal = pd.merge(
+    maxBasal,
+    dayDF[[
+        "hashID",
+        "day",
+        "categories",
+        "allColors"
+    ]],
+    how="left",
+    on=["hashID", "day"]
+)
+
+# remove nans in category as they represent data from days that did not meet the
+# acceptable day standard
+maxBasal = maxBasal[maxBasal["categories"].notnull()]
+
+field = 'maxBasalRatePerDay'
+yLabel = "Max Basal Per Day (U/hr)"
+figName = "Max Basal"
+yMin = 0
+yMax = 3.25
+filteredDF = maxBasal[maxBasal[field] > 0].copy()
+
+## make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+# add N events and n unique donors
+filteredDF = pd.merge(
+    filteredDF,
+    summaryTable[[
+        "categories",
+        "count",
+        "unique"
+    ]],
+    how="left",
+    on="categories"
+)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% overview of day level data table
+figName = "overviewTable-day-data"
+figName = figName + group_title
+
+trace = go.Table(
+    header=dict(
+        values=make_bold(["AGE-YLW Group",
+                          "Age",
+                          "Years Living with T1D",
+                          "N (Days)",
+                          "U (Unique Donors)"]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=14)
+    ),
+    cells=dict(
+        values=[make_bold(dayDFGroupSummary['categories']),
+                make_bold(dayDFGroupSummary['ageCategories']),
+                make_bold(dayDFGroupSummary['ylwCategories']),
+                make_bold(dayDFGroupSummary['count']),
+                make_bold(dayDFGroupSummary['unique'])],
+        fill = dict(color = [dayDFGroupSummary["allColors"]]),
+        align = ['center', 'center', 'center'],
+        font = dict(color = 'black', size=11),
+        height = 22
+    ),
+)
+
+fig = go.Figure()
+fig.add_trace(trace)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        figure_path,
+        figName + "-highRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=4)
+
+pio.write_image(
+    fig,
+    os.path.join(
+        figure_path,
+        figName + "-lowRes.png"
+    ),
+    width=1200,
+    height=1200,
+    scale=1)
+
+
+# %% Average ISF per day
+dayDF["isfRounded"] = dayDF['isf.weightedMean'].round(1)
+field = 'isfRounded'
+yLabel = "Insulin Sensitivity Factor (mg/dL/U)"
+figName = "Insulin Sensitivity Factor"
+yMin = 0
+yMax = 400
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average CIR per day
+field = 'cir.weightedMean'
+yLabel = "Carb to Insulin Ratio (g/U)"
+figName = "Carb to Insulin Ratio"
+yMin = 0
+yMax = 70
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average Correction Target per day
+field = 'ct.target.weightedMean'
+yLabel = "Correction Target (mg/dL)"
+figName = "Correction Target"
+yMin = 70
+yMax = 180
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average Basal Rate per day
+field = 'sbr.weightedMean'
+yLabel = "Scheduled Basal Rate (U/hr)"
+figName = "Scheduled Basal Rate"
+yMin = 0
+yMax = 2.5
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 3
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Total Daily Dose
+field = "totalAmountOfInsulin"
+yLabel = "Total Daily Dose (U)"
+figName = "Total Daily Dose"
+yMin = 0
+yMax = 125
+filteredDF = dayDF[dayDF[field] > 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Percent Basal
+dayDF["perecentBasalInPercent"] = dayDF["percentBasal"] * 100
+field = "perecentBasalInPercent"
+yLabel = "Basal Proportion of Total Daily Dose (%)"
+figName = "Basal Proportion of Total Daily Dose"
+yMin = 0
+yMax = 100
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Total Daily Carbs
+field = "totalDailyCarbs"
+yLabel = "Total Daily Carbs (g)"
+figName = "Total Daily Carbs"
+yMin = 0
+yMax = 600
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Daily Time in Range (70-180 mg/dL)
+dayDF["perecentInRange"] = dayDF["cgm.percent70to180"] * 100
+field = "perecentInRange"
+yLabel = "Percent of Day in Targe Range (70-180 mg/dL, %)"
+figName = "Percent of Day in Targe Range 70-180"
+yMin = 0
+yMax = 100
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Mean CGM (mg/dL)
+field = "cgm.mean_mgdL"
+yLabel = "Daily Average CGM Level (mg/dL)"
+figName = "Daily Average CGM Level"
+yMin = 50
+yMax = 300
+filteredDF = dayDF[dayDF[field] >= 0].copy()  # NaN and negative rows fail the comparison and are dropped
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# (disabled) append this field's all-age table to allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+# make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive boxplot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Cov CGM (mg/dL)
+dayDF["covPercent"] = dayDF["cgm.cov_mgdL"] * 100
+field = "covPercent"
+yLabel = "Coeffient of Variation (%)"
+figName = "Coeffient of Variation"
+yMin = 6
+yMax = 62
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Daily Time Below 54 (Percentage)
+dayDF["perecentBelow54mgdL"] = dayDF["cgm.percentBelow54"] * 100  # scaled by 100 for the percent axis
+field = "perecentBelow54mgdL"
+yLabel = "Percent of Day Below 54 mg/dL (%)"
+figName = "Percent of Day in Extreme Hypo Below 54 mgdL"
+yMin = 0
+yMax = 5
+filteredDF = dayDF[dayDF[field] >= 0].copy()  # NaN and negative rows fail the comparison and are dropped
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 2
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# (disabled) append this field's all-age table to allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+# make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive boxplot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Number of Below 54 mg/dL Episodes per Day
+field = "extreme-hypo.count"
+dayDF[field].fillna(0, inplace=True)
+yLabel = "Number of Extreme Hypo Episodes (Below 54 mg/dL) per Day"
+figName =  "Number of Extreme Hypo Episodes per Day"
+yMin = 0
+yMax = 2
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average Duration of each Episode Below 54 mg/dL
+field = "extreme-hypo-durationMinutes.mean"
+yLabel = "Average Duration of each Extreme Hypo Episode (minutes)"
+figName =  "Average Duration of each Extreme Hypo Episode"
+yMin = 15
+yMax = 120
+filteredDF = dayDF[dayDF[field] >= 0].copy()  # rows with a NaN mean duration fail the comparison and are dropped
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# (disabled) append this field's all-age table to allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+# make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive boxplot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Daily Time Above 250 (Percentage)
+dayDF["perecentAbove250mgdL"] = dayDF["cgm.percentAbove250"] * 100  # scaled by 100 for the percent axis
+field = "perecentAbove250mgdL"
+yLabel = "Percent of Day Above 250 mg/dL (%)"
+figName = "Percent of Day in Extreme Hyper Above 250 mgdL"
+yMin = 0
+yMax = 75
+filteredDF = dayDF[dayDF[field] >= 0].copy()  # NaN and negative rows fail the comparison and are dropped
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 0
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# (disabled) append this field's all-age table to allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+# make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive boxplot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Number of Above 250 mg/dL Episodes per Day
+field = "extreme-hyper.count"
+dayDF[field].fillna(0, inplace=True)
+yLabel = "Number of Extreme Hyper Episodes (Above 250 mg/dL) per Day"
+figName =  "Number of Extreme Hyper Episodes per Day"
+yMin = 0
+yMax = 2
+filteredDF = dayDF[dayDF[field] >= 0].copy()
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# add the ageTable to the allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+## make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive plot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% Average Duration of each Episode Above 250 mg/dL
+dayDF["avgExtremeHyperHours"] = dayDF["extreme-hyper-durationMinutes.mean"] / 60  # minutes -> hours
+field = "avgExtremeHyperHours"
+yLabel = "Average Duration of each Extreme Hyper Episode (hours)"
+figName =  "Average Duration of each Extreme Hyper Episode"
+yMin = 2
+yMax = 10
+filteredDF = dayDF[dayDF[field] >= 0].copy()  # rows with a NaN mean duration fail the comparison and are dropped
+
+# make/save static summary table
+figName = figName + group_title
+nDecimalPlaces = 1
+summaryTable, allAgeTable = make_static_table(
+    field,
+    figName,
+    filteredDF,
+    nDecimalPlaces,
+    return_summaryTable=True
+)
+
+# (disabled) append this field's all-age table to allAgeSummary
+#allAgeSummary = pd.concat([allAgeSummary, allAgeTable], axis=0)
+
+# make/save static boxplot
+make_static_plot(field, yLabel, figName, filteredDF, yMin, yMax)
+
+# make lite interactive boxplot
+make_lite_interactive_boxplot(field, yLabel, filteredDF, yMin, yMax)
+
+
+# %% save the all age summaries
+figName = "allAgeSettingSummary" + group_title  # NOTE(review): the nearby concat calls into allAgeSummary are commented out -- confirm what this table holds
+allAgeSummary.to_csv(
+    os.path.join(
+        figure_path,
+        figName + "-all-age-table.csv"  # written as <figure_path>/<figName>-all-age-table.csv
+    )
+)
+
+
+# %% make a plot of TDD by ISF
+# per-day weighted-mean ISF, rounded to one decimal place
+dayDF["isfRounded"] = dayDF['isf.weightedMean'].round(1)
+
+filteredDF = dayDF[((dayDF['isfRounded'] > 0) &
+                    (dayDF['totalAmountOfInsulin'] > 0))].copy()  # keep rows with positive ISF and positive insulin total
+
+x = np.arange(1, 500)  # integer ISF values 1..499
+c = pd.DataFrame(columns=["ISF", "TDD"])  # median TDD for each ISF value
+for xi in x:  # NOTE(review): isfRounded keeps one decimal, so only whole-number ISFs can equal xi
+    if sum(filteredDF['isfRounded'] == xi) > 3:  # only ISF values present on more than 3 rows
+        c.loc[xi, "ISF"] = xi
+        c.loc[xi, "TDD"] = filteredDF.loc[
+            filteredDF['isfRounded'] == xi,
+            "totalAmountOfInsulin"].median()
+
+trend_by_isf = c.rolling(25, center=True).mean()  # centered 25-row rolling mean of the medians
+
+x = np.arange(1, 300)  # integer TDD values 1..299
+d = pd.DataFrame(columns=["TDD", "ISF"])  # median ISF for each rounded TDD value
+for xi in x:
+    if sum(filteredDF['totalAmountOfInsulin'].round() == xi) > 3:  # only TDD values present on more than 3 rows
+        d.loc[xi, "TDD"] = xi
+        d.loc[xi, "ISF"] = filteredDF.loc[
+            filteredDF['totalAmountOfInsulin'].round() == xi,
+            "isfRounded"].median()
+
+# then smooth out the medians with a centered 10-row rolling mean
+trend_by_tdd = d.rolling(10, center=True).mean()
+
+traces = []
+
+for yd in catColorDF.categories.unique():  # one marker trace per category
+    traces.append(go.Scattergl(
+            y=filteredDF.loc[filteredDF["categories"] == yd, 'isfRounded'],
+            x=filteredDF.loc[filteredDF["categories"] == yd, 'totalAmountOfInsulin'].round(),
+            name=yd,
+            mode='markers',
+            marker=dict(
+                color=filteredDF.loc[filteredDF["categories"] == yd, 'allColors'],  # per-row colors from the allColors column
+                opacity=0.5,
+            ),
+    ))
+
+traces.append(go.Scattergl(  # dotted line: smoothed median ISF at each TDD
+        y=trend_by_tdd["ISF"],
+        x=trend_by_tdd["TDD"],
+        mode='lines',
+        name="Trend by TDD",
+        line=dict(
+            color="black",
+            dash="dot",
+        ),
+))
+
+traces.append(go.Scattergl(  # dashed line: smoothed median TDD at each ISF
+        y=trend_by_isf["ISF"],
+        x=trend_by_isf["TDD"],
+        mode='lines',
+        name="Trend by ISF",
+        line=dict(
+            color="black",
+            dash="dash",
+        ),
+))
+
+layout = go.Layout(
+    font=dict(
+        size=18
+    ),
+    xaxis=dict(
+        title="TDD",
+        dtick=20,
+        range=[0, 300],  # matches the 1..299 TDD values scanned above
+        showgrid=True,
+        gridcolor='#f1f3f4',
+        gridwidth=2,
+        zeroline=True,
+        zerolinecolor='#f1f3f4',
+        zerolinewidth=2,
+    ),
+    yaxis=dict(
+        title="ISF",
+        dtick=20,
+        range=[0, 500],  # matches the 1..499 ISF values scanned above
+        showgrid=True,
+        gridcolor='#f1f3f4',
+        gridwidth=2,
+        zeroline=True,
+        zerolinecolor='#f1f3f4',
+        zerolinewidth=2,
+    )
+)
+figName = "ISFbyTDD"
+fig = go.Figure(data=traces, layout=layout)
+plot_url = py.plot(fig, filename=figName, auto_open=False)  # presumably publishes the figure and returns its URL -- confirm what `py` is bound to
+print(figName, plot_url)