From c60d7aa5b8efaea97e24bfc018cfdd93a849c289 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 31 Aug 2022 13:35:01 -0500
Subject: [PATCH 01/58] try staging data

---
 .../src/Utils/DataStaging/StagedFeatures.c3typ        | 11 +++++++++++
 1 file changed, 11 insertions(+)
 create mode 100644 training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
new file mode 100644
index 00000000..41903dff
--- /dev/null
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
@@ -0,0 +1,11 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+* This type hosts data for models that cannot obtain Features 
+* directly from other entity types.
+*/
+entity type StagedFeatures schema name "STGD_FTRS" {
+    feature: map<string, double> suffix name "FTR"
+}

From cbc7b46af1852d70876533bd79c74b1ae62ea97a Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 31 Aug 2022 13:38:44 -0500
Subject: [PATCH 02/58] lil fix

---
 .../gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
index 41903dff..3e227a38 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
@@ -7,5 +7,5 @@
 * directly from other entity types.
 */
 entity type StagedFeatures schema name "STGD_FTRS" {
-    feature: map<string, double> suffix name "FTR"
+    feature: map<string, double> schema suffix "FTR"
 }

From 56cbdb2bb543b40ee8bbcf1c7fff6df2c64d8696 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 31 Aug 2022 14:22:51 -0500
Subject: [PATCH 03/58] frankensteining

---
 .../GaussianProcessRegressionPipe.py          | 20 ++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index 8f032df0..ece71b5d 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -71,8 +71,26 @@ def getFeatures(this):
     import pandas as pd
 
     dataSourceSpec = c3.GPRDataSourceSpec.get(this.dataSourceSpec.id)
-
     featuresType = dataSourceSpec.featuresType.toType()
+    
+    if (featuresType.name == "StagedFeatures"):
+        features = c3.StagedFeatures.fetch({
+            "limit": -1,
+            "order": "id"
+        }).objs.toJson()
+
+        df = pd.DataFrame(features)
+        keys = df.iloc[0]["feature"].keys()
+
+        for key in keys:
+            df[key] = df["feature"].apply(lambda x: x[key])
+        
+        df.drop("version", axis=1, inplace=True)
+        df = df.select_dtypes(["number"])
+
+        return c3.Dataset.fromPython(df)
+
+
     inputTableC3 = featuresType.fetch(dataSourceSpec.featuresSpec).objs.toJson()
     inputTablePandas = pd.DataFrame(inputTableC3)
     inputTablePandas = inputTablePandas.drop("version", axis=1)

From 2425ad738fd9d91f6a8be0f9657301c1123e993d Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 31 Aug 2022 14:41:04 -0500
Subject: [PATCH 04/58] try targets now

---
 .../GaussianProcessRegressionPipe.py          | 24 ++++++++++++++++---
 .../Utils/DataStaging/StagedFeatures.c3typ    |  2 +-
 .../src/Utils/DataStaging/StagedTargets.c3typ | 11 +++++++++
 3 files changed, 33 insertions(+), 4 deletions(-)
 create mode 100644 training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index ece71b5d..5b5b56f9 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -80,10 +80,10 @@ def getFeatures(this):
         }).objs.toJson()
 
         df = pd.DataFrame(features)
-        keys = df.iloc[0]["feature"].keys()
+        keys = df.iloc[0]["features"].keys()
 
         for key in keys:
-            df[key] = df["feature"].apply(lambda x: x[key])
+            df[key] = df["features"].apply(lambda x: x[key])
         
         df.drop("version", axis=1, inplace=True)
         df = df.select_dtypes(["number"])
@@ -112,8 +112,26 @@ def getTarget(this):
     import pandas as pd
 
     dataSourceSpec = c3.GPRDataSourceSpec.get(this.dataSourceSpec.id)
-
     targetType = dataSourceSpec.targetType.toType()
+
+    if (targetType.name == "StagedTargets"):
+        targets = c3.StagedTargets.fetch({
+            "limit": -1,
+            "order": "id"
+        }).objs.toJson()
+
+        df = pd.DataFrame(targets)
+        keys = df.iloc[0]["targets"].keys()
+
+        for key in keys:
+            df[key] = df["targets"].apply(lambda x: x[key])
+        
+        df.drop("version", axis=1, inplace=True)
+        df = df.select_dtypes(["number"])
+
+        return c3.Dataset.fromPython(df)
+ 
+        
     outputTableC3 = targetType.fetch(dataSourceSpec.targetSpec).objs.toJson()
     outputTablePandas = pd.DataFrame(outputTableC3)
     outputTablePandas = outputTablePandas.drop("version", axis=1)
diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
index 3e227a38..55f5f2ba 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
@@ -7,5 +7,5 @@
 * directly from other entity types.
 */
 entity type StagedFeatures schema name "STGD_FTRS" {
-    feature: map<string, double> schema suffix "FTR"
+    features: map<string, double> schema suffix "FTR"
 }
diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ
new file mode 100644
index 00000000..9f861106
--- /dev/null
+++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ
@@ -0,0 +1,11 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+* This type hosts data for models that cannot obtain Targets
+* directly from other entity types.
+*/
+entity type StagedTargets schema name "STGD_TRGTS" {
+    targets: map<string, double> schema suffix "TRGT"
+}

From 3e2601cf962257469b5fc92be8ca002453c4b357 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Fri, 16 Sep 2022 09:36:56 -0500
Subject: [PATCH 05/58] adding target center

---
 .../GPRegression/GaussianProcessRegressionPipe.py  | 14 +++++++++++++-
 .../GaussianProcessRegressionTechnique.c3typ       |  3 +++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index 5b5b56f9..ad4d1542 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -12,6 +12,10 @@ def train(this, input, targetOutput, spec):
     X = c3.Dataset.toNumpy(dataset=input)
     y = c3.Dataset.toNumpy(dataset=targetOutput)
 
+    if (technique.centerTarget):
+        y = y - y.mean()
+    
+
     # get kernel object from c3, make it python again
     kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel)
 
@@ -19,7 +23,15 @@ def train(this, input, targetOutput, spec):
     gp = GaussianProcessRegressor(kernel=kernel)
     gp.fit(X, y)
 
-    this.trainedModel = c3.MLTrainedModelArtifact(model=c3.PythonSerialization.serialize(obj=gp))
+    if (technique.centerTarget):
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+            targetMean=float(y.mean())
+        )
+    else:
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+        )
 
     return this
 
diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ
index 71915854..61a975a3 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ
@@ -10,4 +10,7 @@ entity type GaussianProcessRegressionTechnique mixes MLTechnique schema name 'GP
     // the kernel object
     @ML(hyperParameter=true)
     kernel: SklearnGPRKernel
+    // center target data before fitting
+    @ML(hyperParameter=true)
+    centerTarget: boolean=false
 }
\ No newline at end of file

From 7029e8726750161ccf55ed9f98d8ff4fbf6d1625 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Fri, 16 Sep 2022 09:46:39 -0500
Subject: [PATCH 06/58] lil fix

---
 .../GPRegression/GaussianProcessRegressionPipe.py             | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index ad4d1542..fdb7b847 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -24,9 +24,11 @@ def train(this, input, targetOutput, spec):
     gp.fit(X, y)
 
     if (technique.centerTarget):
+        params = {}
+        params["targetMean"] = float(y.mean())
         this.trainedModel = c3.MLTrainedModelArtifact(
             model=c3.PythonSerialization.serialize(obj=gp),
-            targetMean=float(y.mean())
+            parameterss=params
         )
     else:
         this.trainedModel = c3.MLTrainedModelArtifact(

From b1f76b650215b785f90ee845e25bdb365b83a8ff Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Fri, 16 Sep 2022 09:50:05 -0500
Subject: [PATCH 07/58] typo

---
 .../GPRegression/GaussianProcessRegressionPipe.py               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index fdb7b847..b1282bc5 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -28,7 +28,7 @@ def train(this, input, targetOutput, spec):
         params["targetMean"] = float(y.mean())
         this.trainedModel = c3.MLTrainedModelArtifact(
             model=c3.PythonSerialization.serialize(obj=gp),
-            parameterss=params
+            parameters=params
         )
     else:
         this.trainedModel = c3.MLTrainedModelArtifact(

From 33f8e382d834ef3c7b420fb34dd665236778c834 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Fri, 16 Sep 2022 09:56:25 -0500
Subject: [PATCH 08/58] another fix

---
 .../GPRegression/GaussianProcessRegressionPipe.py              | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index b1282bc5..c1984f93 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -13,6 +13,7 @@ def train(this, input, targetOutput, spec):
     y = c3.Dataset.toNumpy(dataset=targetOutput)
 
     if (technique.centerTarget):
+        targetMean = float(y.mean())
         y = y - y.mean()
     
 
@@ -25,7 +26,7 @@ def train(this, input, targetOutput, spec):
 
     if (technique.centerTarget):
         params = {}
-        params["targetMean"] = float(y.mean())
+        params["targetMean"] = targetMean
         this.trainedModel = c3.MLTrainedModelArtifact(
             model=c3.PythonSerialization.serialize(obj=gp),
             parameters=params

From 392c4ee6ee1d26c9df70e13206724dff12de8d03 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Fri, 23 Sep 2022 10:24:29 -0500
Subject: [PATCH 09/58] add centerTarget to getPipe method

---
 training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js b/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js
index be6bbd64..6ac6c284 100644
--- a/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js
+++ b/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js
@@ -23,7 +23,8 @@ function getPipe(excFeats, gstpId, targetName, technique) {
     }).objs.map(obj => obj.id);
 
     // find the techniques
-    filter = Filter.intersects("kernel.id", kernelIds);
+    filter = Filter.intersects("kernel.id", kernelIds)
+        .and().eq("technique.centerTarget", technique.centerTarget);
     var techIds = GaussianProcessRegressionTechnique.fetch({
         "filter": filter.value,
         "limit": -1,

From ce27a669229e2d3a9a4674c96a4f0b39a08778fe Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Mon, 26 Sep 2022 10:09:36 -0500
Subject: [PATCH 10/58] call fix

---
 training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js b/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js
index 6ac6c284..29fe2001 100644
--- a/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js
+++ b/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js
@@ -24,7 +24,7 @@ function getPipe(excFeats, gstpId, targetName, technique) {
 
     // find the techniques
     filter = Filter.intersects("kernel.id", kernelIds)
-        .and().eq("technique.centerTarget", technique.centerTarget);
+        .and().eq("centerTarget", technique.centerTarget);
     var techIds = GaussianProcessRegressionTechnique.fetch({
         "filter": filter.value,
         "limit": -1,

From a43051c47854814dacf2a51a3a78f727dab03072 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 12:44:14 -0500
Subject: [PATCH 11/58] first stab at method

---
 .../Utils/DataStaging/StagedFeatures.c3typ    |  4 ++
 .../src/Utils/DataStaging/StagedFeatures.py   | 65 +++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 training/gordon-group/src/Utils/DataStaging/StagedFeatures.py

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
index 55f5f2ba..d43eeeef 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
@@ -7,5 +7,9 @@
 * directly from other entity types.
 */
 entity type StagedFeatures schema name "STGD_FTRS" {
+    // the features to be staged
     features: map<string, double> schema suffix "FTR"
+    // method to stage from gstp list
+    @py(env='gordon_1_0_0')
+    stageFromAODGPRModelIdlist: function(ids: !any): integer
 }
diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
new file mode 100644
index 00000000..d65a691e
--- /dev/null
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
@@ -0,0 +1,65 @@
+def stageFromAODGPRModelIdsList(ids):
+    """
+    Given a list of GaussianProcessRegressionPipes trained with
+    AOD data, stage the features for each model.
+
+    Input:
+        ids: list of model ids
+
+    Return:
+        int: zero if it worked, 1 if it failed
+    """
+    import pandas as pd
+
+    # get data from dataSourceSpec one model
+    model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec")
+    data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id)
+    excludeFeatures = data_source_spec.excludeFeatures
+    featuresType = data_source_spec.featuresType.toType()
+    inputTableC3 = featuresType.fetch(data_source_spec.featuresSpec).objs.toJson()
+    inputTable = pd.DataFrame(inputTableC3)
+    inputTable = inputTable.drop("version", axis=1)
+    inputTable = inputTable.select_dtypes(["number"])
+    if (dataSourceSpec.excludeFeatures):
+        inputTable.drop(columns=dataSourceSpec.excludeFeatures, inplace=True)
+
+    # get gstp coordinates from each model
+    lats = []
+    lons = []
+    times = []
+    for model_id in ids:
+        model = c3.GaussianProcessRegressionPipe.get(model_id, "dataSourceSpec")
+        data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec")
+        gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '')
+        gstp = c3.GeoSurfaceTimePoint.get(gstp_id)
+        lats.append(gstp.latitude)
+        lons.append(gstp.longitude)
+        times.append(gstp.time)
+
+    def row_to_dict(row):
+        d = {}
+        for col in row.index:
+            d[col] = row[col]
+        return d
+
+    def add_coords(obj, lat, lon, time):
+        obj["latitude"] = lat
+        obj["longitude"] = lon
+        obj["time"] = time
+        return
+    
+    # build dataframe
+    df_sim_par = pd.DataFrame()
+    df_sim_par["features"] = inputTable.apply(row_to_dict, axis=1)
+
+    df = pd.DataFrame()
+    for i in range(len(lats)):
+        df_to_add = df_sim_par.copy()
+        df_to_add["features"].apply(add_coords, args=(lats[i], lons[i], times[i]))
+        df = pd.concat([df,df_to_add], ignore_index=True)
+
+    df["id"] = df.index
+    output_records = df.to_dict(orient="records")
+    c3.StagedFeatures.upsertBatch(objs=output_records)
+
+    return 0
\ No newline at end of file

From f6bf0800b54914d3c21abf54f9ecff127740b50e Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 12:53:41 -0500
Subject: [PATCH 12/58] typo

---
 .../gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
index d43eeeef..d0bd8c42 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
@@ -11,5 +11,5 @@ entity type StagedFeatures schema name "STGD_FTRS" {
     features: map<string, double> schema suffix "FTR"
     // method to stage from gstp list
     @py(env='gordon_1_0_0')
-    stageFromAODGPRModelIdlist: function(ids: !any): integer
+    stageFromAODGPRModelIdsList: function(ids: !any): integer
 }

From 0d3cea2b89d48f541a4548d42c11c06b40eb3363 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 12:59:11 -0500
Subject: [PATCH 13/58] fix syntax bug

---
 training/gordon-group/src/Utils/DataStaging/StagedFeatures.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
index d65a691e..6bc21b60 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
@@ -20,8 +20,8 @@ def stageFromAODGPRModelIdsList(ids):
     inputTable = pd.DataFrame(inputTableC3)
     inputTable = inputTable.drop("version", axis=1)
     inputTable = inputTable.select_dtypes(["number"])
-    if (dataSourceSpec.excludeFeatures):
-        inputTable.drop(columns=dataSourceSpec.excludeFeatures, inplace=True)
+    if (excludeFeatures):
+        inputTable.drop(columns=excludeFeatures, inplace=True)
 
     # get gstp coordinates from each model
     lats = []

From 22da146aa8f6c451422b81532c4744e22f771440 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 13:03:52 -0500
Subject: [PATCH 14/58] making it more flexible

---
 .../gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
index d0bd8c42..98f8179f 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
@@ -8,7 +8,7 @@
 */
 entity type StagedFeatures schema name "STGD_FTRS" {
     // the features to be staged
-    features: map<string, double> schema suffix "FTR"
+    features: map<string, any> schema suffix "FTR"
     // method to stage from gstp list
     @py(env='gordon_1_0_0')
     stageFromAODGPRModelIdsList: function(ids: !any): integer

From dc1fa70acc4d671b81265b55f594542322c36bcf Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 13:12:18 -0500
Subject: [PATCH 15/58] try without timestamps

---
 .../gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ  | 2 +-
 .../gordon-group/src/Utils/DataStaging/StagedFeatures.py     | 5 ++---
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
index 98f8179f..d0bd8c42 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
@@ -8,7 +8,7 @@
 */
 entity type StagedFeatures schema name "STGD_FTRS" {
     // the features to be staged
-    features: map<string, any> schema suffix "FTR"
+    features: map<string, double> schema suffix "FTR"
     // method to stage from gstp list
     @py(env='gordon_1_0_0')
     stageFromAODGPRModelIdsList: function(ids: !any): integer
diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
index 6bc21b60..71446df6 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
@@ -42,10 +42,9 @@ def row_to_dict(row):
             d[col] = row[col]
         return d
 
-    def add_coords(obj, lat, lon, time):
+    def add_coords(obj, lat, lon):
         obj["latitude"] = lat
         obj["longitude"] = lon
-        obj["time"] = time
         return
     
     # build dataframe
@@ -55,7 +54,7 @@ def add_coords(obj, lat, lon, time):
     df = pd.DataFrame()
     for i in range(len(lats)):
         df_to_add = df_sim_par.copy()
-        df_to_add["features"].apply(add_coords, args=(lats[i], lons[i], times[i]))
+        df_to_add["features"].apply(add_coords, args=(lats[i], lons[i]))
         df = pd.concat([df,df_to_add], ignore_index=True)
 
     df["id"] = df.index

From abae3a949a764313f8378d93bb0f9b1552833963 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 13:20:03 -0500
Subject: [PATCH 16/58] clear table before start

---
 training/gordon-group/src/Utils/DataStaging/StagedFeatures.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
index 71446df6..6f11cce0 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
@@ -11,6 +11,8 @@ def stageFromAODGPRModelIdsList(ids):
     """
     import pandas as pd
 
+    c3.StagedFeatures.removeAll()
+
     # get data from dataSourceSpec one model
     model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec")
     data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id)

From ee487af89fe4c34a5816370b7089168ebb0db709 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 13:51:21 -0500
Subject: [PATCH 17/58] okay

---
 .../src/Utils/DataStaging/StagedTargets.c3typ |  4 ++
 .../src/Utils/DataStaging/StagedTargets.py    | 51 +++++++++++++++++++
 2 files changed, 55 insertions(+)
 create mode 100644 training/gordon-group/src/Utils/DataStaging/StagedTargets.py

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ
index 9f861106..6d31dcab 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ
+++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ
@@ -7,5 +7,9 @@
 * directly from other entity types.
 */
 entity type StagedTargets schema name "STGD_TRGTS" {
+    // the staged targets
     targets: map<string, double> schema suffix "TRGT"
+    // method to stage from gstp list
+    @py(env='gordon_1_0_0')
+    stageFromAODGPRModelIdsList: function(ids: !any): integer
 }
diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py
new file mode 100644
index 00000000..74d45700
--- /dev/null
+++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py
@@ -0,0 +1,51 @@
+def stageFromAODGPRModelIdsList(ids):
+    """
+    Given a list of GaussianProcessRegressionPipes trained with
+    AOD data, stage the targets for each model.
+
+    Input:
+        ids: list of model ids
+
+    Return:
+        int: zero if it worked, 1 if it failed
+    """
+    import pandas as pd
+
+    c3.StagedTargets.removeAll()
+
+    model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec")
+    data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id)
+    target_type = data_source_spec.targetType.toType()
+
+    df = pd.DataFrame()
+
+    for model_id in ids:
+        model = c3.GaussianProcessRegressionPipe.get(model_id, "dataSourceSpec")
+        data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id)
+        outputC3 = target_type.fetch(data_source_spec.targetSpec).objs.toJson()
+        output = pd.DataFrame(outputC3)
+        output = output.drop("version", axis=1)
+        if data_source_spec.targetName == "all":
+            output = pd.DataFrame(
+                output.sum(axis=1),
+                columns=[data_source_spec.targetName]
+            )
+        else:
+            output = pd.DataFrame(output[data_source_spec.targetName])
+
+        df = pd.concat([df, output], ignore_index=True)
+
+    def row_to_dict(row):
+        d = {}
+        for col in row.index:
+            d[col] = row[col]
+        return d
+
+    df_final = pd.DataFrame()
+    df_final["targets"] = df.apply(row_to_dict, axis=1)
+    df_final["id"] = df_final.index
+
+    output_records = df_final.to_dict(orient="records")
+    c3.StagedTargets.upsertBatch(objs=output_records)
+
+    return 0

From e464e5221c54636918f443d990ba2a3648e7640d Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 14:11:50 -0500
Subject: [PATCH 18/58] simpler!

---
 .../src/Utils/DataStaging/StagedTargets.py    | 24 ++++---------------
 1 file changed, 4 insertions(+), 20 deletions(-)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py
index 74d45700..bbc202f6 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py
+++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py
@@ -12,28 +12,12 @@ def stageFromAODGPRModelIdsList(ids):
     import pandas as pd
 
     c3.StagedTargets.removeAll()
-
-    model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec")
-    data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id)
-    target_type = data_source_spec.targetType.toType()
-
+    
     df = pd.DataFrame()
-
     for model_id in ids:
-        model = c3.GaussianProcessRegressionPipe.get(model_id, "dataSourceSpec")
-        data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id)
-        outputC3 = target_type.fetch(data_source_spec.targetSpec).objs.toJson()
-        output = pd.DataFrame(outputC3)
-        output = output.drop("version", axis=1)
-        if data_source_spec.targetName == "all":
-            output = pd.DataFrame(
-                output.sum(axis=1),
-                columns=[data_source_spec.targetName]
-            )
-        else:
-            output = pd.DataFrame(output[data_source_spec.targetName])
-
-        df = pd.concat([df, output], ignore_index=True)
+        model = c3.GaussianProcessRegressionPipe.get(model_id)
+        pdf = model.getTarget()
+        df = pd.concat([df,pdf], ignore_index=True)
 
     def row_to_dict(row):
         d = {}

From ee35cf278cfa9622fda4cd5890667ceed51e1567 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 14:15:58 -0500
Subject: [PATCH 19/58] extra method call

---
 training/gordon-group/src/Utils/DataStaging/StagedTargets.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py
index bbc202f6..cafd3dad 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py
+++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py
@@ -16,7 +16,7 @@ def stageFromAODGPRModelIdsList(ids):
     df = pd.DataFrame()
     for model_id in ids:
         model = c3.GaussianProcessRegressionPipe.get(model_id)
-        pdf = model.getTarget()
+        pdf = c3.Dataset.toPandas(model.getTarget())
         df = pd.concat([df,pdf], ignore_index=True)
 
     def row_to_dict(row):

From 6c6c4ed302e23ea6a47f02ccc91f7b22559c0f9e Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 14:38:42 -0500
Subject: [PATCH 20/58] cleaner but likely slower

---
 .../src/Utils/DataStaging/StagedFeatures.py   | 55 ++++---------------
 1 file changed, 11 insertions(+), 44 deletions(-)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
index 6f11cce0..64a53446 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
@@ -13,54 +13,21 @@ def stageFromAODGPRModelIdsList(ids):
 
     c3.StagedFeatures.removeAll()
 
-    # get data from dataSourceSpec one model
-    model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec")
-    data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id)
-    excludeFeatures = data_source_spec.excludeFeatures
-    featuresType = data_source_spec.featuresType.toType()
-    inputTableC3 = featuresType.fetch(data_source_spec.featuresSpec).objs.toJson()
-    inputTable = pd.DataFrame(inputTableC3)
-    inputTable = inputTable.drop("version", axis=1)
-    inputTable = inputTable.select_dtypes(["number"])
-    if (excludeFeatures):
-        inputTable.drop(columns=excludeFeatures, inplace=True)
-
-    # get gstp coordinates from each model
-    lats = []
-    lons = []
-    times = []
+    df = pd.DataFrame()
     for model_id in ids:
-        model = c3.GaussianProcessRegressionPipe.get(model_id, "dataSourceSpec")
+        model = c3.GaussianProcessRegressionPipe.get(model_id)
         data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec")
         gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '')
         gstp = c3.GeoSurfaceTimePoint.get(gstp_id)
-        lats.append(gstp.latitude)
-        lons.append(gstp.longitude)
-        times.append(gstp.time)
-
-    def row_to_dict(row):
-        d = {}
-        for col in row.index:
-            d[col] = row[col]
-        return d
-
-    def add_coords(obj, lat, lon):
-        obj["latitude"] = lat
-        obj["longitude"] = lon
-        return
-    
-    # build dataframe
-    df_sim_par = pd.DataFrame()
-    df_sim_par["features"] = inputTable.apply(row_to_dict, axis=1)
-
-    df = pd.DataFrame()
-    for i in range(len(lats)):
-        df_to_add = df_sim_par.copy()
-        df_to_add["features"].apply(add_coords, args=(lats[i], lons[i]))
-        df = pd.concat([df,df_to_add], ignore_index=True)
-
-    df["id"] = df.index
-    output_records = df.to_dict(orient="records")
+        pdf = c3.Dataset.toPandas(model.getFeatures())
+        pdf["latitude"] = gstp.latitude
+        pdf["longitude"] = gstp.longitude
+        df = pd.concat([df,pdf], ignore_index=True)
+
+    df_final = pd.DataFrame()
+    df_final["features"] = df.apply(row_to_dict, axis=1)
+    df_final["id"] = df_final.index
+    output_records = df_final.to_dict(orient="records")
     c3.StagedFeatures.upsertBatch(objs=output_records)
 
     return 0
\ No newline at end of file

From 9c8539c154e3b9ae3eb239b0214f2bb9d639444f Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Tue, 4 Oct 2022 14:50:24 -0500
Subject: [PATCH 21/58] forgot function def

---
 .../gordon-group/src/Utils/DataStaging/StagedFeatures.py    | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
index 64a53446..b6cacd74 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
@@ -24,6 +24,12 @@ def stageFromAODGPRModelIdsList(ids):
         pdf["longitude"] = gstp.longitude
         df = pd.concat([df,pdf], ignore_index=True)
 
+    def row_to_dict(row):
+        d = {}
+        for col in row.index:
+            d[col] = row[col]
+        return d
+        
     df_final = pd.DataFrame()
     df_final["features"] = df.apply(row_to_dict, axis=1)
     df_final["id"] = df_final.index

From 72d9d932b4f1670b10e34e8f24dc7aff2cd470d0 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 5 Oct 2022 13:01:06 -0500
Subject: [PATCH 22/58] method to train with staged data

---
 .../GaussianProcessRegressionPipe.c3typ       |  3 ++
 .../GaussianProcessRegressionPipe.py          | 52 +++++++++++++++++++
 2 files changed, 55 insertions(+)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
index 294929a2..c1290866 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
@@ -28,4 +28,7 @@ entity type GaussianProcessRegressionPipe extends MLLeafPipe<Dataset, Dataset> t
     // guarantee that process() is only allowed after train()
     @py(env='gordon-ML_1_0_0')
     isProcessable: ~
+    // train large model with AOD staged data
+    @py(env='gordon-ML_1_0_0')
+    trainWithStagedAOD: member function(modelIds: any): integer
 }
diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index c1984f93..662ad224 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -163,3 +163,55 @@ def getTarget(this):
         outputTablePandas = pd.DataFrame(outputTablePandas[dataSourceSpec.targetName])
 
     return c3.Dataset.fromPython(outputTablePandas)
+
+
+def trainWithStagedAOD(this, ids):
+    """
+    This method trains a large model with data coming from previously trained
+    GPR models with AOD data.
+
+    Inputs:
+        ids: list of GaussianProcessRegressionPipes ids
+
+    Returns:
+        int: 0 if method worked, 1 otherwise
+    """
+    from sklearn.gaussian_process import GaussianProcessRegressor
+
+    # stage features and targets
+    c3.StagedFeatures.stageFromAODGPRModelIdsList(ids)
+    c3.StagedTargets.stageFromAODGPRModelIdsList(ids)
+    # get data
+    X = c3.Dataset.toNumpy(dataset=this.getFeatures())
+    y = c3.Dataset.toNumpy(dataset=this.getTarget())
+
+    # generate training technique
+    technique = c3.GaussianProcessRegressionTechnique.get(this.technique.id)
+    serializedKernel = c3.SklearnGPRKernel.get(technique.kernel.id)
+
+    if (technique.centerTarget):
+        targetMean = float(y.mean())
+        y = y - y.mean()
+    
+    # get kernel object from c3, make it python again
+    kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel)
+
+    # build and train GPR
+    gp = GaussianProcessRegressor(kernel=kernel)
+    gp.fit(X, y)
+
+    if (technique.centerTarget):
+        params = {}
+        params["targetMean"] = targetMean
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+            parameters=params
+        )
+    else:
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+        )
+
+    this.upsert()
+
+    return 0    

From 13c7fdfe17cfd0f499c884a8c09f5ddabbd774f7 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 5 Oct 2022 13:11:39 -0500
Subject: [PATCH 23/58] change argument name

---
 .../GPRegression/GaussianProcessRegressionPipe.py           | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index 662ad224..d6d99e70 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -165,7 +165,7 @@ def getTarget(this):
     return c3.Dataset.fromPython(outputTablePandas)
 
 
-def trainWithStagedAOD(this, ids):
+def trainWithStagedAOD(this, modelIds):
     """
     This method trains a large model with data coming from previously trained
     GPR models with AOD data.
@@ -179,8 +179,8 @@ def trainWithStagedAOD(this, ids):
     from sklearn.gaussian_process import GaussianProcessRegressor
 
     # stage features and targets
-    c3.StagedFeatures.stageFromAODGPRModelIdsList(ids)
-    c3.StagedTargets.stageFromAODGPRModelIdsList(ids)
+    c3.StagedFeatures.stageFromAODGPRModelIdsList(modelIds)
+    c3.StagedTargets.stageFromAODGPRModelIdsList(modelIds)
     # get data
     X = c3.Dataset.toNumpy(dataset=this.getFeatures())
     y = c3.Dataset.toNumpy(dataset=this.getTarget())

From 6e5e09c6d3b1faf749f60966ab467d361e3a714d Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 12 Oct 2022 12:56:47 -0500
Subject: [PATCH 24/58] env for gpytorch -- first of many

---
 .../seed/ActionRuntime/py-gordon-ML_2_0_0.json    | 15 +++++++++++++++
 1 file changed, 15 insertions(+)
 create mode 100644 training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json

diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
new file mode 100644
index 00000000..296abc5d
--- /dev/null
+++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -0,0 +1,15 @@
+{
+    "language": "Python",
+    "runtimeVersion": "3.8.10",
+    "modules": {
+      "conda.scikit-learn":"=0.24.2",
+      "conda.pandas":"=1.0.1"
+    },
+    "repositories": [
+      "https://repo.continuum.io/pkgs/main",
+      "conda-forge"
+    ],
+    "runtime": "CPython",
+    "name": "py-gordon-ML_2_0_0",
+    "id": "py-gordon-ML_2_0_0"
+}
\ No newline at end of file

From 36e9badfc6443c749f5a06b1b5b79c5c97320691 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 12 Oct 2022 12:59:57 -0500
Subject: [PATCH 25/58] right

---
 .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json    | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
index 296abc5d..dee4c64c 100644
--- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
+++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -2,8 +2,7 @@
     "language": "Python",
     "runtimeVersion": "3.8.10",
     "modules": {
-      "conda.scikit-learn":"=0.24.2",
-      "conda.pandas":"=1.0.1"
+      "conda.scikit-learn":"=0.24.2"
     },
     "repositories": [
       "https://repo.continuum.io/pkgs/main",

From fc68638573e2447ac111138c7ce4fffd6d641dcb Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 12 Oct 2022 13:39:33 -0500
Subject: [PATCH 26/58] add pandas

---
 .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
index dee4c64c..2e2fe66f 100644
--- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
+++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -2,7 +2,8 @@
     "language": "Python",
     "runtimeVersion": "3.8.10",
     "modules": {
-      "conda.scikit-learn":"=0.24.2"
+      "conda.scikit-learn":"=0.24.2",
+      "conda.pandas":"1.3.0"
     },
     "repositories": [
       "https://repo.continuum.io/pkgs/main",

From bb93223145039cce43f1f9723d04ad5572b9fa3a Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 12 Oct 2022 13:45:27 -0500
Subject: [PATCH 27/58] forgot something

---
 .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
index 2e2fe66f..dbbf1929 100644
--- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
+++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -3,7 +3,7 @@
     "runtimeVersion": "3.8.10",
     "modules": {
       "conda.scikit-learn":"=0.24.2",
-      "conda.pandas":"1.3.0"
+      "conda.pandas":"=1.3.0"
     },
     "repositories": [
       "https://repo.continuum.io/pkgs/main",

From 6483836feca1df1d70e2db642b946ef9378fdf5f Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 12 Oct 2022 15:13:53 -0500
Subject: [PATCH 28/58] adding validation to GPR train method

---
 .../GPRegression/GaussianProcessRegressionPipe.py             | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index d6d99e70..32673fed 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -16,6 +16,10 @@ def train(this, input, targetOutput, spec):
         targetMean = float(y.mean())
         y = y - y.mean()
     
+    if (technique.validation):
+        # trim X
+        # trim y
+
 
     # get kernel object from c3, make it python again
     kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel)

From 2c13dd58f82dc45eca705810ef7f92eb62b2d0dc Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 12 Oct 2022 15:22:35 -0500
Subject: [PATCH 29/58] shuffling feats and targs

---
 .../GPRegression/GaussianProcessRegressionPipe.py          | 7 +++++--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index 32673fed..64beab8a 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -17,8 +17,11 @@ def train(this, input, targetOutput, spec):
         y = y - y.mean()
     
     if (technique.validation):
-        # trim X
-        # trim y
+        rng = np.random.RandomState(technique.randomSeed)
+        rng.shuffle(X)
+        X = X[0:int(technique.splitFraction*len(X))]
+        rng.shuffle(y)
+        y = y[0:int(technique.splitFraction*len(y))]
 
 
     # get kernel object from c3, make it python again

From 1bd5d2823de43a8976c955f5645e5c105d17f2f9 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Wed, 12 Oct 2022 15:27:03 -0500
Subject: [PATCH 30/58] fix typos

---
 .../GPRegression/GaussianProcessRegressionPipe.py             | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index 64beab8a..38e1f984 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -19,9 +19,9 @@ def train(this, input, targetOutput, spec):
     if (technique.validation):
         rng = np.random.RandomState(technique.randomSeed)
         rng.shuffle(X)
-        X = X[0:int(technique.splitFraction*len(X))]
+        X = X[0:int((1.0 - technique.splitFraction)*len(X))]
         rng.shuffle(y)
-        y = y[0:int(technique.splitFraction*len(y))]
+        y = y[0:int((1.0 - technique.splitFraction)*len(y))]
 
 
     # get kernel object from c3, make it python again

From b2d345282d14c478cf5473a97095ff5b181a2358 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Thu, 13 Oct 2022 13:53:17 -0500
Subject: [PATCH 31/58] add torch

---
 .../seed/ActionRuntime/py-gordon-ML_2_0_0.json           | 9 +++++++--
 .../GaussianProcessRegressionTechnique.c3typ             | 9 +++++++++
 2 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
index dbbf1929..fb077a9e 100644
--- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
+++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -3,11 +3,16 @@
     "runtimeVersion": "3.8.10",
     "modules": {
       "conda.scikit-learn":"=0.24.2",
-      "conda.pandas":"=1.3.0"
+      "conda.pandas":"=1.3.0",
+      "conda.pytorch":"1.12.1",
+      "conda.torchvision":"=0.13.1",
+      "conda.torchaudio":"=0.12.1",
+      "conda.cudatoolkit":"=11.3"
     },
     "repositories": [
       "https://repo.continuum.io/pkgs/main",
-      "conda-forge"
+      "conda-forge",
+      "pytorch"
     ],
     "runtime": "CPython",
     "name": "py-gordon-ML_2_0_0",
diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ
index 61a975a3..6e1543f0 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ
@@ -13,4 +13,13 @@ entity type GaussianProcessRegressionTechnique mixes MLTechnique schema name 'GP
     // center target data before fitting
     @ML(hyperParameter=true)
     centerTarget: boolean=false
+    // leave fraction of rows for post-validation
+    @ML(hyperParameter=true)
+    validation: boolean=false
+    // random seed to be used by numpy.shuffle
+    @ML(hyperParameter=true)
+    randomSeed: integer=42
+    // fraction to be left for validation
+    @ML(hyperParameter=true)
+    splitFraction: float=0.2
 }
\ No newline at end of file

From 07698ced5581adfc8badae504bdecc3e0512f72f Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Thu, 13 Oct 2022 14:26:59 -0500
Subject: [PATCH 32/58] typo

---
 .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json     | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
index fb077a9e..c5a9a417 100644
--- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
+++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -4,7 +4,7 @@
     "modules": {
       "conda.scikit-learn":"=0.24.2",
       "conda.pandas":"=1.3.0",
-      "conda.pytorch":"1.12.1",
+      "conda.pytorch":"=1.12.1",
       "conda.torchvision":"=0.13.1",
       "conda.torchaudio":"=0.12.1",
       "conda.cudatoolkit":"=11.3"

From cc690e8c18a4da87f054482ad6a0a80d3717a184 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Fri, 14 Oct 2022 10:59:49 -0500
Subject: [PATCH 33/58] libgcc

---
 .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
index c5a9a417..4a4e4f5c 100644
--- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
+++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -7,12 +7,14 @@
       "conda.pytorch":"=1.12.1",
       "conda.torchvision":"=0.13.1",
       "conda.torchaudio":"=0.12.1",
-      "conda.cudatoolkit":"=11.3"
+      "conda.cudatoolkit":"=11.3",
+      "conda.libgcc":"=7.2.0"
     },
     "repositories": [
       "https://repo.continuum.io/pkgs/main",
       "conda-forge",
-      "pytorch"
+      "pytorch",
+      "anaconda"
     ],
     "runtime": "CPython",
     "name": "py-gordon-ML_2_0_0",

From 46d0525221b7d5eba5d53abeabf8fdae52236de1 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Fri, 14 Oct 2022 11:26:14 -0500
Subject: [PATCH 34/58] add gpytorch

---
 .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
index 4a4e4f5c..f115f499 100644
--- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
+++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -8,7 +8,8 @@
       "conda.torchvision":"=0.13.1",
       "conda.torchaudio":"=0.12.1",
       "conda.cudatoolkit":"=11.3",
-      "conda.libgcc":"=7.2.0"
+      "conda.libgcc":"=7.2.0",
+      "conda.gpytorch":"=1.9.0"
     },
     "repositories": [
       "https://repo.continuum.io/pkgs/main",

From ccb8795a67d0439584678c3fc1683c34c4c30b5f Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Fri, 14 Oct 2022 11:44:30 -0500
Subject: [PATCH 35/58] adding dill for serialization

---
 .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json    | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
index f115f499..34699175 100644
--- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
+++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -9,7 +9,8 @@
       "conda.torchaudio":"=0.12.1",
       "conda.cudatoolkit":"=11.3",
       "conda.libgcc":"=7.2.0",
-      "conda.gpytorch":"=1.9.0"
+      "conda.gpytorch":"=1.9.0",
+      "conda.dill":"=0.2.8.2"
     },
     "repositories": [
       "https://repo.continuum.io/pkgs/main",

From 4485189e755a68a1259dcb1eed49d1d95cf22caa Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Thu, 27 Oct 2022 15:14:10 -0500
Subject: [PATCH 36/58] keep data in memory

---
 .../GaussianProcessRegressionPipe.c3typ       |  3 +
 .../GaussianProcessRegressionPipe.py          | 64 ++++++++++++++++++-
 2 files changed, 66 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
index c1290866..4cd7fa7a 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
@@ -31,4 +31,7 @@ entity type GaussianProcessRegressionPipe extends MLLeafPipe<Dataset, Dataset> t
     // train large model with AOD staged data
     @py(env='gordon-ML_1_0_0')
     trainWithStagedAOD: member function(modelIds: any): integer
+    // train with list of GSTPs
+    @py(env='gordon-ML_1_0_0')
+    trainWithListOfAODModels: member function(modelIds: any): integer
 }
diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index 38e1f984..8212f665 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -221,4 +221,66 @@ def trainWithStagedAOD(this, modelIds):
 
     this.upsert()
 
-    return 0    
+    return 0
+
+def trainWithListOfAODModels(this, modelIds):
+    """
+    This method trains a large model with data coming from previously trained
+    GPR models with AOD data.
+
+    Inputs:
+        ids: list of GaussianProcessRegressionPipes ids
+
+    Returns:
+        int: 0 if method worked, 1 otherwise
+    """
+    from sklearn.gaussian_process import GaussianProcessRegressor
+
+    # get data
+    X = pd.DataFrame()
+    y = pd.DataFrame()
+    for model_id in ids:
+        model = c3.GaussianProcessRegressionPipe.get(model_id)
+        data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec")
+        gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '')
+        gstp = c3.GeoSurfaceTimePoint.get(gstp_id)
+        px = c3.Dataset.toPandas(model.getFeatures())
+        px["latitude"] = gstp.latitude
+        px["longitude"] = gstp.longitude
+        X = pd.concat([X,px], ignore_index=True)
+
+        py = c3.Dataset.toPandas(model.getTarget())
+        y = pd.concat([y,py], ignore_index=True)
+    X = X.to_numpy()
+    y = y.to_numpy()
+
+    # generate training technique
+    technique = c3.GaussianProcessRegressionTechnique.get(this.technique.id)
+    serializedKernel = c3.SklearnGPRKernel.get(technique.kernel.id)
+
+    if (technique.centerTarget):
+        targetMean = float(y.mean())
+        y = y - y.mean()
+
+    # get kernel object from c3, make it python again
+    kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel)
+
+    # build and train GPR
+    gp = GaussianProcessRegressor(kernel=kernel)
+    gp.fit(X, y)
+
+    if (technique.centerTarget):
+        params = {}
+        params["targetMean"] = targetMean
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+            parameters=params
+        )
+    else:
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+        )
+
+    this.upsert()
+
+    return 0
\ No newline at end of file

From 97146a319c54f458108c51a050278538ed841e47 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Thu, 27 Oct 2022 15:36:23 -0500
Subject: [PATCH 37/58] add pandas

---
 .../GPRegression/GaussianProcessRegressionPipe.py                | 1 +
 1 file changed, 1 insertion(+)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index 8212f665..d8b19343 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -235,6 +235,7 @@ def trainWithListOfAODModels(this, modelIds):
         int: 0 if method worked, 1 otherwise
     """
     from sklearn.gaussian_process import GaussianProcessRegressor
+    import pandas as pd
 
     # get data
     X = pd.DataFrame()

From a76dc7e0aa8ac1d849953090c93612d5d29c1864 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Thu, 27 Oct 2022 15:47:55 -0500
Subject: [PATCH 38/58] another trial

---
 .../GPRegression/GaussianProcessRegressionPipe.py               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index d8b19343..7f56d51b 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -240,7 +240,7 @@ def trainWithListOfAODModels(this, modelIds):
     # get data
     X = pd.DataFrame()
     y = pd.DataFrame()
-    for model_id in ids:
+    for model_id in modelIds:
         model = c3.GaussianProcessRegressionPipe.get(model_id)
         data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec")
         gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '')

From 81f42196b8a9d53b907b4bfc9673ba0c880ad5fe Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Thu, 27 Oct 2022 15:53:37 -0500
Subject: [PATCH 39/58] source spec not required

---
 .../GPRegression/GaussianProcessRegressionPipe.c3typ            | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
index 4cd7fa7a..585b16e2 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
@@ -11,7 +11,7 @@ entity type GaussianProcessRegressionPipe extends MLLeafPipe<Dataset, Dataset> t
     // the technique for this regression
     technique: !GaussianProcessRegressionTechnique
     // data source spec for this regression
-    dataSourceSpec: !GPRDataSourceSpec
+    dataSourceSpec: GPRDataSourceSpec
 
     // get features data
     @py(env='gordon-ML_1_0_0')

From 321a00e8b11b3025fa659a461485fb9577fd2ca4 Mon Sep 17 00:00:00 2001
From: James Carzon <jcarzon@andrew.cmu.edu>
Date: Thu, 27 Oct 2022 22:48:26 -0400
Subject: [PATCH 40/58] Include time in hours as staged feature

---
 .../gordon-group/src/Utils/DataStaging/StagedFeatures.py   | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
index b6cacd74..861e8942 100644
--- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
+++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
@@ -10,6 +10,7 @@ def stageFromAODGPRModelIdsList(ids):
         int: zero if it worked, 1 if it failed
     """
     import pandas as pd
+    from datetime import timedelta 
 
     c3.StagedFeatures.removeAll()
 
@@ -22,6 +23,12 @@ def stageFromAODGPRModelIdsList(ids):
         pdf = c3.Dataset.toPandas(model.getFeatures())
         pdf["latitude"] = gstp.latitude
         pdf["longitude"] = gstp.longitude
+        my_time = gstp.time.timetuple()
+        pdf["time"] = timedelta(
+            days=my_time.tm_yday,
+            minutes=my_time.tm_min,
+            hours=my_time.tm_hour
+        ).total_seconds() / 3600
         df = pd.concat([df,pdf], ignore_index=True)
 
     def row_to_dict(row):

From d3f0dd422a8f6feeec87c9ee272811ad27438093 Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Thu, 3 Nov 2022 11:07:44 -0500
Subject: [PATCH 41/58] add excl feats

---
 .../GPRegression/GaussianProcessRegressionPipe.c3typ           | 2 +-
 .../GPRegression/GaussianProcessRegressionPipe.py              | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
index 585b16e2..f8c2f44b 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
@@ -33,5 +33,5 @@ entity type GaussianProcessRegressionPipe extends MLLeafPipe<Dataset, Dataset> t
     trainWithStagedAOD: member function(modelIds: any): integer
     // train with list of GSTPs
     @py(env='gordon-ML_1_0_0')
-    trainWithListOfAODModels: member function(modelIds: any): integer
+    trainWithListOfAODModels: member function(modelIds: any, excludeFeatures: any): integer
 }
diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index 7f56d51b..324d11fc 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -223,7 +223,7 @@ def trainWithStagedAOD(this, modelIds):
 
     return 0
 
-def trainWithListOfAODModels(this, modelIds):
+def trainWithListOfAODModels(this, modelIds, excludeFeatures):
     """
     This method trains a large model with data coming from previously trained
     GPR models with AOD data.
@@ -252,6 +252,7 @@ def trainWithListOfAODModels(this, modelIds):
 
         py = c3.Dataset.toPandas(model.getTarget())
         y = pd.concat([y,py], ignore_index=True)
+    X.drop(excludeFeatures, axis=1, inplace=True)
     X = X.to_numpy()
     y = y.to_numpy()
 

From b71cbb95728897e7444d45dafc7701150234daaf Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Thu, 3 Nov 2022 12:20:34 -0500
Subject: [PATCH 42/58] making it more readable

---
 .../GPRegression/GaussianProcessRegressionPipe.py               | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index 324d11fc..d4316209 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -252,7 +252,7 @@ def trainWithListOfAODModels(this, modelIds, excludeFeatures):
 
         py = c3.Dataset.toPandas(model.getTarget())
         y = pd.concat([y,py], ignore_index=True)
-    X.drop(excludeFeatures, axis=1, inplace=True)
+    X.drop(columns=excludeFeatures, inplace=True)
     X = X.to_numpy()
     y = y.to_numpy()
 

From 312779234f2d8f68f1d608297a20852c4e5e2a80 Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Thu, 3 Nov 2022 17:00:32 -0400
Subject: [PATCH 43/58] Include time in hours as action feature

---
 .../GPRegression/GaussianProcessRegressionPipe.py          | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
index d4316209..5ba1084e 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -236,6 +236,7 @@ def trainWithListOfAODModels(this, modelIds, excludeFeatures):
     """
     from sklearn.gaussian_process import GaussianProcessRegressor
     import pandas as pd
+    from datetime import timedelta
 
     # get data
     X = pd.DataFrame()
@@ -245,9 +246,15 @@ def trainWithListOfAODModels(this, modelIds, excludeFeatures):
         data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec")
         gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '')
         gstp = c3.GeoSurfaceTimePoint.get(gstp_id)
+        my_time = gstp.time.timetuple()
         px = c3.Dataset.toPandas(model.getFeatures())
         px["latitude"] = gstp.latitude
         px["longitude"] = gstp.longitude
+        px["time"] = timedelta(
+            days=my_time.tm_yday,
+            minutes=my_time.tm_min,
+            hours=my_time.tm_hour
+        ).total_seconds() / 3600
         X = pd.concat([X,px], ignore_index=True)
 
         py = c3.Dataset.toPandas(model.getTarget())

From 4e86563c6b73539732d4532fd97e72fbd923093e Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Sat, 5 Nov 2022 20:46:01 -0400
Subject: [PATCH 44/58] Predict using DynMapReduce

---
 .../src/Utils/Predict/PredictAODGPR.c3typ     | 22 +++++
 .../src/Utils/Predict/PredictAODGPR.js        | 59 +++++++++++++
 .../src/Utils/Predict/PredictAODGPR.py        | 87 +++++++++++++++++++
 3 files changed, 168 insertions(+)
 create mode 100644 training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ
 create mode 100644 training/gordon-group/src/Utils/Predict/PredictAODGPR.js
 create mode 100644 training/gordon-group/src/Utils/Predict/PredictAODGPR.py

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ
new file mode 100644
index 00000000..91325661
--- /dev/null
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ
@@ -0,0 +1,22 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+* This finds {@link GaussianProcessRegressionPipe}s that were trained 
+* with {@link Simulation3HourlyAODOutput} as targets, 
+* {@link SimulationModelParameters} as features, 
+* via an {@link AODGaussianMLTrainingJob}
+*/
+type PredictAODGPR {
+    // Retrieve models based on excluded features, {@link GeoSurfaceTimePoint} instance, target name and training technique
+    getPipe: function(excFeats: [string], gstpId: string, targetName: string, technique: any): any js server
+    // Retrieve all models for a certain {@link GeoSurfaceTimePoint} filter
+    getPipes: function(excFeats: [string], gstpFilter: any, targetName: string, technique: any): any js server
+    // Extract learned parameters from trained {@link GaussianProcessRegressionPipe}s specified by {@link GeoSurfaceTimePoint} filter, target name, excluded features and {@link GaussianProcessRegressionTechnque}
+    @py(env='gordon-ML_1_0_0')
+    makePredictionsJob: function(excFeats: [string], gstpFilter: any, synthDataset: any, targetName: string, technique: any, batchSize: int): any
+    // Build a pandas dataframe with the hyper parameters once job is complete
+    @py(env='gordon-ML_1_0_0')
+    getPredictionsDataframeFromJob: inline function(job: any): any
+}
\ No newline at end of file
diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.js b/training/gordon-group/src/Utils/Predict/PredictAODGPR.js
new file mode 100644
index 00000000..93b9c7a0
--- /dev/null
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.js
@@ -0,0 +1,59 @@
+function getPipe(excFeats, gstpId, targetName, technique) {
+    // identical to the methods used in AODGPRModelFinder.js
+
+    // find the data source specs
+    var gstpKey = "geoSurfaceTimePoint.id == \"" + gstpId + "\"";
+    var filter = Filter.eq("featuresType.typeName", "SimulationModelParameters")
+        .and().eq("targetType.typeName", "Simulation3HourlyAODOutput")
+        .and().intersects("excludeFeatures", excFeats)
+        .and().eq("targetName", targetName)
+        .and().eq("targetSpec.filter", gstpKey);
+
+    var sourceSpecIds = GPRDataSourceSpec.fetch({
+        "filter": filter,
+        "limit": -1,
+        "include": "id"
+    }).objs.map(obj => obj.id);
+
+    // find the kernels
+    filter = Filter.eq("pickledKernel", technique.kernel.pickledKernel);
+    var kernelIds = SklearnGPRKernel.fetch({
+        "filter": filter.value,
+        "limit": -1,
+        "include": "id"
+    }).objs.map(obj => obj.id);
+
+    // find the techniques
+    filter = Filter.intersects("kernel.id", kernelIds)
+        .and().eq("centerTarget", technique.centerTarget);
+    var techIds = GaussianProcessRegressionTechnique.fetch({
+        "filter": filter.value,
+        "limit": -1,
+        "include": "id"
+    }).objs.map(obj => obj.id);
+
+    // now find the models
+    filter = Filter.intersects("technique.id", techIds)
+        .and().intersects("dataSourceSpec.id", sourceSpecIds);
+    var pipes = GaussianProcessRegressionPipe.fetch({
+        "filter": filter.value,
+        "limit": -1
+    }).objs;
+
+    return pipes
+}
+
+function getPipes(excFeats, gstpFilter, targetName, technique) {
+    var gstpIds = GeoSurfaceTimePoint.fetch({
+        "filter": gstpFilter,
+        "limit": -1,
+        "include": "id"
+    }).objs.map(obj => obj.id);
+
+    var pipes = gstpIds.map(id => AODGPRModelFinder.getPipe(excFeats, id, targetName, technique));
+    var nonNulls = pipes.filter(function (el) {
+        return el.length != 0;
+    });
+
+    return nonNulls
+}
\ No newline at end of file
diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
new file mode 100644
index 00000000..cc1decc6
--- /dev/null
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -0,0 +1,87 @@
+def makePredictionsJob(
+    excFeats, gstpFilter, targetName, synthDataset, technique, batchSize
+):
+    """
+    Dynamic map-reduce job to get predictions on synthDataset.
+    """
+    def cassandra_mapper(batch, objs, job):
+        models = []
+        for obj in objs:
+            model = c3.AODGPRModelFinder.getPipe(
+                job.context.value["excludeFeatures"],
+                obj.id,
+                job.context.value["targetName"],
+                job.context.value["technique"]
+            )
+            models.append(model)
+        
+        return {batch: models}
+
+    def cassandra_reducer(key, interValues, job):
+        values = []
+        for iv in interValues:
+            for val in iv:
+                for m in val:
+                    model_id = m["id"]
+                    centered = m["technique"]["centerTarget"]
+                    if centered:
+                        center = m["trainedModel"].parameters["targetMean"].asfloat()
+                    else:
+                        center = 0
+                    preds = m.process(synthDataset, computeStd=True)
+                    values.append((preds, synthDataset, model_id, center))
+                    
+
+        return values
+
+    map_lambda = c3.Lambda.fromPython(cassandra_mapper)
+    reduce_lambda = c3.Lambda.fromPython(cassandra_reducer, runtime="gordon-ML_1_0_0")
+
+    job_context = c3.MappObj(
+        value={
+            'excludeFeatures': excFeats,
+            'targetName': targetName,
+            'technique': technique,
+            'syntheticDataset': synthDataset
+        }
+    )
+    job = c3.DynMapReduce.startFromSpec(
+        c3.DynMapReduceSpec(
+            targetType="GeoSurfaceTimePoint",       
+            filter=gstpFilter, 
+            mapLambda=map_lambda,
+            reduceLambda=reduce_lambda,
+            batchSize=batchSize,
+            context=job_context
+        )
+    )
+
+    return job
+
+
+def getPredictionsDataframeFromJob(job):
+    """
+    Iterates over job result and builds dataframe.
+    """
+    import pandas as pd
+    import numpy as np
+
+    predictions = []
+    
+    if job.status().status == "completed":
+        for key, value in job.results().items():
+            for subvalue in value:
+                df_y = c3.Dataset.toPandas(subvalue[0])
+                df_y[0] += subvalue[3]
+                df_x = c3.Dataset.toPandas(subvalue[1])
+                m_preds = pd.concat(
+                    [df_x, df_y],
+                    axis=1
+                )
+                m_preds["modelId"] = subvalue[2]
+                predictions.append(m_preds)
+                
+        df = pd.concat(predictions, axis=0).reset_index(drop=True)
+        return df
+    else:
+        return False
\ No newline at end of file

From 0c5316b3dac4172b9bd01abdcd467b67ef75ccf8 Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Sat, 5 Nov 2022 20:57:53 -0400
Subject: [PATCH 45/58] Match order of arguments

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ
index 91325661..608df842 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ
@@ -15,7 +15,7 @@ type PredictAODGPR {
     getPipes: function(excFeats: [string], gstpFilter: any, targetName: string, technique: any): any js server
     // Extract learned parameters from trained {@link GaussianProcessRegressionPipe}s specified by {@link GeoSurfaceTimePoint} filter, target name, excluded features and {@link GaussianProcessRegressionTechnque}
     @py(env='gordon-ML_1_0_0')
-    makePredictionsJob: function(excFeats: [string], gstpFilter: any, synthDataset: any, targetName: string, technique: any, batchSize: int): any
+    makePredictionsJob: function(excFeats: [string], gstpFilter: any, targetName: string, synthDataset: any, technique: any, batchSize: int): any
     // Build a pandas dataframe with the hyper parameters once job is complete
     @py(env='gordon-ML_1_0_0')
     getPredictionsDataframeFromJob: inline function(job: any): any

From e022e7f3a97fb77afe6f021510dc052823d0672b Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 7 Nov 2022 10:47:28 -0500
Subject: [PATCH 46/58] Try prediction with unpickled model

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index cc1decc6..e7b4eb21 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -19,6 +19,7 @@ def cassandra_mapper(batch, objs, job):
 
     def cassandra_reducer(key, interValues, job):
         values = []
+        synthDataframe = c3.Dataset.toPandas(synthDataset)
         for iv in interValues:
             for val in iv:
                 for m in val:
@@ -28,7 +29,10 @@ def cassandra_reducer(key, interValues, job):
                         center = m["trainedModel"].parameters["targetMean"].asfloat()
                     else:
                         center = 0
-                    preds = m.process(synthDataset, computeStd=True)
+                    pickledModel = m["trainedModel"]["model"]
+                    model = c3.PythonSerialization.deserialize(serialized=pickledModel)
+                    mean, sd = model.predict(synthDataframe, return_std=True)
+                    preds = pd.concat([pd.DataFrame(mean), pd.DataFrame(sd)], axis=1)
                     values.append((preds, synthDataset, model_id, center))
                     
 

From f9370feabc4637bcfa66c8ca82d5356a678434d2 Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 7 Nov 2022 13:10:00 -0500
Subject: [PATCH 47/58] Grab synthDataset from job context

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index e7b4eb21..10641a4d 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -19,7 +19,7 @@ def cassandra_mapper(batch, objs, job):
 
     def cassandra_reducer(key, interValues, job):
         values = []
-        synthDataframe = c3.Dataset.toPandas(synthDataset)
+        synthDataframe = c3.Dataset.toPandas(job.context.value["syntheticDataset"])
         for iv in interValues:
             for val in iv:
                 for m in val:

From 72b65930c97d53bfeadc2a76bd5f1277c72313f2 Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 7 Nov 2022 13:35:50 -0500
Subject: [PATCH 48/58] Import pandas for method

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index 10641a4d..755f8d9f 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -4,6 +4,8 @@ def makePredictionsJob(
     """
     Dynamic map-reduce job to get predictions on synthDataset.
     """
+    import pandas as pd
+
     def cassandra_mapper(batch, objs, job):
         models = []
         for obj in objs:

From cd3443003d6e2d8fdff24064d2442e25fc1a1bbd Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 7 Nov 2022 13:55:37 -0500
Subject: [PATCH 49/58] Keep predictions without pandas

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index 755f8d9f..3cd67fdc 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -4,7 +4,6 @@ def makePredictionsJob(
     """
     Dynamic map-reduce job to get predictions on synthDataset.
     """
-    import pandas as pd
 
     def cassandra_mapper(batch, objs, job):
         models = []
@@ -34,8 +33,7 @@ def cassandra_reducer(key, interValues, job):
                     pickledModel = m["trainedModel"]["model"]
                     model = c3.PythonSerialization.deserialize(serialized=pickledModel)
                     mean, sd = model.predict(synthDataframe, return_std=True)
-                    preds = pd.concat([pd.DataFrame(mean), pd.DataFrame(sd)], axis=1)
-                    values.append((preds, synthDataset, model_id, center))
+                    values.append((mean, sd, synthDataset, model_id, center))
                     
 
         return values

From 872eb02a9efef910949d91a18c2919c0679f37f9 Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 7 Nov 2022 14:15:12 -0500
Subject: [PATCH 50/58] Typo: returned unused object

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index 3cd67fdc..27829a03 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -33,7 +33,7 @@ def cassandra_reducer(key, interValues, job):
                     pickledModel = m["trainedModel"]["model"]
                     model = c3.PythonSerialization.deserialize(serialized=pickledModel)
                     mean, sd = model.predict(synthDataframe, return_std=True)
-                    values.append((mean, sd, synthDataset, model_id, center))
+                    values.append((mean, sd, model_id, center))
                     
 
         return values

From 928e552d47f828529645bda047607f95ba22bc5c Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 7 Nov 2022 14:48:30 -0500
Subject: [PATCH 51/58] Restructure resultant df of predictions

---
 .../src/Utils/Predict/PredictAODGPR.py        | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index 27829a03..90c05bd5 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -71,20 +71,18 @@ def getPredictionsDataframeFromJob(job):
     import numpy as np
 
     predictions = []
-    
+
     if job.status().status == "completed":
         for key, value in job.results().items():
             for subvalue in value:
-                df_y = c3.Dataset.toPandas(subvalue[0])
-                df_y[0] += subvalue[3]
-                df_x = c3.Dataset.toPandas(subvalue[1])
-                m_preds = pd.concat(
-                    [df_x, df_y],
-                    axis=1
-                )
-                m_preds["modelId"] = subvalue[2]
-                predictions.append(m_preds)
-                
+                df_m = pd.DataFrame()
+                df_m["mean"] = np.array(subvalue[0]).flatten()
+                df_m["mean"] += subvalue[3]
+                df_m["sd"] = subvalue[1]
+                df_m["modelId"] = subvalue[2]
+
+            predictions.append(df_m)
+
         df = pd.concat(predictions, axis=0).reset_index(drop=True)
         return df
     else:

From c7eb9e1bf2df2de0f94132d63ab052d361c399b1 Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 7 Nov 2022 21:33:30 -0500
Subject: [PATCH 52/58] Add lat, lon, time to prediction results

---
 .../src/Utils/Predict/PredictAODGPR.py        | 27 ++++++++++++++-----
 1 file changed, 21 insertions(+), 6 deletions(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index 90c05bd5..6beedbc6 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -24,6 +24,7 @@ def cassandra_reducer(key, interValues, job):
         for iv in interValues:
             for val in iv:
                 for m in val:
+                    # predictions
                     model_id = m["id"]
                     centered = m["technique"]["centerTarget"]
                     if centered:
@@ -33,7 +34,16 @@ def cassandra_reducer(key, interValues, job):
                     pickledModel = m["trainedModel"]["model"]
                     model = c3.PythonSerialization.deserialize(serialized=pickledModel)
                     mean, sd = model.predict(synthDataframe, return_std=True)
-                    values.append((mean, sd, model_id, center))
+
+                    # location
+                    dssId = m["dataSourceSpec"]["id"]
+                    dss = c3.GPRDataSourceSpec.get(dssId)
+                    gstpId = dss.targetSpec.filter.split(" == ")[1].replace('"', '')
+                    gstp = c3.GeoSurfaceTimePoint.get(gstpId)
+                    lat = gstp.latitude
+                    lon = gstp.longitude
+                    time = gstp.time
+                    values.append((model_id, mean, center, sd, synthDataframe, lat, lon, time))
                     
 
         return values
@@ -74,12 +84,17 @@ def getPredictionsDataframeFromJob(job):
 
     if job.status().status == "completed":
         for key, value in job.results().items():
-            for subvalue in value:
+            for subvalue in value: #(model_id, mean, center, sd, synthDataframe, lat, lon, time)
                 df_m = pd.DataFrame()
-                df_m["mean"] = np.array(subvalue[0]).flatten()
-                df_m["mean"] += subvalue[3]
-                df_m["sd"] = subvalue[1]
-                df_m["modelId"] = subvalue[2]
+                df_m["mean"] = np.array(subvalue[1]).flatten()
+                df_m["mean"] += subvalue[2]
+                df_m["sd"] = subvalue[3]
+                df_m["lat"] = subvalue[5]
+                df_m["lon"] = subvalue[6]
+                df_m["time"] = subvalue[7]
+                df_m["modelId"] = subvalue[0]
+
+                df_m = pd.concat([df_m, subvalue[4]], axis=1)
 
             predictions.append(df_m)
 

From 67e11bfc7c7f8ae73c79fb8b42a3b98f21014aeb Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 7 Nov 2022 22:08:09 -0500
Subject: [PATCH 53/58] Don't save synthDataset in results

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index 6beedbc6..d853670b 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -43,7 +43,7 @@ def cassandra_reducer(key, interValues, job):
                     lat = gstp.latitude
                     lon = gstp.longitude
                     time = gstp.time
-                    values.append((model_id, mean, center, sd, synthDataframe, lat, lon, time))
+                    values.append((model_id, mean, center, sd, lat, lon, time))
                     
 
         return values
@@ -94,8 +94,6 @@ def getPredictionsDataframeFromJob(job):
                 df_m["time"] = subvalue[7]
                 df_m["modelId"] = subvalue[0]
 
-                df_m = pd.concat([df_m, subvalue[4]], axis=1)
-
             predictions.append(df_m)
 
         df = pd.concat(predictions, axis=0).reset_index(drop=True)

From 0b2489dde9d528bcff5bb91df2e129e70927ef3c Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 7 Nov 2022 22:53:24 -0500
Subject: [PATCH 54/58] Indexing subvalues correctly

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index d853670b..360815e9 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -84,14 +84,14 @@ def getPredictionsDataframeFromJob(job):
 
     if job.status().status == "completed":
         for key, value in job.results().items():
-            for subvalue in value: #(model_id, mean, center, sd, synthDataframe, lat, lon, time)
+            for subvalue in value:
                 df_m = pd.DataFrame()
                 df_m["mean"] = np.array(subvalue[1]).flatten()
                 df_m["mean"] += subvalue[2]
                 df_m["sd"] = subvalue[3]
-                df_m["lat"] = subvalue[5]
-                df_m["lon"] = subvalue[6]
-                df_m["time"] = subvalue[7]
+                df_m["lat"] = subvalue[4]
+                df_m["lon"] = subvalue[5]
+                df_m["time"] = subvalue[6]
                 df_m["modelId"] = subvalue[0]
 
             predictions.append(df_m)

From f02f52a949f65bcf106f744b9ae6dbe60af0cc51 Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Wed, 9 Nov 2022 07:23:05 -0500
Subject: [PATCH 55/58] Clean up predictions data frame

---
 .../src/Utils/Predict/PredictAODGPR.py             | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index 360815e9..13de2e6f 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -86,17 +86,17 @@ def getPredictionsDataframeFromJob(job):
         for key, value in job.results().items():
             for subvalue in value:
                 df_m = pd.DataFrame()
-                df_m["mean"] = np.array(subvalue[1]).flatten()
-                df_m["mean"] += subvalue[2]
-                df_m["sd"] = subvalue[3]
-                df_m["lat"] = subvalue[4]
-                df_m["lon"] = subvalue[5]
+                df_m["meanResponse"] = np.array(subvalue[1]).flatten()
+                df_m["meanResponse"] += subvalue[2]
+                df_m["sdResponse"] = subvalue[3]
+                df_m["latitude"] = subvalue[4]
+                df_m["longitude"] = subvalue[5]
                 df_m["time"] = subvalue[6]
                 df_m["modelId"] = subvalue[0]
 
             predictions.append(df_m)
 
-        df = pd.concat(predictions, axis=0).reset_index(drop=True)
-        return df
+        # df = pd.concat(predictions, axis=0).reset_index(drop=True)
+        return predictions
     else:
         return False
\ No newline at end of file

From 049256e938ac28db26dca5faf7536458183c7df8 Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Wed, 9 Nov 2022 11:13:52 -0500
Subject: [PATCH 56/58] Return one df rather than list of dfs

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index 13de2e6f..10148377 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -96,7 +96,7 @@ def getPredictionsDataframeFromJob(job):
 
             predictions.append(df_m)
 
-        # df = pd.concat(predictions, axis=0).reset_index(drop=True)
+        df = pd.concat(predictions, axis=0).reset_index(drop=True)
         return predictions
     else:
         return False
\ No newline at end of file

From 787a0bb6cc067601b6c5e392c3cdd159c24df18b Mon Sep 17 00:00:00 2001
From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com>
Date: Mon, 14 Nov 2022 18:27:12 -0500
Subject: [PATCH 57/58] Track model variants in predictions

---
 training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
index 10148377..4daba9c8 100644
--- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
+++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py
@@ -93,10 +93,11 @@ def getPredictionsDataframeFromJob(job):
                 df_m["longitude"] = subvalue[5]
                 df_m["time"] = subvalue[6]
                 df_m["modelId"] = subvalue[0]
+                df_m["variant"] = list(range(df_m.shape[0]))
 
             predictions.append(df_m)
 
         df = pd.concat(predictions, axis=0).reset_index(drop=True)
-        return predictions
+        return df
     else:
         return False
\ No newline at end of file

From 95438adb80457a9ca03b7cf8799bd81d19eeb31d Mon Sep 17 00:00:00 2001
From: Bruno Abreu <babreu@illinois.edu>
Date: Fri, 20 Jan 2023 13:26:14 -0600
Subject: [PATCH 58/58] bogus comment to kick workflow

---
 .../GPRegression/GaussianProcessRegressionPipe.c3typ             | 1 +
 1 file changed, 1 insertion(+)

diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
index f8c2f44b..3a6a55fe 100644
--- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
+++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
@@ -5,6 +5,7 @@
 /**
 * GaussianProcessRegressionPipe.c3typ
 * Performs Scikit-Learn's GP Regression.
+* Bogus comment to reprovision app.
 */
 @db(unique=['technique, dataSourceSpec'])
 entity type GaussianProcessRegressionPipe extends MLLeafPipe<Dataset, Dataset> type key 'GPREG' {