From c60d7aa5b8efaea97e24bfc018cfdd93a849c289 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 31 Aug 2022 13:35:01 -0500 Subject: [PATCH 01/58] try staging data --- .../src/Utils/DataStaging/StagedFeatures.c3typ | 11 +++++++++++ 1 file changed, 11 insertions(+) create mode 100644 training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ new file mode 100644 index 00000000..41903dff --- /dev/null +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ @@ -0,0 +1,11 @@ +/** +* Copyright (c) 2022, C3 AI DTI, Development Operations Team +* All rights reserved. License: https://github.com/c3aidti/.github +**/ +/** +* This type hosts data for models that cannot obtain Features +* directly from other entity types. +*/ +entity type StagedFeatures schema name "STGD_FTRS" { + feature: map suffix name "FTR" +} From cbc7b46af1852d70876533bd79c74b1ae62ea97a Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 31 Aug 2022 13:38:44 -0500 Subject: [PATCH 02/58] lil fix --- .../gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ index 41903dff..3e227a38 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ @@ -7,5 +7,5 @@ * directly from other entity types. */ entity type StagedFeatures schema name "STGD_FTRS" { - feature: map suffix name "FTR" + feature: map schema suffix "FTR" } From 56cbdb2bb543b40ee8bbcf1c7fff6df2c64d8696 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 31 Aug 2022 14:22:51 -0500 Subject: [PATCH 03/58] frankeinteining --- .../GaussianProcessRegressionPipe.py | 20 ++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index 8f032df0..ece71b5d 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -71,8 +71,26 @@ def getFeatures(this): import pandas as pd dataSourceSpec = c3.GPRDataSourceSpec.get(this.dataSourceSpec.id) - featuresType = dataSourceSpec.featuresType.toType() + + if (featuresType.name == "StagedFeatures"): + features = c3.StagedFeatures.fetch({ + "limit": -1, + "order": "id" + }).objs.toJson() + + df = pd.DataFrame(features) + keys = df.iloc[0]["feature"].keys() + + for key in keys: + df[key] = df["feature"].apply(lambda x: x[key]) + + df.drop("version", axis=1, inplace=True) + df = df.select_dtypes(["number"]) + + return c3.Dataset.fromPython(df) + + inputTableC3 = featuresType.fetch(dataSourceSpec.featuresSpec).objs.toJson() inputTablePandas = pd.DataFrame(inputTableC3) inputTablePandas = inputTablePandas.drop("version", axis=1) From 2425ad738fd9d91f6a8be0f9657301c1123e993d Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 31 Aug 2022 14:41:04 -0500 Subject: [PATCH 04/58] try targets now --- .../GaussianProcessRegressionPipe.py | 24 ++++++++++++++++--- .../Utils/DataStaging/StagedFeatures.c3typ | 2 +- .../src/Utils/DataStaging/StagedTargets.c3typ | 11 +++++++++ 3 files changed, 33 insertions(+), 4 deletions(-) create mode 100644 training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index ece71b5d..5b5b56f9 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -80,10 +80,10 @@ def getFeatures(this): }).objs.toJson() df = pd.DataFrame(features) - keys = df.iloc[0]["feature"].keys() + keys = df.iloc[0]["features"].keys() for key in keys: - df[key] = df["feature"].apply(lambda x: x[key]) + df[key] = df["features"].apply(lambda x: x[key]) df.drop("version", axis=1, inplace=True) df = df.select_dtypes(["number"]) @@ -112,8 +112,26 @@ def getTarget(this): import pandas as pd dataSourceSpec = c3.GPRDataSourceSpec.get(this.dataSourceSpec.id) - targetType = dataSourceSpec.targetType.toType() + + if (targetType.name == "StagedTargets"): + targets = c3.StagedTargets.fetch({ + "limit": -1, + "order": "id" + }).objs.toJson() + + df = pd.DataFrame(targets) + keys = df.iloc[0]["targets"].keys() + + for key in keys: + df[key] = df["targets"].apply(lambda x: x[key]) + + df.drop("version", axis=1, inplace=True) + df = df.select_dtypes(["number"]) + + return c3.Dataset.fromPython(df) + + outputTableC3 = targetType.fetch(dataSourceSpec.targetSpec).objs.toJson() outputTablePandas = pd.DataFrame(outputTableC3) outputTablePandas = outputTablePandas.drop("version", axis=1) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ index 3e227a38..55f5f2ba 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ @@ -7,5 +7,5 @@ * directly from other entity types. */ entity type StagedFeatures schema name "STGD_FTRS" { - feature: map schema suffix "FTR" + features: map schema suffix "FTR" } diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ new file mode 100644 index 00000000..9f861106 --- /dev/null +++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ @@ -0,0 +1,11 @@ +/** +* Copyright (c) 2022, C3 AI DTI, Development Operations Team +* All rights reserved. License: https://github.com/c3aidti/.github +**/ +/** +* This type hosts data for models that cannot obtain Targets +* directly from other entity types. +*/ +entity type StagedTargets schema name "STGD_TRGTS" { + targets: map schema suffix "TRGT" +} From 3e2601cf962257469b5fc92be8ca002453c4b357 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Fri, 16 Sep 2022 09:36:56 -0500 Subject: [PATCH 05/58] adding target center --- .../GPRegression/GaussianProcessRegressionPipe.py | 14 +++++++++++++- .../GaussianProcessRegressionTechnique.c3typ | 3 +++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index 5b5b56f9..ad4d1542 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -12,6 +12,10 @@ def train(this, input, targetOutput, spec): X = c3.Dataset.toNumpy(dataset=input) y = c3.Dataset.toNumpy(dataset=targetOutput) + if (technique.centerTarget): + y = y - y.mean() + + # get kernel object from c3, make it python again kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel) @@ -19,7 +23,15 @@ def train(this, input, targetOutput, spec): gp = GaussianProcessRegressor(kernel=kernel) gp.fit(X, y) - this.trainedModel = c3.MLTrainedModelArtifact(model=c3.PythonSerialization.serialize(obj=gp)) + if (technique.centerTarget): + this.trainedModel = c3.MLTrainedModelArtifact( + model=c3.PythonSerialization.serialize(obj=gp), + targetMean=float(y.mean()) + ) + else: + this.trainedModel = c3.MLTrainedModelArtifact( + model=c3.PythonSerialization.serialize(obj=gp), + ) return this diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ index 71915854..61a975a3 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ @@ -10,4 +10,7 @@ entity type GaussianProcessRegressionTechnique mixes MLTechnique schema name 'GP // the kernel object @ML(hyperParameter=true) kernel: SklearnGPRKernel + // center target data before fitting + @ML(hyperParameter=true) + centerTarget: boolean=false } \ No newline at end of file From 7029e8726750161ccf55ed9f98d8ff4fbf6d1625 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Fri, 16 Sep 2022 09:46:39 -0500 Subject: [PATCH 06/58] lil fix --- .../GPRegression/GaussianProcessRegressionPipe.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index ad4d1542..fdb7b847 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -24,9 +24,11 @@ def train(this, input, targetOutput, spec): gp.fit(X, y) if (technique.centerTarget): + params = {} + params["targetMean"] = float(y.mean()) this.trainedModel = c3.MLTrainedModelArtifact( model=c3.PythonSerialization.serialize(obj=gp), - targetMean=float(y.mean()) + parameterss=params ) else: this.trainedModel = c3.MLTrainedModelArtifact( From b1f76b650215b785f90ee845e25bdb365b83a8ff Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Fri, 16 Sep 2022 09:50:05 -0500 Subject: [PATCH 07/58] typo --- .../GPRegression/GaussianProcessRegressionPipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index fdb7b847..b1282bc5 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -28,7 +28,7 @@ def train(this, input, targetOutput, spec): params["targetMean"] = float(y.mean()) this.trainedModel = c3.MLTrainedModelArtifact( model=c3.PythonSerialization.serialize(obj=gp), - parameterss=params + parameters=params ) else: this.trainedModel = c3.MLTrainedModelArtifact( From 33f8e382d834ef3c7b420fb34dd665236778c834 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Fri, 16 Sep 2022 09:56:25 -0500 Subject: [PATCH 08/58] another fix --- .../GPRegression/GaussianProcessRegressionPipe.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index b1282bc5..c1984f93 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -13,6 +13,7 @@ def train(this, input, targetOutput, spec): y = c3.Dataset.toNumpy(dataset=targetOutput) if (technique.centerTarget): + targetMean = float(y.mean()) y = y - y.mean() @@ -25,7 +26,7 @@ def train(this, input, targetOutput, spec): if (technique.centerTarget): params = {} - params["targetMean"] = float(y.mean()) + params["targetMean"] = targetMean this.trainedModel = c3.MLTrainedModelArtifact( model=c3.PythonSerialization.serialize(obj=gp), parameters=params From 392c4ee6ee1d26c9df70e13206724dff12de8d03 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Fri, 23 Sep 2022 10:24:29 -0500 Subject: [PATCH 09/58] add centerTarget to getPipe method --- training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js b/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js index be6bbd64..6ac6c284 100644 --- a/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js +++ b/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js @@ -23,7 +23,8 @@ function getPipe(excFeats, gstpId, targetName, technique) { }).objs.map(obj => obj.id); // find the techniques - filter = Filter.intersects("kernel.id", kernelIds); + filter = Filter.intersects("kernel.id", kernelIds) + .and().eq("technique.centerTarget", technique.centerTarget); var techIds = GaussianProcessRegressionTechnique.fetch({ "filter": filter.value, "limit": -1, From ce27a669229e2d3a9a4674c96a4f0b39a08778fe Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Mon, 26 Sep 2022 10:09:36 -0500 Subject: [PATCH 10/58] call fix --- training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js b/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js index 6ac6c284..29fe2001 100644 --- a/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js +++ b/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js @@ -24,7 +24,7 @@ function getPipe(excFeats, gstpId, targetName, technique) { // find the techniques filter = Filter.intersects("kernel.id", kernelIds) - .and().eq("technique.centerTarget", technique.centerTarget); + .and().eq("centerTarget", technique.centerTarget); var techIds = GaussianProcessRegressionTechnique.fetch({ "filter": filter.value, "limit": -1, From a43051c47854814dacf2a51a3a78f727dab03072 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 12:44:14 -0500 Subject: [PATCH 11/58] first stab at method --- .../Utils/DataStaging/StagedFeatures.c3typ | 4 ++ .../src/Utils/DataStaging/StagedFeatures.py | 65 +++++++++++++++++++ 2 files changed, 69 insertions(+) create mode 100644 training/gordon-group/src/Utils/DataStaging/StagedFeatures.py diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ index 55f5f2ba..d43eeeef 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ @@ -7,5 +7,9 @@ * directly from other entity types. */ entity type StagedFeatures schema name "STGD_FTRS" { + // the features to be staged features: map schema suffix "FTR" + // method to stage from gstp list + @py(env='gordon_1_0_0') + stageFromAODGPRModelIdlist: function(ids: !any): integer } diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py new file mode 100644 index 00000000..d65a691e --- /dev/null +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py @@ -0,0 +1,65 @@ +def stageFromAODGPRModelIdsList(ids): + """ + Given a list of GaussianProcessRegressionPipes trained with + AOD data, stage the features for each model. + + Input: + ids: list of model ids + + Return: + int: zero if it worked, 1 if it failed + """ + import pandas as pd + + # get data from dataSourceSpec one model + model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec") + data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id) + excludeFeatures = data_source_spec.excludeFeatures + featuresType = data_source_spec.featuresType.toType() + inputTableC3 = featuresType.fetch(data_source_spec.featuresSpec).objs.toJson() + inputTable = pd.DataFrame(inputTableC3) + inputTable = inputTable.drop("version", axis=1) + inputTable = inputTable.select_dtypes(["number"]) + if (dataSourceSpec.excludeFeatures): + inputTable.drop(columns=dataSourceSpec.excludeFeatures, inplace=True) + + # get gstp coordinates from each model + lats = [] + lons = [] + times = [] + for model_id in ids: + model = c3.GaussianProcessRegressionPipe.get(model_id, "dataSourceSpec") + data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec") + gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '') + gstp = c3.GeoSurfaceTimePoint.get(gstp_id) + lats.append(gstp.latitude) + lons.append(gstp.longitude) + times.append(gstp.time) + + def row_to_dict(row): + d = {} + for col in row.index: + d[col] = row[col] + return d + + def add_coords(obj, lat, lon, time): + obj["latitude"] = lat + obj["longitude"] = lon + obj["time"] = time + return + + # build dataframe + df_sim_par = pd.DataFrame() + df_sim_par["features"] = inputTable.apply(row_to_dict, axis=1) + + df = pd.DataFrame() + for i in range(len(lats)): + df_to_add = df_sim_par.copy() + df_to_add["features"].apply(add_coords, args=(lats[i], lons[i], times[i])) + df = pd.concat([df,df_to_add], ignore_index=True) + + df["id"] = df.index + output_records = df.to_dict(orient="records") + c3.StagedFeatures.upsertBatch(objs=output_records) + + return 0 \ No newline at end of file From f6bf0800b54914d3c21abf54f9ecff127740b50e Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 12:53:41 -0500 Subject: [PATCH 12/58] typo --- .../gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ index d43eeeef..d0bd8c42 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ @@ -11,5 +11,5 @@ entity type StagedFeatures schema name "STGD_FTRS" { features: map schema suffix "FTR" // method to stage from gstp list @py(env='gordon_1_0_0') - stageFromAODGPRModelIdlist: function(ids: !any): integer + stageFromAODGPRModelIdsList: function(ids: !any): integer } From 0d3cea2b89d48f541a4548d42c11c06b40eb3363 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 12:59:11 -0500 Subject: [PATCH 13/58] fix syntax bug --- training/gordon-group/src/Utils/DataStaging/StagedFeatures.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py index d65a691e..6bc21b60 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py @@ -20,8 +20,8 @@ def stageFromAODGPRModelIdsList(ids): inputTable = pd.DataFrame(inputTableC3) inputTable = inputTable.drop("version", axis=1) inputTable = inputTable.select_dtypes(["number"]) - if (dataSourceSpec.excludeFeatures): - inputTable.drop(columns=dataSourceSpec.excludeFeatures, inplace=True) + if (excludeFeatures): + inputTable.drop(columns=excludeFeatures, inplace=True) # get gstp coordinates from each model lats = [] From 22da146aa8f6c451422b81532c4744e22f771440 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 13:03:52 -0500 Subject: [PATCH 14/58] making it more flexible --- .../gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ index d0bd8c42..98f8179f 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ @@ -8,7 +8,7 @@ */ entity type StagedFeatures schema name "STGD_FTRS" { // the features to be staged - features: map schema suffix "FTR" + features: map schema suffix "FTR" // method to stage from gstp list @py(env='gordon_1_0_0') stageFromAODGPRModelIdsList: function(ids: !any): integer From dc1fa70acc4d671b81265b55f594542322c36bcf Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 13:12:18 -0500 Subject: [PATCH 15/58] try without timestamps --- .../gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ | 2 +- .../gordon-group/src/Utils/DataStaging/StagedFeatures.py | 5 ++--- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ index 98f8179f..d0bd8c42 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ @@ -8,7 +8,7 @@ */ entity type StagedFeatures schema name "STGD_FTRS" { // the features to be staged - features: map schema suffix "FTR" + features: map schema suffix "FTR" // method to stage from gstp list @py(env='gordon_1_0_0') stageFromAODGPRModelIdsList: function(ids: !any): integer diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py index 6bc21b60..71446df6 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py @@ -42,10 +42,9 @@ def row_to_dict(row): d[col] = row[col] return d - def add_coords(obj, lat, lon, time): + def add_coords(obj, lat, lon): obj["latitude"] = lat obj["longitude"] = lon - obj["time"] = time return # build dataframe @@ -55,7 +54,7 @@ def add_coords(obj, lat, lon, time): df = pd.DataFrame() for i in range(len(lats)): df_to_add = df_sim_par.copy() - df_to_add["features"].apply(add_coords, args=(lats[i], lons[i], times[i])) + df_to_add["features"].apply(add_coords, args=(lats[i], lons[i])) df = pd.concat([df,df_to_add], ignore_index=True) df["id"] = df.index From abae3a949a764313f8378d93bb0f9b1552833963 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 13:20:03 -0500 Subject: [PATCH 16/58] clear table before start --- training/gordon-group/src/Utils/DataStaging/StagedFeatures.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py index 71446df6..6f11cce0 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py @@ -11,6 +11,8 @@ def stageFromAODGPRModelIdsList(ids): """ import pandas as pd + c3.StagedFeatures.removeAll() + # get data from dataSourceSpec one model model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec") data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id) From ee487af89fe4c34a5816370b7089168ebb0db709 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 13:51:21 -0500 Subject: [PATCH 17/58] okay --- .../src/Utils/DataStaging/StagedTargets.c3typ | 4 ++ .../src/Utils/DataStaging/StagedTargets.py | 51 +++++++++++++++++++ 2 files changed, 55 insertions(+) create mode 100644 training/gordon-group/src/Utils/DataStaging/StagedTargets.py diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ index 9f861106..6d31dcab 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ +++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ @@ -7,5 +7,9 @@ * directly from other entity types. */ entity type StagedTargets schema name "STGD_TRGTS" { + // the staged targets targets: map schema suffix "TRGT" + // method to stage from gstp list + @py(env='gordon_1_0_0') + stageFromAODGPRModelIdsList: function(ids: !any): integer } diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py new file mode 100644 index 00000000..74d45700 --- /dev/null +++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py @@ -0,0 +1,51 @@ +def stageFromAODGPRModelIdsList(ids): + """ + Given a list of GaussianProcessRegressionPipes trained with + AOD data, stage the targets for each model. + + Input: + ids: list of model ids + + Return: + int: zero if it worked, 1 if it failed + """ + import pandas as pd + + c3.StagedTargets.removeAll() + + model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec") + data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id) + target_type = data_source_spec.targetType.toType() + + df = pd.DataFrame() + + for model_id in ids: + model = c3.GaussianProcessRegressionPipe.get(model_id, "dataSourceSpec") + data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id) + outputC3 = target_type.fetch(data_source_spec.targetSpec).objs.toJson() + output = pd.DataFrame(outputC3) + output = output.drop("version", axis=1) + if data_source_spec.targetName == "all": + output = pd.DataFrame( + output.sum(axis=1), + columns=[data_source_spec.targetName] + ) + else: + output = pd.DataFrame(output[data_source_spec.targetName]) + + df = pd.concat([df, output], ignore_index=True) + + def row_to_dict(row): + d = {} + for col in row.index: + d[col] = row[col] + return d + + df_final = pd.DataFrame() + df_final["targets"] = df.apply(row_to_dict, axis=1) + df_final["id"] = df_final.index + + output_records = df_final.to_dict(orient="records") + c3.StagedTargets.upsertBatch(objs=output_records) + + return 0 From e464e5221c54636918f443d990ba2a3648e7640d Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 14:11:50 -0500 Subject: [PATCH 18/58] simpler! --- .../src/Utils/DataStaging/StagedTargets.py | 24 ++++--------------- 1 file changed, 4 insertions(+), 20 deletions(-) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py index 74d45700..bbc202f6 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py +++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py @@ -12,28 +12,12 @@ def stageFromAODGPRModelIdsList(ids): import pandas as pd c3.StagedTargets.removeAll() - - model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec") - data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id) - target_type = data_source_spec.targetType.toType() - + df = pd.DataFrame() - for model_id in ids: - model = c3.GaussianProcessRegressionPipe.get(model_id, "dataSourceSpec") - data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id) - outputC3 = target_type.fetch(data_source_spec.targetSpec).objs.toJson() - output = pd.DataFrame(outputC3) - output = output.drop("version", axis=1) - if data_source_spec.targetName == "all": - output = pd.DataFrame( - output.sum(axis=1), - columns=[data_source_spec.targetName] - ) - else: - output = pd.DataFrame(output[data_source_spec.targetName]) - - df = pd.concat([df, output], ignore_index=True) + model = c3.GaussianProcessRegressionPipe.get(model_id) + pdf = model.getTarget() + df = pd.concat([df,pdf], ignore_index=True) def row_to_dict(row): d = {} From ee35cf278cfa9622fda4cd5890667ceed51e1567 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 14:15:58 -0500 Subject: [PATCH 19/58] extra method call --- training/gordon-group/src/Utils/DataStaging/StagedTargets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py index bbc202f6..cafd3dad 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedTargets.py +++ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.py @@ -16,7 +16,7 @@ def stageFromAODGPRModelIdsList(ids): df = pd.DataFrame() for model_id in ids: model = c3.GaussianProcessRegressionPipe.get(model_id) - pdf = model.getTarget() + pdf = c3.Dataset.toPandas(model.getTarget()) df = pd.concat([df,pdf], ignore_index=True) def row_to_dict(row): From 6c6c4ed302e23ea6a47f02ccc91f7b22559c0f9e Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 14:38:42 -0500 Subject: [PATCH 20/58] cleaner bur likely slower --- .../src/Utils/DataStaging/StagedFeatures.py | 55 ++++--------------- 1 file changed, 11 insertions(+), 44 deletions(-) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py index 6f11cce0..64a53446 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py @@ -13,54 +13,21 @@ def stageFromAODGPRModelIdsList(ids): c3.StagedFeatures.removeAll() - # get data from dataSourceSpec one model - model = c3.GaussianProcessRegressionPipe.get(ids[0], "dataSourceSpec") - data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id) - excludeFeatures = data_source_spec.excludeFeatures - featuresType = data_source_spec.featuresType.toType() - inputTableC3 = featuresType.fetch(data_source_spec.featuresSpec).objs.toJson() - inputTable = pd.DataFrame(inputTableC3) - inputTable = inputTable.drop("version", axis=1) - inputTable = inputTable.select_dtypes(["number"]) - if (excludeFeatures): - inputTable.drop(columns=excludeFeatures, inplace=True) - - # get gstp coordinates from each model - lats = [] - lons = [] - times = [] + df = pd.DataFrame() for model_id in ids: - model = c3.GaussianProcessRegressionPipe.get(model_id, "dataSourceSpec") + model = c3.GaussianProcessRegressionPipe.get(model_id) data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec") gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '') gstp = c3.GeoSurfaceTimePoint.get(gstp_id) - lats.append(gstp.latitude) - lons.append(gstp.longitude) - times.append(gstp.time) - - def row_to_dict(row): - d = {} - for col in row.index: - d[col] = row[col] - return d - - def add_coords(obj, lat, lon): - obj["latitude"] = lat - obj["longitude"] = lon - return - - # build dataframe - df_sim_par = pd.DataFrame() - df_sim_par["features"] = inputTable.apply(row_to_dict, axis=1) - - df = pd.DataFrame() - for i in range(len(lats)): - df_to_add = df_sim_par.copy() - df_to_add["features"].apply(add_coords, args=(lats[i], lons[i])) - df = pd.concat([df,df_to_add], ignore_index=True) - - df["id"] = df.index - output_records = df.to_dict(orient="records") + pdf = c3.Dataset.toPandas(model.getFeatures()) + pdf["latitude"] = gstp.latitude + pdf["longitude"] = gstp.longitude + df = pd.concat([df,pdf], ignore_index=True) + + df_final = pd.DataFrame() + df_final["features"] = df.apply(row_to_dict, axis=1) + df_final["id"] = df_final.index + output_records = df_final.to_dict(orient="records") c3.StagedFeatures.upsertBatch(objs=output_records) return 0 \ No newline at end of file From 9c8539c154e3b9ae3eb239b0214f2bb9d639444f Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Tue, 4 Oct 2022 14:50:24 -0500 Subject: [PATCH 21/58] forgot function def --- .../gordon-group/src/Utils/DataStaging/StagedFeatures.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py index 64a53446..b6cacd74 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py @@ -24,6 +24,12 @@ def stageFromAODGPRModelIdsList(ids): pdf["longitude"] = gstp.longitude df = pd.concat([df,pdf], ignore_index=True) + def row_to_dict(row): + d = {} + for col in row.index: + d[col] = row[col] + return d + df_final = pd.DataFrame() df_final["features"] = df.apply(row_to_dict, axis=1) df_final["id"] = df_final.index From 72d9d932b4f1670b10e34e8f24dc7aff2cd470d0 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 5 Oct 2022 13:01:06 -0500 Subject: [PATCH 22/58] method to train with staged data --- .../GaussianProcessRegressionPipe.c3typ | 3 ++ .../GaussianProcessRegressionPipe.py | 52 +++++++++++++++++++ 2 files changed, 55 insertions(+) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ index 294929a2..c1290866 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ @@ -28,4 +28,7 @@ entity type GaussianProcessRegressionPipe extends MLLeafPipe t // guarantee that process() is only allowed after train() @py(env='gordon-ML_1_0_0') isProcessable: ~ + // train large model with AOD staged data + @py(env='gordon-ML_1_0_0') + trainWithStagedAOD: member function(modelIds: any): integer } diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index c1984f93..662ad224 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -163,3 +163,55 @@ def getTarget(this): outputTablePandas = pd.DataFrame(outputTablePandas[dataSourceSpec.targetName]) return c3.Dataset.fromPython(outputTablePandas) + + +def trainWithStagedAOD(this, ids): + """ + This method trains a large model with data coming from previously trained + GPR models with AOD data. + + Inputs: + ids: list of GaussianProcessRegressionPipes ids + + Returns: + int: 0 if method worked, 1 otherwise + """ + from sklearn.gaussian_process import GaussianProcessRegressor + + # stage features and targets + c3.StagedFeatures.stageFromAODGPRModelIdsList(ids) + c3.StagedTargets.stageFromAODGPRModelIdsList(ids) + # get data + X = c3.Dataset.toNumpy(dataset=this.getFeatures()) + y = c3.Dataset.toNumpy(dataset=this.getTarget()) + + # generate training technique + technique = c3.GaussianProcessRegressionTechnique.get(this.technique.id) + serializedKernel = c3.SklearnGPRKernel.get(technique.kernel.id) + + if (technique.centerTarget): + targetMean = float(y.mean()) + y = y - y.mean() + + # get kernel object from c3, make it python again + kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel) + + # build and train GPR + gp = GaussianProcessRegressor(kernel=kernel) + gp.fit(X, y) + + if (technique.centerTarget): + params = {} + params["targetMean"] = targetMean + this.trainedModel = c3.MLTrainedModelArtifact( + model=c3.PythonSerialization.serialize(obj=gp), + parameters=params + ) + else: + this.trainedModel = c3.MLTrainedModelArtifact( + model=c3.PythonSerialization.serialize(obj=gp), + ) + + this.upsert() + + return 0 From 13c7fdfe17cfd0f499c884a8c09f5ddabbd774f7 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 5 Oct 2022 13:11:39 -0500 Subject: [PATCH 23/58] change argument name --- .../GPRegression/GaussianProcessRegressionPipe.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index 662ad224..d6d99e70 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -165,7 +165,7 @@ def getTarget(this): return c3.Dataset.fromPython(outputTablePandas) -def trainWithStagedAOD(this, ids): +def trainWithStagedAOD(this, modelIds): """ This method trains a large model with data coming from previously trained GPR models with AOD data. @@ -179,8 +179,8 @@ def trainWithStagedAOD(this, ids): from sklearn.gaussian_process import GaussianProcessRegressor # stage features and targets - c3.StagedFeatures.stageFromAODGPRModelIdsList(ids) - c3.StagedTargets.stageFromAODGPRModelIdsList(ids) + c3.StagedFeatures.stageFromAODGPRModelIdsList(modelIds) + c3.StagedTargets.stageFromAODGPRModelIdsList(modelIds) # get data X = c3.Dataset.toNumpy(dataset=this.getFeatures()) y = c3.Dataset.toNumpy(dataset=this.getTarget()) From 6e5e09c6d3b1faf749f60966ab467d361e3a714d Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 12 Oct 2022 12:56:47 -0500 Subject: [PATCH 24/58] env for gpytorch -- first of many --- .../seed/ActionRuntime/py-gordon-ML_2_0_0.json | 15 +++++++++++++++ 1 file changed, 15 insertions(+) create mode 100644 training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json new file mode 100644 index 00000000..296abc5d --- /dev/null +++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json @@ -0,0 +1,15 @@ +{ + "language": "Python", + "runtimeVersion": "3.8.10", + "modules": { + "conda.scikit-learn":"=0.24.2", + "conda.pandas":"=1.0.1" + }, + "repositories": [ + "https://repo.continuum.io/pkgs/main", + "conda-forge" + ], + "runtime": "CPython", + "name": "py-gordon-ML_2_0_0", + "id": "py-gordon-ML_2_0_0" +} \ No newline at end of file From 36e9badfc6443c749f5a06b1b5b79c5c97320691 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 12 Oct 2022 12:59:57 -0500 Subject: [PATCH 25/58] right --- .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json index 296abc5d..dee4c64c 100644 --- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json +++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json @@ -2,8 +2,7 @@ "language": "Python", "runtimeVersion": "3.8.10", "modules": { - "conda.scikit-learn":"=0.24.2", - "conda.pandas":"=1.0.1" + "conda.scikit-learn":"=0.24.2" }, "repositories": [ "https://repo.continuum.io/pkgs/main", From fc68638573e2447ac111138c7ce4fffd6d641dcb Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 12 Oct 2022 13:39:33 -0500 Subject: [PATCH 26/58] add pandas --- .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json index dee4c64c..2e2fe66f 100644 --- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json +++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json @@ -2,7 +2,8 @@ "language": "Python", "runtimeVersion": "3.8.10", "modules": { - "conda.scikit-learn":"=0.24.2" + "conda.scikit-learn":"=0.24.2", + "conda.pandas":"1.3.0" }, "repositories": [ "https://repo.continuum.io/pkgs/main", From bb93223145039cce43f1f9723d04ad5572b9fa3a Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 12 Oct 2022 13:45:27 -0500 Subject: [PATCH 27/58] forgot something --- .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json index 2e2fe66f..dbbf1929 100644 --- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json +++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json @@ -3,7 +3,7 @@ "runtimeVersion": "3.8.10", "modules": { "conda.scikit-learn":"=0.24.2", - "conda.pandas":"1.3.0" + "conda.pandas":"=1.3.0" }, "repositories": [ "https://repo.continuum.io/pkgs/main", From 6483836feca1df1d70e2db642b946ef9378fdf5f Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 12 Oct 2022 15:13:53 -0500 Subject: [PATCH 28/58] adding validation to GPR train method --- .../GPRegression/GaussianProcessRegressionPipe.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index d6d99e70..32673fed 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -16,6 +16,10 @@ def train(this, input, targetOutput, spec): targetMean = float(y.mean()) y = y - y.mean() + if (technique.validation): + # trim X + # trim y + # get kernel object from c3, make it python again kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel) From 2c13dd58f82dc45eca705810ef7f92eb62b2d0dc Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 12 Oct 2022 15:22:35 -0500 Subject: [PATCH 29/58] shuffling feats and targs --- .../GPRegression/GaussianProcessRegressionPipe.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index 32673fed..64beab8a 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -17,8 +17,11 @@ def train(this, input, targetOutput, spec): y = y - y.mean() if (technique.validation): - # trim X - # trim y + rng = np.random.RandomState(technique.randomSeed) + rng.shuffle(X) + X = X[0:int(technique.splitFraction*len(X))] + rng.shuffle(y) + y = y[0:int(technique.splitFraction*len(y))] # get kernel object from c3, make it python again From 1bd5d2823de43a8976c955f5645e5c105d17f2f9 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Wed, 12 Oct 2022 15:27:03 -0500 Subject: [PATCH 30/58] fix typos --- .../GPRegression/GaussianProcessRegressionPipe.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index 64beab8a..38e1f984 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -19,9 +19,9 @@ def train(this, input, targetOutput, spec): if (technique.validation): rng = np.random.RandomState(technique.randomSeed) rng.shuffle(X) - X = X[0:int(technique.splitFraction*len(X))] + X = X[0:int((1.0 - technique.splitFraction)*len(X))] rng.shuffle(y) - y = y[0:int(technique.splitFraction*len(y))] + y = y[0:int((1.0 - technique.splitFraction)*len(y))] # get kernel object from c3, make it python again From b2d345282d14c478cf5473a97095ff5b181a2358 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Thu, 13 Oct 2022 13:53:17 -0500 Subject: [PATCH 31/58] add torch --- .../seed/ActionRuntime/py-gordon-ML_2_0_0.json | 9 +++++++-- .../GaussianProcessRegressionTechnique.c3typ | 9 +++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json index dbbf1929..fb077a9e 100644 --- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json +++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json @@ -3,11 +3,16 @@ "runtimeVersion": "3.8.10", "modules": { "conda.scikit-learn":"=0.24.2", - "conda.pandas":"=1.3.0" + "conda.pandas":"=1.3.0", + "conda.pytorch":"1.12.1", + "conda.torchvision":"=0.13.1", + "conda.torchaudio":"=0.12.1", + "conda.cudatoolkit":"=11.3" }, "repositories": [ "https://repo.continuum.io/pkgs/main", - "conda-forge" + "conda-forge", + "pytorch" ], "runtime": "CPython", "name": "py-gordon-ML_2_0_0", diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ index 61a975a3..6e1543f0 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ @@ -13,4 +13,13 @@ entity type GaussianProcessRegressionTechnique mixes MLTechnique schema name 'GP // center target data before fitting @ML(hyperParameter=true) centerTarget: boolean=false + // leave fraction of rows for post-validation + @ML(hyperParameter=true) + validation: boolean=false + // random seed to be used by numpy.shuffle + @ML(hyperParameter=true) + randomSeed: integer=42 + // fraction to be left for validation + @ML(hyperParameter=true) + splitFraction: float=0.2 } \ No newline at end of file From 07698ced5581adfc8badae504bdecc3e0512f72f Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Thu, 13 Oct 2022 14:26:59 -0500 Subject: [PATCH 32/58] typo --- .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json index fb077a9e..c5a9a417 100644 --- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json +++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json @@ -4,7 +4,7 @@ "modules": { "conda.scikit-learn":"=0.24.2", "conda.pandas":"=1.3.0", - "conda.pytorch":"1.12.1", + "conda.pytorch":"=1.12.1", "conda.torchvision":"=0.13.1", "conda.torchaudio":"=0.12.1", "conda.cudatoolkit":"=11.3" From cc690e8c18a4da87f054482ad6a0a80d3717a184 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Fri, 14 Oct 2022 10:59:49 -0500 Subject: [PATCH 33/58] ligcc --- .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json index c5a9a417..4a4e4f5c 100644 --- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json +++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json @@ -7,12 +7,14 @@ "conda.pytorch":"=1.12.1", "conda.torchvision":"=0.13.1", "conda.torchaudio":"=0.12.1", - "conda.cudatoolkit":"=11.3" + "conda.cudatoolkit":"=11.3", + "conda.libgcc":"=7.2.0" }, "repositories": [ "https://repo.continuum.io/pkgs/main", "conda-forge", - "pytorch" + "pytorch", + "anaconda" ], "runtime": "CPython", "name": "py-gordon-ML_2_0_0", From 46d0525221b7d5eba5d53abeabf8fdae52236de1 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Fri, 14 Oct 2022 11:26:14 -0500 Subject: [PATCH 34/58] add gpytorch --- .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json index 4a4e4f5c..f115f499 100644 --- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json +++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json @@ -8,7 +8,8 @@ "conda.torchvision":"=0.13.1", "conda.torchaudio":"=0.12.1", "conda.cudatoolkit":"=11.3", - "conda.libgcc":"=7.2.0" + "conda.libgcc":"=7.2.0", + "conda.gpytorch":"=1.9.0" }, "repositories": [ "https://repo.continuum.io/pkgs/main", From ccb8795a67d0439584678c3fc1683c34c4c30b5f Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Fri, 14 Oct 2022 11:44:30 -0500 Subject: [PATCH 35/58] adding dill for serialization --- .../gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json index f115f499..34699175 100644 --- a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json +++ b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json @@ -9,7 +9,8 @@ "conda.torchaudio":"=0.12.1", "conda.cudatoolkit":"=11.3", "conda.libgcc":"=7.2.0", - "conda.gpytorch":"=1.9.0" + "conda.gpytorch":"=1.9.0", + "conda.dill":"=0.2.8.2" }, "repositories": [ "https://repo.continuum.io/pkgs/main", From 4485189e755a68a1259dcb1eed49d1d95cf22caa Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Thu, 27 Oct 2022 15:14:10 -0500 Subject: [PATCH 36/58] keep data in memory --- .../GaussianProcessRegressionPipe.c3typ | 3 + .../GaussianProcessRegressionPipe.py | 64 ++++++++++++++++++- 2 files changed, 66 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ index c1290866..4cd7fa7a 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ @@ -31,4 +31,7 @@ entity type GaussianProcessRegressionPipe extends MLLeafPipe t // train large model with AOD staged data @py(env='gordon-ML_1_0_0') trainWithStagedAOD: member function(modelIds: any): integer + // train with list of GSTPs + @py(env='gordon-ML_1_0_0') + trainWithListOfAODModels: member function(modelIds: any): integer } diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index 38e1f984..8212f665 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -221,4 +221,66 @@ def trainWithStagedAOD(this, modelIds): this.upsert() - return 0 + return 0 + +def trainWithListOfAODModels(this, modelIds): + """ + This method trains a large model with data coming from previously trained + GPR models with AOD data. + + Inputs: + ids: list of GaussianProcessRegressionPipes ids + + Returns: + int: 0 if method worked, 1 otherwise + """ + from sklearn.gaussian_process import GaussianProcessRegressor + + # get data + X = pd.DataFrame() + y = pd.DataFrame() + for model_id in ids: + model = c3.GaussianProcessRegressionPipe.get(model_id) + data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec") + gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '') + gstp = c3.GeoSurfaceTimePoint.get(gstp_id) + px = c3.Dataset.toPandas(model.getFeatures()) + px["latitude"] = gstp.latitude + px["longitude"] = gstp.longitude + X = pd.concat([X,px], ignore_index=True) + + py = c3.Dataset.toPandas(model.getTarget()) + y = pd.concat([y,py], ignore_index=True) + X = X.to_numpy() + y = y.to_numpy() + + # generate training technique + technique = c3.GaussianProcessRegressionTechnique.get(this.technique.id) + serializedKernel = c3.SklearnGPRKernel.get(technique.kernel.id) + + if (technique.centerTarget): + targetMean = float(y.mean()) + y = y - y.mean() + + # get kernel object from c3, make it python again + kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel) + + # build and train GPR + gp = GaussianProcessRegressor(kernel=kernel) + gp.fit(X, y) + + if (technique.centerTarget): + params = {} + params["targetMean"] = targetMean + this.trainedModel = c3.MLTrainedModelArtifact( + model=c3.PythonSerialization.serialize(obj=gp), + parameters=params + ) + else: + this.trainedModel = c3.MLTrainedModelArtifact( + model=c3.PythonSerialization.serialize(obj=gp), + ) + + this.upsert() + + return 0 \ No newline at end of file From 97146a319c54f458108c51a050278538ed841e47 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Thu, 27 Oct 2022 15:36:23 -0500 Subject: [PATCH 37/58] add pandas --- .../GPRegression/GaussianProcessRegressionPipe.py | 1 + 1 file changed, 1 insertion(+) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index 8212f665..d8b19343 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -235,6 +235,7 @@ def trainWithListOfAODModels(this, modelIds): int: 0 if method worked, 1 otherwise """ from sklearn.gaussian_process import GaussianProcessRegressor + import pandas as pd # get data X = pd.DataFrame() From a76dc7e0aa8ac1d849953090c93612d5d29c1864 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Thu, 27 Oct 2022 15:47:55 -0500 Subject: [PATCH 38/58] another trial --- .../GPRegression/GaussianProcessRegressionPipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index d8b19343..7f56d51b 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -240,7 +240,7 @@ def trainWithListOfAODModels(this, modelIds): # get data X = pd.DataFrame() y = pd.DataFrame() - for model_id in ids: + for model_id in modelIds: model = c3.GaussianProcessRegressionPipe.get(model_id) data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec") gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '') From 81f42196b8a9d53b907b4bfc9673ba0c880ad5fe Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Thu, 27 Oct 2022 15:53:37 -0500 Subject: [PATCH 39/58] source spec not required --- .../GPRegression/GaussianProcessRegressionPipe.c3typ | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ index 4cd7fa7a..585b16e2 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ @@ -11,7 +11,7 @@ entity type GaussianProcessRegressionPipe extends MLLeafPipe t // the technique for this regression technique: !GaussianProcessRegressionTechnique // data source spec for this regression - dataSourceSpec: !GPRDataSourceSpec + dataSourceSpec: GPRDataSourceSpec // get features data @py(env='gordon-ML_1_0_0') From 321a00e8b11b3025fa659a461485fb9577fd2ca4 Mon Sep 17 00:00:00 2001 From: James Carzon Date: Thu, 27 Oct 2022 22:48:26 -0400 Subject: [PATCH 40/58] Include time in hours as staged feature --- .../gordon-group/src/Utils/DataStaging/StagedFeatures.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py index b6cacd74..861e8942 100644 --- a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py +++ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py @@ -10,6 +10,7 @@ def stageFromAODGPRModelIdsList(ids): int: zero if it worked, 1 if it failed """ import pandas as pd + from datetime import timedelta c3.StagedFeatures.removeAll() @@ -22,6 +23,12 @@ def stageFromAODGPRModelIdsList(ids): pdf = c3.Dataset.toPandas(model.getFeatures()) pdf["latitude"] = gstp.latitude pdf["longitude"] = gstp.longitude + my_time = gstp.time.timetuple() + pdf["time"] = timedelta( + days=my_time.tm_yday, + minutes=my_time.tm_min, + hours=my_time.tm_hour + ).total_seconds() / 3600 df = pd.concat([df,pdf], ignore_index=True) def row_to_dict(row): From d3f0dd422a8f6feeec87c9ee272811ad27438093 Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Thu, 3 Nov 2022 11:07:44 -0500 Subject: [PATCH 41/58] add excl feats --- .../GPRegression/GaussianProcessRegressionPipe.c3typ | 2 +- .../GPRegression/GaussianProcessRegressionPipe.py | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ index 585b16e2..f8c2f44b 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ @@ -33,5 +33,5 @@ entity type GaussianProcessRegressionPipe extends MLLeafPipe t trainWithStagedAOD: member function(modelIds: any): integer // train with list of GSTPs @py(env='gordon-ML_1_0_0') - trainWithListOfAODModels: member function(modelIds: any): integer + trainWithListOfAODModels: member function(modelIds: any, excludeFeatures: any): integer } diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index 7f56d51b..324d11fc 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -223,7 +223,7 @@ def trainWithStagedAOD(this, modelIds): return 0 -def trainWithListOfAODModels(this, modelIds): +def trainWithListOfAODModels(this, modelIds, excludeFeatures): """ This method trains a large model with data coming from previously trained GPR models with AOD data. @@ -252,6 +252,7 @@ def trainWithListOfAODModels(this, modelIds): py = c3.Dataset.toPandas(model.getTarget()) y = pd.concat([y,py], ignore_index=True) + X.drop(excludeFeatures, axis=1, inplace=True) X = X.to_numpy() y = y.to_numpy() From b71cbb95728897e7444d45dafc7701150234daaf Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Thu, 3 Nov 2022 12:20:34 -0500 Subject: [PATCH 42/58] making it more readable --- .../GPRegression/GaussianProcessRegressionPipe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index 324d11fc..d4316209 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -252,7 +252,7 @@ def trainWithListOfAODModels(this, modelIds, excludeFeatures): py = c3.Dataset.toPandas(model.getTarget()) y = pd.concat([y,py], ignore_index=True) - X.drop(excludeFeatures, axis=1, inplace=True) + X.drop(columns=excludeFeatures, inplace=True) X = X.to_numpy() y = y.to_numpy() From 312779234f2d8f68f1d608297a20852c4e5e2a80 Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Thu, 3 Nov 2022 17:00:32 -0400 Subject: [PATCH 43/58] Include time in hours as action feature --- .../GPRegression/GaussianProcessRegressionPipe.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py index d4316209..5ba1084e 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py @@ -236,6 +236,7 @@ def trainWithListOfAODModels(this, modelIds, excludeFeatures): """ from sklearn.gaussian_process import GaussianProcessRegressor import pandas as pd + from datetime import timedelta # get data X = pd.DataFrame() @@ -245,9 +246,15 @@ def trainWithListOfAODModels(this, modelIds, excludeFeatures): data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec") gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '') gstp = c3.GeoSurfaceTimePoint.get(gstp_id) + my_time = gstp.time.timetuple() px = c3.Dataset.toPandas(model.getFeatures()) px["latitude"] = gstp.latitude px["longitude"] = gstp.longitude + px["time"] = timedelta( + days=my_time.tm_yday, + minutes=my_time.tm_min, + hours=my_time.tm_hour + ).total_seconds() / 3600 X = pd.concat([X,px], ignore_index=True) py = c3.Dataset.toPandas(model.getTarget()) From 4e86563c6b73539732d4532fd97e72fbd923093e Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Sat, 5 Nov 2022 20:46:01 -0400 Subject: [PATCH 44/58] Predict using DynMapReduce --- .../src/Utils/Predict/PredictAODGPR.c3typ | 22 +++++ .../src/Utils/Predict/PredictAODGPR.js | 59 +++++++++++++ .../src/Utils/Predict/PredictAODGPR.py | 87 +++++++++++++++++++ 3 files changed, 168 insertions(+) create mode 100644 training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ create mode 100644 training/gordon-group/src/Utils/Predict/PredictAODGPR.js create mode 100644 training/gordon-group/src/Utils/Predict/PredictAODGPR.py diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ new file mode 100644 index 00000000..91325661 --- /dev/null +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ @@ -0,0 +1,22 @@ +/** +* Copyright (c) 2022, C3 AI DTI, Development Operations Team +* All rights reserved. License: https://github.com/c3aidti/.github +**/ +/** +* This finds {@link GaussianProcessRegressionPipe}s that were trained +* with {@link Simulation3HourlyAODOutput} as targets, +* {@link SimulationModelParameters} as features, +* via a {@link AODGaussianMLTrainingJob} +*/ +type PredictAODGPR { + // Retrieve models based on exluded features, {@link GeoSurfaceTimePoint} instance, target name and training technique + getPipe: function(excFeats: [string], gstpId: string, targetName: string, technique: any): any js server + // Retrieve all models for a certain {@link GeoSurfaceTimePoint} filter + getPipes: function(excFeats: [string], gstpFilter: any, targetName: string, technique: any): any js server + // Extract learned parameters from trained {@link GaussianProcessRegressionPipe}s specified by {@link GeoSurfaceTimePoint} filter, target name, excluded features and {@link GaussianProcessRegressionTechnque} + @py(env='gordon-ML_1_0_0') + makePredictionsJob: function(excFeats: [string], gstpFilter: any, synthDataset: any, targetName: string, technique: any, batchSize: int): any + // Build a pandas dataframe with the hyper parameters once job is complete + @py(env='gordon-ML_1_0_0') + getPredictionsDataframeFromJob: inline function(job: any): any +} \ No newline at end of file diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.js b/training/gordon-group/src/Utils/Predict/PredictAODGPR.js new file mode 100644 index 00000000..93b9c7a0 --- /dev/null +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.js @@ -0,0 +1,59 @@ +function getPipe(excFeats, gstpId, targetName, technique) { + // identical to the methods used in AODGPRModelFinder.js + + // find the data source specs + var gstpKey = "geoSurfaceTimePoint.id == \"" + gstpId + "\""; + var filter = Filter.eq("featuresType.typeName", "SimulationModelParameters") + .and().eq("targetType.typeName", "Simulation3HourlyAODOutput") + .and().intersects("excludeFeatures", excFeats) + .and().eq("targetName", targetName) + .and().eq("targetSpec.filter", gstpKey); + + var sourceSpecIds = GPRDataSourceSpec.fetch({ + "filter": filter, + "limit": -1, + "include": "id" + }).objs.map(obj => obj.id); + + // find the kernels + filter = Filter.eq("pickledKernel", technique.kernel.pickledKernel); + var kernelIds = SklearnGPRKernel.fetch({ + "filter": filter.value, + "limit": -1, + "include": "id" + }).objs.map(obj => obj.id); + + // find the techniques + filter = Filter.intersects("kernel.id", kernelIds) + .and().eq("centerTarget", technique.centerTarget); + var techIds = GaussianProcessRegressionTechnique.fetch({ + "filter": filter.value, + "limit": -1, + "include": "id" + }).objs.map(obj => obj.id); + + // now find the models + filter = Filter.intersects("technique.id", techIds) + .and().intersects("dataSourceSpec.id", sourceSpecIds); + var pipes = GaussianProcessRegressionPipe.fetch({ + "filter": filter.value, + "limit": -1 + }).objs; + + return pipes +} + +function getPipes(excFeats, gstpFilter, targetName, technique) { + var gstpIds = GeoSurfaceTimePoint.fetch({ + "filter": gstpFilter, + "limit": -1, + "include": "id" + }).objs.map(obj => obj.id); + + var pipes = gstpIds.map(id => AODGPRModelFinder.getPipe(excFeats, id, targetName, technique)); + var nonNulls = pipes.filter(function (el) { + return el.length != 0; + }); + + return nonNulls +} \ No newline at end of file diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py new file mode 100644 index 00000000..cc1decc6 --- /dev/null +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -0,0 +1,87 @@ +def makePredictionsJob( + excFeats, gstpFilter, targetName, synthDataset, technique, batchSize +): + """ + Dynamic map-reduce job to get predictions on synthDataset. + """ + def cassandra_mapper(batch, objs, job): + models = [] + for obj in objs: + model = c3.AODGPRModelFinder.getPipe( + job.context.value["excludeFeatures"], + obj.id, + job.context.value["targetName"], + job.context.value["technique"] + ) + models.append(model) + + return {batch: models} + + def cassandra_reducer(key, interValues, job): + values = [] + for iv in interValues: + for val in iv: + for m in val: + model_id = m["id"] + centered = m["technique"]["centerTarget"] + if centered: + center = m["trainedModel"].parameters["targetMean"].asfloat() + else: + center = 0 + preds = m.process(synthDataset, computeStd=True) + values.append((preds, synthDataset, model_id, center)) + + + return values + + map_lambda = c3.Lambda.fromPython(cassandra_mapper) + reduce_lambda = c3.Lambda.fromPython(cassandra_reducer, runtime="gordon-ML_1_0_0") + + job_context = c3.MappObj( + value={ + 'excludeFeatures': excFeats, + 'targetName': targetName, + 'technique': technique, + 'syntheticDataset': synthDataset + } + ) + job = c3.DynMapReduce.startFromSpec( + c3.DynMapReduceSpec( + targetType="GeoSurfaceTimePoint", + filter=gstpFilter, + mapLambda=map_lambda, + reduceLambda=reduce_lambda, + batchSize=batchSize, + context=job_context + ) + ) + + return job + + +def getPredictionsDataframeFromJob(job): + """ + Iterates over job result and builds dataframe. + """ + import pandas as pd + import numpy as np + + predictions = [] + + if job.status().status == "completed": + for key, value in job.results().items(): + for subvalue in value: + df_y = c3.Dataset.toPandas(subvalue[0]) + df_y[0] += subvalue[3] + df_x = c3.Dataset.toPandas(subvalue[1]) + m_preds = pd.concat( + [df_x, df_y], + axis=1 + ) + m_preds["modelId"] = subvalue[2] + predictions.append(m_preds) + + df = pd.concat(predictions, axis=0).reset_index(drop=True) + return df + else: + return False \ No newline at end of file From 0c5316b3dac4172b9bd01abdcd467b67ef75ccf8 Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Sat, 5 Nov 2022 20:57:53 -0400 Subject: [PATCH 45/58] Match order of arguments --- training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ index 91325661..608df842 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.c3typ @@ -15,7 +15,7 @@ type PredictAODGPR { getPipes: function(excFeats: [string], gstpFilter: any, targetName: string, technique: any): any js server // Extract learned parameters from trained {@link GaussianProcessRegressionPipe}s specified by {@link GeoSurfaceTimePoint} filter, target name, excluded features and {@link GaussianProcessRegressionTechnque} @py(env='gordon-ML_1_0_0') - makePredictionsJob: function(excFeats: [string], gstpFilter: any, synthDataset: any, targetName: string, technique: any, batchSize: int): any + makePredictionsJob: function(excFeats: [string], gstpFilter: any, targetName: string, synthDataset: any, technique: any, batchSize: int): any // Build a pandas dataframe with the hyper parameters once job is complete @py(env='gordon-ML_1_0_0') getPredictionsDataframeFromJob: inline function(job: any): any From e022e7f3a97fb77afe6f021510dc052823d0672b Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 7 Nov 2022 10:47:28 -0500 Subject: [PATCH 46/58] Try prediction with unpickled model --- training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index cc1decc6..e7b4eb21 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -19,6 +19,7 @@ def cassandra_mapper(batch, objs, job): def cassandra_reducer(key, interValues, job): values = [] + synthDataframe = c3.Dataset.toPandas(synthDataset) for iv in interValues: for val in iv: for m in val: @@ -28,7 +29,10 @@ def cassandra_reducer(key, interValues, job): center = m["trainedModel"].parameters["targetMean"].asfloat() else: center = 0 - preds = m.process(synthDataset, computeStd=True) + pickledModel = m["trainedModel"]["model"] + model = c3.PythonSerialization.deserialize(serialized=pickledModel) + mean, sd = model.predict(synthDataframe, return_std=True) + preds = pd.concat([pd.DataFrame(mean), pd.DataFrame(sd)], axis=1) values.append((preds, synthDataset, model_id, center)) From f9370feabc4637bcfa66c8ca82d5356a678434d2 Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 7 Nov 2022 13:10:00 -0500 Subject: [PATCH 47/58] Grab synthDataset from job context --- training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index e7b4eb21..10641a4d 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -19,7 +19,7 @@ def cassandra_mapper(batch, objs, job): def cassandra_reducer(key, interValues, job): values = [] - synthDataframe = c3.Dataset.toPandas(synthDataset) + synthDataframe = c3.Dataset.toPandas(job.context.value["syntheticDataset"]) for iv in interValues: for val in iv: for m in val: From 72b65930c97d53bfeadc2a76bd5f1277c72313f2 Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 7 Nov 2022 13:35:50 -0500 Subject: [PATCH 48/58] Import pandas for method --- training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index 10641a4d..755f8d9f 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -4,6 +4,8 @@ def makePredictionsJob( """ Dynamic map-reduce job to get predictions on synthDataset. """ + import pandas as pd + def cassandra_mapper(batch, objs, job): models = [] for obj in objs: From cd3443003d6e2d8fdff24064d2442e25fc1a1bbd Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 7 Nov 2022 13:55:37 -0500 Subject: [PATCH 49/58] Keep predictions without pandas --- training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index 755f8d9f..3cd67fdc 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -4,7 +4,6 @@ def makePredictionsJob( """ Dynamic map-reduce job to get predictions on synthDataset. """ - import pandas as pd def cassandra_mapper(batch, objs, job): models = [] @@ -34,8 +33,7 @@ def cassandra_reducer(key, interValues, job): pickledModel = m["trainedModel"]["model"] model = c3.PythonSerialization.deserialize(serialized=pickledModel) mean, sd = model.predict(synthDataframe, return_std=True) - preds = pd.concat([pd.DataFrame(mean), pd.DataFrame(sd)], axis=1) - values.append((preds, synthDataset, model_id, center)) + values.append((mean, sd, synthDataset, model_id, center)) return values From 872eb02a9efef910949d91a18c2919c0679f37f9 Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 7 Nov 2022 14:15:12 -0500 Subject: [PATCH 50/58] Typo: returned unused object --- training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index 3cd67fdc..27829a03 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -33,7 +33,7 @@ def cassandra_reducer(key, interValues, job): pickledModel = m["trainedModel"]["model"] model = c3.PythonSerialization.deserialize(serialized=pickledModel) mean, sd = model.predict(synthDataframe, return_std=True) - values.append((mean, sd, synthDataset, model_id, center)) + values.append((mean, sd, model_id, center)) return values From 928e552d47f828529645bda047607f95ba22bc5c Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 7 Nov 2022 14:48:30 -0500 Subject: [PATCH 51/58] Restructure resultant df of predictions --- .../src/Utils/Predict/PredictAODGPR.py | 20 +++++++++---------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index 27829a03..90c05bd5 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -71,20 +71,18 @@ def getPredictionsDataframeFromJob(job): import numpy as np predictions = [] - + if job.status().status == "completed": for key, value in job.results().items(): for subvalue in value: - df_y = c3.Dataset.toPandas(subvalue[0]) - df_y[0] += subvalue[3] - df_x = c3.Dataset.toPandas(subvalue[1]) - m_preds = pd.concat( - [df_x, df_y], - axis=1 - ) - m_preds["modelId"] = subvalue[2] - predictions.append(m_preds) - + df_m = pd.DataFrame() + df_m["mean"] = np.array(subvalue[0]).flatten() + df_m["mean"] += subvalue[3] + df_m["sd"] = subvalue[1] + df_m["modelId"] = subvalue[2] + + predictions.append(df_m) + df = pd.concat(predictions, axis=0).reset_index(drop=True) return df else: From c7eb9e1bf2df2de0f94132d63ab052d361c399b1 Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 7 Nov 2022 21:33:30 -0500 Subject: [PATCH 52/58] Add lat, lon, time to prediction results --- .../src/Utils/Predict/PredictAODGPR.py | 27 ++++++++++++++----- 1 file changed, 21 insertions(+), 6 deletions(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index 90c05bd5..6beedbc6 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -24,6 +24,7 @@ def cassandra_reducer(key, interValues, job): for iv in interValues: for val in iv: for m in val: + # predictions model_id = m["id"] centered = m["technique"]["centerTarget"] if centered: @@ -33,7 +34,16 @@ def cassandra_reducer(key, interValues, job): pickledModel = m["trainedModel"]["model"] model = c3.PythonSerialization.deserialize(serialized=pickledModel) mean, sd = model.predict(synthDataframe, return_std=True) - values.append((mean, sd, model_id, center)) + + # location + dssId = m["dataSourceSpec"]["id"] + dss = c3.GPRDataSourceSpec.get(dssId) + gstpId = dss.targetSpec.filter.split(" == ")[1].replace('"', '') + gstp = c3.GeoSurfaceTimePoint.get(gstpId) + lat = gstp.latitude + lon = gstp.longitude + time = gstp.time + values.append((model_id, mean, center, sd, synthDataframe, lat, lon, time)) return values @@ -74,12 +84,17 @@ def getPredictionsDataframeFromJob(job): if job.status().status == "completed": for key, value in job.results().items(): - for subvalue in value: + for subvalue in value: #(model_id, mean, center, sd, synthDataframe, lat, lon, time) df_m = pd.DataFrame() - df_m["mean"] = np.array(subvalue[0]).flatten() - df_m["mean"] += subvalue[3] - df_m["sd"] = subvalue[1] - df_m["modelId"] = subvalue[2] + df_m["mean"] = np.array(subvalue[1]).flatten() + df_m["mean"] += subvalue[2] + df_m["sd"] = subvalue[3] + df_m["lat"] = subvalue[5] + df_m["lon"] = subvalue[6] + df_m["time"] = subvalue[7] + df_m["modelId"] = subvalue[0] + + df_m = pd.concat([df_m, subvalue[4]], axis=1) predictions.append(df_m) From 67e11bfc7c7f8ae73c79fb8b42a3b98f21014aeb Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 7 Nov 2022 22:08:09 -0500 Subject: [PATCH 53/58] Don't save synthDataset in results --- training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index 6beedbc6..d853670b 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -43,7 +43,7 @@ def cassandra_reducer(key, interValues, job): lat = gstp.latitude lon = gstp.longitude time = gstp.time - values.append((model_id, mean, center, sd, synthDataframe, lat, lon, time)) + values.append((model_id, mean, center, sd, lat, lon, time)) return values @@ -94,8 +94,6 @@ def getPredictionsDataframeFromJob(job): df_m["time"] = subvalue[7] df_m["modelId"] = subvalue[0] - df_m = pd.concat([df_m, subvalue[4]], axis=1) - predictions.append(df_m) df = pd.concat(predictions, axis=0).reset_index(drop=True) From 0b2489dde9d528bcff5bb91df2e129e70927ef3c Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 7 Nov 2022 22:53:24 -0500 Subject: [PATCH 54/58] Indexing subvalues correctly --- training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index d853670b..360815e9 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -84,14 +84,14 @@ def getPredictionsDataframeFromJob(job): if job.status().status == "completed": for key, value in job.results().items(): - for subvalue in value: #(model_id, mean, center, sd, synthDataframe, lat, lon, time) + for subvalue in value: df_m = pd.DataFrame() df_m["mean"] = np.array(subvalue[1]).flatten() df_m["mean"] += subvalue[2] df_m["sd"] = subvalue[3] - df_m["lat"] = subvalue[5] - df_m["lon"] = subvalue[6] - df_m["time"] = subvalue[7] + df_m["lat"] = subvalue[4] + df_m["lon"] = subvalue[5] + df_m["time"] = subvalue[6] df_m["modelId"] = subvalue[0] predictions.append(df_m) From f02f52a949f65bcf106f744b9ae6dbe60af0cc51 Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Wed, 9 Nov 2022 07:23:05 -0500 Subject: [PATCH 55/58] Clean up predictions data frame --- .../src/Utils/Predict/PredictAODGPR.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index 360815e9..13de2e6f 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -86,17 +86,17 @@ def getPredictionsDataframeFromJob(job): for key, value in job.results().items(): for subvalue in value: df_m = pd.DataFrame() - df_m["mean"] = np.array(subvalue[1]).flatten() - df_m["mean"] += subvalue[2] - df_m["sd"] = subvalue[3] - df_m["lat"] = subvalue[4] - df_m["lon"] = subvalue[5] + df_m["meanResponse"] = np.array(subvalue[1]).flatten() + df_m["meanResponse"] += subvalue[2] + df_m["sdResponse"] = subvalue[3] + df_m["latitude"] = subvalue[4] + df_m["longitude"] = subvalue[5] df_m["time"] = subvalue[6] df_m["modelId"] = subvalue[0] predictions.append(df_m) - df = pd.concat(predictions, axis=0).reset_index(drop=True) - return df + # df = pd.concat(predictions, axis=0).reset_index(drop=True) + return predictions else: return False \ No newline at end of file From 049256e938ac28db26dca5faf7536458183c7df8 Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Wed, 9 Nov 2022 11:13:52 -0500 Subject: [PATCH 56/58] Return one df rather than list of dfs --- training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index 13de2e6f..10148377 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -96,7 +96,7 @@ def getPredictionsDataframeFromJob(job): predictions.append(df_m) - # df = pd.concat(predictions, axis=0).reset_index(drop=True) + df = pd.concat(predictions, axis=0).reset_index(drop=True) return predictions else: return False \ No newline at end of file From 787a0bb6cc067601b6c5e392c3cdd159c24df18b Mon Sep 17 00:00:00 2001 From: JamesCarzon <86118662+JamesCarzon@users.noreply.github.com> Date: Mon, 14 Nov 2022 18:27:12 -0500 Subject: [PATCH 57/58] Track model variants in predictions --- training/gordon-group/src/Utils/Predict/PredictAODGPR.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py index 10148377..4daba9c8 100644 --- a/training/gordon-group/src/Utils/Predict/PredictAODGPR.py +++ b/training/gordon-group/src/Utils/Predict/PredictAODGPR.py @@ -93,10 +93,11 @@ def getPredictionsDataframeFromJob(job): df_m["longitude"] = subvalue[5] df_m["time"] = subvalue[6] df_m["modelId"] = subvalue[0] + df_m["variant"] = list(range(df_m.shape[0])) predictions.append(df_m) df = pd.concat(predictions, axis=0).reset_index(drop=True) - return predictions + return df else: return False \ No newline at end of file From 95438adb80457a9ca03b7cf8799bd81d19eeb31d Mon Sep 17 00:00:00 2001 From: Bruno Abreu Date: Fri, 20 Jan 2023 13:26:14 -0600 Subject: [PATCH 58/58] bogus comment to kick workflow --- .../GPRegression/GaussianProcessRegressionPipe.c3typ | 1 + 1 file changed, 1 insertion(+) diff --git a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ index f8c2f44b..3a6a55fe 100644 --- a/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ +++ b/training/gordon-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ @@ -5,6 +5,7 @@ /** * GaussianProcessRegressionPipe.c3typ * Performs Scikit-Learn's GP Regression. +* Bogus comment to reprovision app. */ @db(unique=['technique, dataSourceSpec']) entity type GaussianProcessRegressionPipe extends MLLeafPipe type key 'GPREG' {