c3aidti · babreu-ncsa · Aug 31, 2022 · Aug 31, 2022 · Aug 31, 2022 · Aug 31, 2022
diff --git a/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json b/training/gordon-group/seed/ActionRuntime/py-gordon-ML_2_0_0.json
@@ -0,0 +1,24 @@
+{
+    "language": "Python",
+    "runtimeVersion": "3.8.10",
+    "modules": {
+      "conda.scikit-learn":"=0.24.2",
+      "conda.pandas":"=1.3.0",
+      "conda.pytorch":"=1.12.1",
+      "conda.torchvision":"=0.13.1",
+      "conda.torchaudio":"=0.12.1",
+      "conda.cudatoolkit":"=11.3",
+      "conda.libgcc":"=7.2.0",
+      "conda.gpytorch":"=1.9.0",
+      "conda.dill":"=0.2.8.2"
+    },
+    "repositories": [
+      "https://repo.continuum.io/pkgs/main",
+      "conda-forge",
+      "pytorch",
+      "anaconda"
+    ],
+    "runtime": "CPython",
+    "name": "py-gordon-ML_2_0_0",
+    "id": "py-gordon-ML_2_0_0"
+}
diff --git a/...oup/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ b/...oup/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.c3typ
@@ -5,13 +5,14 @@
 /**
 * GaussianProcessRegressionPipe.c3typ
 * Performs Scikit-Learn's GP Regression.
+* Bogus comment to reprovision app.
 */
 @db(unique=['technique, dataSourceSpec'])
 entity type GaussianProcessRegressionPipe extends MLLeafPipe<Dataset, Dataset> type key 'GPREG' {
     // the technique for this regression
     technique: !GaussianProcessRegressionTechnique
     // data source spec for this regression
-    dataSourceSpec: !GPRDataSourceSpec
+    dataSourceSpec: GPRDataSourceSpec
 
     // get features data
     @py(env='gordon-ML_1_0_0')
@@ -28,4 +29,10 @@ entity type GaussianProcessRegressionPipe extends MLLeafPipe<Dataset, Dataset> t
     // guarantee that process() is only allowed after train()
     @py(env='gordon-ML_1_0_0')
     isProcessable: ~
+    // train large model with AOD staged data
+    @py(env='gordon-ML_1_0_0')
+    trainWithStagedAOD: member function(modelIds: any): integer
+    // train with data gathered from a list of previously trained AOD GPR model ids
+    @py(env='gordon-ML_1_0_0')
+    trainWithListOfAODModels: member function(modelIds: any, excludeFeatures: any): integer
 }
diff --git a/...-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py b/...-group/src/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionPipe.py
@@ -12,14 +12,36 @@ def train(this, input, targetOutput, spec):
     X = c3.Dataset.toNumpy(dataset=input)
     y = c3.Dataset.toNumpy(dataset=targetOutput)
 
+    if (technique.centerTarget):
+        targetMean = float(y.mean())
+        y = y - y.mean()
+
+    if (technique.validation):
+        rng = np.random.RandomState(technique.randomSeed)
+        rng.shuffle(X)
+        X = X[0:int((1.0 - technique.splitFraction)*len(X))]
+        rng.shuffle(y)
+        y = y[0:int((1.0 - technique.splitFraction)*len(y))]
+
+
     # get kernel object from c3, make it python again
     kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel)
 
     # build and train GPR
     gp = GaussianProcessRegressor(kernel=kernel)
     gp.fit(X, y)
 
-    this.trainedModel = c3.MLTrainedModelArtifact(model=c3.PythonSerialization.serialize(obj=gp))
+    if (technique.centerTarget):
+        params = {}
+        params["targetMean"] = targetMean
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+            parameters=params
+        )
+    else:
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+        )
 
     return this
 
@@ -71,8 +93,26 @@ def getFeatures(this):
     import pandas as pd
 
     dataSourceSpec = c3.GPRDataSourceSpec.get(this.dataSourceSpec.id)
-
     featuresType = dataSourceSpec.featuresType.toType()
+
+    if (featuresType.name == "StagedFeatures"):
+        features = c3.StagedFeatures.fetch({
+            "limit": -1,
+            "order": "id"
+        }).objs.toJson()
+
+        df = pd.DataFrame(features)
+        keys = df.iloc[0]["features"].keys()
+
+        for key in keys:
+            df[key] = df["features"].apply(lambda x: x[key])
+
+        df.drop("version", axis=1, inplace=True)
+        df = df.select_dtypes(["number"])
+
+        return c3.Dataset.fromPython(df)
+
+
     inputTableC3 = featuresType.fetch(dataSourceSpec.featuresSpec).objs.toJson()
     inputTablePandas = pd.DataFrame(inputTableC3)
     inputTablePandas = inputTablePandas.drop("version", axis=1)
@@ -94,8 +134,26 @@ def getTarget(this):
     import pandas as pd
 
     dataSourceSpec = c3.GPRDataSourceSpec.get(this.dataSourceSpec.id)
-
     targetType = dataSourceSpec.targetType.toType()
+
+    if (targetType.name == "StagedTargets"):
+        targets = c3.StagedTargets.fetch({
+            "limit": -1,
+            "order": "id"
+        }).objs.toJson()
+
+        df = pd.DataFrame(targets)
+        keys = df.iloc[0]["targets"].keys()
+
+        for key in keys:
+            df[key] = df["targets"].apply(lambda x: x[key])
+
+        df.drop("version", axis=1, inplace=True)
+        df = df.select_dtypes(["number"])
+
+        return c3.Dataset.fromPython(df)
+
+
     outputTableC3 = targetType.fetch(dataSourceSpec.targetSpec).objs.toJson()
     outputTablePandas = pd.DataFrame(outputTableC3)
     outputTablePandas = outputTablePandas.drop("version", axis=1)
@@ -112,3 +170,126 @@ def getTarget(this):
         outputTablePandas = pd.DataFrame(outputTablePandas[dataSourceSpec.targetName])
 
     return c3.Dataset.fromPython(outputTablePandas)
+
+
+def trainWithStagedAOD(this, modelIds):
+    """
+    This method trains a large model with data coming from previously trained
+    GPR models with AOD data.
+
+    Inputs:
+        ids: list of GaussianProcessRegressionPipes ids
+
+    Returns:
+        int: 0 if method worked, 1 otherwise
+    """
+    from sklearn.gaussian_process import GaussianProcessRegressor
+
+    # stage features and targets
+    c3.StagedFeatures.stageFromAODGPRModelIdsList(modelIds)
+    c3.StagedTargets.stageFromAODGPRModelIdsList(modelIds)
+    # get data
+    X = c3.Dataset.toNumpy(dataset=this.getFeatures())
+    y = c3.Dataset.toNumpy(dataset=this.getTarget())
+
+    # generate training technique
+    technique = c3.GaussianProcessRegressionTechnique.get(this.technique.id)
+    serializedKernel = c3.SklearnGPRKernel.get(technique.kernel.id)
+
+    if (technique.centerTarget):
+        targetMean = float(y.mean())
+        y = y - y.mean()
+
+    # get kernel object from c3, make it python again
+    kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel)
+
+    # build and train GPR
+    gp = GaussianProcessRegressor(kernel=kernel)
+    gp.fit(X, y)
+
+    if (technique.centerTarget):
+        params = {}
+        params["targetMean"] = targetMean
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+            parameters=params
+        )
+    else:
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+        )
+
+    this.upsert()
+
+    return 0
+
+def trainWithListOfAODModels(this, modelIds, excludeFeatures):
+    """
+    This method trains a large model with data coming from previously trained
+    GPR models with AOD data.
+
+    Inputs:
+        ids: list of GaussianProcessRegressionPipes ids
+
+    Returns:
+        int: 0 if method worked, 1 otherwise
+    """
+    from sklearn.gaussian_process import GaussianProcessRegressor
+    import pandas as pd
+    from datetime import timedelta
+
+    # get data
+    X = pd.DataFrame()
+    y = pd.DataFrame()
+    for model_id in modelIds:
+        model = c3.GaussianProcessRegressionPipe.get(model_id)
+        data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec")
+        gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '')
+        gstp = c3.GeoSurfaceTimePoint.get(gstp_id)
+        my_time = gstp.time.timetuple()
+        px = c3.Dataset.toPandas(model.getFeatures())
+        px["latitude"] = gstp.latitude
+        px["longitude"] = gstp.longitude
+        px["time"] = timedelta(
+            days=my_time.tm_yday,
+            minutes=my_time.tm_min,
+            hours=my_time.tm_hour
+        ).total_seconds() / 3600
+        X = pd.concat([X,px], ignore_index=True)
+
+        py = c3.Dataset.toPandas(model.getTarget())
+        y = pd.concat([y,py], ignore_index=True)
+    X.drop(columns=excludeFeatures, inplace=True)
+    X = X.to_numpy()
+    y = y.to_numpy()
+
+    # generate training technique
+    technique = c3.GaussianProcessRegressionTechnique.get(this.technique.id)
+    serializedKernel = c3.SklearnGPRKernel.get(technique.kernel.id)
+
+    if (technique.centerTarget):
+        targetMean = float(y.mean())
+        y = y - y.mean()
+
+    # get kernel object from c3, make it python again
+    kernel = c3.PythonSerialization.deserialize(serialized=serializedKernel.pickledKernel)
+
+    # build and train GPR
+    gp = GaussianProcessRegressor(kernel=kernel)
+    gp.fit(X, y)
+
+    if (technique.centerTarget):
+        params = {}
+        params["targetMean"] = targetMean
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+            parameters=params
+        )
+    else:
+        this.trainedModel = c3.MLTrainedModelArtifact(
+            model=c3.PythonSerialization.serialize(obj=gp),
+        )
+
+    this.upsert()
+
+    return 0
diff --git a/...rc/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ b/...rc/CustomMLPipeline/machineLearning/GPRegression/GaussianProcessRegressionTechnique.c3typ
@@ -10,4 +10,16 @@ entity type GaussianProcessRegressionTechnique mixes MLTechnique schema name 'GP
     // the kernel object
     @ML(hyperParameter=true)
     kernel: SklearnGPRKernel
+    // center target data before fitting
+    @ML(hyperParameter=true)
+    centerTarget: boolean=false
+    // leave fraction of rows for post-validation
+    @ML(hyperParameter=true)
+    validation: boolean=false
+    // random seed to be used by numpy.shuffle
+    @ML(hyperParameter=true)
+    randomSeed: integer=42
+    // fraction to be left for validation
+    @ML(hyperParameter=true)
+    splitFraction: float=0.2
 }
diff --git a/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js b/training/gordon-group/src/Utils/AOD/AODGPRModelFinder.js
@@ -23,7 +23,8 @@ function getPipe(excFeats, gstpId, targetName, technique) {
     }).objs.map(obj => obj.id);
 
     // find the techniques
-    filter = Filter.intersects("kernel.id", kernelIds);
+    filter = Filter.intersects("kernel.id", kernelIds)
+        .and().eq("centerTarget", technique.centerTarget);
     var techIds = GaussianProcessRegressionTechnique.fetch({
         "filter": filter.value,
         "limit": -1,

diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.c3typ
@@ -0,0 +1,15 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+* This type hosts data for models that cannot obtain Features 
+* directly from other entity types.
+*/
+entity type StagedFeatures schema name "STGD_FTRS" {
+    // the features to be staged, as a map from feature name to value
+    features: map<string, double> schema suffix "FTR"
+    // stage the features of every model in a list of AOD GPR model ids
+    @py(env='gordon_1_0_0')
+    stageFromAODGPRModelIdsList: function(ids: !any): integer
+}
diff --git a/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py b/training/gordon-group/src/Utils/DataStaging/StagedFeatures.py
@@ -0,0 +1,46 @@
+def stageFromAODGPRModelIdsList(ids):
+    """
+    Given a list of GaussianProcessRegressionPipes trained with
+    AOD data, stage the features for each model.
+
+    Input:
+        ids: list of model ids
+
+    Return:
+        int: zero if it worked, 1 if it failed
+    """
+    import pandas as pd
+    from datetime import timedelta 
+
+    c3.StagedFeatures.removeAll()
+
+    df = pd.DataFrame()
+    for model_id in ids:
+        model = c3.GaussianProcessRegressionPipe.get(model_id)
+        data_source_spec = c3.GPRDataSourceSpec.get(model.dataSourceSpec.id, "targetSpec")
+        gstp_id = data_source_spec.targetSpec.filter.split(" == ")[1].replace('"', '')
+        gstp = c3.GeoSurfaceTimePoint.get(gstp_id)
+        pdf = c3.Dataset.toPandas(model.getFeatures())
+        pdf["latitude"] = gstp.latitude
+        pdf["longitude"] = gstp.longitude
+        my_time = gstp.time.timetuple()
+        pdf["time"] = timedelta(
+            days=my_time.tm_yday,
+            minutes=my_time.tm_min,
+            hours=my_time.tm_hour
+        ).total_seconds() / 3600
+        df = pd.concat([df,pdf], ignore_index=True)
+
+    def row_to_dict(row):
+        d = {}
+        for col in row.index:
+            d[col] = row[col]
+        return d
+
+    df_final = pd.DataFrame()
+    df_final["features"] = df.apply(row_to_dict, axis=1)
+    df_final["id"] = df_final.index
+    output_records = df_final.to_dict(orient="records")
+    c3.StagedFeatures.upsertBatch(objs=output_records)
+
+    return 0
diff --git a/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ b/training/gordon-group/src/Utils/DataStaging/StagedTargets.c3typ
@@ -0,0 +1,15 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+* This type hosts data for models that cannot obtain Targets
+* directly from other entity types.
+*/
+entity type StagedTargets schema name "STGD_TRGTS" {
+    // the staged targets, as a map from target name to value
+    targets: map<string, double> schema suffix "TRGT"
+    // stage the targets of every model in a list of AOD GPR model ids
+    @py(env='gordon_1_0_0')
+    stageFromAODGPRModelIdsList: function(ids: !any): integer
+}