c3aidti · dti-devops · Nov 29, 2022 · Jan 23, 2023 · Feb 7, 2023 · Mar 22, 2023
diff --git a/nonAppFiles/smokePPE/smoke_PPE_Design_original_scale_w_bc_ri_index.csv b/nonAppFiles/smokePPE/smoke_PPE_Design_original_scale_w_bc_ri_index.csv
diff --git a/nonAppFiles/smokePPE/smoke_PPE_Unit_Design.csv b/nonAppFiles/smokePPE/smoke_PPE_Unit_Design.csv
diff --git a/smoke/smokeApp/seed/ActionRuntime/py-gordon_1_0_0.json b/smoke/smokeApp/seed/ActionRuntime/py-gordon_1_0_0.json
@@ -10,7 +10,8 @@
       "conda.basemap":"=1.2.2",
       "conda.nbformat":"=5.1.3",
       "conda.iris":"=3.1.0",
-      "conda.fsspec":""
+      "conda.fsspec":"",
+      "conda.scikit-learn":"=0.23.1"
     },
     "repositories": [
       "https://repo.continuum.io/pkgs/main",

diff --git a/.../seed/SourceSmokePPESimulationModelParametersMap/SmokePPESimulationModelParametersMap.csv b/.../seed/SourceSmokePPESimulationModelParametersMap/SmokePPESimulationModelParametersMap.csv
diff --git a/...mokeApp/src/batchJobs/smokePPEGaussianML/SmokePPECoarseGrainedGaussianMLTrainingJob.c3typ b/...mokeApp/src/batchJobs/smokePPEGaussianML/SmokePPECoarseGrainedGaussianMLTrainingJob.c3typ
@@ -0,0 +1,12 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+* Training job for AOD data {@link Simulation3HourlyAODOutput}
+* and {@link GaussianProcessRegressionPipe}s with coarse graining
+*/
+type SmokePPECoarseGrainedGaussianMLTrainingJob extends BatchJob<SmokePPECoarseGrainedGaussianMLTrainingJob, SmokePPECoarseGrainedGaussianMLTrainingJobOptions, SmokePPECoarseGrainedGaussianMLTrainingJobBatch> type key 'SMKPPECRSMLJB' {
+    doStart:        ~ js server
+    processBatch:   ~ js server
+}
diff --git a/...ussianML/smokePPEGaussianMLTrainingJob.js → ...ePPECoarseGrainedGaussianMLTrainingJob.js b/...ussianML/smokePPEGaussianMLTrainingJob.js → ...ePPECoarseGrainedGaussianMLTrainingJob.js
@@ -3,24 +3,27 @@
 * All rights reserved. License: https://github.com/c3aidti/.github
 **/
 /**
- * Implementation of AODGaussianMLTrainingJob
- * @param {AODGaussianMLTrainingJob} job
- * @param {AODGaussianMLTrainingJobOptions} options
+ * Implementation of SmokePPECoarseGrainedGaussianMLTrainingJob
+ * @param {SmokePPECoarseGrainedGaussianMLTrainingJob} job
+ * @param {SmokePPECoarseGrainedGaussianMLTrainingJobOptions} options
  */
 function doStart(job, options) {
     job.setHardwareProfile(options.hardwareProfileId);
     var batch = [];
 
+
+    var gstpFilter = Filter.ge("latitude", options.minLat).and().lt("latitude", options.maxLat).and().ge("longitude", options.minLon).and().lt("longitude", options.maxLon).and().ge("time", options.minTime).and().lt("time", options.maxTime);
+
     var gstps = GeoSurfaceTimePoint.fetchObjStream({
-        filter: options.gstpFilter,
+        filter: gstpFilter,
         limit: -1
     });
 
     while(gstps.hasNext()) {
         batch.push(gstps.next());
 
         if (batch.length >= options.batchSize || !gstps.hasNext()) {
-            var batchSpec = AODGaussianMLTrainingJobBatch.make({values: batch});
+            var batchSpec = SmokePPECoarseGrainedGaussianMLTrainingJobBatch.make({values: batch}); // batch type declared by this job
             job.scheduleBatch(batchSpec);
 
             batch = [];
@@ -31,9 +34,9 @@ function doStart(job, options) {
 
 /**
  * Implementation of what to do in each batch
- * @param {AODGaussianMLTrainingJobBatch} batch
- * @param {AODGaussianMLTrainingJob} job
- * @param {AODGaussianMLTrainingJobOptions} options
+ * @param {SmokePPECoarseGrainedGaussianMLTrainingJobBatch} batch
+ * @param {SmokePPECoarseGrainedGaussianMLTrainingJob} job
+ * @param {SmokePPECoarseGrainedGaussianMLTrainingJobOptions} options
  */
 function processBatch(batch, job, options) {
     batch.values.forEach(function(gstp) {
@@ -105,9 +108,10 @@ function processBatch(batch, job, options) {
         var X = GPR_pipe.getFeatures();
         var y = GPR_pipe.getTarget();
 
-        // train and save
-        var GPR_pipe_trained = GPR_pipe.train(X, y);
-        GPR_pipe_trained.upsert();
-
+        if (X.size() > 0 && y.size() > 0) {
+            // train and save
+            var GPR_pipe_trained = GPR_pipe.train(X, y);
+            GPR_pipe_trained.upsert();
+        };
     });
-}
+}
diff --git a/...e/smokeApp/src/batchJobs/smokePPEGaussianML/SmokePPECoarseGrainedGaussianMLTrainingJob.py b/...e/smokeApp/src/batchJobs/smokePPEGaussianML/SmokePPECoarseGrainedGaussianMLTrainingJob.py
@@ -0,0 +1,51 @@
+def doStart(self, job, options):
+    """Schedule training batches of targets grouped by time stamp and lat-lon box."""
+    import math
+    import pandas as pd
+    # set hardware profile
+    job.setHardwareProfile(options.hardwareProfileId)
+
+    # grab all gstps
+    gstpFilter = c3.Filter().ge("latitude", options.minLat).and_().lt("latitude", options.maxLat).and_().ge("longitude", options.minLon).and_().lt("longitude", options.maxLon).and_().ge("time", options.minTime).and_().lt("time", options.maxTime)
+
+    allGstps = c3.GeoSurfaceTimePoint.fetch({
+        "filter": gstpFilter,
+        "limit": -1
+    }).toPandas()
+
+    # range() needs ints; the tolerance absorbs float error in the division
+    n_lat_steps = int(math.ceil((options.maxLat - options.minLat) / options.latStep - 1e-9))
+    n_lon_steps = int(math.ceil((options.maxLon - options.minLon) / options.lonStep - 1e-9))
+
+    batch = []
+    # loop over each unique time stamp
+    for time in allGstps["time"].unique():
+        # get all lat-lon points for that time stamp
+        gstpsForTime = allGstps[allGstps["time"] == time]
+        lats = gstpsForTime["latitude"]
+        lons = gstpsForTime["longitude"]
+        for i in range(n_lat_steps):
+            lat_down = options.minLat + i * options.latStep
+            lat_up = lat_down + options.latStep
+            for j in range(n_lon_steps):
+                lon_left = options.minLon + j * options.lonStep
+                lon_right = lon_left + options.lonStep
+                # element-wise mask: `and` on a pandas Series raises ValueError
+                inBox = (lats >= lat_down) & (lats < lat_up) & (lons >= lon_left) & (lons < lon_right)
+                # collect the targets of every gstp in that lat-lon box
+                targets = []
+                for gstpId in gstpsForTime.loc[inBox, "id"]:
+                    targets.append(c3.SmokePPESimulationOutput.fetch({
+                        "filter": c3.Filter().eq("geoSurfaceTimePoint.id", gstpId),
+                        "limit": -1
+                    }))
+                # TODO(review): average over list of targets; batch type declares values as [GeoSurfaceTimePoint]
+                batch.append(targets)
+                if len(batch) >= options.batchSize:
+                    batchSpec = c3.SmokePPECoarseGrainedGaussianMLTrainingJobBatch.make({"values": batch})
+                    job.scheduleBatch(batchSpec)
+                    batch = []
+
+    # schedule the last, partially filled batch
+    if batch:
+        job.scheduleBatch(c3.SmokePPECoarseGrainedGaussianMLTrainingJobBatch.make({"values": batch}))
diff --git a/...pp/src/batchJobs/smokePPEGaussianML/SmokePPECoarseGrainedGaussianMLTrainingJobBatch.c3typ b/...pp/src/batchJobs/smokePPEGaussianML/SmokePPECoarseGrainedGaussianMLTrainingJobBatch.c3typ
@@ -0,0 +1,11 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+* Represents a unit of work (batch) in a {@link SmokePPECoarseGrainedGaussianMLTrainingJob}
+*/
+type SmokePPECoarseGrainedGaussianMLTrainingJobBatch {
+    // The {@link GeoSurfaceTimePoint}s that define the targets for the models in this batch
+    values: [GeoSurfaceTimePoint]
+}
diff --git a/.../src/batchJobs/smokePPEGaussianML/SmokePPECoarseGrainedGaussianMLTrainingJobOptions.c3typ b/.../src/batchJobs/smokePPEGaussianML/SmokePPECoarseGrainedGaussianMLTrainingJobOptions.c3typ
@@ -0,0 +1,37 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+* Represents customization options for {@link SmokePPECoarseGrainedGaussianMLTrainingJob}
+*/
+type SmokePPECoarseGrainedGaussianMLTrainingJobOptions {
+    // How many models will be trained in each batch
+    batchSize: int = 10
+    // the min latitude
+    minLat: !float
+    // the max latitude
+    maxLat: !float
+    // the min longitude
+    minLon: !float
+    // the max longitude
+    maxLon: !float
+    // the latitude step
+    latStep: !float
+    // the longitude step
+    lonStep: !float
+    // the min time (inclusive), read by doStart as options.minTime
+    minTime: !datetime
+    // the max time (exclusive), read by doStart as options.maxTime
+    maxTime: !datetime
+    // the name of the variable to collect from {@link Simulation3HourlyAODData}
+    targetName: !string
+    // the features to exclude in {@link SimulationModelParameters}
+    excludeFeatures: [string]
+    // the {@link GaussianProcessRegressionTechnique} to train the models
+    gprTechnique: !GaussianProcessRegressionTechnique
+    // hardware profile ID to run the batches
+    hardwareProfileId: string = "appc8m642-w"
+    // flag for staged GSTPs training ({@link StagedGSTP})
+    stagedGSTP: boolean = false
+}
diff --git a/...ianML/smokePPEGaussianMLTrainingJob.c3typ → ...ianML/SmokePPEGaussianMLTrainingJob.c3typ b/...ianML/smokePPEGaussianMLTrainingJob.c3typ → ...ianML/SmokePPEGaussianMLTrainingJob.c3typ
@@ -6,7 +6,7 @@
 * Training job for AOD data {@link Simulation3HourlyAODOutput} 
 * and {@link GaussianProcessRegressionPipe}s
 */
-type AODGaussianMLTrainingJob extends BatchJob<AODGaussianMLTrainingJob, AODGaussianMLTrainingJobOptions, AODGaussianMLTrainingJobBatch> type key 'AODGMLJB' {
+type SmokePPEGaussianMLTrainingJob extends BatchJob<SmokePPEGaussianMLTrainingJob, SmokePPEGaussianMLTrainingJobOptions, SmokePPEGaussianMLTrainingJobBatch> type key 'SMKPPEMLJB' {
     doStart:        ~ js server
     processBatch:   ~ js server
-}
+}
diff --git a/smoke/smokeApp/src/batchJobs/smokePPEGaussianML/SmokePPEGaussianMLTrainingJob.js b/smoke/smokeApp/src/batchJobs/smokePPEGaussianML/SmokePPEGaussianMLTrainingJob.js
@@ -0,0 +1,132 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+ * Implementation of SmokePPEGaussianMLTrainingJob: schedules GeoSurfaceTimePoint batches
+ * @param {SmokePPEGaussianMLTrainingJob} job
+ * @param {SmokePPEGaussianMLTrainingJobOptions} options
+ */
+function doStart(job, options) {
+    job.setHardwareProfile(options.hardwareProfileId);
+    var batch = [];
+    // options.stagedGSTP selects the source of points: StagedGSTP rows or options.gstpFilter
+    if (options.stagedGSTP) {
+        var staged_gstps = StagedGSTP.fetchObjStream({
+            limit: -1
+        });
+
+        while(staged_gstps.hasNext()) {
+            var gstp = GeoSurfaceTimePoint.get(staged_gstps.next().geoSurfaceTimePoint.id);
+            batch.push(gstp);
+            // test the staged stream itself: `gstps` is only assigned in the else branch
+            if (batch.length >= options.batchSize || !staged_gstps.hasNext()) {
+                var batchSpec = SmokePPEGaussianMLTrainingJobBatch.make({values: batch});
+                job.scheduleBatch(batchSpec);
+
+                batch = [];
+            }
+        }
+    } else {
+        var gstps = GeoSurfaceTimePoint.fetchObjStream({
+            filter: options.gstpFilter,
+            limit: -1
+        });
+
+        while(gstps.hasNext()) {
+            batch.push(gstps.next());
+            // schedule when the batch is full or the stream has no more points
+            if (batch.length >= options.batchSize || !gstps.hasNext()) {
+                var batchSpec = SmokePPEGaussianMLTrainingJobBatch.make({values: batch});
+                job.scheduleBatch(batchSpec);
+
+                batch = [];
+            }
+        }
+    }
+}
+
+
+/**
+ * Trains and upserts one GaussianProcessRegressionPipe per GeoSurfaceTimePoint in the batch, skipping points without data
+ * @param {SmokePPEGaussianMLTrainingJobBatch} batch - its `values` are the points to train on
+ * @param {SmokePPEGaussianMLTrainingJob} job - not read by this function
+ * @param {SmokePPEGaussianMLTrainingJobOptions} options - supplies excludeFeatures, targetName and gprTechnique
+ */
+function processBatch(batch, job, options) {
+    batch.values.forEach(function(gstp) {
+
+        // define target: SmokePPESimulationOutput rows filtered by this point's id
+        var targetType = TypeRef.make({"typeName": "SmokePPESimulationOutput"});
+        var targetFilter = Filter.eq("geoSurfaceTimePoint.id", gstp.id);
+        var targetSpec = FetchSpec.make({
+            "limit": -1,
+            "order": "simulationSample.id",
+            "filter": targetFilter.toString()
+        });
+
+        // find the simulations: same filter and order, but including simulationSample to read its id
+        var simulationsSpec = FetchSpec.make({
+            "limit": -1,
+            "order": "simulationSample.id",
+            "filter": targetFilter.toString(),
+            "include": "simulationSample"
+        });
+        var samples = targetType.toType().fetch(simulationsSpec).objs;
+        var simIds = [];
+        for(var i = 0; i < samples.length; i++) {
+            simIds.push(samples[i].simulationSample.id);
+        }
+        // collect every SmokePPESimulationModelParameters id; those absent from simIds go to excludeIds
+        var featuresType = TypeRef.make({"typeName": "SmokePPESimulationModelParameters"});
+        var allSamples = featuresType.toType().fetch({
+            "limit": -1,
+            "order": "id",
+            "include": "id"
+        }).objs;
+        var allSimIds = [];
+        for(var i = 0; i < allSamples.length; i++) {
+            allSimIds.push(allSamples[i].id);
+        };
+        var excludeIds = [];
+        for(var i = 0; i < allSimIds.length; i++) {
+            if(simIds.indexOf(allSimIds[i]) === -1) {
+                excludeIds.push(allSimIds[i]);
+            }
+        };
+
+        // define the features: the filter is NOT intersects("id", excludeIds)
+        var featuresFilter = Filter.not().intersects("id", excludeIds);
+        var featuresSpec = FetchSpec.make({
+            "limit": -1,
+            "order": "id",
+            "filter": featuresFilter
+        });
+
+        // define the data source spec and persist it
+        var sourceSpec = GPRDataSourceSpec.make({
+            "featuresType": featuresType,
+            "featuresSpec": featuresSpec,
+            "excludeFeatures": options.excludeFeatures,
+            "targetType": targetType,
+            "targetSpec": targetSpec,
+            "targetName": options.targetName
+        }).upsert()
+
+        // create the pipe
+        var GPR_pipe = GaussianProcessRegressionPipe.make({
+            "technique": options.gprTechnique,
+            "dataSourceSpec": sourceSpec
+        })
+
+        // get target and features
+        var X = GPR_pipe.getFeatures();
+        var y = GPR_pipe.getTarget();
+
+        if (X.size() > 0 && y.size() > 0) {
+            // train and save; nothing is trained when either side is empty
+            var GPR_pipe_trained = GPR_pipe.train(X, y);
+            GPR_pipe_trained.upsert();
+        };
+    });
+}
diff --git a/.../smokePPEGaussianMLTrainingJobBatch.c3typ → .../SmokePPEGaussianMLTrainingJobBatch.c3typ b/.../smokePPEGaussianMLTrainingJobBatch.c3typ → .../SmokePPEGaussianMLTrainingJobBatch.c3typ
@@ -3,9 +3,9 @@
 * All rights reserved. License: https://github.com/c3aidti/.github
 **/
 /**
-* Represents a unit of work (batch) in a {@link AODGaussianMLTraningJob}
+* Represents a unit of work (batch) in a {@link SmokePPEGaussianMLTrainingJob}
 */
-type AODGaussianMLTrainingJobBatch {
+type SmokePPEGaussianMLTrainingJobBatch {
     // The {@link GeoSurfaceTimePoint}s that define the targets for the models in this batch
     values: [GeoSurfaceTimePoint]
-}
+}
diff --git a/...mokePPEGaussianMLTrainingJobOptions.c3typ → ...mokePPEGaussianMLTrainingJobOptions.c3typ b/...mokePPEGaussianMLTrainingJobOptions.c3typ → ...mokePPEGaussianMLTrainingJobOptions.c3typ
@@ -3,9 +3,9 @@
 * All rights reserved. License: https://github.com/c3aidti/.github
 **/
 /**
-* Represents customization options for {@link AODGaussianMLTrainingJob}
+* Represents customization options for {@link SmokePPEGaussianMLTrainingJob}
 */
-type AODGaussianMLTrainingJobOptions {
+type SmokePPEGaussianMLTrainingJobOptions {
     // How many models will be trained in each batch
     batchSize: int = 10
     // {@link GeoSurfaceTimePoint}s filter that defines the total number of models
@@ -18,4 +18,6 @@ type AODGaussianMLTrainingJobOptions {
     gprTechnique: !GaussianProcessRegressionTechnique
     // hardware profile ID to run the batches
     hardwareProfileId: string = "appc8m642-w"
-}
+    // flag for staged GSTPs training ({@link StagedGSTP})
+    stagedGSTP: boolean = false
+}
diff --git a/smoke/smokeApp/src/entity/coordinates/StagedGSTP.c3typ b/smoke/smokeApp/src/entity/coordinates/StagedGSTP.c3typ
@@ -0,0 +1,18 @@
+/**
+* Copyright (c) 2022, C3 AI DTI, Development Operations Team
+* All rights reserved. License: https://github.com/c3aidti/.github
+**/
+/**
+* Staged {@link GeoSurfaceTimePoint} for faster processing by ML pipes.
+*/
+@db(unique=['geoSurfaceTimePoint'])
+entity type StagedGSTP schema name 'STGD_GSTP' {
+    // the {@link GeoSurfaceTimePoint}
+    geoSurfaceTimePoint: !GeoSurfaceTimePoint
+    // stage based on a region filter
+    @py(env='gordon-ML_1_0_0')
+    directStage: function(gstpFilter: any): int
+    // unstage based on a region filter
+    @py(env='gordon-ML_1_0_0')
+    unstage: function(gstpFilter: any): int
+}