cytomining · gwaybio · Jun 7, 2019 · Jun 7, 2019 · Jun 7, 2019 · Jun 7, 2019
@@ -1,116 +1,85 @@
-"""Command-line interface for DeepProfiler.
-
-Four subcommands are available, intended to be run in order:
-
-1. ``setup``   — create the project directory structure under ``--root``.
-2. ``prepare`` — compute per-plate illumination statistics and compress images
-                 to 8-bit PNG (optional but recommended for large datasets).
-3. ``profile`` — extract per-cell deep learning features using the Cell
-                 Painting CNN v1 checkpoint and write ``.npz`` files.
-4. ``split``   — split the metadata index into N parts for parallel profiling
-                 across multiple machines or jobs.
-
-Typical usage::
-
-    deepprofiler --root=/data/project --config=config.json --exp=run1 profile
-
-See README.md and the DeepProfiler Handbook for full configuration details.
-"""
-
-import copy
 import json
 import os
 
 import click
 
 import deepprofiler.dataset.compression
-import deepprofiler.dataset.illumination_statistics
-import deepprofiler.dataset.image_dataset
 import deepprofiler.dataset.indexing
+import deepprofiler.dataset.illumination_statistics
 import deepprofiler.dataset.metadata
 import deepprofiler.dataset.utils
-import deepprofiler.profiling
+import deepprofiler.dataset.image_dataset
+import deepprofiler.learning.training
+import deepprofiler.learning.profiling
+import deepprofiler.learning.optimization
+import deepprofiler.download.normalize_bbbc021_metadata
 
 
 # Main interaction point
 @click.group()
 @click.option("--root", prompt="Root directory for DeepProfiler experiment",
               help="Root directory for DeepProfiler experiment",
-              type=click.Path(exists=True))
+              type=click.Path("r"))
 @click.option("--config", default=None,
-              help="Path to existing config file (filename in project_root/inputs/config/)",
-              type=click.STRING)
+              help="Path to existing config file",
+              type=click.Path("r"))
 @click.option("--cores", default=0,
-              help="Number of CPU cores for parallel processing (all=0) for prepare command",
+              help="Number of CPU cores for parallel processing (all=0)",
               type=click.INT)
-@click.option("--gpu", default="0",
-              help="GPU device id (the id can be checked with nvidia-smi)",
-              type=click.STRING)
-@click.option("--exp", default="results",
-              help="Name of experiment, this folder will be created in project_root/outputs/",
-              type=click.STRING)
-@click.option("--metadata", default='index.csv',
-              help="Metadata index filename in project_root/inputs/metadata/",
-              type=click.STRING)
 @click.pass_context
-def cli(context, root, config, exp, cores, gpu, metadata):
-    """Configure paths and load the experiment config, then dispatch to a subcommand."""
+def cli(context, root, config, cores):
     dirs = {
         "root": root,
-        "locations": root + "/inputs/locations/",  # TODO: use os.path.join()
-        "config": root + "/inputs/config/",
-        "images": root + "/inputs/images/",
-        "metadata": root + "/inputs/metadata/",
-        "intensities": root + "/outputs/intensities/",
-        "compressed_images": root + "/outputs/compressed/images/",
-        "results": root + "/outputs/" + exp + "/",
-        "checkpoints": root + "/outputs/" + exp + "/checkpoint/",
-        "logs": root + "/outputs/" + exp + "/logs/",
-        "summaries": root + "/outputs/" + exp + "/summaries/",
-        "features": root + "/outputs/" + exp + "/features/"
+        "locations": os.path.join(root, "inputs", "locations"),
+        "config": os.path.join(root, "inputs", "config"),
+        "images": os.path.join(root, "inputs", "images"),
+        "metadata": os.path.join(root, "inputs", "metadata"),
+        "preprocessed": os.path.join(root, "inputs", "preprocessed"),
+        "pretrained": os.path.join(root, "inputs", "pretrained"),
+        "intensities": os.path.join(root, "outputs", "intensities"),
+        "compressed_images": os.path.join(root, "outputs", "compressed", "images"),
+        "compressed_metadata": os.path.join(root, "outputs", "compressed", "metadata"),
+        "training": os.path.join(root, "outputs", "training"),
+        "checkpoints": os.path.join(root, "outputs", "training", "checkpoint"),
+        "logs": os.path.join(root, "outputs", "training", "logs"),
+        "summaries": os.path.join(root, "outputs", "training", "summaries"),
+        "features": os.path.join(root, "outputs", "features")
     }
-    if context.invoked_subcommand == 'setup':
-        context.obj["dirs"] = dirs
-        return 
+    if config is not None:
+
+        context.obj["config"] = {}
+        context.obj["config"]["paths"] = {}
+        context.obj["config"]["paths"]["config"] = config
+        dirs["config"] = os.path.dirname(os.path.abspath(config))
+    else:
+        config = os.path.join(dirs["config"], "config.json")
 
-    config = dirs["config"] + "/" + config
     context.obj["cores"] = cores
-    context.obj["gpu"] = gpu
-    os.environ["CUDA_VISIBLE_DEVICES"] = gpu
-    # Load configuration file
-    if config is not None and os.path.isfile(config):
+
+    if os.path.isfile(config):
         with open(config, "r") as f:
             params = json.load(f)
-
-        # Override paths defined by user
         if "paths" in params.keys():
             for key, value in dirs.items():
                 if key not in params["paths"].keys():
-                    params["paths"][key] = dirs[key]
+                    params["paths"][key] = os.path.join(root, dirs[key])
                 else:
-                    dirs[key] = params["paths"][key]
-        else:
-            params["paths"] = copy.deepcopy(dirs)
+                    dirs[key] = os.path.join(root, params["paths"][key])
 
-        if os.path.isdir(dirs["root"]):
-            for k in ["results", "checkpoints", "logs", "summaries", "features"]:
-                os.makedirs(dirs[k], exist_ok=True)
+        else:
+            params["paths"] = dirs
 
-        # Update references
-        params["experiment_name"] = exp
-        params["paths"]["index"] = params["paths"]["metadata"] + metadata
+        params["paths"]["index"] = os.path.join(root, params["paths"]["metadata"], "index.csv")
         context.obj["config"] = params
-    else:
-        raise Exception("Config does not exists; make sure that the file exists in /inputs/config/")
-
+        process = deepprofiler.dataset.utils.Parallel(context.obj["config"], numProcs=context.obj["cores"])
+        context.obj["process"] = process
     context.obj["dirs"] = dirs
 
 
 # Optional tool: Create the support file and folder structure in a root directory
-@cli.command(help='initialize folder structure of DeepProfiler project')
+@cli.command()
 @click.pass_context
 def setup(context):
-    """Create the project directory tree under the configured root."""
     for path in context.obj["dirs"].values():
         if not os.path.isdir(path):
             print("Creating directory: ", path)
@@ -121,49 +90,85 @@ def setup(context):
     context.obj["config"]["paths"] = context.obj["dirs"]
 
 
+# Optional tool: Download and prepare the BBBC021 dataset
+@cli.command()
+@click.pass_context
+def download_bbbc021(context):
+    context.invoke(setup)
+    deepprofiler.download.normalize_bbbc021_metadata.normalize_bbbc021_metadata(context)
+    print("BBBC021 download and preparation complete!")
+
+
 # First tool: Compute illumination statistics and compress images
-@cli.command(help='Run illumination correction and compression')
+@cli.command()
 @click.pass_context
 def prepare(context):
-    """Compute per-plate illumination statistics and compress images to 8-bit PNG."""
     metadata = deepprofiler.dataset.metadata.read_plates(context.obj["config"]["paths"]["index"])
-    process = deepprofiler.dataset.utils.Parallel(context.obj["config"], numProcs=context.obj["cores"])
+    process = context.obj["process"]
     process.compute(deepprofiler.dataset.illumination_statistics.calculate_statistics, metadata)
     print("Illumination complete!")
-    metadata = deepprofiler.dataset.metadata.read_plates(
-        context.obj["config"]["paths"]["index"])  # reinitialize generator
+    metadata = deepprofiler.dataset.metadata.read_plates(context.obj["config"]["paths"]["index"])  # reinitialize generator
     process.compute(deepprofiler.dataset.compression.compress_plate, metadata)
+    deepprofiler.dataset.indexing.write_compression_index(context.obj["config"])
+    context.parent.obj["config"]["paths"]["index"] = os.path.join(context.obj["config"]["paths"]["compressed_metadata"], "compressed.csv")
     print("Compression complete!")
 
 
-# Second tool: Profile cells and extract features
-@cli.command(help='run feature extraction')
+# Optional learning tool: Optimize the hyperparameters of a model
+@cli.command()
+@click.option("--epoch", default=1)
+@click.option("--seed", default=None)
+@click.pass_context
+def optimize(context, epoch, seed):
+    if context.parent.obj["config"]["prepare"]["compression"]["implement"]:
+        context.parent.obj["config"]["paths"]["index"] = os.path.join(context.obj["config"]["paths"]["compressed_metadata"], "compressed.csv")
+        context.parent.obj["config"]["paths"]["images"] = context.obj["config"]["paths"]["compressed_images"]
+    metadata = deepprofiler.dataset.image_dataset.read_dataset(context.obj["config"])
+    optim = deepprofiler.learning.optimization.Optimize(context.obj["config"], metadata, epoch, seed)
+    optim.optimize()
+
+
+# Second tool: Train a network
+@cli.command()
+@click.option("--epoch", default=1)
+@click.option("--seed", default=None)
+@click.pass_context
+def train(context, epoch, seed):
+    if context.parent.obj["config"]["prepare"]["compression"]["implement"]:
+        context.parent.obj["config"]["paths"]["index"] = os.path.join(context.obj["config"]["paths"]["compressed_metadata"], "compressed.csv")
+        context.parent.obj["config"]["paths"]["images"] = context.obj["config"]["paths"]["compressed_images"]
+    metadata = deepprofiler.dataset.image_dataset.read_dataset(context.obj["config"])
+    deepprofiler.learning.training.learn_model(context.obj["config"], metadata, epoch, seed)
+
+
+# Third tool: Profile cells and extract features
+@cli.command()
 @click.pass_context
 @click.option("--part",
               help="Part of index to process",
               default=-1,
               type=click.INT)
 def profile(context, part):
-    """Extract per-cell deep learning features and write .npz files."""
     if context.parent.obj["config"]["prepare"]["compression"]["implement"]:
+        context.parent.obj["config"]["paths"]["index"] = os.path.join(context.obj["config"]["paths"]["compressed_metadata"], "compressed.csv")
         context.parent.obj["config"]["paths"]["images"] = context.obj["config"]["paths"]["compressed_images"]
     config = context.obj["config"]
     if part >= 0:
         partfile = "index-{0:03d}.csv".format(part)
         config["paths"]["index"] = context.obj["config"]["paths"]["index"].replace("index.csv", partfile)
-    dset = deepprofiler.dataset.image_dataset.read_dataset(context.obj["config"], mode='profile')
-    deepprofiler.profiling.profile(context.obj["config"], dset)
+    metadata = deepprofiler.dataset.image_dataset.read_dataset(context.obj["config"])
+    deepprofiler.learning.profiling.profile(context.obj["config"], metadata)
 
 
 # Auxiliary tool: Split index in multiple parts
-@cli.command(help='split metadata into multiple parts')
+@cli.command()
 @click.pass_context
 @click.option("--parts",
               help="Number of parts to split the index",
               type=click.INT)
 def split(context, parts):
-    """Split the metadata index into N parts for parallel profiling jobs."""
     if context.parent.obj["config"]["prepare"]["compression"]["implement"]:
+        context.parent.obj["config"]["paths"]["index"] = os.path.join(context.obj["config"]["paths"]["compressed_metadata"], "compressed.csv")
         context.parent.obj["config"]["paths"]["images"] = context.obj["config"]["paths"]["compressed_images"]
     deepprofiler.dataset.indexing.split_index(context.obj["config"], parts)