diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json
index 2095fd1b4..619f20cc8 100644
--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -13,5 +13,10 @@
"ghcr.io/devcontainers/features/docker-in-docker:2": {},
// For building singularity
"ghcr.io/devcontainers/features/go:1": {}
+ },
+ "hostRequirements": {
+ "cpus": 4,
+ "memory": "16gb",
+ "storage": "32gb"
}
}
diff --git a/_typos.toml b/_typos.toml
index 62ded764a..8314bff45 100644
--- a/_typos.toml
+++ b/_typos.toml
@@ -2,3 +2,6 @@
# Ignore data files
extend-glob = ["*.txt"]
check-file = false
+
+[default.extend-words]
+STIP = "STIP"
\ No newline at end of file
diff --git a/docs/_static/config/beginner.yaml b/docs/_static/config/beginner.yaml
index 1ddda6dc2..97cb2c29f 100644
--- a/docs/_static/config/beginner.yaml
+++ b/docs/_static/config/beginner.yaml
@@ -23,13 +23,12 @@ containers:
algorithms:
- name: "pathlinker"
- params:
- include: true
+ include: true
+ runs:
run1:
k: 1
-
- # run2: # uncomment for step 3.2
- # k: [10, 100] # uncomment for step 3.2
+ # run2: # uncomment for step 3.2
+ # k: [10, 100] # uncomment for step 3.2
# Here we specify which pathways to run and other file location information.
# Assume that if a dataset label does not change, the lists of associated input files do not change
diff --git a/docs/_static/config/intermediate.yaml b/docs/_static/config/intermediate.yaml
index 58d1400d8..a57eb13bb 100644
--- a/docs/_static/config/intermediate.yaml
+++ b/docs/_static/config/intermediate.yaml
@@ -21,76 +21,46 @@ containers:
# then myAlg will be run on (a=1,b=0.5),(a=1,b=0.75),(a=2,b=0.5), and (a=2,b=0,75). Pretty neat, but be
# careful: too many parameters might make your runs take a long time.
+# TODO: get rid of the algorithms that will take too long to run
algorithms:
- name: "pathlinker"
include: true
runs:
run1:
- k: 1
- run2:
- k: [10, 100]
- - name: omicsintegrator1
- include: true
- runs:
- run1:
- b: [0.55, 2, 10]
- d: 10
- g: 1e-3
- r: 0.01
- w: 0.1
- mu: 0.008
+ k: [1, 10, 100, 1000]
+
- name: omicsintegrator2
include: true
runs:
run1:
- b: 4
- g: 0
- run2:
- b: 2
- g: 3
- - name: meo
- include: true
- runs:
- run1:
- local_search: [true, false]
- max_path_length: [2, 3]
- rand_restarts: 10
- - name: allpairs
- include: true
- - name: domino
- include: true
- runs:
- run1:
- slice_threshold: 0.3
- module_threshold: 0.05
+ b: [4, 10]
+ g: [0, 3]
+ w: [0.25, 6]
+
- name: mincostflow
include: true
runs:
run1:
- capacity: 15
- flow: 80
- run2:
- capacity: 1
- flow: 6
- run3:
- capacity: 5
- flow: 60
+ capacity: [15, 30]
+ flow: [80, 15]
+
- name: "strwr"
include: true
runs:
run1:
- alpha: [0.85]
+ alpha: 0.85
threshold: [100, 200]
+
- name: "rwr"
include: true
runs:
run1:
- alpha: [0.85]
+ alpha: 0.85
threshold: [100, 200]
# Here we specify which pathways to run and other file location information.
# Assume that if a dataset label does not change, the lists of associated input files do not change
-datasets: # TODO update this based on the dataset that I set up
+datasets:
- # Labels can only contain letters, numbers, or underscores
label: egfr
node_files: ["tps-egfr-prizes.txt"] # the input nodes
@@ -100,6 +70,14 @@ datasets: # TODO update this based on the dataset that I set up
# Relative path from the spras directory where these files live
data_dir: "input"
+gold_standards:
+ - # Labels can only contain letters, numbers, or underscores
+ label: gs_egfr
+ node_files: ["gs-egfr.txt"]
+ data_dir: "input"
+ # List of dataset labels to compare with the specific gold standard dataset
+ dataset_labels: ["egfr"]
+
reconstruction_settings:
# Set where everything is saved
@@ -107,26 +85,26 @@ reconstruction_settings:
reconstruction_dir: "output/intermediate"
analysis:
+ summary:
+ include: false # set to true for step 4
+
# Machine learning analysis (e.g. clustering) of the pathway output files for each dataset
ml:
# ml analysis per dataset
include: false # set to true for step 3
+
# adds ml analysis per algorithm output
# only runs for algorithms with multiple parameter combinations chosen
- aggregate_per_algorithm: false
- # specify how many principal components to calculate
- components: 2
- # boolean to show the labels on the pca graph
- labels: true
- # 'ward', 'complete', 'average', 'single'
- # if linkage: ward, must use metric: euclidean
- linkage: 'ward'
- # 'euclidean', 'manhattan', 'cosine'
- metric: 'euclidean'
- # controls whether kernel density estimation (KDE) is computed and visualized on top of PCA plots.
- # the coordinates of the KDE maximum (kde_peak) are also saved to the PCA coordinates output file.
- # KDE needs to be run in order to select a parameter combination with PCA because the maximum kernel density is used
- # to pick the 'best' parameter combination.
- kde: false
+ aggregate_per_algorithm: false # set to true for step 4
+ # overlays a kernel density estimate (KDE) on top of the PCA plot
+ kde: false # set to true for step 4
# removes empty pathways from consideration in ml analysis (pca only)
- remove_empty_pathways: false
+ remove_empty_pathways: false # set to true for step 4
+
+ evaluation:
+ # evaluation per dataset-goldstandard pair
+ # evaluation will not run unless ml include is set to true
+ include: false # set to true for step 4
+ # adds evaluation per algorithm per dataset-goldstandard pair
+ # evaluation per algorithm will not run unless ml include and ml aggregate_per_algorithm are set to true
+ aggregate_per_algorithm: false # set to true for step 4
diff --git a/docs/_static/images/hac-horizontal.png b/docs/_static/images/hac-horizontal.png
index d67394daf..2acda68ef 100644
Binary files a/docs/_static/images/hac-horizontal.png and b/docs/_static/images/hac-horizontal.png differ
diff --git a/docs/_static/images/hac-vertical.png b/docs/_static/images/hac-vertical.png
index 284b2492c..eee2d611f 100644
Binary files a/docs/_static/images/hac-vertical.png and b/docs/_static/images/hac-vertical.png differ
diff --git a/docs/_static/images/jaccard-heatmap.png b/docs/_static/images/jaccard-heatmap.png
index f80e20a71..97769c3ff 100644
Binary files a/docs/_static/images/jaccard-heatmap.png and b/docs/_static/images/jaccard-heatmap.png differ
diff --git a/docs/_static/images/pca-kde.png b/docs/_static/images/pca-kde.png
index 771b6ee23..d98c12a0d 100644
Binary files a/docs/_static/images/pca-kde.png and b/docs/_static/images/pca-kde.png differ
diff --git a/docs/_static/images/pca.png b/docs/_static/images/pca.png
index 7b78df704..be4b2c64f 100644
Binary files a/docs/_static/images/pca.png and b/docs/_static/images/pca.png differ
diff --git a/docs/_static/images/pr-curve-ensemble-nodes-per-algorithm-nodes.png b/docs/_static/images/pr-curve-ensemble-nodes-per-algorithm-nodes.png
index 7e8dd20b0..013916672 100644
Binary files a/docs/_static/images/pr-curve-ensemble-nodes-per-algorithm-nodes.png and b/docs/_static/images/pr-curve-ensemble-nodes-per-algorithm-nodes.png differ
diff --git a/docs/_static/images/pr-pca-chosen-pathway-per-algorithm-nodes.png b/docs/_static/images/pr-pca-chosen-pathway-per-algorithm-nodes.png
index e5820cc12..269e21702 100644
Binary files a/docs/_static/images/pr-pca-chosen-pathway-per-algorithm-nodes.png and b/docs/_static/images/pr-pca-chosen-pathway-per-algorithm-nodes.png differ
diff --git a/docs/_static/images/pr-per-pathway-nodes.png b/docs/_static/images/pr-per-pathway-nodes.png
index ae9239f31..15a4e4f25 100644
Binary files a/docs/_static/images/pr-per-pathway-nodes.png and b/docs/_static/images/pr-per-pathway-nodes.png differ
diff --git a/docs/_static/images/upload-file-dev-container.png b/docs/_static/images/upload-file-dev-container.png
new file mode 100644
index 000000000..d820f6bc4
Binary files /dev/null and b/docs/_static/images/upload-file-dev-container.png differ
diff --git a/docs/tutorial/advanced.rst b/docs/tutorial/advanced.rst
index 368d9cb49..f85e22ff1 100644
--- a/docs/tutorial/advanced.rst
+++ b/docs/tutorial/advanced.rst
@@ -19,19 +19,37 @@ for getting parameter grids for any algorithms for a given dataset.
Grid Search
===========
-A grid search systematically checks different combinations of parameter
-values to see how each affects network reconstruction results.
+A grid search systematically runs different combinations of parameter
+values on a dataset to see how each affects network reconstruction
+results.
In SPRAS, users can define parameter grids for each algorithm directly
in the configuration file. When executed, SPRAS automatically runs each
algorithm across all parameter combinations and collects the resulting
subnetworks.
-SPRAS will also support parameter refinement using graph topological
-heuristics. These topological metrics help identify parameter regions
-that produce biologically plausible outputs networks. Based on these
-heuristics, SPRAS will generate new configuration files with refined
-parameter grids for each algorithm per dataset.
+SPRAS will also support automatically narrowing down a parameter grid
+for each algorithm on each dataset using a two-stage grid search.
+Instead of tuning to a gold standard or a single metric, the search uses
+graph topological heuristics (rules based on statistics like node count
+and edge count) to discard subnetworks that are biologically
+implausible. In the first stage, SPRAS runs each algorithm over a coarse
+grid: a small set of parameter values spread across a wide range with
+large gaps between them. Parameter combinations whose output subnetworks
+pass the heuristics are kept, and the rest are discarded. Because the
+underlying data differ from dataset to dataset, the set of passing
+combinations also differs.
+
+In the second stage, SPRAS refines the surviving combinations into a
+finer grid. For each passing combination, it varies one parameter at a
+time to sample values near the ones that worked. For example, if ``b =
+5``, ``d = 10``, ``w = 2`` passed, SPRAS also tries neighbors such as
+``w = 1`` and ``w = 3`` or ``d = 5`` and ``d = 15``. A neighbor is
+evaluated as long as at least one of its adjacent coarse-grid values
+passed, so the search can still explore just past the edge of the
+passing region. The same heuristics filter these neighbors, and the
+combinations that survive both stages form the final fine-tuned grid for
+that algorithm and dataset.
Users can further refine these grids by rerunning the updated
configuration and adjusting the parameter ranges around the newly
@@ -40,186 +58,8 @@ specific outputs for a given dataset.
.. note::
- Some grid search features are still under development and will be
- added in future SPRAS releases.
-
-Parameter selection
-===================
-
-Parameter selection refers to the process of determining which parameter
-combinations should be used for evaluation on a gold standard dataset.
-
-Parameter selection is handled in the evaluation code, which supports
-multiple parameter selection strategies. Once the grid space search is
-complete for each dataset, the user can enable evaluation (by setting
-evaluation ``include: true``) and it will run all of the parameter
-selection code.
-
-PCA-based parameter selection
------------------------------
-
-The PCA-based approach identifies a representative parameter setting for
-each pathway reconstruction algorithm on a given dataset. It selects the
-single parameter combination that best captures the central trend of an
-algorithm's reconstruction behavior.
-
-.. image:: ../_static/images/pca-kde.png
- :alt: Principal component analysis visualization across pathway outputs with a kernel density estimate computed on top
- :width: 600
- :align: center
-
-.. raw:: html
-
-
-
-For each algorithm, all reconstructed subnetworks are projected into an
-algorithm-specific 2D PCA space based on the set of edges produced by
-the respective parameter combinations for that algorithm. This
-projection summarizes how the algorithm's outputs vary across different
-parameter combinations, allowing patterns in the outputs to be
-visualized in a lower-dimensional space.
-
-Within each PCA space, a kernel density estimate (KDE) is computed over
-the projected points to identify regions of high density. The output
-closest to the highest KDE peak is selected as the most representative
-parameter setting, as it corresponds to the region where the algorithm
-most consistently produces similar subnetworks.
-
-Ensemble network-based parameter selection
-------------------------------------------
-
-The ensemble-based approach combines results from all parameter settings
-for each pathway reconstruction algorithm on a given dataset. Instead of
-focusing on a single "best" parameter combination, it summarizes the
-algorithm's overall reconstruction behavior across parameters.
-
-All reconstructed subnetworks are merged into algorithm-specific
-ensemble networks, where each edge weight reflects how frequently that
-interaction appears across the outputs. Edges that occur more often are
-assigned higher weights, highlighting interactions that are most
-consistently recovered by the algorithm.
-
-These consensus networks help identify the core patterns and overall
-stability of an algorithm's output's without needing to choose a single
-parameter setting (no clear optimal parameter combination could exists).
-
-Ground truth-based evaluation without parameter selection
----------------------------------------------------------
-
-The no parameter selection approach chooses all parameter combinations
-for each pathway reconstruction algorithm on a given dataset. This
-approach can be useful for idenitifying patterns in algorithm
-performance without favoring any specific parameter setting.
-
-************
- Evaluation
-************
-
-In some cases, users may have a gold standard file that allows them to
-evaluate the quality of the reconstructed subnetworks generated by
-pathway reconstruction algorithms.
-
-However, gold standards may not exist for certain types of experimental
-data where validated ground truth interactions or molecules are
-unavailable or incomplete. For example, in emerging research areas or
-poorly characterized biological systems, interactions may not yet be
-experimentally verified or fully known, making it difficult to define a
-reliable reference network for evaluation.
-
-Adding gold standard datasets and evaluation post analysis a configuration
-==========================================================================
-
-In the configuration file, users can specify one or more gold standard
-datasets to evaluate the subnetworks reconstructed from each dataset.
-When gold standards are provided and evaluation is enabled (``include:
-true``), SPRAS will automatically compare the reconstructed subnetworks
-for a specific dataset against the corresponding gold standards.
-
-.. code:: yaml
-
- gold_standards:
- -
- label: gs1
- node_files: ["gs_nodes0.txt", "gs_nodes1.txt"]
- data_dir: "input"
- dataset_labels: ["data0"]
- -
- label: gs2
- edge_files: ["gs_edges0.txt"]
- data_dir: "input"
- dataset_labels: ["data0", "data1"]
-
- analysis:
- evaluation:
- include: true
-
-A gold standard dataset must include the following types of keys and
-files:
-
-- ``label``: a name that uniquely identifies a gold standard dataset
- throughout the SPRAS workflow and outputs.
-- ``node_file`` or ``edge_file``: A list of node or edge files. Only
- one of these can be defined per gold standard dataset.
-- ``data_dir``: The file path of the directory where the input gold
- standard dataset files are located.
-- ``dataset_labels``: a list of dataset labels indicating which
- datasets this gold standard dataset should be evaluated against.
-
-When evaluation is enabled, SPRAS will automatically run its built-in
-evaluation analysis on each defined dataset-gold standard pair. This
-evaluation computes metrics such as precision, recall, and
-precision-recall curves, depending on the parameter selection method
-used.
-
-For each pathway, evaluation can be run independently of any parameter
-selection method (the ground truth-based evaluation without parameter
-selection idea) to directly inspect precision and recall for each
-reconstructed network from a given dataset.
-
-.. image:: ../_static/images/pr-per-pathway-nodes.png
- :alt: Precision and recall computed for each pathway and visualized on a scatter plot
- :width: 600
- :align: center
-
-.. raw:: html
-
-
-
-Ensemble-based parameter selection generates precision-recall curves by
-thresholding on the frequency of edges across an ensemble of
-reconstructed networks for an algorithm for given dataset.
-
-.. image:: ../_static/images/pr-curve-ensemble-nodes-per-algorithm-nodes.png
- :alt: Precision-recall curve computed for a single ensemble file / pathway and visualized as a curve
- :width: 600
- :align: center
-
-.. raw:: html
-
-
-
-PCA-based parameter selection computes a precision and recall for a
-single reconstructed network selected using PCA from all reconstructed
-networks for an algorithm for given dataset.
-
-.. image:: ../_static/images/pr-pca-chosen-pathway-per-algorithm-nodes.png
- :alt: Precision and recall computed for each pathway chosen by the PCA-selection method and visualized on a scatter plot
- :width: 600
- :align: center
-
-.. raw:: html
-
-
-
-.. note::
-
- Evaluation will only execute if ml has ``include: true``, because the
- PCA parameter selection step depends on the PCA ML analysis.
-
-.. note::
-
- To see evaluation in action, run SPRAS using the config.yaml or
- egfr.yaml configuration files.
+ Grid search features are still under development and will be added in
+ future SPRAS releases.
**********************
HTCondor integration
diff --git a/docs/tutorial/beginner.rst b/docs/tutorial/beginner.rst
index a0846666f..a60966446 100644
--- a/docs/tutorial/beginner.rst
+++ b/docs/tutorial/beginner.rst
@@ -15,66 +15,6 @@ You will learn how to:
dataset
- Enable post-analysis steps to generate post analysis information
-****************************************************************************
- Step 0: Clone the SPRAS repository, set up the environment, and run Docker
-****************************************************************************
-
-0.1 Clone the SPRAS repository
-==============================
-
-Visit the `SPRAS GitHub repository
-`__ and clone it locally
-
-0.2 Set up the SPRAS environment
-================================
-
-From the root directory of the SPRAS repository, create and activate the
-Conda environment and install the SPRAS python package:
-
-.. code:: bash
-
- conda env create -f environment.yml
- conda activate spras
- python -m pip install .
-
-.. note::
-
- The first command performs a one-time installation of the SPRAS
- dependencies by creating a Conda environment (an isolated space that
- keeps all required packages and versions separate from your system).
-
- The second command activates the newly created environment so you can
- use these dependencies when running SPRAS; this step must be done
- each time you open a new terminal session.
-
- The last command is a one-time installation of the SPRAS package into
- the environment.
-
-0.3 Test the installation
-=========================
-
-Run the following command to confirm that SPRAS has been set up
-successfully from the command line:
-
-.. code:: bash
-
- python -c "import spras; print('SPRAS import successful')"
-
-0.4 Start Docker
-================
-
-Before running SPRAS, make sure Docker Desktop is running.
-
-Launch Docker Desktop and wait until it says "Docker is running".
-
-.. note::
-
- SPRAS itself does not run inside a Docker container. However, Docker
- is required because SPRAS uses it to execute individual pathway
- reconstruction algorithms and certain post-analysis steps within
- isolated containers. These containers include all the necessary
- dependencies to run each algorithm or post analysis.
-
*****************************
Step 1: Configuration files
*****************************
@@ -93,15 +33,6 @@ seralizaiton.
SPRAS uses Snakemake to read the YAML configuration file and execute a
SPRAS workflow accordingly.
-..
- Snakemake considers a task from the configuration file complete once the expected output files are present in the output directory.
-
-..
- As a result, rerunning the same configuration file may do nothing if those files already exist.
-
-..
- To continue or rerun SPRAS with the same configuration file, delete the output directory (or its contents) or modify the configuration file so Snakemake regenerates new results.
-
1.1 Save config for this tutorial
=================================
@@ -124,6 +55,25 @@ After adding this file, your directory structure will look like this
│ ├── tps-egfr-prizes.txt # pre-defined in SPRAS already, used by the beginner.yaml file
│ └── ... other input data ...
+.. note::
+
+ There is a nested ``spras`` folder within the larger ``spras``
+ repository.
+
+ When downloading the beginner config file, place it in your working
+ ``spras`` directory under ``spras/config``, not in the
+ ``spras/spras/config`` directory.
+
+.. note::
+
+ If working in the dev container, you can upload files from your local
+ machine by right-clicking on a folder.
+
+ .. image:: ../_static/images/upload-file-dev-container.png
+ :alt: Right-click menu showing upload option
+ :scale: 40
+ :align: center
+
config/
-------
@@ -160,20 +110,33 @@ Algorithms
- name: omicsintegrator1
params:
include: true
- run1:
- b: 0.1
- d: 10
- g: 1e-3
- run2:
- b: [0.55, 2, 10]
- d: [10, 20]
- g: 1e-3
+ runs:
+ run1:
+ b: 0.1
+ d: 10
+ g: 1e-3
+ run2:
+ b: [0.55, 2, 10]
+ d: [10, 20]
+ g: 1e-3
When defining an algorithm in the configuration file, its name must
match one of the supported SPRAS algorithms. Each algorithm includes an
include flag, which you set to true to have Snakemake run it, or false
to disable it.
+A parameter is a configurable value, or set of values, that controls an
+algorithm's behavior. Parameters govern algorithm-specific behavior that
+shapes how an output subnetwork is constructed, so changing them
+produces different subnetworks from the same input data.
+
+Each algorithm exposes its own set of parameters that control its
+optimization strategy. Some algorithms have no adjustable parameters,
+while others include multiple tunable settings that influence how
+subnetworks are created. These parameters vary widely between algorithms
+and reflect the unique optimization techniques each method employs under
+the hood.
+
Algorithm parameters can be organized into one or more run blocks (e.g.,
run1, run2, …), with each block containing key-value pairs. When
defining a parameter, it can be passed as a single value or passed by
@@ -183,13 +146,6 @@ lists within a run block, SPRAS generates all possible combinations
single-value parameters in the same run block. Each unique combination
runs once per algorithm.
-Each algorithm exposes its own set of parameters that control its
-optimization strategy. Some algorithms have no adjustable parameters,
-while others include multiple tunable settings that influence how
-subnetworks are created. These parameters vary widely between algorithms
-and reflect the unique optimization techniques each method employs under
-the hood.
-
(See :doc:`Pathway Reconstruction Methods <../prms/prms>` for
information about algorithms and their parameters).
@@ -200,7 +156,7 @@ Datasets
datasets:
-
- label: egfr
+ label: data1
node_files: ["prizes.txt", "sources-targets.txt"]
edge_files: ["interactome.txt"]
other_files: []
@@ -227,6 +183,41 @@ A dataset must include the following types of keys and files:
connecting two molecules. An interactome is a large network of
possible interactions that defines many edges connecting molecules.
+Gold standard datasets
+----------------------
+
+.. code:: yaml
+
+ gold_standards:
+ -
+ label: gs1
+ node_files: ["gs_nodes0.txt", "gs_nodes1.txt"]
+ data_dir: "input"
+ dataset_labels: ["data0"]
+ -
+ label: gs2
+ edge_files: ["gs_edges0.txt"]
+ data_dir: "input"
+ dataset_labels: ["data0", "data1"]
+
+Users can specify one or more gold standard datasets in the
+configuration file to evaluate the subnetworks reconstructed from each
+dataset. When gold standards are provided and evaluation is enabled
+(shown below), SPRAS compares the output subnetworks for a given dataset
+against each gold standard that lists that dataset in ``dataset_labels``.
+
+A gold standard dataset must include the following types of keys and
+files:
+
+- ``label``: a name that uniquely identifies a gold standard dataset
+ throughout the SPRAS workflow and outputs.
+- ``node_files`` or ``edge_files``: A list of node or edge files. Only
+ one of these can be defined per gold standard dataset.
+- ``data_dir``: The file path of the directory where the input gold
+ standard dataset files are located.
+- ``dataset_labels``: a list of dataset labels indicating which
+ datasets this gold standard dataset should be evaluated against.
+
Reconstruction settings
-----------------------
@@ -246,17 +237,19 @@ Analysis
.. code:: yaml
analysis:
- summary:
- include: true
- cytoscape:
- include: true
- ml:
- include: true
+ summary:
+ include: true
+ cytoscape:
+ include: true
+ ml:
+ include: true
+ evaluation:
+ include: true
SPRAS includes multiple downstream analyses that can be toggled on or
off directly in the configuration file. When enabled, these analyses are
-performed per dataset and produce summaries or visualizations of the
-results from all enabled algorithms for that dataset.
+performed per dataset and produce summaries, visualizations, or
+evaluations of the results from all enabled algorithms for that dataset.
.. note::
@@ -419,8 +412,24 @@ under pathlinker so it looks like:
.. code:: yaml
- run2:
- k: [10, 100]
+ - name: "pathlinker"
+ include: true
+ runs:
+ run1:
+ k: 1
+ run2:
+ k: [10, 100]
+
+.. note::
+
+ YAML is indentation-sensitive. The ``run1`` and ``run2`` keys must be
+ aligned at the same indentation level, and their ``k`` parameters
+ must also be aligned with each other. Misaligned indentation will
+ cause the configuration file to fail to parse.
+
+ Tools like `YAML Prettifier
+ `_ can help format and
+ validate your configuration file.
With this update, the ``beginner.yaml`` configuration file is set up
have SPRAS run a single algorithm with multiple parameter settings on
@@ -571,6 +580,11 @@ Your analysis section in the configuration file should look like this:
cytoscape:
include: true
+.. note::
+
+ The Cytoscape analysis step will take noticeably longer to run than
+ the other analysis options.
+
``summary`` generates graph topological summary statistics for each
algorithm's parameter combination output, generating a summary file for
all reconstructed subnetworks for a given dataset.
diff --git a/docs/tutorial/intermediate.rst b/docs/tutorial/intermediate.rst
index 1055b8799..c9d061cb1 100644
--- a/docs/tutorial/intermediate.rst
+++ b/docs/tutorial/intermediate.rst
@@ -21,12 +21,91 @@ You will learn how to:
Step 1: Transforming high throughput experimental data into SPRAS compatible input data
*****************************************************************************************
-1.1 What is the SPRAS-standardized input data?
-==============================================
+1.1 Example of high-throughput omics data
+=========================================
-A pathway reconstruction algorithm requires a set of input nodes and an
-interactome; however, each algorithm expects its inputs to follow a
-unique format.
+High-throughput omics technologies measure thousands of biological
+molecules in a single experiment, producing genome-, transcriptome-, or
+proteome-wide snapshots of cellular state. These measurements quantify
+how molecular abundance or activity changes across conditions or time
+points, generating large-scale datasets that can be used as input for
+pathway reconstruction.
+
+An example dataset is EGF response mass spectrometry data [4]_, a
+proteomics dataset that measures peptide abundance after cells are
+stimulated with epidermal growth factor (EGF).
+
+The experiment for this data was repeated three times, known as
+biological replicates, to ensure the results are consistent. Each
+replicate measures the abundance of peptides at different time points
+(0-128 minutes) to capture how protein activity changes over time.
+
+.. note::
+
+ Mass spectrometry is a technique used to measure and identify
+ proteins in a sample. It works by breaking proteins into smaller
+ pieces called peptides and measuring their mass-to-charge ratio,
+ which enables identifying which peptide is being measured. The data
+ show how much of each peptide is present, which can show how protein
+ phosphorylation abundances change under different conditions.
+
+ Since proteins interact with each other in biological pathways,
+ changes in their phosphorylation abundances can reveal which parts of
+ a pathway are active or affected.
+
+Example of one peptide's measurements in one of the biological
+replicates:
+
+.. list-table::
+ :header-rows: 1
+ :widths: 20 15 10 10 10 10 10 10 10 10 10 10
+
+ - - peptide
+ - protein
+ - gene.name
+ - modified.sites
+ - 0 min
+ - 2 min
+ - 4 min
+ - 8 min
+ - 16 min
+ - 32 min
+ - 64 min
+ - 128 min
+
+ - - K.n[305.21]AFWMAIGGDRDEIEGLS[167.00]S[167.00]DEEH.-
+ - Q6PD74,B4DG44,Q5JPJ4,Q6AWA0
+ - AAGAB
+ - S310,S311
+ - 14.97
+ - 14.81
+ - 13.99
+ - 13.98
+ - 12.87
+ - 13.88
+ - 13.91
+ - 15.60
+
+Omics data can serve as input for pathway reconstruction, but it must
+first be reformatted to match the input format and requirements of each
+algorithm.
+
+1.2 What is the standardized input data?
+========================================
+
+A pathway reconstruction algorithm at minimum requires a set of input
+nodes (node_files) and an interactome (edge_files); however, each
+algorithm expects its inputs to follow a unique format.
+
+.. note::
+
+ Input nodes are a set of molecules of interest, typically derived
+ from high-throughput omics data.
+
+ An interactome is a network of known molecule-to-molecule
+ interactions, typically compiled by aggregating experimental and
+ curated data from public databases. It defines the set of possible
+ edges that algorithms can draw on when reconstructing subnetworks.
To simplify this process, SPRAS requires all input data in a dataset to
be formatted once into a standardized SPRAS format. SPRAS then
@@ -38,8 +117,8 @@ is enabled in the configuration file.
Each algorithm uses the input nodes to guide or constrain the
optimization process used to construct reconstruct subnetworks.
- An algorithm maps these input nodes onto the interactome and uses the
- network to identify connecting paths between the input nodes to form
+ An algorithm maps these input nodes onto the interactome and
+ identifies connecting paths between the input nodes to form
subnetworks.
Pathway reconstruction algorithms differ in the inputs nodes they
@@ -85,9 +164,22 @@ algorithms also interpret the input interactome differently.
- And some support mixed-directionaltiy interactomes. These
interactomes contain both directed and undirected edges.
-SPRAS automatically converts the user-provided edge file into the format
-expected by each algorithm, ensuring that the directionality of the
-interactome matches the algorithm's requirements.
+.. note::
+
+ Directionality describes whether an edge in the interactome captures
+ the direction of a biological interaction.
+
+ A directed edge (A -> B) means that molecule A acts on molecule B,
+ but not the reverse, for example, a kinase phosphorylating its
+ substrate or a transcription factor regulating a target gene.
+
+ An undirected edge (A - B) means that A and B interact, but the data
+ does not specify which one acts on the other, for example, two
+ proteins that bind each other in a complex.
+
+SPRAS automatically converts the user-provided edge file (interactome)
+into the format expected by each algorithm, ensuring that the
+directionality of the interactome matches the algorithm's requirements.
An example of an edge file required by SPRAS follows a tab-separated
format. where ``U`` indicates an undirected edge and ``D`` indicates a
@@ -104,73 +196,13 @@ directed edge:
about input data formats can be found in the ``inputs/README.md``
file within the SPRAS repository.
-1.2 Example high throughput data
-================================
-
-An example dataset is using EGF response mass spectrometry data [4]_.
-The experiment for this data was repeated three times, known as
-biological replicates, to ensure the results are consistent. Each
-replicate measures the abundance of peptides at different time points
-(0-128 minutes) to capture how protein activity changes over time.
+1.3 Preprocessing the omic data
+===============================
-.. note::
-
- Mass spectrometry is a technique used to measure and identify
- proteins in a sample. It works by breaking proteins into smaller
- pieces called peptides and measuring their mass-to-charge ratio,
- which enables identifying which peptide is being measured. The data
- show how much of each peptide is present, which can show how protein
- phosphorylation abundances change under different conditions.
-
- Since proteins interact with each other in biological pathways,
- changes in their phosphorylation abundances can reveal which parts of
- a pathway are active or affected. By mapping these changing proteins
- onto known interaction networks, pathway reconstruction algorithms
- can identify which signaling pathways are likely involved in the
- biological response to a specific condition.
-
-Example of one of the biological replicate A with one peptide:
-
-.. list-table::
- :header-rows: 1
- :widths: 20 15 10 10 10 10 10 10 10 10 10 10
-
- - - peptide
- - protein
- - gene.name
- - modified.sites
- - 0 min
- - 2 min
- - 4 min
- - 8 min
- - 16 min
- - 32 min
- - 64 min
- - 128 mn
-
- - - K.n[305.21]AFWMAIGGDRDEIEGLS[167.00]S[167.00]DEEH.-
- - Q6PD74,B4DG44,Q5JPJ4,Q6AWA0
- - AAGAB
- - S310,S311
- - 14.97
- - 14.81
- - 13.99
- - 13.98
- - 12.87
- - 13.88
- - 13.91
- - 15.60
-
-The goal is to turn this experimental data into the format that SPRAS
-expects.
-
-1.3 Filtering and normalizing the replicates
-============================================
-
-Before analysis, we filter out peptides not present in all three
-replicates to ensure consistency. Then, we normalize each replicate so
-intensity values are comparable and not biased by replicate-specific
-effects.
+Before analysis, we filter out peptides that are not present in all
+three replicates to ensure consistency across measurements. We then
+normalize each replicate so that intensity values are comparable and not
+biased by replicate-specific effects.
.. list-table::
:header-rows: 1
@@ -232,11 +264,12 @@ effects.
- 5.48
- A
-1.4 Computing p-values using Tukey's HSD Test
-=============================================
+1.4 Computing prizes
+====================
-We want to calculate the p-values per peptide. This tells us how likely
-changes in abundance happen by chance.
+We can transform these measurements into prizes for pathway
+reconstruction. One approach is to calculate a p-value per peptide, which
+quantifies how likely it is that changes in abundance arose by chance.
We use Tukey's Honest Significant Difference (HSD) test to compare all
time points and correct for multiple testing to get a p-value for every
@@ -311,28 +344,29 @@ pair of time points.
Peptides with lower p-values are more statistically significant and may
represent biologically meaningful changes in phosphorylation over time.
-1.5 From p-values to prizes
-===========================
+To use these p-values as input node prizes, we transform them with
+``-log10(p-value)`` so that smaller p-values produce larger prize
+scores.
-P-values are transformed using ``-log10(p-value)`` so smaller p-values
-give larger prize scores.
+Two adjustments are needed before the prizes are usable:
-For each peptide, the smallest p-value is selected (representing the
-most significant change) between each time point to the baseline (0 min)
-and between consecutive time points. This is because the ultimate
-network analysis will not use the temporal information.
+- Collapsing temporal information: The dataset contains temporal
+ measurements, but SPRAS does not include algorithms that use temporal
+ information. For each peptide, we select the smallest p-value across
+ all baseline-vs-time and consecutive time-point comparisons, since
+ the smallest p-value represents the most significant change.
-For each protein mapped to multiple peptides, the maximum prize value
-across all its peptides is assigned.
-
-Finally, all protein identifiers (using the first one listed for each
-protein) are converted to UniProt Entry Names to match the identifiers
-that will be used in the interactome.
+- Resolving peptide-to-protein duplicates: A single protein can map to
+ multiple peptides. For each protein, we assign the maximum prize
+ value across all of its peptides.
.. note::
- All node identifiers should use the same namespace across every part
- of the data in a dataset.
+ All node identifiers must use the same namespace across every part of
+ a dataset.
+
+ For this dataset, all protein identifiers are converted to UniProt
+ Entry Names, and the same conversion is applied to the interactome.
.. list-table::
:header-rows: 1
@@ -350,14 +384,15 @@ that will be used in the interactome.
- 0.12392034609392
- 0.906857382317364
-Input node data put into a SPRAS-standardized format:
+Input node data put into a SPRAS-standardized format (and IDs mapped to
+UniProt Entry Names):
.. code:: text
NODE_ID prize
AAGAB_HUMAN 0.906857382
-1.6 From Prizes to Source and Targets / Actives
+1.6 From prizes to sources, targets and actives
===============================================
.. image:: ../_static/images/erbb-signaling-pathway.png
@@ -377,7 +412,8 @@ Using known pathway knowledge [1]_ [2]_ [3]_:
- EGF is known to initiate signaling, so it can be added and assigned a
high score (greater than all other nodes) to emphasize its importance
and guide algorithms to start reconstruction from this point. (EGF is
- currently not in the data)
+ currently not in the data). We can assign it a score of 10, chosen
+ empirically.
- EGFR is in the current data. Looking at the pathway, we can see that
EGFR directly interacts with EGF in the pathway.
@@ -389,7 +425,7 @@ Using known pathway knowledge [1]_ [2]_ [3]_:
correspond to proteins that are active under the given biological
condition.
-Input node data put into a SPRAS-standardized format:
+Input node data transformed into a SPRAS-standardized format:
.. code:: text
@@ -403,9 +439,10 @@ Input node data put into a SPRAS-standardized format:
1.8 Finding an Interactome to use
=================================
-To connect our proteins, we use a background protein-protein interaction
-(PPI) network (the interactome). For this dataset, two interactomes are
-merged (directed edges prioritized when available):
+To connect our proteins, we need a background interactome. For this
+dataset, we merge two protein-protein interaction (PPI) interactomes,
+prioritizing directed edges when both sources include the same
+interaction:
- iRefIndex v13 (159,095 undirected interactions)
- PhosphoSitePlus (4,080 directed kinase-substrate interactions)
@@ -441,9 +478,13 @@ Interactome data put into a SPRAS-standardized format:
.. note::
- Many databases exist that provide interactomes. One is `STRING
+ Many databases provide interactomes. One example is `STRING
`__, which contains known protein-protein
- interactions across different species.
+ interactions across different species. For a broader overview of
+ available interactomes, see `Koh et al. (2025)
+ `__. Users can
+ also construct their own interactomes from experimental or curated
+ data.
1.9 This SPRAS-standardized data is already saved into SPRAS
============================================================
@@ -529,7 +570,7 @@ algorithm-specific inputs and an output filename (``raw-pathway.txt``).
With each of the ``raw-pathway.txt`` files, an algorithm-specific
wrapper includes a module that will convert the algorithm-specific
-format into a standardized SPRAS format.
+format into a standardized SPRAS output format.
2.3 Running SPRAS with multiple algorithms
==========================================
@@ -538,6 +579,50 @@ In the ``intermediate.yaml`` configuration file, it is set up to have
SPRAS run multiple algorithms with multiple parameter settings on a
single dataset.
+.. code:: yaml
+
+ algorithms:
+ - name: "pathlinker"
+ include: true
+ runs:
+ run1:
+ k: [1, 10, 100, 1000]
+
+ - name: omicsintegrator2
+ include: true
+ runs:
+ run1:
+ b: [4, 10]
+ g: [0, 3]
+ w: [0.25, 6]
+
+ - name: mincostflow
+ include: true
+ runs:
+ run1:
+ capacity: [15, 30]
+ flow: [80, 15]
+
+ - name: "strwr"
+ include: true
+ runs:
+ run1:
+ alpha: 0.85
+ threshold: [100, 200]
+
+ - name: "rwr"
+ include: true
+ runs:
+ run1:
+ alpha: 0.85
+ threshold: [100, 200]
+
+.. note::
+
+ The full suite of algorithms is described in :doc:`Pathway
+ Reconstruction Methods <../prms/prms>`. This part of the tutorial
+ uses only a subset.
+
From the root directory, run the command below from the command line:
.. code:: bash
@@ -624,118 +709,108 @@ What your directory structure should like after this run:
│ ├── phosphosite-irefindex13.0-uniprot.txt
│ └── tps-egfr-prizes.txt
├── outputs/
- │ └── basic/
- │ └── dataset-egfr-merged.pickle
- │ └── egfr-meo-params-FJBHHNE
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-meo-params-GKEDDFZ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-meo-params-JQ4DL7K
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-meo-params-OXXIFMZ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-mincostflow-params-42UBTQI
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-mincostflow-params-4G2PQRB
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator1-params-FZI2OGW
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator1-params-GUMLBDZ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator1-params-PCWFPQW
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator2-params-EHHWPMD
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator2-params-IV3IPCJ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-pathlinker-params-4YXABT7
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-pathlinker-params-7S4SLU6
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-pathlinker-params-D4TUKMX
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-pathlinker-params-VQL7BDZ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-rwr-params-34NN6EK
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-rwr-params-GGZCZBU
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-strwr-params-34NN6EK
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-strwr-params-GGZCZBU
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── logs
- │ └── datasets-egfr.yaml
- │ └── parameters-allpairs-params-BEH6YB2.yaml
- │ └── parameters-domino-params-V3X4RW7.yaml
- │ └── parameters-meo-params-FJBHHNE.yaml
- │ └── parameters-meo-params-GKEDDFZ.yaml
- │ └── parameters-meo-params-JQ4DL7K.yaml
- │ └── parameters-meo-params-OXXIFMZ.yaml
- │ └── parameters-mincostflow-params-42UBTQI.yaml
- │ └── parameters-mincostflow-params-4G2PQRB.yaml
- │ └── parameters-mincostflow-params-GGT4CVE.yaml
- │ └── parameters-omicsintegrator1-params-FZI2OGW.yaml
- │ └── parameters-omicsintegrator1-params-GUMLBDZ.yaml
- │ └── parameters-omicsintegrator1-params-PCWFPQW.yaml
- │ └── parameters-omicsintegrator2-params-EHHWPMD.yaml
- │ └── parameters-omicsintegrator2-params-IV3IPCJ.yaml
- │ └── parameters-pathlinker-params-4YXABT7.yaml
- │ └── parameters-pathlinker-params-7S4SLU6.yaml
- │ └── parameters-pathlinker-params-D4TUKMX.yaml
- │ └── parameters-pathlinker-params-VQL7BDZ.yaml
- │ └── parameters-rwr-params-34NN6EK.yaml
- │ └── parameters-rwr-params-GGZCZBU.yaml
- │ └── parameters-strwr-params-34NN6EK.yaml
- │ └── parameters-strwr-params-GGZCZBU.yaml
- │ └── prepared
- │ └── egfr-domino-inputs
- │ ├── active_genes.txt
- │ └── network.txt
- │ └── egfr-meo-inputs
- │ ├── edges.txt
- │ ├── sources.txt
- │ └── targets.txt
- │ └── egfr-mincostflow-inputs
- │ ├── edges.txt
- │ ├── sources.txt
- │ └── targets.txt
- │ └── egfr-omicsintegrator1-inputs
- │ ├── dummy_nodes.txt
- │ ├── edges.txt
- │ └── prizes.txt
- │ └── egfr-omicsintegrator2-inputs
- │ ├── edges.txt
- │ └── prizes.txt
- │ └── egfr-pathlinker-inputs
- │ ├── network.txt
- │ ── nodetypes.txt
- │ └── egfr-rwr-inputs
- │ ├── network.txt
- │ └── nodes.txt
- │ └── egfr-strwr-inputs
- | ├── network.txt
- | ├── sources.txt
- | └── targets.txt
+ │ └── intermediate/
+ │ ├── dataset-egfr-merged.pickle
+ │ ├── egfr-mincostflow-params-42UBTQI/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-mincostflow-params-B4P4LUU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-mincostflow-params-KTZPGLQ/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-mincostflow-params-MY6UCHG/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-44PJEHW/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-4NC62EL/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-4VRLTK5/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-52OUGT2/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-KEVHYWP/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-RUGOWNI/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-RVH2YKU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-WW2ILRO/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-7S4SLU6/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-D4TUKMX/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-TFORORH/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-VQL7BDZ/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-rwr-params-34NN6EK/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-rwr-params-GGZCZBU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-strwr-params-34NN6EK/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-strwr-params-GGZCZBU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── logs/
+ │ │ ├── datasets-egfr.yaml
+ │ │ ├── parameters-mincostflow-params-42UBTQI.yaml
+ │ │ ├── parameters-mincostflow-params-B4P4LUU.yaml
+ │ │ ├── parameters-mincostflow-params-KTZPGLQ.yaml
+ │ │ ├── parameters-mincostflow-params-MY6UCHG.yaml
+ │ │ ├── parameters-omicsintegrator2-params-44PJEHW.yaml
+ │ │ ├── parameters-omicsintegrator2-params-4NC62EL.yaml
+ │ │ ├── parameters-omicsintegrator2-params-4VRLTK5.yaml
+ │ │ ├── parameters-omicsintegrator2-params-52OUGT2.yaml
+ │ │ ├── parameters-omicsintegrator2-params-KEVHYWP.yaml
+ │ │ ├── parameters-omicsintegrator2-params-RUGOWNI.yaml
+ │ │ ├── parameters-omicsintegrator2-params-RVH2YKU.yaml
+ │ │ ├── parameters-omicsintegrator2-params-WW2ILRO.yaml
+ │ │ ├── parameters-pathlinker-params-7S4SLU6.yaml
+ │ │ ├── parameters-pathlinker-params-D4TUKMX.yaml
+ │ │ ├── parameters-pathlinker-params-TFORORH.yaml
+ │ │ ├── parameters-pathlinker-params-VQL7BDZ.yaml
+ │ │ ├── parameters-rwr-params-34NN6EK.yaml
+ │ │ ├── parameters-rwr-params-GGZCZBU.yaml
+ │ │ ├── parameters-strwr-params-34NN6EK.yaml
+ │ │ └── parameters-strwr-params-GGZCZBU.yaml
+ │ └── prepared/
+ │ ├── egfr-mincostflow-inputs/
+ │ │ ├── edges.txt
+ │ │ ├── sources.txt
+ │ │ └── targets.txt
+ │ ├── egfr-omicsintegrator2-inputs/
+ │ │ ├── edges.txt
+ │ │ └── prizes.txt
+ │ ├── egfr-pathlinker-inputs/
+ │ │ ├── network.txt
+ │ │ └── nodetypes.txt
+ │ ├── egfr-rwr-inputs/
+ │ │ ├── network.txt
+ │ │ └── nodes.txt
+ │ └── egfr-strwr-inputs/
+ │ ├── network.txt
+ │ ├── sources.txt
+ │ └── targets.txt
2.4 Reviewing the pathway.txt files
===================================
@@ -764,7 +839,7 @@ contains the following reconstructed subnetwork:
.. code:: text
- Node1 Node2 Rank Direction
+ Node1 Node2 Rank Direction
CBL_HUMAN EGFR_HUMAN 1 U
EGFR_HUMAN EGF_HUMAN 1 U
EMD_HUMAN LMNA_HUMAN 1 U
@@ -777,42 +852,68 @@ contains the following reconstructed subnetwork:
EGF_HUMAN S10A4_HUMAN 1 U
EMD_HUMAN SRC_HUMAN 1 U
-And the file ``egfr-omicsintegrator1-params-GUMLBDZ/pathway.txt``
+And the file ``egfr-omicsintegrator1-params-YYFFQV4/pathway.txt``
contains the following reconstructed subnetwork:
.. code:: text
- Node1 Node2 Rank Direction
- CBLB_HUMAN EGFR_HUMAN 1 U
- CBL_HUMAN CD2AP_HUMAN 1 U
- CBL_HUMAN CRKL_HUMAN 1 U
- CBL_HUMAN EGFR_HUMAN 1 U
- CBL_HUMAN PLCG1_HUMAN 1 U
- CDK1_HUMAN NPM_HUMAN 1 D
- CHD4_HUMAN HDAC2_HUMAN 1 U
- EGFR_HUMAN EGF_HUMAN 1 U
- EGFR_HUMAN GRB2_HUMAN 1 U
- EIF3B_HUMAN EIF3G_HUMAN 1 U
- FAK1_HUMAN PAXI_HUMAN 1 U
- GAB1_HUMAN PTN11_HUMAN 1 U
- GRB2_HUMAN PTN11_HUMAN 1 U
- GRB2_HUMAN SHC1_HUMAN 1 U
- HDAC2_HUMAN SIN3A_HUMAN 1 U
- HGS_HUMAN STAM2_HUMAN 1 U
- KS6A1_HUMAN MK01_HUMAN 1 U
- MK01_HUMAN ABI1_HUMAN 1 D
- MK01_HUMAN ERF_HUMAN 1 D
- MRE11_HUMAN RAD50_HUMAN 1 U
+ Node1 Node2 Rank Direction
+ CBLB_HUMAN EGFR_HUMAN 1 U
+ CBL_HUMAN CD2AP_HUMAN 1 U
+ CBL_HUMAN CRKL_HUMAN 1 U
+ CBL_HUMAN EGFR_HUMAN 1 U
+ CBL_HUMAN PLCG1_HUMAN 1 U
+ CDK1_HUMAN NPM_HUMAN 1 D
+ CHD4_HUMAN HDAC1_HUMAN 1 U
+ CHIP_HUMAN HS90A_HUMAN 1 U
+ CHIP_HUMAN P53_HUMAN 1 U
+ DNMT1_HUMAN HDAC1_HUMAN 1 U
+ EGFR_HUMAN EGF_HUMAN 1 U
+ EGFR_HUMAN GRB2_HUMAN 1 U
+ EIF3B_HUMAN EIF3G_HUMAN 1 U
+ FAK1_HUMAN PAXI_HUMAN 1 U
+ GAB1_HUMAN PTN11_HUMAN 1 U
+ GRB2_HUMAN KHDR1_HUMAN 1 U
+ GRB2_HUMAN PTN11_HUMAN 1 U
+ GRB2_HUMAN SHC1_HUMAN 1 U
+ HDAC1_HUMAN HDAC2_HUMAN 1 U
+ HDAC1_HUMAN P53_HUMAN 1 U
+ HDAC1_HUMAN RB_HUMAN 1 U
+ HDAC1_HUMAN SIN3A_HUMAN 1 U
+ HGS_HUMAN STAM2_HUMAN 1 U
+ HS90A_HUMAN STIP1_HUMAN 1 U
+ HS90A_HUMAN TEBP_HUMAN 1 U
+ KHDR1_HUMAN LCK_HUMAN 1 U
+ KS6A1_HUMAN MK01_HUMAN 1 U
+ MK01_HUMAN ABI1_HUMAN 1 D
+ MK01_HUMAN ERF_HUMAN 1 D
+ MRE11_HUMAN RAD50_HUMAN 1 U
+ P53_HUMAN TP53B_HUMAN 1 U
******************************
Step 3: Use ML post-analysis
******************************
+Rather than inspecting each output on its own, users may want to
+understand how the outputs from multiple algorithms and parameter
+combinations relate to one another when run on the same dataset. SPRAS
+includes machine learning (ML) post-analysis methods for this:
+hierarchical agglomerative clustering, principal component analysis,
+Jaccard similarity, and ensembling.
+
+.. note::
+
+ Each ML method operates on a dataset-specific binary
+ edge-by-subnetwork matrix. Rows represent edges in the union of all
+ reconstructed pathways, and columns represent output subnetworks. An
+ entry indicates whether a given edge appears in a given subnetwork (1
+ if present, 0 if absent).
+
3.1 Adding ML post-analysis to the intermediate configuration
=============================================================
To enable the ML analysis, update the analysis section in your
-configuration file by setting ml to true. Your analysis section in the
+configuration file by setting ``ml`` to true. Your analysis section in the
configuration file should look like this:
.. code:: yaml
@@ -823,14 +924,17 @@ configuration file should look like this:
... (other parameters preset)
``ml`` will perform unsupervised analyses such as principal component
-analysis (PCA), hierarchical agglomerative clustering (HAC), ensembling,
-and jaccard similarity comparisons of the pathways.
+analysis, hierarchical agglomerative clustering, ensembling, and Jaccard
+similarity comparisons of the pathways.
+
+.. note::
-- The ``ml`` section includes configurable parameters that let you
- adjust the behavior of the analyses performed.
+ The ``ml`` section includes configurable parameters that adjust the
+ behavior of these analyses. For the available options, see
+ ``config.yaml`` in the ``config/`` folder.
-With these updates, SPRAS will run the full set of unsupervised machine
-learning analyses across all outputs for a given dataset.
+With these updates, SPRAS will run the full ML analyses across all
+outputs for a given dataset.
After saving the changes in the configuration file, rerun with:
@@ -848,12 +952,12 @@ any requested post-analysis steps. It reuses cached results; here the
``pathway.txt`` files generated from the previously executed algorithms
on the egfr dataset are reused.
-2. Running the ml analysis
+2. Running the ML analysis
SPRAS aggregates all the reconstructed subnetworks produced across the
specified algorithms for a given dataset. SPRAS then performs machine
-learning analyses on each these groups and saves the results in the
-``-ml/`` (``egfr-ml/``) folder.
+learning analyses on each of these groups and saves the results in a
+``<dataset>-ml/`` (``egfr-ml/`` in this case) folder.
What your directory structure should like after this run:
---------------------------------------------------------
@@ -870,129 +974,119 @@ What your directory structure should like after this run:
│ ├── phosphosite-irefindex13.0-uniprot.txt
│ └── tps-egfr-prizes.txt
├── outputs/
- │ └── basic/
- │ └── dataset-egfr-merged.pickle
- │ └── egfr-meo-params-FJBHHNE
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-meo-params-GKEDDFZ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-meo-params-JQ4DL7K
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-meo-params-OXXIFMZ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-mincostflow-params-42UBTQI
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-mincostflow-params-4G2PQRB
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator1-params-FZI2OGW
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator1-params-GUMLBDZ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator1-params-PCWFPQW
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator2-params-EHHWPMD
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-omicsintegrator2-params-IV3IPCJ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-pathlinker-params-4YXABT7
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-pathlinker-params-7S4SLU6
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-pathlinker-params-D4TUKMX
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-pathlinker-params-VQL7BDZ
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-rwr-params-34NN6EK
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-rwr-params-GGZCZBU
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-strwr-params-34NN6EK
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-strwr-params-GGZCZBU
- │ └── pathway.txt
- │ └── raw-pathway.txt
- │ └── egfr-ml
- │ └── ensemble-pathway.txt
- │ └── hac-clusters-horizontal.txt
- │ └── hac-clusters-vertical.txt
- │ └── hac-horizontal.png
- │ └── hac-vertical.png
- │ └── jaccard-heatmap.png
- │ └── jaccard-matrix.txt
- │ └── pca-coordinates.txt
- │ └── pca-variance.txt
- │ └── pca.png
- │ └── logs
- │ └── datasets-egfr.yaml
- │ └── parameters-allpairs-params-BEH6YB2.yaml
- │ └── parameters-domino-params-V3X4RW7.yaml
- │ └── parameters-meo-params-FJBHHNE.yaml
- │ └── parameters-meo-params-GKEDDFZ.yaml
- │ └── parameters-meo-params-JQ4DL7K.yaml
- │ └── parameters-meo-params-OXXIFMZ.yaml
- │ └── parameters-mincostflow-params-42UBTQI.yaml
- │ └── parameters-mincostflow-params-4G2PQRB.yaml
- │ └── parameters-mincostflow-params-GGT4CVE.yaml
- │ └── parameters-omicsintegrator1-params-FZI2OGW.yaml
- │ └── parameters-omicsintegrator1-params-GUMLBDZ.yaml
- │ └── parameters-omicsintegrator1-params-PCWFPQW.yaml
- │ └── parameters-omicsintegrator2-params-EHHWPMD.yaml
- │ └── parameters-omicsintegrator2-params-IV3IPCJ.yaml
- │ └── parameters-pathlinker-params-4YXABT7.yaml
- │ └── parameters-pathlinker-params-7S4SLU6.yaml
- │ └── parameters-pathlinker-params-D4TUKMX.yaml
- │ └── parameters-pathlinker-params-VQL7BDZ.yaml
- │ └── parameters-rwr-params-34NN6EK.yaml
- │ └── parameters-rwr-params-GGZCZBU.yaml
- │ └── parameters-strwr-params-34NN6EK.yaml
- │ └── parameters-strwr-params-GGZCZBU.yaml
- │ └── prepared
- │ └── egfr-domino-inputs
- │ ├── active_genes.txt
- │ └── network.txt
- │ └── egfr-meo-inputs
- │ ├── edges.txt
- │ ├── sources.txt
- │ └── targets.txt
- │ └── egfr-mincostflow-inputs
- │ ├── edges.txt
- │ ├── sources.txt
- │ └── targets.txt
- │ └── egfr-omicsintegrator1-inputs
- │ ├── dummy_nodes.txt
- │ ├── edges.txt
- │ └── prizes.txt
- │ └── egfr-omicsintegrator2-inputs
- │ ├── edges.txt
- │ └── prizes.txt
- │ └── egfr-pathlinker-inputs
- │ ├── network.txt
- │ ── nodetypes.txt
- │ └── egfr-rwr-inputs
- │ ├── network.txt
- │ └── nodes.txt
- │ └── egfr-strwr-inputs
- | ├── network.txt
- | ├── sources.txt
- | └── targets.txt
+ │ └── intermediate/
+ │ ├── dataset-egfr-merged.pickle
+ │ ├── egfr-mincostflow-params-42UBTQI/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-mincostflow-params-B4P4LUU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-mincostflow-params-KTZPGLQ/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-mincostflow-params-MY6UCHG/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-ml/
+ │ │ ├── ensemble-pathway.txt
+ │ │ ├── hac-clusters-horizontal.txt
+ │ │ ├── hac-clusters-vertical.txt
+ │ │ ├── hac-horizontal.png
+ │ │ ├── hac-vertical.png
+ │ │ ├── jaccard-heatmap.png
+ │ │ ├── jaccard-matrix.txt
+ │ │ ├── pca-coordinates.txt
+ │ │ ├── pca-variance.txt
+ │ │ └── pca.png
+ │ ├── egfr-omicsintegrator2-params-44PJEHW/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-4NC62EL/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-4VRLTK5/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-52OUGT2/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-KEVHYWP/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-RUGOWNI/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-RVH2YKU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-WW2ILRO/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-7S4SLU6/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-D4TUKMX/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-TFORORH/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-VQL7BDZ/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-rwr-params-34NN6EK/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-rwr-params-GGZCZBU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-strwr-params-34NN6EK/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-strwr-params-GGZCZBU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── logs/
+ │ │ ├── datasets-egfr.yaml
+ │ │ ├── parameters-mincostflow-params-42UBTQI.yaml
+ │ │ ├── parameters-mincostflow-params-B4P4LUU.yaml
+ │ │ ├── parameters-mincostflow-params-KTZPGLQ.yaml
+ │ │ ├── parameters-mincostflow-params-MY6UCHG.yaml
+ │ │ ├── parameters-omicsintegrator2-params-44PJEHW.yaml
+ │ │ ├── parameters-omicsintegrator2-params-4NC62EL.yaml
+ │ │ ├── parameters-omicsintegrator2-params-4VRLTK5.yaml
+ │ │ ├── parameters-omicsintegrator2-params-52OUGT2.yaml
+ │ │ ├── parameters-omicsintegrator2-params-KEVHYWP.yaml
+ │ │ ├── parameters-omicsintegrator2-params-RUGOWNI.yaml
+ │ │ ├── parameters-omicsintegrator2-params-RVH2YKU.yaml
+ │ │ ├── parameters-omicsintegrator2-params-WW2ILRO.yaml
+ │ │ ├── parameters-pathlinker-params-7S4SLU6.yaml
+ │ │ ├── parameters-pathlinker-params-D4TUKMX.yaml
+ │ │ ├── parameters-pathlinker-params-TFORORH.yaml
+ │ │ ├── parameters-pathlinker-params-VQL7BDZ.yaml
+ │ │ ├── parameters-rwr-params-34NN6EK.yaml
+ │ │ ├── parameters-rwr-params-GGZCZBU.yaml
+ │ │ ├── parameters-strwr-params-34NN6EK.yaml
+ │ │ └── parameters-strwr-params-GGZCZBU.yaml
+ │ └── prepared/
+ │ ├── egfr-mincostflow-inputs/
+ │ │ ├── edges.txt
+ │ │ ├── sources.txt
+ │ │ └── targets.txt
+ │ ├── egfr-omicsintegrator2-inputs/
+ │ │ ├── edges.txt
+ │ │ └── prizes.txt
+ │ ├── egfr-pathlinker-inputs/
+ │ │ ├── network.txt
+ │ │ └── nodetypes.txt
+ │ ├── egfr-rwr-inputs/
+ │ │ ├── network.txt
+ │ │ └── nodes.txt
+ │ └── egfr-strwr-inputs/
+ │ ├── network.txt
+ │ ├── sources.txt
+ │ └── targets.txt
Step 3.2: Reviewing the ML outputs
==================================
@@ -1014,43 +1108,49 @@ across the outputs.
.. code:: text
- Node1 Node2 Frequency Direction
- EGF_HUMAN EGFR_HUMAN 0.42857142857142855 D
- EGF_HUMAN S10A4_HUMAN 0.38095238095238093 D
- S10A4_HUMAN MYH9_HUMAN 0.38095238095238093 D
- K7PPA8_HUMAN MDM2_HUMAN 0.09523809523809523 D
- MDM2_HUMAN P53_HUMAN 0.19047619047619047 D
- S10A4_HUMAN K7PPA8_HUMAN 0.19047619047619047 D
- K7PPA8_HUMAN SIR1_HUMAN 0.19047619047619047 D
- MDM2_HUMAN MDM4_HUMAN 0.09523809523809523 D
- MDM4_HUMAN P53_HUMAN 0.09523809523809523 D
- CD2A2_HUMAN CDK4_HUMAN 0.09523809523809523 D
- CDK4_HUMAN RB_HUMAN 0.09523809523809523 D
- MDM2_HUMAN CD2A2_HUMAN 0.09523809523809523 D
- EP300_HUMAN P53_HUMAN 0.2857142857142857 D
- K7PPA8_HUMAN EP300_HUMAN 0.09523809523809523 D
+ Node1 Node2 Frequency Direction
+ EGF_HUMAN EGFR_HUMAN 0.3 D
+ EGF_HUMAN S10A4_HUMAN 0.25 D
+ S10A4_HUMAN MYH9_HUMAN 0.2 D
+ K7PPA8_HUMAN MDM2_HUMAN 0.15 D
+ MDM2_HUMAN P53_HUMAN 0.15 D
+ S10A4_HUMAN K7PPA8_HUMAN 0.15 D
+ K7PPA8_HUMAN SIR1_HUMAN 0.15 D
+ MDM2_HUMAN MDM4_HUMAN 0.15 D
+ MDM4_HUMAN P53_HUMAN 0.15 D
+ CD2A2_HUMAN CDK4_HUMAN 0.15 D
+ CDK4_HUMAN RB_HUMAN 0.15 D
+ MDM2_HUMAN CD2A2_HUMAN 0.15 D
+ EP300_HUMAN P53_HUMAN 0.35 D
+ K7PPA8_HUMAN EP300_HUMAN 0.15 D
...
High frequency edges indicate interactions consistently recovered by
multiple algorithms. Low frequency edges may reflect noise or
-algorithm-specific connections.
+algorithm-specific connections. Edges that occur across many outputs are
+less likely to be algorithm-specific artifacts, so ensembling lets you
+filter for interactions supported by multiple algorithms or parameter
+settings.
Hierarchical agglomerative clustering
-------------------------------------
-#. Open the HAC image(s)
+#. Open the hierarchical agglomerative clustering image(s)
In your file explorer, go to
``output/intermediate/egfr-ml/hac-horizontal.png`` and/or
``output/intermediate/egfr-ml/hac-vertical.png`` and open it locally.
-SPRAS includes HAC to group similar pathways outputs based on shared
-edges. This helps identify clusters of algorithms that produce
-comparable subnetworks and highlights distinct reconstruction behaviors.
+SPRAS includes hierarchical agglomerative clustering to group similar
+pathway outputs based on shared edges. This helps identify clusters of
+algorithms that produce comparable subnetworks and highlights distinct
+reconstruction behaviors.
In the plots below, each branch represents a cluster of related
-pathways. Shorter distances between branches indicate outputs with
-greater similarity.
+pathways, and shorter distances between branches indicate greater
+similarity. Tight clusters group algorithms and parameter settings that
+produce comparable pathway structures, while isolated branches flag
+outputs that differ from the rest.
.. image:: ../_static/images/hac-horizontal.png
:alt: Hierarchical agglomerative clustering horizontal view
@@ -1070,10 +1170,6 @@ greater similarity.
-HAC visualizations help compare which algorithms and parameter settings
-produce similar pathway structures. Tight clusters indicate similar
-output behavior, while isolated branches may reveal unique results.
-
Principal component analysis
----------------------------
@@ -1082,10 +1178,13 @@ Principal component analysis
In your file explorer, go to ``output/intermediate/egfr-ml/pca.png`` and
open it locally.
-SPRAS also includes PCA to visualize variation across pathway outputs.
-Each point represents a pathway, placed based on its overall network
-structure. Pathways that cluster together in PCA space are more similar,
-while those farther apart differ in their reconstructed subnetworks.
+SPRAS also includes principal component analysis (PCA) to visualize
+variation across pathway outputs. Each point represents a pathway,
+placed based on its overall network structure. Pathways that cluster
+together in PCA space are more similar, while those farther apart differ
+in their reconstructed subnetworks. PCA may help identify patterns such
+as clusters of similar algorithm outputs, parameter sensitivities,
+and/or outlier outputs in a lower-dimensional space.
.. image:: ../_static/images/pca.png
:alt: Principal component analysis visualization across pathway outputs
@@ -1096,9 +1195,6 @@ while those farther apart differ in their reconstructed subnetworks.
-PCA may help identify patterns such as clusters of similar algorithms
-outputs, parameter sensitivities, and/or outlier outputs.
-
Jaccard similarity
------------------
@@ -1110,7 +1206,9 @@ In your file explorer, go to
SPRAS computes pairwise jaccard similarity between pathway outputs to
measure how much overlap exists between their reconstructed subnetworks.
The heatmap visualizes how similar the output pathways are between
-algorithms and their parameter settings.
+algorithms and their parameter settings. Higher similarity values
+indicate that pathways share many of the same edges, while lower values
+suggest distinct reconstructions.
.. image:: ../_static/images/jaccard-heatmap.png
:alt: Jaccard heatmap of the overlap between pathway outputs
@@ -1121,11 +1219,542 @@ algorithms and their parameter settings.
-Higher similarity values indicate that pathways share many of the same
-edges, while lower values suggest distinct reconstructions.
+**************************************
+ Step 4: Use Evaluation post-analysis
+**************************************
+
+In some cases, users may have a gold standard file that allows them to
+evaluate the quality of the reconstructed subnetworks generated by
+pathway reconstruction algorithms.
+
+However, gold standards may not exist for certain types of experimental
+data where validated ground truth interactions or molecules are
+unavailable or incomplete. For example, in emerging research areas or
+poorly characterized biological systems, interactions may not yet be
+experimentally verified or fully known, making it difficult to define a
+reliable reference network for evaluation.
+
+.. note::
+
+ A gold standard captures interactions that are already known, but
+ pathway reconstruction is also a tool for discovery. An algorithm
+ that scores well against a gold standard may do so by recovering
+ established interactions while missing novel ones.
+
+4.1 Adding evaluation post-analysis to the intermediate configuration
+=====================================================================
+
+To enable evaluation, update the analysis section of your configuration
+file. In the ``evaluation`` section, set ``include`` and
+``aggregate_per_algorithm`` to ``true``. Also, in the ``ml`` section,
+set ``kde``, ``remove_empty_pathways``, and ``aggregate_per_algorithm``
+to ``true``. Your analysis section in the configuration file should look
+like this:
+
+.. code:: yaml
+
+   analysis:
+     summary:
+       include: true
+     ml:
+       include: true
+       aggregate_per_algorithm: true
+       kde: true
+       remove_empty_pathways: true
+
+     evaluation:
+       include: true
+       aggregate_per_algorithm: true
+
+Setting ``aggregate_per_algorithm`` to true will additionally group
+post-analysis and evaluations by algorithm per dataset. Without this,
+outputs from all algorithms per dataset are aggregated together for
+post-analysis rather than broken out per algorithm.
+
+Within ``ml``, ``remove_empty_pathways`` excludes pathways with no nodes
+or edges from the PCA post analysis. The ``kde`` option creates a kernel
+density estimate over the PCA plots.
+
+``summary`` is enabled because evaluation uses summary statistics to
+break ties between pathways for some of the parameter selection methods
+(more details further into the tutorial).
+
+We need to delete the existing ``egfr-ml/`` folder before rerunning
+SPRAS so that Snakemake regenerates the ML outputs with the new
+customized ML settings. Run this command from the root directory:
+
+.. code:: bash
+
+ rm -rf output/intermediate/egfr-ml/
+
+.. note::
+
+ Snakemake skips steps whose output files already exist, so changes to
+ ML configuration parameters will not trigger a rerun unless the
+ existing ML outputs are removed first.
+
+ Automatic re-execution on config changes is a known limitation and is
+ being addressed in ongoing SPRAS development.
+
+The intermediate configuration also includes a gold standard for the
+EGFR dataset, which is already set up in SPRAS and does not require any
+setup:
+
+.. code:: yaml
+
+   gold_standards:
+     -
+       label: gs_egfr
+       node_files: ["gs-egfr.txt"]
+       data_dir: "input"
+       dataset_labels: ["egfr"]
+
+.. note::
+
+ The gold standard for this dataset consists of nodes only, following
+ the original study. The gold standard nodes are drawn from eight
+ EGFR-related reference pathways [4]_.
+
+ A limitation of this gold standard is its incomplete coverage of EGF
+ signaling pathways. Across the eight reference pathways, typically
+ 5\% or fewer of the input nodes appear in any single pathway, and
+ 85\% are absent from all eight. This reflects the general
+ incompleteness of curated pathway databases relative to measured
+ signaling responses, rather than a flaw specific to this dataset
+ [4]_.
+
+With these updates, SPRAS will run the evaluations across all outputs
+for a given dataset.
+
+After saving the changes in the configuration file, rerun with:
+
+.. code:: bash
+
+ snakemake --cores 4 --configfile config/intermediate.yaml
+
+What happens when you run this command
+--------------------------------------
+
+1. Reusing cached results
+
+Snakemake reads the options set in ``intermediate.yaml`` and checks for
+any requested post-analysis steps. It reuses cached results; here the
+``pathway.txt`` files generated from the previously executed algorithms
+on the egfr dataset are reused.
+
+2. Running the ML analysis
+
+SPRAS aggregates all the reconstructed subnetworks produced across the
+specified algorithms for a given dataset. SPRAS then performs machine
+learning analyses on each of these groups and saves the results in the
+``<dataset>-ml/`` (``egfr-ml/``) folder. SPRAS also runs the ML
+analyses per algorithm for a given dataset. This groups the ML post
+analysis by algorithm per dataset and produces algorithm-specific ML
+outputs.
+
+3. Running the summary analysis
+
+SPRAS aggregates the ``pathway.txt`` files from all selected parameter
+combinations into a single summary table, saved as
+``egfr-pathway-summary.txt``. This is used if any tiebreakers occur
+during PCA-based parameter selection.
+
+4. Running the evaluation
+
+For each dataset listed in a gold standard's ``dataset_labels``, SPRAS
+compares the reconstructed subnetworks against that gold standard using
+the parameter selection methods enabled in the configuration.
+
+The evaluation runs at two levels: once across all algorithms combined,
+and once per algorithm. The per-algorithm evaluation depends on
+per-algorithm ML outputs, which is why ``aggregate_per_algorithm`` was
+set to true in the ``ml`` section above. This produces both
+all-algorithm evaluation files and algorithm-specific evaluation files
+for each dataset-goldstandard pair.
+
+What your directory structure should look like after this run:
+--------------------------------------------------------------
+
+.. code:: text
+
+ spras/
+ ├── .snakemake/
+ │ └── log/
+ │ └── ... snakemake log files ...
+ ├── config/
+ │ └── intermediate.yaml
+ ├── input/
+ │ ├── phosphosite-irefindex13.0-uniprot.txt
+ │ └── tps-egfr-prizes.txt
+ ├── output/
+ │ └── intermediate/
+ │ ├── dataset-egfr-merged.pickle
+ │ ├── egfr-gs_egfr-eval/
+ │ │ ├── pr-curve-ensemble-nodes-per-algorithm-nodes.png
+ │ │ ├── pr-curve-ensemble-nodes-per-algorithm-nodes.txt
+ │ │ ├── pr-curve-ensemble-nodes.png
+ │ │ ├── pr-curve-ensemble-nodes.txt
+ │ │ ├── pr-pca-chosen-pathway-nodes.png
+ │ │ ├── pr-pca-chosen-pathway-nodes.txt
+ │ │ ├── pr-pca-chosen-pathway-per-algorithm-nodes.png
+ │ │ ├── pr-pca-chosen-pathway-per-algorithm-nodes.txt
+ │ │ ├── pr-per-pathway-for-mincostflow-nodes.png
+ │ │ ├── pr-per-pathway-for-mincostflow-nodes.txt
+ │ │ ├── pr-per-pathway-for-omicsintegrator2-nodes.png
+ │ │ ├── pr-per-pathway-for-omicsintegrator2-nodes.txt
+ │ │ ├── pr-per-pathway-for-pathlinker-nodes.png
+ │ │ ├── pr-per-pathway-for-pathlinker-nodes.txt
+ │ │ ├── pr-per-pathway-for-rwr-nodes.png
+ │ │ ├── pr-per-pathway-for-rwr-nodes.txt
+ │ │ ├── pr-per-pathway-for-strwr-nodes.png
+ │ │ ├── pr-per-pathway-for-strwr-nodes.txt
+ │ │ ├── pr-per-pathway-nodes.png
+ │ │ └── pr-per-pathway-nodes.txt
+ │ ├── egfr-mincostflow-params-42UBTQI/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-mincostflow-params-B4P4LUU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-mincostflow-params-KTZPGLQ/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-mincostflow-params-MY6UCHG/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-ml/
+ │ │ ├── ensemble-pathway.txt
+ │ │ ├── hac-clusters-horizontal.txt
+ │ │ ├── hac-clusters-vertical.txt
+ │ │ ├── hac-horizontal.png
+ │ │ ├── hac-vertical.png
+ │ │ ├── jaccard-heatmap.png
+ │ │ ├── jaccard-matrix.txt
+ │ │ ├── mincostflow-ensemble-pathway.txt
+ │ │ ├── mincostflow-hac-clusters-horizontal.txt
+ │ │ ├── mincostflow-hac-clusters-vertical.txt
+ │ │ ├── mincostflow-hac-horizontal.png
+ │ │ ├── mincostflow-hac-vertical.png
+ │ │ ├── mincostflow-jaccard-heatmap.png
+ │ │ ├── mincostflow-jaccard-matrix.txt
+ │ │ ├── mincostflow-pca-coordinates.txt
+ │ │ ├── mincostflow-pca-variance.txt
+ │ │ ├── mincostflow-pca.png
+ │ │ ├── omicsintegrator2-ensemble-pathway.txt
+ │ │ ├── omicsintegrator2-hac-clusters-horizontal.txt
+ │ │ ├── omicsintegrator2-hac-clusters-vertical.txt
+ │ │ ├── omicsintegrator2-hac-horizontal.png
+ │ │ ├── omicsintegrator2-hac-vertical.png
+ │ │ ├── omicsintegrator2-jaccard-heatmap.png
+ │ │ ├── omicsintegrator2-jaccard-matrix.txt
+ │ │ ├── omicsintegrator2-pca-coordinates.txt
+ │ │ ├── omicsintegrator2-pca-variance.txt
+ │ │ ├── omicsintegrator2-pca.png
+ │ │ ├── pathlinker-ensemble-pathway.txt
+ │ │ ├── pathlinker-hac-clusters-horizontal.txt
+ │ │ ├── pathlinker-hac-clusters-vertical.txt
+ │ │ ├── pathlinker-hac-horizontal.png
+ │ │ ├── pathlinker-hac-vertical.png
+ │ │ ├── pathlinker-jaccard-heatmap.png
+ │ │ ├── pathlinker-jaccard-matrix.txt
+ │ │ ├── pathlinker-pca-coordinates.txt
+ │ │ ├── pathlinker-pca-variance.txt
+ │ │ ├── pathlinker-pca.png
+ │ │ ├── pca-coordinates.txt
+ │ │ ├── pca-variance.txt
+ │ │ ├── pca.png
+ │ │ ├── rwr-ensemble-pathway.txt
+ │ │ ├── rwr-hac-clusters-horizontal.txt
+ │ │ ├── rwr-hac-clusters-vertical.txt
+ │ │ ├── rwr-hac-horizontal.png
+ │ │ ├── rwr-hac-vertical.png
+ │ │ ├── rwr-jaccard-heatmap.png
+ │ │ ├── rwr-jaccard-matrix.txt
+ │ │ ├── rwr-pca-coordinates.txt
+ │ │ ├── rwr-pca-variance.txt
+ │ │ ├── rwr-pca.png
+ │ │ ├── strwr-ensemble-pathway.txt
+ │ │ ├── strwr-hac-clusters-horizontal.txt
+ │ │ ├── strwr-hac-clusters-vertical.txt
+ │ │ ├── strwr-hac-horizontal.png
+ │ │ ├── strwr-hac-vertical.png
+ │ │ ├── strwr-jaccard-heatmap.png
+ │ │ ├── strwr-jaccard-matrix.txt
+ │ │ ├── strwr-pca-coordinates.txt
+ │ │ ├── strwr-pca-variance.txt
+ │ │ └── strwr-pca.png
+ │ ├── egfr-omicsintegrator2-params-44PJEHW/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-4NC62EL/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-4VRLTK5/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-52OUGT2/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-KEVHYWP/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-RUGOWNI/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-RVH2YKU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-omicsintegrator2-params-WW2ILRO/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-7S4SLU6/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-D4TUKMX/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-TFORORH/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathlinker-params-VQL7BDZ/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-pathway-summary.txt
+ │ ├── egfr-rwr-params-34NN6EK/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-rwr-params-GGZCZBU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-strwr-params-34NN6EK/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── egfr-strwr-params-GGZCZBU/
+ │ │ ├── pathway.txt
+ │ │ └── raw-pathway.txt
+ │ ├── gs-gs_egfr-merged.pickle
+ │ ├── logs/
+ │ │ ├── datasets-egfr.yaml
+ │ │ ├── parameters-mincostflow-params-42UBTQI.yaml
+ │ │ ├── parameters-mincostflow-params-B4P4LUU.yaml
+ │ │ ├── parameters-mincostflow-params-KTZPGLQ.yaml
+ │ │ ├── parameters-mincostflow-params-MY6UCHG.yaml
+ │ │ ├── parameters-omicsintegrator2-params-44PJEHW.yaml
+ │ │ ├── parameters-omicsintegrator2-params-4NC62EL.yaml
+ │ │ ├── parameters-omicsintegrator2-params-4VRLTK5.yaml
+ │ │ ├── parameters-omicsintegrator2-params-52OUGT2.yaml
+ │ │ ├── parameters-omicsintegrator2-params-KEVHYWP.yaml
+ │ │ ├── parameters-omicsintegrator2-params-RUGOWNI.yaml
+ │ │ ├── parameters-omicsintegrator2-params-RVH2YKU.yaml
+ │ │ ├── parameters-omicsintegrator2-params-WW2ILRO.yaml
+ │ │ ├── parameters-pathlinker-params-7S4SLU6.yaml
+ │ │ ├── parameters-pathlinker-params-D4TUKMX.yaml
+ │ │ ├── parameters-pathlinker-params-TFORORH.yaml
+ │ │ ├── parameters-pathlinker-params-VQL7BDZ.yaml
+ │ │ ├── parameters-rwr-params-34NN6EK.yaml
+ │ │ ├── parameters-rwr-params-GGZCZBU.yaml
+ │ │ ├── parameters-strwr-params-34NN6EK.yaml
+ │ │ └── parameters-strwr-params-GGZCZBU.yaml
+ │ └── prepared/
+ │ ├── egfr-mincostflow-inputs/
+ │ │ ├── edges.txt
+ │ │ ├── sources.txt
+ │ │ └── targets.txt
+ │ ├── egfr-omicsintegrator2-inputs/
+ │ │ ├── edges.txt
+ │ │ └── prizes.txt
+ │ ├── egfr-pathlinker-inputs/
+ │ │ ├── network.txt
+ │ │ └── nodetypes.txt
+ │ ├── egfr-rwr-inputs/
+ │ │ ├── network.txt
+ │ │ └── nodes.txt
+ │ └── egfr-strwr-inputs/
+ │ ├── network.txt
+ │ ├── sources.txt
+ │ └── targets.txt
+
+4.2 What is parameter selection?
+================================
+
+Parameter selection refers to the process of determining which parameter
+combinations should be used for evaluation on a gold standard dataset.
+Each parameter selection method has its own corresponding evaluation
+procedure.
+
+.. note::
+
+ There is no single principled way to decide which outputs to
+ evaluate, so SPRAS provides several parameter selection strategies
+ instead of committing to one. Some strategies pick a single
+ representative output for each algorithm, while others evaluate
+ across the full set of parameter combinations.
+
+ Parameter selection also guards against overtuning. Algorithms differ
+ in how many parameters they expose and how much they can be tuned to
+ get a better answer, so comparing them on a representative output
+ rather than on the full sweep puts them on a fairer footing.
+
+ Selecting a representative output also measures how an algorithm
+ typically behaves rather than its best run, which is a better basis
+ for judging an algorithm in practice, where the ideal parameters for
+ a new dataset are not known in advance.
+
+Parameter selection is handled in the evaluation code, which supports
+multiple parameter selection strategies. A user can enable evaluation
+(by setting evaluation ``include: true``) and it will run all of the
+parameter selection code.
+
+.. note::
+
+ Some parameter selection features are still under development and
+ will be added in future SPRAS releases.
+
+PCA-based parameter selection
+-----------------------------
+
+The PCA-based approach identifies a representative parameter setting for
+each pathway reconstruction algorithm on a given dataset. It selects the
+single parameter combination that best captures the central trend of an
+algorithm's reconstruction behavior.
+
+For each algorithm, all reconstructed subnetworks are projected into an
+algorithm-specific 2D PCA space based on the set of edges produced by
+the respective parameter combinations for that algorithm. This
+projection summarizes how the algorithm's outputs vary across different
+parameter combinations, allowing patterns in the outputs to be
+visualized in a lower-dimensional space.
+
+Within each PCA space, a kernel density estimate (KDE) is computed over
+the projected points to identify regions of high density. The output
+closest to the highest KDE peak is selected as the most representative
+parameter setting, as it corresponds to the region where the algorithm
+most consistently produces similar subnetworks.
+
+.. image:: ../_static/images/pca-kde.png
+ :alt: Principal component analysis visualization across pathway outputs with a kernel density estimate computed on top
+ :width: 600
+ :align: center
+
+.. raw:: html
+
+
+
+If two or more pathways are equally close to the highest peak of the
+KDE, SPRAS resolves the tie by:
+
+- Choosing the smallest pathway (fewest nodes and edges).
+- If a tie remains, choosing the first pathway alphabetically by name.
+
+The chosen output subnetwork is then compared to the gold standard, and
+its precision and recall are measured.
+
+Ensemble network-based parameter selection
+------------------------------------------
+
+The ensemble-based approach combines results from all parameter settings
+for each pathway reconstruction algorithm on a given dataset. Instead of
+focusing on a single "best" parameter combination, it summarizes the
+algorithm's overall reconstruction behavior across parameters.
+
+All reconstructed subnetworks are merged into algorithm-specific
+ensemble networks, where each edge weight reflects how frequently that
+interaction appears across the outputs. Edges that occur more often are
+assigned higher weights, highlighting interactions that are most
+consistently recovered by the algorithm.
+
+These consensus networks help identify the core patterns and overall
+stability of an algorithm's outputs without needing to choose a single
+parameter setting (a clear optimal parameter combination may not exist).
+
+For each algorithm-specific ensemble network, SPRAS generates a
+precision-recall curve by treating edge frequencies as thresholds and
+evaluating each ensemble network against the dataset's associated gold
+standard.
+
+All Plausible Parameters (No parameter selection)
+-------------------------------------------------
+
+The all plausible parameters approach evaluates all parameter
+combinations without selecting a representative subset or ensembling.
+This method provides a holistic view of algorithm performance by
+evaluating every output. For each algorithm and dataset, we compute
+precision and recall for every subnetwork. This allows us to measure
+reconstruction performance across the full range of parameter settings
+and observe each algorithm's full range of capabilities.
+
+4.3 Reviewing the evaluation outputs
+====================================
+
+PCA-based parameter selection
+-----------------------------
+
+#. Open the PCA chosen parameter selection evaluation
+
+In your file explorer, go to
+``output/intermediate/egfr-gs_egfr-eval/pr-pca-chosen-pathway-per-algorithm-nodes.png``
+and open it locally.
+
+PCA-based parameter selection computes a precision and recall for a
+single reconstructed network selected using PCA from all reconstructed
+networks for an algorithm for a given dataset.
+
+.. image:: ../_static/images/pr-pca-chosen-pathway-per-algorithm-nodes.png
+ :alt: Precision and recall computed for each pathway chosen by the PCA-selection method and visualized on a scatter plot
+ :width: 600
+ :align: center
+
+Ensemble network-based parameter selection
+------------------------------------------
+
+#. Open the ensemble-based parameter selection evaluation
+
+In your file explorer, go to
+``output/intermediate/egfr-gs_egfr-eval/pr-curve-ensemble-nodes-per-algorithm-nodes.png``
+and open it locally.
+
+Ensemble-based parameter selection generates precision-recall curves by
+thresholding on the frequency of edges across an ensemble of
+reconstructed networks for an algorithm for a given dataset.
+
+.. image:: ../_static/images/pr-curve-ensemble-nodes-per-algorithm-nodes.png
+ :alt: Precision-recall curve computed for a single ensemble file / pathway and visualized as a curve
+ :width: 600
+ :align: center
+
+.. raw:: html
+
+
+
+All Plausible Parameters (No parameter selection)
+-------------------------------------------------
+
+#. Open the all plausible parameters (no parameter selection) evaluation
+
+In your file explorer, go to
+``output/intermediate/egfr-gs_egfr-eval/pr-per-pathway-nodes.png`` and
+open it locally.
+
+For each pathway, evaluation can be run independently of any parameter
+selection method to directly inspect precision and recall for each
+reconstructed network from a given dataset.
+
+.. raw:: html
+
+
+
+.. image:: ../_static/images/pr-per-pathway-nodes.png
+ :alt: Precision and recall computed for each pathway and visualized on a scatter plot
+ :width: 600
+ :align: center
+
+.. raw:: html
+
+
-References
-==========
+************
+ References
+************
.. [1]
diff --git a/docs/tutorial/introduction.rst b/docs/tutorial/introduction.rst
index 369146742..bd9a6457b 100644
--- a/docs/tutorial/introduction.rst
+++ b/docs/tutorial/introduction.rst
@@ -22,7 +22,133 @@ Together, we will cover:
Prerequisites for this tutorial
*********************************
+Required knowledge
+==================
+
+- Ability to run command line operations and modify YAML files.
+- Basic biology concepts
+
+Option 1: Running SPRAS in a GitHub Codespace
+=============================================
+
+SPRAS also ships with a dev container, and the quickest way to use it is
+through `GitHub Codespaces <https://github.com/features/codespaces>`_.
+
+A Codespace builds the dev container on GitHub's infrastructure and
+opens it in your browser, so you do not need to install Docker or set up
+a local Python environment. The ``.devcontainer`` configuration in SPRAS
+sets up the environment for you.
+
+Prerequisites
+-------------
+
+A GitHub account. Sign up at `github.com <https://github.com>`_ if you
+do not have one.
+
+Step 1: Create a Codespace
+--------------------------
+
+#. Go to `github.com/codespaces <https://github.com/codespaces>`_.
+#. Select **New codespace**.
+#. In the repository field, search for and select
+ ``Reed-CompBio/spras``.
+#. Select **Create codespace**.
+
+GitHub builds the container from the SPRAS ``.devcontainer``
+configuration (the first build takes around 15 minutes) and opens a VS
+Code environment in your browser with the SPRAS dependencies already
+installed. Once the build finishes, you are ready to run SPRAS.
+
+.. note::
+
+ All GitHub personal accounts include a quota of free compute time and
+ storage for GitHub Codespaces. Usage beyond the included amounts is
+ billed to the personal account. See the `GitHub Codespaces billing
+ documentation
+ `_
+ for details.
+
+ .. list-table::
+ :header-rows: 1
+ :widths: 40 30 30
+
+ - - Account plan
+ - Storage per month
+ - Compute time per month
+
+ - - GitHub Free for personal accounts
+ - 15 GB-month
+ - 120 hrs
+
+ - - GitHub Pro
+ - 20 GB-month
+ - 180 hrs
+
+ You will not be charged for codespace usage unless you exceed your
+ quota. If you hit the limit, free options are to create a new GitHub
+ account or switch to the local SPRAS setup.
+
+Step 2: Set up the SPRAS environment
+------------------------------------
+
+From the root directory of the SPRAS repository, create and activate the
+Conda environment, then install the SPRAS Python package.
+
+First, create the environment:
+
+.. code:: bash
+
+ conda env create -f environment.yml
+ conda init
+
+Open a new terminal and then run:
+
+.. code:: bash
+
+ conda activate spras
+ python -m pip install .
+
+.. note::
+
+ ``conda env create`` performs a one-time installation of the SPRAS
+ dependencies by creating a Conda environment (an isolated space that
+ keeps all required packages and versions separate from your system).
+ ``conda init`` sets up your shell so that ``conda activate`` works in
+ new terminals, which is why a new terminal is opened afterwards.
+
+ ``conda activate spras`` activates the newly created environment so you
+ can use these dependencies when running SPRAS; this step must be done
+ each time you open a new terminal session.
+
+ ``python -m pip install .`` is a one-time installation of the SPRAS
+ package into the environment.
+
+.. note::
+
+ You may see the following error during installation:
+
+ .. code:: text
+
+ ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
+ dsub 0.4.13 requires tenacity<=8.2.3, but you have tenacity 9.1.4 which is incompatible.
+
+ This is safe to ignore. We do not use dsub as a container option in
+ this tutorial, and a fix is currently in progress. SPRAS and this
+ tutorial will run correctly without a working dsub installation.
+
+Step 3: Test the installation
+-----------------------------
+
+Run the following command to confirm that SPRAS has been set up
+successfully from the command line:
+
+.. code:: bash
+
+ python -c "import spras; print('SPRAS import successful')"
+
+Option 2: Running SPRAS locally
+===============================
+
Required software:
+------------------
- `Conda
`__
@@ -41,15 +167,75 @@ Required software:
- (Optional) `Cytoscape `__ for visualizing
networks (download locally, the web version will not suffice)
-Required knowledge:
+.. note::
+
+ Mac users who experience performance issues with Docker Desktop can
+ try `OrbStack <https://orbstack.dev>`_ as an alternative.
-- Ability to run command line operations and modify YAML files.
-- Basic biology concepts
+Step 1: Clone the SPRAS repository
+----------------------------------
+
+Visit the `SPRAS GitHub repository
+<https://github.com/Reed-CompBio/spras>`__ and clone it locally.
+
+.. note::
+
+ If you are using the dev container, you can skip this step.
+
+Step 2: Set up the SPRAS environment
+------------------------------------
+
+From the root directory of the SPRAS repository, create and activate the
+Conda environment and install the SPRAS Python package:
+
+.. code:: bash
+
+ conda env create -f environment.yml
+ conda activate spras
+ python -m pip install .
+
+.. note::
+
+ The first command performs a one-time installation of the SPRAS
+ dependencies by creating a Conda environment (an isolated space that
+ keeps all required packages and versions separate from your system).
+
+ The second command activates the newly created environment so you can
+ use these dependencies when running SPRAS; this step must be done
+ each time you open a new terminal session.
+
+ The last command is a one-time installation of the SPRAS package into
+ the environment.
+
+Step 3: Test the installation
+-----------------------------
+
+Run the following command to confirm that SPRAS has been set up
+successfully from the command line:
+
+.. code:: bash
+
+ python -c "import spras; print('SPRAS import successful')"
+
+Step 4: Start Docker
+--------------------
+
+Before running SPRAS, make sure Docker Desktop is running.
+
+Launch Docker Desktop and wait until it says "Docker is running".
+
+.. note::
+
+ SPRAS itself does not run inside a Docker container. However, Docker
+ is required because SPRAS uses it to execute individual pathway
+ reconstruction algorithms and certain post-analysis steps within
+ isolated containers. These containers include all the necessary
+ dependencies to run each algorithm or post analysis.
.. note::
- This tutorial will require downloading approximately 18.3 GB of
- Docker images and running many Docker containers.
+ Running the tutorial locally will require downloading approximately 7 GB
+ of Docker images and running many Docker containers.
SPRAS does not automatically clean up these containers or images
after execution, so users will need to remove them manually if
@@ -80,7 +266,7 @@ experiment.
Pathway reconstruction algorithms address this by mapping molecules of
interest onto large-scale interaction networks (interactomes) to
generate candidate context-specific subnetworks that better reflect the
-high-throughput experimental data.
+condition or experiment.
These algorithms allow researchers to propose computational-backed
hypothetical subnetworks that capture the unique characteristics of a
@@ -88,7 +274,7 @@ given context without having to experimentally test every individual
interaction.
Running a single pathway reconstruction algorithm on a single dataset
-can be challenging, since each algorithm often requires its own input
+can be challenging, as each algorithm often requires its own input
format, software environment, or even a full reimplementation. These
challenges only grow when scaling up to using multiple algorithms and
datasets.