eddUG · eddUG · Jul 16, 2024 · Jul 16, 2024 · Jul 16, 2024
diff --git a/README.md b/README.md
@@ -16,6 +16,52 @@ nextflow /path/to/repository/main.nf \
         --project_id XXXXXXXXXX \
         --input_manifest /path/to/input_manifest.tsv
 ```
+## Validation
+
+This repository includes a set of validation scripts that check the integrity and correctness of the pipeline's input manifest and module outputs.
+
+### Validation Scripts
+
+The `validation` directory contains the following scripts:
+
+- `validate_manifest.py`: Validates the input manifest.
+- `validate_selectvariants_output.sh`: Validates the output of the SelectVariants module.
+- `validate_bgziptabix.py`: Validates the output of the BgzipAndTabix module.
+- `validate_whatshapPhase.py`: Validates the output of the WhatsHapPhase module.
+- `validate_whatshapStats.py`: Validates the output of the WhatsHapStats module.
+- `validate_mergeVcfs.py`: Validates the output of the MergeVcfs module.
+- `validate_bgziptabixII.py`: Validates the output of the BgzipAndTabixII module.
+- `validate_shapeit.py`: Validates the output of the SHAPEIT module.
+- `validate_tabixII.py`: Validates the output of the TabixII module.
+- `validate_ligateChunks.py`: Validates the output of the LigateRegions module.
+- `validate_cohortvcf2zarr.py`: Validates the output of the cohortVcfToZarr module.
+
+### Running the Validation Scripts
+
+#### Interactive approach
+
+To run the validation scripts, navigate to the `validation` directory and execute the desired script. For example:
+
+```bash
+cd validation
+./validate_selectvariants_output.sh
+```
+
+```bash
+cd validation
+python3 validate_ligateChunks.py
+```
+
+#### Non-interactive approach
+
+To run all enabled validation scripts in a single batch, use the `all_in_one_validation.py` script. It prints a checklist report and also saves it to `validation_report.txt`.
+
+```bash
+cd validation
+python3 all_in_one_validation.py
+```
+
+Ensure that the required command-line tools (e.g., bcftools, tabix) are available in your PATH and that the required Python packages (e.g., zarr, PyVCF) are installed.
 
 
 ## Authors and Acknowledgements

diff --git a/validation/all_in_one_validation.py b/validation/all_in_one_validation.py
@@ -0,0 +1,57 @@
+import subprocess
+
+# (script filename, report label) pairs, run in this order by the loop below; entries commented out here are skipped. NOTE(review): confirm the disabled validators are intentionally off.
+validations = [
+    ("validate_manifest.py", "Validate Input Manifest"),
+    ("validate_selectvariants_output.sh", "Validate SelectVariants Output"),
+    ("validate_bgziptabix.py", "Validate Bgzip and Tabix"),
+    ("validate_whatshapPhase.py", "Validate WhatsHap Phase"),
+    #("validate_whatshapStats.py", "Validate WhatsHap Stats"),
+    #("validate_mergeVcfs.py", "Validate MergeVcfs"),
+    #("validate_bgziptabixII.py", "Validate BgzipAndTabixII"),
+    #("validate_shapeit.py", "Validate SHAPEIT"),
+    #("validate_tabixII.py", "Validate TabixII"),
+    #("validate_ligateChunks.py", "Validate LigateRegions"),
+    #("validate_cohortvcf2zarr.py", "Validate cohortVcfToZarr")
+]
+
+# Function to run a script and return the result
+def run_script(script):
+    try:
+        result = subprocess.run(["python", script] if script.endswith(".py") else ["bash", script], capture_output=True, text=True)
+        success = result.returncode == 0
+        details = result.stderr if not success else result.stdout
+        return success, details.strip()
+    except Exception as e:
+        return False, str(e)
+
+# Run all validation scripts and collect results
+results = {}
+for script, description in validations:
+    success, details = run_script(script)
+    results[description] = {"status": success, "details": details}
+
+# Generate checklist report
+def generate_validation_report(results):
+    report = []
+    report.append("Validation Checklist Report")
+    report.append("==============================")
+
+    for step, result in results.items():
+        if result['status']:
+            report.append(f"{step}: \033[92mSuccess\033[0m")
+        else:
+            report.append(f"{step}: \033[91mFailed\033[0m")
+            if 'details' in result:
+                report.append(f"  Details: {result['details']}")
+        report.append("------------------------------")
+
+    return "\n".join(report)
+
+# Generate and print the report
+report = generate_validation_report(results)
+print(report)
+
+# Save the report to a file
+with open("validation_report.txt", "w") as report_file:
+    report_file.write(report)
diff --git a/validation/validate_bgziptabix.py b/validation/validate_bgziptabix.py
@@ -0,0 +1,103 @@
+import os
+import subprocess
+import glob
+
+def check_file_exists(file_path):
+    """Check if the file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+def check_vcf_gz_format(vcf_gz_path):
+    """Check if the .vcf.gz file is correctly formatted using bcftools."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: VCF file {vcf_gz_path} is not correctly formatted. bcftools output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
+        return False
+
+def extract_region(vcf_gz_path):
+    """Extract the region from the first and second entries in the VCF file."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}")
+            return None
+        lines = result.stdout.split('\n')
+        positions = []
+        for line in lines:
+            if not line.startswith('#'):
+                # Split the line into columns and extract the positions of the first two entries
+                columns = line.split('\t')
+                contig = columns[0]
+                position = columns[1]
+                positions.append(position)
+                if len(positions) == 2:
+                    break
+        if len(positions) < 2:
+            print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.")
+            return None
+        region = f"{contig}:{positions[0]}-{positions[1]}"
+        return region
+    except Exception as e:
+        print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}")
+        return None
+
+def check_tbi_association(vcf_gz_path, region):
+    """Check if the .tbi index file is correctly associated with the .vcf.gz file using tabix."""
+    try:
+        result = subprocess.run(['tabix', vcf_gz_path, region], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while checking TBI association. {str(e)}")
+        return False
+
+def validate_bgzip_and_tabix_output(vcf_gz_path):
+    """Validate the output of the BgzipAndTabix module."""
+    if not check_file_exists(vcf_gz_path):
+        return False
+
+    tbi_path = vcf_gz_path + '.tbi'
+    if not check_file_exists(tbi_path):
+        return False
+
+    if not check_vcf_gz_format(vcf_gz_path):
+        return False
+
+    region = extract_region(vcf_gz_path)
+    if not region:
+        return False
+
+    if not check_tbi_association(vcf_gz_path, region):
+        return False
+
+    print(f"Success: The .vcf.gz and .tbi files for {vcf_gz_path} are correctly formatted and valid.")
+    return True
+
+def validate_multiple_files(directory):
+    """Validate multiple .vcf.gz and .tbi files in a given directory."""
+    vcf_gz_files = glob.glob(os.path.join(directory, '*.vcf.gz'))
+
+    all_valid = True
+    for vcf_gz_path in vcf_gz_files:
+        if not validate_bgzip_and_tabix_output(vcf_gz_path):
+            all_valid = False
+
+    if all_valid:
+        print("All files validated successfully.")
+    else:
+        print("Some files failed validation.")
+
+# Define the directory containing the .vcf.gz and .tbi files
+directory = '../vector_hap/results/select_variants/'
+
+validate_multiple_files(directory)
+
diff --git a/validation/validate_bgziptabixII.py b/validation/validate_bgziptabixII.py
@@ -0,0 +1,104 @@
+import os
+import subprocess
+import glob
+
+def check_file_exists(file_path):
+    """Check if the file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+def check_vcf_gz_format(vcf_gz_path):
+    """Check if the .vcf.gz file is correctly formatted using bcftools."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: VCF file {vcf_gz_path} is not correctly formatted. bcftools output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
+        return False
+
+def extract_region(vcf_gz_path):
+    """Extract the region from the first and second entries in the VCF file."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}")
+            return None
+        lines = result.stdout.split('\n')
+        positions = []
+        for line in lines:
+            if not line.startswith('#'):
+                # Split the line into columns and extract the positions of the first two entries
+                columns = line.split('\t')
+                contig = columns[0]
+                position = columns[1]
+                positions.append(position)
+                if len(positions) == 2:
+                    break
+        if len(positions) < 2:
+            print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.")
+            return None
+        region = f"{contig}:{positions[0]}-{positions[1]}"
+        return region
+    except Exception as e:
+        print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}")
+        return None
+
+def check_tbi_association(vcf_gz_path, region):
+    """Check if the .tbi index file is correctly associated with the .vcf.gz file using tabix."""
+    try:
+        result = subprocess.run(['tabix', vcf_gz_path, region], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while checking TBI association. {str(e)}")
+        return False
+
+def validate_bgzip_and_tabix_output(vcf_gz_path):
+    """Validate the output of the BgzipAndTabix module."""
+    if not check_file_exists(vcf_gz_path):
+        return False
+
+    tbi_path = vcf_gz_path + '.tbi'
+    if not check_file_exists(tbi_path):
+        return False
+
+    if not check_vcf_gz_format(vcf_gz_path):
+        return False
+
+    region = extract_region(vcf_gz_path)
+    if not region:
+        return False
+
+    if not check_tbi_association(vcf_gz_path, region):
+        return False
+
+    print(f"Success: The .vcf.gz and .tbi files for {vcf_gz_path} are correctly formatted and valid.")
+    return True
+
+def validate_multiple_files(directory):
+    """Validate multiple .vcf.gz and .tbi files in a given directory."""
+    vcf_gz_files = glob.glob(os.path.join(directory, '*_merged.vcf.gz'))
+
+    all_valid = True
+    for vcf_gz_path in vcf_gz_files:
+        print(f"Validating {vcf_gz_path}...")
+        if not validate_bgzip_and_tabix_output(vcf_gz_path):
+            all_valid = False
+
+    if all_valid:
+        print("All files validated successfully.")
+    else:
+        print("Some files failed validation.")
+
+# Define the directory containing the .vcf.gz and .tbi files
+directory = '../vector_hap/results/cohort_phasing/'
+
+validate_multiple_files(directory)
+
diff --git a/validation/validate_cohortvcf2zarr.py b/validation/validate_cohortvcf2zarr.py
@@ -0,0 +1,52 @@
+import os
+
+def check_directory_exists(directory_path):
+    """Check if a directory exists."""
+    if not os.path.isdir(directory_path):
+        print(f"Error: Directory {directory_path} does not exist.")
+        return False
+    return True
+
+def check_zarr_structure(zarr_path, contig):
+    """Check the structure of the Zarr directory."""
+    expected_folders = [
+        'calldata/GT',
+        'variants/AC', 'variants/AF', 'variants/ALT',
+        'variants/CM', 'variants/POS', 'variants/REF'
+    ]
+    for folder in expected_folders:
+        full_path = os.path.join(zarr_path, folder)
+        if not check_directory_exists(full_path):
+            print(f"Error: Zarr directory {zarr_path} is missing {folder}.")
+            return False
+    return True
+
+def validate_cohort_vcf_to_zarr(contig, zarr_directory):
+    """Validate the output of the cohort VCF to Zarr conversion."""
+    zarr_output = os.path.join(zarr_directory, f"validation_{contig}.zarr", contig)
+
+    if not check_directory_exists(zarr_output):
+        return False
+
+    if not check_zarr_structure(zarr_output, contig):
+        return False
+
+    print(f"Success: The Zarr directory {zarr_output} is correctly formatted and valid.")
+    return True
+
+def validate_multiple_files(contigs, zarr_directory):
+    """Validate multiple files processed by cohort_vcf_to_zarr.py."""
+    for contig in contigs:
+        print(f"Validating contig {contig}...")
+        if validate_cohort_vcf_to_zarr(contig, zarr_directory):
+            print(f"Validation passed for contig {contig}.")
+        else:
+            print(f"Validation failed for contig {contig}.")
+
+# Define the directories
+contigs = ['2R', '2L', '3R', '3L', 'X']
+zarr_directory = '../vector_hap/results/cohort_phasing/ligate/outputs/'
+
+# Run validation on multiple files
+validate_multiple_files(contigs, zarr_directory)
+