From 156876ea3a248100386057f6d5b8ba4d79c4b3a9 Mon Sep 17 00:00:00 2001
From: Edward Lukyamuzi <elukyamuzi2010@gmail.com>
Date: Tue, 16 Jul 2024 13:16:11 -0700
Subject: [PATCH 1/3] add pipeline integrity checks

---
 README.md                                    |  44 ++++++++
 validation/all_in_one_validation.py          |  57 ++++++++++
 validation/validate_bgziptabix.py            | 103 ++++++++++++++++++
 validation/validate_bgziptabixII.py          | 104 +++++++++++++++++++
 validation/validate_cohortvcf2zarr.py        |  52 ++++++++++
 validation/validate_ligateChunks.py          |  99 ++++++++++++++++++
 validation/validate_manifest.py              |  49 +++++++++
 validation/validate_mergedVcfs.py            |  57 ++++++++++
 validation/validate_selectvariants_output.sh |  37 +++++++
 validation/validate_shapeit.py               |  83 +++++++++++++++
 validation/validate_tabixII.py               |  95 +++++++++++++++++
 validation/validate_whatshapPhase.py         |  73 +++++++++++++
 validation/validate_whatshapStats.py         |  97 +++++++++++++++++
 13 files changed, 950 insertions(+)
 create mode 100755 validation/all_in_one_validation.py
 create mode 100755 validation/validate_bgziptabix.py
 create mode 100755 validation/validate_bgziptabixII.py
 create mode 100755 validation/validate_cohortvcf2zarr.py
 create mode 100755 validation/validate_ligateChunks.py
 create mode 100755 validation/validate_manifest.py
 create mode 100755 validation/validate_mergedVcfs.py
 create mode 100755 validation/validate_selectvariants_output.sh
 create mode 100755 validation/validate_shapeit.py
 create mode 100755 validation/validate_tabixII.py
 create mode 100755 validation/validate_whatshapPhase.py
 create mode 100755 validation/validate_whatshapStats.py

diff --git a/README.md b/README.md
index a7b5537..a43e26c 100644
--- a/README.md
+++ b/README.md
@@ -16,6 +16,50 @@ nextflow /path/to/repository/main.nf \
         --project_id XXXXXXXXXX \
         --input_manifest /path/to/input_manifest.tsv
 ```
+## Validation
+
+This repository includes a set of validation scripts to ensure the integrity and correctness of the pipeline.
+
+### Validation Scripts
+
+The `validation` directory contains the following scripts:
+
+- `validate_manifest.py`: Validates the input manifest.
+- `validate_selectvariants_output.sh`: Validates the output of the SelectVariants module.
+- `validate_bgziptabix.py`: Validates the output of the BgzipAndTabix module.
+- `validate_whatshapPhase.py`: Validates the output of the WhatsHapPhase module.
+- `validate_whatshapStats.py`: Validates the output of the WhatsHapStats module.
+- `validate_mergeVcfs.py`: Validates the output of the MergeVcfs module.
+- `validate_bgziptabixii.py`: Validates the output of the BgzipAndTabixII module.
+- `validate_shapeit.py`: Validates the output of the SHAPEIT module.
+- `validate_tabixii.py`: Validates the output of the TabixII module.
+- `validate_ligateChunks.py`: Validates the output of the LigateRegions module.
+- `validate_cohortvcf2zarr.py`: Validates the output of the cohortVcfToZarr module.
+
+### Running the Validation Scripts
+
+####Interactive approach
+To run the validation scripts, navigate to the `validation` directory and execute the desired script. For example:
+
+```bash
+cd validation
+./validate_selectvariants_output.sh
+```
+
+```bash
+cd validation
+python3 validate_ligateregions.py
+```
+
+####Non-interactive approach
+To submit all validation tasks in a single batch, make use of the all_in_one_validation script.
+
+```bash
+cd validation
+python3 all_in_one_validation.py
+```
+
+Ensure that the necessary dependencies (e.g., bcftools, Zarr, pyVCF) are installed and available in your PATH.
 
 
 ## Authors and Acknowledgements
diff --git a/validation/all_in_one_validation.py b/validation/all_in_one_validation.py
new file mode 100755
index 0000000..e53a5a8
--- /dev/null
+++ b/validation/all_in_one_validation.py
@@ -0,0 +1,57 @@
+import subprocess
+
+# List of validation scripts and their descriptions
+validations = [
+    ("validate_manifest.py", "Validate Input Manifest"),
+    ("validate_selectvariants_output.sh", "Validate SelectVariants Output"),
+    ("validate_bgziptabix.py", "Validate Bgzip and Tabix"),
+    ("validate_whatshapPhase.py", "Validate WhatsHap Phase"),
+    #("validate_whatshapStats.py", "Validate WhatsHap Stats"),
+    #("validate_mergeVcfs.py", "Validate MergeVcfs"),
+    #("validate_bgziptabixII.py", "Validate BgzipAndTabixII"),
+    #("validate_shapeit.py", "Validate SHAPEIT"),
+    #("validate_tabixII.py", "Validate TabixII"),
+    #("validate_ligateChunks.py", "Validate LigateRegions"),
+    #("validate_cohortvcf2zarr.py", "Validate cohortVcfToZarr")
+]
+
+# Function to run a script and return the result
+def run_script(script):
+    try:
+        result = subprocess.run(["python", script] if script.endswith(".py") else ["bash", script], capture_output=True, text=True)
+        success = result.returncode == 0
+        details = result.stderr if not success else result.stdout
+        return success, details.strip()
+    except Exception as e:
+        return False, str(e)
+
+# Run all validation scripts and collect results
+results = {}
+for script, description in validations:
+    success, details = run_script(script)
+    results[description] = {"status": success, "details": details}
+
+# Generate checklist report
+def generate_validation_report(results):
+    report = []
+    report.append("Validation Checklist Report")
+    report.append("==============================")
+
+    for step, result in results.items():
+        if result['status']:
+            report.append(f"{step}: \033[92mSuccess\033[0m")
+        else:
+            report.append(f"{step}: \033[91mFailed\033[0m")
+            if 'details' in result:
+                report.append(f"  Details: {result['details']}")
+        report.append("------------------------------")
+
+    return "\n".join(report)
+
+# Generate and print the report
+report = generate_validation_report(results)
+print(report)
+
+# Save the report to a file
+with open("validation_report.txt", "w") as report_file:
+    report_file.write(report)
diff --git a/validation/validate_bgziptabix.py b/validation/validate_bgziptabix.py
new file mode 100755
index 0000000..acc0253
--- /dev/null
+++ b/validation/validate_bgziptabix.py
@@ -0,0 +1,103 @@
+import os
+import subprocess
+import glob
+
+def check_file_exists(file_path):
+    """Check if the file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+def check_vcf_gz_format(vcf_gz_path):
+    """Check if the .vcf.gz file is correctly formatted using bcftools."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: VCF file {vcf_gz_path} is not correctly formatted. bcftools output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
+        return False
+
+def extract_region(vcf_gz_path):
+    """Extract the region from the first and second entries in the VCF file."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}")
+            return None
+        lines = result.stdout.split('\n')
+        positions = []
+        for line in lines:
+            if not line.startswith('#'):
+                # Split the line into columns and extract the positions of the first two entries
+                columns = line.split('\t')
+                contig = columns[0]
+                position = columns[1]
+                positions.append(position)
+                if len(positions) == 2:
+                    break
+        if len(positions) < 2:
+            print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.")
+            return None
+        region = f"{contig}:{positions[0]}-{positions[1]}"
+        return region
+    except Exception as e:
+        print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}")
+        return None
+
+def check_tbi_association(vcf_gz_path, region):
+    """Check if the .tbi index file is correctly associated with the .vcf.gz file using tabix."""
+    try:
+        result = subprocess.run(['tabix', vcf_gz_path, region], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while checking TBI association. {str(e)}")
+        return False
+
+def validate_bgzip_and_tabix_output(vcf_gz_path):
+    """Validate the output of the BgzipAndTabix module."""
+    if not check_file_exists(vcf_gz_path):
+        return False
+
+    tbi_path = vcf_gz_path + '.tbi'
+    if not check_file_exists(tbi_path):
+        return False
+
+    if not check_vcf_gz_format(vcf_gz_path):
+        return False
+
+    region = extract_region(vcf_gz_path)
+    if not region:
+        return False
+
+    if not check_tbi_association(vcf_gz_path, region):
+        return False
+
+    print(f"Success: The .vcf.gz and .tbi files for {vcf_gz_path} are correctly formatted and valid.")
+    return True
+
+def validate_multiple_files(directory):
+    """Validate multiple .vcf.gz and .tbi files in a given directory."""
+    vcf_gz_files = glob.glob(os.path.join(directory, '*.vcf.gz'))
+
+    all_valid = True
+    for vcf_gz_path in vcf_gz_files:
+        if not validate_bgzip_and_tabix_output(vcf_gz_path):
+            all_valid = False
+
+    if all_valid:
+        print("All files validated successfully.")
+    else:
+        print("Some files failed validation.")
+
+# Define the directory containing the .vcf.gz and .tbi files
+directory = '../vector_hap/results/select_variants/'
+
+validate_multiple_files(directory)
+
diff --git a/validation/validate_bgziptabixII.py b/validation/validate_bgziptabixII.py
new file mode 100755
index 0000000..612295b
--- /dev/null
+++ b/validation/validate_bgziptabixII.py
@@ -0,0 +1,104 @@
+import os
+import subprocess
+import glob
+
+def check_file_exists(file_path):
+    """Check if the file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+def check_vcf_gz_format(vcf_gz_path):
+    """Check if the .vcf.gz file is correctly formatted using bcftools."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: VCF file {vcf_gz_path} is not correctly formatted. bcftools output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
+        return False
+
+def extract_region(vcf_gz_path):
+    """Extract the region from the first and second entries in the VCF file."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}")
+            return None
+        lines = result.stdout.split('\n')
+        positions = []
+        for line in lines:
+            if not line.startswith('#'):
+                # Split the line into columns and extract the positions of the first two entries
+                columns = line.split('\t')
+                contig = columns[0]
+                position = columns[1]
+                positions.append(position)
+                if len(positions) == 2:
+                    break
+        if len(positions) < 2:
+            print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.")
+            return None
+        region = f"{contig}:{positions[0]}-{positions[1]}"
+        return region
+    except Exception as e:
+        print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}")
+        return None
+
+def check_tbi_association(vcf_gz_path, region):
+    """Check if the .tbi index file is correctly associated with the .vcf.gz file using tabix."""
+    try:
+        result = subprocess.run(['tabix', vcf_gz_path, region], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while checking TBI association. {str(e)}")
+        return False
+
+def validate_bgzip_and_tabix_output(vcf_gz_path):
+    """Validate the output of the BgzipAndTabix module."""
+    if not check_file_exists(vcf_gz_path):
+        return False
+
+    tbi_path = vcf_gz_path + '.tbi'
+    if not check_file_exists(tbi_path):
+        return False
+
+    if not check_vcf_gz_format(vcf_gz_path):
+        return False
+
+    region = extract_region(vcf_gz_path)
+    if not region:
+        return False
+
+    if not check_tbi_association(vcf_gz_path, region):
+        return False
+
+    print(f"Success: The .vcf.gz and .tbi files for {vcf_gz_path} are correctly formatted and valid.")
+    return True
+
+def validate_multiple_files(directory):
+    """Validate multiple .vcf.gz and .tbi files in a given directory."""
+    vcf_gz_files = glob.glob(os.path.join(directory, '*_merged.vcf.gz'))
+
+    all_valid = True
+    for vcf_gz_path in vcf_gz_files:
+        print(f"Validating {vcf_gz_path}...")
+        if not validate_bgzip_and_tabix_output(vcf_gz_path):
+            all_valid = False
+
+    if all_valid:
+        print("All files validated successfully.")
+    else:
+        print("Some files failed validation.")
+
+# Define the directory containing the .vcf.gz and .tbi files
+directory = '../vector_hap/results/cohort_phasing/'
+
+validate_multiple_files(directory)
+
diff --git a/validation/validate_cohortvcf2zarr.py b/validation/validate_cohortvcf2zarr.py
new file mode 100755
index 0000000..784405d
--- /dev/null
+++ b/validation/validate_cohortvcf2zarr.py
@@ -0,0 +1,52 @@
+import os
+
+def check_directory_exists(directory_path):
+    """Check if a directory exists."""
+    if not os.path.isdir(directory_path):
+        print(f"Error: Directory {directory_path} does not exist.")
+        return False
+    return True
+
+def check_zarr_structure(zarr_path, contig):
+    """Check the structure of the Zarr directory."""
+    expected_folders = [
+        'calldata/GT',
+        'variants/AC', 'variants/AF', 'variants/ALT',
+        'variants/CM', 'variants/POS', 'variants/REF'
+    ]
+    for folder in expected_folders:
+        full_path = os.path.join(zarr_path, folder)
+        if not check_directory_exists(full_path):
+            print(f"Error: Zarr directory {zarr_path} is missing {folder}.")
+            return False
+    return True
+
+def validate_cohort_vcf_to_zarr(contig, zarr_directory):
+    """Validate the output of the cohort VCF to Zarr conversion."""
+    zarr_output = os.path.join(zarr_directory, f"validation_{contig}.zarr", contig)
+
+    if not check_directory_exists(zarr_output):
+        return False
+
+    if not check_zarr_structure(zarr_output, contig):
+        return False
+
+    print(f"Success: The Zarr directory {zarr_output} is correctly formatted and valid.")
+    return True
+
+def validate_multiple_files(contigs, zarr_directory):
+    """Validate multiple files processed by cohort_vcf_to_zarr.py."""
+    for contig in contigs:
+        print(f"Validating contig {contig}...")
+        if validate_cohort_vcf_to_zarr(contig, zarr_directory):
+            print(f"Validation passed for contig {contig}.")
+        else:
+            print(f"Validation failed for contig {contig}.")
+
+# Define the directories
+contigs = ['2R', '2L', '3R', '3L', 'X']
+zarr_directory = '../vector_hap/results/cohort_phasing/ligate/outputs/'
+
+# Run validation on multiple files
+validate_multiple_files(contigs, zarr_directory)
+
diff --git a/validation/validate_ligateChunks.py b/validation/validate_ligateChunks.py
new file mode 100755
index 0000000..a480583
--- /dev/null
+++ b/validation/validate_ligateChunks.py
@@ -0,0 +1,99 @@
+import os
+import glob
+import subprocess
+
+def check_file_exists(file_path):
+    """Check if the file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+def check_vcf_format(vcf_path):
+    """Check if the VCF file is correctly formatted using bcftools."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: VCF file {vcf_path} is not correctly formatted. bcftools output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
+        return False
+
+def list_phased_chunks(contig, phased_vcf_directory):
+    """List all phased chunk files for a given contig."""
+    phased_files_pattern = os.path.join(phased_vcf_directory, f"*{contig}_phased.vcf.gz")
+    phased_files = glob.glob(phased_files_pattern)
+    return phased_files
+
+def extract_chrom_entries_with_grep(vcf_path):
+    """Extract chrom entries from a VCF file using grep."""
+    try:
+        result = subprocess.run(['zgrep', '-v', '^#', vcf_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: Failed to extract chrom entries from {vcf_path}. grep output: {result.stderr}")
+            return None
+        entries = [line.split('\t')[0:2] for line in result.stdout.splitlines()]
+        return entries
+    except Exception as e:
+        print(f"Error: An exception occurred while extracting chrom entries. {str(e)}")
+        return None
+
+def validate_concatenated_file(contig, phased_files, concatenated_file_path):
+    """Validate that all phased chunk files are concatenated into one file."""
+    if not check_file_exists(concatenated_file_path):
+        print(f"Error: Concatenated VCF file {concatenated_file_path} does not exist.")
+        return False
+
+    if not check_vcf_format(concatenated_file_path):
+        return False
+
+    concatenated_entries = extract_chrom_entries_with_grep(concatenated_file_path)
+    if concatenated_entries is None:
+        return False
+    concatenated_set = set(tuple(entry) for entry in concatenated_entries)
+
+    for phased_file in phased_files:
+        phased_entries = extract_chrom_entries_with_grep(phased_file)
+        if phased_entries is None:
+            return False
+
+        for entry in phased_entries:
+            if tuple(entry) not in concatenated_set:
+                print(f"Error: Chrom entry {entry} from {phased_file} is not found in {concatenated_file_path}.")
+                return False
+
+    return True
+
+def validate_ligate_regions(vcf_directory, phased_vcf_directory):
+    """Validate that all phased chunks per contig are concatenated into one file per contig."""
+    contigs = ['2R', '2L', '3R', '3L', 'X']
+    for contig in contigs:
+        concatenated_file_pattern = os.path.join(vcf_directory, f"*{contig}_phased.vcf.gz")
+        concatenated_files = glob.glob(concatenated_file_pattern)
+
+        if not concatenated_files:
+            print(f"Error: No concatenated VCF file found for contig {contig}.")
+            continue
+
+        concatenated_file_path = concatenated_files[0]  # Use the first match
+        phased_files = list_phased_chunks(contig, phased_vcf_directory)
+        
+        if not phased_files:
+            print(f"Error: No phased chunk files found for contig {contig}.")
+            continue
+        
+        print(f"Validating {contig} with concatenated VCF {concatenated_file_path}...")
+        if validate_concatenated_file(contig, phased_files, concatenated_file_path):
+            print(f"Validation passed for contig {contig}.")
+        else:
+            print(f"Validation failed for contig {contig}.")
+
+# Define the directories
+vcf_directory = '../vector_hap/results/cohort_phasing/ligate/'
+phased_vcf_directory = '../vector_hap/results/cohort_phasing/phased_chunks/'
+
+# Run validation
+validate_ligate_regions(vcf_directory, phased_vcf_directory)
+
diff --git a/validation/validate_manifest.py b/validation/validate_manifest.py
new file mode 100755
index 0000000..6080cd2
--- /dev/null
+++ b/validation/validate_manifest.py
@@ -0,0 +1,49 @@
+import csv
+import os
+
+# Path to the input manifest file
+input_manifest = '../vector_hap/wgs_snp_testdata.tsv'
+
+# Required columns
+required_columns = ['sample_id', 'bam_path', 'bai_path', 'vcf_path', 'zarr_path']
+
+def validate_manifest(manifest_path):
+    """
+    Validate the input manifest file for required columns and non-empty values.
+    
+    Args:
+    manifest_path (str): Path to the manifest file.
+    
+    Returns:
+    bool: True if validation passes, False otherwise.
+    """
+    try:
+        with open(manifest_path, 'r') as file:
+            reader = csv.DictReader(file, delimiter='\t')
+            
+            # Check if all required columns are present
+            if not all(column in reader.fieldnames for column in required_columns):
+                print(f"Error: Manifest file is missing required columns. Required columns are: {required_columns}")
+                return False
+            
+            # Check each row for non-empty values
+            for row in reader:
+                for column in required_columns:
+                    if not row[column].strip():
+                        print(f"Error: Missing value for column '{column}' in row: {row}")
+                        return False
+                        
+            print("Success: Manifest file is correctly formatted and contains all required fields.")
+            return True
+    
+    except Exception as e:
+        print(f"Error: An exception occurred while validating the manifest file. {str(e)}")
+        return False
+
+# Run the validation
+if __name__ == "__main__":
+    if validate_manifest(input_manifest):
+        print("Manifest file validation passed.")
+    else:
+        print("Manifest file validation failed.")
+
diff --git a/validation/validate_mergedVcfs.py b/validation/validate_mergedVcfs.py
new file mode 100755
index 0000000..e7a415c
--- /dev/null
+++ b/validation/validate_mergedVcfs.py
@@ -0,0 +1,57 @@
+
+import os
+import subprocess
+import glob
+
+def check_file_exists(file_path):
+    """Check if the file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+
+def check_mergedvcf_format(vcf_path):
+    """Check if the _merged.vcf.gz file is correctly formatted using bcftools."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: Merged VCF file {vcf_path} is not correctly formatted. bcftools output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
+        return False
+
+def validate_mergedvcf_output(vcf_path):
+    """Validate the output of the MergedVCF module."""
+    if not check_file_exists(vcf_path):
+        return False
+
+    if not check_mergedvcf_format(vcf_path):
+        return False
+
+    print(f"Success: The Merged VCF file {vcf_path} is correctly formatted and valid.")
+    return True
+
+def validate_multiple_files(vcf_directory):
+    """Validate multiple _merged.vcf.gz files in the given directory."""
+    vcf_files = glob.glob(os.path.join(vcf_directory, "*_merged.vcf"))
+
+    all_valid = True
+    for vcf_path in vcf_files:
+        print(f"Validating {vcf_path}...")
+        if not validate_mergedvcf_output(vcf_path):
+            all_valid = False
+
+    if all_valid:
+        print("All files validated successfully.")
+    else:
+        print("Some files failed validation.")
+
+# Define the directory containing the merged VCF files
+vcf_directory = '../vector_hap/results/cohort_phasing/'
+
+# Run validation on multiple files
+validate_multiple_files(vcf_directory)
+
diff --git a/validation/validate_selectvariants_output.sh b/validation/validate_selectvariants_output.sh
new file mode 100755
index 0000000..c57bee0
--- /dev/null
+++ b/validation/validate_selectvariants_output.sh
@@ -0,0 +1,37 @@
+#!/bin/bash
+
+# Function to validate a single VCF file
+validate_vcf() {
+    local vcf_path=$1
+    if bcftools view "$vcf_path" &> /dev/null; then
+        echo "VCF file $vcf_path is valid."
+        return 0
+    else
+        echo "VCF file $vcf_path is not valid."
+        return 1
+    fi
+}
+
+# Directory containing subset VCF files
+VCF_DIR="../vector_hap/results/select_variants"
+
+# Initialize all VCF files in the directory
+VCF_FILES=("$VCF_DIR"/*.vcf)
+
+# Run validation for each VCF file
+all_valid=true
+for vcf_path in "${VCF_FILES[@]}"; do
+    validate_vcf "$vcf_path"
+    if [[ $? -ne 0 ]]; then
+        all_valid=false
+    fi
+done
+
+# Report overall validation result
+if $all_valid; then
+    echo "All subset VCF files are valid."
+else
+    echo "Some subset VCF files failed validation."
+    exit 1
+fi
+
diff --git a/validation/validate_shapeit.py b/validation/validate_shapeit.py
new file mode 100755
index 0000000..e084850
--- /dev/null
+++ b/validation/validate_shapeit.py
@@ -0,0 +1,83 @@
+import os
+import glob
+import subprocess
+
+def check_file_exists(file_path):
+    """Check if the file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+def check_vcf_format(vcf_path):
+    """Check if the VCF file is correctly formatted using bcftools."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: VCF file {vcf_path} is not correctly formatted. bcftools output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
+        return False
+
+def count_intervals(interval_list_path):
+    """Count the number of intervals in the interval list file."""
+    try:
+        with open(interval_list_path, 'r') as file:
+            intervals = file.readlines()
+        return len(intervals)
+    except Exception as e:
+        print(f"Error: An exception occurred while reading interval list file. {str(e)}")
+        return 0
+
+def validate_phased_chunks(contig, interval_count, phased_vcf_directory):
+    """Validate that the number of phased chunk files matches the number of intervals and their formats."""
+    phased_files_pattern = os.path.join(phased_vcf_directory, f"*{contig}_phased.vcf.gz")
+    phased_files = glob.glob(phased_files_pattern)
+    
+    if len(phased_files) != interval_count:
+        print(f"Error: Number of phased chunk files ({len(phased_files)}) does not match the number of intervals ({interval_count}) for contig {contig}.")
+        return False
+    
+    for phased_file in phased_files:
+        if not check_vcf_format(phased_file):
+            return False
+    
+    return True
+
+def validate_shapeit_phasing(vcf_directory, interval_directory, phased_vcf_directory):
+    """Validate that each contig's merged VCF file has been phased into the correct number of chunks."""
+    contigs = ['2R', '2L', '3R', '3L', 'X']
+    for contig in contigs:
+        merged_vcf_pattern = os.path.join(vcf_directory, f"*{contig}_merged.vcf.gz")
+        merged_vcf_files = glob.glob(merged_vcf_pattern)
+        
+        if not merged_vcf_files:
+            print(f"Error: No merged VCF file found for contig {contig}.")
+            continue
+        
+        merged_vcf_path = merged_vcf_files[0]  # Use the first match
+        interval_list_path = os.path.join(interval_directory, f"intervals_gamb_colu_arab_{contig}_200000_40000.txt")
+        
+        if not check_file_exists(merged_vcf_path):
+            continue
+        
+        interval_count = count_intervals(interval_list_path)
+        if interval_count == 0:
+            continue
+        
+        print(f"Validating {contig} with merged VCF {merged_vcf_path}...")
+        if validate_phased_chunks(contig, interval_count, phased_vcf_directory):
+            print(f"Validation passed for contig {contig}.")
+        else:
+            print(f"Validation failed for contig {contig}.")
+
+# Define the directories
+vcf_directory = '../vector_hap/results/cohort_phasing/'
+interval_directory = '../vector_hap/resources/'
+phased_vcf_directory = '../vector_hap/results/cohort_phasing/phased_chunks/'
+
+# Run validation
+validate_shapeit_phasing(vcf_directory, interval_directory, phased_vcf_directory)
+
diff --git a/validation/validate_tabixII.py b/validation/validate_tabixII.py
new file mode 100755
index 0000000..a01f4b7
--- /dev/null
+++ b/validation/validate_tabixII.py
@@ -0,0 +1,95 @@
+import os
+import subprocess
+import glob
+
+def check_file_exists(file_path):
+    """Check if the file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+def extract_region(vcf_gz_path):
+    """Extract the region from the first and second entries in the VCF file."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}")
+            return None
+        lines = result.stdout.split('\n')
+        positions = []
+        for line in lines:
+            if not line.startswith('#'):
+                # Split the line into columns and extract the positions of the first two entries
+                columns = line.split('\t')
+                contig = columns[0]
+                position = columns[1]
+                positions.append(position)
+                if len(positions) == 2:
+                    break
+        if len(positions) < 2:
+            print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.")
+            return None
+        region = f"{contig}:{positions[0]}-{positions[1]}"
+        return region
+    except Exception as e:
+        print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}")
+        return None
+
+def check_tbi_association(vcf_gz_path, region):
+    """Check if the .tbi index file is correctly associated with the .vcf.gz file using tabix."""
+    try:
+        result = subprocess.run(['tabix', vcf_gz_path, region], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while checking TBI association. {str(e)}")
+        return False
+
+def validate_tabix_output(vcf_gz_path):
+    """Validate the output of the TabixII module."""
+    if not check_file_exists(vcf_gz_path):
+        return False
+
+    tbi_path = vcf_gz_path + '.tbi'
+    if not check_file_exists(tbi_path):
+        return False
+
+    region = extract_region(vcf_gz_path)
+    if not region:
+        return False
+
+    if not check_tbi_association(vcf_gz_path, region):
+        return False
+
+    print(f"Success: The .tbi file for {vcf_gz_path} is correctly formatted and valid.")
+    return True
+
+def validate_multiple_files(directory):
+    """Validate multiple .tbi files."""
+    contigs = ['2R', '2L', '3R', '3L', 'X']
+    for contig in contigs:
+        #merged_vcf_pattern = os.path.join(vcf_directory, f"*{contig}_merged.vcf.gz")
+        #merged_vcf_files = glob.glob(merged_vcf_pattern)
+        #vcf_gz_files = glob.glob(os.path.join(directory, '*phased.vcf.gz'))
+        vcf_gz_files_pattern = os.path.join(directory, f"*{contig}_phased.vcf.gz")
+        vcf_gz_files = glob.glob(vcf_gz_files_pattern)
+
+        all_valid = True
+        for vcf_gz_path in vcf_gz_files:
+            print(f"Validating {contig}...")
+            if not validate_tabix_output(vcf_gz_path):
+                all_valid = False
+
+        if all_valid:
+            print("All files validated successfully.")
+        else:
+            print("Some files failed validation.")
+
+# Define the directory containing the .vcf.gz and .tbi files
+directory = '../vector_hap/results/cohort_phasing/phased_chunks/'
+
+validate_multiple_files(directory)
+
diff --git a/validation/validate_whatshapPhase.py b/validation/validate_whatshapPhase.py
new file mode 100755
index 0000000..752348c
--- /dev/null
+++ b/validation/validate_whatshapPhase.py
@@ -0,0 +1,73 @@
+import os
+import subprocess
+import glob
+import vcf
+
+def check_file_exists(file_path):
+    """Check if a file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+def check_vcf_format(vcf_path):
+    """Check if a VCF file is correctly formatted using bcftools."""
+    try:
+        result = subprocess.run(['bcftools', 'view', vcf_path], capture_output=True, text=True)
+        if result.returncode != 0:
+            print(f"Error: VCF file {vcf_path} is not correctly formatted. bcftools output: {result.stderr}")
+            return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
+        return False
+
+def check_phased_genotype_consistency(vcf_path):
+    """Check if phased genotypes in a VCF file are consistent."""
+    try:
+        vcf_reader = vcf.Reader(filename=vcf_path)
+        for record in vcf_reader:
+            for sample in record.samples:
+                if sample.phased:
+                    gt = sample['GT']
+                    if "|" not in gt:
+                        print(f"Error: Found unphased genotype in phased sample in file {vcf_path} at position {record.POS}.")
+                        return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while checking phased genotype consistency. {str(e)}")
+        return False
+
+def validate_whats_hap_phase_output(vcf_path):
+    """Validate the output of the WhatsHapPhase module."""
+    if not check_file_exists(vcf_path):
+        return False
+
+    if not check_vcf_format(vcf_path):
+        return False
+
+    if not check_phased_genotype_consistency(vcf_path):
+        return False
+
+    print(f"Success: The phased VCF file {vcf_path} is correctly formatted and contains consistent phased genotypes.")
+    return True
+
+def validate_multiple_files(directory):
+    """Validate multiple VCF files in a given directory."""
+    vcf_files = glob.glob(os.path.join(directory, '*.vcf.gz'))
+
+    all_valid = True
+    for vcf_path in vcf_files:
+        if not validate_whats_hap_phase_output(vcf_path):
+            all_valid = False
+
+    if all_valid:
+        print("All files validated successfully.")
+    else:
+        print("Some files failed validation.")
+
+# Define the directory containing the VCF files
+directory = '../vector_hap/results/sample_phasing/'
+
+validate_multiple_files(directory)
+
diff --git a/validation/validate_whatshapStats.py b/validation/validate_whatshapStats.py
new file mode 100755
index 0000000..8858384
--- /dev/null
+++ b/validation/validate_whatshapStats.py
@@ -0,0 +1,97 @@
+import os
+import glob
+
+def check_file_exists(file_path):
+    """Check if a file exists."""
+    if not os.path.isfile(file_path):
+        print(f"Error: File {file_path} does not exist.")
+        return False
+    return True
+
+def check_stats_format(stats_path):
+    """Check the format and content of the WhatsHap stats file."""
+    try:
+        with open(stats_path, 'r') as file:
+            header = file.readline().strip().split('\t')
+            expected_header = ['#sample', 'chromosome', 'file_name', 'variants', 'phased', 'unphased', 'singletons', 'blocks', 'variant_per_block_median', 'variant_per_block_avg', 'variant_per_block_min', 'variant_per_block_max', 'variant_per_block_sum', 'bp_per_block_median', 'bp_per_block_avg', 'bp_per_block_min', 'bp_per_block_max', 'bp_per_block_sum', 'heterozygous_variants', 'heterozygous_snvs', 'phased_snvs', 'block_n50']
+            if header != expected_header:
+                print(f"Error: Stats file {stats_path} has an incorrect header: {header}")
+                return False
+
+            for line in file:
+                columns = line.strip().split('\t')
+                if len(columns) != 22:
+                    print(f"Error: Stats file {stats_path} has an incorrect number of columns: {columns}")
+                    return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating stats file format. {str(e)}")
+        return False
+
+def check_gtf_format(gtf_path):
+    """Check if a GTF file is correctly formatted."""
+    try:
+        with open(gtf_path, 'r') as file:
+            for line in file:
+                columns = line.strip().split('\t')
+                if len(columns) != 9:
+                    print(f"Error: GTF file {gtf_path} has an incorrect number of columns: {columns}")
+                    return False
+                if columns[2] != "exon":
+                    print(f"Error: GTF file {gtf_path} contains a non-exon entry: {columns}")
+                    return False
+        return True
+    except Exception as e:
+        print(f"Error: An exception occurred while validating GTF format. {str(e)}")
+        return False
+
+def validate_whats_hap_stats_output(vcf_gz_path, tbi_path, stats_path, gtf_path):
+    """Validate the output of the WhatsHapStats module."""
+    if not check_file_exists(vcf_gz_path):
+        return False
+
+    if not check_file_exists(tbi_path):
+        return False
+
+    if not check_file_exists(stats_path):
+        return False
+
+    if not check_file_exists(gtf_path):
+        return False
+
+    if not check_stats_format(stats_path):
+        return False
+
+    if not check_gtf_format(gtf_path):
+        return False
+
+    print(f"Success: The phased VCF file {vcf_gz_path}, its index, the stats file {stats_path}, and the GTF file {gtf_path} are correctly formatted and valid.")
+    return True
+
+def validate_multiple_files(vcf_directory, stats_directory):
+    """Validate multiple VCF files and their associated files in the given directories."""
+    vcf_files = glob.glob(os.path.join(vcf_directory, "*.vcf.gz"))
+    for vcf_file in vcf_files:
+        base_name = os.path.basename(vcf_file).replace('.subset.vcf.phased.vcf.gz', '')
+        stats_file_name = f"{base_name}.stats.tsv"
+        gtf_file_name = f"{base_name}.blocks.gtf"
+
+        vcf_path = vcf_file
+        tbi_path = f"{vcf_path}.tbi"
+        stats_path = os.path.join(stats_directory, stats_file_name)
+        gtf_path = os.path.join(stats_directory, gtf_file_name)
+
+        print(f"Validating {vcf_path} with index {tbi_path}, stats {stats_path}, and GTF {gtf_path}...")
+        if validate_whats_hap_stats_output(vcf_path, tbi_path, stats_path, gtf_path):
+            print(f"Validation passed for {vcf_path}.")
+        else:
+            print(f"Validation failed for {vcf_path}.")
+
+# Define the directories containing the VCF files, stats files, and GTF files
+vcf_directory = '../vector_hap/results/sample_phasing/'
+stats_directory = '../vector_hap/results/phasing_stats/'
+#gtf_directory = '../vector_hap/results/phasing_gtf/'
+
+# Run validation on multiple files
+validate_multiple_files(vcf_directory, stats_directory)
+

From 7b40de6dec6c6fa97f759996aa8e15d47c88d2b3 Mon Sep 17 00:00:00 2001
From: Edward Lukyamuzi <elukyamuzi2010@gmail.com>
Date: Tue, 16 Jul 2024 13:21:37 -0700
Subject: [PATCH 2/3] add pipeline integrity checks

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index a43e26c..2a428df 100644
--- a/README.md
+++ b/README.md
@@ -59,7 +59,7 @@ cd validation
 python3 all_in_one_validation.py
 ```
 
-Ensure that the necessary dependencies (e.g., bcftools, Zarr, pyVCF) are installed and available in your PATH.
+Ensure that the necessary dependencies (e.g., bcftools, zarr, pyVCF) are installed and available in your PATH.
 
 
 ## Authors and Acknowledgements

From f82a87d8dd8fcd8073cbbf0eb84c04aaa9312c08 Mon Sep 17 00:00:00 2001
From: eddUG <44267477+eddUG@users.noreply.github.com>
Date: Tue, 16 Jul 2024 23:29:09 +0300
Subject: [PATCH 3/3] Update README.md

---
 README.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/README.md b/README.md
index 2a428df..36c2c37 100644
--- a/README.md
+++ b/README.md
@@ -38,7 +38,8 @@ The `validation` directory contains the following scripts:
 
 ### Running the Validation Scripts
 
-####Interactive approach
+#### Interactive approach
+
 To run the validation scripts, navigate to the `validation` directory and execute the desired script. For example:
 
 ```bash
@@ -51,7 +52,8 @@ cd validation
 python3 validate_ligateregions.py
 ```
 
-####Non-interactive approach
+#### Non-interactive approach
+
 To submit all validation tasks in a single batch, make use of the all_in_one_validation script.
 
 ```bash