From 156876ea3a248100386057f6d5b8ba4d79c4b3a9 Mon Sep 17 00:00:00 2001 From: Edward Lukyamuzi Date: Tue, 16 Jul 2024 13:16:11 -0700 Subject: [PATCH 1/3] add pipeline integrity checks --- README.md | 44 ++++++++ validation/all_in_one_validation.py | 57 ++++++++++ validation/validate_bgziptabix.py | 103 ++++++++++++++++++ validation/validate_bgziptabixII.py | 104 +++++++++++++++++++ validation/validate_cohortvcf2zarr.py | 52 ++++++++++ validation/validate_ligateChunks.py | 99 ++++++++++++++++++ validation/validate_manifest.py | 49 +++++++++ validation/validate_mergedVcfs.py | 57 ++++++++++ validation/validate_selectvariants_output.sh | 37 +++++++ validation/validate_shapeit.py | 83 +++++++++++++++ validation/validate_tabixII.py | 95 +++++++++++++++++ validation/validate_whatshapPhase.py | 73 +++++++++++++ validation/validate_whatshapStats.py | 97 +++++++++++++++++ 13 files changed, 950 insertions(+) create mode 100755 validation/all_in_one_validation.py create mode 100755 validation/validate_bgziptabix.py create mode 100755 validation/validate_bgziptabixII.py create mode 100755 validation/validate_cohortvcf2zarr.py create mode 100755 validation/validate_ligateChunks.py create mode 100755 validation/validate_manifest.py create mode 100755 validation/validate_mergedVcfs.py create mode 100755 validation/validate_selectvariants_output.sh create mode 100755 validation/validate_shapeit.py create mode 100755 validation/validate_tabixII.py create mode 100755 validation/validate_whatshapPhase.py create mode 100755 validation/validate_whatshapStats.py diff --git a/README.md b/README.md index a7b5537..a43e26c 100644 --- a/README.md +++ b/README.md @@ -16,6 +16,50 @@ nextflow /path/to/repository/main.nf \ --project_id XXXXXXXXXX \ --input_manifest /path/to/input_manifest.tsv ``` +## Validation + +This repository includes a set of validation scripts to ensure the integrity and correctness of the pipeline. + +### Validation Scripts + +The `validation` directory contains the following scripts: + +- `validate_manifest.py`: Validates the input manifest. +- `validate_selectvariants_output.sh`: Validates the output of the SelectVariants module. +- `validate_bgziptabix.py`: Validates the output of the BgzipAndTabix module. +- `validate_whatshapPhase.py`: Validates the output of the WhatsHapPhase module. +- `validate_whatshapStats.py`: Validates the output of the WhatsHapStats module. +- `validate_mergeVcfs.py`: Validates the output of the MergeVcfs module. +- `validate_bgziptabixii.py`: Validates the output of the BgzipAndTabixII module. +- `validate_shapeit.py`: Validates the output of the SHAPEIT module. +- `validate_tabixii.py`: Validates the output of the TabixII module. +- `validate_ligateChunks.py`: Validates the output of the LigateRegions module. +- `validate_cohortvcf2zarr.py`: Validates the output of the cohortVcfToZarr module. + +### Running the Validation Scripts + +####Interactive approach +To run the validation scripts, navigate to the `validation` directory and execute the desired script. For example: + +```bash +cd validation +./validate_selectvariants_output.sh +``` + +```bash +cd validation +python3 validate_ligateregions.py +``` + +####Non-interactive approach +To submit all validation tasks in a single batch, make use of the all_in_one_validation script. + +```bash +cd validation +python3 all_in_one_validation.py +``` + +Ensure that the necessary dependencies (e.g., bcftools, Zarr, pyVCF) are installed and available in your PATH. ## Authors and Acknowledgements diff --git a/validation/all_in_one_validation.py b/validation/all_in_one_validation.py new file mode 100755 index 0000000..e53a5a8 --- /dev/null +++ b/validation/all_in_one_validation.py @@ -0,0 +1,57 @@ +import subprocess + +# List of validation scripts and their descriptions +validations = [ + ("validate_manifest.py", "Validate Input Manifest"), + ("validate_selectvariants_output.sh", "Validate SelectVariants Output"), + ("validate_bgziptabix.py", "Validate Bgzip and Tabix"), + ("validate_whatshapPhase.py", "Validate WhatsHap Phase"), + #("validate_whatshapStats.py", "Validate WhatsHap Stats"), + #("validate_mergeVcfs.py", "Validate MergeVcfs"), + #("validate_bgziptabixII.py", "Validate BgzipAndTabixII"), + #("validate_shapeit.py", "Validate SHAPEIT"), + #("validate_tabixII.py", "Validate TabixII"), + #("validate_ligateChunks.py", "Validate LigateRegions"), + #("validate_cohortvcf2zarr.py", "Validate cohortVcfToZarr") +] + +# Function to run a script and return the result +def run_script(script): + try: + result = subprocess.run(["python", script] if script.endswith(".py") else ["bash", script], capture_output=True, text=True) + success = result.returncode == 0 + details = result.stderr if not success else result.stdout + return success, details.strip() + except Exception as e: + return False, str(e) + +# Run all validation scripts and collect results +results = {} +for script, description in validations: + success, details = run_script(script) + results[description] = {"status": success, "details": details} + +# Generate checklist report +def generate_validation_report(results): + report = [] + report.append("Validation Checklist Report") + report.append("==============================") + + for step, result in results.items(): + if result['status']: + report.append(f"{step}: \033[92mSuccess\033[0m") + else: + report.append(f"{step}: \033[91mFailed\033[0m") + if 'details' in result: + report.append(f" Details: {result['details']}") + report.append("------------------------------") + + return "\n".join(report) + +# Generate and print the report +report = generate_validation_report(results) +print(report) + +# Save the report to a file +with open("validation_report.txt", "w") as report_file: + report_file.write(report) diff --git a/validation/validate_bgziptabix.py b/validation/validate_bgziptabix.py new file mode 100755 index 0000000..acc0253 --- /dev/null +++ b/validation/validate_bgziptabix.py @@ -0,0 +1,103 @@ +import os +import subprocess +import glob + +def check_file_exists(file_path): + """Check if the file exists.""" + if not os.path.isfile(file_path): + print(f"Error: File {file_path} does not exist.") + return False + return True + +def check_vcf_gz_format(vcf_gz_path): + """Check if the .vcf.gz file is correctly formatted using bcftools.""" + try: + result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: VCF file {vcf_gz_path} is not correctly formatted. bcftools output: {result.stderr}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while validating VCF format. {str(e)}") + return False + +def extract_region(vcf_gz_path): + """Extract the region from the first and second entries in the VCF file.""" + try: + result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}") + return None + lines = result.stdout.split('\n') + positions = [] + for line in lines: + if not line.startswith('#'): + # Split the line into columns and extract the positions of the first two entries + columns = line.split('\t') + contig = columns[0] + position = columns[1] + positions.append(position) + if len(positions) == 2: + break + if len(positions) < 2: + print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.") + return None + region = f"{contig}:{positions[0]}-{positions[1]}" + return region + except Exception as e: + print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}") + return None + +def check_tbi_association(vcf_gz_path, region): + """Check if the .tbi index file is correctly associated with the .vcf.gz file using tabix.""" + try: + result = subprocess.run(['tabix', vcf_gz_path, region], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {result.stderr}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while checking TBI association. {str(e)}") + return False + +def validate_bgzip_and_tabix_output(vcf_gz_path): + """Validate the output of the BgzipAndTabix module.""" + if not check_file_exists(vcf_gz_path): + return False + + tbi_path = vcf_gz_path + '.tbi' + if not check_file_exists(tbi_path): + return False + + if not check_vcf_gz_format(vcf_gz_path): + return False + + region = extract_region(vcf_gz_path) + if not region: + return False + + if not check_tbi_association(vcf_gz_path, region): + return False + + print(f"Success: The .vcf.gz and .tbi files for {vcf_gz_path} are correctly formatted and valid.") + return True + +def validate_multiple_files(directory): + """Validate multiple .vcf.gz and .tbi files in a given directory.""" + vcf_gz_files = glob.glob(os.path.join(directory, '*.vcf.gz')) + + all_valid = True + for vcf_gz_path in vcf_gz_files: + if not validate_bgzip_and_tabix_output(vcf_gz_path): + all_valid = False + + if all_valid: + print("All files validated successfully.") + else: + print("Some files failed validation.") + +# Define the directory containing the .vcf.gz and .tbi files +directory = '../vector_hap/results/select_variants/' + +validate_multiple_files(directory) + diff --git a/validation/validate_bgziptabixII.py b/validation/validate_bgziptabixII.py new file mode 100755 index 0000000..612295b --- /dev/null +++ b/validation/validate_bgziptabixII.py @@ -0,0 +1,104 @@ +import os +import subprocess +import glob + +def check_file_exists(file_path): + """Check if the file exists.""" + if not os.path.isfile(file_path): + print(f"Error: File {file_path} does not exist.") + return False + return True + +def check_vcf_gz_format(vcf_gz_path): + """Check if the .vcf.gz file is correctly formatted using bcftools.""" + try: + result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: VCF file {vcf_gz_path} is not correctly formatted. bcftools output: {result.stderr}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while validating VCF format. {str(e)}") + return False + +def extract_region(vcf_gz_path): + """Extract the region from the first and second entries in the VCF file.""" + try: + result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}") + return None + lines = result.stdout.split('\n') + positions = [] + for line in lines: + if not line.startswith('#'): + # Split the line into columns and extract the positions of the first two entries + columns = line.split('\t') + contig = columns[0] + position = columns[1] + positions.append(position) + if len(positions) == 2: + break + if len(positions) < 2: + print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.") + return None + region = f"{contig}:{positions[0]}-{positions[1]}" + return region + except Exception as e: + print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}") + return None + +def check_tbi_association(vcf_gz_path, region): + """Check if the .tbi index file is correctly associated with the .vcf.gz file using tabix.""" + try: + result = subprocess.run(['tabix', vcf_gz_path, region], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {result.stderr}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while checking TBI association. {str(e)}") + return False + +def validate_bgzip_and_tabix_output(vcf_gz_path): + """Validate the output of the BgzipAndTabix module.""" + if not check_file_exists(vcf_gz_path): + return False + + tbi_path = vcf_gz_path + '.tbi' + if not check_file_exists(tbi_path): + return False + + if not check_vcf_gz_format(vcf_gz_path): + return False + + region = extract_region(vcf_gz_path) + if not region: + return False + + if not check_tbi_association(vcf_gz_path, region): + return False + + print(f"Success: The .vcf.gz and .tbi files for {vcf_gz_path} are correctly formatted and valid.") + return True + +def validate_multiple_files(directory): + """Validate multiple .vcf.gz and .tbi files in a given directory.""" + vcf_gz_files = glob.glob(os.path.join(directory, '*_merged.vcf.gz')) + + all_valid = True + for vcf_gz_path in vcf_gz_files: + print(f"Validating {vcf_gz_path}...") + if not validate_bgzip_and_tabix_output(vcf_gz_path): + all_valid = False + + if all_valid: + print("All files validated successfully.") + else: + print("Some files failed validation.") + +# Define the directory containing the .vcf.gz and .tbi files +directory = '../vector_hap/results/cohort_phasing/' + +validate_multiple_files(directory) + diff --git a/validation/validate_cohortvcf2zarr.py b/validation/validate_cohortvcf2zarr.py new file mode 100755 index 0000000..784405d --- /dev/null +++ b/validation/validate_cohortvcf2zarr.py @@ -0,0 +1,52 @@ +import os + +def check_directory_exists(directory_path): + """Check if a directory exists.""" + if not os.path.isdir(directory_path): + print(f"Error: Directory {directory_path} does not exist.") + return False + return True + +def check_zarr_structure(zarr_path, contig): + """Check the structure of the Zarr directory.""" + expected_folders = [ + 'calldata/GT', + 'variants/AC', 'variants/AF', 'variants/ALT', + 'variants/CM', 'variants/POS', 'variants/REF' + ] + for folder in expected_folders: + full_path = os.path.join(zarr_path, folder) + if not check_directory_exists(full_path): + print(f"Error: Zarr directory {zarr_path} is missing {folder}.") + return False + return True + +def validate_cohort_vcf_to_zarr(contig, zarr_directory): + """Validate the output of the cohort VCF to Zarr conversion.""" + zarr_output = os.path.join(zarr_directory, f"validation_{contig}.zarr", contig) + + if not check_directory_exists(zarr_output): + return False + + if not check_zarr_structure(zarr_output, contig): + return False + + print(f"Success: The Zarr directory {zarr_output} is correctly formatted and valid.") + return True + +def validate_multiple_files(contigs, zarr_directory): + """Validate multiple files processed by cohort_vcf_to_zarr.py.""" + for contig in contigs: + print(f"Validating contig {contig}...") + if validate_cohort_vcf_to_zarr(contig, zarr_directory): + print(f"Validation passed for contig {contig}.") + else: + print(f"Validation failed for contig {contig}.") + +# Define the directories +contigs = ['2R', '2L', '3R', '3L', 'X'] +zarr_directory = '../vector_hap/results/cohort_phasing/ligate/outputs/' + +# Run validation on multiple files +validate_multiple_files(contigs, zarr_directory) + diff --git a/validation/validate_ligateChunks.py b/validation/validate_ligateChunks.py new file mode 100755 index 0000000..a480583 --- /dev/null +++ b/validation/validate_ligateChunks.py @@ -0,0 +1,99 @@ +import os +import glob +import subprocess + +def check_file_exists(file_path): + """Check if the file exists.""" + if not os.path.isfile(file_path): + print(f"Error: File {file_path} does not exist.") + return False + return True + +def check_vcf_format(vcf_path): + """Check if the VCF file is correctly formatted using bcftools.""" + try: + result = subprocess.run(['bcftools', 'view', vcf_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: VCF file {vcf_path} is not correctly formatted. bcftools output: {result.stderr}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while validating VCF format. {str(e)}") + return False + +def list_phased_chunks(contig, phased_vcf_directory): + """List all phased chunk files for a given contig.""" + phased_files_pattern = os.path.join(phased_vcf_directory, f"*{contig}_phased.vcf.gz") + phased_files = glob.glob(phased_files_pattern) + return phased_files + +def extract_chrom_entries_with_grep(vcf_path): + """Extract chrom entries from a VCF file using grep.""" + try: + result = subprocess.run(['zgrep', '-v', '^#', vcf_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: Failed to extract chrom entries from {vcf_path}. grep output: {result.stderr}") + return None + entries = [line.split('\t')[0:2] for line in result.stdout.splitlines()] + return entries + except Exception as e: + print(f"Error: An exception occurred while extracting chrom entries. {str(e)}") + return None + +def validate_concatenated_file(contig, phased_files, concatenated_file_path): + """Validate that all phased chunk files are concatenated into one file.""" + if not check_file_exists(concatenated_file_path): + print(f"Error: Concatenated VCF file {concatenated_file_path} does not exist.") + return False + + if not check_vcf_format(concatenated_file_path): + return False + + concatenated_entries = extract_chrom_entries_with_grep(concatenated_file_path) + if concatenated_entries is None: + return False + concatenated_set = set(tuple(entry) for entry in concatenated_entries) + + for phased_file in phased_files: + phased_entries = extract_chrom_entries_with_grep(phased_file) + if phased_entries is None: + return False + + for entry in phased_entries: + if tuple(entry) not in concatenated_set: + print(f"Error: Chrom entry {entry} from {phased_file} is not found in {concatenated_file_path}.") + return False + + return True + +def validate_ligate_regions(vcf_directory, phased_vcf_directory): + """Validate that all phased chunks per contig are concatenated into one file per contig.""" + contigs = ['2R', '2L', '3R', '3L', 'X'] + for contig in contigs: + concatenated_file_pattern = os.path.join(vcf_directory, f"*{contig}_phased.vcf.gz") + concatenated_files = glob.glob(concatenated_file_pattern) + + if not concatenated_files: + print(f"Error: No concatenated VCF file found for contig {contig}.") + continue + + concatenated_file_path = concatenated_files[0] # Use the first match + phased_files = list_phased_chunks(contig, phased_vcf_directory) + + if not phased_files: + print(f"Error: No phased chunk files found for contig {contig}.") + continue + + print(f"Validating {contig} with concatenated VCF {concatenated_file_path}...") + if validate_concatenated_file(contig, phased_files, concatenated_file_path): + print(f"Validation passed for contig {contig}.") + else: + print(f"Validation failed for contig {contig}.") + +# Define the directories +vcf_directory = '../vector_hap/results/cohort_phasing/ligate/' +phased_vcf_directory = '../vector_hap/results/cohort_phasing/phased_chunks/' + +# Run validation +validate_ligate_regions(vcf_directory, phased_vcf_directory) + diff --git a/validation/validate_manifest.py b/validation/validate_manifest.py new file mode 100755 index 0000000..6080cd2 --- /dev/null +++ b/validation/validate_manifest.py @@ -0,0 +1,49 @@ +import csv +import os + +# Path to the input manifest file +input_manifest = '../vector_hap/wgs_snp_testdata.tsv' + +# Required columns +required_columns = ['sample_id', 'bam_path', 'bai_path', 'vcf_path', 'zarr_path'] + +def validate_manifest(manifest_path): + """ + Validate the input manifest file for required columns and non-empty values. + + Args: + manifest_path (str): Path to the manifest file. + + Returns: + bool: True if validation passes, False otherwise. + """ + try: + with open(manifest_path, 'r') as file: + reader = csv.DictReader(file, delimiter='\t') + + # Check if all required columns are present + if not all(column in reader.fieldnames for column in required_columns): + print(f"Error: Manifest file is missing required columns. Required columns are: {required_columns}") + return False + + # Check each row for non-empty values + for row in reader: + for column in required_columns: + if not row[column].strip(): + print(f"Error: Missing value for column '{column}' in row: {row}") + return False + + print("Success: Manifest file is correctly formatted and contains all required fields.") + return True + + except Exception as e: + print(f"Error: An exception occurred while validating the manifest file. {str(e)}") + return False + +# Run the validation +if __name__ == "__main__": + if validate_manifest(input_manifest): + print("Manifest file validation passed.") + else: + print("Manifest file validation failed.") + diff --git a/validation/validate_mergedVcfs.py b/validation/validate_mergedVcfs.py new file mode 100755 index 0000000..e7a415c --- /dev/null +++ b/validation/validate_mergedVcfs.py @@ -0,0 +1,57 @@ + +import os +import subprocess +import glob + +def check_file_exists(file_path): + """Check if the file exists.""" + if not os.path.isfile(file_path): + print(f"Error: File {file_path} does not exist.") + return False + return True + + +def check_mergedvcf_format(vcf_path): + """Check if the _merged.vcf.gz file is correctly formatted using bcftools.""" + try: + result = subprocess.run(['bcftools', 'view', vcf_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: Merged VCF file {vcf_path} is not correctly formatted. bcftools output: {result.stderr}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while validating VCF format. {str(e)}") + return False + +def validate_mergedvcf_output(vcf_path): + """Validate the output of the MergedVCF module.""" + if not check_file_exists(vcf_path): + return False + + if not check_mergedvcf_format(vcf_path): + return False + + print(f"Success: The Merged VCF file {vcf_path} is correctly formatted and valid.") + return True + +def validate_multiple_files(vcf_directory): + """Validate multiple _merged.vcf.gz files in the given directory.""" + vcf_files = glob.glob(os.path.join(vcf_directory, "*_merged.vcf")) + + all_valid = True + for vcf_path in vcf_files: + print(f"Validating {vcf_path}...") + if not validate_mergedvcf_output(vcf_path): + all_valid = False + + if all_valid: + print("All files validated successfully.") + else: + print("Some files failed validation.") + +# Define the directory containing the merged VCF files +vcf_directory = '../vector_hap/results/cohort_phasing/' + +# Run validation on multiple files +validate_multiple_files(vcf_directory) + diff --git a/validation/validate_selectvariants_output.sh b/validation/validate_selectvariants_output.sh new file mode 100755 index 0000000..c57bee0 --- /dev/null +++ b/validation/validate_selectvariants_output.sh @@ -0,0 +1,37 @@ +#!/bin/bash + +# Function to validate a single VCF file +validate_vcf() { + local vcf_path=$1 + if bcftools view "$vcf_path" &> /dev/null; then + echo "VCF file $vcf_path is valid." + return 0 + else + echo "VCF file $vcf_path is not valid." + return 1 + fi +} + +# Directory containing subset VCF files +VCF_DIR="../vector_hap/results/select_variants" + +# Initialize all VCF files in the directory +VCF_FILES=("$VCF_DIR"/*.vcf) + +# Run validation for each VCF file +all_valid=true +for vcf_path in "${VCF_FILES[@]}"; do + validate_vcf "$vcf_path" + if [[ $? -ne 0 ]]; then + all_valid=false + fi +done + +# Report overall validation result +if $all_valid; then + echo "All subset VCF files are valid." +else + echo "Some subset VCF files failed validation." + exit 1 +fi + diff --git a/validation/validate_shapeit.py b/validation/validate_shapeit.py new file mode 100755 index 0000000..e084850 --- /dev/null +++ b/validation/validate_shapeit.py @@ -0,0 +1,83 @@ +import os +import glob +import subprocess + +def check_file_exists(file_path): + """Check if the file exists.""" + if not os.path.isfile(file_path): + print(f"Error: File {file_path} does not exist.") + return False + return True + +def check_vcf_format(vcf_path): + """Check if the VCF file is correctly formatted using bcftools.""" + try: + result = subprocess.run(['bcftools', 'view', vcf_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: VCF file {vcf_path} is not correctly formatted. bcftools output: {result.stderr}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while validating VCF format. {str(e)}") + return False + +def count_intervals(interval_list_path): + """Count the number of intervals in the interval list file.""" + try: + with open(interval_list_path, 'r') as file: + intervals = file.readlines() + return len(intervals) + except Exception as e: + print(f"Error: An exception occurred while reading interval list file. {str(e)}") + return 0 + +def validate_phased_chunks(contig, interval_count, phased_vcf_directory): + """Validate that the number of phased chunk files matches the number of intervals and their formats.""" + phased_files_pattern = os.path.join(phased_vcf_directory, f"*{contig}_phased.vcf.gz") + phased_files = glob.glob(phased_files_pattern) + + if len(phased_files) != interval_count: + print(f"Error: Number of phased chunk files ({len(phased_files)}) does not match the number of intervals ({interval_count}) for contig {contig}.") + return False + + for phased_file in phased_files: + if not check_vcf_format(phased_file): + return False + + return True + +def validate_shapeit_phasing(vcf_directory, interval_directory, phased_vcf_directory): + """Validate that each contig's merged VCF file has been phased into the correct number of chunks.""" + contigs = ['2R', '2L', '3R', '3L', 'X'] + for contig in contigs: + merged_vcf_pattern = os.path.join(vcf_directory, f"*{contig}_merged.vcf.gz") + merged_vcf_files = glob.glob(merged_vcf_pattern) + + if not merged_vcf_files: + print(f"Error: No merged VCF file found for contig {contig}.") + continue + + merged_vcf_path = merged_vcf_files[0] # Use the first match + interval_list_path = os.path.join(interval_directory, f"intervals_gamb_colu_arab_{contig}_200000_40000.txt") + + if not check_file_exists(merged_vcf_path): + continue + + interval_count = count_intervals(interval_list_path) + if interval_count == 0: + continue + + print(f"Validating {contig} with merged VCF {merged_vcf_path}...") + if validate_phased_chunks(contig, interval_count, phased_vcf_directory): + print(f"Validation passed for contig {contig}.") + else: + print(f"Validation failed for contig {contig}.") + +# Define the directories +vcf_directory = '../vector_hap/results/cohort_phasing/' +interval_directory = '../vector_hap/resources/' +phased_vcf_directory = '../vector_hap/results/cohort_phasing/phased_chunks/' + +# Run validation +validate_shapeit_phasing(vcf_directory, interval_directory, phased_vcf_directory) + diff --git a/validation/validate_tabixII.py b/validation/validate_tabixII.py new file mode 100755 index 0000000..a01f4b7 --- /dev/null +++ b/validation/validate_tabixII.py @@ -0,0 +1,95 @@ +import os +import subprocess +import glob + +def check_file_exists(file_path): + """Check if the file exists.""" + if not os.path.isfile(file_path): + print(f"Error: File {file_path} does not exist.") + return False + return True + +def extract_region(vcf_gz_path): + """Extract the region from the first and second entries in the VCF file.""" + try: + result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}") + return None + lines = result.stdout.split('\n') + positions = [] + for line in lines: + if not line.startswith('#'): + # Split the line into columns and extract the positions of the first two entries + columns = line.split('\t') + contig = columns[0] + position = columns[1] + positions.append(position) + if len(positions) == 2: + break + if len(positions) < 2: + print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.") + return None + region = f"{contig}:{positions[0]}-{positions[1]}" + return region + except Exception as e: + print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}") + return None + +def check_tbi_association(vcf_gz_path, region): + """Check if the .tbi index file is correctly associated with the .vcf.gz file using tabix.""" + try: + result = subprocess.run(['tabix', vcf_gz_path, region], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {result.stderr}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while checking TBI association. {str(e)}") + return False + +def validate_tabix_output(vcf_gz_path): + """Validate the output of the TabixII module.""" + if not check_file_exists(vcf_gz_path): + return False + + tbi_path = vcf_gz_path + '.tbi' + if not check_file_exists(tbi_path): + return False + + region = extract_region(vcf_gz_path) + if not region: + return False + + if not check_tbi_association(vcf_gz_path, region): + return False + + print(f"Success: The .tbi file for {vcf_gz_path} is correctly formatted and valid.") + return True + +def validate_multiple_files(directory): + """Validate multiple .tbi files.""" + contigs = ['2R', '2L', '3R', '3L', 'X'] + for contig in contigs: + #merged_vcf_pattern = os.path.join(vcf_directory, f"*{contig}_merged.vcf.gz") + #merged_vcf_files = glob.glob(merged_vcf_pattern) + #vcf_gz_files = glob.glob(os.path.join(directory, '*phased.vcf.gz')) + vcf_gz_files_pattern = os.path.join(directory, f"*{contig}_phased.vcf.gz") + vcf_gz_files = glob.glob(vcf_gz_files_pattern) + + all_valid = True + for vcf_gz_path in vcf_gz_files: + print(f"Validating {contig}...") + if not validate_tabix_output(vcf_gz_path): + all_valid = False + + if all_valid: + print("All files validated successfully.") + else: + print("Some files failed validation.") + +# Define the directory containing the .vcf.gz and .tbi files +directory = '../vector_hap/results/cohort_phasing/phased_chunks/' + +validate_multiple_files(directory) + diff --git a/validation/validate_whatshapPhase.py b/validation/validate_whatshapPhase.py new file mode 100755 index 0000000..752348c --- /dev/null +++ b/validation/validate_whatshapPhase.py @@ -0,0 +1,73 @@ +import os +import subprocess +import glob +import vcf + +def check_file_exists(file_path): + """Check if a file exists.""" + if not os.path.isfile(file_path): + print(f"Error: File {file_path} does not exist.") + return False + return True + +def check_vcf_format(vcf_path): + """Check if a VCF file is correctly formatted using bcftools.""" + try: + result = subprocess.run(['bcftools', 'view', vcf_path], capture_output=True, text=True) + if result.returncode != 0: + print(f"Error: VCF file {vcf_path} is not correctly formatted. bcftools output: {result.stderr}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while validating VCF format. {str(e)}") + return False + +def check_phased_genotype_consistency(vcf_path): + """Check if phased genotypes in a VCF file are consistent.""" + try: + vcf_reader = vcf.Reader(filename=vcf_path) + for record in vcf_reader: + for sample in record.samples: + if sample.phased: + gt = sample['GT'] + if "|" not in gt: + print(f"Error: Found unphased genotype in phased sample in file {vcf_path} at position {record.POS}.") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while checking phased genotype consistency. {str(e)}") + return False + +def validate_whats_hap_phase_output(vcf_path): + """Validate the output of the WhatsHapPhase module.""" + if not check_file_exists(vcf_path): + return False + + if not check_vcf_format(vcf_path): + return False + + if not check_phased_genotype_consistency(vcf_path): + return False + + print(f"Success: The phased VCF file {vcf_path} is correctly formatted and contains consistent phased genotypes.") + return True + +def validate_multiple_files(directory): + """Validate multiple VCF files in a given directory.""" + vcf_files = glob.glob(os.path.join(directory, '*.vcf.gz')) + + all_valid = True + for vcf_path in vcf_files: + if not validate_whats_hap_phase_output(vcf_path): + all_valid = False + + if all_valid: + print("All files validated successfully.") + else: + print("Some files failed validation.") + +# Define the directory containing the VCF files +directory = '../vector_hap/results/sample_phasing/' + +validate_multiple_files(directory) + diff --git a/validation/validate_whatshapStats.py b/validation/validate_whatshapStats.py new file mode 100755 index 0000000..8858384 --- /dev/null +++ b/validation/validate_whatshapStats.py @@ -0,0 +1,97 @@ +import os +import glob + +def check_file_exists(file_path): + """Check if a file exists.""" + if not os.path.isfile(file_path): + print(f"Error: File {file_path} does not exist.") + return False + return True + +def check_stats_format(stats_path): + """Check the format and content of the WhatsHap stats file.""" + try: + with open(stats_path, 'r') as file: + header = file.readline().strip().split('\t') + expected_header = ['#sample', 'chromosome', 'file_name', 'variants', 'phased', 'unphased', 'singletons', 'blocks', 'variant_per_block_median', 'variant_per_block_avg', 'variant_per_block_min', 'variant_per_block_max', 'variant_per_block_sum', 'bp_per_block_median', 'bp_per_block_avg', 'bp_per_block_min', 'bp_per_block_max', 'bp_per_block_sum', 'heterozygous_variants', 'heterozygous_snvs', 'phased_snvs', 'block_n50'] + if header != expected_header: + print(f"Error: Stats file {stats_path} has an incorrect header: {header}") + return False + + for line in file: + columns = line.strip().split('\t') + if len(columns) != 22: + print(f"Error: Stats file {stats_path} has an incorrect number of columns: {columns}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while validating stats file format. {str(e)}") + return False + +def check_gtf_format(gtf_path): + """Check if a GTF file is correctly formatted.""" + try: + with open(gtf_path, 'r') as file: + for line in file: + columns = line.strip().split('\t') + if len(columns) != 9: + print(f"Error: GTF file {gtf_path} has an incorrect number of columns: {columns}") + return False + if columns[2] != "exon": + print(f"Error: GTF file {gtf_path} contains a non-exon entry: {columns}") + return False + return True + except Exception as e: + print(f"Error: An exception occurred while validating GTF format. {str(e)}") + return False + +def validate_whats_hap_stats_output(vcf_gz_path, tbi_path, stats_path, gtf_path): + """Validate the output of the WhatsHapStats module.""" + if not check_file_exists(vcf_gz_path): + return False + + if not check_file_exists(tbi_path): + return False + + if not check_file_exists(stats_path): + return False + + if not check_file_exists(gtf_path): + return False + + if not check_stats_format(stats_path): + return False + + if not check_gtf_format(gtf_path): + return False + + print(f"Success: The phased VCF file {vcf_gz_path}, its index, the stats file {stats_path}, and the GTF file {gtf_path} are correctly formatted and valid.") + return True + +def validate_multiple_files(vcf_directory, stats_directory): + """Validate multiple VCF files and their associated files in the given directories.""" + vcf_files = glob.glob(os.path.join(vcf_directory, "*.vcf.gz")) + for vcf_file in vcf_files: + base_name = os.path.basename(vcf_file).replace('.subset.vcf.phased.vcf.gz', '') + stats_file_name = f"{base_name}.stats.tsv" + gtf_file_name = f"{base_name}.blocks.gtf" + + vcf_path = vcf_file + tbi_path = f"{vcf_path}.tbi" + stats_path = os.path.join(stats_directory, stats_file_name) + gtf_path = os.path.join(stats_directory, gtf_file_name) + + print(f"Validating {vcf_path} with index {tbi_path}, stats {stats_path}, and GTF {gtf_path}...") + if validate_whats_hap_stats_output(vcf_path, tbi_path, stats_path, gtf_path): + print(f"Validation passed for {vcf_path}.") + else: + print(f"Validation failed for {vcf_path}.") + +# Define the directories containing the VCF files, stats files, and GTF files +vcf_directory = '../vector_hap/results/sample_phasing/' +stats_directory = '../vector_hap/results/phasing_stats/' +#gtf_directory = '../vector_hap/results/phasing_gtf/' + +# Run validation on multiple files +validate_multiple_files(vcf_directory, stats_directory) + From 7b40de6dec6c6fa97f759996aa8e15d47c88d2b3 Mon Sep 17 00:00:00 2001 From: Edward Lukyamuzi Date: Tue, 16 Jul 2024 13:21:37 -0700 Subject: [PATCH 2/3] add pipeline integrity checks --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a43e26c..2a428df 100644 --- a/README.md +++ b/README.md @@ -59,7 +59,7 @@ cd validation python3 all_in_one_validation.py ``` -Ensure that the necessary dependencies (e.g., bcftools, Zarr, pyVCF) are installed and available in your PATH. +Ensure that the necessary dependencies (e.g., bcftools, zarr, pyVCF) are installed and available in your PATH. ## Authors and Acknowledgements From f82a87d8dd8fcd8073cbbf0eb84c04aaa9312c08 Mon Sep 17 00:00:00 2001 From: eddUG <44267477+eddUG@users.noreply.github.com> Date: Tue, 16 Jul 2024 23:29:09 +0300 Subject: [PATCH 3/3] Update README.md --- README.md | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2a428df..36c2c37 100644 --- a/README.md +++ b/README.md @@ -38,7 +38,8 @@ The `validation` directory contains the following scripts: ### Running the Validation Scripts -####Interactive approach +#### Interactive approach + To run the validation scripts, navigate to the `validation` directory and execute the desired script. For example: ```bash @@ -51,7 +52,8 @@ cd validation python3 validate_ligateregions.py ``` -####Non-interactive approach +#### Non-interactive approach + To submit all validation tasks in a single batch, make use of the all_in_one_validation script. ```bash