Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,52 @@ nextflow /path/to/repository/main.nf \
--project_id XXXXXXXXXX \
--input_manifest /path/to/input_manifest.tsv
```
## Validation

This repository includes a set of validation scripts to ensure the integrity and correctness of the pipeline.

### Validation Scripts

The `validation` directory contains the following scripts:

- `validate_manifest.py`: Validates the input manifest.
- `validate_selectvariants_output.sh`: Validates the output of the SelectVariants module.
- `validate_bgziptabix.py`: Validates the output of the BgzipAndTabix module.
- `validate_whatshapPhase.py`: Validates the output of the WhatsHapPhase module.
- `validate_whatshapStats.py`: Validates the output of the WhatsHapStats module.
- `validate_mergeVcfs.py`: Validates the output of the MergeVcfs module.
- `validate_bgziptabixII.py`: Validates the output of the BgzipAndTabixII module.
- `validate_shapeit.py`: Validates the output of the SHAPEIT module.
- `validate_tabixII.py`: Validates the output of the TabixII module.
- `validate_ligateChunks.py`: Validates the output of the LigateRegions module.
- `validate_cohortvcf2zarr.py`: Validates the output of the cohortVcfToZarr module.

### Running the Validation Scripts

#### Interactive approach

To run the validation scripts, navigate to the `validation` directory and execute the desired script. For example:

```bash
cd validation
./validate_selectvariants_output.sh
```

```bash
cd validation
python3 validate_ligateChunks.py
```

#### Non-interactive approach

To run all validation tasks in a single batch, use the `all_in_one_validation.py` script, which also writes a checklist report to `validation_report.txt`.

```bash
cd validation
python3 all_in_one_validation.py
```

Ensure that the necessary dependencies (e.g., bcftools, tabix, zarr, pyVCF) are installed, and that the command-line tools are available in your PATH.


## Authors and Acknowledgements
Expand Down
57 changes: 57 additions & 0 deletions validation/all_in_one_validation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,57 @@
import re
import subprocess
import sys

# Validation scripts to run, each paired with the label used for it in the
# checklist report: (script filename, description).
# NOTE(review): the entries that are commented out below are never run, so the
# report only covers the first four validations - confirm this is intentional.
validations = [
    ("validate_manifest.py", "Validate Input Manifest"),
    ("validate_selectvariants_output.sh", "Validate SelectVariants Output"),
    ("validate_bgziptabix.py", "Validate Bgzip and Tabix"),
    ("validate_whatshapPhase.py", "Validate WhatsHap Phase"),
    #("validate_whatshapStats.py", "Validate WhatsHap Stats"),
    #("validate_mergeVcfs.py", "Validate MergeVcfs"),
    #("validate_bgziptabixII.py", "Validate BgzipAndTabixII"),
    #("validate_shapeit.py", "Validate SHAPEIT"),
    #("validate_tabixII.py", "Validate TabixII"),
    #("validate_ligateChunks.py", "Validate LigateRegions"),
    #("validate_cohortvcf2zarr.py", "Validate cohortVcfToZarr")
]

# Function to run a script and return the result
def run_script(script):
    """Run one validation script and report whether it succeeded.

    Scripts whose name ends in ``.py`` are run with the interpreter that is
    running this module; anything else is run with ``bash``.

    Args:
        script: Path of the script to run (relative paths are resolved
            against the current working directory by the child process).

    Returns:
        A ``(success, details)`` tuple. ``success`` is True when the script
        exits with status 0. ``details`` is the stripped stdout on success;
        on failure it is the stripped stderr, falling back to stdout when
        stderr is empty, or the exception text if the script could not be
        launched at all.
    """
    # sys.executable is the interpreter (and virtualenv) already in use; a bare
    # "python" may be missing from PATH or point at a different installation.
    if script.endswith(".py"):
        interpreter = sys.executable or "python"
    else:
        interpreter = "bash"

    try:
        result = subprocess.run([interpreter, script], capture_output=True, text=True)
    except (OSError, ValueError) as exc:
        # OSError: interpreter not found / not executable.
        # ValueError: invalid arguments or undecodable output.
        return False, str(exc)

    if result.returncode == 0:
        return True, result.stdout.strip()

    # The validation scripts report their errors with print(), i.e. on stdout,
    # so stderr alone can be empty for a failing script.
    return False, result.stderr.strip() or result.stdout.strip()

# Execute every configured validation and record its outcome under its description
results = {}
for script_name, label in validations:
    outcome = run_script(script_name)
    results[label] = dict(status=outcome[0], details=outcome[1])

# Generate checklist report
def generate_validation_report(results):
    """Render validation results as a text checklist.

    Args:
        results: Mapping of step description to a dict with a ``status``
            entry (truthy on success) and an optional ``details`` entry.

    Returns:
        The report as one newline-joined string. The status word of each
        step is wrapped in ANSI colour codes (green for success, red for
        failure), followed by the details line when present and a separator.
    """
    passed_label = "\033[92mSuccess\033[0m"
    failed_label = "\033[91mFailed\033[0m"

    lines = [
        "Validation Checklist Report",
        "==============================",
    ]
    for step, outcome in results.items():
        verdict = passed_label if outcome['status'] else failed_label
        lines.append(f"{step}: {verdict}")
        if 'details' in outcome:
            lines.append(f" Details: {outcome['details']}")
        lines.append("------------------------------")

    return "\n".join(lines)

# Generate the report and print it (with colours) to the terminal
report = generate_validation_report(results)
print(report)

# Save the report to a file. ANSI colour escape sequences only make sense on a
# terminal, so they are stripped to keep the saved text file readable.
plain_report = re.sub(r"\x1b\[[0-9;]*m", "", report)
with open("validation_report.txt", "w", encoding="utf-8") as report_file:
    report_file.write(plain_report)
103 changes: 103 additions & 0 deletions validation/validate_bgziptabix.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import os
import subprocess
import glob

def check_file_exists(file_path):
    """Return True if *file_path* is an existing regular file.

    Otherwise an error message is printed and False is returned.
    """
    if os.path.isfile(file_path):
        return True
    print(f"Error: File {file_path} does not exist.")
    return False

def check_vcf_gz_format(vcf_gz_path):
    """Check that bcftools can read the whole ``.vcf.gz`` file.

    Runs ``bcftools view`` on the file and treats a non-zero exit status as a
    formatting problem.

    Args:
        vcf_gz_path: Path to the bgzipped VCF file.

    Returns:
        True if bcftools exits with status 0; False (after printing an error)
        if it fails or cannot be launched.
    """
    try:
        # Only the exit status and stderr are used, so stdout is discarded
        # instead of buffering the entire decompressed VCF in memory.
        result = subprocess.run(
            ['bcftools', 'view', vcf_gz_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as e:
        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
        return False

    if result.returncode != 0:
        print(f"Error: VCF file {vcf_gz_path} is not correctly formatted. bcftools output: {result.stderr}")
        return False
    return True

def _first_data_records(vcf_text, limit=2):
    """Return ``(contig, position)`` for up to *limit* leading data records of *vcf_text*."""
    records = []
    for line in vcf_text.splitlines():
        # Skip header lines and blank lines; a blank line would otherwise be
        # mistaken for a data record and fail on the column lookup.
        if not line.strip() or line.startswith('#'):
            continue
        columns = line.split('\t')
        if len(columns) < 2:
            # Not a usable data record (no POS column).
            continue
        records.append((columns[0], columns[1]))
        if len(records) == limit:
            break
    return records


def extract_region(vcf_gz_path):
    """Build a tabix region string from the first two records of a VCF file.

    The file is read with ``bcftools view``. The region is
    ``contig:pos1-pos2`` where both values come from the first two data
    records. If the second record lies on a different contig, the region is
    restricted to the first record (``contig:pos1-pos1``) so that it stays
    valid.

    Args:
        vcf_gz_path: Path to the bgzipped VCF file.

    Returns:
        The region string, or None (after printing an error) if bcftools
        fails or the file holds fewer than two data records.
    """
    try:
        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
    except Exception as e:
        print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}")
        return None

    if result.returncode != 0:
        print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}")
        return None

    records = _first_data_records(result.stdout)
    if len(records) < 2:
        print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.")
        return None

    (contig, start), (second_contig, end) = records
    if second_contig != contig:
        # A region cannot span two contigs; query the first record only.
        end = start
    return f"{contig}:{start}-{end}"

def check_tbi_association(vcf_gz_path, region):
    """Query *region* of the VCF with tabix to confirm its index is usable.

    Returns True when ``tabix`` exits with status 0. Otherwise (non-zero exit
    or any exception while running it) an error is printed and False is
    returned.
    """
    tabix_command = ['tabix', vcf_gz_path, region]
    try:
        completed = subprocess.run(tabix_command, capture_output=True, text=True)
        if completed.returncode == 0:
            return True
        print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {completed.stderr}")
        return False
    except Exception as exc:
        print(f"Error: An exception occurred while checking TBI association. {str(exc)}")
        return False

def validate_bgzip_and_tabix_output(vcf_gz_path):
    """Run every check on one bgzipped VCF and its tabix index.

    The checks run in order and stop at the first failure: the ``.vcf.gz``
    file exists, the ``.tbi`` file next to it exists, bcftools can read the
    VCF, a test region can be extracted, and tabix can query that region.

    Returns:
        True (after printing a success message) when all checks pass,
        otherwise False.
    """
    inputs_ok = (
        check_file_exists(vcf_gz_path)
        and check_file_exists(vcf_gz_path + '.tbi')
        and check_vcf_gz_format(vcf_gz_path)
    )
    if not inputs_ok:
        return False

    query_region = extract_region(vcf_gz_path)
    if not query_region or not check_tbi_association(vcf_gz_path, query_region):
        return False

    print(f"Success: The .vcf.gz and .tbi files for {vcf_gz_path} are correctly formatted and valid.")
    return True

def validate_multiple_files(directory):
    """Validate every ``*.vcf.gz`` file (and its ``.tbi`` index) in *directory*.

    All files are checked even after a failure so that every problem is
    reported. Finding no files at all counts as a failure, because nothing
    was actually validated.

    Args:
        directory: Directory to search (non-recursively) for ``*.vcf.gz`` files.

    Returns:
        True if at least one file was found and every file passed, else False.
    """
    vcf_gz_files = sorted(glob.glob(os.path.join(directory, '*.vcf.gz')))
    if not vcf_gz_files:
        print(f"Error: No .vcf.gz files found in {directory}.")
        return False

    all_valid = True
    for vcf_gz_path in vcf_gz_files:
        if not validate_bgzip_and_tabix_output(vcf_gz_path):
            all_valid = False

    if all_valid:
        print("All files validated successfully.")
    else:
        print("Some files failed validation.")
    return all_valid


# Define the directory containing the .vcf.gz and .tbi files
directory = '../vector_hap/results/select_variants/'

if __name__ == "__main__":
    # Exit non-zero on failure so callers that inspect the return code
    # (e.g. all_in_one_validation.py) can tell that validation failed.
    if not validate_multiple_files(directory):
        raise SystemExit(1)

104 changes: 104 additions & 0 deletions validation/validate_bgziptabixII.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
import os
import subprocess
import glob

def check_file_exists(file_path):
    """Return True if *file_path* is an existing regular file.

    Otherwise an error message is printed and False is returned.
    """
    if os.path.isfile(file_path):
        return True
    print(f"Error: File {file_path} does not exist.")
    return False

def check_vcf_gz_format(vcf_gz_path):
    """Check that bcftools can read the whole ``.vcf.gz`` file.

    Runs ``bcftools view`` on the file and treats a non-zero exit status as a
    formatting problem.

    Args:
        vcf_gz_path: Path to the bgzipped VCF file.

    Returns:
        True if bcftools exits with status 0; False (after printing an error)
        if it fails or cannot be launched.
    """
    try:
        # Only the exit status and stderr are used, so stdout is discarded
        # instead of buffering the entire decompressed VCF in memory.
        result = subprocess.run(
            ['bcftools', 'view', vcf_gz_path],
            stdout=subprocess.DEVNULL,
            stderr=subprocess.PIPE,
            text=True,
        )
    except Exception as e:
        print(f"Error: An exception occurred while validating VCF format. {str(e)}")
        return False

    if result.returncode != 0:
        print(f"Error: VCF file {vcf_gz_path} is not correctly formatted. bcftools output: {result.stderr}")
        return False
    return True

def _first_data_records(vcf_text, limit=2):
    """Return ``(contig, position)`` for up to *limit* leading data records of *vcf_text*."""
    records = []
    for line in vcf_text.splitlines():
        # Skip header lines and blank lines; a blank line would otherwise be
        # mistaken for a data record and fail on the column lookup.
        if not line.strip() or line.startswith('#'):
            continue
        columns = line.split('\t')
        if len(columns) < 2:
            # Not a usable data record (no POS column).
            continue
        records.append((columns[0], columns[1]))
        if len(records) == limit:
            break
    return records


def extract_region(vcf_gz_path):
    """Build a tabix region string from the first two records of a VCF file.

    The file is read with ``bcftools view``. The region is
    ``contig:pos1-pos2`` where both values come from the first two data
    records. If the second record lies on a different contig, the region is
    restricted to the first record (``contig:pos1-pos1``) so that it stays
    valid.

    Args:
        vcf_gz_path: Path to the bgzipped VCF file.

    Returns:
        The region string, or None (after printing an error) if bcftools
        fails or the file holds fewer than two data records.
    """
    try:
        result = subprocess.run(['bcftools', 'view', vcf_gz_path], capture_output=True, text=True)
    except Exception as e:
        print(f"Error: An exception occurred while extracting region from VCF file. {str(e)}")
        return None

    if result.returncode != 0:
        print(f"Error: Failed to read VCF file {vcf_gz_path}. bcftools output: {result.stderr}")
        return None

    records = _first_data_records(result.stdout)
    if len(records) < 2:
        print(f"Error: Less than two data entries found in VCF file {vcf_gz_path}.")
        return None

    (contig, start), (second_contig, end) = records
    if second_contig != contig:
        # A region cannot span two contigs; query the first record only.
        end = start
    return f"{contig}:{start}-{end}"

def check_tbi_association(vcf_gz_path, region):
    """Query *region* of the VCF with tabix to confirm its index is usable.

    Returns True when ``tabix`` exits with status 0. Otherwise (non-zero exit
    or any exception while running it) an error is printed and False is
    returned.
    """
    tabix_command = ['tabix', vcf_gz_path, region]
    try:
        completed = subprocess.run(tabix_command, capture_output=True, text=True)
        if completed.returncode == 0:
            return True
        print(f"Error: TBI file for {vcf_gz_path} is not correctly associated. Tabix output: {completed.stderr}")
        return False
    except Exception as exc:
        print(f"Error: An exception occurred while checking TBI association. {str(exc)}")
        return False

def validate_bgzip_and_tabix_output(vcf_gz_path):
    """Run every check on one bgzipped VCF and its tabix index.

    The checks run in order and stop at the first failure: the ``.vcf.gz``
    file exists, the ``.tbi`` file next to it exists, bcftools can read the
    VCF, a test region can be extracted, and tabix can query that region.

    Returns:
        True (after printing a success message) when all checks pass,
        otherwise False.
    """
    inputs_ok = (
        check_file_exists(vcf_gz_path)
        and check_file_exists(vcf_gz_path + '.tbi')
        and check_vcf_gz_format(vcf_gz_path)
    )
    if not inputs_ok:
        return False

    query_region = extract_region(vcf_gz_path)
    if not query_region or not check_tbi_association(vcf_gz_path, query_region):
        return False

    print(f"Success: The .vcf.gz and .tbi files for {vcf_gz_path} are correctly formatted and valid.")
    return True

def validate_multiple_files(directory):
    """Validate every ``*_merged.vcf.gz`` file (and its ``.tbi`` index) in *directory*.

    All files are checked even after a failure so that every problem is
    reported. Finding no files at all counts as a failure, because nothing
    was actually validated.

    Args:
        directory: Directory to search (non-recursively) for
            ``*_merged.vcf.gz`` files.

    Returns:
        True if at least one file was found and every file passed, else False.
    """
    vcf_gz_files = sorted(glob.glob(os.path.join(directory, '*_merged.vcf.gz')))
    if not vcf_gz_files:
        print(f"Error: No *_merged.vcf.gz files found in {directory}.")
        return False

    all_valid = True
    for vcf_gz_path in vcf_gz_files:
        print(f"Validating {vcf_gz_path}...")
        if not validate_bgzip_and_tabix_output(vcf_gz_path):
            all_valid = False

    if all_valid:
        print("All files validated successfully.")
    else:
        print("Some files failed validation.")
    return all_valid


# Define the directory containing the .vcf.gz and .tbi files
directory = '../vector_hap/results/cohort_phasing/'

if __name__ == "__main__":
    # Exit non-zero on failure so callers that inspect the return code
    # (e.g. all_in_one_validation.py) can tell that validation failed.
    if not validate_multiple_files(directory):
        raise SystemExit(1)

52 changes: 52 additions & 0 deletions validation/validate_cohortvcf2zarr.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import os

def check_directory_exists(directory_path):
    """Return True if *directory_path* is an existing directory.

    Otherwise an error message is printed and False is returned.
    """
    if os.path.isdir(directory_path):
        return True
    print(f"Error: Directory {directory_path} does not exist.")
    return False

def check_zarr_structure(zarr_path, contig):
    """Check that the expected sub-directories exist under *zarr_path*.

    Looks for ``calldata/GT`` and the ``variants`` arrays AC, AF, ALT, CM,
    POS and REF, stopping at the first one that is missing.

    Args:
        zarr_path: Directory expected to contain the arrays.
        contig: Not used by this check.

    Returns:
        True if every expected directory exists, otherwise False (after
        printing which one is missing).
    """
    required_arrays = (
        'calldata/GT',
        'variants/AC',
        'variants/AF',
        'variants/ALT',
        'variants/CM',
        'variants/POS',
        'variants/REF',
    )
    for relative_path in required_arrays:
        if check_directory_exists(os.path.join(zarr_path, relative_path)):
            continue
        print(f"Error: Zarr directory {zarr_path} is missing {relative_path}.")
        return False
    return True

def validate_cohort_vcf_to_zarr(contig, zarr_directory):
    """Validate the Zarr output written for one contig.

    The output is expected at
    ``<zarr_directory>/validation_<contig>.zarr/<contig>``; it must exist and
    contain the structure checked by ``check_zarr_structure``.

    Returns:
        True (after printing a success message) when both checks pass,
        otherwise False.
    """
    store_name = f"validation_{contig}.zarr"
    zarr_output = os.path.join(zarr_directory, store_name, contig)

    output_ok = check_directory_exists(zarr_output) and check_zarr_structure(zarr_output, contig)
    if not output_ok:
        return False

    print(f"Success: The Zarr directory {zarr_output} is correctly formatted and valid.")
    return True

def validate_multiple_files(contigs, zarr_directory):
    """Validate the Zarr output of every contig in *contigs*.

    Every contig is checked even after a failure so that each problem is
    reported. An empty contig list counts as a failure, because nothing was
    actually validated.

    Args:
        contigs: Contig names to validate.
        zarr_directory: Directory that holds the ``validation_<contig>.zarr`` outputs.

    Returns:
        True if at least one contig was given and all of them passed, else False.
    """
    if not contigs:
        print("Error: No contigs were provided for validation.")
        return False

    all_valid = True
    for contig in contigs:
        print(f"Validating contig {contig}...")
        if validate_cohort_vcf_to_zarr(contig, zarr_directory):
            print(f"Validation passed for contig {contig}.")
        else:
            print(f"Validation failed for contig {contig}.")
            all_valid = False
    return all_valid


# Define the directories
contigs = ['2R', '2L', '3R', '3L', 'X']
zarr_directory = '../vector_hap/results/cohort_phasing/ligate/outputs/'

if __name__ == "__main__":
    # Run validation on multiple files; exit non-zero on failure so callers
    # that inspect the return code (e.g. all_in_one_validation.py) notice it.
    if not validate_multiple_files(contigs, zarr_directory):
        raise SystemExit(1)

Loading