From 694f70fff2a1a2a64c9097c13b5f1cc337a03517 Mon Sep 17 00:00:00 2001
From: nbtm-sh
Date: Mon, 30 Jun 2025 12:47:18 +1000
Subject: [PATCH 1/5] feat(help): Remove doxx

---
 samplesheetutils/binaries/create_samplesheet.py | 2 +-
 samplesheetutils/binaries/sample_name.py        | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/samplesheetutils/binaries/create_samplesheet.py b/samplesheetutils/binaries/create_samplesheet.py
index cfe5396..8a519de 100644
--- a/samplesheetutils/binaries/create_samplesheet.py
+++ b/samplesheetutils/binaries/create_samplesheet.py
@@ -26,7 +26,7 @@ def create_samplesheet():
     parser = argparse.ArgumentParser(
         prog="Create Samplesheet",
         description="Utility to create a samplesheet from directory, or AA string",
-        epilog="Written by Nathan Glades ")
+        epilog="Written by N.G @ unsw.edu.au")
 
     parser.add_argument('-a', '--aa-string', help='Single amino acid string', dest='aa_string')
     parser.add_argument('-m', '--msa-dir', help='Directory containing corresponding MSA files for samples', dest='msa_dir')
diff --git a/samplesheetutils/binaries/sample_name.py b/samplesheetutils/binaries/sample_name.py
index 5da0d7f..51bf822 100644
--- a/samplesheetutils/binaries/sample_name.py
+++ b/samplesheetutils/binaries/sample_name.py
@@ -15,7 +15,7 @@ def sample_name():
     parser = argparse.ArgumentParser(
         prog="Read sample name(s) from FASTA",
         description="Utility to read the sample name(s) from a FASTA file and print them to stdout",
-        epilog="Written by Nathan Glades "
+        epilog="Written by N.G @ unsw.edu.au"
     )
 
     parser.add_argument('-i', '--index', help='Index of the sample you wish to output.\nIf unset, all sample names will be output. Acceptable inputs are an integer, -1 for the last sample, or a range (a:b)', default=None, dest='index')

From 3236839c28a3ee053fb518bd07eda5488c42f7c2 Mon Sep 17 00:00:00 2001
From: nbtm-sh
Date: Mon, 30 Jun 2025 12:49:14 +1000
Subject: [PATCH 2/5] feat(help): Update affiliation

---
 samplesheetutils/binaries/create_samplesheet.py | 4 ++--
 samplesheetutils/binaries/sample_name.py        | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/samplesheetutils/binaries/create_samplesheet.py b/samplesheetutils/binaries/create_samplesheet.py
index 8a519de..1b6bd44 100644
--- a/samplesheetutils/binaries/create_samplesheet.py
+++ b/samplesheetutils/binaries/create_samplesheet.py
@@ -19,14 +19,14 @@ logging.basicConfig()
 
 
 def version():
-    print("")
+    print("samplesheet-utils 1.2.2")
 
 #if __name__ == "__main__":
 def create_samplesheet():
     parser = argparse.ArgumentParser(
         prog="Create Samplesheet",
         description="Utility to create a samplesheet from directory, or AA string",
-        epilog="Written by N.G @ unsw.edu.au")
+        epilog="Written by nbtm-sh @ unsw.edu.au")
 
     parser.add_argument('-a', '--aa-string', help='Single amino acid string', dest='aa_string')
     parser.add_argument('-m', '--msa-dir', help='Directory containing corresponding MSA files for samples', dest='msa_dir')
diff --git a/samplesheetutils/binaries/sample_name.py b/samplesheetutils/binaries/sample_name.py
index 51bf822..8777db5 100644
--- a/samplesheetutils/binaries/sample_name.py
+++ b/samplesheetutils/binaries/sample_name.py
@@ -15,7 +15,7 @@ def sample_name():
     parser = argparse.ArgumentParser(
         prog="Read sample name(s) from FASTA",
         description="Utility to read the sample name(s) from a FASTA file and print them to stdout",
-        epilog="Written by N.G @ unsw.edu.au"
+        epilog="Written by nbtm-sh @ unsw.edu.au"
     )
 
     parser.add_argument('-i', '--index', help='Index of the sample you wish to output.\nIf unset, all sample names will be output. Acceptable inputs are an integer, -1 for the last sample, or a range (a:b)', default=None, dest='index')

From 0617254bdfebd33d80c8689b54ca0e29873f05a6 Mon Sep 17 00:00:00 2001
From: nbtm-sh
Date: Wed, 2 Jul 2025 13:07:37 +1000
Subject: [PATCH 3/5] feat(truncate-msa): Update truncation script

---
Review changes: the positionals take nargs='?' so their '?' defaults apply;
exit after a non-integer region bound; refuse input without records; fix the
"contains" typo. The blob id on the index line predates these changes.

 samplesheetutils/binaries/truncate_msa.py | 114 ++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 samplesheetutils/binaries/truncate_msa.py

diff --git a/samplesheetutils/binaries/truncate_msa.py b/samplesheetutils/binaries/truncate_msa.py
new file mode 100644
index 0000000..999a7ab
--- /dev/null
+++ b/samplesheetutils/binaries/truncate_msa.py
@@ -0,0 +1,114 @@
+import argparse, tempfile, logging, os
+from samplesheetutils.utils.sample import *
+from samplesheetutils.utils.output import *
+from samplesheetutils.utils.fasta import *
+from samplesheetutils.utils.input import *
+from samplesheetutils.utils.alignment import *
+from samplesheetutils.utils.a3m import *
+
+# Set up logging
+logger = logging.getLogger(__name__)
+logging.basicConfig()
+
+def version():
+    """Print the tool version to stdout."""
+    print("1.2.2")
+
+#if __name__ == "__main__":
+def truncate_msa():
+    """
+    Console entry point: mask a range of residues in an a3m file.
+    The first record is left untouched; in every other record the selected
+    positions are replaced with '-'. Output goes to --output, or back over
+    the input file when --in-place is given. Exits with status 1 on missing
+    or invalid arguments and on unreadable input.
+    """
+    parser = argparse.ArgumentParser(
+        prog="Truncate MSA",
+        description="Utility for truncating MSAs for targeting a specified region.",
+        epilog="Written by nbtm-sh @ unsw.edu.au")
+
+    # nargs='?' lets the '?' defaults apply, so --version works without
+    # positionals and the missing-argument check below is reachable
+    parser.add_argument('input_file', nargs='?', default='?', help='Path to input file')
+    parser.add_argument('region_start', nargs='?', default='?', help='The index of the first residue to target.')
+    parser.add_argument('region_end', nargs='?', default='?', help='The index of the final residue to target.')
+    parser.add_argument('-o', '--output', help='Path to output file', default='output.a3m', dest='output')
+    parser.add_argument('-i', '--in-place', help='Replace the target file with the modified output (in-place)', default=False, action='store_true', dest='in_place')
+    parser.add_argument('-r', '--inverse', help='Invert output (delete residues within the target region)', default=False, action='store_true', dest='inverse')
+    parser.add_argument('--version', help='Show version number', default=False, action='store_true', dest='version')
+    parser.add_argument('--debug', help='Show debug output', default=False, action='store_true', dest='debug')
+
+    args = parser.parse_args()
+
+    if args.debug:
+        logger.setLevel(logging.DEBUG)
+    else:
+        logger.setLevel(logging.INFO)
+
+    if (args.version):
+        version()
+        exit(0)
+
+    if (args.debug):
+        version()
+
+    # Check that all arguments are valid
+    args_missing = '?' in (args.input_file, args.region_start, args.region_end)
+
+    if args_missing:
+        logger.error("Please specify all input arguments. Use --help to display required arguments")
+        exit(1)
+
+    # Validate that an input was provided
+    logger.debug(f"Loading {args.input_file}...")
+
+    # Try to load variables
+    try:
+        r_start = int(args.region_start)
+        r_end = int(args.region_end)
+    except ValueError:
+        logger.error("Please enter only integers for region_start and region_end")
+        # Stop here: r_start/r_end are unbound if parsing failed
+        exit(1)
+
+    logger.debug(f"Region Start: {r_start}, Region End: {r_end}")
+
+    try:
+        with open(args.input_file, "r") as fp:
+            logger.debug(f"Opened input file {args.input_file} for reading")
+            samples = read_a3m(
+                fp,
+                read_data=True
+            )
+        logger.debug(f"Imported {len(samples)} samples.")
+    except FileNotFoundError:
+        logger.error(f"Input file {args.input_file} does not exist")
+        exit(1)
+    except PermissionError:
+        logger.error(f"Input file {args.input_file} could not be opened due to a permission error")
+        exit(1)
+
+    # Edit the a3m file and preserve the first entry
+    # Refuse to continue without records, so an in-place run cannot blank the input
+    if not samples:
+        logger.error(f"Input file {args.input_file} does not contain any a3m records")
+        exit(1)
+    # First check to make sure that the file is longer than 1
+    if len(samples) == 1:
+        logger.error("The a3m file only contains one sample. The first sample of the a3m file is always preserved, so no changes to the file are needed")
+        exit(1)
+
+    # Skip the first entry; mask positions strictly between the bounds,
+    # or every other position when --inverse is set (XOR flips the test)
+    # NOTE(review): the help text reads as inclusive bounds - confirm intent
+    for sample in samples[1:]:
+        sample.data = ''.join(
+            '-' if (r_start < pos < r_end) ^ args.inverse else residue
+            for pos, residue in enumerate(sample.data)
+        )
+
+    output_file = args.output if not args.in_place else args.input_file
+    logger.debug(f"Opening output file {output_file} for writing...")
+    with open(output_file, "w") as fp:
+        make_a3m(fp, samples)

From 0ecf62ecdad4b9318469887165445bf91f0a6fba Mon Sep 17 00:00:00 2001
From: nbtm-sh
Date: Wed, 2 Jul 2025 13:08:03 +1000
Subject: [PATCH 4/5] feat(truncate-msa): Update setup.py

---
 setup.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/setup.py b/setup.py
index eef4bc7..12d2219 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 setup(
     name="samplesheetutils",
     version="1.2.2",
-    author="Nathan Glades",
+    author="nbtm-sh",
     author_email="n.glades@unsw.edu.au",
     packages=find_packages(),
     description="Collection of utilities for creating and transforming samplesheets and samples",
@@ -12,7 +12,8 @@
     entry_points={
         'console_scripts': [
             'create-samplesheet=samplesheetutils.binaries.create_samplesheet:create_samplesheet',
-            'sample-name=samplesheetutils.binaries.sample_name:sample_name'
+            'sample-name=samplesheetutils.binaries.sample_name:sample_name',
+            'truncate-msa=samplesheetutils.binaries.truncate_msa:truncate_msa'
         ]
     }
 )

From 70445caaba270868c19d7acf9eb45792d6c59ebd Mon Sep 17 00:00:00 2001
From: nbtm-sh
Date: Wed, 2 Jul 2025 13:17:17 +1000
Subject: [PATCH 5/5] feat(truncate-msa): Add supporting libraries

---
Review changes: read_a3m honours read_data; make_a3m ends every record with
a newline and accepts list subclasses. The blob ids on the index lines
predate these changes.

 samplesheetutils/utils/a3m.py       | 55 +++++++++++++++++++++++++++++
 samplesheetutils/utils/alignment.py |  7 ++++
 2 files changed, 62 insertions(+)
 create mode 100644 samplesheetutils/utils/a3m.py
 create mode 100644 samplesheetutils/utils/alignment.py

diff --git a/samplesheetutils/utils/a3m.py b/samplesheetutils/utils/a3m.py
new file mode 100644
index 0000000..227a48b
--- /dev/null
+++ b/samplesheetutils/utils/a3m.py
@@ -0,0 +1,55 @@
+from samplesheetutils.utils.sample import Sample
+from samplesheetutils.utils.alignment import Alignment
+from typing import Union
+import re
+
+def read_a3m(fp, read_data=True):
+    """
+    Read in an a3m file and return an array containing the alignment hits and their names.
+    fp: Open text file object to read from. It is closed before returning
+    read_data: Controls if you wish to read the a3m residue hits. Set to false if you only want to read the names of the hits
+    """
+
+    align_samples = []
+
+    lines = fp.readlines()
+
+    temp_aln_object = None
+
+    for fasta_line in lines:
+        if re.search("^\\>.*$", fasta_line):
+            # This is to add support for fixed-width fasta files
+            if temp_aln_object is not None:
+                align_samples.append(temp_aln_object)
+            temp_aln_object = Sample(fasta_line[1:].strip(), fp.name, "")
+        elif read_data and temp_aln_object is not None:
+            # Residue lines are only accumulated when read_data is set
+            temp_aln_object.data += fasta_line.strip()
+
+    if temp_aln_object is not None:
+        align_samples.append(temp_aln_object)
+
+    fp.close()
+
+    return align_samples
+
+
+def make_a3m(fp, sample: Union[Alignment, list], header='>', fixed_width=False, fixed_width_column_count=80):
+    """
+    Write an A3M file given a list of Alignment objects
+    fp: Writable text file object; it is flushed but left open
+    sample: A single Alignment object, or a list of them
+    header: The header character for each sample. It is recommended not to change this
+    fixed_width: Controls if data is written with line breaks every n characters, or not
+    fixed_width_column_count: Controls how often the data is broken up.
+    """
+    if not isinstance(sample, list):
+        sample = [sample]
+
+    for si in sample:
+        sample_data = si.data
+        if fixed_width:
+            sample_data = '\n'.join([sample_data[i:i+fixed_width_column_count] for i in range(0, len(sample_data), fixed_width_column_count)])
+        # End each record with a newline so the next header starts its own line
+        fp.write(f"{header}{si.name}\n{sample_data}\n")
+    fp.flush()
diff --git a/samplesheetutils/utils/alignment.py b/samplesheetutils/utils/alignment.py
new file mode 100644
index 0000000..3112132
--- /dev/null
+++ b/samplesheetutils/utils/alignment.py
@@ -0,0 +1,7 @@
+class Alignment:
+    """Container for one alignment record: name, source path and residue data."""
+    def __init__(self, name, path, data, msa = None):
+        # NOTE: msa is accepted but currently unused
+        self.name = name
+        self.path = path
+        self.data = data