From 51c65f5a0155ddc628610ce887aba9c1e939b6a0 Mon Sep 17 00:00:00 2001 From: Rockwell Weiner Date: Fri, 25 Nov 2022 09:13:14 -0800 Subject: [PATCH 1/3] add max-width parameter --- src/ldmat/__main__.py | 51 ++++++++++++++++++++++++++++--------------- 1 file changed, 33 insertions(+), 18 deletions(-) diff --git a/src/ldmat/__main__.py b/src/ldmat/__main__.py index 66b0eae..c9a8eb7 100644 --- a/src/ldmat/__main__.py +++ b/src/ldmat/__main__.py @@ -255,6 +255,7 @@ def convert_full_chromosome_h5( precision, decimals, start_locus, + max_chunk_width, chromosome, locus_regex, loader_class=BroadInstituteLoader, @@ -281,24 +282,36 @@ def convert_full_chromosome_h5( first_missing_locus = start_locus - for i, (file, local_start_locus, local_end_locus) in enumerate(files): - if local_start_locus >= start_locus: - if i + 1 < len(files): - next_covered_locus = files[i + 1][1] - else: - next_covered_locus = local_end_locus - convert_h5( - file, - outfile, - first_missing_locus, - next_covered_locus, - precision, - decimals, - loader_class=loader_class, - ) - first_missing_locus = next_covered_locus + i = 0 + while i < len(files): + file, local_start_locus, local_end_locus = files[i] + if local_start_locus < start_locus: + i += 1 + continue + + if i + 1 < len(files): + next_covered_locus = files[i+1][1] + else: + next_covered_locus = local_end_locus + + if max_chunk_width and (first_missing_locus + max_chunk_width < next_covered_locus): + next_locus = first_missing_locus + max_chunk_width + else: + next_locus = next_covered_locus + i += 1 + + convert_h5( + file, + outfile, + first_missing_locus, + next_locus, + precision, + decimals, + loader_class=loader_class, + ) + first_missing_locus = next_locus - logger.info("{:.0f}% complete".format(((i + 1) * 100) / len(files))) + logger.info("{:.0f}% complete".format(((i + 1) * 100) / len(files))) def convert_maf_h5(infile, outfile, loader_class=BroadInstituteLoader): @@ -809,11 +822,12 @@ def convert(infile, outfile, min_value, decimals, start_locus, end_locus, loader @click.option("--min-value", "-m", type=float, default=None) @click.option("--decimals", "-d", type=int, default=None) @click.option("--start-locus", "-s", type=int, default=1) +@click.option("--max-chunk-width", "-w", type=int, default=None) @click.option("--chromosome", "-c", type=int, required=True) @click.option("--locus-regex", "-r", type=str, default=r"_(\d+)", show_default=True) @loader_option def convert_chromosome( - filepath, outfile, min_value, decimals, start_locus, chromosome, locus_regex, loader + filepath, outfile, min_value, decimals, start_locus, max_chunk_width, chromosome, locus_regex, loader ): logger.debug(f"Converting chromosome {chromosome}") @@ -823,6 +837,7 @@ def convert_chromosome( min_value, decimals, start_locus, + max_chunk_width, chromosome, locus_regex, loader, From 15b60a2421879d60387896eb7affc2771ff3557a Mon Sep 17 00:00:00 2001 From: Rockwell Weiner Date: Fri, 25 Nov 2022 16:57:38 -0800 Subject: [PATCH 2/3] fix % complete message --- src/ldmat/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/ldmat/__main__.py b/src/ldmat/__main__.py index c9a8eb7..56f2124 100644 --- a/src/ldmat/__main__.py +++ b/src/ldmat/__main__.py @@ -311,7 +311,7 @@ def convert_full_chromosome_h5( ) first_missing_locus = next_locus - logger.info("{:.0f}% complete".format(((i + 1) * 100) / len(files))) + logger.info("{:.0f}% complete".format((i * 100) / len(files))) def convert_maf_h5(infile, outfile, loader_class=BroadInstituteLoader): From 1b177e02498d8f7e2036b7ba496765f650faad38 Mon Sep 17 00:00:00 2001 From: Rockwell Weiner Date: Sat, 26 Nov 2022 11:19:40 -0800 Subject: [PATCH 3/3] separate row and column offsets --- src/ldmat/__main__.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/ldmat/__main__.py b/src/ldmat/__main__.py index 56f2124..90d558c 100644 --- a/src/ldmat/__main__.py +++ b/src/ldmat/__main__.py @@ -406,7 +406,10 @@ def extract_metadata_df_from_group(group): df = pd.DataFrame( group[POSITION_DATASET], columns=["BP"], index=group[NAME_DATASET] ) - df["relative_pos"] = np.arange(len(df)) + df["relative_col"] = np.arange(len(df)) + + start_offset = len(df[df.BP < group.attrs[START_ATTR]]) + df["relative_row"] = df["relative_col"] - start_offset return df @@ -419,8 +422,8 @@ def get_horizontal_slice(group, rows, columns, range_query): row_inds = df_ld_snps.BP.isin(rows) col_inds = df_ld_snps.BP.isin(columns) - row_positions = df_ld_snps[row_inds].relative_pos - col_positions = df_ld_snps[col_inds].relative_pos + row_positions = df_ld_snps[row_inds].relative_row + col_positions = df_ld_snps[col_inds].relative_col h_slice = None if len(row_positions) and len(col_positions):