diff --git a/CHANGES.md b/CHANGES.md index 54f954e5a..527b8fd16 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,10 +2,21 @@ ## __NEXT__ +### Major Changes + +* A metadata column named `id` is now preferred as the identifier column when present. This applies to all commands that work with metadata id columns (`parse`, `merge`, `filter`, `subsample`, `refine`, `traits`, `frequencies`, `export v2`). Typical Augur usage should start with ensuring your metadata file has an `id` column instead of passing `--metadata-id-columns` to each command. Note this is a behavior change: metadata that already contains an `id` column but relied on `strain`/`name` being inferred will now use `id`; pass `--metadata-id-columns strain name` to restore the previous behavior. [#1780][] @victorlin + +### Features + +* merge: Add `--output-metadata-id-column` to explicitly name the id column in the merged metadata output, overriding the default of using the first input's id column. [#1780][] @victorlin +* merge: Support a single `--metadata` or `--sequences` input, instead of requiring at least two. This allows `augur merge` to be used as a general entry point for standardization even when no actual merge is needed. [#1974][] @victorlin + ### Bug fixes * Bump minimum xopen version to 2.1.0. This removes the indirect dependency on `zstandard` as there is built-in zstd support in Python 3.10 and an equivalent backport for older versions. 
[#2009][] @victorlin +[#1780]: https://github.com/nextstrain/augur/issues/1780 +[#1974]: https://github.com/nextstrain/augur/issues/1974 [#2009]: https://github.com/nextstrain/augur/pull/2009 ## 33.4.1 (2 June 2026) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 094778335..90cc3dfc0 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -17,7 +17,7 @@ DEFAULT_DELIMITERS = (',', '\t') -DEFAULT_ID_COLUMNS = ("strain", "name") +DEFAULT_ID_COLUMNS = ("id", "strain", "name") METADATA_DATE_COLUMN = 'date' diff --git a/augur/merge.py b/augur/merge.py index 2a57415e9..e20e82452 100644 --- a/augur/merge.py +++ b/augur/merge.py @@ -1,5 +1,5 @@ """ -Merge two or more datasets into one. +Merge datasets into one. Datasets can consist of metadata and/or sequence files. If both are provided, the order and file contents are used independently. @@ -18,8 +18,9 @@ column or overwriting values in an existing column. For columns appearing in more than one table, non-empty values on the right hand side overwrite values on the left hand side. The first table's id column name is used as the output -id column name. Non-id columns in other input tables that would conflict with -this output id column name are not allowed and if present will cause an error. +id column name, unless overridden with --output-metadata-id-column. Non-id +columns in other input tables that would conflict with this output id column +name are not allowed and if present will cause an error. One generated column per input table may be optionally appended to the end of the output table to identify the source of each row's data. Column names are @@ -109,6 +110,7 @@ def register_parser(parent_subparsers): output_group = parser.add_argument_group("outputs", "options related to output") output_group.add_argument('--output-metadata', metavar="FILE", help="Merged metadata as TSV. Compressed files are supported." 
+ SKIP_AUTO_DEFAULT_IN_HELP) + output_group.add_argument('--output-metadata-id-column', metavar="COLUMN", help="Name to use for the id column in the merged metadata output. (default: the first input's id column)" + SKIP_AUTO_DEFAULT_IN_HELP) output_group.add_argument('--source-columns', metavar="TEMPLATE", help=f"Template with which to generate names for the columns (described above) identifying the source of each row's data. Must contain a literal placeholder, {{NAME}}, which stands in for the metadata table names assigned in --metadata. (default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP) output_group.add_argument('--no-source-columns', dest="source_columns", action="store_const", const=None, help=f"Suppress generated columns (described above) identifying the source of each row's data. This is the default behaviour, but it may be made explicit or used to override a previous --source-columns." + SKIP_AUTO_DEFAULT_IN_HELP) @@ -158,9 +160,6 @@ def run(args): def merge_metadata(args): # Parse --metadata arguments - if not len(args.metadata) >= 2: - raise AugurError(f"At least two metadata inputs are required for merging.") - if unnamed := [repr(x) for x in args.metadata if "=" not in x or x.startswith("=")]: raise AugurError(dedent(f"""\ All metadata inputs must be assigned a name, e.g. with NAME=FILE. @@ -240,9 +239,11 @@ def merge_metadata(args): # Clean up database file by default delete_db = True - # Track columns as we see them, in order. The first metadata's id column - # is always the first output column of the merge, so insert it now. - output_id_column = metadata[0].id_column + # Track columns as we see them, in order. The output id column (either + # the first metadata's id column, or an explicit override) is always the + # first output column of the merge, so insert it now. 
+ output_id_column = args.output_metadata_id_column or metadata[0].id_column + output_id_column_source = "the given --output-metadata-id-column" if args.output_metadata_id_column else "the first input's id column" output_columns = { output_id_column: [] } if conflicting_columns := [f"{c!r} in metadata table {m.name!r} (id column: {m.id_column!r})" @@ -252,7 +253,7 @@ def merge_metadata(args): and c != m.id_column]: raise AugurError(dedent(f"""\ Non-id column names in metadata inputs may not conflict with the - output id column name ({output_id_column!r}, the first input's id column). + output id column name ({output_id_column!r}, {output_id_column_source}). The following input {_n("column", "columns", len(conflicting_columns))} would conflict: @@ -406,9 +407,6 @@ def merge_metadata(args): def merge_sequences(args): - if not len(args.sequences) >= 2: - raise AugurError(f"At least two sequence inputs are required for merging.") - if not args.skip_input_sequences_validation: for s in args.sequences: print_info(f"Validating {s!r}…") @@ -418,6 +416,8 @@ def merge_sequences(args): # Reversed because seqkit rmdup keeps the first entry but this command # should keep the last entry. + # TODO: use augur read-file once its slowness has been sorted out. + # command = f""" {seqkit()} --threads {args.nthreads} rmdup {" ".join(shquote(s) for s in reversed(args.sequences))} | {augur()} write-file {shquote(args.output_sequences)} @@ -426,9 +426,9 @@ def merge_sequences(args): try: run_shell_command(command, raise_errors=True) except Exception: - # Ideally, read-file, seqkit, and write-file errors would all be handled - # separately. That is not possible because everything is done in one - # child process with one return code. + # Ideally, seqkit and write-file errors would all be handled separately. + # That is not possible because everything is done in one child process + # with one return code. if os.path.isfile(args.output_sequences): # Remove the partial output file. 
os.remove(args.output_sequences) diff --git a/augur/parse.py b/augur/parse.py index 8cc76af2c..73d0a0109 100644 --- a/augur/parse.py +++ b/augur/parse.py @@ -12,7 +12,7 @@ from .dates import get_numerical_date_from_value from .errors import AugurError -PARSE_DEFAULT_ID_COLUMNS = ("strain", "name") +PARSE_DEFAULT_ID_COLUMNS = ("id", "strain", "name") forbidden_characters = str.maketrans( {' ': None, diff --git a/docs/faq/metadata.rst b/docs/faq/metadata.rst index 692002002..bafe3013a 100644 --- a/docs/faq/metadata.rst +++ b/docs/faq/metadata.rst @@ -40,12 +40,14 @@ don't open it in Excel again!). Format ~~~~~~ -**Strain names** - -You must have one column named ``strain`` or ``name``. It contains your -sequence names, and needs to match the identifiers of your sequences (in -the Fasta or VCF file) *exactly* and must not contain characters such as -spaces, or ``()[]{}|#><``. +**Sequence identifiers** + +You must have a column that contains a unique identifier for each row. Augur +prefers a column named ``id``, falling back to ``strain`` or ``name`` if ``id`` +is not present. A different column can be used by passing +``--metadata-id-columns`` to commands that support it. Values in this column +must match the identifiers of your sequences (in the Fasta or VCF file) +*exactly* and must not contain characters such as spaces, or ``()[]{}|#><``. **Dates** diff --git a/tests/functional/export_v2/cram/metadata-id-columns.t b/tests/functional/export_v2/cram/metadata-id-columns.t index 3568e2c53..bf7404216 100644 --- a/tests/functional/export_v2/cram/metadata-id-columns.t +++ b/tests/functional/export_v2/cram/metadata-id-columns.t @@ -41,5 +41,5 @@ This should fail with a helpful error message. 
> --auspice-config "$TESTDIR/../data/auspice_config1.json" \ > --maintainers "Nextstrain Team" \ > --output dataset.json > /dev/null - ERROR: None of the possible id columns ('strain', 'name') were found in the metadata's columns ('invalid_id', 'div', 'mutation_length') + ERROR: None of the possible id columns ('id', 'strain', 'name') were found in the metadata's columns ('invalid_id', 'div', 'mutation_length') [1] diff --git a/tests/functional/merge/cram/merge-metadata.t b/tests/functional/merge/cram/merge-metadata.t index a63601d38..ea069d2ea 100644 --- a/tests/functional/merge/cram/merge-metadata.t +++ b/tests/functional/merge/cram/merge-metadata.t @@ -9,30 +9,44 @@ Full outer join like behaviour, column coalescing/overwriting, and recording of each row's source file(s) in extra columns. $ cat >x.tsv <<~~ - > strain a b c + > id a b c > one X1a X1b X1c > two X2a X2b X2c > ~~ $ cat >y.tsv <<~~ - > strain b c f e d + > id b c f e d > two Y2c Y2f Y2e Y2d > three Y3f Y3e Y3d > ~~ +A single metadata input is supported, e.g. for decompression, adding source +columns, or standardizing the output id column, without an actual merge taking +place. + + $ ${AUGUR} merge \ + > --metadata X=x.tsv \ + > --source-columns '__source_metadata_{NAME}' \ + > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty + id a b c __source_metadata_X + one X1a X1b X1c 1 + two X2a X2b X2c 1 + +Two inputs. + $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y.tsv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 More than two inputs. 
$ cat >z.tsv <<~~ - > strain g c + > id g c > one Z1g > two Z2g Z2c > three Z3g @@ -42,16 +56,16 @@ More than two inputs. > --metadata X=x.tsv Y=y.tsv Z=z.tsv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d g __source_metadata_X __source_metadata_Y __source_metadata_Z - one X1a X1b X1c Z1g 1 0 1 - two X2a X2b Z2c Y2f Y2e Y2d Z2g 1 1 1 - three Y3f Y3e Y3d Z3g 0 1 1 + id a b c f e d g __source_metadata_X __source_metadata_Y __source_metadata_Z + one X1a X1b X1c Z1g 1 0 1 + two X2a X2b Z2c Y2f Y2e Y2d Z2g 1 1 1 + three Y3f Y3e Y3d Z3g 0 1 1 Supports Augur's standard id column detection. Note that the first file's id column name (e.g. "name" here) is used as the output id column name, per Augur's convention of preserving the input id column name. - $ sed '1s/^strain/name/g' < x.tsv > x-name-column.tsv + $ sed '1s/^id/name/g' < x.tsv > x-name-column.tsv $ ${AUGUR} merge \ > --metadata X=x-name-column.tsv Y=y.tsv \ > --source-columns '__source_metadata_{NAME}' \ @@ -61,25 +75,37 @@ Augur's convention of preserving the input id column name. two X2a X2b Y2c Y2f Y2e Y2d 1 1 three Y3f Y3e Y3d 0 1 - $ sed '1s/^strain/name/g' < y.tsv > y-name-column.tsv + $ sed '1s/^id/name/g' < y.tsv > y-name-column.tsv $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y-name-column.tsv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 + +Supports --output-metadata-id-column to override the output id column name. 
+ + $ ${AUGUR} merge \ + > --metadata X=x-name-column.tsv Y=y.tsv \ + > --output-metadata-id-column my_id \ + > --source-columns '__source_metadata_{NAME}' \ + > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty + my_id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports --metadata-id-columns. - $ sed '1s/^strain/id/g' < x.tsv > x-id-column.tsv + $ sed '1s/^id/my_id/g' < x.tsv > x-my-id-column.tsv $ ${AUGUR} merge \ - > --metadata X=x-id-column.tsv Y=y.tsv \ - > --metadata-id-columns id strain \ + > --metadata X=x-my-id-column.tsv Y=y.tsv \ + > --metadata-id-columns my_id id \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - id a b c f e d __source_metadata_X __source_metadata_Y + my_id a b c f e d __source_metadata_X __source_metadata_Y one X1a X1b X1c 1 0 two X2a X2b Y2c Y2f Y2e Y2d 1 1 three Y3f Y3e Y3d 0 1 @@ -87,24 +113,24 @@ Supports --metadata-id-columns. Supports table-specific --metadata-id-columns. 
$ ${AUGUR} merge \ - > --metadata X=x-id-column.tsv Y=y.tsv \ - > --metadata-id-columns X=id \ + > --metadata X=x-my-id-column.tsv Y=y.tsv \ + > --metadata-id-columns X=my_id \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - id a b c f e d __source_metadata_X __source_metadata_Y + my_id a b c f e d __source_metadata_X __source_metadata_Y one X1a X1b X1c 1 0 two X2a X2b Y2c Y2f Y2e Y2d 1 1 three Y3f Y3e Y3d 0 1 $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y-name-column.tsv \ - > --metadata-id-columns strain Y=name \ + > --metadata-id-columns id Y=name \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports Augur's standard delimiter detection. @@ -113,10 +139,10 @@ Supports Augur's standard delimiter detection. > --metadata X=x.csv Y=y.tsv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports --metadata-delimiters. @@ -126,10 +152,10 @@ Supports --metadata-delimiters. 
> --metadata-delimiters '|' $'\t' \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports table-specific --metadata-delimiters. @@ -138,20 +164,20 @@ Supports table-specific --metadata-delimiters. > --metadata-delimiters X='|' \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 $ ${AUGUR} merge \ > --metadata X=x.txt Y=y.tsv \ > --metadata-delimiters $'\t' X='|' \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports Augur's standard accepted compression formats. @@ -161,10 +187,10 @@ Supports Augur's standard accepted compression formats. 
> --metadata X=x.tsv.xz Y=y.tsv.zst \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 OFF THE BEATEN PATH @@ -182,7 +208,7 @@ Metadata names are only the part before the first '='. Metadata field values with metachars (field or record delimiters) are handled properly. $ cat >metachars.csv <<~~ - > strain,comma,tab,newline + > id,comma,tab,newline > one,"x,x","x x","x > x" > two,"","","" @@ -191,7 +217,7 @@ Metadata field values with metachars (field or record delimiters) are handled pr > --metadata X=x.tsv metachars=metachars.csv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet - strain a b c comma tab newline __source_metadata_X __source_metadata_metachars + id a b c comma tab newline __source_metadata_X __source_metadata_metachars one X1a X1b X1c x,x "x x" "x x" 1 1 two X2a X2b X2c 1 1 @@ -202,10 +228,10 @@ Source columns template. > --metadata X=x.tsv Y=y.tsv \ > --source-columns 'origin_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d origin_X origin_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d origin_X origin_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 No source columns (explicitly or by default). @@ -213,30 +239,22 @@ No source columns (explicitly or by default). 
> --metadata X=x.tsv Y=y.tsv \ > --no-source-columns \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d - one X1a X1b X1c - two X2a X2b Y2c Y2f Y2e Y2d - three Y3f Y3e Y3d + id a b c f e d + one X1a X1b X1c + two X2a X2b Y2c Y2f Y2e Y2d + three Y3f Y3e Y3d $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y.tsv \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d - one X1a X1b X1c - two X2a X2b Y2c Y2f Y2e Y2d - three Y3f Y3e Y3d + id a b c f e d + one X1a X1b X1c + two X2a X2b Y2c Y2f Y2e Y2d + three Y3f Y3e Y3d ERROR HANDLING -At least two metadata inputs are required. - - $ ${AUGUR} merge \ - > --metadata X=x.tsv \ - > --output-metadata - - ERROR: At least two metadata inputs are required for merging. - [2] - Metadata names are required. $ ${AUGUR} merge \ @@ -267,7 +285,7 @@ Metadata names must be unique. Duplicates. $ cat >dups.tsv <<~~ - > strain a b c + > id a b c > one 1a 1b 1c > one 2a 2b 2c > ~~ @@ -314,7 +332,7 @@ Unknown metadata names in --metadata-id-columns. $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y.tsv \ - > --metadata-id-columns strain whatsit=id \ + > --metadata-id-columns id whatsit=id \ > --output-metadata - ERROR: Unknown metadata table name in --metadata-id-columns: @@ -326,7 +344,7 @@ Unknown metadata names in --metadata-id-columns. $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y.tsv \ - > --metadata-id-columns whatsit=id whosit=accession X=strain Y=strain \ + > --metadata-id-columns whatsit=id whosit=accession X=id Y=id \ > --output-metadata - ERROR: Unknown metadata table names in --metadata-id-columns: @@ -340,30 +358,48 @@ Unknown metadata names in --metadata-id-columns. No id column found. 
$ ${AUGUR} merge \ - > --metadata X=x-id-column.tsv Y=y.tsv \ + > --metadata X=x.tsv Y=y.tsv \ > --metadata-id-columns strain \ > --output-metadata /dev/null - ERROR: x-id-column.tsv: None of the possible id columns ('strain') were found in the metadata's columns ('id', 'a', 'b', 'c'). + ERROR: x.tsv: None of the possible id columns ('strain') were found in the metadata's columns ('id', 'a', 'b', 'c'). [2] Non-id column names conflicting with output id column name. - $ cat >id-and-strain.csv <<~~ - > id,strain + $ cat >my-id-and-id.csv <<~~ + > my_id,id > one,1 > two,2 > three,3 > ~~ $ ${AUGUR} merge \ - > --metadata strain-only=x.tsv id-and-strain=id-and-strain.csv \ - > --metadata-id-columns id strain \ + > --metadata id-only=x.tsv my-id-and-id=my-id-and-id.csv \ + > --metadata-id-columns my_id id \ + > --output-metadata /dev/null --quiet + ERROR: Non-id column names in metadata inputs may not conflict with the + output id column name ('id', the first input's id column). + + The following input column would conflict: + + 'id' in metadata table 'my-id-and-id' (id column: 'my_id') + + Please rename or drop the conflicting column before merging. + Renaming may be done with `augur curate rename`. + + [2] + +Non-id column names conflicting with an explicit --output-metadata-id-column. + + $ ${AUGUR} merge \ + > --metadata X=x.tsv Y=y.tsv \ + > --output-metadata-id-column a \ > --output-metadata /dev/null --quiet ERROR: Non-id column names in metadata inputs may not conflict with the - output id column name ('strain', the first input's id column). + output id column name ('a', the given --output-metadata-id-column). The following input column would conflict: - 'strain' in metadata table 'id-and-strain' (id column: 'id') + 'a' in metadata table 'X' (id column: 'id') Please rename or drop the conflicting column before merging. Renaming may be done with `augur curate rename`. 
diff --git a/tests/functional/merge/cram/merge-sequences-errors.t b/tests/functional/merge/cram/merge-sequences-errors.t index 9a32c1f3c..6352385d7 100644 --- a/tests/functional/merge/cram/merge-sequences-errors.t +++ b/tests/functional/merge/cram/merge-sequences-errors.t @@ -70,12 +70,3 @@ A missing input file shows a meaningful error during validation… WARNING: Metadata merge was successful but sequence merge has failed, so --output-metadata has no corresponding sequence output. ERROR: Merging failed, see error(s) above. The command may have already written data to stdout. You may want to clean up any partial outputs. [2] - -At least two sequence inputs are required. - - $ ${AUGUR} merge \ - > --sequences x.fasta \ - > --output-sequences - - WARNING: Metadata merge was successful but sequence merge has failed, so --output-metadata has no corresponding sequence output. - ERROR: At least two sequence inputs are required for merging. - [2] diff --git a/tests/functional/merge/cram/merge-sequences-only.t b/tests/functional/merge/cram/merge-sequences-only.t index ecc6eed31..7324564e0 100644 --- a/tests/functional/merge/cram/merge-sequences-only.t +++ b/tests/functional/merge/cram/merge-sequences-only.t @@ -20,6 +20,25 @@ Merge sequences without metadata. > GCTA > ~~ +A single sequence input is supported, e.g. for decompression, without an actual +merge taking place. + + $ ${AUGUR} merge \ + > --sequences x.fasta \ + > --output-sequences merged.fasta + Validating 'x.fasta'… + Merging sequences and writing to 'merged.fasta'… + + $ cat merged.fasta + >seq1 + ATCG + >seq2 + GCTA + >seq3 + TCGA + +Two inputs. + $ ${AUGUR} merge \ > --sequences x.fasta y.fasta \ > --output-sequences merged.fasta diff --git a/tests/functional/parse.t b/tests/functional/parse.t index 2ae357ac5..6565ec325 100644 --- a/tests/functional/parse.t +++ b/tests/functional/parse.t @@ -53,33 +53,32 @@ This should fail. 
ERROR: Output id field 'notexist' not found in fields ['strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url']. [2] -Parse Zika sequences into sequences and metadata, preferred default ids is 'name', then 'strain', then first field. +Parse Zika sequences into sequences and metadata. The preferred default id field is 'id', then 'strain', then 'name', then the first field. $ ${AUGUR} parse \ > --sequences "$TESTDIR/parse/zika.fasta" \ > --output-sequences "sequences.fasta" \ > --output-metadata "metadata.tsv" \ - > --fields strain virus name date region country division city db segment authors url title journal paper_url \ - > --output-id-field 'name' \ + > --fields id virus name date region country division city db segment authors url title journal paper_url \ > --prettify-fields region country division city \ > --fix-dates monthfirst - $ diff -u "$TESTDIR/parse/sequences_other.fasta" "sequences.fasta" + $ diff -u "$TESTDIR/parse/sequences.fasta" "sequences.fasta" -Parse Zika sequences into sequences and metadata when there is no 'name' field. -This should use the 2nd entry in DEFAULT_ID_COLUMNS ('name', 'strain') instead. +Parse Zika sequences into sequences and metadata when there is no 'id' or 'strain' field. +This should use the 3rd entry in DEFAULT_ID_COLUMNS ('id', 'strain', 'name') instead. 
$ ${AUGUR} parse \ > --sequences "$TESTDIR/parse/zika.fasta" \ > --output-sequences "sequences.fasta" \ > --output-metadata "metadata.tsv" \ - > --fields col1 virus strain date region country division city db segment authors url title journal paper_url \ + > --fields col1 virus name date region country division city db segment authors url title journal paper_url \ > --prettify-fields region country division city \ > --fix-dates monthfirst $ diff -u "$TESTDIR/parse/sequences_other.fasta" "sequences.fasta" -Parse Zika sequences into sequences and metadata when no output-id-field is provided and none of the fields match DEFAULT_ID_COLUMNS (e.g. ('strain', 'name')). +Parse Zika sequences into sequences and metadata when no output-id-field is provided and none of the fields match DEFAULT_ID_COLUMNS (e.g. ('id', 'strain', 'name')). This should use the first field as the id field and the metadata should not have an extra strain or name column. $ ${AUGUR} parse \ diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index 548db3328..5d4c3a988 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -3,7 +3,7 @@ from io import BytesIO from augur.errors import AugurError -from augur.io.metadata import InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv, Metadata +from augur.io.metadata import DEFAULT_ID_COLUMNS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv, Metadata from augur.types import DataErrorMethod @@ -123,6 +123,17 @@ def metadata_file(tmpdir): ]) return path +@pytest.fixture +def metadata_file_with_id_and_strain(tmpdir): + path = str(tmpdir / 'metadata.tsv') + with open(path, 'w') as fh: + fh.writelines([ + 'strain\tid\tdate\n', + 'SEQ_A\tID_A\t2020-10-01\n', + 'SEQ_T\tID_T\t2020-10-02\n', + ]) + return path + def unmatched_sequences(): return [ '>EXTRA_SEQ_A\nAAAAA\n', @@ -524,6 +535,21 @@ def test_attributes(self, metadata_file): assert m.columns == 
['strain', 'country', 'date'] assert m.id_column == 'strain' + def test_default_prefers_id(self, metadata_file_with_id_and_strain): + """By default, an explicit 'id' column is preferred over 'strain'.""" + m = Metadata(metadata_file_with_id_and_strain, delimiters=['\t'], id_columns=DEFAULT_ID_COLUMNS) + assert m.id_column == 'id' + + def test_default_falls_back_to_strain(self, metadata_file): + """When 'id' is absent, the default falls back to 'strain'.""" + m = Metadata(metadata_file, delimiters=['\t'], id_columns=DEFAULT_ID_COLUMNS) + assert m.id_column == 'strain' + + def test_explicit_id_columns_override(self, metadata_file_with_id_and_strain): + """An explicit id_columns argument overrides the 'id' preference.""" + m = Metadata(metadata_file_with_id_and_strain, delimiters=['\t'], id_columns=['strain']) + assert m.id_column == 'strain' + def test_invalid_delimiter(self, metadata_file): """Failure to detect delimiter raises an error.""" with pytest.raises(InvalidDelimiter):