From 6e2e6412e2e88e8bcac7426a6067555716e2ee3c Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 1 Jul 2026 11:01:23 -0700 Subject: [PATCH 1/6] Update docs Mention that values must be unique, and --metadata-id-columns can be used to use other columns. --- docs/faq/metadata.rst | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/docs/faq/metadata.rst b/docs/faq/metadata.rst index 692002002..33cb7eac4 100644 --- a/docs/faq/metadata.rst +++ b/docs/faq/metadata.rst @@ -42,10 +42,12 @@ Format **Strain names** -You must have one column named ``strain`` or ``name``. It contains your -sequence names, and needs to match the identifiers of your sequences (in -the Fasta or VCF file) *exactly* and must not contain characters such as -spaces, or ``()[]{}|#><``. +You must have a column that contains a unique identifier for each row. Augur +prefers a column named ``strain``, falling back to ``name`` if ``strain`` +is not present. A different column can be used by passing +``--metadata-id-columns`` to commands that support it. Values in this column +must match the identifiers of your sequences (in the Fasta or VCF file) +*exactly* and must not contain characters such as spaces, or ``()[]{}|#><``. **Dates** From 3267aeb08b4eb4999fece722952cbe18f9eefce2 Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 1 Jul 2026 11:21:34 -0700 Subject: [PATCH 2/6] Update tests Reflect the changes in "Prefer `strain` over `name` as sequence ID field" (6064fa2). --- tests/functional/parse.t | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/tests/functional/parse.t b/tests/functional/parse.t index 2ae357ac5..c3e679769 100644 --- a/tests/functional/parse.t +++ b/tests/functional/parse.t @@ -53,27 +53,26 @@ This should fail. ERROR: Output id field 'notexist' not found in fields ['strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url']. 
[2] -Parse Zika sequences into sequences and metadata, preferred default ids is 'name', then 'strain', then first field. +Parse Zika sequences into sequences and metadata, preferred default ids is 'strain', then 'name', then first field. $ ${AUGUR} parse \ > --sequences "$TESTDIR/parse/zika.fasta" \ > --output-sequences "sequences.fasta" \ > --output-metadata "metadata.tsv" \ > --fields strain virus name date region country division city db segment authors url title journal paper_url \ - > --output-id-field 'name' \ > --prettify-fields region country division city \ > --fix-dates monthfirst - $ diff -u "$TESTDIR/parse/sequences_other.fasta" "sequences.fasta" + $ diff -u "$TESTDIR/parse/sequences.fasta" "sequences.fasta" -Parse Zika sequences into sequences and metadata when there is no 'name' field. -This should use the 2nd entry in DEFAULT_ID_COLUMNS ('name', 'strain') instead. +Parse Zika sequences into sequences and metadata when there is no 'strain' field. +This should use the 2nd entry in DEFAULT_ID_COLUMNS ('strain', 'name') instead. $ ${AUGUR} parse \ > --sequences "$TESTDIR/parse/zika.fasta" \ > --output-sequences "sequences.fasta" \ > --output-metadata "metadata.tsv" \ - > --fields col1 virus strain date region country division city db segment authors url title journal paper_url \ + > --fields col1 virus name date region country division city db segment authors url title journal paper_url \ > --prettify-fields region country division city \ > --fix-dates monthfirst From ca578426c6f5c4520bc915d9622b7ab1982282af Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 1 Jul 2026 11:49:21 -0700 Subject: [PATCH 3/6] Prefer 'id' as identifier column Prefer a new explicit 'id' column to represent unique identifier, instead of 'strain' and 'name' which have historically been associated with non-unique columns from some data sources. 
--- CHANGES.md | 5 + augur/io/metadata.py | 2 +- augur/parse.py | 2 +- docs/faq/metadata.rst | 4 +- .../export_v2/cram/metadata-id-columns.t | 2 +- tests/functional/merge/cram/merge-metadata.t | 148 +++++++++--------- tests/functional/parse.t | 10 +- tests/io/test_metadata.py | 28 +++- 8 files changed, 116 insertions(+), 85 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 54f954e5a..61c6c166d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -2,10 +2,15 @@ ## __NEXT__ +### Major Changes + +* A metadata column named `id` is now preferred as the identifier column when present. This applies to all commands that work with metadata id columns (`parse`, `merge`, `filter`, `subsample`, `refine`, `traits`, `frequencies`, `export v2`). Typical Augur usage should start with ensuring your metadata file has an `id` column instead of passing `--metadata-id-columns` to each command. Note this is a behavior change: metadata that already contains an `id` column but relied on `strain`/`name` being inferred will now use `id`; pass `--metadata-id-columns strain name` to restore the previous behavior. [#1780][] @victorlin + ### Bug fixes * Bump minimum xopen version to 2.1.0. This removes the indirect dependency on `zstandard` as there is built-in zstd support in Python 3.10 and an equivalent backport for older versions. 
[#2009][] @victorlin +[#1780]: https://github.com/nextstrain/augur/issues/1780 [#2007]: https://github.com/nextstrain/augur/pull/2009 ## 33.4.1 (2 June 2026) diff --git a/augur/io/metadata.py b/augur/io/metadata.py index 094778335..90cc3dfc0 100644 --- a/augur/io/metadata.py +++ b/augur/io/metadata.py @@ -17,7 +17,7 @@ DEFAULT_DELIMITERS = (',', '\t') -DEFAULT_ID_COLUMNS = ("strain", "name") +DEFAULT_ID_COLUMNS = ("id", "strain", "name") METADATA_DATE_COLUMN = 'date' diff --git a/augur/parse.py b/augur/parse.py index 8cc76af2c..73d0a0109 100644 --- a/augur/parse.py +++ b/augur/parse.py @@ -12,7 +12,7 @@ from .dates import get_numerical_date_from_value from .errors import AugurError -PARSE_DEFAULT_ID_COLUMNS = ("strain", "name") +PARSE_DEFAULT_ID_COLUMNS = ("id", "strain", "name") forbidden_characters = str.maketrans( {' ': None, diff --git a/docs/faq/metadata.rst b/docs/faq/metadata.rst index 33cb7eac4..bafe3013a 100644 --- a/docs/faq/metadata.rst +++ b/docs/faq/metadata.rst @@ -40,10 +40,10 @@ don't open it in Excel again!). Format ~~~~~~ -**Strain names** +**Sequence identifiers** You must have a column that contains a unique identifier for each row. Augur -prefers a column named ``strain``, falling back to ``name`` if ``strain`` +prefers a column named ``id``, falling back to ``strain`` and then ``name`` if ``id`` is not present. A different column can be used by passing ``--metadata-id-columns`` to commands that support it. Values in this column must match the identifiers of your sequences (in the Fasta or VCF file) diff --git a/tests/functional/export_v2/cram/metadata-id-columns.t b/tests/functional/export_v2/cram/metadata-id-columns.t index 3568e2c53..bf7404216 100644 --- a/tests/functional/export_v2/cram/metadata-id-columns.t +++ b/tests/functional/export_v2/cram/metadata-id-columns.t @@ -41,5 +41,5 @@ This should fail with a helpful error message. 
> --auspice-config "$TESTDIR/../data/auspice_config1.json" \ > --maintainers "Nextstrain Team" \ > --output dataset.json > /dev/null - ERROR: None of the possible id columns ('strain', 'name') were found in the metadata's columns ('invalid_id', 'div', 'mutation_length') + ERROR: None of the possible id columns ('id', 'strain', 'name') were found in the metadata's columns ('invalid_id', 'div', 'mutation_length') [1] diff --git a/tests/functional/merge/cram/merge-metadata.t b/tests/functional/merge/cram/merge-metadata.t index a63601d38..f5261331c 100644 --- a/tests/functional/merge/cram/merge-metadata.t +++ b/tests/functional/merge/cram/merge-metadata.t @@ -9,13 +9,13 @@ Full outer join like behaviour, column coalescing/overwriting, and recording of each row's source file(s) in extra columns. $ cat >x.tsv <<~~ - > strain a b c + > id a b c > one X1a X1b X1c > two X2a X2b X2c > ~~ $ cat >y.tsv <<~~ - > strain b c f e d + > id b c f e d > two Y2c Y2f Y2e Y2d > three Y3f Y3e Y3d > ~~ @@ -24,15 +24,15 @@ each row's source file(s) in extra columns. > --metadata X=x.tsv Y=y.tsv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 More than two inputs. $ cat >z.tsv <<~~ - > strain g c + > id g c > one Z1g > two Z2g Z2c > three Z3g @@ -42,16 +42,16 @@ More than two inputs. 
> --metadata X=x.tsv Y=y.tsv Z=z.tsv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d g __source_metadata_X __source_metadata_Y __source_metadata_Z - one X1a X1b X1c Z1g 1 0 1 - two X2a X2b Z2c Y2f Y2e Y2d Z2g 1 1 1 - three Y3f Y3e Y3d Z3g 0 1 1 + id a b c f e d g __source_metadata_X __source_metadata_Y __source_metadata_Z + one X1a X1b X1c Z1g 1 0 1 + two X2a X2b Z2c Y2f Y2e Y2d Z2g 1 1 1 + three Y3f Y3e Y3d Z3g 0 1 1 Supports Augur's standard id column detection. Note that the first file's id column name (e.g. "name" here) is used as the output id column name, per Augur's convention of preserving the input id column name. - $ sed '1s/^strain/name/g' < x.tsv > x-name-column.tsv + $ sed '1s/^id/name/g' < x.tsv > x-name-column.tsv $ ${AUGUR} merge \ > --metadata X=x-name-column.tsv Y=y.tsv \ > --source-columns '__source_metadata_{NAME}' \ @@ -61,25 +61,25 @@ Augur's convention of preserving the input id column name. two X2a X2b Y2c Y2f Y2e Y2d 1 1 three Y3f Y3e Y3d 0 1 - $ sed '1s/^strain/name/g' < y.tsv > y-name-column.tsv + $ sed '1s/^id/name/g' < y.tsv > y-name-column.tsv $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y-name-column.tsv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports --metadata-id-columns. 
- $ sed '1s/^strain/id/g' < x.tsv > x-id-column.tsv + $ sed '1s/^id/my_id/g' < x.tsv > x-my-id-column.tsv $ ${AUGUR} merge \ - > --metadata X=x-id-column.tsv Y=y.tsv \ - > --metadata-id-columns id strain \ + > --metadata X=x-my-id-column.tsv Y=y.tsv \ + > --metadata-id-columns my_id id \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - id a b c f e d __source_metadata_X __source_metadata_Y + my_id a b c f e d __source_metadata_X __source_metadata_Y one X1a X1b X1c 1 0 two X2a X2b Y2c Y2f Y2e Y2d 1 1 three Y3f Y3e Y3d 0 1 @@ -87,24 +87,24 @@ Supports --metadata-id-columns. Supports table-specific --metadata-id-columns. $ ${AUGUR} merge \ - > --metadata X=x-id-column.tsv Y=y.tsv \ - > --metadata-id-columns X=id \ + > --metadata X=x-my-id-column.tsv Y=y.tsv \ + > --metadata-id-columns X=my_id \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - id a b c f e d __source_metadata_X __source_metadata_Y + my_id a b c f e d __source_metadata_X __source_metadata_Y one X1a X1b X1c 1 0 two X2a X2b Y2c Y2f Y2e Y2d 1 1 three Y3f Y3e Y3d 0 1 $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y-name-column.tsv \ - > --metadata-id-columns strain Y=name \ + > --metadata-id-columns id Y=name \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports Augur's standard delimiter detection. @@ -113,10 +113,10 @@ Supports Augur's standard delimiter detection. 
> --metadata X=x.csv Y=y.tsv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports --metadata-delimiters. @@ -126,10 +126,10 @@ Supports --metadata-delimiters. > --metadata-delimiters '|' $'\t' \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports table-specific --metadata-delimiters. @@ -138,20 +138,20 @@ Supports table-specific --metadata-delimiters. 
> --metadata-delimiters X='|' \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 $ ${AUGUR} merge \ > --metadata X=x.txt Y=y.tsv \ > --metadata-delimiters $'\t' X='|' \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 Supports Augur's standard accepted compression formats. @@ -161,10 +161,10 @@ Supports Augur's standard accepted compression formats. > --metadata X=x.tsv.xz Y=y.tsv.zst \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d __source_metadata_X __source_metadata_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 OFF THE BEATEN PATH @@ -182,7 +182,7 @@ Metadata names are only the part before the first '='. Metadata field values with metachars (field or record delimiters) are handled properly. 
$ cat >metachars.csv <<~~ - > strain,comma,tab,newline + > id,comma,tab,newline > one,"x,x","x x","x > x" > two,"","","" @@ -191,7 +191,7 @@ Metadata field values with metachars (field or record delimiters) are handled pr > --metadata X=x.tsv metachars=metachars.csv \ > --source-columns '__source_metadata_{NAME}' \ > --output-metadata - --quiet - strain a b c comma tab newline __source_metadata_X __source_metadata_metachars + id a b c comma tab newline __source_metadata_X __source_metadata_metachars one X1a X1b X1c x,x "x x" "x x" 1 1 two X2a X2b X2c 1 1 @@ -202,10 +202,10 @@ Source columns template. > --metadata X=x.tsv Y=y.tsv \ > --source-columns 'origin_{NAME}' \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d origin_X origin_Y - one X1a X1b X1c 1 0 - two X2a X2b Y2c Y2f Y2e Y2d 1 1 - three Y3f Y3e Y3d 0 1 + id a b c f e d origin_X origin_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 No source columns (explicitly or by default). @@ -213,18 +213,18 @@ No source columns (explicitly or by default). > --metadata X=x.tsv Y=y.tsv \ > --no-source-columns \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d - one X1a X1b X1c - two X2a X2b Y2c Y2f Y2e Y2d - three Y3f Y3e Y3d + id a b c f e d + one X1a X1b X1c + two X2a X2b Y2c Y2f Y2e Y2d + three Y3f Y3e Y3d $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y.tsv \ > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty - strain a b c f e d - one X1a X1b X1c - two X2a X2b Y2c Y2f Y2e Y2d - three Y3f Y3e Y3d + id a b c f e d + one X1a X1b X1c + two X2a X2b Y2c Y2f Y2e Y2d + three Y3f Y3e Y3d ERROR HANDLING @@ -267,7 +267,7 @@ Metadata names must be unique. Duplicates. $ cat >dups.tsv <<~~ - > strain a b c + > id a b c > one 1a 1b 1c > one 2a 2b 2c > ~~ @@ -314,7 +314,7 @@ Unknown metadata names in --metadata-id-columns. 
$ ${AUGUR} merge \ > --metadata X=x.tsv Y=y.tsv \ - > --metadata-id-columns strain whatsit=id \ + > --metadata-id-columns id whatsit=id \ > --output-metadata - ERROR: Unknown metadata table name in --metadata-id-columns: @@ -326,7 +326,7 @@ Unknown metadata names in --metadata-id-columns. $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y.tsv \ - > --metadata-id-columns whatsit=id whosit=accession X=strain Y=strain \ + > --metadata-id-columns whatsit=id whosit=accession X=id Y=id \ > --output-metadata - ERROR: Unknown metadata table names in --metadata-id-columns: @@ -340,30 +340,30 @@ Unknown metadata names in --metadata-id-columns. No id column found. $ ${AUGUR} merge \ - > --metadata X=x-id-column.tsv Y=y.tsv \ + > --metadata X=x.tsv Y=y.tsv \ > --metadata-id-columns strain \ > --output-metadata /dev/null - ERROR: x-id-column.tsv: None of the possible id columns ('strain') were found in the metadata's columns ('id', 'a', 'b', 'c'). + ERROR: x.tsv: None of the possible id columns ('strain') were found in the metadata's columns ('id', 'a', 'b', 'c'). [2] Non-id column names conflicting with output id column name. - $ cat >id-and-strain.csv <<~~ - > id,strain + $ cat >my-id-and-id.csv <<~~ + > my_id,id > one,1 > two,2 > three,3 > ~~ $ ${AUGUR} merge \ - > --metadata strain-only=x.tsv id-and-strain=id-and-strain.csv \ - > --metadata-id-columns id strain \ + > --metadata id-only=x.tsv my-id-and-id=my-id-and-id.csv \ + > --metadata-id-columns my_id id \ > --output-metadata /dev/null --quiet ERROR: Non-id column names in metadata inputs may not conflict with the - output id column name ('strain', the first input's id column). + output id column name ('id', the first input's id column). The following input column would conflict: - 'strain' in metadata table 'id-and-strain' (id column: 'id') + 'id' in metadata table 'my-id-and-id' (id column: 'my_id') Please rename or drop the conflicting column before merging. Renaming may be done with `augur curate rename`. 
diff --git a/tests/functional/parse.t b/tests/functional/parse.t index c3e679769..6565ec325 100644 --- a/tests/functional/parse.t +++ b/tests/functional/parse.t @@ -53,20 +53,20 @@ This should fail. ERROR: Output id field 'notexist' not found in fields ['strain', 'virus', 'accession', 'date', 'region', 'country', 'division', 'city', 'db', 'segment', 'authors', 'url', 'title', 'journal', 'paper_url']. [2] -Parse Zika sequences into sequences and metadata, preferred default ids is 'strain', then 'name', then first field. +Parse Zika sequences into sequences and metadata; the preferred default id field is 'id', then 'strain', then 'name', then the first field. $ ${AUGUR} parse \ > --sequences "$TESTDIR/parse/zika.fasta" \ > --output-sequences "sequences.fasta" \ > --output-metadata "metadata.tsv" \ - > --fields strain virus name date region country division city db segment authors url title journal paper_url \ + > --fields id virus name date region country division city db segment authors url title journal paper_url \ > --prettify-fields region country division city \ > --fix-dates monthfirst $ diff -u "$TESTDIR/parse/sequences.fasta" "sequences.fasta" -Parse Zika sequences into sequences and metadata when there is no 'strain' field. -This should use the 2nd entry in DEFAULT_ID_COLUMNS ('strain', 'name') instead. +Parse Zika sequences into sequences and metadata when there is no 'id' or 'strain' field. +This should use the 3rd entry in DEFAULT_ID_COLUMNS ('id', 'strain', 'name') instead. $ ${AUGUR} parse \ > --sequences "$TESTDIR/parse/zika.fasta" \ @@ -78,7 +78,7 @@ This should use the 2nd entry in DEFAULT_ID_COLUMNS ('strain', 'name') instead. $ diff -u "$TESTDIR/parse/sequences_other.fasta" "sequences.fasta" -Parse Zika sequences into sequences and metadata when no output-id-field is provided and none of the fields match DEFAULT_ID_COLUMNS (e.g. ('strain', 'name')). 
+Parse Zika sequences into sequences and metadata when no output-id-field is provided and none of the fields match DEFAULT_ID_COLUMNS (e.g. ('id', 'strain', 'name')). This should use the first field as the id field and the metadata should not have an extra strain or name column. $ ${AUGUR} parse \ diff --git a/tests/io/test_metadata.py b/tests/io/test_metadata.py index 548db3328..5d4c3a988 100644 --- a/tests/io/test_metadata.py +++ b/tests/io/test_metadata.py @@ -3,7 +3,7 @@ from io import BytesIO from augur.errors import AugurError -from augur.io.metadata import InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv, Metadata +from augur.io.metadata import DEFAULT_ID_COLUMNS, InvalidDelimiter, read_table_to_dict, read_metadata_with_sequences, write_records_to_tsv, Metadata from augur.types import DataErrorMethod @@ -123,6 +123,17 @@ def metadata_file(tmpdir): ]) return path +@pytest.fixture +def metadata_file_with_id_and_strain(tmpdir): + path = str(tmpdir / 'metadata.tsv') + with open(path, 'w') as fh: + fh.writelines([ + 'strain\tid\tdate\n', + 'SEQ_A\tID_A\t2020-10-01\n', + 'SEQ_T\tID_T\t2020-10-02\n', + ]) + return path + def unmatched_sequences(): return [ '>EXTRA_SEQ_A\nAAAAA\n', @@ -524,6 +535,21 @@ def test_attributes(self, metadata_file): assert m.columns == ['strain', 'country', 'date'] assert m.id_column == 'strain' + def test_default_prefers_id(self, metadata_file_with_id_and_strain): + """By default, an explicit 'id' column is preferred over 'strain'.""" + m = Metadata(metadata_file_with_id_and_strain, delimiters=['\t'], id_columns=DEFAULT_ID_COLUMNS) + assert m.id_column == 'id' + + def test_default_falls_back_to_strain(self, metadata_file): + """When 'id' is absent, the default falls back to 'strain'.""" + m = Metadata(metadata_file, delimiters=['\t'], id_columns=DEFAULT_ID_COLUMNS) + assert m.id_column == 'strain' + + def test_explicit_id_columns_override(self, metadata_file_with_id_and_strain): + """An explicit 
id_columns argument overrides the 'id' preference.""" + m = Metadata(metadata_file_with_id_and_strain, delimiters=['\t'], id_columns=['strain']) + assert m.id_column == 'strain' + def test_invalid_delimiter(self, metadata_file): """Failure to detect delimiter raises an error.""" with pytest.raises(InvalidDelimiter): From 9b5449851be5d8d09c8ce13de2af9de1b95ab72e Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 1 Jul 2026 14:50:50 -0700 Subject: [PATCH 4/6] merge: Add --output-metadata-id-column Allow explicitly naming the id column in merged metadata output, instead of always using the first input's id column, so workflows can use a standard id column name regardless of inputs. --- CHANGES.md | 4 +++ augur/merge.py | 16 +++++++---- tests/functional/merge/cram/merge-metadata.t | 30 ++++++++++++++++++++ 3 files changed, 44 insertions(+), 6 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 61c6c166d..2bd8c5c7d 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -6,6 +6,10 @@ * A metadata column named `id` is now preferred as the identifier column when present. This applies to all commands that work with metadata id columns (`parse`, `merge`, `filter`, `subsample`, `refine`, `traits`, `frequencies`, `export v2`). Typical Augur usage should start with ensuring your metadata file has an `id` column instead of passing `--metadata-id-columns` to each command. Note this is a behavior change: metadata that already contains an `id` column but relied on `strain`/`name` being inferred will now use `id`; pass `--metadata-id-columns strain name` to restore the previous behavior. [#1780][] @victorlin +### Features + +* merge: Add `--output-metadata-id-column` to explicitly name the id column in the merged metadata output, overriding the default of using the first input's id column. [#1780][] @victorlin + ### Bug fixes * Bump minimum xopen version to 2.1.0. 
This removes the indirect dependency on `zstandard` as there is built-in zstd support in Python 3.10 and an equivalent backport for older versions. [#2009][] @victorlin diff --git a/augur/merge.py b/augur/merge.py index 2a57415e9..e063f3f00 100644 --- a/augur/merge.py +++ b/augur/merge.py @@ -18,8 +18,9 @@ column or overwriting values in an existing column. For columns appearing in more than one table, non-empty values on the right hand side overwrite values on the left hand side. The first table's id column name is used as the output -id column name. Non-id columns in other input tables that would conflict with -this output id column name are not allowed and if present will cause an error. +id column name, unless overridden with --output-metadata-id-column. Non-id +columns in other input tables that would conflict with this output id column +name are not allowed and if present will cause an error. One generated column per input table may be optionally appended to the end of the output table to identify the source of each row's data. Column names are @@ -109,6 +110,7 @@ def register_parser(parent_subparsers): output_group = parser.add_argument_group("outputs", "options related to output") output_group.add_argument('--output-metadata', metavar="FILE", help="Merged metadata as TSV. Compressed files are supported." + SKIP_AUTO_DEFAULT_IN_HELP) + output_group.add_argument('--output-metadata-id-column', metavar="COLUMN", help="Name to use for the id column in the merged metadata output. (default: the first input's id column)" + SKIP_AUTO_DEFAULT_IN_HELP) output_group.add_argument('--source-columns', metavar="TEMPLATE", help=f"Template with which to generate names for the columns (described above) identifying the source of each row's data. Must contain a literal placeholder, {{NAME}}, which stands in for the metadata table names assigned in --metadata. 
(default: disabled)" + SKIP_AUTO_DEFAULT_IN_HELP) output_group.add_argument('--no-source-columns', dest="source_columns", action="store_const", const=None, help=f"Suppress generated columns (described above) identifying the source of each row's data. This is the default behaviour, but it may be made explicit or used to override a previous --source-columns." + SKIP_AUTO_DEFAULT_IN_HELP) @@ -240,9 +242,11 @@ def merge_metadata(args): # Clean up database file by default delete_db = True - # Track columns as we see them, in order. The first metadata's id column - # is always the first output column of the merge, so insert it now. - output_id_column = metadata[0].id_column + # Track columns as we see them, in order. The output id column (either + # the first metadata's id column, or an explicit override) is always the + # first output column of the merge, so insert it now. + output_id_column = args.output_metadata_id_column or metadata[0].id_column + output_id_column_source = "the given --output-metadata-id-column" if args.output_metadata_id_column else "the first input's id column" output_columns = { output_id_column: [] } if conflicting_columns := [f"{c!r} in metadata table {m.name!r} (id column: {m.id_column!r})" @@ -252,7 +256,7 @@ def merge_metadata(args): and c != m.id_column]: raise AugurError(dedent(f"""\ Non-id column names in metadata inputs may not conflict with the - output id column name ({output_id_column!r}, the first input's id column). + output id column name ({output_id_column!r}, {output_id_column_source}). The following input {_n("column", "columns", len(conflicting_columns))} would conflict: diff --git a/tests/functional/merge/cram/merge-metadata.t b/tests/functional/merge/cram/merge-metadata.t index f5261331c..bd5c43b84 100644 --- a/tests/functional/merge/cram/merge-metadata.t +++ b/tests/functional/merge/cram/merge-metadata.t @@ -71,6 +71,18 @@ Augur's convention of preserving the input id column name. 
two X2a X2b Y2c Y2f Y2e Y2d 1 1 three Y3f Y3e Y3d 0 1 +Supports --output-metadata-id-column to override the output id column name. + + $ ${AUGUR} merge \ + > --metadata X=x-name-column.tsv Y=y.tsv \ + > --output-metadata-id-column my_id \ + > --source-columns '__source_metadata_{NAME}' \ + > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty + my_id a b c f e d __source_metadata_X __source_metadata_Y + one X1a X1b X1c 1 0 + two X2a X2b Y2c Y2f Y2e Y2d 1 1 + three Y3f Y3e Y3d 0 1 + Supports --metadata-id-columns. $ sed '1s/^id/my_id/g' < x.tsv > x-my-id-column.tsv @@ -370,6 +382,24 @@ Non-id column names conflicting with output id column name. [2] +Non-id column names conflicting with an explicit --output-metadata-id-column. + + $ ${AUGUR} merge \ + > --metadata X=x.tsv Y=y.tsv \ + > --output-metadata-id-column a \ + > --output-metadata /dev/null --quiet + ERROR: Non-id column names in metadata inputs may not conflict with the + output id column name ('a', the given --output-metadata-id-column). + + The following input column would conflict: + + 'a' in metadata table 'X' (id column: 'id') + + Please rename or drop the conflicting column before merging. + Renaming may be done with `augur curate rename`. + + [2] + Invalid source columns template. $ ${AUGUR} merge \ From 42ad22472f4b6ec853d337c0ab3c8926b5af6eb6 Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 1 Jul 2026 15:22:32 -0700 Subject: [PATCH 5/6] merge: Support single inputs Allow --metadata and --sequences to accept a single input instead of requiring at least two, since augur merge is also used as a general entry point for standardization even when no actual merge across inputs is needed. 
--- CHANGES.md | 2 ++ augur/merge.py | 8 +------ tests/functional/merge/cram/merge-metadata.t | 22 ++++++++++++------- .../merge/cram/merge-sequences-errors.t | 9 -------- .../merge/cram/merge-sequences-only.t | 19 ++++++++++++++++ 5 files changed, 36 insertions(+), 24 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index 2bd8c5c7d..527b8fd16 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -9,12 +9,14 @@ ### Features * merge: Add `--output-metadata-id-column` to explicitly name the id column in the merged metadata output, overriding the default of using the first input's id column. [#1780][] @victorlin +* merge: Support a single `--metadata` or `--sequences` input, instead of requiring at least two. This allows `augur merge` to be used as a general entry point for standardization even when no actual merge is needed. [#1974][] @victorlin ### Bug fixes * Bump minimum xopen version to 2.1.0. This removes the indirect dependency on `zstandard` as there is built-in zstd support in Python 3.10 and an equivalent backport for older versions. [#2009][] @victorlin [#1780]: https://github.com/nextstrain/augur/issues/1780 +[#1974]: https://github.com/nextstrain/augur/issues/1974 [#2007]: https://github.com/nextstrain/augur/pull/2009 ## 33.4.1 (2 June 2026) diff --git a/augur/merge.py b/augur/merge.py index e063f3f00..8db0720e7 100644 --- a/augur/merge.py +++ b/augur/merge.py @@ -1,5 +1,5 @@ """ -Merge two or more datasets into one. +Merge datasets into one. Datasets can consist of metadata and/or sequence files. If both are provided, the order and file contents are used independently. @@ -160,9 +160,6 @@ def run(args): def merge_metadata(args): # Parse --metadata arguments - if not len(args.metadata) >= 2: - raise AugurError(f"At least two metadata inputs are required for merging.") - if unnamed := [repr(x) for x in args.metadata if "=" not in x or x.startswith("=")]: raise AugurError(dedent(f"""\ All metadata inputs must be assigned a name, e.g. with NAME=FILE. 
@@ -410,9 +407,6 @@ def merge_metadata(args): def merge_sequences(args): - if not len(args.sequences) >= 2: - raise AugurError(f"At least two sequence inputs are required for merging.") - if not args.skip_input_sequences_validation: for s in args.sequences: print_info(f"Validating {s!r}…") diff --git a/tests/functional/merge/cram/merge-metadata.t b/tests/functional/merge/cram/merge-metadata.t index bd5c43b84..ea069d2ea 100644 --- a/tests/functional/merge/cram/merge-metadata.t +++ b/tests/functional/merge/cram/merge-metadata.t @@ -20,6 +20,20 @@ each row's source file(s) in extra columns. > three Y3f Y3e Y3d > ~~ +A single metadata input is supported, e.g. for decompression, adding source +columns, or standardizing the output id column, without an actual merge taking +place. + + $ ${AUGUR} merge \ + > --metadata X=x.tsv \ + > --source-columns '__source_metadata_{NAME}' \ + > --output-metadata - --quiet | csv2tsv --csv-delim $'\t' | tsv-pretty + id a b c __source_metadata_X + one X1a X1b X1c 1 + two X2a X2b X2c 1 + +Two inputs. + $ ${AUGUR} merge \ > --metadata X=x.tsv Y=y.tsv \ > --source-columns '__source_metadata_{NAME}' \ @@ -241,14 +255,6 @@ No source columns (explicitly or by default). ERROR HANDLING -At least two metadata inputs are required. - - $ ${AUGUR} merge \ - > --metadata X=x.tsv \ - > --output-metadata - - ERROR: At least two metadata inputs are required for merging. - [2] - Metadata names are required. $ ${AUGUR} merge \ diff --git a/tests/functional/merge/cram/merge-sequences-errors.t b/tests/functional/merge/cram/merge-sequences-errors.t index 9a32c1f3c..6352385d7 100644 --- a/tests/functional/merge/cram/merge-sequences-errors.t +++ b/tests/functional/merge/cram/merge-sequences-errors.t @@ -70,12 +70,3 @@ A missing input file shows a meaningful error during validation… WARNING: Metadata merge was successful but sequence merge has failed, so --output-metadata has no corresponding sequence output. ERROR: Merging failed, see error(s) above. 
The command may have already written data to stdout. You may want to clean up any partial outputs. [2] - -At least two sequence inputs are required. - - $ ${AUGUR} merge \ - > --sequences x.fasta \ - > --output-sequences - - WARNING: Metadata merge was successful but sequence merge has failed, so --output-metadata has no corresponding sequence output. - ERROR: At least two sequence inputs are required for merging. - [2] diff --git a/tests/functional/merge/cram/merge-sequences-only.t b/tests/functional/merge/cram/merge-sequences-only.t index ecc6eed31..7324564e0 100644 --- a/tests/functional/merge/cram/merge-sequences-only.t +++ b/tests/functional/merge/cram/merge-sequences-only.t @@ -20,6 +20,25 @@ Merge sequences without metadata. > GCTA > ~~ +A single sequence input is supported, e.g. for decompression, without an actual +merge taking place. + + $ ${AUGUR} merge \ + > --sequences x.fasta \ + > --output-sequences merged.fasta + Validating 'x.fasta'… + Merging sequences and writing to 'merged.fasta'… + + $ cat merged.fasta + >seq1 + ATCG + >seq2 + GCTA + >seq3 + TCGA + +Two inputs. + $ ${AUGUR} merge \ > --sequences x.fasta y.fasta \ > --output-sequences merged.fasta From 54a41e968021adea013566110816e0c308114e1f Mon Sep 17 00:00:00 2001 From: Victor Lin Date: Wed, 1 Jul 2026 15:25:04 -0700 Subject: [PATCH 6/6] Add note to restore augur read-file Follow-up to "Read FASTA files with SeqKit directly" (44b72fe) which is really more of a workaround than a fix. --- augur/merge.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/augur/merge.py b/augur/merge.py index 8db0720e7..e20e82452 100644 --- a/augur/merge.py +++ b/augur/merge.py @@ -416,6 +416,8 @@ def merge_sequences(args): # Reversed because seqkit rmdup keeps the first entry but this command # should keep the last entry. + # TODO: use augur read-file once its slowness has been sorted out. 
+ # command = f""" {seqkit()} --threads {args.nthreads} rmdup {" ".join(shquote(s) for s in reversed(args.sequences))} | {augur()} write-file {shquote(args.output_sequences)} @@ -424,9 +426,9 @@ def merge_sequences(args): try: run_shell_command(command, raise_errors=True) except Exception: - # Ideally, read-file, seqkit, and write-file errors would all be handled - # separately. That is not possible because everything is done in one - # child process with one return code. + # Ideally, seqkit and write-file errors would be handled separately. + # That is not possible because everything is done in one child process + # with one return code. if os.path.isfile(args.output_sequences): # Remove the partial output file. os.remove(args.output_sequences)