nextstrain · huddlej · May 5, 2022 · Apr 28, 2022 · May 4, 2022 · May 4, 2022
diff --git a/augur/export_v2.py b/augur/export_v2.py
@@ -9,7 +9,9 @@
 import numbers
 import re
 from Bio import Phylo
-from .utils import read_metadata, read_node_data, write_json, read_config, read_lat_longs, read_colors
+
+from .io import read_metadata
+from .utils import read_node_data, write_json, read_config, read_lat_longs, read_colors
 from .validate import export_v2 as validate_v2, auspice_config_v2 as validate_auspice_config_v2, ValidateError
 
 # Set up warnings & exceptions
@@ -992,10 +994,16 @@ def run_v2(args):
 
     if args.metadata is not None:
         try:
-            metadata_file, _ = read_metadata(args.metadata)
+            metadata_file = read_metadata(args.metadata).to_dict(orient="index")
+            for strain in metadata_file.keys():
+                if "strain" not in metadata_file[strain]:
+                    metadata_file[strain]["strain"] = strain
         except FileNotFoundError:
-            print(f"ERROR: meta data file ({args.metadata}) does not exist")
+            print(f"ERROR: meta data file ({args.metadata}) does not exist", file=sys.stderr)
             sys.exit(2)
+        except Exception as error:
+            print(f"ERROR: {error}", file=sys.stderr)
+            sys.exit(1)
     else:
         metadata_file = {}
 

diff --git a/augur/io.py b/augur/io.py
@@ -91,11 +91,13 @@ def read_metadata(metadata_file, id_columns=("strain", "name"), chunk_size=None)
         kwargs["chunksize"] = chunk_size
 
     # Inspect the first chunk of the metadata, to find any valid index columns.
-    chunk = pd.read_csv(
+    metadata = pd.read_csv(
         metadata_file,
         iterator=True,
         **kwargs,
-    ).read(nrows=1)
+    )
+    chunk = metadata.read(nrows=1)
+    metadata.close()
 
     id_columns_present = [
         id_column

diff --git a/tests/functional/export_v2.t b/tests/functional/export_v2.t
@@ -65,3 +65,49 @@ Export with auspice config JSON with an extensions block
   $ python3 "$TESTDIR/../../scripts/diff_jsons.py"  export_v2/dataset2.json "$TMP/dataset3.json" \
   >   --exclude-paths "root['meta']['updated']"
   {}
+
+Run export with metadata using the default id column of "strain".
+
+  $ ${AUGUR} export v2 \
+  >  --tree export_v2/tree.nwk \
+  >  --metadata export_v2/dataset1_metadata_with_strain.tsv \
+  >  --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+  >  --auspice-config export_v2/auspice_config1.json \
+  >  --maintainers "Nextstrain Team" \
+  >  --output "$TMP/dataset1.json" > /dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" export_v2/dataset1.json "$TMP/dataset1.json" \
+  >   --exclude-paths "root['meta']['updated']" "root['meta']['maintainers']"
+  {}
+  $ rm -f "$TMP/dataset1.json"
+
+Run export with metadata that uses an id column other than "strain".
+In this case, the column is "name" (one of the default columns expected by Augur's `io.read_metadata` function).
+
+  $ ${AUGUR} export v2 \
+  >  --tree export_v2/tree.nwk \
+  >  --metadata export_v2/dataset1_metadata_with_name.tsv \
+  >  --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+  >  --auspice-config export_v2/auspice_config1.json \
+  >  --maintainers "Nextstrain Team" \
+  >  --output "$TMP/dataset1.json" > /dev/null
+
+  $ python3 "$TESTDIR/../../scripts/diff_jsons.py" export_v2/dataset1.json "$TMP/dataset1.json" \
+  >   --exclude-paths "root['meta']['updated']" "root['meta']['maintainers']"
+  {}
+  $ rm -f "$TMP/dataset1.json"
+
+Run export with metadata that uses an invalid id column.
+This should fail with a helpful error message.
+
+  $ ${AUGUR} export v2 \
+  >  --tree export_v2/tree.nwk \
+  >  --metadata export_v2/dataset1_metadata_without_valid_id.tsv \
+  >  --node-data export_v2/div_node-data.json export_v2/location_node-data.json \
+  >  --auspice-config export_v2/auspice_config1.json \
+  >  --maintainers "Nextstrain Team" \
+  >  --output "$TMP/dataset1.json" > /dev/null
+  ERROR: None of the possible id columns (('strain', 'name')) were found in the metadata's columns ('invalid_id', 'div', 'mutation_length')
+  [1]
+
+  $ popd > /dev/null
diff --git a/tests/functional/export_v2/dataset1_metadata_with_name.tsv b/tests/functional/export_v2/dataset1_metadata_with_name.tsv
@@ -0,0 +1,7 @@
+name	div	mutation_length
+tipA	1	1
+tipB	3	1
+tipC	3	1
+tipD	8	3
+tipE	9	4
+tipF	6	1
diff --git a/tests/functional/export_v2/dataset1_metadata_with_strain.tsv b/tests/functional/export_v2/dataset1_metadata_with_strain.tsv
@@ -0,0 +1,7 @@
+strain	div	mutation_length
+tipA	1	1
+tipB	3	1
+tipC	3	1
+tipD	8	3
+tipE	9	4
+tipF	6	1
diff --git a/tests/functional/export_v2/dataset1_metadata_without_valid_id.tsv b/tests/functional/export_v2/dataset1_metadata_without_valid_id.tsv
@@ -0,0 +1,7 @@
+invalid_id	div	mutation_length
+tipA	1	1
+tipB	3	1
+tipC	3	1
+tipD	8	3
+tipE	9	4
+tipF	6	1