Skip to content

Commit

Permalink
[curate rename] allow column duplication
Browse files Browse the repository at this point in the history
See discussion in PR review for context
<#1506 (comment)>
  • Loading branch information
jameshadfield committed Jul 4, 2024
1 parent 1b653e9 commit 0930b9e
Show file tree
Hide file tree
Showing 2 changed files with 46 additions and 24 deletions.
46 changes: 22 additions & 24 deletions augur/curate/rename.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
Renames fields / columns of the input data
"""

from typing import Iterable, Literal, Union, Dict, List, Tuple
from typing import Iterable, Literal, Union, List, Tuple
import argparse
from augur.io.print import print_err
from augur.errors import AugurError
Expand All @@ -14,9 +14,9 @@ def register_parser(parent_subparsers):

required = parser.add_argument_group(title="REQUIRED")
required.add_argument("--field-map", nargs="+", required=True,
help="Fields names in the NDJSON record mapped to new field names, " +
"formatted as '{old_field_name}={new_field_name}'. " +
help="Rename fields/columns via '{old_field_name}={new_field_name}'. " +
"If the new field already exists, then the renaming of the old field will be skipped. " +
"Multiple entries with the same '{old_field_name}' will duplicate the field/column. " +
"Skips the field if the old field name is the same as the new field name (case-sensitive).")

optional = parser.add_argument_group(title="OPTIONAL")
Expand All @@ -27,10 +27,10 @@ def register_parser(parent_subparsers):
return parser


def parse_field_map(field_map_arg: List[str]) -> Dict[str,str]:
seen_old, seen_new = set(), set()
def parse_field_map(field_map_arg: List[str]) -> List[Tuple[str,str]]:
seen_new = set() # keep track of the new field names

field_map = {}
field_map = []
for field in field_map_arg:
fields = [n.strip() for n in field.split('=')]
if len(fields)!=2:
Expand All @@ -42,46 +42,44 @@ def parse_field_map(field_map_arg: List[str]) -> Dict[str,str]:
raise AugurError(f"The field-map {field!r} doesn't specify a name for the existing field.")
if not new_name:
raise AugurError(f"The field-map {field!r} doesn't specify a name for the new field.")
if old_name in seen_old:
raise AugurError(f"Asked to rename field {old_name!r} multiple times.")
if new_name in seen_new:
raise AugurError(f"Asked to rename multiple fields to {new_name!r}.")
seen_old.add(old_name)
seen_new.add(new_name)

if old_name == new_name:
continue

field_map[old_name] = new_name
field_map.append((old_name, new_name))
return field_map


def transform_columns(existing_fields: List[str], field_map: Dict[str,str], force: bool) -> List[Tuple[str,str]]:
def transform_columns(existing_fields: List[str], field_map: List[Tuple[str,str]], force: bool) -> List[Tuple[str,str]]:
"""
Calculate the mapping of old column names to new column names
"""
# check that all columns to be renamed exist
for name in list(field_map.keys()):
if name not in existing_fields:
print_err(f"WARNING: Asked to rename field {name!r} (to {field_map[name]!r}) but it doesn't exist in the input data.")
del field_map[name]
for idx,names in enumerate(field_map[:]):
old_name, new_name = names
if old_name not in existing_fields:
print_err(f"WARNING: Asked to rename field {old_name!r} (to {new_name!r}) but it doesn't exist in the input data.")
field_map.pop(idx)

# iterate through field_map and remove rename requests if they would drop an existing column
# doing this ahead-of-time allows us to preserve the order of fields using a simpler implementation
if not force:
for old_field, new_field in list(field_map.items()):
if new_field in existing_fields:
for idx, fields in enumerate(field_map[:]):
old_field, new_field = fields
if new_field in existing_fields and new_field!=old_field:
print_err(
f"WARNING: skipping rename of {old_field} because record",
f"already has a field named {new_field}."
)
del field_map[old_field]
field_map.pop(idx)

names_to_change, new_names = set([f[0] for f in field_map]), set([f[1] for f in field_map])

m = []
for field in existing_fields:
if field in field_map:
m.append((field, field_map[field]))
elif field in field_map.values():
if field in names_to_change:
m += [(field,new_field) for old_field, new_field in field_map if old_field==field]
elif field in new_names:
pass # another column is renamed to this name, so we drop it
else:
m.append((field, field)) # no change to field name
Expand Down
24 changes: 24 additions & 0 deletions tests/functional/curate/cram/rename/duplicate.t
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
Setup

$ export AUGUR="${AUGUR:-$TESTDIR/../../../../../bin/augur}"

$ cat >records.ndjson <<~~
> {"accession": "record_1", "country": "country_1"}
> {"accession": "record_2", "country": "country_2"}
> ~~


Asking to rename the same column multiple times results in duplication of the column.
The new columns take the position of the original column, and their order
matches the order of the entries in the field-map.

$ $AUGUR curate rename --field-map "accession=id" "accession=genbank_accession" < <(cat records.ndjson)
{"id": "record_1", "genbank_accession": "record_1", "country": "country_1"}
{"id": "record_2", "genbank_accession": "record_2", "country": "country_2"}


Mapping a column to its own name (here "accession=accession") keeps the original column alongside the renamed copy.

$ $AUGUR curate rename --field-map "accession=id" "accession=accession" < <(cat records.ndjson)
{"id": "record_1", "accession": "record_1", "country": "country_1"}
{"id": "record_2", "accession": "record_2", "country": "country_2"}

0 comments on commit 0930b9e

Please sign in to comment.