Skip to content

Commit

Permalink
[ingest] fix metadata conflicts across segments
Browse files Browse the repository at this point in the history
Mismatched field values across segments (e.g. segments disagree on the
'date') are now resolved by choosing the most common value, with the
intention that they are resolved upstream, as is done in this commit.

This approach is the third implementation. Initially I resolved
disagreements within `group_segments.py` via a provided resolutions
YAML. After discussion with @joverlee521 we decided this could be better
implemented via `augur curate`, and the original implementation here did
this _after_ the segment grouping; however, this made it impossible to
distinguish disagreements which will be fixed from those which won't.¹

¹ <#18 (comment)>
  • Loading branch information
jameshadfield committed Oct 9, 2024
1 parent 148d76d commit 8a178b8
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 15 deletions.
17 changes: 17 additions & 0 deletions ingest/defaults/annotations.tsv
Original file line number Diff line number Diff line change
Expand Up @@ -13,3 +13,20 @@ PP952118 date 2024-06-11
PP952117 region North America # strain IRCCS-SCDC_1/2024 from traveler, S segment
PP952117 country Cuba # https://www.ncbi.nlm.nih.gov/pmc/articles/PMC11212459/
PP952117 date 2024-06-11

# Strain 'H498913', 'date' had 2 observed values: HQ830423, HQ830388: 1988-XX-XX; HQ830457: 1990-XX-XX
HQ830457 date 1988-XX-XX

# When grouped by strain these segments have similar (but different) authors - we change them to the most complete author list
PP477303 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477315 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477304 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477316 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477305 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477317 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477306 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477318 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477307 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477319 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477308 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
PP477320 authors Limonta,D.,Peres-Restrepo,L.S.,Ciouderis,K.,Hernandez-Ortiz,J.P.,Osorio,J.R.,Perez,L.J.,Perez-Restrepo,L.S.,Ciuoderis,K.,Usuga,J.,Moreno,I.,Vargas,V.,Arevalo-Arbelaez,A.J.,Berg,M.G.,Cloherty,G.A.,Osorio,J.E.
34 changes: 19 additions & 15 deletions ingest/scripts/group_segments.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,9 +13,9 @@
keys "strain", "accession" and "segment" informing the program which accession (of
multiple) to use.
Any disagreement within the "--common-strain-fields" will result in the strain being
dropped, however empty values may be replaced and ambiguous dates may be replaced with
specific ones (where appropriate).
Any disagreement within the "--common-strain-fields" will result in us picking the most
common value (across the segments) and a loud warning. The intention is for any such
disagreements to be resolved upstream in an `augur curate` chain or similar.
"""

import argparse
Expand Down Expand Up @@ -151,6 +151,11 @@ class HeaderInfo(TypedDict):


def pick_from_values(strain_name:str, field_name:str, rows:list, accession_key:str, allow_empty=True)->str:
"""
If there's only one valid value (across the provided rows) we return it.
If there are multiple values we return the most common and print a warning instructing the
user to manually fix this up as appropriate.
"""
values = set(row[field_name] for row in rows)
if allow_empty and "" in values and len(values)!=1:
values.remove("")
Expand All @@ -162,14 +167,20 @@ def pick_from_values(strain_name:str, field_name:str, rows:list, accession_key:s
# continue, and use the error message printing below
pass

# want to print out helpful messages about disagreement, so order by most commonly observed
# want to print out helpful messages about disagreement, but we will return a value
# so that the sample is not excluded from the ingested data. The messages / warnings
# should be used to correct the metadata upstream of this script.
obs = defaultdict(list)
for row in rows:
obs[row[field_name]].append(row[accession_key])
msg = f"Strain '{strain_name}' Disagreement for '{field_name}', {len(obs)} observed values:"
for v,acc in sorted(obs.items(), key=lambda item: item[1], reverse=True):
obs_sorted = sorted(obs.items(), key=lambda item: len(item[1]), reverse=True)
value_to_use = obs_sorted[0][0]
msg = f"WARNING: Strain '{strain_name}', '{field_name}' had {len(obs)} observed values:"
for v,acc in obs_sorted:
msg+=f"\n\t{', '.join(acc)}: {v}"
raise ValueMatchingError(msg)
msg += f"\n\tWe've picked {value_to_use} however you may wish to fix this metadata yourself."
log(msg)
return value_to_use
return values.pop()


Expand All @@ -192,15 +203,8 @@ def make_wide(strain: str, rows: list, segment_names: list[str], header_info:Hea
"n_segments": str(len(segments.keys())),
}

observed_mismatches = False
for field_name in header_info['common']:
try:
metadata[field_name] = pick_from_values(strain, field_name, list(segments.values()), accession_key)
except ValueMatchingError as e:
log(e)
observed_mismatches = True
if observed_mismatches:
return None
metadata[field_name] = pick_from_values(strain, field_name, list(segments.values()), accession_key)

for info in header_info['segment_specific']:
if info['segment'] not in segments:
Expand Down

0 comments on commit 8a178b8

Please sign in to comment.