From 33fab73667691d38a2ebe3b122014fe48c8fece2 Mon Sep 17 00:00:00 2001
From: Jennifer Chang
Date: Wed, 13 Sep 2023 13:55:57 -0700
Subject: [PATCH] Copy join-metadata-and-clades.py from monkeypox

---
Note: the script below differs from a verbatim copy (docstrings added, the
seqName regex written as a raw string); the blob id on the index line was
not recomputed after those edits.

 join-metadata-and-clades.py                   | 95 +++++++++++++++++++
 .../join-metadata-and-clades.t                | 25 +++++
 2 files changed, 120 insertions(+)
 create mode 100755 join-metadata-and-clades.py
 create mode 100644 tests/join-metadata-and-clades/join-metadata-and-clades.t

diff --git a/join-metadata-and-clades.py b/join-metadata-and-clades.py
new file mode 100755
index 0000000..99ed732
--- /dev/null
+++ b/join-metadata-and-clades.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""Join a metadata TSV with selected columns of a Nextclade TSV.
+
+Rows are matched on the metadata ``--id-field`` column and the Nextclade
+``seqName`` column; the result is written as a tab-separated file.
+"""
+import argparse
+import re
+import sys
+import pandas as pd
+
+NEXTCLADE_JOIN_COLUMN_NAME = 'seqName'
+VALUE_MISSING_DATA = '?'
+
+# Nextclade column name -> column name used in the joined output.
+column_map = {
+    "clade": "clade",
+    "outbreak": "outbreak",
+    "lineage": "lineage",
+    "coverage": "coverage",
+    "totalMissing": "missing_data",
+    "totalSubstitutions": "divergence",
+    "totalNonACGTNs": "nonACGTN",
+    "qc.missingData.status": "QC_missing_data",
+    "qc.mixedSites.status": "QC_mixed_sites",
+    "qc.privateMutations.status": "QC_rare_mutations",
+    "qc.frameShifts.status": "QC_frame_shifts",
+    "qc.stopCodons.status": "QC_stop_codons",
+    "frameShifts": "frame_shifts",
+    "isReverseComplement": "is_reverse_complement",
+#    "deletions": "deletions",
+#    "insertions": "insertions"
+#    "substitutions": "substitutions",
+#    "aaSubstitutions": "aaSubstitutions"
+}
+
+
+def parse_args():
+    """Parse the command line.
+
+    Returns an ``argparse.Namespace`` with ``metadata``, ``nextclade``,
+    ``id_field`` and ``o`` (output target; defaults to ``sys.stdout``).
+    """
+    parser = argparse.ArgumentParser(
+        description="Joins metadata file with Nextclade clade output",
+    )
+    parser.add_argument("--metadata")
+    parser.add_argument("--nextclade")
+    parser.add_argument("--id-field")
+    parser.add_argument("-o", default=sys.stdout)
+    return parser.parse_args()
+
+def main():
+    """Left-join the Nextclade columns onto the metadata and write a TSV.
+
+    The last two metadata columns are held back during the merge and
+    re-appended afterwards, so they stay the last columns of the output.
+    """
+    args = parse_args()
+
+    metadata = pd.read_csv(args.metadata, index_col=args.id_field,
+                           sep='\t', low_memory=False, na_filter = False)
+
+    # Read and rename clade column to be more descriptive
+    clades = pd.read_csv(args.nextclade, index_col=NEXTCLADE_JOIN_COLUMN_NAME,
+                         sep='\t', low_memory=False, na_filter = False) \
+            .rename(columns=column_map)
+
+    # Keep only the part of seqName before the first " |".  Raw string:
+    # "\|" is not a valid escape sequence in a normal string literal.
+    clades.index = clades.index.map(lambda x: re.sub(r" \|.*", "", x))
+
+    # Select columns in column map
+    clades = clades[list(column_map.values())]
+
+    # Separate long from short columns
+    short_metadata = metadata.iloc[:,:-2].copy()
+    long_metadata = metadata.iloc[:,-2:].copy()
+
+    # Concatenate on columns
+    result = pd.merge(
+        short_metadata, clades,
+        left_index=True,
+        right_index=True,
+        how='left'
+    )
+
+    # Add long columns to back
+    result = pd.concat([result, long_metadata], axis=1)
+
+    result.to_csv(args.o, index_label=args.id_field, sep='\t')
+
+
+if __name__ == '__main__':
+    main()
diff --git a/tests/join-metadata-and-clades/join-metadata-and-clades.t b/tests/join-metadata-and-clades/join-metadata-and-clades.t
new file mode 100644
index 0000000..beae563
--- /dev/null
+++ b/tests/join-metadata-and-clades/join-metadata-and-clades.t
@@ -0,0 +1,25 @@
+Join pathogen metadata pulled from an external database with nextclade clade calls into one final metadata file.
+
+Create metadata file for testing.
+
+  $ cat > metadata_raw.tsv <<~~
+  > strain	date
+  > id_1	2023-01-01
+  > id_2	2023-02-02
+  > ~~
+
+Create nextclade file for testing.
+
+  $ cat > nextclade.tsv <<~~
+  > seqName	clade	outbreak	lineage	coverage	totalMissing	totalSubstitutions	totalNonACGTNs	qc.missingData.status	qc.mixedSites.status	qc.privateMutations.status	qc.frameShifts.status	qc.stopCodons.status	frameShifts	isReverseComplement
+  > id_1	val_1	val_1	val_1	val_1	val_1	val_1	val_1	val_1	val_1	val_1	val_1	val_1	val_1	val_1
+  > id_2	val_2	val_2	val_2	val_2	val_2	val_2	val_2	val_2	val_2	val_2	val_2	val_2	val_2	val_2
+  > ~~
+
+Check whether the join-metadata-and-clades.py script produces an output metadata file, but do not assess the accuracy or validity of that output file.
+
+  $ python $TESTDIR/../../join-metadata-and-clades.py \
+  >   --id-field strain \
+  >   --metadata metadata_raw.tsv \
+  >   --nextclade nextclade.tsv \
+  >   -o test_metadata.tsv
\ No newline at end of file