Explicitly specify a file encoding of UTF-8 everywhere
Augur mostly assumes a default file encoding of UTF-8, but this is only
true on systems where the system default or the active locale uses
UTF-8.  On systems which use the POSIX "C" locale, for example, Python's
default file encoding is ASCII, which can cause encoding failures like
the one observed with `augur traits` in #559.  UTF-8 is a near-universal
standard for text encodings these days.
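
For illustration, a minimal sketch of the failure mode (the file name
is hypothetical; assumes a non-UTF-8 locale such as LC_ALL=C):

    import locale

    # Under the POSIX "C" locale this prints ANSI_X3.4-1968 (i.e. ASCII),
    # so open() without an explicit encoding decodes files as ASCII and
    # raises UnicodeDecodeError on the first non-ASCII byte.
    print(locale.getpreferredencoding(False))

    # Passing the encoding explicitly makes the result locale-independent:
    with open("traits.tsv", "w", encoding="utf-8") as fh:
        fh.write("region\tCôte d'Ivoire\n")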

Note that Python 3.7 includes PEP-0538 and PEP-0540 to help address the
difference between this common assumption and the reality of default
encodings, but a) they do not allow application code to reliably avoid
specifying encodings and b) Augur supports 3.6 anyway.
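
As a hedged illustration of why the new UTF-8 mode cannot be relied
upon (sys.flags.utf8_mode only exists on Python 3.7+, hence the
getattr guard):

    import locale
    import sys

    # PEP 540's UTF-8 mode (-X utf8 / PYTHONUTF8=1) forces the preferred
    # encoding to UTF-8 regardless of locale, but it is opt-in, so library
    # code cannot assume it is enabled.
    utf8_mode = getattr(sys.flags, "utf8_mode", 0)  # attribute absent on 3.6
    print("UTF-8 mode enabled:", bool(utf8_mode))
    print("preferred encoding:", locale.getpreferredencoding(False))
    # With the mode off (and always on 3.6), the default still tracks the
    # locale, so each open() call must pass encoding="utf-8" itself.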

Resolves #559.
tsibley committed May 29, 2020
1 parent 42a26ed commit 0e52323
Showing 13 changed files with 32 additions and 32 deletions.
augur/align.py (1 addition & 1 deletion)

@@ -294,7 +294,7 @@ def analyse_insertions(aln, ungapped, insertion_csv):
     for insertion_seq, strains in i_data.items():
         for strain in strains:
             strain_data[strain][idx] = insertion_seq
-    with open(insertion_csv, 'w') as fh:
+    with open(insertion_csv, 'w', encoding='utf-8') as fh:
         print(",".join(header), file=fh)
         for strain in strain_data:
             print("{},{}".format(strain, ",".join(strain_data[strain])), file=fh)
augur/distance.py (1 addition & 1 deletion)

@@ -166,7 +166,7 @@ def read_distance_map(map_file):
     [('default', 0.0), ('map', {'SigPep': {0: {('W', 'P'): -8.3}}})]
     """
     # Load the JSON.
-    with open(map_file, "r") as fh:
+    with open(map_file, "r", encoding='utf-8') as fh:
         json_distance_map = json.load(fh)

     # Confirm that all required fields are present.
augur/export_v2.py (1 addition & 1 deletion)

@@ -783,7 +783,7 @@ def set_description(data_json, cmd_line_description_file):
     `meta.description` in *data_json* to the text provided.
     """
     try:
-        with open(cmd_line_description_file) as description_file:
+        with open(cmd_line_description_file, encoding='utf-8') as description_file:
             markdown_text = description_file.read()
             data_json['meta']['description'] = markdown_text
     except FileNotFoundError:
augur/filter.py (5 additions & 5 deletions)

@@ -16,9 +16,9 @@
 def read_vcf(filename):
     if filename.lower().endswith(".gz"):
         import gzip
-        file = gzip.open(filename, mode="rt")
+        file = gzip.open(filename, mode="rt", encoding='utf-8')
     else:
-        file = open(filename)
+        file = open(filename, encoding='utf-8')

     chrom_line = next(line for line in file if line.startswith("#C"))
     file.close()

@@ -55,7 +55,7 @@ def write_vcf(input_filename, output_filename, dropped_samps):

 def read_priority_scores(fname):
     try:
-        with open(fname) as pfile:
+        with open(fname, encoding='utf-8') as pfile:
             return defaultdict(float, {
                 elems[0]: float(elems[1])
                 for elems in (line.strip().split() for line in pfile.readlines())

@@ -169,7 +169,7 @@ def run(args):
     num_excluded_by_name = 0
     if args.exclude:
         try:
-            with open(args.exclude, 'r') as ifile:
+            with open(args.exclude, 'r', encoding='utf-8') as ifile:
                 to_exclude = set()
                 for line in ifile:
                     if line[0] != comment_char:

@@ -326,7 +326,7 @@ def run(args):
     # Note that we are also not checking for existing meta data here
     num_included_by_name = 0
     if args.include and os.path.isfile(args.include):
-        with open(args.include, 'r') as ifile:
+        with open(args.include, 'r', encoding='utf-8') as ifile:
             to_include = set(
                 [
                     line.strip()
augur/frequencies.py (1 addition & 1 deletion)

@@ -82,7 +82,7 @@ def run(args):
     if args.method == "kde":
         # Load weights if they have been provided.
         if args.weights:
-            with open(args.weights, "r") as fh:
+            with open(args.weights, "r", encoding='utf-8') as fh:
                 weights = json.load(fh)

             weights_attribute = args.weights_attribute
augur/import_beast.py (1 addition & 1 deletion)

@@ -231,7 +231,7 @@ def parse_nexus(tree_path, treestring_regex=r'tree [A-Za-z\_]+([0-9]+)', verbose

     if isinstance(tree_path,str): ## determine if path or handle was provided to function
         try:
-            handle=open(tree_path,'r')
+            handle=open(tree_path,'r', encoding='utf-8')
         except FileNotFoundError:
             print("FATAL: No such file {}".format(tree_path))
             sys.exit(2)
augur/lbi.py (1 addition & 1 deletion)

@@ -94,7 +94,7 @@ def run(args):
     tree = Bio.Phylo.read(args.tree, "newick")

     # Load branch lengths.
-    with open(args.branch_lengths, "r") as json_fh:
+    with open(args.branch_lengths, "r", encoding='utf-8') as json_fh:
         branch_lengths = json.load(json_fh)

     # Annotate branch lengths and dates onto tree nodes.
augur/parse.py (1 addition & 1 deletion)

@@ -92,7 +92,7 @@ def run(args):
     strain_key = args.fields[0]

     # loop over sequences, parse fasta header of each sequence
-    with open(args.output_sequences, 'w') as output:
+    with open(args.output_sequences, 'w', encoding='utf-8') as output:
         for seq in seqs:
             fields = map(str.strip, seq.description.split(args.separator))
             tmp_meta = dict(zip(args.fields, fields))
augur/reconstruct_sequences.py (1 addition & 1 deletion)

@@ -73,7 +73,7 @@ def run(args):
     #if VCF, read in the reference seq for each gene, put on root
     if(is_vcf):
         node_data["nodes"][root_node]['aa_sequences'] = {}
-        with open(args.vcf_aa_reference) as handle:
+        with open(args.vcf_aa_reference, encoding='utf-8') as handle:
             for record in SeqIO.parse(handle, "fasta"):
                 if record.id==args.gene:
                     #'root' may not be same as 'reference', so apply any mutations at root here!
augur/traits.py (2 additions & 2 deletions)

@@ -145,7 +145,7 @@ def run(args):
     if args.weights:
         weight_dict = {c:{} for c in args.columns}
         sep = ',' if args.weights.endswith('csv') else '\t'
-        with open(args.weights, 'r') as fh:
+        with open(args.weights, 'r', encoding='utf-8') as fh:
             for line in fh:
                 if line[0]=='#':
                     continue

@@ -187,7 +187,7 @@ def run(args):
             models[column]['transition_matrix'] = [list(x) for x in gtr.W]

         if gtr:
-            with open(out_prefix+'%s.mugration_model.txt'%column, 'w') as ofile:
+            with open(out_prefix+'%s.mugration_model.txt'%column, 'w', encoding='utf-8') as ofile:
                 ofile.write('Map from character to field name\n')
                 for k,v in alphabet.items():
                     ofile.write(k+':\t'+str(v)+'\n')
augur/translate.py (1 addition & 1 deletion)

@@ -285,7 +285,7 @@ def assign_aa_fasta(tree, translations):
 def get_genes_from_file(fname):
     genes = []
     if os.path.isfile(fname):
-        with open(fname) as ifile:
+        with open(fname, encoding='utf-8') as ifile:
             for line in ifile:
                 fields = line.strip().split('#')
                 if fields[0].strip():
augur/tree.py (4 additions & 4 deletions)

@@ -136,13 +136,13 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
     aln_file    file name of input aligment
     out_file    file name to write tree to
     '''
-    with open(aln_file) as ifile:
+    with open(aln_file, encoding='utf-8') as ifile:
         tmp_seqs = ifile.readlines()

     # IQ-tree messes with taxon names. Hence remove offending characters, reinstaniate later
     tmp_aln_file = aln_file.replace(".fasta", "-delim.fasta")
     log_file = tmp_aln_file.replace(".fasta", ".iqtree.log")
-    with open(tmp_aln_file, 'w') as ofile:
+    with open(tmp_aln_file, 'w', encoding='utf-8') as ofile:
         for line in tmp_seqs:
             ofile.write(line.replace('/', '_X_X_').replace('|','_Y_Y_').replace("(","_X_Y_").replace(")","_Y_X_"))

@@ -256,7 +256,7 @@ def write_out_informative_fasta(compress_seq, alignment, stripFile=None):

     #If want a position map, print:
     if printPositionMap:
-        with open(fasta_file+".positions.txt", 'w') as the_file:
+        with open(fasta_file+".positions.txt", 'w', encoding='utf-8') as the_file:
             the_file.write("\n".join(pos))

     return fasta_file

@@ -294,7 +294,7 @@ def mask_sites_in_multiple_sequence_alignment(alignment_file, excluded_sites_fil
     # Write the masked alignment to disk one record at a time.
     alignment_file_path = Path(alignment_file)
     masked_alignment_file = str(alignment_file_path.parent / ("masked_%s" % alignment_file_path.name))
-    with open(masked_alignment_file, "w") as oh:
+    with open(masked_alignment_file, "w", encoding='utf-8') as oh:
         for record in alignment:
             # Convert to a mutable sequence to enable masking with Ns.
             sequence = record.seq.tomutable()
augur/utils.py (12 additions & 12 deletions)

@@ -26,10 +26,10 @@ def open_file(fname, mode):
         if "t" not in mode:
             # For interoperability, gzip needs to open files in "text" mode
             mode = mode + "t"
-        with gzip.open(fname, mode) as fh:
+        with gzip.open(fname, mode, encoding='utf-8') as fh:
             yield fh
     else:
-        with open(fname, mode) as fh:
+        with open(fname, mode, encoding='utf-8') as fh:
            yield fh

 def is_vcf(fname):

@@ -47,9 +47,9 @@ def is_vcf(fname):
 def myopen(fname, mode):
     if fname.endswith('.gz'):
         import gzip
-        return gzip.open(fname, mode)
+        return gzip.open(fname, mode, encoding='utf-8')
     else:
-        return open(fname, mode)
+        return open(fname, mode, encoding='utf-8')

 def get_json_name(args, default=None):

@@ -231,7 +231,7 @@ def read_node_data(fnames, tree=None):
     node_data = {"nodes": {}}
     for fname in fnames:
         if os.path.isfile(fname):
-            with open(fname) as jfile:
+            with open(fname, encoding='utf-8') as jfile:
                 tmp_data = json.load(jfile)
                 if tmp_data.get("annotations"):
                     try:

@@ -327,7 +327,7 @@ def write_json(data, file_name, indent=(None if os.environ.get("AUGUR_MINIFY_JSO
     if include_version:
         data["generated_by"] = {"program": "augur", "version": get_augur_version()}

-    with open(file_name, 'w') as handle:
+    with open(file_name, 'w', encoding='utf-8') as handle:
         json.dump(data, handle, indent=indent, sort_keys=True)

@@ -348,7 +348,7 @@ def load_features(reference, feature_names=None):
         return None
     limit_info = dict( gff_type = ['gene'] )

-    with open(reference) as in_handle:
+    with open(reference, encoding='utf-8') as in_handle:
         for rec in GFF.parse(in_handle, limit_info=limit_info):
             for feat in rec.features:
                 if feature_names is not None: #check both tags; user may have used either

@@ -468,7 +468,7 @@ def add_line(line):

     if overrides:
         if os.path.isfile(overrides):
-            with open(overrides) as fh:
+            with open(overrides, encoding='utf-8') as fh:
                 for line in fh:
                     add_line(line)
         else:

@@ -497,7 +497,7 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):

     #prepare the header of the VCF & write out
     header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames
-    with open(vcf_file_name, 'w') as the_file:
+    with open(vcf_file_name, 'w', encoding='utf-8') as the_file:
         the_file.write( "##fileformat=VCFv4.2\n"+
                         "##source=NextStrain_Protein_Translation\n"+
                         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")

@@ -552,10 +552,10 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
         vcfWrite.append("\t".join(output))

     #write it all out
-    with open(ref_file_name, 'w') as the_file:
+    with open(ref_file_name, 'w', encoding='utf-8') as the_file:
         the_file.write("\n".join(refWrite))

-    with open(vcf_file_name, 'a') as the_file:
+    with open(vcf_file_name, 'a', encoding='utf-8') as the_file:
         the_file.write("\n".join(vcfWrite))

     if vcf_file_name.lower().endswith('.gz'):

@@ -869,7 +869,7 @@ def read_mask_file(mask_file):
     Sorted list of unique zero-indexed sites
     """
     mask_sites = []
-    with open(mask_file) as mf:
+    with open(mask_file, encoding='utf-8') as mf:
         for idx, line in enumerate(l.strip() for l in mf.readlines()):
             if "\t" in line:
                 line = line.split("\t")[1]
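
For context, a hedged usage sketch of the two helpers patched above
(file names are hypothetical; `open_file` is used as a context manager
since it yields its handle):

    from augur.utils import myopen, open_file

    # Both helpers now decode text as UTF-8 regardless of the system
    # locale.  Note that gzip.open() accepts encoding= only in text mode;
    # open_file() appends "t" itself, while myopen() leaves the mode to
    # the caller, so gzipped paths should be opened with "rt"/"wt".
    with open_file("metadata.tsv.gz", "r") as fh:
        header = next(fh)

    fh = myopen("priorities.tsv", "rt")
    first_line = fh.readline()
    fh.close()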
