Explicitly specify a file encoding of UTF-8 everywhere
Augur mostly assumes a default file encoding of UTF-8, but this is only
true on systems where the system default or the active locale uses
UTF-8.  On systems which use the POSIX "C" locale, for example, Python's
default file encoding is ASCII, which can cause encoding failures like
the one observed with `augur traits` in #559.  UTF-8 is a near-universal
standard for text encodings these days.
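
For illustration, a minimal sketch of the failure mode (the file name
is hypothetical; assumes a non-UTF-8 locale such as LC_ALL=C):

    import locale

    # Under the POSIX "C" locale this prints ANSI_X3.4-1968 (i.e. ASCII),
    # so open() without an explicit encoding decodes files as ASCII and
    # raises UnicodeDecodeError on the first non-ASCII byte.
    print(locale.getpreferredencoding(False))

    # Passing the encoding explicitly makes the result locale-independent:
    with open("traits.tsv", "w", encoding="utf-8") as fh:
        fh.write("region\tCôte d'Ivoire\n")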

Note that Python 3.7 includes PEP-0538 and PEP-0540 to help address the
difference between this common assumption and the reality of default
encodings, but a) they do not allow application code to reliably avoid
specifying encodings and b) Augur supports 3.6 anyway.
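
As a hedged illustration of why the new UTF-8 mode cannot be relied
upon (sys.flags.utf8_mode only exists on Python 3.7+, hence the
getattr guard):

    import locale
    import sys

    # PEP 540's UTF-8 mode (-X utf8 / PYTHONUTF8=1) forces the preferred
    # encoding to UTF-8 regardless of locale, but it is opt-in, so library
    # code cannot assume it is enabled.
    utf8_mode = getattr(sys.flags, "utf8_mode", 0)  # attribute absent on 3.6
    print("UTF-8 mode enabled:", bool(utf8_mode))
    print("preferred encoding:", locale.getpreferredencoding(False))
    # With the mode off (and always on 3.6), the default still tracks the
    # locale, so each open() call must pass encoding="utf-8" itself.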

Resolves #559.
tsibley committed May 29, 2020
1 parent 42a26ed commit 0e52323
Showing 13 changed files with 32 additions and 32 deletions.
augur/align.py (1 addition & 1 deletion)

@@ -294,7 +294,7 @@ def analyse_insertions(aln, ungapped, insertion_csv):
     for insertion_seq, strains in i_data.items():
         for strain in strains:
             strain_data[strain][idx] = insertion_seq
-    with open(insertion_csv, 'w') as fh:
+    with open(insertion_csv, 'w', encoding='utf-8') as fh:
         print(",".join(header), file=fh)
         for strain in strain_data:
             print("{},{}".format(strain, ",".join(strain_data[strain])), file=fh)
augur/distance.py (1 addition & 1 deletion)

@@ -166,7 +166,7 @@ def read_distance_map(map_file):
     [('default', 0.0), ('map', {'SigPep': {0: {('W', 'P'): -8.3}}})]
     """
     # Load the JSON.
-    with open(map_file, "r") as fh:
+    with open(map_file, "r", encoding='utf-8') as fh:
         json_distance_map = json.load(fh)

     # Confirm that all required fields are present.
augur/export_v2.py (1 addition & 1 deletion)

@@ -783,7 +783,7 @@ def set_description(data_json, cmd_line_description_file):
     `meta.description` in *data_json* to the text provided.
     """
     try:
-        with open(cmd_line_description_file) as description_file:
+        with open(cmd_line_description_file, encoding='utf-8') as description_file:
             markdown_text = description_file.read()
             data_json['meta']['description'] = markdown_text
     except FileNotFoundError:
augur/filter.py (5 additions & 5 deletions)

@@ -16,9 +16,9 @@
 def read_vcf(filename):
     if filename.lower().endswith(".gz"):
         import gzip
-        file = gzip.open(filename, mode="rt")
+        file = gzip.open(filename, mode="rt", encoding='utf-8')
     else:
-        file = open(filename)
+        file = open(filename, encoding='utf-8')

     chrom_line = next(line for line in file if line.startswith("#C"))
     file.close()

@@ -55,7 +55,7 @@ def write_vcf(input_filename, output_filename, dropped_samps):

 def read_priority_scores(fname):
     try:
-        with open(fname) as pfile:
+        with open(fname, encoding='utf-8') as pfile:
             return defaultdict(float, {
                 elems[0]: float(elems[1])
                 for elems in (line.strip().split() for line in pfile.readlines())

@@ -169,7 +169,7 @@ def run(args):
     num_excluded_by_name = 0
     if args.exclude:
         try:
-            with open(args.exclude, 'r') as ifile:
+            with open(args.exclude, 'r', encoding='utf-8') as ifile:
                 to_exclude = set()
                 for line in ifile:
                     if line[0] != comment_char:

@@ -326,7 +326,7 @@ def run(args):
     # Note that we are also not checking for existing meta data here
     num_included_by_name = 0
     if args.include and os.path.isfile(args.include):
-        with open(args.include, 'r') as ifile:
+        with open(args.include, 'r', encoding='utf-8') as ifile:
             to_include = set(
                 [
                     line.strip()
augur/frequencies.py (1 addition & 1 deletion)

@@ -82,7 +82,7 @@ def run(args):
     if args.method == "kde":
         # Load weights if they have been provided.
         if args.weights:
-            with open(args.weights, "r") as fh:
+            with open(args.weights, "r", encoding='utf-8') as fh:
                 weights = json.load(fh)

             weights_attribute = args.weights_attribute
augur/import_beast.py (1 addition & 1 deletion)

@@ -231,7 +231,7 @@ def parse_nexus(tree_path, treestring_regex=r'tree [A-Za-z\_]+([0-9]+)', verbose

     if isinstance(tree_path,str): ## determine if path or handle was provided to function
         try:
-            handle=open(tree_path,'r')
+            handle=open(tree_path,'r', encoding='utf-8')
         except FileNotFoundError:
             print("FATAL: No such file {}".format(tree_path))
             sys.exit(2)
augur/lbi.py (1 addition & 1 deletion)

@@ -94,7 +94,7 @@ def run(args):
     tree = Bio.Phylo.read(args.tree, "newick")

     # Load branch lengths.
-    with open(args.branch_lengths, "r") as json_fh:
+    with open(args.branch_lengths, "r", encoding='utf-8') as json_fh:
         branch_lengths = json.load(json_fh)

     # Annotate branch lengths and dates onto tree nodes.
augur/parse.py (1 addition & 1 deletion)

@@ -92,7 +92,7 @@ def run(args):
     strain_key = args.fields[0]

     # loop over sequences, parse fasta header of each sequence
-    with open(args.output_sequences, 'w') as output:
+    with open(args.output_sequences, 'w', encoding='utf-8') as output:
         for seq in seqs:
             fields = map(str.strip, seq.description.split(args.separator))
             tmp_meta = dict(zip(args.fields, fields))
augur/reconstruct_sequences.py (1 addition & 1 deletion)

@@ -73,7 +73,7 @@ def run(args):
     #if VCF, read in the reference seq for each gene, put on root
     if(is_vcf):
         node_data["nodes"][root_node]['aa_sequences'] = {}
-        with open(args.vcf_aa_reference) as handle:
+        with open(args.vcf_aa_reference, encoding='utf-8') as handle:
             for record in SeqIO.parse(handle, "fasta"):
                 if record.id==args.gene:
                     #'root' may not be same as 'reference', so apply any mutations at root here!
augur/traits.py (2 additions & 2 deletions)

@@ -145,7 +145,7 @@ def run(args):
     if args.weights:
         weight_dict = {c:{} for c in args.columns}
         sep = ',' if args.weights.endswith('csv') else '\t'
-        with open(args.weights, 'r') as fh:
+        with open(args.weights, 'r', encoding='utf-8') as fh:
             for line in fh:
                 if line[0]=='#':
                     continue

@@ -187,7 +187,7 @@ def run(args):
             models[column]['transition_matrix'] = [list(x) for x in gtr.W]

         if gtr:
-            with open(out_prefix+'%s.mugration_model.txt'%column, 'w') as ofile:
+            with open(out_prefix+'%s.mugration_model.txt'%column, 'w', encoding='utf-8') as ofile:
                 ofile.write('Map from character to field name\n')
                 for k,v in alphabet.items():
                     ofile.write(k+':\t'+str(v)+'\n')
augur/translate.py (1 addition & 1 deletion)

@@ -285,7 +285,7 @@ def assign_aa_fasta(tree, translations):
 def get_genes_from_file(fname):
     genes = []
     if os.path.isfile(fname):
-        with open(fname) as ifile:
+        with open(fname, encoding='utf-8') as ifile:
             for line in ifile:
                 fields = line.strip().split('#')
                 if fields[0].strip():
augur/tree.py (4 additions & 4 deletions)

@@ -136,13 +136,13 @@ def build_iqtree(aln_file, out_file, substitution_model="GTR", clean_up=True, nt
     aln_file    file name of input aligment
     out_file    file name to write tree to
     '''
-    with open(aln_file) as ifile:
+    with open(aln_file, encoding='utf-8') as ifile:
         tmp_seqs = ifile.readlines()

     # IQ-tree messes with taxon names. Hence remove offending characters, reinstaniate later
     tmp_aln_file = aln_file.replace(".fasta", "-delim.fasta")
     log_file = tmp_aln_file.replace(".fasta", ".iqtree.log")
-    with open(tmp_aln_file, 'w') as ofile:
+    with open(tmp_aln_file, 'w', encoding='utf-8') as ofile:
         for line in tmp_seqs:
             ofile.write(line.replace('/', '_X_X_').replace('|','_Y_Y_').replace("(","_X_Y_").replace(")","_Y_X_"))

@@ -256,7 +256,7 @@ def write_out_informative_fasta(compress_seq, alignment, stripFile=None):

     #If want a position map, print:
     if printPositionMap:
-        with open(fasta_file+".positions.txt", 'w') as the_file:
+        with open(fasta_file+".positions.txt", 'w', encoding='utf-8') as the_file:
             the_file.write("\n".join(pos))

     return fasta_file

@@ -294,7 +294,7 @@ def mask_sites_in_multiple_sequence_alignment(alignment_file, excluded_sites_fil
     # Write the masked alignment to disk one record at a time.
     alignment_file_path = Path(alignment_file)
     masked_alignment_file = str(alignment_file_path.parent / ("masked_%s" % alignment_file_path.name))
-    with open(masked_alignment_file, "w") as oh:
+    with open(masked_alignment_file, "w", encoding='utf-8') as oh:
         for record in alignment:
             # Convert to a mutable sequence to enable masking with Ns.
             sequence = record.seq.tomutable()
augur/utils.py (12 additions & 12 deletions)

@@ -26,10 +26,10 @@ def open_file(fname, mode):
         if "t" not in mode:
             # For interoperability, gzip needs to open files in "text" mode
             mode = mode + "t"
-        with gzip.open(fname, mode) as fh:
+        with gzip.open(fname, mode, encoding='utf-8') as fh:
             yield fh
     else:
-        with open(fname, mode) as fh:
+        with open(fname, mode, encoding='utf-8') as fh:
            yield fh

 def is_vcf(fname):

@@ -47,9 +47,9 @@ def is_vcf(fname):
 def myopen(fname, mode):
     if fname.endswith('.gz'):
         import gzip
-        return gzip.open(fname, mode)
+        return gzip.open(fname, mode, encoding='utf-8')
     else:
-        return open(fname, mode)
+        return open(fname, mode, encoding='utf-8')

 def get_json_name(args, default=None):

@@ -231,7 +231,7 @@ def read_node_data(fnames, tree=None):
     node_data = {"nodes": {}}
     for fname in fnames:
         if os.path.isfile(fname):
-            with open(fname) as jfile:
+            with open(fname, encoding='utf-8') as jfile:
                 tmp_data = json.load(jfile)
                 if tmp_data.get("annotations"):
                     try:

@@ -327,7 +327,7 @@ def write_json(data, file_name, indent=(None if os.environ.get("AUGUR_MINIFY_JSO
     if include_version:
         data["generated_by"] = {"program": "augur", "version": get_augur_version()}

-    with open(file_name, 'w') as handle:
+    with open(file_name, 'w', encoding='utf-8') as handle:
         json.dump(data, handle, indent=indent, sort_keys=True)

@@ -348,7 +348,7 @@ def load_features(reference, feature_names=None):
         return None
     limit_info = dict( gff_type = ['gene'] )

-    with open(reference) as in_handle:
+    with open(reference, encoding='utf-8') as in_handle:
         for rec in GFF.parse(in_handle, limit_info=limit_info):
             for feat in rec.features:
                 if feature_names is not None: #check both tags; user may have used either

@@ -468,7 +468,7 @@ def add_line(line):

     if overrides:
         if os.path.isfile(overrides):
-            with open(overrides) as fh:
+            with open(overrides, encoding='utf-8') as fh:
                 for line in fh:
                     add_line(line)
         else:

@@ -497,7 +497,7 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):

     #prepare the header of the VCF & write out
     header=["#CHROM","POS","ID","REF","ALT","QUAL","FILTER","INFO","FORMAT"]+seqNames
-    with open(vcf_file_name, 'w') as the_file:
+    with open(vcf_file_name, 'w', encoding='utf-8') as the_file:
         the_file.write( "##fileformat=VCFv4.2\n"+
                         "##source=NextStrain_Protein_Translation\n"+
                         "##FORMAT=<ID=GT,Number=1,Type=String,Description=\"Genotype\">\n")

@@ -552,10 +552,10 @@ def write_VCF_translation(prot_dict, vcf_file_name, ref_file_name):
         vcfWrite.append("\t".join(output))

     #write it all out
-    with open(ref_file_name, 'w') as the_file:
+    with open(ref_file_name, 'w', encoding='utf-8') as the_file:
         the_file.write("\n".join(refWrite))

-    with open(vcf_file_name, 'a') as the_file:
+    with open(vcf_file_name, 'a', encoding='utf-8') as the_file:
         the_file.write("\n".join(vcfWrite))

     if vcf_file_name.lower().endswith('.gz'):

@@ -869,7 +869,7 @@ def read_mask_file(mask_file):
     Sorted list of unique zero-indexed sites
     """
     mask_sites = []
-    with open(mask_file) as mf:
+    with open(mask_file, encoding='utf-8') as mf:
         for idx, line in enumerate(l.strip() for l in mf.readlines()):
             if "\t" in line:
                 line = line.split("\t")[1]
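
For context, a hedged usage sketch of the two helpers patched above
(file names are hypothetical; `open_file` is used as a context manager
since it yields its handle):

    from augur.utils import myopen, open_file

    # Both helpers now decode text as UTF-8 regardless of the system
    # locale.  Note that gzip.open() accepts encoding= only in text mode;
    # open_file() appends "t" itself, while myopen() leaves the mode to
    # the caller, so gzipped paths should be opened with "rt"/"wt".
    with open_file("metadata.tsv.gz", "r") as fh:
        header = next(fh)

    fh = myopen("priorities.tsv", "rt")
    first_line = fh.readline()
    fh.close()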
