Skip to content

Commit

Permalink
Merge pull request pyfa-org#429 from Neugeniko/master
Browse files (browse the repository at this point in the history)
Added detection of Unicode BOM (Byte Order Mark) ...
  • Loading branch information
blitzmann committed Nov 15, 2015
2 parents b37aea4 + dde8b1f commit 20759d2
Showing 1 changed file with 27 additions and 12 deletions.
39 changes: 27 additions & 12 deletions service/fit.py
Original file line number Diff line number Diff line change
Expand Up @@ -854,19 +854,34 @@ def importFitFromFiles(self, paths, callback=None):
file = open(path, "r")
srcString = file.read()
codec_found = None
# If file had ANSI encoding, convert it to unicode using system
# default codepage, or use fallbacks UTF-16, then cp1252 on any
# encoding errors
# If the file had ANSI encoding, decode it to unicode: when a BOM
# header is detected, use the codec it indicates; when there is no
# header, try the default codepage, then fall back to utf-16, then cp1252.

if isinstance(srcString, str):
attempt_codecs = (defcodepage, "utf-16", "cp1252")
for page in attempt_codecs:
try:
srcString = unicode(srcString, page)
codec_found = page
except UnicodeDecodeError:
logger.warn("Error unicode decoding %s from page %s, trying next codec", path, page)
else:
break
encoding_map = (('\xef\xbb\xbf', 'utf-8'),('\xff\xfe\0\0', 'utf-32'),('\0\0\xfe\xff', 'UTF-32BE'),('\xff\xfe', 'utf-16'),('\xfe\xff', 'UTF-16BE'))
for bom, encoding in encoding_map:
if srcString.startswith(bom):
codec_found = encoding
savebom = bom

if codec_found is None:
logger.warn("Unicode BOM not found in file %s.", path)
attempt_codecs = (defcodepage, "utf-16", "cp1252")
for page in attempt_codecs:
try:
logger.warn("Attempting to decode file %s using %s page.", path, page)
srcString = unicode(srcString, page)
codec_found = page
logger.warn("File %s decoded using %s page.", path, page)
except UnicodeDecodeError:
logger.warn("Error unicode decoding %s from page %s, trying next codec", path, page)
else:
break
else:
logger.debug("Unicode BOM detected in %s, using %s page.", path, codec_found)
srcString = unicode(srcString[len(savebom):], codec_found)

else:
# nasty hack to detect other transparent utf-16 loading
if srcString[0] == '<' and 'utf-16' in srcString[:128].lower():
Expand Down

0 comments on commit 20759d2

Please sign in to comment.