Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Handling encodings correctly #900

Closed
wants to merge 28 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
28 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 13 additions & 0 deletions tests/test_cmdline.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
# coding: utf-8
import sys
import subprocess
import virtualenv
Expand All @@ -22,6 +23,18 @@ def test_commandline_explicit_interp(tmpdir):
str(tmpdir.join('venv'))
])

def test_commandline_non_ascii_path(tmpdir):
target_path = str(tmpdir.join('vénv'))
if sys.version_info[0] == 2:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why not use a non-ASCII character that's present in the default code page on Appveyor, rather than messing about changing the locale of the system? Something like an accented character (é) should be available in the default codepage.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Some problems exist in multi-byte locales only. See the updated test case in 57a43e8.

target_path = target_path.decode('utf-8').encode(sys.getfilesystemencoding())

subprocess.check_call([
sys.executable,
VIRTUALENV_SCRIPT,
'-p', sys.executable,
target_path
])

# The registry lookups to support the abbreviated "-p 3.5" form of specifying
# a Python interpreter on Windows don't seem to work with Python 3.5. The
# registry layout is not well documented, and it's not clear that the feature
Expand Down
48 changes: 36 additions & 12 deletions virtualenv.py
Original file line number Diff line number Diff line change
Expand Up @@ -356,11 +356,21 @@ def copyfile(src, dest, symlink=True):
logger.info('Copying to %s', dest)
copyfileordir(src, dest, symlink)

def writefile(dest, content, overwrite=True):
def get_system_encoding():
if is_win:
# .encode('mbcs') is broken on Windows. For example,
# u'\u00a3'.encode('mbcs') gives a wrong result in CP437 (en-US), and
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you mean a wrong result? u"\u00a3" (£ sign) is encoded to \x9c in CP437. Is that not what you get? Note that "mbcs" is the ANSI codepage, which is not the same code page as GetOEMCP.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

On other systems sys.getfilesystemencoding() works fine, while on Windows sys.getfilesystemencoding() returns 'mbcs', which is not what virtualenv needs. If this comment is confusing I'll change or remove it.

# u'\uffe1'.encode('mbcs') raises and error in CP950 (zh-TW)
import ctypes
return 'cp%d' % ctypes.windll.kernel32.GetOEMCP()
else:
return sys.getfilesystemencoding()

def writefile(dest, content, overwrite=True, use_system_locale=False):
if not os.path.exists(dest):
logger.info('Writing %s', dest)
with open(dest, 'wb') as f:
f.write(content.encode('utf-8'))
f.write(content.encode(get_system_encoding() if use_system_locale else 'utf-8'))
return
else:
with open(dest, 'rb') as f:
Expand Down Expand Up @@ -720,15 +730,15 @@ def call_subprocess(cmd, show_stdout=True,
remove_from_env=None, stdin=None):
cmd_parts = []
for part in cmd:
if len(part) > 45:
part = part[:20]+"..."+part[-20:]
if ' ' in part or '\n' in part or '"' in part or "'" in part:
part = '"%s"' % part.replace('"', '\\"')
if hasattr(part, 'decode'):
try:
part = part.decode(sys.getdefaultencoding())
except UnicodeDecodeError:
part = part.decode(sys.getfilesystemencoding())
if len(part) > 45:
part = part[:20]+"..."+part[-20:]
if ' ' in part or '\n' in part or '"' in part or "'" in part:
part = '"%s"' % part.replace('"', '\\"')
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why did you move this to after the decode step?

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For example, before my changes, if part is: b'/some/very/long/path/to/break/existing/codes/\xe5\x83\x85\xe6\x98\xaf\xe4\xb8\xad\xe6\x96\x87/bin/python', part[:20]+"..."+part[-20:] gives invalid UTF-8 sequences. As a result ellipsis insertion should occur after decoding.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, OK. Good catch. Thanks for the explanation.

cmd_parts.append(part)
cmd_desc = ' '.join(cmd_parts)
if show_stdout:
Expand Down Expand Up @@ -1285,6 +1295,15 @@ def install_python(home_dir, lib_dir, inc_dir, bin_dir, site_packages, clear, sy
raise SystemExit(3)
logger.info('Copying lib_pypy')
copyfile(d, os.path.join(home_dir, 'lib_pypy'), symlink)
else:
# On Unix-like systems, if PyPy is built with --shared, it
# requires libpypy-c.(so|dylib) to run. See
# _pypy_install_libs_after_virtualenv() in
# lib-python/(2.7|3)/subprocess.py of PyPy
for name in ['libpypy-c.so', 'libpypy-c.dylib']:
src = join(prefix, 'bin', name)
if os.path.exists(src):
copyfile(src, join(bin_dir, name), symlink)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems unrelated to encodings...

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As mentioned at #900 (comment):

If your code breaks on PyPy, the fixes should be in this PR, not in a separate one.

This is the (hopefully) correct fix that supersedes my previous dirty PyPy patches.

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Submitted as a dependency PR at #946


if os.path.splitext(os.path.basename(py_executable))[0] != expected_exe:
secondary_exe = os.path.join(os.path.dirname(py_executable),
Expand Down Expand Up @@ -1371,8 +1390,9 @@ def install_python(home_dir, lib_dir, inc_dir, bin_dir, site_packages, clear, sy
else:
copyfile(py_executable, full_pth, symlink)

cmd = [py_executable, '-c', 'import sys;out=sys.stdout;'
'getattr(out, "buffer", out).write(sys.prefix.encode("utf-8"))']
cmd = [py_executable, '-c', 'import sys;out=sys.stdout;prefix=sys.prefix;'
'prefix=prefix.decode(sys.getfilesystemencoding()) if sys.version_info[0]==2 else prefix;'
'getattr(out, "buffer", out).write(prefix.encode("utf-8"))']
logger.info('Testing executable with %s %s "%s"' % tuple(cmd))
try:
proc = subprocess.Popen(cmd,
Expand All @@ -1388,9 +1408,11 @@ def install_python(home_dir, lib_dir, inc_dir, bin_dir, site_packages, clear, sy

proc_stdout = proc_stdout.strip().decode("utf-8")
proc_stdout = os.path.normcase(os.path.abspath(proc_stdout))
norm_home_dir = os.path.normcase(os.path.abspath(home_dir))
if hasattr(norm_home_dir, 'decode'):
norm_home_dir = norm_home_dir.decode(sys.getfilesystemencoding())
if hasattr(home_dir, 'decode'):
unicode_home_dir = home_dir.decode(sys.getfilesystemencoding())
else:
unicode_home_dir = home_dir
norm_home_dir = os.path.normcase(os.path.abspath(unicode_home_dir))
if proc_stdout != norm_home_dir:
logger.fatal(
'ERROR: The executable %s is not functioning' % py_executable)
Expand Down Expand Up @@ -1441,6 +1463,8 @@ def install_activate(home_dir, bin_dir, prompt=None):
# Run-time conditional enables (basic) Cygwin compatibility
home_dir_sh = ("""$(if [ "$OSTYPE" "==" "cygwin" ]; then cygpath -u '%s'; else echo '%s'; fi;)""" %
(home_dir, home_dir_msys))
if majver == 2:
home_dir_sh = home_dir_sh.decode(sys.getfilesystemencoding())
files['activate'] = ACTIVATE_SH.replace('__VIRTUAL_ENV__', home_dir_sh)

else:
Expand All @@ -1467,7 +1491,7 @@ def install_files(home_dir, bin_dir, prompt, files):
content = content.replace('__VIRTUAL_ENV__', home_dir)
content = content.replace('__VIRTUAL_NAME__', vname)
content = content.replace('__BIN_NAME__', os.path.basename(bin_dir))
writefile(os.path.join(bin_dir, name), content)
writefile(os.path.join(bin_dir, name), content, use_system_locale=(name.endswith('.bat')))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm sorry, but getfilesystemencoding simply won't work for Windows BAT files. I understand your point that sys.stdout.encoding may not be available, and I don't actually know a way of getting the "OEM" code page if that's the case. But that's what you need to do - using getfilesystemencoding is just as broken (on for example a UK English system) as using UTF-8.


def install_python_config(home_dir, bin_dir, prompt=None):
if sys.platform == 'win32' or is_jython and os._name == 'nt':
Expand Down