From d580449bdfb985ff74d7e614b3bc2cfda481b033 Mon Sep 17 00:00:00 2001 From: Dan Ryan Date: Thu, 4 Apr 2019 00:20:45 -0400 Subject: [PATCH 1/8] Add coverage skips for things I didn't change Signed-off-by: Dan Ryan --- src/backports/os.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/backports/os.py b/src/backports/os.py index 060fe9e..cbd9541 100644 --- a/src/backports/os.py +++ b/src/backports/os.py @@ -42,9 +42,9 @@ def _invalid_utf8_indexes(bytes): # U+0080 - U+07FF - 11 bits c = (((c1 & 0x1F) << 6) | (c2 & 0x3F)) - if c < 0x80: + if c < 0x80: # pragma: no cover # Overlong encoding - skips.extend([i, i + 1]) + skips.extend([i, i + 1]) # pragma: no cover i += 2 continue c3 = bytes[i + 2] @@ -70,7 +70,7 @@ def _invalid_utf8_indexes(bytes): (c2 & 0x3F)) << 6) | (c3 & 0x3F)) << 6) | (c4 & 0x3F)) - if (c < 0x10000) or (c > 0x10FFFF): + if (c < 0x10000) or (c > 0x10FFFF): # pragma: no cover # Overlong encoding or invalid code point. skips.extend([i, i + 1, i + 2, i + 3]) i += 4 From 7b01a9712a3a734c7ebd251161f756371ac12267 Mon Sep 17 00:00:00 2001 From: Dan Ryan Date: Thu, 4 Apr 2019 00:37:05 -0400 Subject: [PATCH 2/8] Fix fsencode and fsdecode backports - Mirrors the new python 3.7 implementation - Taken from `vistir` (my other library) -> discussion over at https://github.com/sarugaku/vistir/pull/54 - Fixes #13 - Fixes #6 (I think?) 
Signed-off-by: Dan Ryan --- src/backports/os.py | 170 ++++++++++++++++++-------------------------- 1 file changed, 70 insertions(+), 100 deletions(-) diff --git a/src/backports/os.py b/src/backports/os.py index cbd9541..ca2d274 100644 --- a/src/backports/os.py +++ b/src/backports/os.py @@ -8,16 +8,35 @@ """ from __future__ import unicode_literals +from os import name as os_name import sys # XXX backport: unicode on Python 2 _str = unicode if sys.version_info < (3,) else str +string_types = basestring if sys.version_info[0] == 2 else str # XXX backport: Use backported surrogateescape for Python 2 # TODO backport: Find a way to do this without pulling in the entire future package? if sys.version_info < (3,): from future.utils.surrogateescape import register_surrogateescape register_surrogateescape() + _fs_encode_errors = "backslashreplace" + _fs_decode_errors = "replace" + _fs_encoding = "utf-8" +else: + _fs_encoding = "utf-8" + if os_name == "nt": + _fs_error_fn = None + alt_strategy = "surrogatepass" + else: + if sys.version_info >= (3, 3): + _fs_encoding = next(iter(enc for enc in [ + sys.getfilesystemencoding(), sys.getdefaultencoding() + ]), _fs_encoding) + alt_strategy = "surrogateescape" + _fs_error_fn = getattr(sys, "getfilesystemencodeerrors", None) + _fs_encode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy + _fs_decode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy # XXX backport: This invalid_utf8_indexes() helper is shamelessly copied from @@ -92,103 +111,54 @@ def _chunks(b, indexes): yield b[i:] -def _fscodec(): - encoding = sys.getfilesystemencoding() - if encoding == 'mbcs': - errors = 'strict' - else: - errors = 'surrogateescape' - - # XXX backport: Do we need to hack around Python 2's UTF-8 codec? - import codecs # Use codecs.lookup() for name normalisation. 
- _HACK_AROUND_PY2_UTF8 = (sys.version_info < (3,) and - codecs.lookup(encoding) == codecs.lookup('utf-8')) - # Do we need to hack around Python 2's ASCII codec error handler behaviour? - _HACK_AROUND_PY2_ASCII = (sys.version_info < (3,) and - codecs.lookup(encoding) == codecs.lookup('ascii')) - - # XXX backport: chr(octet) became bytes([octet]) - _byte = chr if sys.version_info < (3,) else lambda i: bytes([i]) - - def fsencode(filename): - """ - Encode filename to the filesystem encoding with 'surrogateescape' error - handler, return bytes unchanged. On Windows, use 'strict' error handler if - the file system encoding is 'mbcs' (which is the default encoding). - """ - if isinstance(filename, bytes): - return filename - elif isinstance(filename, _str): - if _HACK_AROUND_PY2_UTF8 or _HACK_AROUND_PY2_ASCII: - # XXX backport: Unlike Python 3, Python 2's UTF-8 codec does not - # consider surrogate codepoints invalid, so the surrogateescape - # error handler never gets invoked to encode them back into high - # bytes. - # - # This code hacks around that by manually encoding the surrogate - # codepoints to high bytes, without relying on surrogateescape. - # - # As a *separate* issue to the above, Python2's ASCII codec has - # a different problem: it correctly invokes the surrogateescape - # error handler, but then seems to do additional strict - # validation (?) on the interim surrogate-decoded Unicode buffer - # returned by surrogateescape, and then fails with a - # UnicodeEncodeError anyway. - # - # The fix for that happens to be the same (manual encoding), - # even though the two causes are quite different. 
- # - return b''.join( - (_byte(ord(c) - 0xDC00) if 0xDC00 <= ord(c) <= 0xDCFF else - c.encode(encoding)) - for c in filename) - else: - return filename.encode(encoding, errors) - else: - # XXX backport: unicode instead of str for Python 2 - raise TypeError("expect bytes or {_str}, not {}".format(type(filename).__name__, - _str=_str.__name__, )) - - def fsdecode(filename): - """ - Decode filename from the filesystem encoding with 'surrogateescape' error - handler, return str unchanged. On Windows, use 'strict' error handler if - the file system encoding is 'mbcs' (which is the default encoding). - """ - if isinstance(filename, _str): - return filename - elif isinstance(filename, bytes): - if _HACK_AROUND_PY2_UTF8: - # XXX backport: See the remarks in fsencode() above. - # - # This case is slightly trickier: Python 2 will invoke the - # surrogateescape error handler for most bad high byte - # sequences, *except* for full UTF-8 sequences that happen to - # decode to surrogate codepoints. - # - # For decoding, it's not trivial to sidestep the UTF-8 codec - # only for surrogates like fsencode() does, but as a hack we can - # split the input into separate chunks around each invalid byte, - # decode the chunks separately, and join the results. - # - # This prevents Python 2's UTF-8 codec from seeing the encoded - # surrogate sequences as valid, which lets surrogateescape take - # over and escape the individual bytes. - # - # TODO: Improve this. 
- # - from array import array - indexes = _invalid_utf8_indexes(array(str('B'), filename)) - return ''.join(chunk.decode(encoding, errors) - for chunk in _chunks(filename, indexes)) - else: - return filename.decode(encoding, errors) - else: - # XXX backport: unicode instead of str for Python 2 - raise TypeError("expect bytes or {_str}, not {}".format(type(filename).__name__, - _str=_str.__name__, )) - - return fsencode, fsdecode - -fsencode, fsdecode = _fscodec() -del _fscodec +def _get_path(path): + """ + Fetch the string value from a path-like object + + Returns **None** if there is no string value. + """ + + if isinstance(path, (string_types, bytes)): + return path + path_type = type(path) + try: + path_repr = path_type.__fspath__(path) + except AttributeError: + return + if isinstance(path_repr, (string_types, bytes)): + return path_repr + return + + +def fsencode(path): + """ + Encode a filesystem path to the proper filesystem encoding + + :param Union[str, bytes] path: A string-like path + :returns: A bytes-encoded filesystem path representation + """ + + path = _get_path(path) + if path is None: + raise TypeError("expected a valid path to encode") + if isinstance(path, _str): + path = path.encode(_fs_encoding, _fs_encode_errors) + return path + + +def fsdecode(path): + """ + Decode a filesystem path using the proper filesystem encoding + + :param path: The filesystem path to decode from bytes or string + :return: An appropriately decoded path + :rtype: str + """ + + path = _get_path(path) + if path is None: + raise TypeError("expected a valid path to decode") + binary_type = str if sys.version_info[0] == 2 else bytes + if isinstance(path, binary_type): + path = path.decode(_fs_encoding, _fs_decode_errors) + return path From 46e3d6b25028f4f898c042776c872effe37207e7 Mon Sep 17 00:00:00 2001 From: Dan Ryan Date: Thu, 4 Apr 2019 01:03:00 -0400 Subject: [PATCH 3/8] don't import os module Signed-off-by: Dan Ryan --- src/backports/os.py | 3 +-- 1 file changed, 1 
insertion(+), 2 deletions(-) diff --git a/src/backports/os.py b/src/backports/os.py index ca2d274..1403574 100644 --- a/src/backports/os.py +++ b/src/backports/os.py @@ -8,7 +8,6 @@ """ from __future__ import unicode_literals -from os import name as os_name import sys # XXX backport: unicode on Python 2 @@ -25,7 +24,7 @@ _fs_encoding = "utf-8" else: _fs_encoding = "utf-8" - if os_name == "nt": + if sys.platform.startswith("win"): _fs_error_fn = None alt_strategy = "surrogatepass" else: From e2787ddaf47be43359c5fe8a3657bc5cd0bff9d4 Mon Sep 17 00:00:00 2001 From: Dan Ryan Date: Thu, 4 Apr 2019 02:52:16 -0400 Subject: [PATCH 4/8] Add back the surrogate handling logic Signed-off-by: Dan Ryan --- src/backports/os.py | 194 +++++++++++++++++++++++++++++++------------- tests/test_extra.py | 6 +- 2 files changed, 141 insertions(+), 59 deletions(-) diff --git a/src/backports/os.py b/src/backports/os.py index 1403574..2af7679 100644 --- a/src/backports/os.py +++ b/src/backports/os.py @@ -8,35 +8,23 @@ """ from __future__ import unicode_literals +import abc import sys # XXX backport: unicode on Python 2 _str = unicode if sys.version_info < (3,) else str +# XXX backport: string and binary types differ between python 2 and 3 string_types = basestring if sys.version_info[0] == 2 else str +binary_type = str if sys.version_info[0] == 2 else bytes # XXX backport: Use backported surrogateescape for Python 2 # TODO backport: Find a way to do this without pulling in the entire future package? 
if sys.version_info < (3,): from future.utils.surrogateescape import register_surrogateescape register_surrogateescape() - _fs_encode_errors = "backslashreplace" - _fs_decode_errors = "replace" - _fs_encoding = "utf-8" -else: - _fs_encoding = "utf-8" - if sys.platform.startswith("win"): - _fs_error_fn = None - alt_strategy = "surrogatepass" - else: - if sys.version_info >= (3, 3): - _fs_encoding = next(iter(enc for enc in [ - sys.getfilesystemencoding(), sys.getdefaultencoding() - ]), _fs_encoding) - alt_strategy = "surrogateescape" - _fs_error_fn = getattr(sys, "getfilesystemencodeerrors", None) - _fs_encode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy - _fs_decode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy +# XXX This is a compatibility shim for the PathLike backport which gets us fspath access +ABC = abc.ABCMeta(str('ABC'), (object,), {'__slots__': ()}) # XXX backport: This invalid_utf8_indexes() helper is shamelessly copied from # Bob Ippolito's pyutf8 package (pyutf8/ref.py), in order to help support the @@ -110,54 +98,148 @@ def _chunks(b, indexes): yield b[i:] -def _get_path(path): +def fspath(path): """ Fetch the string value from a path-like object Returns **None** if there is no string value.
""" - if isinstance(path, (string_types, bytes)): + if isinstance(path, (string_types, binary_type)): return path path_type = type(path) + expect = "unicode" if sys.version_info[0] == 2 else "str" try: path_repr = path_type.__fspath__(path) except AttributeError: - return - if isinstance(path_repr, (string_types, bytes)): + if hasattr(path_type, '__fspath__'): + raise + else: + raise TypeError("expected {0}, bytes or os.PathLike object, " + "not ".format(expect) + path_type.__name__) + if isinstance(path_repr, (string_types, binary_type)): return path_repr - return - - -def fsencode(path): - """ - Encode a filesystem path to the proper filesystem encoding - - :param Union[str, bytes] path: A string-like path - :returns: A bytes-encoded filesystem path representation - """ - - path = _get_path(path) - if path is None: - raise TypeError("expected a valid path to encode") - if isinstance(path, _str): - path = path.encode(_fs_encoding, _fs_encode_errors) - return path - - -def fsdecode(path): - """ - Decode a filesystem path using the proper filesystem encoding - - :param path: The filesystem path to decode from bytes or string - :return: An appropriately decoded path - :rtype: str - """ - - path = _get_path(path) - if path is None: - raise TypeError("expected a valid path to decode") - binary_type = str if sys.version_info[0] == 2 else bytes - if isinstance(path, binary_type): - path = path.decode(_fs_encoding, _fs_decode_errors) - return path + else: + raise TypeError("expected {}.__fspath__() to return {} or bytes, " + "not {}".format(path_type.__name__, expect, + type(path_repr).__name__)) + + +def _fscodec(): + # XXX Backport: The following section attempts to use utf-8 encoders to + # roundtrip to the filesystem, and also attempts to force windows to use + # a "surrogate pass" error handling strategy to ignore the bad surrogate + # pairs sometimes generated by python 2 encoders + if sys.version_info[0] < 3: + _fs_encode_errors = "surrogateescape" + 
_fs_decode_errors = "surrogateescape" + _fs_encoding = "utf-8" + else: + _fs_encoding = "utf-8" + if sys.platform.startswith("win"): + _fs_error_fn = None + alt_strategy = "surrogatepass" + else: + if sys.version_info >= (3, 3): + _fs_encoding = sys.getfilesystemencoding() + if not _fs_encoding: + _fs_encoding = sys.getdefaultencoding() + alt_strategy = "surrogateescape" + _fs_error_fn = getattr(sys, "getfilesystemencodeerrors", None) + _fs_encode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy + _fs_decode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy + + _byte = chr if sys.version_info < (3,) else lambda i: bytes([i]) + + + def fsencode(filename): + """Encode filename (an os.PathLike, bytes, or str) to the filesystem + encoding with 'surrogateescape' error handler, return bytes unchanged. + On Windows, use 'strict' error handler if the file system encoding is + 'mbcs' (which is the default encoding). + """ + path = fspath(filename) + if isinstance(path, _str): + if sys.version_info[0] < 3: + # XXX backport: Unlike Python 3, Python 2's UTF-8 codec does not + # consider surrogate codepoints invalid, so the surrogateescape + # error handler never gets invoked to encode them back into high + # bytes. + # + # This code hacks around that by manually encoding the surrogate + # codepoints to high bytes, without relying on surrogateescape. + # + # As a *separate* issue to the above, Python2's ASCII codec has + # a different problem: it correctly invokes the surrogateescape + # error handler, but then seems to do additional strict + # validation (?) on the interim surrogate-decoded Unicode buffer + # returned by surrogateescape, and then fails with a + # UnicodeEncodeError anyway. + # + # The fix for that happens to be the same (manual encoding), + # even though the two causes are quite different. 
+ # + return b''.join( + (_byte(ord(c) - 0xDC00) if 0xDC00 <= ord(c) <= 0xDCFF else + c.encode(_fs_encoding, _fs_encode_errors)) + for c in path) + return path.encode(_fs_encoding, _fs_encode_errors) + else: + return path + + def fsdecode(filename): + """Decode filename (an os.PathLike, bytes, or str) from the filesystem + encoding with 'surrogateescape' error handler, return str unchanged. On + Windows, use 'strict' error handler if the file system encoding is + 'mbcs' (which is the default encoding). + """ + path = fspath(filename) + if isinstance(path, bytes): + if sys.version_info[0] < 3: + # XXX backport: See the remarks in fsencode() above. + # + # This case is slightly trickier: Python 2 will invoke the + # surrogateescape error handler for most bad high byte + # sequences, *except* for full UTF-8 sequences that happen to + # decode to surrogate codepoints. + # + # For decoding, it's not trivial to sidestep the UTF-8 codec + # only for surrogates like fsencode() does, but as a hack we can + # split the input into separate chunks around each invalid byte, + # decode the chunks separately, and join the results. + # + # This prevents Python 2's UTF-8 codec from seeing the encoded + # surrogate sequences as valid, which lets surrogateescape take + # over and escape the individual bytes. + # + # TODO: Improve this. + # + from array import array + indexes = _invalid_utf8_indexes(array(str('B'), filename)) + return ''.join(chunk.decode(_fs_encoding, _fs_decode_errors) + for chunk in _chunks(filename, indexes)) + return path.decode(_fs_encoding, _fs_decode_errors) + else: + return path + + return fsencode, fsdecode + + +fsencode, fsdecode = _fscodec() +del _fscodec + + +# If there is no C implementation, make the pure Python version the +# implementation as transparently as possible. 
+class PathLike(ABC): + + """Abstract base class for implementing the file system path protocol.""" + + @abc.abstractmethod + def __fspath__(self): + """Return the file system path representation of the object.""" + raise NotImplementedError + + @classmethod + def __subclasshook__(cls, subclass): + return hasattr(subclass, '__fspath__') diff --git a/tests/test_extra.py b/tests/test_extra.py index 5fe0f59..6490522 100644 --- a/tests/test_extra.py +++ b/tests/test_extra.py @@ -87,9 +87,9 @@ def assertTypeError(value, expected_message): with self.assertRaises(TypeError) as cm: f(value) self.assertEqual(str(cm.exception), expected_message) - - pre = 'expect bytes or {}, not '.format( - 'unicode' if sys.version_info < (3,) else 'str') + pre = 'expected {0}, bytes or os.PathLike object, not '.format( + 'unicode' if sys.version_info < (3,) else 'str' + ) assertTypeError(None, pre + 'NoneType') assertTypeError(5, pre + 'int') assertTypeError([], pre + 'list') From 78a8d7c00ec5d5ebb5cae48857d4007340f88e4d Mon Sep 17 00:00:00 2001 From: Dan Ryan Date: Thu, 4 Apr 2019 03:21:09 -0400 Subject: [PATCH 5/8] Use surrogateescape or surrogatepass always on python3 Signed-off-by: Dan Ryan --- src/backports/os.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/backports/os.py b/src/backports/os.py index 2af7679..d202a20 100644 --- a/src/backports/os.py +++ b/src/backports/os.py @@ -138,14 +138,14 @@ def _fscodec(): _fs_encoding = "utf-8" if sys.platform.startswith("win"): _fs_error_fn = None - alt_strategy = "surrogatepass" + alt_strategy = "surrogatepass" if sys.version_info >= (3, 5) else "surrogateeescape" else: if sys.version_info >= (3, 3): _fs_encoding = sys.getfilesystemencoding() if not _fs_encoding: _fs_encoding = sys.getdefaultencoding() alt_strategy = "surrogateescape" - _fs_error_fn = getattr(sys, "getfilesystemencodeerrors", None) + _fs_error_fn = getattr(sys, "getfilesystemencodeerrors", None) _fs_encode_errors = _fs_error_fn() if _fs_error_fn 
else alt_strategy _fs_decode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy From 7db494afd64ff5823c8fe313b479c72feff65e1c Mon Sep 17 00:00:00 2001 From: Dan Ryan Date: Thu, 4 Apr 2019 03:44:05 -0400 Subject: [PATCH 6/8] Fall back to surrogateescape handler Signed-off-by: Dan Ryan --- src/backports/os.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/backports/os.py b/src/backports/os.py index d202a20..6601312 100644 --- a/src/backports/os.py +++ b/src/backports/os.py @@ -138,7 +138,7 @@ def _fscodec(): _fs_encoding = "utf-8" if sys.platform.startswith("win"): _fs_error_fn = None - alt_strategy = "surrogatepass" if sys.version_info >= (3, 5) else "surrogateeescape" + alt_strategy = "surrogatepass" else: if sys.version_info >= (3, 3): _fs_encoding = sys.getfilesystemencoding() @@ -218,7 +218,13 @@ def fsdecode(filename): indexes = _invalid_utf8_indexes(array(str('B'), filename)) return ''.join(chunk.decode(_fs_encoding, _fs_decode_errors) for chunk in _chunks(filename, indexes)) - return path.decode(_fs_encoding, _fs_decode_errors) + try: + return path.decode(_fs_encoding, _fs_decode_errors) + except UnicodeDecodeError: + if _fs_decode_errors == "surrogatepass": + return path.decode(_fs_encoding, "surrogateescape") + else: + raise else: return path From 623bf1db3e2567c53a2c8f5c4a2466d0873bd8f7 Mon Sep 17 00:00:00 2001 From: Dan Ryan Date: Tue, 9 Apr 2019 01:21:24 -0400 Subject: [PATCH 7/8] Fix fsencode and fsdecode tests for windows Signed-off-by: Dan Ryan --- src/backports/os.py | 8 +---- tests/test_extra.py | 79 ++++++++++++++++++++++++++++++++++++++++++--- 2 files changed, 76 insertions(+), 11 deletions(-) diff --git a/src/backports/os.py b/src/backports/os.py index 6601312..7f1c2eb 100644 --- a/src/backports/os.py +++ b/src/backports/os.py @@ -218,13 +218,7 @@ def fsdecode(filename): indexes = _invalid_utf8_indexes(array(str('B'), filename)) return ''.join(chunk.decode(_fs_encoding, _fs_decode_errors) for chunk in 
_chunks(filename, indexes)) - try: - return path.decode(_fs_encoding, _fs_decode_errors) - except UnicodeDecodeError: - if _fs_decode_errors == "surrogatepass": - return path.decode(_fs_encoding, "surrogateescape") - else: - raise + return path.decode(_fs_encoding, _fs_decode_errors) else: return path diff --git a/tests/test_extra.py b/tests/test_extra.py index 6490522..54bd936 100644 --- a/tests/test_extra.py +++ b/tests/test_extra.py @@ -12,12 +12,16 @@ from backports import os import unittest -from hypothesis import given, example +from hypothesis import assume, given, example from hypothesis.strategies import text, binary +# SKIP_CONDITIONS: +IS_WIN = sys.platform.startswith("win") +IS_PY3 = sys.version_info[0] == 3 + # Example data: -HIGH_BYTES = ( +SURROGATE_ESCAPE_HIGH_BYTES = ( b'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f' b'\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f' b'\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf' @@ -39,6 +43,41 @@ '\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff' ) +SURROGATE_PASS_HIGH_BYTES = ( + b'\xed\xb2\x80\xed\xb2\x81\xed\xb2\x82\xed\xb2\x83\xed\xb2\x84\xed' + b'\xb2\x85\xed\xb2\x86\xed\xb2\x87\xed\xb2\x88\xed\xb2\x89\xed\xb2' + b'\x8a\xed\xb2\x8b\xed\xb2\x8c\xed\xb2\x8d\xed\xb2\x8e\xed\xb2\x8f' + b'\xed\xb2\x90\xed\xb2\x91\xed\xb2\x92\xed\xb2\x93\xed\xb2\x94\xed' + b'\xb2\x95\xed\xb2\x96\xed\xb2\x97\xed\xb2\x98\xed\xb2\x99\xed\xb2' + b'\x9a\xed\xb2\x9b\xed\xb2\x9c\xed\xb2\x9d\xed\xb2\x9e\xed\xb2\x9f' + b'\xed\xb2\xa0\xed\xb2\xa1\xed\xb2\xa2\xed\xb2\xa3\xed\xb2\xa4\xed' + b'\xb2\xa5\xed\xb2\xa6\xed\xb2\xa7\xed\xb2\xa8\xed\xb2\xa9\xed\xb2' + b'\xaa\xed\xb2\xab\xed\xb2\xac\xed\xb2\xad\xed\xb2\xae\xed\xb2\xaf' + b'\xed\xb2\xb0\xed\xb2\xb1\xed\xb2\xb2\xed\xb2\xb3\xed\xb2\xb4\xed' + b'\xb2\xb5\xed\xb2\xb6\xed\xb2\xb7\xed\xb2\xb8\xed\xb2\xb9\xed\xb2' + b'\xba\xed\xb2\xbb\xed\xb2\xbc\xed\xb2\xbd\xed\xb2\xbe\xed\xb2\xbf' + 
b'\xed\xb3\x80\xed\xb3\x81\xed\xb3\x82\xed\xb3\x83\xed\xb3\x84\xed' + b'\xb3\x85\xed\xb3\x86\xed\xb3\x87\xed\xb3\x88\xed\xb3\x89\xed\xb3' + b'\x8a\xed\xb3\x8b\xed\xb3\x8c\xed\xb3\x8d\xed\xb3\x8e\xed\xb3\x8f' + b'\xed\xb3\x90\xed\xb3\x91\xed\xb3\x92\xed\xb3\x93\xed\xb3\x94\xed' + b'\xb3\x95\xed\xb3\x96\xed\xb3\x97\xed\xb3\x98\xed\xb3\x99\xed\xb3' + b'\x9a\xed\xb3\x9b\xed\xb3\x9c\xed\xb3\x9d\xed\xb3\x9e\xed\xb3\x9f' + b'\xed\xb3\xa0\xed\xb3\xa1\xed\xb3\xa2\xed\xb3\xa3\xed\xb3\xa4\xed' + b'\xb3\xa5\xed\xb3\xa6\xed\xb3\xa7\xed\xb3\xa8\xed\xb3\xa9\xed\xb3' + b'\xaa\xed\xb3\xab\xed\xb3\xac\xed\xb3\xad\xed\xb3\xae\xed\xb3\xaf' + b'\xed\xb3\xb0\xed\xb3\xb1\xed\xb3\xb2\xed\xb3\xb3\xed\xb3\xb4\xed' + b'\xb3\xb5\xed\xb3\xb6\xed\xb3\xb7\xed\xb3\xb8\xed\xb3\xb9\xed\xb3' + b'\xba\xed\xb3\xbb\xed\xb3\xbc\xed\xb3\xbd\xed\xb3\xbe\xed\xb3\xbf' +) + + +# Use surrogate pass for encoding on windows on python 3+ to ensure +# we can decode them as the native decoder uses surrogate escape +if IS_WIN and IS_PY3: + HIGH_BYTES = SURROGATE_PASS_HIGH_BYTES +else: + HIGH_BYTES = SURROGATE_ESCAPE_HIGH_BYTES + # A U+DC80 surrogate encoded as (invalid) UTF-8. # # Python 3 correctly rejects this when encoding to or from UTF-8, but @@ -79,7 +118,29 @@ def test_text_roundtrip(self, s): @example(HIGH_BYTES) @example(UTF8_ENCODED_SURROGATE) def test_binary_roundtrip(self, b): - self.assertEqual(os.fsencode(os.fsdecode(b)), b) + # in python 3 on windows, the native implementation of os.fsdecode + # always relies on `surrogatepass` as the error handler, which means + # it will fail on surrogates (which are not unicode compatible) + # so if we fail to decode something under those circumstances we should + # verify that the native implementation also fails. 
+ rt1 = None + try: + rt1 = os.fsdecode(b) + except Exception as e: + if IS_WIN and IS_PY3: + self.assertRaises(type(e), real_os.fsdecode, b) + else: + raise + else: + try: + roundtripped = os.fsencode(rt1) + except Exception as e: + if IS_WIN and IS_PY3: + self.assertRaises(type(e), real_os.fsencode, rt1) + else: + raise + else: + self.assertEqual(roundtripped, b) def test_TypeError(self): def assertTypeError(value, expected_message): @@ -111,7 +172,17 @@ def test_encode_text(self, s): @example(HIGH_BYTES) @example(UTF8_ENCODED_SURROGATE) def test_decode_binary(self, b): - self.assertEqual(os.fsdecode(b), real_os.fsdecode(b)) + # Python 3 on windows will never be able to decode things + # in the backported library that it can't also decode + # in the original OS module implementation, so lets just catch + # the exceptions thrown by the os module and expect them + # to be raised by the backport + try: + real_os_val = real_os.fsdecode(b) + except Exception as e: + self.assertRaises(type(e), os.fsdecode, b) + else: + self.assertEqual(os.fsdecode(b), real_os_val) @given(binary()) @example(HIGH_BYTES) From a69d7103b4ef424fbc74b506aac287b54e0da37e Mon Sep 17 00:00:00 2001 From: Dan Ryan Date: Tue, 9 Apr 2019 01:59:44 -0400 Subject: [PATCH 8/8] Add test skips for python 3.5 and below on windows Signed-off-by: Dan Ryan --- tests/test_extra.py | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/tests/test_extra.py b/tests/test_extra.py index 54bd936..901df59 100644 --- a/tests/test_extra.py +++ b/tests/test_extra.py @@ -114,6 +114,10 @@ def test_decode_surrogates(self): def test_text_roundtrip(self, s): self.assertEqual(os.fsdecode(os.fsencode(s)), s) + @unittest.skipIf( + IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN, + "Backport doesn't align with native implementation on win on or before python 3.5" + ) @given(binary()) @example(HIGH_BYTES) @example(UTF8_ENCODED_SURROGATE) @@ -145,6 +149,7 @@ def test_binary_roundtrip(self, b): def 
test_TypeError(self): def assertTypeError(value, expected_message): for f in [os.fsencode, os.fsdecode]: + with self.assertRaises(TypeError) as cm: f(value) self.assertEqual(str(cm.exception), expected_message) @@ -163,11 +168,19 @@ class TestAgainstPython3(unittest.TestCase): On Python 3, the backported implementations should match the standard library. """ + @unittest.skipIf( + IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN, + "Backport doesn't align with native implementation on win on or before python 3.5" + ) @given(encodable_text()) @example(HIGH_SURROGATES) def test_encode_text(self, s): self.assertEqual(os.fsencode(s), real_os.fsencode(s)) + @unittest.skipIf( + IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN, + "Backport doesn't align with native implementation on win on or before python 3.5" + ) @given(binary()) @example(HIGH_BYTES) @example(UTF8_ENCODED_SURROGATE)