diff --git a/src/backports/os.py b/src/backports/os.py index 060fe9e..7f1c2eb 100644 --- a/src/backports/os.py +++ b/src/backports/os.py @@ -8,10 +8,14 @@ """ from __future__ import unicode_literals +import abc import sys # XXX backport: unicode on Python 2 _str = unicode if sys.version_info < (3,) else str +# XXX backport: string and binary types differ between python 2 and 3 +string_types = basestring if sys.version_info[0] == 2 else str +binary_type = str if sys.version_info[0] == 2 else bytes # XXX backport: Use backported surrogateescape for Python 2 # TODO backport: Find a way to do this without pulling in the entire future package? @@ -19,6 +23,8 @@ from future.utils.surrogateescape import register_surrogateescape register_surrogateescape() +# XXX This is a compatibility shim for the PathLike backport which gets us fspath access +ABC = abc.ABCMeta(str('ABC'), (object,), {'__slots__': ()}) # XXX backport: This invalid_utf8_indexes() helper is shamelessly copied from # Bob Ippolito's pyutf8 package (pyutf8/ref.py), in order to help support the @@ -42,9 +48,9 @@ def _invalid_utf8_indexes(bytes): # U+0080 - U+07FF - 11 bits c = (((c1 & 0x1F) << 6) | (c2 & 0x3F)) - if c < 0x80: + if c < 0x80: # pragma: no cover # Overlong encoding - skips.extend([i, i + 1]) + skips.extend([i, i + 1]) # pragma: no cover i += 2 continue c3 = bytes[i + 2] @@ -70,7 +76,7 @@ def _invalid_utf8_indexes(bytes): (c2 & 0x3F)) << 6) | (c3 & 0x3F)) << 6) | (c4 & 0x3F)) - if (c < 0x10000) or (c > 0x10FFFF): + if (c < 0x10000) or (c > 0x10FFFF): # pragma: no cover # Overlong encoding or invalid code point. skips.extend([i, i + 1, i + 2, i + 3]) i += 4 @@ -92,34 +98,69 @@ def _chunks(b, indexes): yield b[i:] -def _fscodec(): - encoding = sys.getfilesystemencoding() - if encoding == 'mbcs': - errors = 'strict' +def fspath(path): + """ + Fetch the string value from a path-like object + + Raises **TypeError** if there is no string value. 
+ """ + + if isinstance(path, (string_types, binary_type)): + return path + path_type = type(path) + expect = "unicode" if sys.version_info[0] == 2 else "str" + try: + path_repr = path_type.__fspath__(path) + except AttributeError: + if hasattr(path_type, '__fspath__'): + raise + else: + raise TypeError("expected {0}, bytes or os.PathLike object, " + "not ".format(expect) + path_type.__name__) + if isinstance(path_repr, (string_types, binary_type)): + return path_repr else: - errors = 'surrogateescape' + raise TypeError("expected {}.__fspath__() to return {} or bytes, " + "not {}".format(path_type.__name__, expect, + type(path_repr).__name__)) - # XXX backport: Do we need to hack around Python 2's UTF-8 codec? - import codecs # Use codecs.lookup() for name normalisation. - _HACK_AROUND_PY2_UTF8 = (sys.version_info < (3,) and - codecs.lookup(encoding) == codecs.lookup('utf-8')) - # Do we need to hack around Python 2's ASCII codec error handler behaviour? - _HACK_AROUND_PY2_ASCII = (sys.version_info < (3,) and - codecs.lookup(encoding) == codecs.lookup('ascii')) - # XXX backport: chr(octet) became bytes([octet]) +def _fscodec(): + # XXX Backport: The following section attempts to use utf-8 encoders to + # roundtrip to the filesystem, and also attempts to force windows to use + # a "surrogate pass" error handling strategy to ignore the bad surrogate + # pairs sometimes generated by python 2 encoders + if sys.version_info[0] < 3: + _fs_encode_errors = "surrogateescape" + _fs_decode_errors = "surrogateescape" + _fs_encoding = "utf-8" + else: + _fs_encoding = "utf-8" + if sys.platform.startswith("win"): + _fs_error_fn = None + alt_strategy = "surrogatepass" + else: + if sys.version_info >= (3, 3): + _fs_encoding = sys.getfilesystemencoding() + if not _fs_encoding: + _fs_encoding = sys.getdefaultencoding() + alt_strategy = "surrogateescape" + _fs_error_fn = getattr(sys, "getfilesystemencodeerrors", None) + _fs_encode_errors = _fs_error_fn() if _fs_error_fn else 
alt_strategy + _fs_decode_errors = _fs_error_fn() if _fs_error_fn else alt_strategy + _byte = chr if sys.version_info < (3,) else lambda i: bytes([i]) + def fsencode(filename): + """Encode filename (an os.PathLike, bytes, or str) to the filesystem + encoding with 'surrogateescape' error handler, return bytes unchanged. + On Windows, use 'strict' error handler if the file system encoding is + 'mbcs' (which is the default encoding). """ - Encode filename to the filesystem encoding with 'surrogateescape' error - handler, return bytes unchanged. On Windows, use 'strict' error handler if - the file system encoding is 'mbcs' (which is the default encoding). - """ - if isinstance(filename, bytes): - return filename - elif isinstance(filename, _str): - if _HACK_AROUND_PY2_UTF8 or _HACK_AROUND_PY2_ASCII: + path = fspath(filename) + if isinstance(path, _str): + if sys.version_info[0] < 3: # XXX backport: Unlike Python 3, Python 2's UTF-8 codec does not # consider surrogate codepoints invalid, so the surrogateescape # error handler never gets invoked to encode them back into high @@ -140,25 +181,21 @@ def fsencode(filename): # return b''.join( (_byte(ord(c) - 0xDC00) if 0xDC00 <= ord(c) <= 0xDCFF else - c.encode(encoding)) - for c in filename) - else: - return filename.encode(encoding, errors) + c.encode(_fs_encoding, _fs_encode_errors)) + for c in path) + return path.encode(_fs_encoding, _fs_encode_errors) else: - # XXX backport: unicode instead of str for Python 2 - raise TypeError("expect bytes or {_str}, not {}".format(type(filename).__name__, - _str=_str.__name__, )) + return path def fsdecode(filename): + """Decode filename (an os.PathLike, bytes, or str) from the filesystem + encoding with 'surrogateescape' error handler, return str unchanged. On + Windows, use 'strict' error handler if the file system encoding is + 'mbcs' (which is the default encoding). """ - Decode filename from the filesystem encoding with 'surrogateescape' error - handler, return str unchanged. 
On Windows, use 'strict' error handler if - the file system encoding is 'mbcs' (which is the default encoding). - """ - if isinstance(filename, _str): - return filename - elif isinstance(filename, bytes): - if _HACK_AROUND_PY2_UTF8: + path = fspath(filename) + if isinstance(path, bytes): + if sys.version_info[0] < 3: # XXX backport: See the remarks in fsencode() above. # # This case is slightly trickier: Python 2 will invoke the @@ -179,16 +216,30 @@ def fsdecode(filename): # from array import array indexes = _invalid_utf8_indexes(array(str('B'), filename)) - return ''.join(chunk.decode(encoding, errors) + return ''.join(chunk.decode(_fs_encoding, _fs_decode_errors) for chunk in _chunks(filename, indexes)) - else: - return filename.decode(encoding, errors) + return path.decode(_fs_encoding, _fs_decode_errors) else: - # XXX backport: unicode instead of str for Python 2 - raise TypeError("expect bytes or {_str}, not {}".format(type(filename).__name__, - _str=_str.__name__, )) + return path return fsencode, fsdecode + fsencode, fsdecode = _fscodec() del _fscodec + + +# If there is no C implementation, make the pure Python version the +# implementation as transparently as possible. 
+class PathLike(ABC): + + """Abstract base class for implementing the file system path protocol.""" + + @abc.abstractmethod + def __fspath__(self): + """Return the file system path representation of the object.""" + raise NotImplementedError + + @classmethod + def __subclasshook__(cls, subclass): + return hasattr(subclass, '__fspath__') diff --git a/tests/test_extra.py b/tests/test_extra.py index 5fe0f59..901df59 100644 --- a/tests/test_extra.py +++ b/tests/test_extra.py @@ -12,12 +12,16 @@ from backports import os import unittest -from hypothesis import given, example +from hypothesis import assume, given, example from hypothesis.strategies import text, binary +# SKIP_CONDITIONS: +IS_WIN = sys.platform.startswith("win") +IS_PY3 = sys.version_info[0] == 3 + # Example data: -HIGH_BYTES = ( +SURROGATE_ESCAPE_HIGH_BYTES = ( b'\x80\x81\x82\x83\x84\x85\x86\x87\x88\x89\x8a\x8b\x8c\x8d\x8e\x8f' b'\x90\x91\x92\x93\x94\x95\x96\x97\x98\x99\x9a\x9b\x9c\x9d\x9e\x9f' b'\xa0\xa1\xa2\xa3\xa4\xa5\xa6\xa7\xa8\xa9\xaa\xab\xac\xad\xae\xaf' @@ -39,6 +43,41 @@ '\udcf0\udcf1\udcf2\udcf3\udcf4\udcf5\udcf6\udcf7\udcf8\udcf9\udcfa\udcfb\udcfc\udcfd\udcfe\udcff' ) +SURROGATE_PASS_HIGH_BYTES = ( + b'\xed\xb2\x80\xed\xb2\x81\xed\xb2\x82\xed\xb2\x83\xed\xb2\x84\xed' + b'\xb2\x85\xed\xb2\x86\xed\xb2\x87\xed\xb2\x88\xed\xb2\x89\xed\xb2' + b'\x8a\xed\xb2\x8b\xed\xb2\x8c\xed\xb2\x8d\xed\xb2\x8e\xed\xb2\x8f' + b'\xed\xb2\x90\xed\xb2\x91\xed\xb2\x92\xed\xb2\x93\xed\xb2\x94\xed' + b'\xb2\x95\xed\xb2\x96\xed\xb2\x97\xed\xb2\x98\xed\xb2\x99\xed\xb2' + b'\x9a\xed\xb2\x9b\xed\xb2\x9c\xed\xb2\x9d\xed\xb2\x9e\xed\xb2\x9f' + b'\xed\xb2\xa0\xed\xb2\xa1\xed\xb2\xa2\xed\xb2\xa3\xed\xb2\xa4\xed' + b'\xb2\xa5\xed\xb2\xa6\xed\xb2\xa7\xed\xb2\xa8\xed\xb2\xa9\xed\xb2' + b'\xaa\xed\xb2\xab\xed\xb2\xac\xed\xb2\xad\xed\xb2\xae\xed\xb2\xaf' + b'\xed\xb2\xb0\xed\xb2\xb1\xed\xb2\xb2\xed\xb2\xb3\xed\xb2\xb4\xed' + b'\xb2\xb5\xed\xb2\xb6\xed\xb2\xb7\xed\xb2\xb8\xed\xb2\xb9\xed\xb2' + 
b'\xba\xed\xb2\xbb\xed\xb2\xbc\xed\xb2\xbd\xed\xb2\xbe\xed\xb2\xbf' + b'\xed\xb3\x80\xed\xb3\x81\xed\xb3\x82\xed\xb3\x83\xed\xb3\x84\xed' + b'\xb3\x85\xed\xb3\x86\xed\xb3\x87\xed\xb3\x88\xed\xb3\x89\xed\xb3' + b'\x8a\xed\xb3\x8b\xed\xb3\x8c\xed\xb3\x8d\xed\xb3\x8e\xed\xb3\x8f' + b'\xed\xb3\x90\xed\xb3\x91\xed\xb3\x92\xed\xb3\x93\xed\xb3\x94\xed' + b'\xb3\x95\xed\xb3\x96\xed\xb3\x97\xed\xb3\x98\xed\xb3\x99\xed\xb3' + b'\x9a\xed\xb3\x9b\xed\xb3\x9c\xed\xb3\x9d\xed\xb3\x9e\xed\xb3\x9f' + b'\xed\xb3\xa0\xed\xb3\xa1\xed\xb3\xa2\xed\xb3\xa3\xed\xb3\xa4\xed' + b'\xb3\xa5\xed\xb3\xa6\xed\xb3\xa7\xed\xb3\xa8\xed\xb3\xa9\xed\xb3' + b'\xaa\xed\xb3\xab\xed\xb3\xac\xed\xb3\xad\xed\xb3\xae\xed\xb3\xaf' + b'\xed\xb3\xb0\xed\xb3\xb1\xed\xb3\xb2\xed\xb3\xb3\xed\xb3\xb4\xed' + b'\xb3\xb5\xed\xb3\xb6\xed\xb3\xb7\xed\xb3\xb8\xed\xb3\xb9\xed\xb3' + b'\xba\xed\xb3\xbb\xed\xb3\xbc\xed\xb3\xbd\xed\xb3\xbe\xed\xb3\xbf' +) + + +# Use surrogate pass for encoding on windows on python 3+ to ensure +# we can decode them as the native decoder uses surrogate escape +if IS_WIN and IS_PY3: + HIGH_BYTES = SURROGATE_PASS_HIGH_BYTES +else: + HIGH_BYTES = SURROGATE_ESCAPE_HIGH_BYTES + # A U+DC80 surrogate encoded as (invalid) UTF-8. 
# # Python 3 correctly rejects this when encoding to or from UTF-8, but @@ -75,21 +114,48 @@ def test_decode_surrogates(self): def test_text_roundtrip(self, s): self.assertEqual(os.fsdecode(os.fsencode(s)), s) + @unittest.skipIf( + IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN, + "Backport doesn't align with native implementation on win on or before python 3.5" + ) @given(binary()) @example(HIGH_BYTES) @example(UTF8_ENCODED_SURROGATE) def test_binary_roundtrip(self, b): - self.assertEqual(os.fsencode(os.fsdecode(b)), b) + # in python 3 on windows, the native implementation of os.fsdecode + # always relies on `surrogatepass` as the error handler, which means + # it will fail on surrogates (which are not unicode compatible) + # so if we fail to decode something under those circumstances we should + # verify that the native implementation also fails. + rt1 = None + try: + rt1 = os.fsdecode(b) + except Exception as e: + if IS_WIN and IS_PY3: + self.assertRaises(type(e), real_os.fsdecode, b) + else: + raise + else: + try: + roundtripped = os.fsencode(rt1) + except Exception as e: + if IS_WIN and IS_PY3: + self.assertRaises(type(e), real_os.fsencode, rt1) + else: + raise + else: + self.assertEqual(roundtripped, b) def test_TypeError(self): def assertTypeError(value, expected_message): for f in [os.fsencode, os.fsdecode]: + with self.assertRaises(TypeError) as cm: f(value) self.assertEqual(str(cm.exception), expected_message) - - pre = 'expect bytes or {}, not '.format( - 'unicode' if sys.version_info < (3,) else 'str') + pre = 'expected {0}, bytes or os.PathLike object, not '.format( + 'unicode' if sys.version_info < (3,) else 'str' + ) assertTypeError(None, pre + 'NoneType') assertTypeError(5, pre + 'int') assertTypeError([], pre + 'list') @@ -102,16 +168,34 @@ class TestAgainstPython3(unittest.TestCase): On Python 3, the backported implementations should match the standard library. 
""" + @unittest.skipIf( + IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN, + "Backport doesn't align with native implementation on win on or before python 3.5" + ) @given(encodable_text()) @example(HIGH_SURROGATES) def test_encode_text(self, s): self.assertEqual(os.fsencode(s), real_os.fsencode(s)) + @unittest.skipIf( + IS_PY3 and sys.version_info[:2] <= (3, 5) and IS_WIN, + "Backport doesn't align with native implementation on win on or before python 3.5" + ) @given(binary()) @example(HIGH_BYTES) @example(UTF8_ENCODED_SURROGATE) def test_decode_binary(self, b): - self.assertEqual(os.fsdecode(b), real_os.fsdecode(b)) + # Python 3 on windows will never be able to decode things + # in the backported library that it can't also decode + # in the original OS module implementation, so lets just catch + # the exceptions thrown by the os module and expect them + # to be raised by the backport + try: + real_os_val = real_os.fsdecode(b) + except Exception as e: + self.assertRaises(type(e), os.fsdecode, b) + else: + self.assertEqual(os.fsdecode(b), real_os_val) @given(binary()) @example(HIGH_BYTES)