Skip to content

Commit 470a5b3

Browse files
committed
Fix a couple of bugs causing serialization/hashing to not be stable.
In particular, make sure we don't depend on __file__ for anything, since that would make it impossible to relocate a module on disk without affecting it. We also make sure that the function annotations dict is kept in sync with the relevant FunctionType* when we complete a Forward type cycle - otherwise depending on whether we look at python or C++ we get a different answer.
1 parent fb84819 commit 470a5b3

File tree

5 files changed

+204
-93
lines changed

5 files changed

+204
-93
lines changed

typed_python/CompilerVisibleObjectVisitor.hpp

Lines changed: 1 addition & 85 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,7 @@
1717
#pragma once
1818

1919
#include "ShaHash.hpp"
20+
#include "SpecialModuleNames.hpp"
2021
#include "util.hpp"
2122
#include <unordered_map>
2223

@@ -28,91 +29,6 @@ unique hash for types and functions.
2829
2930
******************************/
3031

31-
bool isCanonicalName(std::string name) {
32-
// this is the list of standard library modules in python 3.8
33-
static std::set<std::string> canonicalPythonModuleNames({
34-
"abc", "aifc", "antigravity", "argparse", "ast", "asynchat", "asyncio", "asyncore",
35-
"base64", "bdb", "binhex", "bisect", "_bootlocale", "bz2", "calendar", "cgi", "cgitb",
36-
"chunk", "cmd", "codecs", "codeop", "code", "collections", "_collections_abc",
37-
"colorsys", "_compat_pickle", "compileall", "_compression", "concurrent",
38-
"configparser", "contextlib", "contextvars", "copy", "copyreg", "cProfile", "crypt",
39-
"csv", "ctypes", "curses", "dataclasses", "datetime", "dbm", "decimal", "difflib",
40-
"dis", "distutils", "doctest", "dummy_threading", "_dummy_thread", "email",
41-
"encodings", "ensurepip", "enum", "filecmp", "fileinput", "fnmatch", "formatter",
42-
"fractions", "ftplib", "functools", "__future__", "genericpath", "getopt", "getpass",
43-
"gettext", "glob", "gzip", "hashlib", "heapq", "hmac", "html", "http", "idlelib",
44-
"imaplib", "imghdr", "importlib", "imp", "inspect", "io", "ipaddress", "json",
45-
"keyword", "lib2to3", "linecache", "locale", "logging", "lzma", "mailbox", "mailcap",
46-
"marshal",
47-
"_markupbase", "mimetypes", "modulefinder", "msilib", "multiprocessing", "netrc",
48-
"nntplib", "ntpath", "nturl2path", "numbers", "opcode", "operator", "optparse", "os",
49-
"_osx_support", "pathlib", "pdb", "pickle", "pickletools", "pipes", "pkgutil",
50-
"platform", "plistlib", "poplib", "posixpath", "pprint", "profile", "pstats", "pty",
51-
"_py_abc", "pyclbr", "py_compile", "_pydecimal", "pydoc_data", "pydoc", "_pyio",
52-
"queue", "quopri", "random", "reprlib", "re", "rlcompleter", "runpy", "sched",
53-
"secrets", "selectors", "shelve", "shlex", "shutil", "signal", "_sitebuiltins",
54-
"site-packages", "site", "smtpd", "smtplib", "sndhdr", "socket", "socketserver",
55-
"sqlite3", "sre_compile", "sre_constants", "sre_parse", "ssl", "statistics", "stat",
56-
"stringprep", "string", "_strptime", "struct", "subprocess", "sunau", "symbol",
57-
"symtable", "sysconfig", "tabnanny", "tarfile", "telnetlib", "tempfile", "test",
58-
"textwrap", "this", "_threading_local", "threading", "timeit", "tkinter", "tokenize",
59-
"token", "traceback", "tracemalloc", "trace", "tty", "turtledemo", "turtle", "types",
60-
"typing", "unittest", "urllib", "uuid", "uu", "venv", "warnings", "wave", "weakref",
61-
"_weakrefset", "webbrowser", "wsgiref", "xdrlib", "xml", "xmlrpc", "zipapp",
62-
"zipfile", "zipimport", "pytz", "psutil",
63-
64-
// and some standard ones we might commonly install
65-
"numpy", "pandas", "scipy", "pytest", "_pytest", "typed_python", "object_database", "llvmlite",
66-
"requests", "redis", "websockets", "boto3", "py", "xdist", "pytest_jsonreport",
67-
"pytest_metadata", "flask", "flaky", "coverage", "pyasn1", "cryptography", "paramiko",
68-
"six", "torch"
69-
});
70-
71-
std::string moduleNameRoot;
72-
73-
int posOfDot = name.find(".");
74-
if (posOfDot != std::string::npos) {
75-
moduleNameRoot = name.substr(0, posOfDot);
76-
} else {
77-
moduleNameRoot = name;
78-
}
79-
80-
return canonicalPythonModuleNames.find(moduleNameRoot) != canonicalPythonModuleNames.end();
81-
}
82-
83-
// is this a special name in a dict, module, or class that we shouldn't hash?
84-
// we do want to hash methods like __init__
85-
bool isSpecialIgnorableName(const std::string& name) {
86-
static std::set<std::string> canonicalMagicMethods({
87-
"__abs__", "__add__", "__and__", "__bool__",
88-
"__bytes__", "__call__", "__contains__", "__del__",
89-
"__delattr__", "__eq__", "__float__", "__floordiv__",
90-
"__format__", "__ge__", "__getitem__", "__gt__",
91-
"__hash__", "__iadd__", "__iand__", "__ieq__",
92-
"__ifloordiv__", "__ige__", "__igt__", "__ile__",
93-
"__ilshift__", "__ilt__", "__imatmul__", "__imod__",
94-
"__imul__", "__index__", "__ine__", "__init__",
95-
"__int__", "__invert__", "__ior__", "__ipow__",
96-
"__irshift__", "__isub__", "__itruediv__", "__ixor__",
97-
"__le__", "__len__", "__lshift__", "__lt__",
98-
"__matmul__", "__mod__", "__mul__", "__ne__",
99-
"__neg__", "__not__", "__or__", "__pos__",
100-
"__pow__", "__radd__", "__rand__", "__repr__",
101-
"__rfloordiv__", "__rlshift__", "__rmatmul__", "__rmod__",
102-
"__rmul__", "__ror__", "__round__", "__round__",
103-
"__rpow__", "__rrshift__", "__rshift__", "__rsub__",
104-
"__rtruediv__", "__rxor__", "__setattr__", "__setitem__",
105-
"__str__", "__sub__", "__truediv__", "__xor__",
106-
});
107-
108-
return (
109-
name.substr(0, 2) == "__"
110-
&& name.substr(name.size() - 2) == "__"
111-
&& canonicalMagicMethods.find(name) == canonicalMagicMethods.end()
112-
);
113-
}
114-
115-
11632
class VisitRecord {
11733
public:
11834
enum class kind { Hash=0, String=1, Topo=2, NameValuePair=3, Error=4 };

typed_python/FunctionType.hpp

Lines changed: 46 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/******************************************************************************
2-
Copyright 2017-2020 typed_python Authors
2+
Copyright 2017-2022 typed_python Authors
33
44
Licensed under the Apache License, Version 2.0 (the "License");
55
you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@
2020
#include "TypedCellType.hpp"
2121
#include "ReprAccumulator.hpp"
2222
#include "Format.hpp"
23+
#include "SpecialModuleNames.hpp"
24+
#include "PyInstance.hpp"
2325

2426
class Function;
2527

@@ -778,12 +780,42 @@ class Function : public Type {
778780

779781
template<class visitor_type>
780782
void _visitReferencedTypes(const visitor_type& visitor) {
783+
// we need to keep mArgs and mReturnType in sync with
784+
// mAnnotations, so if we change one of our types we
785+
// need to update the resulting dictionary as well.
781786
if (mReturnType) {
787+
Type* retType = mReturnType;
788+
782789
visitor(mReturnType);
790+
791+
if (mReturnType != retType
792+
&& mFunctionAnnotations
793+
&& PyDict_Check(mFunctionAnnotations)
794+
) {
795+
PyDict_SetItemString(
796+
mFunctionAnnotations,
797+
"return",
798+
(PyObject*)PyInstance::typeObj(mReturnType)
799+
);
800+
}
783801
}
784802
for (auto& a: mArgs) {
803+
Type* typeFilter = a.getTypeFilter();
804+
785805
a._visitReferencedTypes(visitor);
806+
807+
if (a.getTypeFilter() != typeFilter
808+
&& mFunctionAnnotations
809+
&& PyDict_Check(mFunctionAnnotations)
810+
) {
811+
PyDict_SetItemString(
812+
mFunctionAnnotations,
813+
a.getName().c_str(),
814+
(PyObject*)PyInstance::typeObj(a.getTypeFilter())
815+
);
816+
}
786817
}
818+
787819
for (auto& varnameAndBinding: mClosureBindings) {
788820
varnameAndBinding.second._visitReferencedTypes(visitor);
789821
}
@@ -812,7 +844,6 @@ class Function : public Type {
812844
visitor.visitHash(ShaHash(2));
813845
}
814846

815-
816847
if (mSignatureFunction) {
817848
visitor.visitHash(ShaHash(1));
818849
visitor.visitTopo(mSignatureFunction);
@@ -897,15 +928,22 @@ class Function : public Type {
897928
std::string curName;
898929

899930
for (PyObject* name: sequence) {
931+
std::string nameStr = PyUnicode_AsUTF8(name);
932+
933+
if (isSpecialIgnorableName(nameStr)) {
934+
break;
935+
}
936+
900937
if (!curObj) {
901-
curName = PyUnicode_AsUTF8(name);
938+
curName = nameStr;
902939
} else {
903-
curName = curName + "." + PyUnicode_AsUTF8(name);
940+
curName = curName + "." + nameStr;
904941
}
905942

906943
if (!curObj) {
907944
// this is a lookup in the global dict
908945
curObj.set(PyDict_GetItem(globals, name));
946+
909947
if (!curObj) {
910948
// this is an invalid global lookup, which is OK. no need to hash anything.
911949
PyErr_Clear();
@@ -923,14 +961,15 @@ class Function : public Type {
923961
return;
924962
}
925963
} else {
926-
visitor(curName, (PyObject*)curObj);
927-
return;
964+
break;
928965
}
929966
}
930967
}
931968

932969
// also visit at the end of the sequence
933-
visitor(curName, (PyObject*)curObj);
970+
if (curObj) {
971+
visitor(curName, (PyObject*)curObj);
972+
}
934973
};
935974

936975
for (auto& sequence: dotAccesses) {
Lines changed: 101 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,101 @@
1+
/******************************************************************************
2+
Copyright 2017-2022 typed_python Authors
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
******************************************************************************/
16+
17+
#pragma once
18+
19+
inline bool isCanonicalName(std::string name) {
20+
// this is the list of standard library modules in python 3.8
21+
static std::set<std::string> canonicalPythonModuleNames({
22+
"abc", "aifc", "antigravity", "argparse", "ast", "asynchat", "asyncio", "asyncore",
23+
"base64", "bdb", "binhex", "bisect", "_bootlocale", "bz2", "calendar", "cgi", "cgitb",
24+
"chunk", "cmd", "codecs", "codeop", "code", "collections", "_collections_abc",
25+
"colorsys", "_compat_pickle", "compileall", "_compression", "concurrent",
26+
"configparser", "contextlib", "contextvars", "copy", "copyreg", "cProfile", "crypt",
27+
"csv", "ctypes", "curses", "dataclasses", "datetime", "dbm", "decimal", "difflib",
28+
"dis", "distutils", "doctest", "dummy_threading", "_dummy_thread", "email",
29+
"encodings", "ensurepip", "enum", "filecmp", "fileinput", "fnmatch", "formatter",
30+
"fractions", "ftplib", "functools", "__future__", "genericpath", "getopt", "getpass",
31+
"gettext", "glob", "gzip", "hashlib", "heapq", "hmac", "html", "http", "idlelib",
32+
"imaplib", "imghdr", "importlib", "imp", "inspect", "io", "ipaddress", "json",
33+
"keyword", "lib2to3", "linecache", "locale", "logging", "lzma", "mailbox", "mailcap",
34+
"marshal",
35+
"_markupbase", "mimetypes", "modulefinder", "msilib", "multiprocessing", "netrc",
36+
"nntplib", "ntpath", "nturl2path", "numbers", "opcode", "operator", "optparse", "os",
37+
"_osx_support", "pathlib", "pdb", "pickle", "pickletools", "pipes", "pkgutil",
38+
"platform", "plistlib", "poplib", "posixpath", "pprint", "profile", "pstats", "pty",
39+
"_py_abc", "pyclbr", "py_compile", "_pydecimal", "pydoc_data", "pydoc", "_pyio",
40+
"queue", "quopri", "random", "reprlib", "re", "rlcompleter", "runpy", "sched",
41+
"secrets", "selectors", "shelve", "shlex", "shutil", "signal", "_sitebuiltins",
42+
"site-packages", "site", "smtpd", "smtplib", "sndhdr", "socket", "socketserver",
43+
"sqlite3", "sre_compile", "sre_constants", "sre_parse", "ssl", "statistics", "stat",
44+
"stringprep", "string", "_strptime", "struct", "subprocess", "sunau", "symbol",
45+
"symtable", "sysconfig", "tabnanny", "tarfile", "telnetlib", "tempfile", "test",
46+
"textwrap", "this", "_threading_local", "threading", "timeit", "tkinter", "tokenize",
47+
"token", "traceback", "tracemalloc", "trace", "tty", "turtledemo", "turtle", "types",
48+
"typing", "unittest", "urllib", "uuid", "uu", "venv", "warnings", "wave", "weakref",
49+
"_weakrefset", "webbrowser", "wsgiref", "xdrlib", "xml", "xmlrpc", "zipapp",
50+
"zipfile", "zipimport", "pytz", "psutil",
51+
52+
// and some standard ones we might commonly install
53+
"numpy", "pandas", "scipy", "pytest", "_pytest", "typed_python", "object_database", "llvmlite",
54+
"requests", "redis", "websockets", "boto3", "py", "xdist", "pytest_jsonreport",
55+
"pytest_metadata", "flask", "flaky", "coverage", "pyasn1", "cryptography", "paramiko",
56+
"six", "torch"
57+
});
58+
59+
std::string moduleNameRoot;
60+
61+
int posOfDot = name.find(".");
62+
if (posOfDot != std::string::npos) {
63+
moduleNameRoot = name.substr(0, posOfDot);
64+
} else {
65+
moduleNameRoot = name;
66+
}
67+
68+
return canonicalPythonModuleNames.find(moduleNameRoot) != canonicalPythonModuleNames.end();
69+
}
70+
71+
// is this a special name in a dict, module, or class that we shouldn't hash?
72+
// we do want to hash methods like __init__
73+
inline bool isSpecialIgnorableName(const std::string& name) {
74+
static std::set<std::string> canonicalMagicMethods({
75+
"__abs__", "__add__", "__and__", "__bool__",
76+
"__bytes__", "__call__", "__contains__", "__del__",
77+
"__delattr__", "__eq__", "__float__", "__floordiv__",
78+
"__format__", "__ge__", "__getitem__", "__gt__",
79+
"__hash__", "__iadd__", "__iand__", "__ieq__",
80+
"__ifloordiv__", "__ige__", "__igt__", "__ile__",
81+
"__ilshift__", "__ilt__", "__imatmul__", "__imod__",
82+
"__imul__", "__index__", "__ine__", "__init__",
83+
"__int__", "__invert__", "__ior__", "__ipow__",
84+
"__irshift__", "__isub__", "__itruediv__", "__ixor__",
85+
"__le__", "__len__", "__lshift__", "__lt__",
86+
"__matmul__", "__mod__", "__mul__", "__ne__",
87+
"__neg__", "__not__", "__or__", "__pos__",
88+
"__pow__", "__radd__", "__rand__", "__repr__",
89+
"__rfloordiv__", "__rlshift__", "__rmatmul__", "__rmod__",
90+
"__rmul__", "__ror__", "__round__", "__round__",
91+
"__rpow__", "__rrshift__", "__rshift__", "__rsub__",
92+
"__rtruediv__", "__rxor__", "__setattr__", "__setitem__",
93+
"__str__", "__sub__", "__truediv__", "__xor__",
94+
});
95+
96+
return (
97+
name.substr(0, 2) == "__"
98+
&& name.substr(name.size() - 2) == "__"
99+
&& canonicalMagicMethods.find(name) == canonicalMagicMethods.end()
100+
);
101+
}

typed_python/type_identity_test.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -33,6 +33,7 @@
3333
typesAndObjectsVisibleToCompilerFrom,
3434
checkForHashInstability,
3535
resetCompilerVisibleObjectHashCache,
36+
typeWalkRecord
3637
)
3738

3839

@@ -46,6 +47,14 @@ def gModuleLevel(x):
4647
return fModuleLevel(x)
4748

4849

50+
def looksAtFilename():
51+
return __file__
52+
53+
54+
def looksAtFilename2():
55+
return typed_python.type_identity_test.__file__
56+
57+
4958
def checkHash(filesToWrite, expression):
5059
"""Check the hash of a piece of python code using a subprocess.
5160
@@ -68,6 +77,18 @@ def returnSerializedValue(filesToWrite, expression, printComments=False):
6877
)
6978

7079

80+
def test_identity_ignores_function_file_accesses():
81+
# make sure these functions succeed
82+
assert looksAtFilename()
83+
assert looksAtFilename2()
84+
85+
walk1 = typeWalkRecord(looksAtFilename)
86+
walk2 = typeWalkRecord(looksAtFilename2)
87+
88+
assert '__file__' not in walk1
89+
assert '__file__' not in walk2
90+
91+
7192
def test_identities_of_basic_types_different():
7293
assert identityHash(int) != identityHash(float)
7394
assert identityHash(TupleOf(int)) != identityHash(TupleOf(float))

0 commit comments

Comments
 (0)