From 46de172d98fbf282073f29e345fac0ea57420cca Mon Sep 17 00:00:00 2001
From: Dmitry Dygalo <dadygalo@gmail.com>
Date: Tue, 25 Aug 2020 15:07:02 +0200
Subject: [PATCH] Caching for canonicalised JSON

---
 src/hypothesis_jsonschema/_canonicalise.py | 61 ++++++++++++++++++++--
 1 file changed, 57 insertions(+), 4 deletions(-)

diff --git a/src/hypothesis_jsonschema/_canonicalise.py b/src/hypothesis_jsonschema/_canonicalise.py
index 08e3050..46e21a2 100644
--- a/src/hypothesis_jsonschema/_canonicalise.py
+++ b/src/hypothesis_jsonschema/_canonicalise.py
@@ -12,14 +12,14 @@
 most things by construction instead of by filtering.  That's the difference
 between "I'd like it to be faster" and "doesn't finish at all".
 """
-
+import functools
 import itertools
 import json
 import math
 import re
 from copy import deepcopy
 from json.encoder import _make_iterencode, encode_basestring_ascii  # type: ignore
-from typing import Any, Dict, List, NoReturn, Optional, Tuple, Union
+from typing import Any, Callable, Dict, List, NoReturn, Optional, Tuple, Type, Union
 
 import jsonschema
 from hypothesis.errors import InvalidArgument
@@ -108,9 +108,62 @@ class HypothesisRefResolutionError(jsonschema.exceptions.RefResolutionError):
     pass
 
 
-def encode_canonical_json(value: JSONType) -> str:
+def _make_cache_key(
+    value: JSONType,
+) -> Tuple[Type, Union[Tuple, None, bool, float, str]]:
+    """Build a hashable, type-tagged key from any JSON value.
+
+    Containers are recursively frozen into tuples; every level carries its type.
+    """
+    if isinstance(value, list):
+        return (list, tuple(_make_cache_key(item) for item in value))
+    if isinstance(value, dict):
+        return (dict, tuple((k, _make_cache_key(v)) for k, v in value.items()))
+    # Scalars are already hashable; the `type` tag keeps equal-hashing values of
+    # different types apart - 0, "" and False all have the same hash (0)
+    return (type(value), value)
+
+
+class HashedJSON:
+    """A hashable proxy for a JSON value, loosely based on `functools._HashedSeq`.
+
+    Equality compares the full cache keys, so values whose hashes collide stay distinct.
+    """
+
+    __slots__ = ("value", "cachekey", "hashedvalue")
+
+    def __init__(self, value: JSONType):
+        self.value = value
+        self.cachekey = _make_cache_key(value)
+        # `hash` is called several times on a cache miss, so evaluate it only once
+        self.hashedvalue = hash(self.cachekey)
+
+    def __hash__(self) -> int:
+        return self.hashedvalue
+
+    def __eq__(self, other: "HashedJSON") -> bool:  # type: ignore
+        # Hashes collide (`hash(-1) == hash(-2)`), so compare the keys themselves
+        return self.cachekey == other.cachekey
+
+
+def cached_json(func: Callable[[HashedJSON], str]) -> Callable[[JSONType], str]:
+    """Memoise an encoder such as `encode_canonical_json` behind an LRU cache.
+
+    Canonicalisation encodes the same schemas repeatedly, so caching is a visible win.
+    """
+    memoised = functools.lru_cache(maxsize=1024)(func)
+
+    @functools.wraps(memoised)
+    def encoder(value: JSONType) -> str:
+        return memoised(HashedJSON(value))
+
+    return encoder
+
+
+@cached_json  # callers pass plain JSON; the decorator wraps it in HashedJSON
+def encode_canonical_json(value: HashedJSON) -> str:
     """Canonical form serialiser, for uniqueness testing."""
-    return json.dumps(value, sort_keys=True, cls=CanonicalisingJsonEncoder)
+    return json.dumps(value.value, sort_keys=True, cls=CanonicalisingJsonEncoder)
 
 
 def sort_key(value: JSONType) -> Tuple[int, float, Union[float, str]]: