Use a decorator to lazy-load the frequency_lists module

musicsnobj · dwolfhub · commit f4161482786f · 2025-02-19T12:11:01.000-06:00
diff --git a/tests/matching_test.py b/tests/matching_test.py
@@ -67,18 +67,6 @@ def test_build_ranked_dict():
     }
 
 
-def test_add_frequency_lists():
-    matching.add_frequency_lists({
-        'test_words': ['qidkviflkdoejjfkd', 'sjdshfidssdkdjdhfkl']
-    })
-
-    assert 'test_words' in matching.RANKED_DICTIONARIES
-    assert matching.RANKED_DICTIONARIES['test_words'] == {
-        'qidkviflkdoejjfkd': 1,
-        'sjdshfidssdkdjdhfkl': 2,
-    }
-
-
 def test_matching_utils():
     chr_map = {
         'a': 'A',
@@ -102,7 +90,7 @@ def test_matching_utils():
 
 def test_dictionary_matching():
     def dm(pw):
-        return matching.dictionary_match(pw, test_dicts)
+        return matching.dictionary_match(pw, _ranked_dictionaries=test_dicts)
 
     test_dicts = {
         'd1': {
@@ -196,7 +184,7 @@ def test_reverse_dictionary_matching():
         }
     }
     password = '0123456789'
-    matches = matching.reverse_dictionary_match(password, test_dicts)
+    matches = matching.reverse_dictionary_match(password, _ranked_dictionaries=test_dicts)
     msg = 'matches against reversed words'
     check_matches(msg, matches, 'dictionary', ['123', '456'], [[1, 3], [4, 6]],
                   {
@@ -236,7 +224,7 @@ def test_l33t_matching():
         assert matching.enumerate_l33t_subs(table) == subs, msg
 
     def lm(pw):
-        return matching.l33t_match(pw, dicts, test_table)
+        return matching.l33t_match(pw, _ranked_dictionaries=dicts, _l33t_table=test_table)
 
     dicts = {
         'words': {
diff --git a/tests/test_compatibility.py b/tests/test_compatibility.py
@@ -42,14 +42,13 @@ def main(argv):
     number_of_passwords = len(d)
     scores_collision = 0
     guesses_collision = 0
-    refresh_rate = number_of_passwords/100
+    refresh_rate = number_of_passwords // 100
 
     i = 0
     for js_zxcvbn_score in d:
         if i%refresh_rate== 0:
             update_console_status(i*100/number_of_passwords)
         i += 1
-
         py_zxcvbn_scroe = dict()
         py_zxcvbn_scroe_full = zxcvbn(js_zxcvbn_score['password'])
         py_zxcvbn_scroe["password"] = py_zxcvbn_scroe_full["password"]
@@ -64,15 +63,15 @@ def main(argv):
 expected:
 %s
 results:
-%s\033[00m""")%(js_zxcvbn_score, py_zxcvbn_scroe)
+%s\033[00m""" % (js_zxcvbn_score, py_zxcvbn_scroe))
 
         if py_zxcvbn_scroe["score"] != js_zxcvbn_score["score"]:
             scores_collision += 1
 
     if (guesses_collision or scores_collision):
         print ("""\033[91mFailed!
 guesses_collision:%d
-guesses_score:%d""")%(guesses_collision, scores_collision)
+guesses_score:%d""" % (guesses_collision, scores_collision))
     else:
         print ("\033[92mPassed!")
 
diff --git a/zxcvbn/__init__.py b/zxcvbn/__init__.py
@@ -27,10 +27,7 @@ def zxcvbn(password, user_inputs=None, max_length=72):
             arg = str(arg)
         sanitized_inputs.append(arg.lower())
 
-    ranked_dictionaries = matching.RANKED_DICTIONARIES
-    ranked_dictionaries['user_inputs'] = matching.build_ranked_dict(sanitized_inputs)
-
-    matches = matching.omnimatch(password, ranked_dictionaries)
+    matches = matching.omnimatch(password, user_inputs=sanitized_inputs)
     result = scoring.most_guessable_match_sequence(password, matches)
     result['calc_time'] = datetime.now() - start
 
diff --git a/zxcvbn/matching.py b/zxcvbn/matching.py
@@ -1,23 +1,44 @@
 from zxcvbn import scoring
 from . import adjacency_graphs
-from zxcvbn.frequency_lists import FREQUENCY_LISTS
 import re
+import functools
 
 from zxcvbn.scoring import most_guessable_match_sequence
 
 
 def build_ranked_dict(ordered_list):
     return {word: idx for idx, word in enumerate(ordered_list, 1)}
 
-RANKED_DICTIONARIES = {}
-
-
-def add_frequency_lists(frequency_lists_):
-    for name, lst in frequency_lists_.items():
-        RANKED_DICTIONARIES[name] = build_ranked_dict(lst)
-
-
-add_frequency_lists(FREQUENCY_LISTS)
+RANKED_DICTIONARIES = None
+
+def get_ranked_dictionaries():
+    """
+    Lazy-load large dictionary data set.
+    Return the global RANKED_DICTIONARIES, ensuring it is built only once.
+    """
+    global RANKED_DICTIONARIES
+
+    if RANKED_DICTIONARIES is None:
+        # Do the expensive import here only
+        from zxcvbn.frequency_lists import FREQUENCY_LISTS
+
+        # Build the dictionary once
+        RANKED_DICTIONARIES = {}
+        for name, lst in FREQUENCY_LISTS.items():
+          RANKED_DICTIONARIES[name] = build_ranked_dict(lst)
+    return RANKED_DICTIONARIES
+
+
+def ensure_ranked_dictionaries(func):
+    """Decorator to ensure _ranked_dictionaries argument is always populated."""
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        # If an explicit _ranked_dictionaries arg was passed, use it.
+        # Otherwise fetch from the global cache.
+        if '_ranked_dictionaries' not in kwargs or kwargs['_ranked_dictionaries'] is None:
+            kwargs['_ranked_dictionaries'] = get_ranked_dictionaries()
+        return func(*args, **kwargs)
+    return wrapper
 
 GRAPHS = {
     'qwerty': adjacency_graphs.ADJACENCY_GRAPHS['qwerty'],
@@ -75,7 +96,11 @@ def add_frequency_lists(frequency_lists_):
 
 
 # omnimatch -- perform all matches
-def omnimatch(password, _ranked_dictionaries=RANKED_DICTIONARIES):
+@ensure_ranked_dictionaries
+def omnimatch(password, _ranked_dictionaries=None, user_inputs=[]):
+    if len(user_inputs):
+        _ranked_dictionaries['user_inputs'] = build_ranked_dict(user_inputs)
+
     matches = []
     for matcher in [
         dictionary_match,
@@ -93,7 +118,8 @@ def omnimatch(password, _ranked_dictionaries=RANKED_DICTIONARIES):
 
 
 # dictionary match (common passwords, english, last names, etc)
-def dictionary_match(password, _ranked_dictionaries=RANKED_DICTIONARIES):
+@ensure_ranked_dictionaries
+def dictionary_match(password, _ranked_dictionaries=None):
     matches = []
     length = len(password)
     password_lower = password.lower()
@@ -117,11 +143,11 @@ def dictionary_match(password, _ranked_dictionaries=RANKED_DICTIONARIES):
 
     return sorted(matches, key=lambda x: (x['i'], x['j']))
 
-
+@ensure_ranked_dictionaries
 def reverse_dictionary_match(password,
-                             _ranked_dictionaries=RANKED_DICTIONARIES):
+                             _ranked_dictionaries=None):
     reversed_password = ''.join(reversed(password))
-    matches = dictionary_match(reversed_password, _ranked_dictionaries)
+    matches = dictionary_match(reversed_password, _ranked_dictionaries=_ranked_dictionaries)
     for match in matches:
         match['token'] = ''.join(reversed(match['token']))
         match['reversed'] = True
@@ -212,7 +238,8 @@ def translate(string, chr_map):
     return ''.join(chars)
 
 
-def l33t_match(password, _ranked_dictionaries=RANKED_DICTIONARIES,
+@ensure_ranked_dictionaries
+def l33t_match(password, _ranked_dictionaries=None,
                _l33t_table=L33T_TABLE):
     matches = []
 
@@ -222,7 +249,7 @@ def l33t_match(password, _ranked_dictionaries=RANKED_DICTIONARIES,
             break
 
         subbed_password = translate(password, sub)
-        for match in dictionary_match(subbed_password, _ranked_dictionaries):
+        for match in dictionary_match(subbed_password, _ranked_dictionaries=_ranked_dictionaries):
             token = password[match['i']:match['j'] + 1]
             if token.lower() == match['matched_word']:
                 # only return the matches that contain an actual substitution
@@ -247,7 +274,8 @@ def l33t_match(password, _ranked_dictionaries=RANKED_DICTIONARIES,
 
 
 # repeats (aaa, abcabcabc) and sequences (abcdef)
-def repeat_match(password, _ranked_dictionaries=RANKED_DICTIONARIES):
+@ensure_ranked_dictionaries
+def repeat_match(password, _ranked_dictionaries=None):
     matches = []
     greedy = re.compile(r'(.+)\1+')
     lazy = re.compile(r'(.+?)\1+')
@@ -298,7 +326,8 @@ def repeat_match(password, _ranked_dictionaries=RANKED_DICTIONARIES):
     return matches
 
 
-def spatial_match(password, _graphs=GRAPHS, _ranked_dictionaries=RANKED_DICTIONARIES):
+@ensure_ranked_dictionaries
+def spatial_match(password, _graphs=GRAPHS, _ranked_dictionaries=None):
     matches = []
     for graph_name, graph in _graphs.items():
         matches.extend(spatial_match_helper(password, graph, graph_name))
@@ -379,7 +408,8 @@ def spatial_match_helper(password, graph, graph_name):
 MAX_DELTA = 5
 
 
-def sequence_match(password, _ranked_dictionaries=RANKED_DICTIONARIES):
+@ensure_ranked_dictionaries
+def sequence_match(password, _ranked_dictionaries=None):
     # Identifies sequences by looking for repeated differences in unicode codepoint.
     # this allows skipping, such as 9753, and also matches some extended unicode sequences
     # such as Greek and Cyrillic alphabets.
@@ -440,7 +470,8 @@ def update(i, j, delta):
     return result
 
 
-def regex_match(password, _regexen=REGEXEN, _ranked_dictionaries=RANKED_DICTIONARIES):
+@ensure_ranked_dictionaries
+def regex_match(password, _regexen=REGEXEN, _ranked_dictionaries=None):
     matches = []
     for name, regex in _regexen.items():
         for rx_match in regex.finditer(password):
@@ -456,7 +487,8 @@ def regex_match(password, _regexen=REGEXEN, _ranked_dictionaries=RANKED_DICTIONA
     return sorted(matches, key=lambda x: (x['i'], x['j']))
 
 
-def date_match(password, _ranked_dictionaries=RANKED_DICTIONARIES):
+@ensure_ranked_dictionaries
+def date_match(password, _ranked_dictionaries=None):
     # a "date" is recognized as:
     #   any 3-tuple that starts or ends with a 2- or 4-digit year,
     #   with 2 or 0 separator chars (1.1.91 or 1191),