From 7bab5c7a6756aa8cce82fe59b1aa86dde0e3fd92 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 22 Dec 2023 17:33:48 -0500 Subject: [PATCH 1/6] Disable complexity checks/warnings for the main() Subsequent changes would make it only more complex etc -- it needs proper de-spaghettification --- codespell_lib/_codespell.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 6e3662a8b7..93ffad3dc4 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -1026,7 +1026,7 @@ def _script_main() -> int: return main(*sys.argv[1:]) -def main(*args: str) -> int: +def main(*args: str) -> int: # noqa: C901,PLR0915 """Contains flow control""" try: options, parser, used_cfg_files = parse_options(args) From 8cce2e7fa5ad65193121c861d010d2f8b74f8421 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 22 Dec 2023 13:46:25 -0500 Subject: [PATCH 2/6] RF: move finding files into an embedded function to centralize invocation of parse_file --- codespell_lib/_codespell.py | 118 +++++++++++++++++++----------------- 1 file changed, 61 insertions(+), 57 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 93ffad3dc4..ac4546469b 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -23,7 +23,18 @@ import re import sys import textwrap -from typing import Any, Dict, List, Match, Optional, Pattern, Sequence, Set, Tuple +from typing import ( + Any, + Dict, + Generator, + List, + Match, + Optional, + Pattern, + Sequence, + Set, + Tuple, +) # autogenerated by setuptools_scm from ._version import __version__ as VERSION # type: ignore # noqa: N812 @@ -1177,65 +1188,58 @@ def main(*args: str) -> int: # noqa: C901,PLR0915 return EX_USAGE bad_count = 0 - for filename in sorted(options.files): - # ignore hidden files - if is_hidden(filename, options.check_hidden): - continue - if os.path.isdir(filename): - for root, dirs, files in 
os.walk(filename): - if glob_match.match(root): # skip (absolute) directories - dirs.clear() - continue - if is_hidden(root, options.check_hidden): # dir itself hidden - continue - for file_ in sorted(files): - # ignore hidden files in directories - if is_hidden(file_, options.check_hidden): + def _find_files() -> Generator[str, None, None]: + """Yields filename for the parsing""" + for filename in sorted(options.files): + # ignore hidden files + if is_hidden(filename, options.check_hidden): + continue + + if os.path.isdir(filename): + for root, dirs, files in os.walk(filename): + if glob_match.match(root): # skip (absolute) directories + dirs.clear() continue - if glob_match.match(file_): # skip files + if is_hidden(root, options.check_hidden): # dir itself hidden continue - fname = os.path.join(root, file_) - if glob_match.match(fname): # skip paths - continue - bad_count += parse_file( - fname, - colors, - summary, - misspellings, - exclude_lines, - file_opener, - word_regex, - ignore_word_regex, - uri_regex, - uri_ignore_words, - context, - options, - ) - - # skip (relative) directories - dirs[:] = [ - dir_ - for dir_ in dirs - if not glob_match.match(dir_) - and not is_hidden(dir_, options.check_hidden) - ] - - elif not glob_match.match(filename): # skip files - bad_count += parse_file( - filename, - colors, - summary, - misspellings, - exclude_lines, - file_opener, - word_regex, - ignore_word_regex, - uri_regex, - uri_ignore_words, - context, - options, - ) + for file_ in sorted(files): + # ignore hidden files in directories + if is_hidden(file_, options.check_hidden): + continue + if glob_match.match(file_): # skip files + continue + fname = os.path.join(root, file_) + if glob_match.match(fname): # skip paths + continue + yield fname + + # skip (relative) directories + dirs[:] = [ + dir_ + for dir_ in dirs + if not glob_match.match(dir_) + and not is_hidden(dir_, options.check_hidden) + ] + + elif not glob_match.match(filename): # skip files + yield filename 
+ + for filename in _find_files(): + bad_count += parse_file( + filename, + colors, + summary, + misspellings, + exclude_lines, + file_opener, + word_regex, + ignore_word_regex, + uri_regex, + uri_ignore_words, + context, + options, + ) if summary: print("\n-------8<-------\nSUMMARY:") From 016b730e9057a8b8b7852437d84a7f5ae8b6e17f Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 22 Dec 2023 13:51:41 -0500 Subject: [PATCH 3/6] RF: provide closure for parse_file to pass only filename RF right away to use sum(map()) to reduce number of statements --- codespell_lib/_codespell.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index ac4546469b..e96d9cd14a 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -1187,8 +1187,6 @@ def main(*args: str) -> int: # noqa: C901,PLR0915 ) return EX_USAGE - bad_count = 0 - def _find_files() -> Generator[str, None, None]: """Yields filename for the parsing""" for filename in sorted(options.files): @@ -1225,8 +1223,9 @@ def _find_files() -> Generator[str, None, None]: elif not glob_match.match(filename): # skip files yield filename - for filename in _find_files(): - bad_count += parse_file( + # closure to pass only relevant to the job filename + def _parse_file(filename: str) -> int: + return parse_file( filename, colors, summary, @@ -1241,6 +1240,8 @@ def _find_files() -> Generator[str, None, None]: options, ) + bad_count = sum(map(_parse_file, _find_files())) + if summary: print("\n-------8<-------\nSUMMARY:") print(summary) From a6081f24bf2f54befcdd395ed437d5e766b89ed8 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 22 Dec 2023 17:32:55 -0500 Subject: [PATCH 4/6] TEMP: Tried with multiprocessing -- not good since can't pickle embedded function --- codespell_lib/_codespell.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/codespell_lib/_codespell.py 
b/codespell_lib/_codespell.py index e96d9cd14a..9cd413c9ca 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -23,6 +23,7 @@ import re import sys import textwrap +from multiprocessing import Pool from typing import ( Any, Dict, @@ -1240,7 +1241,18 @@ def _parse_file(filename: str) -> int: options, ) - bad_count = sum(map(_parse_file, _find_files())) + njobs = os.cpu_count() or 1 + if njobs: + # parse_file would be in subprocess(es) + with Pool(njobs) as pool: + results = pool.map(_parse_file, _find_files()) + for result in results: + if isinstance(result, Exception): + raise result + bad_count = sum(results) + else: + # serial + bad_count = sum(map(_parse_file, _find_files())) if summary: print("\n-------8<-------\nSUMMARY:") From 045231e221d3f506bc8e113e6d6b05a47980812a Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 22 Dec 2023 18:42:34 -0500 Subject: [PATCH 5/6] RF: Define file level FileParser class to pass options to parse_file --- codespell_lib/_codespell.py | 78 +++++++++++++++++++++++++++++-------- 1 file changed, 61 insertions(+), 17 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py index 9cd413c9ca..0fe7275244 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -1033,6 +1033,52 @@ def parse_file( return bad_count +class _FileParser: + """A helper class to provide top level closure for parse_file()""" + + def __init__( + self, + colors: TermColors, + summary: Optional[Summary], + misspellings: Dict[str, Misspelling], + exclude_lines: Set[str], + file_opener: FileOpener, + word_regex: Pattern[str], + ignore_word_regex: Optional[Pattern[str]], + uri_regex: Pattern[str], + uri_ignore_words: Set[str], + context: Optional[Tuple[int, int]], + options: argparse.Namespace, + ) -> None: + self.colors = colors + self.summary = summary + self.misspellings = misspellings + self.exclude_lines = exclude_lines + self.file_opener = file_opener + self.word_regex = word_regex 
+ self.ignore_word_regex = ignore_word_regex + self.uri_regex = uri_regex + self.uri_ignore_words = uri_ignore_words + self.context = context + self.options = options + + def __call__(self, filename: str) -> int: + return parse_file( + filename, + self.colors, + self.summary, + self.misspellings, + self.exclude_lines, + self.file_opener, + self.word_regex, + self.ignore_word_regex, + self.uri_regex, + self.uri_ignore_words, + self.context, + self.options, + ) + + def _script_main() -> int: """Wrap to main() for setuptools.""" return main(*sys.argv[1:]) @@ -1225,34 +1271,32 @@ def _find_files() -> Generator[str, None, None]: yield filename # closure to pass only relevant to the job filename - def _parse_file(filename: str) -> int: - return parse_file( - filename, - colors, - summary, - misspellings, - exclude_lines, - file_opener, - word_regex, - ignore_word_regex, - uri_regex, - uri_ignore_words, - context, - options, - ) + file_parser = _FileParser( + colors, + summary, + misspellings, + exclude_lines, + file_opener, + word_regex, + ignore_word_regex, + uri_regex, + uri_ignore_words, + context, + options, + ) njobs = os.cpu_count() or 1 if njobs: # parse_file would be in subprocess(es) with Pool(njobs) as pool: - results = pool.map(_parse_file, _find_files()) + results = pool.map(file_parser, _find_files()) for result in results: if isinstance(result, Exception): raise result bad_count = sum(results) else: # serial - bad_count = sum(map(_parse_file, _find_files())) + bad_count = sum(map(file_parser, _find_files())) if summary: print("\n-------8<-------\nSUMMARY:") From abebc4f162efda3a0151412cc3a40a083b22aa15 Mon Sep 17 00:00:00 2001 From: Yaroslav Halchenko Date: Fri, 22 Dec 2023 19:28:42 -0500 Subject: [PATCH 6/6] Provide CLI option -J|--jobs to control number of jobs --- codespell_lib/_codespell.py | 40 +++++++++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 4 deletions(-) diff --git a/codespell_lib/_codespell.py b/codespell_lib/_codespell.py 
index 0fe7275244..cc14fcc22c 100644 --- a/codespell_lib/_codespell.py +++ b/codespell_lib/_codespell.py @@ -467,6 +467,20 @@ def parse_options( "should match the to-be-excluded lines exactly", ) + parser.add_argument( + "-J", + "--jobs", + action="store", + type=int, + default=0, + help="set number of jobs to parallelize processing - one " + "subprocess per file:\n" + "- 0: no parallelization (default)" + "- positive integer: number of sub-processes to use\n" + "- -1: use all available CPUs\n" + "Interactive mode is not compatible with parallel processing", + ) + parser.add_argument( "-i", "--interactive", @@ -1084,7 +1098,7 @@ def _script_main() -> int: return main(*sys.argv[1:]) -def main(*args: str) -> int: # noqa: C901,PLR0915 +def main(*args: str) -> int: # noqa: C901,PLR0915,PLR0911 """Contains flow control""" try: options, parser, used_cfg_files = parse_options(args) @@ -1196,6 +1210,25 @@ def main(*args: str) -> int: # noqa: C901,PLR0915 else: summary = None + if options.jobs and options.interactive: + print( + "ERROR: do not enable parallelization in interactive mode", + file=sys.stderr, + ) + # no point to parser.print_help() - just hides ERROR away here + return EX_USAGE + + jobs = options.jobs + if jobs == -1: + jobs = os.cpu_count() + elif jobs < -1: + print( + f"ERROR: invalid number of jobs: {jobs}", + file=sys.stderr, + ) + parser.print_help() + return EX_USAGE + context = None if options.context is not None: if (options.before_context is not None) or (options.after_context is not None): @@ -1285,10 +1318,9 @@ def _find_files() -> Generator[str, None, None]: options, ) - njobs = os.cpu_count() or 1 - if njobs: + if jobs: # parse_file would be in subprocess(es) - with Pool(njobs) as pool: + with Pool(jobs) as pool: results = pool.map(file_parser, _find_files()) for result in results: if isinstance(result, Exception):