@@ -1707,7 +1707,6 @@ def new_function2(value):
17071707"""
17081708 expected_code = """import numpy as np
17091709
1710- print("Hello world")
17111710a=2
17121711print("Hello world")
17131712def some_fn():
@@ -1783,7 +1782,6 @@ def new_function2(value):
17831782"""
17841783 expected_code = """import numpy as np
17851784
1786- print("Hello world")
17871785print("Hello world")
17881786def some_fn():
17891787 a=np.zeros(10)
@@ -1862,7 +1860,6 @@ def new_function2(value):
18621860"""
18631861 expected_code = """import numpy as np
18641862
1865- print("Hello world")
18661863a=3
18671864print("Hello world")
18681865def some_fn():
@@ -1940,7 +1937,6 @@ def new_function2(value):
19401937"""
19411938 expected_code = """import numpy as np
19421939
1943- print("Hello world")
19441940a=2
19451941print("Hello world")
19461942def some_fn():
@@ -2019,7 +2015,6 @@ def new_function2(value):
20192015"""
20202016 expected_code = """import numpy as np
20212017
2022- print("Hello world")
20232018a=3
20242019print("Hello world")
20252020def some_fn():
@@ -2106,7 +2101,6 @@ def new_function2(value):
21062101
21072102a = 6
21082103
2109- print("Hello world")
21102104if 2<3:
21112105 a=4
21122106else:
@@ -3453,3 +3447,157 @@ def hydrate_input_text_actions_with_field_names(
34533447 main_file .unlink (missing_ok = True )
34543448
34553449 assert new_code == expected
3450+
3451+ def test_duplicate_global_assignments_when_reverting_helpers ():  # Checks that swapping in optimized code for PreChunker._is_in_new_semantic_unit changes only that method: the expected text keeps the original imports, class docstring, and a single CHUNK_MAX_CHARS_DEFAULT assignment.
3452+ root_dir = Path (__file__ ).parent .parent .resolve ()  # two directories above this test file
3453+ main_file = Path (root_dir / "code_to_optimize/temp_main.py" ).resolve ()  # scratch module: written below, deleted before the final assert
3454+
3455+ original_code = '''"""Chunking objects not specific to a particular chunking strategy."""
3456+ from __future__ import annotations
3457+ import collections
3458+ import copy
3459+ from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
3460+ import regex
3461+ from typing_extensions import Self, TypeAlias
3462+ from unstructured.utils import lazyproperty
3463+ from unstructured.documents.elements import Element
3464+ # ================================================================================================
3465+ # MODEL
3466+ # ================================================================================================
3467+ CHUNK_MAX_CHARS_DEFAULT: int = 500
3468+ # ================================================================================================
3469+ # PRE-CHUNKER
3470+ # ================================================================================================
3471+ class PreChunker:
3472+ """Gathers sequential elements into pre-chunks as length constraints allow.
3473+ The pre-chunker's responsibilities are:
3474+ - **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
3475+ either side of those boundaries into different sections. In this case, the primary indicator
3476+ of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
3477+ semantic boundary when `multipage_sections` is `False`.
3478+ - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
3479+ into sections as big as possible without exceeding the chunk window size.
3480+ - **Minimize chunks that must be split mid-text.** Precompute the text length of each section
3481+ and only produce a section that exceeds the chunk window size when there is a single element
3482+ with text longer than that window.
3483+ A Table element is placed into a section by itself. CheckBox elements are dropped.
3484+ The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
3485+ a new "section", hence the "by-title" designation.
3486+ """
3487+ def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
3488+ self._elements = elements
3489+ self._opts = opts
3490+ @lazyproperty
3491+ def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
3492+ """The semantic-boundary detectors to be applied to break pre-chunks."""
3493+ return self._opts.boundary_predicates
3494+ def _is_in_new_semantic_unit(self, element: Element) -> bool:
3495+ """True when `element` begins a new semantic unit such as a section or page."""
3496+ # -- all detectors need to be called to update state and avoid double counting
3497+ # -- boundaries that happen to coincide, like Table and new section on same element.
3498+ # -- Using `any()` would short-circuit on first True.
3499+ semantic_boundaries = [pred(element) for pred in self._boundary_predicates]
3500+ return any(semantic_boundaries)
3501+ '''  # end of the original module fixture; its target method builds a full list of predicate results before calling any()
3502+ main_file .write_text (original_code , encoding = "utf-8" )  # put the original module on disk so the optimizer can read it
3503+ optim_code = f'''```python:{ main_file .relative_to (root_dir )}
3504+ # ================================================================================================
3505+ # PRE-CHUNKER
3506+ # ================================================================================================
3507+ from __future__ import annotations
3508+ from typing import Iterable
3509+ from unstructured.documents.elements import Element
3510+ from unstructured.utils import lazyproperty
3511+ class PreChunker:
3512+ def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
3513+ self._elements = elements
3514+ self._opts = opts
3515+ @lazyproperty
3516+ def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
3517+ """The semantic-boundary detectors to be applied to break pre-chunks."""
3518+ return self._opts.boundary_predicates
3519+ def _is_in_new_semantic_unit(self, element: Element) -> bool:
3520+ """True when `element` begins a new semantic unit such as a section or page."""
3521+ # Use generator expression for lower memory usage and avoid building intermediate list
3522+ for pred in self._boundary_predicates:
3523+ if pred(element):
3524+ return True
3525+ return False
3526+ ```
3527+ '''  # end of the optimized-code markdown block (fence tagged with the scratch file's path relative to root_dir); it has a shorter import list, no class docstring, no CHUNK_MAX_CHARS_DEFAULT line, and an early-return loop in the target method
3528+
3529+ func = FunctionToOptimize (function_name = "_is_in_new_semantic_unit" , parents = [FunctionParent ("PreChunker" , "ClassDef" )], file_path = main_file )  # target: method _is_in_new_semantic_unit of class PreChunker in the scratch file
3530+ test_config = TestConfig (
3531+ tests_root = root_dir / "tests/pytest" ,
3532+ tests_project_rootdir = root_dir ,
3533+ project_root_path = root_dir ,
3534+ test_framework = "pytest" ,
3535+ pytest_cmd = "pytest" ,
3536+ )
3537+ func_optimizer = FunctionOptimizer (function_to_optimize = func , test_cfg = test_config )
3538+ code_context : CodeOptimizationContext = func_optimizer .get_code_optimization_context ().unwrap ()  # NOTE(review): unwrap() presumably raises if context extraction failed - confirm against the result type
3539+
3540+ original_helper_code : dict [Path , str ] = {}  # helper file path -> that file's text as it is on disk right now
3541+ helper_function_paths = {hf .file_path for hf in code_context .helper_functions }  # set comprehension, so each helper file appears once
3542+ for helper_function_path in helper_function_paths :
3543+ with helper_function_path .open (encoding = "utf8" ) as f :
3544+ helper_code = f .read ()
3545+ original_helper_code [helper_function_path ] = helper_code
3546+
3547+ func_optimizer .args = Args ()  # default-constructed Args assigned before the replace call; NOTE(review): which attributes the call reads is not visible here
3548+ func_optimizer .replace_function_and_helpers_with_optimized_code (
3549+ code_context = code_context , optimized_code = CodeStringsMarkdown .parse_markdown_code (optim_code ), original_helper_code = original_helper_code
3550+ )  # the test relies on this call rewriting main_file on disk; the file is read back immediately below
3551+
3552+
3553+ new_code = main_file .read_text (encoding = "utf-8" )  # file contents after the replacement
3554+ main_file .unlink (missing_ok = True )  # scratch file is removed before the assertion runs
3555+
3556+ expected = '''"""Chunking objects not specific to a particular chunking strategy."""
3557+ from __future__ import annotations
3558+ import collections
3559+ import copy
3560+ from typing import Any, Callable, DefaultDict, Iterable, Iterator, cast
3561+ import regex
3562+ from typing_extensions import Self, TypeAlias
3563+ from unstructured.utils import lazyproperty
3564+ from unstructured.documents.elements import Element
3565+ # ================================================================================================
3566+ # MODEL
3567+ # ================================================================================================
3568+ CHUNK_MAX_CHARS_DEFAULT: int = 500
3569+ # ================================================================================================
3570+ # PRE-CHUNKER
3571+ # ================================================================================================
3572+ class PreChunker:
3573+ """Gathers sequential elements into pre-chunks as length constraints allow.
3574+ The pre-chunker's responsibilities are:
3575+ - **Segregate semantic units.** Identify semantic unit boundaries and segregate elements on
3576+ either side of those boundaries into different sections. In this case, the primary indicator
3577+ of a semantic boundary is a `Title` element. A page-break (change in page-number) is also a
3578+ semantic boundary when `multipage_sections` is `False`.
3579+ - **Minimize chunk count for each semantic unit.** Group the elements within a semantic unit
3580+ into sections as big as possible without exceeding the chunk window size.
3581+ - **Minimize chunks that must be split mid-text.** Precompute the text length of each section
3582+ and only produce a section that exceeds the chunk window size when there is a single element
3583+ with text longer than that window.
3584+ A Table element is placed into a section by itself. CheckBox elements are dropped.
3585+ The "by-title" strategy specifies breaking on section boundaries; a `Title` element indicates
3586+ a new "section", hence the "by-title" designation.
3587+ """
3588+ def __init__(self, elements: Iterable[Element], opts: ChunkingOptions):
3589+ self._elements = elements
3590+ self._opts = opts
3591+ @lazyproperty
3592+ def _boundary_predicates(self) -> tuple[BoundaryPredicate, ...]:
3593+ """The semantic-boundary detectors to be applied to break pre-chunks."""
3594+ return self._opts.boundary_predicates
3595+ def _is_in_new_semantic_unit(self, element: Element) -> bool:
3596+ """True when `element` begins a new semantic unit such as a section or page."""
3597+ # Use generator expression for lower memory usage and avoid building intermediate list
3598+ for pred in self._boundary_predicates:
3599+ if pred(element):
3600+ return True
3601+ return False
3602+ '''  # end of expected text: identical to original_code except that the body of _is_in_new_semantic_unit is the optimized loop
3603+ assert new_code == expected  # exact string match, so any duplicated or dropped module-level line fails the test
0 commit comments