feat(profiling): [unwrapt] remove wrapt dependency from Lock Profiler (#15003)

vlad-scherbich · taegyunkim · web-flow · commit aaf064680fda · 2025-11-11T14:39:58.000-05:00
https://datadoghq.atlassian.net/browse/PROF-12854 --- ## Description This PR removes the `wrapt` dependency from the lock profiler, replacing it with direct delegation using `__slots__`. It implements the following proposal in the Lock Profiler RFC: [Remove wrapt dependency](https://docs.google.com/document/d/12ao0XhiO8SJpEB1PNku-Brzkm6ru4OQqgVDosiW_Bi0/edit?tab=t.0#heading=h.l944x0fppaqe) ### Why - **Reduce memory overhead:** 75% reduction in per-lock memory usage - **Simpler code:** No external dependency, easier to maintain - **Consistent behavior:** No more WRAPT_C_EXT detection (wrapt's compiled C extension vs pure Python) ### Changes - Replaced `wrapt.ObjectProxy` with internal `_ProfiledLock` class using `__slots__` - `__slots__` explicitly declares data members without using `__dict__` - Eliminates wrapt's 360-byte hidden dictionary - Added `_LockAllocatorWrapper` as protocol wrapper - Removed environment-dependent frame depth logic (2 if WRAPT_C_EXT, else 3) - Implemented essential special methods: `__hash__`, `__eq__`, `__getattr__` ### Performance **Measurement tools:** - Memory: `gc.get_referents()` + `sys.getsizeof()` (direct object measurement) - Performance: Custom `benchmark.py` with `tracemalloc` and `time.perf_counter()` - Verification: Cache-busted tests in fresh Python processes **Test configuration:** Ran empirical benchmarks at 1% and 100% sampling rates: - Memory: 1,000 locks - Creation: 10,000 locks (5 iterations) - Acquire/Release: 100,000 operations (5 iterations) - Throughput: 10 threads × 10,000 operations | Metric | Before | After | Change | |--------|--------|-------|--------| | Memory (wrapper) | 416 bytes/lock | 104 bytes/lock | -75% | | Creation | 1.58 µs/lock | 1.56 µs/lock | -1% (faster) | | Acquire/Release | 1,429 ns/op | 1,419 ns/op | -1% (faster) | | Throughput | 738k ops/sec | 737k ops/sec | ≈ Same | **Key findings:** - Memory savings are consistent across all sampling rates (1%, 100%, etc.) 
- No performance regression in synthetic benchmarks - At 100k locks: saves ~30 MB _Note: These are synthetic benchmarks. Real-world impact will be measured using [dd-trace-doe](https://github.com/DataDog/dd-trace-doe) benchmarking framework._ _See [`benchmarks/lock_profiler_wrapt_removal/VERIFIED_COMPARISON.md`](https://github.com/DataDog/dd-trace-py/blob/vlad/benchmark-lock-profiler/benchmarks/lock_profiler_wrapt_removal/VERIFIED_COMPARISON.md) for details._ ### Implementation Notes `wrapt.ObjectProxy` provided automatic delegation. We now implement these features directly: 1. Attribute forwarding via `__getattr__` 2. Identity operations via `__hash__` and `__eq__` 3. Context management via `__enter__`, `__exit__`, `__aenter__`, `__aexit__` 4. Direct method wrapping for acquire/release profiling Removed/Not Needed: - Transparent `isinstance()` checks - Full introspection support (`dir()`, `vars()`) - Pickle (serialization) support - Weak reference support ## Testing - All existing tests pass (updated `test_patch` for new behavior) - Empirically verified with cache-busted tests on both branches - Full benchmark results: [`vlad/benchmark-lock-profiler`](https://github.com/DataDog/dd-trace-py/tree/vlad/benchmark-lock-profiler/benchmarks/lock_profiler_wrapt_removal) ## Risks **Low:** Existing functionality intact, as shown by unit/e2e tests and empirical benchmarks. **Potential issues:** Users with esoteric workflows depending on deep wrapt features (unlikely). --------- Co-authored-by: Taegyun Kim <taegyun.kim@datadoghq.com>
diff --git a/ddtrace/profiling/collector/_lock.py b/ddtrace/profiling/collector/_lock.py
@@ -17,8 +17,6 @@
 from typing import Tuple
 from typing import Type
 
-import wrapt
-
 from ddtrace.internal.datadog.profiling import ddup
 from ddtrace.profiling import _threading
 from ddtrace.profiling import collector
@@ -40,41 +38,63 @@ def _current_thread() -> Tuple[int, str]:
     return thread_id, _threading.get_thread_name(thread_id)
 
 
-# We need to know if wrapt is compiled in C or not. If it's not using the C module, then the wrappers function will
-# appear in the stack trace and we need to hide it.
-WRAPT_C_EXT: bool
-if os.environ.get("WRAPT_DISABLE_EXTENSIONS"):
-    WRAPT_C_EXT = False
-else:
-    try:
-        import wrapt._wrappers as _w  # noqa: F401
-    except ImportError:
-        WRAPT_C_EXT = False
-    else:
-        WRAPT_C_EXT = True
-        del _w
+class _ProfiledLock:
+    """
+    Lightweight lock wrapper that profiles lock acquire/release operations.
+    It intercepts lock methods without the overhead of a full proxy object.
+    """
 
+    __slots__ = (
+        "__wrapped__",
+        "_self_tracer",
+        "_self_max_nframes",
+        "_self_capture_sampler",
+        "_self_init_loc",
+        "_self_acquired_at",
+        "_self_name",
+    )
 
-class _ProfiledLock(wrapt.ObjectProxy):
     def __init__(
         self,
         wrapped: Any,
         tracer: Optional[Tracer],
         max_nframes: int,
         capture_sampler: collector.CaptureSampler,
-        endpoint_collection_enabled: bool,
     ) -> None:
-        wrapt.ObjectProxy.__init__(self, wrapped)
+        self.__wrapped__: Any = wrapped
         self._self_tracer: Optional[Tracer] = tracer
         self._self_max_nframes: int = max_nframes
         self._self_capture_sampler: collector.CaptureSampler = capture_sampler
-        self._self_endpoint_collection_enabled: bool = endpoint_collection_enabled
-        frame: FrameType = sys._getframe(2 if WRAPT_C_EXT else 3)
+        # Frame depth: 0=__init__, 1=_profiled_allocate_lock, 2=_LockAllocatorWrapper.__call__, 3=caller
+        frame: FrameType = sys._getframe(3)
         code: CodeType = frame.f_code
         self._self_init_loc: str = "%s:%d" % (os.path.basename(code.co_filename), frame.f_lineno)
         self._self_acquired_at: int = 0
         self._self_name: Optional[str] = None
 
+    ### DUNDER methods ###
+
+    def __eq__(self, other: Any) -> bool:
+        if isinstance(other, _ProfiledLock):
+            return self.__wrapped__ == other.__wrapped__
+        return self.__wrapped__ == other
+
+    def __getattr__(self, name: str) -> Any:
+        # Delegates acquire_lock, release_lock, locked_lock, and any future methods
+        return getattr(self.__wrapped__, name)
+
+    def __hash__(self) -> int:
+        return hash(self.__wrapped__)
+
+    def __repr__(self) -> str:
+        return f"<_ProfiledLock({self.__wrapped__!r}) at {self._self_init_loc}>"
+
+    ### Regular methods ###
+
+    def locked(self) -> bool:
+        """Return True if lock is currently held."""
+        return self.__wrapped__.locked()
+
     def acquire(self, *args: Any, **kwargs: Any) -> Any:
         return self._acquire(self.__wrapped__.acquire, *args, **kwargs)
 
@@ -115,11 +135,6 @@ def __aexit__(self, *args: Any, **kwargs: Any) -> Any:
         return self._release(self.__wrapped__.__aexit__, *args, **kwargs)
 
     def _release(self, inner_func: Callable[..., Any], *args: Any, **kwargs: Any) -> None:
-        # The underlying threading.Lock class is implemented using C code, and
-        # it doesn't have the __dict__ attribute. So we can't do
-        # self.__dict__.pop("_self_acquired_at", None) to remove the attribute.
-        # Instead, we need to use the following workaround to retrieve and
-        # remove the attribute.
         start: Optional[int] = getattr(self, "_self_acquired_at", None)
         try:
             # Though it should generally be avoided to call release() from
@@ -130,7 +145,6 @@ def _release(self, inner_func: Callable[..., Any], *args: Any, **kwargs: Any) ->
             # and unlocked lock, and the expected behavior is to propagate that.
             del self._self_acquired_at
         except AttributeError:
-            # We just ignore the error, if the attribute is not found.
             pass
 
         try:
@@ -196,9 +210,13 @@ def _find_self_name(self, var_dict: Dict[str, Any]) -> Optional[str]:
                 return name
             if config.lock.name_inspect_dir:
                 for attribute in dir(value):
-                    if not attribute.startswith("__") and getattr(value, attribute) is self:
-                        self._self_name = attribute
-                        return attribute
+                    try:
+                        if not attribute.startswith("__") and getattr(value, attribute) is self:
+                            self._self_name = attribute
+                            return attribute
+                    except AttributeError:
+                        # Accessing unset attributes in __slots__ raises AttributeError.
+                        pass
         return None
 
     # Get lock acquire/release call location and variable name the lock is assigned to
@@ -225,11 +243,19 @@ def _maybe_update_self_name(self) -> None:
         self._self_name = self._find_self_name(frame.f_locals) or self._find_self_name(frame.f_globals) or ""
 
 
-class FunctionWrapper(wrapt.FunctionWrapper):
-    # Override the __get__ method: whatever happens, _allocate_lock is always considered by Python like a "static"
-    # method, even when used as a class attribute. Python never tried to "bind" it to a method, because it sees it is a
-    # builtin function. Override default wrapt behavior here that tries to detect bound method.
-    def __get__(self, instance: Any, owner: Optional[Type] = None) -> FunctionWrapper:  # type: ignore
+class _LockAllocatorWrapper:
+    """Wrapper for lock allocator functions that prevents method binding."""
+
+    __slots__ = ("_func",)
+
+    def __init__(self, func: Callable[..., Any]) -> None:
+        self._func: Callable[..., Any] = func
+
+    def __call__(self, *args: Any, **kwargs: Any) -> Any:
+        return self._func(*args, **kwargs)
+
+    def __get__(self, instance: Any, owner: Optional[Type] = None) -> _LockAllocatorWrapper:
+        # Prevent automatic method binding (e.g., Foo.lock_class = threading.Lock)
         return self
 
 
@@ -241,16 +267,14 @@ class LockCollector(collector.CaptureSamplerCollector):
     def __init__(
         self,
         nframes: int = config.max_frames,
-        endpoint_collection_enabled: bool = config.endpoint_collection,
         tracer: Optional[Tracer] = None,
         *args: Any,
         **kwargs: Any,
     ) -> None:
         super().__init__(*args, **kwargs)
         self.nframes: int = nframes
-        self.endpoint_collection_enabled: bool = endpoint_collection_enabled
         self.tracer: Optional[Tracer] = tracer
-        self._original: Optional[Any] = None
+        self._original_lock: Any = None
 
     @abc.abstractmethod
     def _get_patch_target(self) -> Callable[..., Any]:
@@ -272,23 +296,20 @@ def _stop_service(self) -> None:
 
     def patch(self) -> None:
         """Patch the module for tracking lock allocation."""
-        # We only patch the lock from the `threading` module.
-        # Nobody should use locks from `_thread`; if they do so, then it's deliberate and we don't profile.
-        self._original = self._get_patch_target()
+        self._original_lock = self._get_patch_target()
+        original_lock: Any = self._original_lock  # Capture non-None value
 
-        # TODO: `instance` is unused
-        def _allocate_lock(wrapped: Any, instance: Any, args: Any, kwargs: Any) -> _ProfiledLock:
-            lock: Any = wrapped(*args, **kwargs)
+        def _profiled_allocate_lock(*args: Any, **kwargs: Any) -> _ProfiledLock:
+            """Simple wrapper that returns profiled locks."""
             return self.PROFILED_LOCK_CLASS(
-                lock,
-                self.tracer,
-                self.nframes,
-                self._capture_sampler,
-                self.endpoint_collection_enabled,
+                wrapped=original_lock(*args, **kwargs),
+                tracer=self.tracer,
+                max_nframes=self.nframes,
+                capture_sampler=self._capture_sampler,
             )
 
-        self._set_patch_target(FunctionWrapper(self._original, _allocate_lock))
+        self._set_patch_target(_LockAllocatorWrapper(_profiled_allocate_lock))
 
     def unpatch(self) -> None:
         """Unpatch the threading module for tracking lock allocation."""
-        self._set_patch_target(self._original)
+        self._set_patch_target(self._original_lock)
diff --git a/releasenotes/notes/remove-wrapt-lock-profiler-c5f2e83097e0ff28.yaml b/releasenotes/notes/remove-wrapt-lock-profiler-c5f2e83097e0ff28.yaml
@@ -0,0 +1,4 @@
+---
+other:
+  - |
+    profiling: This removes the ``wrapt`` library dependency from the Lock Profiler implementation, improving performance and reducing overhead during lock instrumentation.
diff --git a/tests/profiling_v2/collector/test_asyncio.py b/tests/profiling_v2/collector/test_asyncio.py
@@ -21,9 +21,7 @@
 def test_repr():
     test_collector._test_repr(
         collector_asyncio.AsyncioLockCollector,
-        "AsyncioLockCollector(status=<ServiceStatus.STOPPED: 'stopped'>, "
-        "capture_pct=1.0, nframes=64, "
-        "endpoint_collection_enabled=True, tracer=None)",
+        "AsyncioLockCollector(status=<ServiceStatus.STOPPED: 'stopped'>, capture_pct=1.0, nframes=64, tracer=None)",  # noqa: E501
     )
 
 
diff --git a/tests/profiling_v2/collector/test_threading.py b/tests/profiling_v2/collector/test_threading.py

-Original file line number
+Diff line change
@@ @@ -0,0 +1,4 @@ @@
 +---
 +other:
 +  - |
 +    profiling: This removes the ``wrapt`` library dependency from the Lock Profiler implementation, improving performance and reducing overhead during lock instrumentation.
Original file line number	Diff line number	Diff line change
`@@ -21,9 +21,7 @@`
`21`	`21`	`def test_repr():`
`22`	`22`	`test_collector._test_repr(`
`23`	`23`	`collector_asyncio.AsyncioLockCollector,`
`24`		`- "AsyncioLockCollector(status=<ServiceStatus.STOPPED: 'stopped'>, "`
`25`		`- "capture_pct=1.0, nframes=64, "`
`26`		`- "endpoint_collection_enabled=True, tracer=None)",`
	`24`	`+ "AsyncioLockCollector(status=<ServiceStatus.STOPPED: 'stopped'>, capture_pct=1.0, nframes=64, tracer=None)", # noqa: E501`
`27`	`25`	`)`
`28`	`26`
`29`	`27`