Skip to content

Commit bf65130

Browse files
authored
Remove dependency on Presidio Anonymizer (#47)
* Remove dependency on Presidio Anonymizer * Mask in sorted order * Improve overlap detection * Fix known bank codes * Clean up test files
1 parent d98116d commit bf65130

File tree

6 files changed

+492
-44
lines changed

6 files changed

+492
-44
lines changed

pyproject.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,6 @@ dependencies = [
1212
"openai-agents>=0.3.3",
1313
"pip>=25.0.1",
1414
"presidio-analyzer>=2.2.360",
15-
"presidio-anonymizer>=2.2.360",
1615
"thinc>=8.3.6",
1716
]
1817
classifiers = [

src/guardrails/_base_client.py

Lines changed: 41 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -246,8 +246,7 @@ def _apply_pii_masking_to_structured_content(
246246
Returns:
247247
Modified messages with PII masking applied to each text part
248248
"""
249-
from presidio_anonymizer import AnonymizerEngine
250-
from presidio_anonymizer.entities import OperatorConfig
249+
from guardrails.utils.anonymizer import OperatorConfig, anonymize
251250

252251
# Extract detected entity types and config
253252
detected = pii_result.info.get("detected_entities", {})
@@ -256,18 +255,17 @@ def _apply_pii_masking_to_structured_content(
256255

257256
detect_encoded_pii = pii_result.info.get("detect_encoded_pii", False)
258257

259-
# Get Presidio engines - entity types are guaranteed valid from detection
258+
# Get analyzer engine - entity types are guaranteed valid from detection
260259
from .checks.text.pii import _get_analyzer_engine
261260

262261
analyzer = _get_analyzer_engine()
263-
anonymizer = AnonymizerEngine()
264262
entity_types = list(detected.keys())
265263

266264
# Create operators for each entity type
267265
operators = {entity_type: OperatorConfig("replace", {"new_value": f"<{entity_type}>"}) for entity_type in entity_types}
268266

269267
def _mask_text(text: str) -> str:
270-
"""Mask using Presidio's analyzer and anonymizer with Unicode normalization.
268+
"""Mask using custom anonymizer with Unicode normalization.
271269
272270
Handles both plain and encoded PII consistently with main detection path.
273271
"""
@@ -302,7 +300,7 @@ def _mask_text(text: str) -> str:
302300
# Mask plain PII
303301
masked = normalized
304302
if has_plain_pii:
305-
masked = anonymizer.anonymize(text=masked, analyzer_results=analyzer_results, operators=operators).text
303+
masked = anonymize(text=masked, analyzer_results=analyzer_results, operators=operators).text
306304

307305
# Mask encoded PII if found
308306
if has_encoded_pii:
@@ -311,19 +309,50 @@ def _mask_text(text: str) -> str:
311309
decoded_results = analyzer.analyze(decoded_text_for_masking, entities=entity_types, language="en")
312310

313311
if decoded_results:
314-
# Map detections back to mask encoded chunks
312+
# Build list of (candidate, entity_type) pairs to mask
313+
candidates_to_mask = []
314+
315315
for result in decoded_results:
316316
detected_value = decoded_text_for_masking[result.start : result.end]
317317
entity_type = result.entity_type
318318

319-
# Find candidate that contains this PII
319+
# Find candidate that overlaps with this PII
320+
# Use comprehensive overlap logic matching pii.py implementation
320321
for candidate in candidates_for_masking:
321-
if detected_value in candidate.decoded_text:
322-
# Mask the encoded version
323-
entity_marker = f"<{entity_type}_ENCODED>"
324-
masked = masked[: candidate.start] + entity_marker + masked[candidate.end :]
322+
if not candidate.decoded_text:
323+
continue
324+
325+
candidate_lower = candidate.decoded_text.lower()
326+
detected_lower = detected_value.lower()
327+
328+
# Check if candidate's decoded text overlaps with the detection
329+
# Handle partial encodings where encoded span may include extra characters
330+
# e.g., %3A%6a%6f%65%40 → ":joe@" but only "joe@" is in email "joe@domain.com"
331+
has_overlap = (
332+
candidate_lower in detected_lower # Candidate is substring of detection
333+
or detected_lower in candidate_lower # Detection is substring of candidate
334+
or (
335+
len(candidate_lower) >= 3
336+
and any( # Any 3-char chunk overlaps
337+
candidate_lower[i : i + 3] in detected_lower
338+
for i in range(len(candidate_lower) - 2)
339+
)
340+
)
341+
)
342+
343+
if has_overlap:
344+
candidates_to_mask.append((candidate, entity_type))
325345
break
326346

347+
# Sort by position (reverse) to mask from end to start
348+
# This preserves position validity for subsequent replacements
349+
candidates_to_mask.sort(key=lambda x: x[0].start, reverse=True)
350+
351+
# Mask from end to start
352+
for candidate, entity_type in candidates_to_mask:
353+
entity_marker = f"<{entity_type}_ENCODED>"
354+
masked = masked[: candidate.start] + entity_marker + masked[candidate.end :]
355+
327356
return masked
328357

329358
# Mask each text part

src/guardrails/checks/text/pii.py

Lines changed: 51 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -89,13 +89,12 @@
8989
from presidio_analyzer.predefined_recognizers.country_specific.korea.kr_rrn_recognizer import (
9090
KrRrnRecognizer,
9191
)
92-
from presidio_anonymizer import AnonymizerEngine
93-
from presidio_anonymizer.entities import OperatorConfig
9492
from pydantic import BaseModel, ConfigDict, Field
9593

9694
from guardrails.registry import default_spec_registry
9795
from guardrails.spec import GuardrailSpecMetadata
9896
from guardrails.types import GuardrailResult
97+
from guardrails.utils.anonymizer import OperatorConfig, anonymize
9998

10099
__all__ = ["pii"]
101100

@@ -155,15 +154,54 @@ def _get_analyzer_engine() -> AnalyzerEngine:
155154
)
156155

157156
# BIC/SWIFT code recognizer (8 or 11 characters: 4 bank + 2 country + 2 location + 3 branch)
158-
bic_pattern = Pattern(
159-
name="bic_swift_pattern",
160-
regex=r"\b[A-Z]{4}[A-Z]{2}[A-Z0-9]{2}([A-Z0-9]{3})?\b",
161-
score=0.75,
157+
# Uses context-aware pattern to reduce false positives on common words like "CUSTOMER"
158+
# Requires either:
159+
# 1. Explicit prefix (SWIFT:, BIC:, Bank Code:, etc.) OR
160+
# 2. Known bank code from major financial institutions
161+
# This significantly reduces false positives while maintaining high recall for actual BIC codes
162+
163+
# Pattern 1: Explicit context with common BIC/SWIFT prefixes (high confidence)
164+
# Case-insensitive for the context words, but code itself must be uppercase
165+
bic_with_context_pattern = Pattern(
166+
name="bic_with_context",
167+
regex=r"(?i)(?:swift|bic|bank[\s-]?code|swift[\s-]?code|bic[\s-]?code)(?-i)[:\s=]+([A-Z]{4}[A-Z]{2}[A-Z0-9]{2}(?:[A-Z0-9]{3})?)\b",
168+
score=0.95,
162169
)
170+
171+
# Pattern 2: Known banking institutions (4-letter bank codes from major banks)
172+
# This whitelist approach has very low false positive rate
173+
# Only detects codes starting with known bank identifiers
174+
# NOTE: Must be exactly 4 characters (bank identifier only, not full BIC)
175+
known_bank_codes = (
176+
"DEUT|CHAS|BARC|HSBC|BNPA|CITI|WELL|BOFA|JPMC|GSCC|MSNY|" # Major international
177+
"COBA|DRSD|BYLA|MALA|HYVE|" # Germany
178+
"WFBI|USBC|" # US
179+
"LOYD|MIDL|NWBK|RBOS|" # UK
180+
"CRLY|SOGE|AGRI|" # France
181+
"UBSW|CRES|" # Switzerland
182+
"SANB|BBVA|" # Spain
183+
"UNCR|BCIT|" # Italy
184+
"INGB|ABNA|RABO|" # Netherlands
185+
"ROYA|TDOM|BNSC|" # Canada
186+
"ANZB|NATA|WPAC|CTBA|" # Australia
187+
"BKCH|MHCB|BOTK|" # Japan
188+
"ICBK|ABOC|PCBC|" # China
189+
"HSBC|SCBL|" # Hong Kong
190+
"DBSS|OCBC|UOVB|" # Singapore
191+
"CZNB|SHBK|KOEX|HVBK|NACF|IBKO|KODB|HNBN|CITI" # South Korea
192+
)
193+
194+
known_bic_pattern = Pattern(
195+
name="known_bic_codes",
196+
regex=rf"\b(?:{known_bank_codes})[A-Z]{{2}}[A-Z0-9]{{2}}(?:[A-Z0-9]{{3}})?\b",
197+
score=0.90,
198+
)
199+
200+
# Register both patterns
163201
registry.add_recognizer(
164202
PatternRecognizer(
165203
supported_entity="BIC_SWIFT",
166-
patterns=[bic_pattern],
204+
patterns=[bic_with_context_pattern, known_bic_pattern],
167205
supported_language="en",
168206
)
169207
)
@@ -192,19 +230,6 @@ def _get_analyzer_engine() -> AnalyzerEngine:
192230
return engine
193231

194232

195-
@functools.lru_cache(maxsize=1)
196-
def _get_anonymizer_engine() -> AnonymizerEngine:
197-
"""Return a cached AnonymizerEngine for PII masking.
198-
199-
Uses Presidio's built-in anonymization for optimal performance and
200-
correct handling of overlapping entities, Unicode, and special characters.
201-
202-
Returns:
203-
AnonymizerEngine: Configured anonymizer for replacing PII entities.
204-
"""
205-
return AnonymizerEngine()
206-
207-
208233
class PIIEntity(str, Enum):
209234
"""Supported PII entity types for detection.
210235
@@ -460,9 +485,7 @@ def _try_decode_base64(text: str) -> str | None:
460485
decoded_bytes = base64.b64decode(text, validate=True)
461486
# Security: Fail closed - reject content > 10KB to prevent memory DoS and PII bypass
462487
if len(decoded_bytes) > 10_000:
463-
msg = (
464-
f"Base64 decoded content too large ({len(decoded_bytes):,} bytes). Maximum allowed is 10KB."
465-
)
488+
msg = f"Base64 decoded content too large ({len(decoded_bytes):,} bytes). Maximum allowed is 10KB."
466489
raise ValueError(msg)
467490
# Check if result is valid UTF-8
468491
return decoded_bytes.decode("utf-8", errors="strict")
@@ -590,11 +613,10 @@ def _build_decoded_text(text: str) -> tuple[str, list[EncodedCandidate]]:
590613

591614

592615
def _mask_pii(text: str, detection: PiiDetectionResult, config: PIIConfig) -> tuple[str, dict[str, list[str]]]:
593-
"""Mask detected PII using Presidio's AnonymizerEngine.
616+
"""Mask detected PII using custom anonymizer.
594617
595618
Normalizes Unicode before masking to ensure consistency with detection.
596-
Uses Presidio's built-in anonymization for optimal performance and
597-
correct handling of overlapping entities, Unicode, and special characters.
619+
Handles overlapping entities, Unicode, and special characters correctly.
598620
599621
If detect_encoded_pii is enabled, also detects and masks PII in
600622
Base64/URL-encoded/hex strings using a hybrid approach.
@@ -627,13 +649,11 @@ def _mask_pii(text: str, detection: PiiDetectionResult, config: PIIConfig) -> tu
627649
# No PII detected - return original text to preserve special characters
628650
return text, {}
629651

630-
# Use Presidio's optimized anonymizer with replace operator
631-
anonymizer = _get_anonymizer_engine()
632-
633652
# Create operators mapping each entity type to a replace operator
634653
operators = {entity_type: OperatorConfig("replace", {"new_value": f"<{entity_type}>"}) for entity_type in detection.mapping.keys()}
635654

636-
result = anonymizer.anonymize(
655+
# Use custom anonymizer
656+
result = anonymize(
637657
text=normalized_text,
638658
analyzer_results=detection.analyzer_results,
639659
operators=operators,
@@ -706,7 +726,7 @@ def _mask_encoded_pii(text: str, config: PIIConfig, original_text: str | None =
706726
len(candidate_lower) >= 3
707727
and any( # Any 3-char chunk overlaps
708728
candidate_lower[i : i + 3] in detected_lower
709-
for i in range(0, len(candidate_lower) - 2, 2) # Step by 2 for efficiency
729+
for i in range(len(candidate_lower) - 2)
710730
)
711731
)
712732
)

src/guardrails/utils/anonymizer.py

Lines changed: 148 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,148 @@
1+
"""Custom anonymizer for PII masking.
2+
3+
This module provides a lightweight replacement for presidio-anonymizer,
4+
implementing text masking functionality for detected PII entities.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
from collections.abc import Sequence
10+
from dataclasses import dataclass
11+
from typing import Any, Protocol
12+
13+
14+
class RecognizerResult(Protocol):
    """Structural type for the analyzer results consumed by this module.

    Any object exposing these three attributes is accepted; results
    produced by presidio-analyzer satisfy it.

    Attributes:
        start: Offset in the text where the entity begins.
        end: Offset in the text where the entity ends.
        entity_type: Label of the detected entity (e.g., "EMAIL_ADDRESS").
    """

    start: int
    end: int
    entity_type: str
26+
27+
28+
@dataclass(frozen=True, slots=True)
29+
class OperatorConfig:
30+
"""Configuration for an anonymization operator.
31+
32+
Args:
33+
operator_name: Name of the operator (e.g., "replace").
34+
params: Parameters for the operator (e.g., {"new_value": "<EMAIL>"}).
35+
"""
36+
37+
operator_name: str
38+
params: dict[str, Any]
39+
40+
41+
@dataclass(frozen=True, slots=True)
42+
class AnonymizeResult:
43+
"""Result of text anonymization.
44+
45+
Attributes:
46+
text: The anonymized text with entities masked.
47+
"""
48+
49+
text: str
50+
51+
52+
def _resolve_overlaps(results: Sequence[RecognizerResult]) -> list[RecognizerResult]:
    """Drop overlapping entity spans so that the survivors are disjoint.

    Spans are considered greedily in priority order:

    1. Longer spans win over shorter ones.
    2. Among spans of equal length, the one starting earlier wins.

    A span is kept only if it does not intersect any span already kept.
    Spans that merely touch (one ends exactly where the next begins) are
    not treated as overlapping.

    Args:
        results: Sequence of recognizer results to resolve.

    Returns:
        List of non-overlapping recognizer results, in priority order
        (not in text order).

    Examples:
        >>> # EMAIL_ADDRESS at (0, 20) and PERSON at (5, 10): EMAIL_ADDRESS is kept
        >>> # Two spans at (0, 10) and (5, 15): the one starting at 0 is kept
    """
    if not results:
        return []

    # start - end is the negated length, so an ascending sort puts the
    # longest spans first; the start offset breaks ties.
    by_priority = sorted(results, key=lambda r: (r.start - r.end, r.start))

    kept: list[RecognizerResult] = []
    for candidate in by_priority:
        # Half-open interval intersection test against every accepted span.
        collides = any(
            candidate.start < accepted.end and candidate.end > accepted.start
            for accepted in kept
        )
        if not collides:
            kept.append(candidate)

    return kept
93+
94+
95+
def anonymize(
    text: str,
    analyzer_results: Sequence[RecognizerResult],
    operators: dict[str, OperatorConfig],
) -> AnonymizeResult:
    """Anonymize text by replacing detected entities with placeholders.

    This function replicates presidio-anonymizer's behavior for the "replace"
    operator, which we use to mask PII with placeholders like "<EMAIL_ADDRESS>".

    Overlapping detections are first reduced with ``_resolve_overlaps``; the
    surviving spans are then substituted from the end of the text towards the
    start, so offsets of spans not yet processed remain valid.

    Operator lookup fails closed: a detected entity whose type has no entry in
    ``operators`` falls back to the ``"DEFAULT"`` entry when one is supplied,
    and otherwise to a plain ``<ENTITY_TYPE>`` placeholder. A detection is
    therefore never left unmasked merely because its operator is missing.
    Entities explicitly mapped to an operator other than "replace" are left
    unchanged, because no other operator is implemented here.

    Args:
        text: The original text to anonymize.
        analyzer_results: Sequence of detected entities with positions.
        operators: Mapping from entity type to operator configuration. May
            contain a ``"DEFAULT"`` key used for entity types not listed.

    Returns:
        AnonymizeResult with masked text. When ``text`` or
        ``analyzer_results`` is empty, ``text`` is returned unchanged.

    Examples:
        >>> from collections import namedtuple
        >>> Result = namedtuple("Result", ["start", "end", "entity_type"])
        >>> results = [Result(start=9, end=25, entity_type="EMAIL_ADDRESS")]
        >>> operators = {"EMAIL_ADDRESS": OperatorConfig("replace", {"new_value": "<EMAIL_ADDRESS>"})}
        >>> result = anonymize("Contact: john@example.com", results, operators)
        >>> result.text
        'Contact: <EMAIL_ADDRESS>'
    """
    if not analyzer_results or not text:
        return AnonymizeResult(text=text)

    # Optional catch-all operator for entity types without their own entry.
    default_config = operators.get("DEFAULT")

    # Resolve overlapping entities, then process right-to-left so that each
    # replacement cannot shift the offsets of spans still to be handled.
    non_overlapping = _resolve_overlaps(analyzer_results)
    sorted_results = sorted(non_overlapping, key=lambda r: r.start, reverse=True)

    masked_text = text
    for result in sorted_results:
        entity_type = result.entity_type
        placeholder = f"<{entity_type}>"
        operator_config = operators.get(entity_type, default_config)

        if operator_config is None:
            # No operator configured at all: mask anyway rather than leak PII.
            new_value = placeholder
        elif operator_config.operator_name == "replace":
            # ``params`` may be None when built by presidio-style callers.
            new_value = (operator_config.params or {}).get("new_value", placeholder)
        else:
            # Explicitly configured but unsupported operator: leave span as-is.
            continue

        masked_text = masked_text[: result.start] + new_value + masked_text[result.end :]

    return AnonymizeResult(text=masked_text)
148+

0 commit comments

Comments
 (0)