Skip to content

Commit 45e958c

Browse files
authored
Add Korean RRN identifiers to PII mask / block (#29)
* adding KR_RRN support from Presidio * Update tests
1 parent 794ff02 commit 45e958c

File tree

3 files changed

+273
-31
lines changed

3 files changed

+273
-31
lines changed

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -11,7 +11,7 @@ dependencies = [
1111
"pydantic>=2.11.3",
1212
"openai-agents>=0.3.3",
1313
"pip>=25.0.1",
14-
"presidio-analyzer>=2.2.358",
14+
"presidio-analyzer>=2.2.360",
1515
]
1616
classifiers = [
1717
"Typing :: Typed",

src/guardrails/checks/text/pii.py

Lines changed: 38 additions & 30 deletions
Original file line numberDiff line numberDiff line change
@@ -79,8 +79,11 @@
7979
from enum import Enum
8080
from typing import Any, Final
8181

82-
from presidio_analyzer import AnalyzerEngine, RecognizerResult
82+
from presidio_analyzer import AnalyzerEngine, RecognizerRegistry, RecognizerResult
8383
from presidio_analyzer.nlp_engine import NlpEngineProvider
84+
from presidio_analyzer.predefined_recognizers.country_specific.korea.kr_rrn_recognizer import (
85+
KrRrnRecognizer,
86+
)
8487
from pydantic import BaseModel, ConfigDict, Field
8588

8689
from guardrails.registry import default_spec_registry
@@ -94,24 +97,35 @@
9497

9598
@functools.lru_cache(maxsize=1)
9699
def _get_analyzer_engine() -> AnalyzerEngine:
97-
"""Return a cached, configured Presidio AnalyzerEngine instance.
100+
"""Return a cached AnalyzerEngine configured with Presidio recognizers.
101+
102+
The engine loads Presidio's default recognizers for English and explicitly
103+
registers the built-in KR_RRN recognizer to make it available alongside
104+
other PII detectors within the guardrail.
98105
99106
Returns:
100-
AnalyzerEngine: Initialized Presidio analyzer engine.
107+
AnalyzerEngine: Analyzer configured with English NLP support and
108+
region-specific recognizers backed by Presidio.
101109
"""
102-
# Define a smaller NLP configuration
103-
sm_nlp_config: Final[dict[str, Any]] = {
110+
nlp_config: Final[dict[str, Any]] = {
104111
"nlp_engine_name": "spacy",
105-
"models": [{"lang_code": "en", "model_name": "en_core_web_sm"}],
112+
"models": [
113+
{"lang_code": "en", "model_name": "en_core_web_sm"},
114+
],
106115
}
107116

108-
# Reduce the size of the nlp model loaded by Presidio
109-
provider = NlpEngineProvider(nlp_configuration=sm_nlp_config)
110-
sm_nlp_engine = provider.create_engine()
117+
provider = NlpEngineProvider(nlp_configuration=nlp_config)
118+
nlp_engine = provider.create_engine()
119+
120+
registry = RecognizerRegistry(supported_languages=["en"])
121+
registry.load_predefined_recognizers(languages=["en"], nlp_engine=nlp_engine)
122+
registry.add_recognizer(KrRrnRecognizer(supported_language="en"))
111123

112-
# Analyzer using minimal NLP
113-
engine = AnalyzerEngine(nlp_engine=sm_nlp_engine)
114-
logger.debug("Initialized Presidio analyzer engine")
124+
engine = AnalyzerEngine(
125+
registry=registry,
126+
nlp_engine=nlp_engine,
127+
supported_languages=["en"],
128+
)
115129
return engine
116130

117131

@@ -183,6 +197,9 @@ class PIIEntity(str, Enum):
183197
# Finland
184198
FI_PERSONAL_IDENTITY_CODE = "FI_PERSONAL_IDENTITY_CODE"
185199

200+
# Korea
201+
KR_RRN = "KR_RRN"
202+
186203

187204
class PIIConfig(BaseModel):
188205
"""Configuration schema for PII detection.
@@ -233,6 +250,9 @@ def to_dict(self) -> dict[str, list[str]]:
233250
def _detect_pii(text: str, config: PIIConfig) -> PiiDetectionResult:
234251
"""Run Presidio analysis and collect findings by entity type.
235252
253+
Supports detection of Korean (KR_RRN) and other region-specific entities via
254+
Presidio recognizers registered with the analyzer engine.
255+
236256
Args:
237257
text (str): The text to analyze for PII.
238258
config (PIIConfig): PII detection configuration.
@@ -247,22 +267,18 @@ def _detect_pii(text: str, config: PIIConfig) -> PiiDetectionResult:
247267
raise ValueError("Text cannot be empty or None")
248268

249269
engine = _get_analyzer_engine()
270+
271+
# Run analysis for all configured entities
272+
# Region-specific recognizers (e.g., KR_RRN) are registered with language="en"
250273
analyzer_results = engine.analyze(text, entities=[e.value for e in config.entities], language="en")
251274

252-
# Filter results once and create both mapping and filtered results
253-
filtered_results = [res for res in analyzer_results if res.entity_type in config.entities]
275+
# Filter results and create mapping
276+
entity_values = {e.value for e in config.entities}
277+
filtered_results = [res for res in analyzer_results if res.entity_type in entity_values]
254278
grouped: dict[str, list[str]] = defaultdict(list)
255279
for res in filtered_results:
256280
grouped[res.entity_type].append(text[res.start : res.end])
257281

258-
logger.debug(
259-
"PII detection completed",
260-
extra={
261-
"event": "pii_detection",
262-
"entities_found": len(filtered_results),
263-
"entity_types": list(grouped.keys()),
264-
},
265-
)
266282
return PiiDetectionResult(mapping=dict(grouped), analyzer_results=filtered_results)
267283

268284

@@ -303,14 +319,6 @@ def _mask_pii(text: str, detection: PiiDetectionResult, config: PIIConfig) -> st
303319
result = result[:start] + replacement + result[end:]
304320
offset += len(replacement) - (end - start)
305321

306-
logger.debug(
307-
"PII masking completed",
308-
extra={
309-
"event": "pii_masking",
310-
"entities_masked": len(sorted_results),
311-
"entity_types": [res.entity_type for res in sorted_results],
312-
},
313-
)
314322
return result
315323

316324

tests/unit/checks/test_pii.py

Lines changed: 234 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,234 @@
1+
"""Tests for PII detection guardrail.
2+
3+
This module tests the PII detection functionality including entity detection,
4+
masking behavior, and blocking behavior for various entity types.
5+
"""
6+
7+
from __future__ import annotations
8+
9+
import pytest
10+
11+
from guardrails.checks.text.pii import PIIConfig, PIIEntity, pii
12+
from guardrails.types import GuardrailResult
13+
14+
15+
@pytest.mark.asyncio
16+
async def test_pii_detects_korean_resident_registration_number() -> None:
17+
"""Detect Korean Resident Registration Numbers with valid date and checksum."""
18+
config = PIIConfig(entities=[PIIEntity.KR_RRN], block=True)
19+
# Using valid RRN: 900101-2345670
20+
# Date: 900101 (Jan 1, 1990), Gender: 2, Serial: 34567, Checksum: 0
21+
result = await pii(None, "My RRN is 900101-2345670", config)
22+
23+
assert isinstance(result, GuardrailResult) # noqa: S101
24+
assert result.tripwire_triggered is True # noqa: S101
25+
assert result.info["guardrail_name"] == "Contains PII" # noqa: S101
26+
assert result.info["pii_detected"] is True # noqa: S101
27+
assert "KR_RRN" in result.info["detected_entities"] # noqa: S101
28+
29+
30+
@pytest.mark.asyncio
31+
async def test_pii_masks_korean_rrn_in_non_blocking_mode() -> None:
32+
"""Korean RRN with valid date and checksum should be masked when block=False."""
33+
config = PIIConfig(entities=[PIIEntity.KR_RRN], block=False)
34+
# Using valid RRN: 900101-2345670
35+
result = await pii(None, "My RRN is 900101-2345670", config)
36+
37+
assert result.tripwire_triggered is False # noqa: S101
38+
assert result.info["pii_detected"] is True # noqa: S101
39+
assert result.info["block_mode"] is False # noqa: S101
40+
assert "<KR_RRN>" in result.info["checked_text"] # noqa: S101
41+
42+
43+
@pytest.mark.asyncio
44+
async def test_pii_detects_multiple_entity_types() -> None:
45+
"""Detect multiple PII entity types with valid dates and checksums."""
46+
config = PIIConfig(
47+
entities=[PIIEntity.EMAIL_ADDRESS, PIIEntity.KR_RRN],
48+
block=True,
49+
)
50+
result = await pii(
51+
None,
52+
"Contact: user@example.com, Korean RRN: 900101-2345670",
53+
config,
54+
)
55+
56+
assert result.tripwire_triggered is True # noqa: S101
57+
assert result.info["pii_detected"] is True # noqa: S101
58+
detected = result.info["detected_entities"]
59+
# Verify both entity types are detected
60+
assert "EMAIL_ADDRESS" in detected # noqa: S101
61+
assert "KR_RRN" in detected # noqa: S101
62+
# Verify actual values were captured
63+
assert detected["EMAIL_ADDRESS"] == ["user@example.com"] # noqa: S101
64+
assert detected["KR_RRN"] == ["900101-2345670"] # noqa: S101
65+
66+
67+
@pytest.mark.asyncio
68+
async def test_pii_masks_multiple_entity_types() -> None:
69+
"""Mask multiple PII entity types with valid checksums."""
70+
config = PIIConfig(
71+
entities=[PIIEntity.EMAIL_ADDRESS, PIIEntity.KR_RRN],
72+
block=False,
73+
)
74+
result = await pii(
75+
None,
76+
"Contact: user@example.com, Korean RRN: 123456-1234563",
77+
config,
78+
)
79+
80+
assert result.tripwire_triggered is False # noqa: S101
81+
assert result.info["pii_detected"] is True # noqa: S101
82+
checked_text = result.info["checked_text"]
83+
assert "<EMAIL_ADDRESS>" in checked_text # noqa: S101
84+
85+
86+
@pytest.mark.asyncio
87+
async def test_pii_does_not_trigger_on_clean_text() -> None:
88+
"""Guardrail should not trigger when no PII is present."""
89+
config = PIIConfig(entities=[PIIEntity.KR_RRN, PIIEntity.EMAIL_ADDRESS], block=True)
90+
result = await pii(None, "This is clean text with no PII", config)
91+
92+
assert result.tripwire_triggered is False # noqa: S101
93+
assert result.info["pii_detected"] is False # noqa: S101
94+
assert result.info["detected_entities"] == {} # noqa: S101
95+
96+
97+
@pytest.mark.asyncio
98+
async def test_pii_blocking_mode_triggers_tripwire() -> None:
99+
"""Blocking mode should trigger tripwire when PII is detected."""
100+
config = PIIConfig(entities=[PIIEntity.EMAIL_ADDRESS], block=True)
101+
result = await pii(None, "Contact me at test@example.com", config)
102+
103+
assert result.tripwire_triggered is True # noqa: S101
104+
assert result.info["block_mode"] is True # noqa: S101
105+
assert result.info["pii_detected"] is True # noqa: S101
106+
107+
108+
@pytest.mark.asyncio
109+
async def test_pii_masking_mode_does_not_trigger_tripwire() -> None:
110+
"""Masking mode should not trigger tripwire even when PII is detected."""
111+
config = PIIConfig(entities=[PIIEntity.EMAIL_ADDRESS], block=False)
112+
result = await pii(None, "Contact me at test@example.com", config)
113+
114+
assert result.tripwire_triggered is False # noqa: S101
115+
assert result.info["block_mode"] is False # noqa: S101
116+
assert result.info["pii_detected"] is True # noqa: S101
117+
assert "<EMAIL_ADDRESS>" in result.info["checked_text"] # noqa: S101
118+
119+
120+
@pytest.mark.asyncio
121+
async def test_pii_checked_text_unchanged_when_no_pii() -> None:
122+
"""Checked text should remain unchanged when no PII is detected."""
123+
original_text = "This is clean text"
124+
config = PIIConfig(entities=[PIIEntity.EMAIL_ADDRESS, PIIEntity.KR_RRN], block=False)
125+
result = await pii(None, original_text, config)
126+
127+
assert result.info["checked_text"] == original_text # noqa: S101
128+
assert result.tripwire_triggered is False # noqa: S101
129+
130+
131+
@pytest.mark.asyncio
132+
async def test_pii_entity_types_checked_in_result() -> None:
133+
"""Result should include list of entity types that were checked."""
134+
config = PIIConfig(entities=[PIIEntity.KR_RRN, PIIEntity.EMAIL_ADDRESS, PIIEntity.US_SSN])
135+
result = await pii(None, "Clean text", config)
136+
137+
entity_types = result.info["entity_types_checked"]
138+
assert PIIEntity.KR_RRN in entity_types # noqa: S101
139+
assert PIIEntity.EMAIL_ADDRESS in entity_types # noqa: S101
140+
assert PIIEntity.US_SSN in entity_types # noqa: S101
141+
142+
143+
@pytest.mark.asyncio
144+
async def test_pii_config_defaults_to_masking_mode() -> None:
145+
"""PIIConfig should default to masking mode (block=False)."""
146+
config = PIIConfig(entities=[PIIEntity.EMAIL_ADDRESS])
147+
148+
assert config.block is False # noqa: S101
149+
150+
151+
@pytest.mark.asyncio
152+
async def test_pii_detects_us_ssn() -> None:
153+
"""Detect US Social Security Numbers (regression test for existing functionality)."""
154+
config = PIIConfig(entities=[PIIEntity.US_SSN], block=True)
155+
# Use a valid SSN pattern that Presidio can detect (Presidio validates SSN patterns)
156+
result = await pii(None, "My social security number is 856-45-6789", config)
157+
158+
assert result.tripwire_triggered is True # noqa: S101
159+
assert result.info["pii_detected"] is True # noqa: S101
160+
assert "US_SSN" in result.info["detected_entities"] # noqa: S101
161+
162+
163+
@pytest.mark.asyncio
164+
async def test_pii_detects_phone_numbers() -> None:
165+
"""Detect phone numbers (regression test for existing functionality)."""
166+
config = PIIConfig(entities=[PIIEntity.PHONE_NUMBER], block=True)
167+
result = await pii(None, "Call me at 555-123-4567", config)
168+
169+
assert result.tripwire_triggered is True # noqa: S101
170+
assert result.info["pii_detected"] is True # noqa: S101
171+
assert "PHONE_NUMBER" in result.info["detected_entities"] # noqa: S101
172+
173+
174+
@pytest.mark.asyncio
175+
async def test_pii_multiple_occurrences_of_same_entity() -> None:
176+
"""Detect multiple occurrences of the same entity type."""
177+
config = PIIConfig(entities=[PIIEntity.EMAIL_ADDRESS], block=True)
178+
result = await pii(
179+
None,
180+
"Contact alice@example.com or bob@example.com",
181+
config,
182+
)
183+
184+
assert result.tripwire_triggered is True # noqa: S101
185+
assert result.info["pii_detected"] is True # noqa: S101
186+
assert "EMAIL_ADDRESS" in result.info["detected_entities"] # noqa: S101
187+
assert len(result.info["detected_entities"]["EMAIL_ADDRESS"]) >= 1 # noqa: S101
188+
189+
190+
@pytest.mark.asyncio
191+
async def test_pii_detects_korean_rrn_with_invalid_checksum() -> None:
192+
"""Presidio's KR_RRN recognizer detects patterns even with invalid checksums.
193+
194+
Note: Presidio 2.2.360's implementation focuses on pattern matching rather than
195+
strict checksum validation, so it will detect RRN-like patterns regardless of
196+
checksum validity.
197+
"""
198+
config = PIIConfig(entities=[PIIEntity.KR_RRN], block=True)
199+
# Using valid date but invalid checksum: 900101-2345679 (should be 900101-2345670)
200+
result = await pii(None, "My RRN is 900101-2345679", config)
201+
202+
assert result.tripwire_triggered is True # noqa: S101
203+
assert result.info["pii_detected"] is True # noqa: S101
204+
assert "KR_RRN" in result.info["detected_entities"] # noqa: S101
205+
206+
207+
@pytest.mark.asyncio
208+
async def test_pii_detects_korean_rrn_with_invalid_date() -> None:
209+
"""Presidio's KR_RRN recognizer detects some patterns even with invalid dates.
210+
211+
Note: Presidio 2.2.360's implementation may detect certain RRN-like patterns
212+
even if the date component is invalid (e.g., Feb 30). The recognizer prioritizes
213+
pattern matching over strict date validation.
214+
"""
215+
config = PIIConfig(entities=[PIIEntity.KR_RRN], block=True)
216+
# Testing with Feb 30 which is an invalid date but matches the pattern
217+
result = await pii(None, "Korean RRN: 990230-1234567", config)
218+
219+
# Presidio detects this pattern despite the invalid date
220+
assert result.tripwire_triggered is True # noqa: S101
221+
assert result.info["pii_detected"] is True # noqa: S101
222+
assert "KR_RRN" in result.info["detected_entities"] # noqa: S101
223+
224+
225+
@pytest.mark.asyncio
226+
async def test_pii_accepts_valid_korean_rrn_dates() -> None:
227+
"""Korean RRN with a valid date and checksum should be detected in masking mode."""
228+
config = PIIConfig(entities=[PIIEntity.KR_RRN], block=False)
229+
valid_rrn = "900101-1234568"
230+
result = await pii(None, f"RRN: {valid_rrn}", config)
231+
232+
# Should detect if date is valid
233+
assert result.info["pii_detected"] is True # noqa: S101
234+
assert "KR_RRN" in result.info["detected_entities"] # noqa: S101

0 commit comments

Comments
 (0)