Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
6585c3b
fix: Rename method to get_recognizer_class_name for clarity and updat…
RonShakutai Dec 15, 2025
48349e2
fix: Clarify comments regarding excluded recognizer attributes in Rec…
RonShakutai Dec 15, 2025
ea6f9aa
feat: Add class_name parameter to BaseRecognizerConfig for improved r…
RonShakutai Dec 15, 2025
321352d
fix: Include 'class_name' in custom recognizers exclusion list for im…
RonShakutai Dec 15, 2025
a1b54e1
feat: Enhance Ollama recognizer to support custom instance names and …
RonShakutai Dec 23, 2025
d2e835e
Enhance recognizers to accept additional keyword arguments
RonShakutai Dec 23, 2025
f9540b3
Merge branch 'main' of https://github.com/microsoft/presidio into add…
RonShakutai Dec 23, 2025
db5974b
refactor: Simplify Ollama recognizer loading verification and assertions
RonShakutai Dec 23, 2025
53890b5
test: Update Ollama recognizer loading verification to ensure single …
RonShakutai Dec 23, 2025
e71ee99
feat: Enhance recognizer class name logic in RecognizerListLoader
RonShakutai Dec 23, 2025
b9b7caa
Refactor recognizers to explicitly handle 'name' parameter in __init_…
RonShakutai Dec 23, 2025
7acee5d
fix: Update Stanza and Transformers recognizers to handle additional …
RonShakutai Dec 23, 2025
73eed13
fix: Correct the import order for constants in methods.py
RonShakutai Dec 23, 2025
c2c122f
refactor: Remove update_recognizers_name.py script as its functionali…
RonShakutai Dec 23, 2025
87707f6
check
RonShakutai Dec 31, 2025
ee80999
Merge branch 'main' into add-recognizer-class-name
RonShakutai Jan 1, 2026
66d42e4
fix: Remove unnecessary comments and clean up recognizer configuratio…
RonShakutai Jan 1, 2026
a92cb0e
Refactor recognizer constructors to remove unused **kwargs parameter
RonShakutai Jan 1, 2026
e9569f7
refactor: Remove unused **kwargs parameter from recognizer initializers
RonShakutai Jan 1, 2026
6355d71
refactor: Remove unused **kwargs parameter from recognizer constructors
RonShakutai Jan 1, 2026
d6e906f
fix ci
RonShakutai Jan 1, 2026
8c87a90
refactor: format parameters in recognizer constructors for consistency
RonShakutai Jan 1, 2026
de7459c
refactor: format parameters in recognizer constructors for consistency
RonShakutai Jan 1, 2026
f53cf52
Merge branch 'main' into add-recognizer-class-name
RonShakutai Jan 5, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion e2e-tests/common/methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,8 @@
import requests

from common.constants import (
ANONYMIZER_BASE_URL,
ANALYZER_BASE_URL,
ANONYMIZER_BASE_URL,
IMAGE_REDACTOR_BASE_URL,
)

Expand Down
3 changes: 2 additions & 1 deletion e2e-tests/resources/test_ollama_enabled_recognizers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -200,9 +200,10 @@ recognizers:
type: predefined
enabled: false

- name: OllamaLangExtractRecognizer
- name: e2eollama
supported_languages:
- en
type: predefined
class_name: OllamaLangExtractRecognizer
enabled: true
config_path: e2e-tests/resources/ollama_test_config.yaml
50 changes: 19 additions & 31 deletions e2e-tests/tests/test_package_e2e_integration_flows.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,16 +71,18 @@ def test_given_text_with_pii_using_ollama_recognizer_then_detects_entities(tmp_p

text_to_test = "Patient John Smith, SSN 123-45-6789, email john@example.com, phone 555-123-4567, lives at 123 Main St, works at Acme Corp"

# Use pre-configured config file with small model (qwen2.5:1.5b)
import os
config_path = os.path.join(
os.path.dirname(__file__), "..", "resources", "ollama_test_config.yaml"
)

# Create Ollama recognizer with custom config
ollama_recognizer = OllamaLangExtractRecognizer(config_path=config_path)
ollama_recognizer = OllamaLangExtractRecognizer(
config_path=config_path, name="e2eollama"
)

assert ollama_recognizer.name == "e2eollama", \
f"Expected recognizer name to be 'e2eollama', got '{ollama_recognizer.name}'"

# Create analyzer with ONLY Ollama recognizer (no NLP engine, no default recognizers)
from presidio_analyzer.recognizer_registry import RecognizerRegistry
registry = RecognizerRegistry()
registry.add_recognizer(ollama_recognizer)
Expand All @@ -90,13 +92,10 @@ def test_given_text_with_pii_using_ollama_recognizer_then_detects_entities(tmp_p
supported_languages=["en"]
)

# Analyze text
results = analyzer.analyze(text_to_test, language="en")

# Verify at least some entities were detected
assert len(results) > 0, "Expected to detect at least one PII entity"

# Check which recognizers participated in detection
recognizers_used = set()
langextract_detected_at_least_one = False

Expand All @@ -108,12 +107,11 @@ def test_given_text_with_pii_using_ollama_recognizer_then_detects_entities(tmp_p
recognizers_used.add(recognizer_name)

langextract_detected_at_least_one |= (
recognizer_name == "Ollama LangExtract PII"
recognizer_name == "e2eollama"
)

# Verify that Ollama LangExtract recognizer participated in detection
assert langextract_detected_at_least_one, \
f"Expected 'Ollama LangExtract PII' recognizer to detect at least one entity. Recognizers used: {recognizers_used}"
f"Expected 'e2eollama' recognizer to detect at least one entity. Recognizers used: {recognizers_used}"


@pytest.mark.package
Expand All @@ -133,7 +131,6 @@ def test_ollama_recognizer_loads_from_yaml_configuration_when_enabled():
if not OLLAMA_RECOGNIZER_AVAILABLE:
pytest.skip("LangExtract not installed")

# Check if Ollama is available
import os
try:
import requests
Expand All @@ -144,7 +141,6 @@ def test_ollama_recognizer_loads_from_yaml_configuration_when_enabled():
except Exception:
pytest.skip("Ollama service not available")

# Load recognizer registry from YAML config with Ollama enabled
from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider

config_path = os.path.join(
Expand All @@ -155,40 +151,32 @@ def test_ollama_recognizer_loads_from_yaml_configuration_when_enabled():
provider = RecognizerRegistryProvider(conf_file=config_path)
registry = provider.create_recognizer_registry()

# Verify Ollama recognizer was loaded
ollama_recognizers = [r for r in registry.recognizers if "Ollama" in r.name]
ollama_recognizers = [r for r in registry.recognizers if r.name == "e2eollama"]
assert len(ollama_recognizers) == 1, \
f"Expected exactly 1 Ollama recognizer, found {len(ollama_recognizers)}"
f"Expected exactly 1 recognizer with name 'e2eollama', found {len(ollama_recognizers)}"

ollama_recognizer = ollama_recognizers[0]

ollama_rec = ollama_recognizers[0]
assert ollama_rec.name == "Ollama LangExtract PII"
assert ollama_rec.supported_language == "en"
assert len(ollama_rec.supported_entities) > 0
assert ollama_recognizer.__class__.__name__ == "OllamaLangExtractRecognizer", \
f"Expected class OllamaLangExtractRecognizer, got {ollama_recognizer.__class__.__name__}"

assert ollama_recognizer.supported_language == "en"
assert len(ollama_recognizer.supported_entities) > 0

# Test functionality: analyze text with the loaded recognizer
analyzer = AnalyzerEngine(registry=registry, supported_languages=["en"])

text_to_test = "Patient John Smith, SSN 123-45-6789, email john@example.com, phone 555-123-4567, lives at 123 Main St, works at Acme Corp"
results = analyzer.analyze(text_to_test, language="en")

# Should detect entities
assert len(results) > 0, "Expected to detect at least one PII entity"

# Check if Ollama recognizer detected anything
ollama_detected = any(
r.recognition_metadata and
"Ollama" in r.recognition_metadata.get(RecognizerResult.RECOGNIZER_NAME_KEY, "")
for r in results
)

# At minimum, other recognizers should detect common entities
entity_types = {r.entity_type for r in results}
expected_entities = {"EMAIL_ADDRESS", "PERSON", "PHONE_NUMBER", "US_SSN"}
detected_expected = entity_types & expected_entities

assert len(detected_expected) >= 2, \
f"Expected at least 2 entities from {expected_entities}, detected: {entity_types}"

print(f"\n✓ Ollama recognizer loaded successfully from YAML config")
print(f"\n✓ Ollama recognizer 'e2eollama' loaded successfully from YAML config")
print(f" Class: {ollama_recognizer.__class__.__name__}")
print(f" Detected entities: {entity_types}")
print(f" Ollama participated: {ollama_detected}")
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ recognizers:
# For predefined:
# - If only a recognizer name is provided, a predefined recognizer with this name and default parameters will be loaded.
# - If a parameter isn't provided, the default one would be loaded.
# - To give a predefined recognizer a custom 'name' (used for display/metadata), set 'class_name' to the Python class that should be loaded.
# For custom:
# - See an example configuration here: https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/conf/example_recognizers.yaml
# - Custom pattern recognizers with this configuration can be added to this file, with type: custom
Expand Down Expand Up @@ -206,8 +207,9 @@ recognizers:
- en
type: predefined
enabled: false

- name: OllamaLangExtractRecognizer

- name: OllamaRecognizer
class_name: OllamaLangExtractRecognizer
supported_languages:
- en
type: predefined
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,8 @@ def validate_language_code(cls, v: str) -> str:
class BaseRecognizerConfig(BaseModel):
"""Base validation for all recognizer configuration types.

:param name: Name of the recognizer
:param name: Instance name used in analysis results (required). Also used as the class name for lookup when 'class_name' is not provided.
:param class_name: Python class name for lookup. If not provided, uses 'name'.
:param enabled: Whether the recognizer is enabled
:param type: Type of recognizer (predefined/custom)
:param supported_language: Single supported language (legacy)
Expand All @@ -50,7 +51,14 @@ class BaseRecognizerConfig(BaseModel):
:param supported_entities: List of supported entities for this recognizer.
"""

name: str = Field(..., description="Name of the recognizer")
name: str = Field(..., description="Instance name for the recognizer")
class_name: Optional[str] = Field(
default=None,
description=(
"Python class name for predefined recognizers "
"(if different from instance name)"
),
)
enabled: bool = Field(default=True, description="Whether the recognizer is enabled")
type: Optional[str] = Field(
default="predefined", description="Type of recognizer (predefined/custom)"
Expand Down Expand Up @@ -136,11 +144,12 @@ class PredefinedRecognizerConfig(BaseRecognizerConfig):
@model_validator(mode="after")
def validate_predefined_recognizer_exists(self):
"""Validate that the predefined recognizer class actually exists."""
recognizer_class_name = self.class_name if self.class_name else self.name
try:
RecognizerListLoader.get_existing_recognizer_cls(self.name)
RecognizerListLoader.get_existing_recognizer_cls(recognizer_class_name)
except PredefinedRecognizerNotFoundError as e:
raise ValueError(
f"Predefined recognizer '{self.name}' not found: {str(e)}"
f"Predefined recognizer '{recognizer_class_name}' not found: {str(e)}"
) from e
return self

Expand Down Expand Up @@ -201,8 +210,6 @@ def check_predefined_name_conflict(cls, data: Any) -> Any:
f"for your custom recognizer."
)
except PredefinedRecognizerNotFoundError:
# Name is not a predefined recognizer,
# which is fine for custom recognizers
pass
return data

Expand Down Expand Up @@ -328,7 +335,6 @@ def parse_recognizers(
parsed_recognizers = []
for recognizer in recognizers:
if isinstance(recognizer, str):
# Simple string recognizer name - treat as predefined
parsed_recognizers.append(recognizer)
continue

Expand All @@ -346,7 +352,6 @@ def parse_recognizers(
f"Either use type: 'custom' or remove these fields."
)

# Auto-detect type if not provided
if not recognizer_type:
if "patterns" in recognizer or "deny_list" in recognizer:
recognizer_type = "custom"
Expand All @@ -357,7 +362,6 @@ def parse_recognizers(
recognizer_type = "predefined"
recognizer["type"] = recognizer_type

# Final append based on resolved type (only once)
if recognizer_type == "predefined":
parsed_recognizers.append(PredefinedRecognizerConfig(**recognizer))
elif recognizer_type == "custom":
Expand All @@ -369,7 +373,6 @@ def parse_recognizers(
)
continue

# Fallback: unrecognized structure, keep as-is
parsed_recognizers.append(recognizer)

return parsed_recognizers
Expand All @@ -378,7 +381,6 @@ def parse_recognizers(
def __check_if_predefined(cls, recognizer_name: Optional[Any]) -> None:
try:
RecognizerListLoader.get_existing_recognizer_cls(recognizer_name)
# If we reach here, it IS a predefined recognizer, so raise an error
raise ValueError(
f"Recognizer '{recognizer_name}' conflicts with a predefined "
f"recognizer. "
Expand All @@ -388,7 +390,6 @@ def __check_if_predefined(cls, recognizer_name: Optional[Any]) -> None:
f"for your custom recognizer."
)
except PredefinedRecognizerNotFoundError:
# Name is not a predefined recognizer, which is fine for custom recognizers
pass

@model_validator(mode="after")
Expand All @@ -401,12 +402,10 @@ def validate_language_presence(self):
custom_without_language_present = False
for r in self.recognizers:
if isinstance(r, (PredefinedRecognizerConfig, CustomRecognizerConfig)):
# Track if any language is defined
if (r.supported_language and r.supported_language.strip()) or (
r.supported_languages and len(r.supported_languages) > 0
):
any_language_defined = True
# Track custom recognizers lacking language info
if (
isinstance(r, CustomRecognizerConfig)
and not r.supported_language
Expand Down
4 changes: 2 additions & 2 deletions presidio-analyzer/presidio_analyzer/lm_recognizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,13 +31,13 @@ def __init__(
self,
supported_entities: Optional[List[str]] = None,
supported_language: str = "en",
name: str = "Language Model PII Recognizer",
name: Optional[str] = None,
version: str = "1.0.0",
model_id: Optional[str] = None,
temperature: Optional[float] = None,
min_score: float = 0.5,
labels_to_ignore: Optional[List[str]] = None,
enable_generic_consolidation: bool = True
enable_generic_consolidation: bool = True,
):
"""Initialize LM recognizer.

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@ def __init__(
supported_language: str = "en",
supported_entity: str = "AU_ABN",
replacement_pairs: Optional[List[Tuple[str, str]]] = None,
name: Optional[str] = None,
):
self.replacement_pairs = (
replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
Expand All @@ -61,6 +62,7 @@ def __init__(
patterns=patterns,
context=context,
supported_language=supported_language,
name=name,
)

def validate_result(self, pattern_text: str) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
supported_language: str = "en",
supported_entity: str = "AU_ACN",
replacement_pairs: Optional[List[Tuple[str, str]]] = None,
name: Optional[str] = None,
):
self.replacement_pairs = (
replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
Expand All @@ -58,6 +59,7 @@ def __init__(
patterns=patterns,
context=context,
supported_language=supported_language,
name=name,
)

def validate_result(self, pattern_text: str) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,7 @@ def __init__(
supported_language: str = "en",
supported_entity: str = "AU_MEDICARE",
replacement_pairs: Optional[List[Tuple[str, str]]] = None,
name: Optional[str] = None,
):
self.replacement_pairs = (
replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
Expand All @@ -58,6 +59,7 @@ def __init__(
patterns=patterns,
context=context,
supported_language=supported_language,
name=name,
)

def validate_result(self, pattern_text: str) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ def __init__(
supported_language: str = "en",
supported_entity: str = "AU_TFN",
replacement_pairs: Optional[List[Tuple[str, str]]] = None,
name: Optional[str] = None,
):
self.replacement_pairs = (
replacement_pairs if replacement_pairs else [("-", ""), (" ", "")]
Expand All @@ -64,6 +65,7 @@ def __init__(
patterns=patterns,
context=context,
supported_language=supported_language,
name=name,
)

def validate_result(self, pattern_text: str) -> bool:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ def __init__(
context: Optional[List[str]] = None,
supported_language: str = "fi",
supported_entity: str = "FI_PERSONAL_IDENTITY_CODE",
name: Optional[str] = None,
):
patterns = patterns if patterns else self.PATTERNS
context = context if context else self.CONTEXT
Expand All @@ -42,6 +43,7 @@ def __init__(
patterns=patterns,
context=context,
supported_language=supported_language,
name=name,
)

def validate_result(self, pattern_text: str) -> Optional[bool]:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@ def __init__(
supported_language: str = "en",
supported_entity: str = "IN_AADHAAR",
replacement_pairs: Optional[List[Tuple[str, str]]] = None,
name: Optional[str] = None,
) -> None:
self.replacement_pairs = (
replacement_pairs
Expand All @@ -54,6 +55,7 @@ def __init__(
patterns=patterns,
context=context,
supported_language=supported_language,
name=name,
)

def validate_result(self, pattern_text: str) -> bool:
Expand Down
Loading
Loading