From 159315e1f1a82663dc06a2c671e639ccac0a9ac6 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 19 Oct 2025 15:03:41 +0300 Subject: [PATCH 01/30] initial code for pydantic based validation for yaml files --- .../analyzer_engine_provider.py | 11 +- .../presidio_analyzer/analyzer_request.py | 5 +- .../input_validation/__init__.py | 21 + .../input_validation/schemas.py | 142 +++ .../yaml_recognizer_models.py | 490 +++++++++++ .../nlp_engine/ner_model_configuration.py | 136 +-- .../nlp_engine/nlp_engine_provider.py | 43 +- .../presidio_analyzer/pattern.py | 20 + .../recognizers_loader_utils.py | 43 +- presidio-analyzer/pyproject.toml | 3 +- presidio-analyzer/test-output.xml | 833 ++++++++++++++++++ .../tests/conf/custom_recognizer_yaml.yaml | 1 + .../tests/test_configuration_validator.py | 85 ++ .../tests/test_ner_model_configuration.py | 54 +- .../tests/test_nlp_engine_provider.py | 2 +- presidio-analyzer/tests/test_pattern.py | 46 + .../test_recognizer_registry_provider.py | 4 +- .../tests/test_yaml_recognizer_models.py | 744 ++++++++++++++++ 18 files changed, 2568 insertions(+), 115 deletions(-) create mode 100644 presidio-analyzer/presidio_analyzer/input_validation/__init__.py create mode 100644 presidio-analyzer/presidio_analyzer/input_validation/schemas.py create mode 100644 presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py create mode 100644 presidio-analyzer/test-output.xml create mode 100644 presidio-analyzer/tests/test_configuration_validator.py create mode 100644 presidio-analyzer/tests/test_yaml_recognizer_models.py diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py index c198e9000d..2689133a42 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py @@ -5,6 +5,7 @@ import yaml from presidio_analyzer import AnalyzerEngine, RecognizerRegistry +from presidio_analyzer.input_validation import ConfigurationValidator from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider @@ -36,7 +37,7 @@ def __init__( def get_configuration( self, conf_file: Optional[Union[Path, str]] ) -> Union[Dict[str, Any]]: - """Retrieve the analyzer engine configuration from the provided file.""" + """Retrieve analyzer engine configuration from the provided file.""" if not conf_file: default_conf_file = self._get_full_conf_path() @@ -63,6 +64,14 @@ def get_configuration( with open(self._get_full_conf_path()) as file: configuration = yaml.safe_load(file) + # Validate validation using enhanced validation + try: + ConfigurationValidator.validate_analyzer_configuration(configuration) + logger.debug("Analyzer validation validation passed") + except ValueError as e: + logger.error(f"Invalid analyzer validation: {e}") + raise ValueError(f"Configuration validation failed: {e}") + return configuration def create_engine(self) -> AnalyzerEngine: diff --git a/presidio-analyzer/presidio_analyzer/analyzer_request.py b/presidio-analyzer/presidio_analyzer/analyzer_request.py index 87574c7e7b..669d8bb822 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_request.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_request.py @@ -37,5 +37,6 @@ def __init__(self, req_data: Dict): self.context = req_data.get("context") self.allow_list = req_data.get("allow_list") self.allow_list_match = req_data.get("allow_list_match", "exact") - self.regex_flags = req_data.get("regex_flags", - re.DOTALL | re.MULTILINE | re.IGNORECASE) + self.regex_flags = req_data.get( + "regex_flags", re.DOTALL | re.MULTILINE | re.IGNORECASE + ) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/__init__.py b/presidio-analyzer/presidio_analyzer/input_validation/__init__.py new file mode 100644 index 0000000000..1e26821264 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/input_validation/__init__.py @@ -0,0 +1,21 @@ +"""Configuration validation module for Presidio.""" + +from .schemas import ConfigurationValidator +from .yaml_recognizer_models import ( + BaseRecognizerConfig, + CustomRecognizerConfig, + LanguageContextConfig, + PredefinedRecognizerConfig, + RecognizerRegistryConfig, + YamlRecognizerProcessor, +) + +__all__ = [ + "ConfigurationValidator", + "BaseRecognizerConfig", + "CustomRecognizerConfig", + "LanguageContextConfig", + "PredefinedRecognizerConfig", + "RecognizerRegistryConfig", + "YamlRecognizerProcessor", +] diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py new file mode 100644 index 0000000000..437be07d8b --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -0,0 +1,142 @@ +import re +from pathlib import Path +from typing import Any, Dict, List, Union + +from pydantic import ValidationError + + +class ConfigurationValidator: + """Class for validating configurations using Pydantic-enabled classes.""" + + @staticmethod + def validate_language_codes(languages: List[str]) -> List[str]: + """Validate language codes format. + + :param languages: List of languages to validate. + """ + for lang in languages: + if not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", lang): + raise ValueError( + f"Invalid language code format: {lang}. " + f"Expected format: 'en' or 'en-US'" + ) + return languages + + @staticmethod + def validate_file_path(file_path: Union[str, Path]) -> Path: + """Validate file path exists and is readable. + + :param file_path: Path to validate. + """ + path = Path(file_path) + if not path.exists(): + raise ValueError(f"Configuration file does not exist: {path}") + if not path.is_file(): + raise ValueError(f"Path is not a file: {path}") + return path + + @staticmethod + def validate_score_threshold(threshold: float) -> float: + """Validate score threshold is within valid range. + + :param threshold: score threshold to validate. + """ + if not 0.0 <= threshold <= 1.0: + raise ValueError( + f"Score threshold must be between 0.0 and 1.0, got: {threshold}" + ) + return threshold + + @staticmethod + def validate_nlp_configuration(config: Dict[str, Any]) -> Dict[str, Any]: + """Validate NLP validation structure. + + :param config: NLP Configuration to validate. + """ + if not isinstance(config, dict): + raise ValueError("NLP validation must be a dictionary") + + required_fields = ["nlp_engine_name", "models"] + missing_fields = [field for field in required_fields if field not in config] + if missing_fields: + raise ValueError( + f"NLP validation missing required fields: {missing_fields}" + ) + + # Validate models structure + if not isinstance(config["models"], list) or not config["models"]: + raise ValueError("Models must be a non-empty list") + + for model in config["models"]: + if not isinstance(model, dict): + raise ValueError("Each model must be a dictionary") + if "lang_code" not in model or "model_name" not in model: + raise ValueError("Each model must have 'lang_code' and 'model_name'") + + return config + + @staticmethod + def validate_recognizer_registry_configuration( + config: Dict[str, Any], + ) -> Dict[str, Any]: + """Validate recognizer registry validation using Pydantic models.""" + try: + from .yaml_recognizer_models import RecognizerRegistryConfig + + # Use Pydantic model for validation + validated_config = RecognizerRegistryConfig(**config) + return validated_config.model_dump() + except ValidationError as e: + raise ValueError(f"Invalid recognizer registry validation: {e}") + except ImportError: + # Fallback to basic validation if models not available + return ConfigurationValidator._validate_recognizer_registry_basic(config) + + @staticmethod + def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any]: + """Validate recognizer registry config.""" + if not isinstance(config, dict): + raise ValueError("Recognizer registry validation must be a dictionary") + + # Validate supported languages + if "supported_languages" in config: + ConfigurationValidator.validate_language_codes( + config["supported_languages"] + ) + + # Validate recognizers list + if "recognizers" in config and not isinstance(config["recognizers"], list): + raise ValueError("Recognizers must be a list") + + return config + + @staticmethod + def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: + """Validate analyzer engine validation.""" + if not isinstance(config, dict): + raise ValueError("Analyzer validation must be a dictionary") + + # Validate supported languages if present + if "supported_languages" in config: + ConfigurationValidator.validate_language_codes( + config["supported_languages"] + ) + + # Validate score threshold if present + if "default_score_threshold" in config: + ConfigurationValidator.validate_score_threshold( + config["default_score_threshold"] + ) + + # Validate nested configurations + if "nlp_configuration" in config: + ConfigurationValidator.validate_nlp_configuration( + config["nlp_configuration"] + ) + + if "recognizer_registry" in config: + ConfigurationValidator.validate_recognizer_registry_configuration( + config["recognizer_registry"] + ) + + return config diff --git a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py new file mode 100644 index 0000000000..ab0960fbd7 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py @@ -0,0 +1,490 @@ +"""Pydantic models for YAML recognizer configurations.""" + +from typing import Any, Dict, List, Optional, Union + +import regex as re +from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator + + +class LanguageContextConfig(BaseModel): + """Configuration for language-specific validation with context words. + + :param language: Language code (e.g., 'en', 'es') + :param context: Context words for this language + """ + + language: str = Field(..., description="Language code (e.g., 'en', 'es')") + context: Optional[List[str]] = Field( + default=None, description="Context words for this language" + ) + + @field_validator("language") + @classmethod + def validate_language_code(cls, v: str) -> str: + """Validate language code format.""" + if not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", v): + raise ValueError( + f"Invalid language code format: {v}. Expected format: 'en' or 'en-US'" + ) + return v + + +class BaseRecognizerConfig(BaseModel): + """Base validation for all recognizer configuration types. + + :param name: Name of the recognizer + :param enabled: Whether the recognizer is enabled + :param type: Type of recognizer (predefined/custom) + :param supported_language: Single supported language (legacy) + :param supported_languages: Multiple supported languages with optional context. + Passing multiple languages will result in multiple actual + recognizers initialized in Presidio. + :param context: context words. Context is best defined + in the language-specific configuration, + as it is language-dependent. If context is defined outside, + it should only work if the user passed one language + (either in supported_language or have a supported_languages with length 1). + :param supported_entity: Supported entity for this recognizer (legacy) + :param supported_entities: List of supported entities for this recognizer. + """ + + name: str = Field(..., description="Name of the recognizer") + enabled: bool = Field(default=True, description="Whether the recognizer is enabled") + type: Optional[str] = Field( + default="predefined", description="Type of recognizer (predefined/custom)" + ) + supported_language: Optional[str] = Field( + default=None, description="The language this recognizer supports" + ) + supported_languages: Optional[Union[List[str], List[LanguageContextConfig]]] = ( + Field( + default=None, + description="Multiple supported languages with optional context", + ) + ) + context: Optional[List[str]] = Field( + default=None, description="Global context words" + ) + supported_entity: Optional[str] = Field( + default=None, description="Supported entity for this recognizer" + ) + supported_entities: Optional[List[str]] = Field( + default=None, description="List of supported entities " "for this recognizer" + ) + + @field_validator("supported_language") + @classmethod + def validate_single_language(cls, v: Optional[str]) -> Optional[str]: + """Validate single language code format.""" + if v and not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", v): + raise ValueError(f"Invalid language code format: {v}") + return v + + @model_validator(mode="after") + def validate_language_configuration(self): + """Ensure proper language validation.""" + if self.supported_language and self.supported_languages: + raise ValueError( + "Cannot specify both 'supported_language' and 'supported_languages'" + ) + + if self.supported_language: + self.supported_languages = [self.supported_language] + self.supported_language = None + + # If neither is specified, this is allowed for + # predefined recognizers (defaults will be used) + return self + + @model_validator(mode="after") + def validate_entity_configuration(self): + """Ensure proper entity validation.""" + if self.supported_entity and self.supported_entities: + raise ValueError( + "Cannot specify both 'supported_entity' and 'supported_entities'" + ) + + if self.supported_entity: + self.supported_entities = [self.supported_entity] + self.supported_entity = None + + # If neither is specified, this is allowed for + # predefined recognizers (defaults will be used) + return self + + @model_validator(mode="after") + def validate_context_configuration(self): + """Validate context configuration according to language settings.""" + # Check if global context is defined + if self.context: + # Global context is only valid if we have exactly one language + if self.supported_languages and len(self.supported_languages) > 1: + raise ValueError( + "Global context can only be used with a single language. " + "For multiple languages, define context in " + "language-specific configurations." + "Example: " + " supported_languages: " + " - language: en " + " context: [credit, card, visa, mastercard] " + " - language: es " + " context: [tarjeta, credito, visa, mastercard] " + ) + return self + + +class PredefinedRecognizerConfig(BaseRecognizerConfig): + """Configuration for predefined recognizers.""" + + type: str = Field(default="predefined", description="Type of recognizer") + + @model_validator(mode="after") + def validate_predefined_recognizer_exists(self): + """Validate that the predefined recognizer class actually exists.""" + try: + # Lazy import to avoid circular dependency + from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( + RecognizerListLoader, + ) + + RecognizerListLoader.get_existing_recognizer_cls(self.name) + except (ImportError, ModuleNotFoundError): + return self + except ValueError as e: + available_recognizers = [ + cls.__name__ + for cls in RecognizerListLoader.get_all_existing_recognizers() + ] + raise ValueError( + f"Predefined recognizer '{self.name}' not found. " + f"Available predefined recognizers: " + f"{', '.join(sorted(available_recognizers))}" + ) from e + return self + + +class CustomRecognizerConfig(BaseRecognizerConfig): + """Configuration for custom pattern-based recognizers.""" + + type: str = Field(default="custom", description="Type of recognizer") + supported_entity: str = Field( + ..., description="Entity type this recognizer detects" + ) + patterns: Optional[List[Dict[str, Any]]] = Field( + default=None, description="List of patterns" + ) + context: Optional[List[str]] = Field( + default=None, description="Global context words" + ) + deny_list: Optional[List[str]] = Field( + default=None, description="Words to deny/exclude" + ) + deny_list_score: Optional[float] = Field( + default=0.0, ge=0.0, le=1.0, description="Deny list score" + ) + + # Language validation (legacy and new formats) + supported_language: Optional[str] = Field( + default=None, description="Single supported language (legacy)" + ) + supported_languages: Optional[Union[List[str], List[LanguageContextConfig]]] = ( + Field( + default=None, + description="Multiple supported languages with optional context", + ) + ) + + model_config = ConfigDict(arbitrary_types_allowed=True) + + @field_validator("patterns") + @classmethod + def validate_patterns(cls, patterns: Optional[List[Dict]]) -> Optional[str]: + """Validate single language code format.""" + if patterns and not isinstance(patterns, list): + raise ValueError(f"Patterns should be a list: {patterns}") + + for pattern in patterns: + if not isinstance(pattern, dict): + raise ValueError(f"Pattern should be a dict: {pattern}") + if "name" not in pattern: + raise ValueError(f"Pattern should contain a name field: {pattern}") + if "regex" not in pattern: + raise ValueError(f"Pattern should contain a regex field: {pattern}") + if "score" not in pattern: + raise ValueError(f"Pattern should contain a score field: {pattern}") + if not isinstance(pattern["score"], float): + raise ValueError(f"Pattern score should be a float: {pattern}") + if pattern["score"] < 0 or pattern["score"] > 1: + raise ValueError(f"Pattern score should be between 0 and 1: {pattern}") + return patterns + + @field_validator("supported_language") + @classmethod + def validate_single_language(cls, v: Optional[str]) -> Optional[str]: + """Validate single language code format.""" + if v and not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", v): + raise ValueError(f"Invalid language code format: {v}") + return v + + @model_validator(mode="after") + def validate_configuration(self): + """Ensure configuration is valid.""" + # Check if user accidentally marked a predefined recognizer as custom + try: + # Lazy import to avoid circular dependency + from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( + RecognizerListLoader, + ) + + try: + RecognizerListLoader.get_existing_recognizer_cls(self.name) + raise ValueError( + f"Recognizer '{self.name}' is a predefined recognizer " + f"but is marked as 'custom'. " + f"Either use type: 'predefined' or choose a different " + f"name for your custom recognizer." + ) + except ValueError as e: + if "was not found" not in str(e): + raise + except (ImportError, ModuleNotFoundError): + pass + + # Validate patterns or deny_list + if not self.patterns and not self.deny_list: + raise ValueError( + "Custom recognizer must have at least one " + "of 'patterns' or 'deny_list'" + ) + return self + + +class RecognizerRegistryConfig(BaseModel): + """Complete validation for the recognizer registry.""" + + supported_languages: Optional[List[str]] = Field( + default=None, description="List of supported languages" + ) + global_regex_flags: int = Field(default=26, description="Global regex flags") + recognizers: List[ + Union[PredefinedRecognizerConfig, CustomRecognizerConfig, str] + ] = Field(default_factory=list, description="List of recognizer configurations") + + @field_validator("supported_languages") + @classmethod + def validate_language_codes(cls, v: List[str]) -> List[str]: + """Validate language codes format.""" + + if v is None or len(v) == 0: + # Allow empty languages, which will be filled later + # by the languages of the recognizers. + return v + + for lang in v: + if not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", lang): + raise ValueError(f"Invalid language code format: {lang}") + return v + + @field_validator("recognizers", mode="before") + @classmethod + def parse_recognizers(cls, v): + """Parse recognizers from various input formats without duplication.""" + if not isinstance(v, list): + raise ValueError("Recognizers must be a list") + + parsed_recognizers = [] + for recognizer in v: + if isinstance(recognizer, str): + # Simple string recognizer name - treat as predefined + parsed_recognizers.append(recognizer) + continue + + if isinstance(recognizer, dict): + recognizer_type = recognizer.get("type") + + # Validate conflicting custom-only fields if explicitly predefined + if recognizer_type == "predefined" and ( + "patterns" in recognizer or "deny_list" in recognizer + ): + raise ValueError( + f"Recognizer '{recognizer.get('name')}' is marked " + f"as 'predefined' but contains 'patterns' or 'deny_list' " + f"which are only valid for custom recognizers. " + f"Either use type: 'custom' or remove these fields." + ) + + # Auto-detect type if not provided + if not recognizer_type: + if "patterns" in recognizer or "deny_list" in recognizer: + recognizer_type = "custom" + recognizer_name = recognizer.get("name") + if recognizer_name: + cls.__check_if_predefined(recognizer_name) + else: + recognizer_type = "predefined" + recognizer["type"] = recognizer_type + + # Final append based on resolved type (only once) + if recognizer_type == "predefined": + parsed_recognizers.append(PredefinedRecognizerConfig(**recognizer)) + elif recognizer_type == "custom": + parsed_recognizers.append(CustomRecognizerConfig(**recognizer)) + else: + raise ValueError( + f"Invalid recognizer type: {recognizer_type}. " + f"Must be 'predefined' or 'custom'." + ) + continue + + # Fallback: unrecognized structure, keep as-is + parsed_recognizers.append(recognizer) + + return parsed_recognizers + + @classmethod + def __check_if_predefined(cls, recognizer_name: Any | None): + try: + from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( + RecognizerListLoader, + ) + + try: + RecognizerListLoader.get_existing_recognizer_cls(recognizer_name) + raise ValueError( + f"Recognizer '{recognizer_name}' is a recognizer predefined in " + f"code but has 'patterns' or 'deny_list' defined. " + f"Either use type: 'predefined' " + f"or choose a different name for your custom recognizer." + ) + except ValueError as e: + if "was not found" not in str(e): + raise + except ImportError: + pass + + @model_validator(mode="after") + def validate_language_presence(self): + """Ensure custom recognizers define languages if no global languages are set.""" + if self.recognizers and ( + not self.supported_languages or len(self.supported_languages) == 0 + ): + any_language_defined = False + custom_without_language_present = False + for r in self.recognizers: + if isinstance(r, (PredefinedRecognizerConfig, CustomRecognizerConfig)): + # Track if any language is defined + if (r.supported_language and r.supported_language.strip()) or ( + r.supported_languages and len(r.supported_languages) > 0 + ): + any_language_defined = True + # Track custom recognizers lacking language info + if ( + isinstance(r, CustomRecognizerConfig) + and not r.supported_language + and not r.supported_languages + ): + custom_without_language_present = True + + if custom_without_language_present and not any_language_defined: + raise ValueError( + "Language configuration missing for custom recognizer(s): " + "provide 'supported_languages' at registry level " + "or specify languages for each custom recognizer." + ) + return self + + +class YamlRecognizerProcessor: + """Utility class to process YAML recognizer configurations.""" + + @staticmethod + def expand_recognizer_configs( + recognizer_config: Union[ + PredefinedRecognizerConfig, CustomRecognizerConfig, str + ], + registry_supported_languages: List[str], + ) -> List[Dict[str, Any]]: + """ + Expand a recognizer validation into multiple recognizer instances. + + This handles the logic where one YAML recognizer + can create multiple actual recognizers + based on language configurations. + """ + if isinstance(recognizer_config, str): + # Simple string name - create for all registry languages + return [ + { + "name": recognizer_config, + "supported_language": lang, + "type": "predefined", + } + for lang in registry_supported_languages + ] + + expanded_configs = [] + + # Handle language expansion + if recognizer_config.supported_language: + # Single language (legacy format) + config_dict = recognizer_config.model_dump() + config_dict["supported_language"] = recognizer_config.supported_language + if "supported_languages" in config_dict: + del config_dict["supported_languages"] + expanded_configs.append(config_dict) + + elif recognizer_config.supported_languages: + # Multiple languages + for lang_config in recognizer_config.supported_languages: + config_dict = recognizer_config.model_dump() + + config_dict["supported_language"] = lang_config + config_dict["context"] = recognizer_config.context # Use global context + + if "supported_languages" in config_dict: + del config_dict["supported_languages"] + expanded_configs.append(config_dict) + else: + # No language specified - use the default recognizer language + # (for predefined only) + # For custom, raise an exception. + if isinstance(recognizer_config, CustomRecognizerConfig): + # Custom recognizers must specify languages + raise ValueError( + f"Custom recognizer '{recognizer_config.name}' " + f"must specify supported languages" + ) + else: + config_dict = recognizer_config.model_dump(exclude_unset=True) + config_dict["type"] = recognizer_config.type + expanded_configs.append(config_dict) + + return expanded_configs + + @staticmethod + def create_pattern_recognizers_from_config( + custom_config: CustomRecognizerConfig, registry_supported_languages: List[str] + ) -> List[Dict[str, Any]]: + """Create PatternRecognizer configurations from CustomRecognizerConfig.""" + expanded_configs = YamlRecognizerProcessor.expand_recognizer_configs( + custom_config, registry_supported_languages + ) + + pattern_recognizer_configs = [] + for config in expanded_configs: + # Convert patterns to the format expected by PatternRecognizer.from_dict() + if "patterns" in config: + config["patterns"] = [ + pattern.model_dump() if hasattr(pattern, "model_dump") else pattern + for pattern in config["patterns"] + ] + + # Ensure supported_entities is a list with the single entity + if "supported_entity" in config: + if config["supported_entity"] is not None: + config["supported_entities"] = [config["supported_entity"]] + del config["supported_entity"] + + pattern_recognizer_configs.append(config) + + return pattern_recognizer_configs diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index 849b895985..74fabe5930 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -1,6 +1,7 @@ import logging -from dataclasses import dataclass -from typing import Collection, Dict, Optional, Type +from typing import Collection, Dict, Optional + +from pydantic import BaseModel, ConfigDict, Field, field_validator logger = logging.getLogger("presidio-analyzer") @@ -29,9 +30,8 @@ LOW_SCORE_ENTITY_NAMES = set() -@dataclass -class NerModelConfiguration: - """NER model configuration. +class NerModelConfiguration(BaseModel): + """NER model configuration using Pydantic validation. :param labels_to_ignore: List of labels to not return predictions for. :param aggregation_strategy: @@ -48,73 +48,81 @@ class NerModelConfiguration: Multiplier to the score given for low_score_entity_names. """ # noqa E501 - labels_to_ignore: Optional[Collection[str]] = None - aggregation_strategy: Optional[str] = "max" - stride: Optional[int] = 14 - alignment_mode: Optional[str] = "expand" - default_score: Optional[float] = 0.85 - model_to_presidio_entity_mapping: Optional[Dict[str, str]] = None - low_score_entity_names: Optional[Collection] = None - low_confidence_score_multiplier: Optional[float] = 0.4 - - def __post_init__(self): - """Validate the configuration and set defaults.""" - if self.model_to_presidio_entity_mapping is None: - logger.warning( - "model_to_presidio_entity_mapping is missing from configuration, " - "using default" - ) - self.model_to_presidio_entity_mapping = MODEL_TO_PRESIDIO_ENTITY_MAPPING - if self.low_score_entity_names is None: - logger.warning( - "low_score_entity_names is missing from configuration, " "using default" - ) - self.low_score_entity_names = LOW_SCORE_ENTITY_NAMES - if self.labels_to_ignore is None: + labels_to_ignore: Optional[Collection[str]] = Field( + default_factory=list, description="List of labels to ignore" + ) + aggregation_strategy: Optional[str] = Field( + default="max", description="Token classification aggregation strategy" + ) + stride: Optional[int] = Field( + default=14, description="Stride for token classification" + ) + alignment_mode: Optional[str] = Field( + default="expand", description="Alignment mode for spaCy char spans" + ) + default_score: Optional[float] = Field( + default=0.85, ge=0.0, le=1.0, description="Default confidence score" + ) + model_to_presidio_entity_mapping: Optional[Dict[str, str]] = Field( + default_factory=lambda: MODEL_TO_PRESIDIO_ENTITY_MAPPING.copy(), + description="Mapping between model entities and Presidio entities", + ) + low_score_entity_names: Optional[Collection[str]] = Field( + default_factory=lambda: LOW_SCORE_ENTITY_NAMES.copy(), + description="Entity names with likely low detection accuracy", + ) + low_confidence_score_multiplier: Optional[float] = Field( + default=0.4, ge=0.0, description="Score multiplier for low confidence entities" + ) + + model_config = ConfigDict(arbitrary_types_allowed=True) + + @field_validator("aggregation_strategy") + @classmethod + def validate_aggregation_strategy(cls, v: str) -> str: + """Validate aggregation strategy.""" + valid_strategies = ["simple", "first", "average", "max"] + if v not in valid_strategies: logger.warning( - "labels_to_ignore is missing from configuration, " "using default" + f"Aggregation strategy '{v}' might not be supported. " + f"Valid options: {valid_strategies}" ) - self.labels_to_ignore = {} + return v + @field_validator("stride") @classmethod - def _validate_input(cls, ner_model_configuration_dict: Dict) -> None: - key_to_type = { - "labels_to_ignore": Collection, - "aggregation_strategy": str, - "alignment_mode": str, - "model_to_presidio_entity_mapping": dict, - "low_confidence_score_multiplier": float, - "low_score_entity_names": Collection, - "stride": int, - } - - for key, field_type in key_to_type.items(): - cls.__validate_type( - config_dict=ner_model_configuration_dict, key=key, field_type=field_type + def validate_stride(cls, v: Optional[int]) -> int: + """Validate stride and handle None values.""" + if v is None: + # Get the default value from the field definition + return cls.model_fields["stride"].default + return v + + @field_validator("alignment_mode") + @classmethod + def validate_alignment_mode(cls, v: Optional[str]) -> str: + """Validate alignment mode and handle None values.""" + if v is None: + # Get the default value from the field definition + return cls.model_fields["alignment_mode"].default + valid_modes = ["strict", "contract", "expand"] + if v not in valid_modes: + logger.warning( + f"Alignment mode '{v}' might not be supported. " + f"Valid options: {valid_modes}" ) - - @staticmethod - def __validate_type(config_dict: Dict, key: str, field_type: Type) -> None: - if key in config_dict: - if not isinstance(config_dict[key], field_type): - raise ValueError(f"{key} must be of type {field_type}") + return v @classmethod - def from_dict(cls, nlp_engine_configuration: Dict) -> "NerModelConfiguration": - """Load NLP engine configuration from dict. - - :param nlp_engine_configuration: Dict with the configuration to load. + def from_dict(cls, ner_model_configuration_dict: Dict) -> "NerModelConfiguration": """ - cls._validate_input(nlp_engine_configuration) + Create NerModelConfiguration from a dictionary with Pydantic validation. - return cls(**nlp_engine_configuration) + :param ner_model_configuration_dict: Dictionary containing configuration + :return: NerModelConfiguration instance + """ + return cls(**ner_model_configuration_dict) def to_dict(self) -> Dict: - """Return the configuration as a dict.""" - return self.__dict__ - - def __str__(self) -> str: # noqa D105 - return str(self.to_dict()) - - def __repr__(self) -> str: # noqa D105 - return str(self) + """Convert to dictionary representation.""" + return self.model_dump(exclude_none=True) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 921c87190d..997265105a 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -4,6 +4,7 @@ import yaml +from presidio_analyzer.input_validation import ConfigurationValidator from presidio_analyzer.nlp_engine import ( NerModelConfiguration, NlpEngine, @@ -59,7 +60,7 @@ def __init__( self._validate_nlp_configuration(nlp_configuration) self.nlp_configuration = nlp_configuration - if conf_file or conf_file == '': + if conf_file or conf_file == "": self._validate_conf_file_path(conf_file) self.nlp_configuration = self._read_nlp_conf(conf_file) @@ -79,7 +80,7 @@ def _validate_nlp_engines(nlp_engines: Tuple) -> None: if not isinstance(nlp_engines, tuple): raise ValueError(f"nlp_engines must be a tuple, got {type(nlp_engines)}") - required_attributes = ['engine_name', 'is_available'] + required_attributes = ["engine_name", "is_available"] for engine_class in nlp_engines: missing_attributes = [] @@ -117,43 +118,25 @@ def _validate_nlp_configuration(nlp_configuration: Dict) -> None: :param nlp_configuration: The configuration dictionary to validate """ - if not isinstance(nlp_configuration, Dict): - raise ValueError(f"nlp_configuration must be a dictionary, " - f"got {type(nlp_configuration)}") - - required_fields = ['nlp_engine_name', 'models'] - missing_fields = [] - - for field in required_fields: - if field not in nlp_configuration.keys(): - missing_fields.append(field) - - if missing_fields: - raise ValueError( - f"nlp_configuration is missing required fields: {missing_fields}. " - f"Required fields are: {required_fields}" - ) + try: + ConfigurationValidator.validate_nlp_configuration(nlp_configuration) + except ValueError as e: + raise ValueError(f"Invalid NLP configuration: {e}") @staticmethod def _validate_conf_file_path(conf_file: Union[Path, str]) -> None: """ - Validate the conf file path. + Validate the conf file path using enhanced validation. :param conf_file: The conf file path to validate """ - - if conf_file == '': + if conf_file == "": raise ValueError("conf_file is empty") - if not isinstance(conf_file, (Path, str)): - raise ValueError(f"conf_file must be a string or Path, " - f"got {type(conf_file)}") - - if not Path(conf_file).exists(): - raise ValueError(f"conf_file {conf_file} does not exist") - - if Path(conf_file).is_dir(): - raise ValueError(f"conf_file {conf_file} is a directory, not a file") + try: + ConfigurationValidator.validate_file_path(conf_file) + except ValueError as e: + raise ValueError(str(e)) def create_engine(self) -> NlpEngine: """Create an NLP engine instance.""" diff --git a/presidio-analyzer/presidio_analyzer/pattern.py b/presidio-analyzer/presidio_analyzer/pattern.py index f37f8052d2..dedf61da12 100644 --- a/presidio-analyzer/presidio_analyzer/pattern.py +++ b/presidio-analyzer/presidio_analyzer/pattern.py @@ -1,6 +1,8 @@ import json from typing import Dict +import regex as re + class Pattern: """ @@ -18,6 +20,24 @@ def __init__(self, name: str, regex: str, score: float): self.compiled_regex = None self.compiled_with_flags = None + self.__validate_regex(self.regex) + self.__validate_score(self.score) + + @staticmethod + def __validate_regex(v: str) -> None: + """Validate that the regex pattern is valid.""" + try: + re.compile(v) + except re.error as e: + raise ValueError(f"Invalid regex pattern: {e}") + + @staticmethod + def __validate_score(score: float) -> None: + if score < 0 or score > 1: + raise ValueError( + f"Invalid score: {score}. " "Score should be between 0 and 1" + ) + def to_dict(self) -> Dict: """ Turn this instance into a dictionary. diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py index 358ea9a622..c617dd7e9f 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py @@ -56,7 +56,8 @@ def _split_recognizers( predefined = [ recognizer_conf for recognizer_conf in recognizers_conf - if ("type" in recognizer_conf and recognizer_conf["type"] == "predefined") + if isinstance(recognizer_conf, dict) + and ("type" in recognizer_conf and recognizer_conf["type"] == "predefined") ] custom = [ recognizer_conf @@ -182,7 +183,7 @@ def get_all_existing_recognizers( ) @staticmethod - def _get_existing_recognizer_cls(recognizer_name: str) -> Type[EntityRecognizer]: + def get_existing_recognizer_cls(recognizer_name: str) -> Type[EntityRecognizer]: """ Get the recognizer class by name. @@ -215,32 +216,37 @@ def get( """ recognizer_instances = [] predefined, custom = RecognizerListLoader._split_recognizers(recognizers) + predefined_to_exclude = {"enabled", "type", "supported_languages", "name"} + # For custom recognizers we keep 'supported_languages' + # so we can create per-language + # instances with their specific context values. + custom_to_exclude = {"enabled", "type"} for recognizer_conf in predefined: for language_conf in RecognizerListLoader._get_recognizer_languages( recognizer_conf=recognizer_conf, supported_languages=supported_languages ): if RecognizerListLoader.is_recognizer_enabled(recognizer_conf): - copied_recognizer_conf = { - k: v - for k, v in RecognizerListLoader._get_recognizer_items( - recognizer_conf=recognizer_conf - ) - if k not in ["enabled", "type", "supported_languages", "name"] - } - kwargs = {**copied_recognizer_conf, **language_conf} + new_conf = RecognizerListLoader._filter_recognizer_fields( + recognizer_conf, to_exclude=predefined_to_exclude + ) + + kwargs = {**new_conf, **language_conf} recognizer_name = RecognizerListLoader.get_recognizer_name( recognizer_conf=recognizer_conf ) - recognizer_cls = RecognizerListLoader._get_existing_recognizer_cls( + recognizer_cls = RecognizerListLoader.get_existing_recognizer_cls( recognizer_name=recognizer_name ) recognizer_instances.append(recognizer_cls(**kwargs)) for recognizer_conf in custom: if RecognizerListLoader.is_recognizer_enabled(recognizer_conf): + new_conf = RecognizerListLoader._filter_recognizer_fields( + recognizer_conf, to_exclude=custom_to_exclude + ) recognizer_instances.extend( RecognizerListLoader._create_custom_recognizers( - recognizer_conf=recognizer_conf, + recognizer_conf=new_conf, supported_languages=supported_languages, ) ) @@ -259,6 +265,19 @@ def get( return recognizer_instances + @staticmethod + def _filter_recognizer_fields( + recognizer_conf: Dict[str, Any], to_exclude: Set[str] + ) -> Dict[str, Any]: + copied_recognizer_conf = { + k: v + for k, v in RecognizerListLoader._get_recognizer_items( + recognizer_conf=recognizer_conf + ) + if k not in to_exclude + } + return copied_recognizer_conf + class RecognizerConfigurationLoader: """A utility class that initializes recognizer registry configuration.""" diff --git a/presidio-analyzer/pyproject.toml b/presidio-analyzer/pyproject.toml index f31a1978fa..d0ce1a809b 100644 --- a/presidio-analyzer/pyproject.toml +++ b/presidio-analyzer/pyproject.toml @@ -27,7 +27,8 @@ dependencies = [ "regex", "tldextract", "pyyaml", - "phonenumbers (>=8.12,<10.0.0)" + "phonenumbers (>=8.12,<10.0.0)", + "pydantic (>=2.0.0,<3.0.0)" ] [project.optional-dependencies] diff --git a/presidio-analyzer/test-output.xml b/presidio-analyzer/test-output.xml new file mode 100644 index 0000000000..168e5c7e4f --- /dev/null +++ b/presidio-analyzer/test-output.xml @@ -0,0 +1,833 @@ +/Users/omri.mendels/Library/Caches/pypoetry/virtualenvs/presidio-evaluator-nCKHFi6i-py3.10/bin/pytest \ No newline at end of file diff --git a/presidio-analyzer/tests/conf/custom_recognizer_yaml.yaml b/presidio-analyzer/tests/conf/custom_recognizer_yaml.yaml index 36adf864a5..d75abe5359 100644 --- a/presidio-analyzer/tests/conf/custom_recognizer_yaml.yaml +++ b/presidio-analyzer/tests/conf/custom_recognizer_yaml.yaml @@ -13,6 +13,7 @@ recognizer_registry: supported_entity: "ZIP" - name: "SpacyRecognizer" enabled: false + type: predefined supported_languages: - en diff --git a/presidio-analyzer/tests/test_configuration_validator.py b/presidio-analyzer/tests/test_configuration_validator.py new file mode 100644 index 0000000000..36d74bc25e --- /dev/null +++ b/presidio-analyzer/tests/test_configuration_validator.py @@ -0,0 +1,85 @@ +"""Tests for the Pydantic-based validation validation system using existing adapted classes.""" +import pytest + +from presidio_analyzer.input_validation import ConfigurationValidator + + +def test_configuration_validator_language_codes_valid(): + """Test ConfigurationValidator accepts valid language codes.""" + valid_languages = ["en", "es", "fr", "en-US", "es-ES"] + validated = ConfigurationValidator.validate_language_codes(valid_languages) + assert validated == valid_languages + +def test_configuration_validator_language_codes_invalid(): + """Test ConfigurationValidator rejects invalid language codes.""" + invalid_languages = ["invalid_lang"] + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_language_codes(invalid_languages) + + assert "Invalid language code format" in str(exc_info.value) + +def test_configuration_validator_nlp_config_valid(): + """Test ConfigurationValidator accepts valid NLP validation.""" + valid_config = { + "nlp_engine_name": "spacy", + "models": [ + {"lang_code": "en", "model_name": "en_core_web_lg"} + ] + } + + validated = ConfigurationValidator.validate_nlp_configuration(valid_config) + assert validated == valid_config + +def test_configuration_validator_nlp_config_missing_fields(): + """Test ConfigurationValidator rejects NLP config with missing required fields.""" + invalid_config = { + "nlp_engine_name": "spacy" + # Missing "models" field + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_nlp_configuration(invalid_config) + + assert "missing required fields" in str(exc_info.value) + +def test_configuration_validator_analyzer_config_valid(): + """Test ConfigurationValidator accepts valid analyzer validation.""" + valid_config = { + "supported_languages": ["en", "es"], + "default_score_threshold": 0.5, + "nlp_configuration": { + "nlp_engine_name": "spacy", + "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}] + } + } + + validated = ConfigurationValidator.validate_analyzer_configuration(valid_config) + assert validated == valid_config + +def test_configuration_validator_analyzer_config_invalid_threshold(): + """Test ConfigurationValidator rejects invalid score threshold.""" + invalid_config = { + "supported_languages": ["en"], + "default_score_threshold": 1.5 # Invalid: > 1.0 + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_analyzer_configuration(invalid_config) + + assert "must be between 0.0 and 1.0" in str(exc_info.value) + +def test_file_path_validation_success(tmp_path): + """Test file path validation with existing file.""" + test_file = tmp_path / "test.yaml" + test_file.write_text("test: content") + + validated_path = ConfigurationValidator.validate_file_path(str(test_file)) + assert validated_path == test_file + +def test_file_path_validation_nonexistent(): + """Test file path validation with non-existent file.""" + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_file_path("/nonexistent/file.yaml") + + assert "does not exist" in str(exc_info.value) diff --git a/presidio-analyzer/tests/test_ner_model_configuration.py b/presidio-analyzer/tests/test_ner_model_configuration.py index 09c1e95cc5..56d12070e3 100644 --- a/presidio-analyzer/tests/test_ner_model_configuration.py +++ b/presidio-analyzer/tests/test_ner_model_configuration.py @@ -2,6 +2,7 @@ import pytest import yaml +from pydantic import ValidationError from presidio_analyzer.nlp_engine import NerModelConfiguration @@ -43,9 +44,7 @@ def test_from_dict_happy_path( [ ("stride", []), ("stride", "X"), - ("stride", None), ("alignment_mode", 5), - ("alignment_mode", None), ("low_confidence_score_multiplier", "X"), ], ) @@ -55,3 +54,54 @@ def test_from_dict_wrong_types(ner_model_configuration_dict, key, value): with pytest.raises(ValueError): NerModelConfiguration.from_dict(new_config) + +@pytest.mark.parametrize( + "key, value", + [ + ("stride", None), + ("alignment_mode", None), + ], +) +def test_from_dict_none_resolves_to_default(ner_model_configuration_dict, key, value): + new_config = ner_model_configuration_dict.copy() + new_config[key] = value + ner_model_configuration = NerModelConfiguration.from_dict(new_config) + assert ner_model_configuration.stride is not None + assert ner_model_configuration.alignment_mode is not None + + +def test_ner_model_configuration_validation_success(): + """Test NerModelConfiguration validates correctly.""" + config_data = { + "aggregation_strategy": "max", + "stride": 16, + "alignment_mode": "expand", + "default_score": 0.9, + "low_confidence_score_multiplier": 0.3 + } + + config = NerModelConfiguration.from_dict(config_data) + assert config.aggregation_strategy == "max" + assert config.stride == 16 + assert config.default_score == 0.9 + assert config.low_confidence_score_multiplier == 0.3 + +def test_ner_model_configuration_invalid_score(): + """Test NerModelConfiguration rejects invalid score values.""" + config_data = { + "default_score": 1.5 # Invalid: > 1.0 + } + + with pytest.raises(ValidationError) as exc_info: + NerModelConfiguration.from_dict(config_data) + + assert "less than or equal to 1" in str(exc_info.value) + +def test_backward_compatibility_ner_config_to_dict(): + """Test that NerModelConfiguration maintains backward compatibility.""" + config = NerModelConfiguration(default_score=0.8, stride=20) + config_dict = config.to_dict() + + assert "default_score" in config_dict + assert config_dict["default_score"] == 0.8 + assert config_dict["stride"] == 20 diff --git a/presidio-analyzer/tests/test_nlp_engine_provider.py b/presidio-analyzer/tests/test_nlp_engine_provider.py index b65f315220..285fe5784a 100644 --- a/presidio-analyzer/tests/test_nlp_engine_provider.py +++ b/presidio-analyzer/tests/test_nlp_engine_provider.py @@ -327,7 +327,7 @@ def test_when_conf_file_is_empty_string_then_fail(): def test_when_conf_file_is_not_string_or_path_then_fail(): conf_file = 1 - with pytest.raises(ValueError): + with pytest.raises(TypeError): NlpEngineProvider(conf_file=conf_file) diff --git a/presidio-analyzer/tests/test_pattern.py b/presidio-analyzer/tests/test_pattern.py index a276b69613..255aa9e7b6 100644 --- a/presidio-analyzer/tests/test_pattern.py +++ b/presidio-analyzer/tests/test_pattern.py @@ -27,3 +27,49 @@ def test_when_use_from_dict_return_pattern(my_pattern, my_pattern_dict): assert expected.name == actual.name assert expected.score == actual.score assert expected.regex == actual.regex + + +def test_pattern_validation_success(): + """Test that Pattern class validates correctly with valid data.""" + pattern_data = { + "name": "US ZIP Code", + "regex": r"\b\d{5}(?:-\d{4})?\b", + "score": 0.85 + } + + pattern = Pattern.from_dict(pattern_data) + assert pattern.name == "US ZIP Code" + assert pattern.score == 0.85 + assert pattern.regex == r"\b\d{5}(?:-\d{4})?\b" + +def test_pattern_validation_invalid_regex(): + """Test that Pattern class rejects invalid regex patterns.""" + pattern_data = { + "name": "Invalid Pattern", + "regex": "[unclosed_bracket", # Invalid regex + "score": 0.5 + } + + with pytest.raises(ValueError) as exc_info: + Pattern.from_dict(pattern_data) + + +def test_pattern_validation_invalid_score_range(): + """Test that Pattern class rejects scores outside [0,1] range.""" + pattern_data = { + "name": "Invalid Score", + "regex": r"\btest\b", + "score": 1.5 # Invalid: > 1.0 + } + + with pytest.raises(ValueError) as exc_info: + Pattern.from_dict(pattern_data) + + +def test_backward_compatibility_pattern_to_dict(): + """Test that Pattern maintains backward compatibility with to_dict method.""" + pattern = Pattern(name="test", regex=r"\btest\b", score=0.5) + pattern_dict = pattern.to_dict() + + expected = {"name": "test", "regex": r"\btest\b", "score": 0.5} + assert pattern_dict == expected diff --git a/presidio-analyzer/tests/test_recognizer_registry_provider.py b/presidio-analyzer/tests/test_recognizer_registry_provider.py index 77ba7d4d96..0493a2968f 100644 --- a/presidio-analyzer/tests/test_recognizer_registry_provider.py +++ b/presidio-analyzer/tests/test_recognizer_registry_provider.py @@ -37,8 +37,8 @@ def test_recognizer_registry_provider_configuration_file(): assert [recognizer.supported_language for recognizer in recognizer_registry.recognizers if recognizer.name == "ItFiscalCodeRecognizer"] == ["en", "es"] assert [recognizer.supported_language for recognizer in recognizer_registry.recognizers if recognizer.name == "CreditCardRecognizer"] == ["en"] assert [recognizer.supported_language for recognizer in recognizer_registry.recognizers if recognizer.name == "ExampleCustomRecognizer"] == ["en", "es"] - snpanish_recognizer = [recognizer for recognizer in recognizer_registry.recognizers if recognizer.name == "ExampleCustomRecognizer" and recognizer.supported_language == "es"][0] - assert snpanish_recognizer.context == ["tarjeta", "credito"] + spanish_recognizer = [recognizer for recognizer in recognizer_registry.recognizers if recognizer.name == "ExampleCustomRecognizer" and recognizer.supported_language == "es"][0] + assert spanish_recognizer.context == ["tarjeta", "credito"] def test_recognizer_registry_provider_configuration_file_load_predefined(mandatory_recognizers): diff --git a/presidio-analyzer/tests/test_yaml_recognizer_models.py b/presidio-analyzer/tests/test_yaml_recognizer_models.py new file mode 100644 index 0000000000..8d12225989 --- /dev/null +++ b/presidio-analyzer/tests/test_yaml_recognizer_models.py @@ -0,0 +1,744 @@ +"""Tests for YAML recognizer configuration models.""" + +import pytest +from pydantic import ValidationError + +from presidio_analyzer.input_validation.yaml_recognizer_models import ( + BaseRecognizerConfig, + CustomRecognizerConfig, + LanguageContextConfig, + PredefinedRecognizerConfig, + RecognizerRegistryConfig, + YamlRecognizerProcessor, +) + + +def test_language_context_config_valid(): + """Test LanguageContextConfig validates correctly.""" + lang_config = LanguageContextConfig( + language="en", + context=["credit", "card"] + ) + assert lang_config.language == "en" + assert lang_config.context == ["credit", "card"] + + +def test_language_context_config_valid_with_region(): + """Test LanguageContextConfig with region code.""" + lang_config = LanguageContextConfig( + language="en-US", + context=["social", "security"] + ) + assert lang_config.language == "en-US" + assert lang_config.context == ["social", "security"] + + +def test_language_context_config_no_context(): + """Test LanguageContextConfig without context.""" + lang_config = LanguageContextConfig(language="es") + assert lang_config.language == "es" + assert lang_config.context is None + + +def test_language_context_config_invalid_language(): + """Test LanguageContextConfig rejects invalid language codes.""" + with pytest.raises(ValidationError) as exc_info: + LanguageContextConfig(language="invalid") + assert "Invalid language code format" in str(exc_info.value) + + +def test_language_context_config_invalid_format(): + """Test various invalid language formats.""" + invalid_languages = ["e", "eng", "EN", "en-us", "en-USA", "123", ""] + + for lang in invalid_languages: + with pytest.raises(ValidationError): + LanguageContextConfig(language=lang) + + +def test_base_recognizer_config_minimal(): + """Test minimal valid configuration.""" + config = BaseRecognizerConfig(name="test_recognizer") + assert config.name == "test_recognizer" + assert config.enabled is True + assert config.type == "predefined" + + +def test_base_recognizer_config_full(): + """Test full configuration with all fields.""" + config = BaseRecognizerConfig( + name="test_recognizer", + enabled=False, + type="custom", + supported_language="en", + context=["test", "context"], + supported_entity="TEST_ENTITY" + ) + assert config.name == "test_recognizer" + assert config.enabled is False + assert config.type == "custom" + assert config.supported_languages == ["en"] + assert config.supported_language is None # Should be normalized + assert config.context == ["test", "context"] + assert config.supported_entities == ["TEST_ENTITY"] + assert config.supported_entity is None # Should be normalized + + +def test_language_normalization_single_to_multiple(): + """Test that supported_language gets normalized to supported_languages.""" + config = BaseRecognizerConfig( + name="test", + supported_language="en" + ) + assert config.supported_languages == ["en"] + assert config.supported_language is None + + +def test_entity_normalization_single_to_multiple(): + """Test that supported_entity gets normalized to supported_entities.""" + config = BaseRecognizerConfig( + name="test", + supported_entity="PERSON" + ) + assert config.supported_entities == ["PERSON"] + assert config.supported_entity is None + + +def test_cannot_specify_both_language_formats(): + """Test that specifying both language formats raises error.""" + with pytest.raises(ValidationError) as exc_info: + BaseRecognizerConfig( + name="test", + supported_language="en", + supported_languages=["es", "fr"] + ) + assert "Cannot specify both 'supported_language' and 'supported_languages'" in str(exc_info.value) + + +def test_cannot_specify_both_entity_formats(): + """Test that specifying both entity formats raises error.""" + with pytest.raises(ValidationError) as exc_info: + BaseRecognizerConfig( + name="test", + supported_entity="PERSON", + supported_entities=["LOCATION", "ORG"] + ) + assert "Cannot specify both 'supported_entity' and 'supported_entities'" in str(exc_info.value) + + +def test_invalid_single_language_format(): + """Test validation of single language format.""" + with pytest.raises(ValidationError): + BaseRecognizerConfig( + name="test", + supported_language="invalid" + ) + + +def test_context_with_multiple_languages_error(): + """Test that global context with multiple languages raises error.""" + with pytest.raises(ValidationError) as exc_info: + BaseRecognizerConfig( + name="test", + supported_languages=["en", "es"], + context=["global", "context"] + ) + assert "Global context can only be used with a single language" in str(exc_info.value) + + +def test_context_with_single_language_valid(): + """Test that global context with single language is valid.""" + config = BaseRecognizerConfig( + name="test", + supported_languages=["en"], + context=["global", "context"] + ) + assert config.context == ["global", "context"] + + +def test_predefined_recognizer_config_defaults(): + """Test predefined recognizer with defaults.""" + config = PredefinedRecognizerConfig(name="CreditCardRecognizer") + assert config.name == "CreditCardRecognizer" + assert config.type == "predefined" + assert config.enabled is True + + +def test_predefined_recognizer_config_with_language(): + """Test predefined recognizer with language specification.""" + config = PredefinedRecognizerConfig( + name="CreditCardRecognizer", + supported_language="en" + ) + assert config.supported_languages == ["en"] + + +def test_custom_recognizer_config_with_patterns(): + """Test custom recognizer with patterns.""" + patterns = [ + { + "name": "test_pattern", + "regex": r"\b\d{4}-\d{4}-\d{4}-\d{4}\b", + "score": 0.8 + } + ] + config = CustomRecognizerConfig( + name="custom_test", + supported_entity="CUSTOM_ENTITY", + patterns=patterns + ) + assert config.name == "custom_test" + assert config.type == "custom" + assert config.supported_entities == ["CUSTOM_ENTITY"] + assert config.patterns == patterns + + +def test_custom_recognizer_config_with_deny_list(): + """Test custom recognizer with deny list only.""" + config = CustomRecognizerConfig( + name="custom_test", + supported_entity="CUSTOM_ENTITY", + deny_list=["exclude", "this"], + deny_list_score=0.1 + ) + assert config.deny_list == ["exclude", "this"] + assert config.deny_list_score == 0.1 + + +def test_custom_recognizer_config_invalid_patterns_not_list(): + """Test that patterns must be a list.""" + with pytest.raises(ValidationError) as exc_info: + CustomRecognizerConfig( + name="test", + supported_entity="TEST", + patterns="not a list" + ) + + +def test_custom_recognizer_config_invalid_pattern_not_dict(): + """Test that each pattern must be a dict.""" + with pytest.raises(ValidationError) as exc_info: + CustomRecognizerConfig( + name="test", + supported_entity="TEST", + patterns=["not a dict"] + ) + + +def test_custom_recognizer_config_pattern_missing_fields(): + """Test that patterns must have required fields.""" + required_fields = ["name", "regex", "score"] + + for field in required_fields: + pattern = {"name": "test", "regex": r"\d+", "score": 0.5} + del pattern[field] + + with pytest.raises(ValidationError) as exc_info: + CustomRecognizerConfig( + name="test", + supported_entity="TEST", + patterns=[pattern] + ) + + +def test_custom_recognizer_config_invalid_score_type(): + """Test that pattern score must be float.""" + pattern = { + "name": "test", + "regex": r"\d+", + "score": "not a float" + } + with pytest.raises(ValidationError) as exc_info: + CustomRecognizerConfig( + name="test", + supported_entity="TEST", + patterns=[pattern] + ) + + +def test_custom_recognizer_config_invalid_score_range(): + """Test that pattern score must be between 0 and 1.""" + invalid_scores = [-0.1, 1.1, 2.0] + + for score in invalid_scores: + pattern = { + "name": "test", + "regex": r"\d+", + "score": score + } + with pytest.raises(ValidationError) as exc_info: + CustomRecognizerConfig( + name="test", + supported_entity="TEST", + patterns=[pattern] + ) + + +def test_custom_recognizer_config_no_patterns_or_deny_list(): + """Test that custom recognizer must have patterns or deny_list.""" + with pytest.raises(ValidationError) as exc_info: + CustomRecognizerConfig( + name="test", + supported_entity="TEST" + ) + + +def test_custom_recognizer_config_invalid_deny_list_score(): + """Test deny_list_score validation.""" + with pytest.raises(ValidationError): + CustomRecognizerConfig( + name="test", + supported_entity="TEST", + deny_list=["test"], + deny_list_score=1.5 # Invalid: > 1.0 + ) + + with pytest.raises(ValidationError): + CustomRecognizerConfig( + name="test", + supported_entity="TEST", + deny_list=["test"], + deny_list_score=-0.1 # Invalid: < 0.0 + ) + + +def test_recognizer_registry_config_defaults(): + """Test registry config with defaults.""" + config = RecognizerRegistryConfig() + assert config.supported_languages is None + assert config.global_regex_flags == 26 + assert config.recognizers == [] + + +def test_recognizer_registry_config_valid_languages(): + """Test registry with valid languages.""" + config = RecognizerRegistryConfig( + supported_languages=["en", "es", "fr-CA"] + ) + assert config.supported_languages == ["en", "es", "fr-CA"] + + +def test_recognizer_registry_config_invalid_language(): + """Test registry with invalid language codes.""" + with pytest.raises(ValidationError): + RecognizerRegistryConfig( + supported_languages=["en", "invalid", "es"] + ) + + +def test_recognizer_registry_config_empty_languages(): + """Test registry with empty languages list.""" + config = RecognizerRegistryConfig(supported_languages=[]) + assert config.supported_languages == [] + + +def test_recognizer_registry_config_string_recognizers(): + """Test registry with string recognizers.""" + config = RecognizerRegistryConfig( + recognizers=["credit_card", "email", "phone_number"] + ) + assert len(config.recognizers) == 3 + assert all(isinstance(r, str) for r in config.recognizers) + + +def test_recognizer_registry_config_mixed_recognizers(): + """Test registry with mixed recognizer types and missing languages should fail.""" + custom_config = { + "name": "custom_test", + "type": "custom", + "supported_entity": "TEST", + "patterns": [{"name": "test", "regex": r"\d+", "score": 0.5}] + } + + with pytest.raises(ValidationError) as exc_info: + RecognizerRegistryConfig( + recognizers=[ + "credit_card", # string predefined + {"name": "UrlRecognizer", "type": "predefined"}, # predefined + custom_config # custom without languages should trigger error + ] + ) + assert "Language configuration missing" in str(exc_info.value) + + +def test_recognizer_registry_config_only_predefined_no_languages(): + """Predefined recognizers without languages should be allowed (use defaults).""" + config = RecognizerRegistryConfig( + recognizers=[ + "credit_card", + {"name": "UrlRecognizer", "type": "predefined"}, + ] + ) + assert len(config.recognizers) == 2 + assert isinstance(config.recognizers[0], str) + assert isinstance(config.recognizers[1], PredefinedRecognizerConfig) + + +def test_recognizer_registry_config_auto_detect_type(): + """Test auto-detection of recognizer type based on patterns and deny_list.""" + # Should be detected as custom due to patterns + custom_with_patterns_config = { + "name": "auto_custom_patterns", + "supported_entity": "TEST", + "supported_language": "en", + "patterns": [{"name": "test", "regex": r"\d+", "score": 0.5}] + } + + # Should be detected as custom due to deny_list + custom_with_deny_list_config = { + "name": "auto_custom_deny", + "supported_entity": "TEST", + "supported_language": "en", + "deny_list": ["exclude_this"] + } + + # Should be detected as predefined (no patterns or deny_list) + predefined_config = { + "name": "UrlRecognizer", + "enabled": True + } + + config = RecognizerRegistryConfig( + supported_languages=["en"], # Add global language to satisfy new validation + recognizers=[custom_with_patterns_config, custom_with_deny_list_config, predefined_config] + ) + + assert isinstance(config.recognizers[0], CustomRecognizerConfig) + assert config.recognizers[0].type == "custom" + assert isinstance(config.recognizers[1], CustomRecognizerConfig) + assert config.recognizers[1].type == "custom" + assert isinstance(config.recognizers[2], PredefinedRecognizerConfig) + assert config.recognizers[2].type == "predefined" + + +def test_expand_predefined_recognizer_single_language(): + """Test expanding predefined recognizer with single language.""" + config = PredefinedRecognizerConfig( + name="EmailRecognizer", + supported_language="en" + ) + + result = YamlRecognizerProcessor.expand_recognizer_configs( + config, ["en", "es"] + ) + + assert len(result) == 1 + assert result[0]["name"] == "EmailRecognizer" + assert result[0]["supported_language"] == "en" + + +def test_expand_predefined_recognizer_multiple_languages(): + """Test expanding predefined recognizer with multiple languages.""" + config = PredefinedRecognizerConfig( + name="PhoneRecognizer", + supported_languages=["en", "es"] + ) + + result = YamlRecognizerProcessor.expand_recognizer_configs( + config, ["en", "es", "fr"] + ) + + assert len(result) == 2 + assert result[0]["supported_language"] == "en" + assert result[1]["supported_language"] == "es" + + +def test_expand_predefined_recognizer_no_language(): + """Test that predefined recognizer with no language creates single config.""" + config = PredefinedRecognizerConfig( + name="ItFiscalCodeRecognizer" + ) + + result = YamlRecognizerProcessor.expand_recognizer_configs( + config, ["en", "es", "it"] + ) + + # Should create only one config, not one per registry language + assert len(result) == 1 + # Should not have supported_language set (let recognizer use its default) + assert "supported_language" not in result[0] + assert "supported_languages" not in result[0] + assert result[0]["name"] == "ItFiscalCodeRecognizer" + assert result[0]["type"] == "predefined" + + +def test_expand_custom_recognizer_no_language_error(): + """Test that custom recognizer without language raises error.""" + config = CustomRecognizerConfig( + name="custom_test", + supported_entity="TEST", + patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] + ) + + with pytest.raises(ValueError) as exc_info: + YamlRecognizerProcessor.expand_recognizer_configs( + config, ["en", "es"] + ) + assert "Custom recognizer 'custom_test' must specify supported languages" in str(exc_info.value) + + +def test_expand_custom_recognizer_with_language(): + """Test expanding custom recognizer with specified language.""" + config = CustomRecognizerConfig( + name="custom_test", + supported_entity="TEST", + supported_language="en", + patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] + ) + + result = YamlRecognizerProcessor.expand_recognizer_configs( + config, ["en", "es"] + ) + + assert len(result) == 1 + assert result[0]["name"] == "custom_test" + assert result[0]["supported_language"] == "en" + + +def test_expand_recognizer_with_global_context(): + """Test that global context is preserved during expansion.""" + config = PredefinedRecognizerConfig( + name="EmailRecognizer", + supported_languages=["en"], + context=["global", "context"] + ) + + result = YamlRecognizerProcessor.expand_recognizer_configs( + config, ["en", "es"] + ) + + assert len(result) == 1 + assert result[0]["context"] == ["global", "context"] + + +def test_create_pattern_recognizers_from_config(): + """Test creating PatternRecognizer configs from CustomRecognizerConfig.""" + patterns = [ + {"name": "test", "regex": r"\d+", "score": 0.5} + ] + config = CustomRecognizerConfig( + name="custom_test", + supported_entity="TEST", + supported_language="en", + patterns=patterns, + deny_list=["exclude"], + deny_list_score=0.1 + ) + + result = YamlRecognizerProcessor.create_pattern_recognizers_from_config( + config, ["en"] + ) + + assert len(result) == 1 + pattern_config = result[0] + assert pattern_config["name"] == "custom_test" + assert pattern_config["supported_entities"] == ["TEST"] + assert "supported_entity" not in pattern_config + assert pattern_config["patterns"] == patterns + assert pattern_config["deny_list"] == ["exclude"] + assert pattern_config["deny_list_score"] == 0.1 + + +def test_expand_language_context_config(): + """Test expanding recognizer with LanguageContextConfig.""" + lang_config = LanguageContextConfig( + language="es", + context=["tarjeta", "credito"] + ) + config = CustomRecognizerConfig( + name="credit_card_es", + supported_entity="CREDIT_CARD", + supported_languages=[lang_config], + patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] + ) + + result = YamlRecognizerProcessor.expand_recognizer_configs( + config, ["en", "es"] + ) + + assert len(result) == 1 + assert result[0]["supported_language"] == lang_config + # Global context should be preserved even with language-specific config + assert result[0]["context"] is None # No global context in this case + + +def test_complete_registry_scenario(): + """Test a complete registry configuration scenario.""" + registry_config = { + "supported_languages": ["en", "es"], + "recognizers": [ + "credit_card", # String recognizer (kept as string) + { + "name": "EmailRecognizer", + "type": "predefined", + "enabled": True + }, + { + "name": "custom_pattern", + "type": "custom", + "supported_entity": "CUSTOM_ID", + "supported_language": "en", + "patterns": [ + { + "name": "id_pattern", + "regex": r"ID-\d{6}", + "score": 0.9 + } + ] + } + ] + } + + config = RecognizerRegistryConfig(**registry_config) + assert len(config.recognizers) == 3 + assert isinstance(config.recognizers[0], str) + assert isinstance(config.recognizers[1], PredefinedRecognizerConfig) + assert isinstance(config.recognizers[2], CustomRecognizerConfig) + + +def test_language_context_integration(): + """Test integration with LanguageContextConfig.""" + lang_configs = [ + LanguageContextConfig(language="en", context=["credit", "card"]), + LanguageContextConfig(language="es", context=["tarjeta", "credito"]) + ] + + config = PredefinedRecognizerConfig( + name="CreditCardRecognizer", + supported_languages=lang_configs + ) + + # Test expansion + result = YamlRecognizerProcessor.expand_recognizer_configs( + config, ["en", "es", "fr"] + ) + + assert len(result) == 2 + assert result[0]["supported_language"] == lang_configs[0] + assert result[1]["supported_language"] == lang_configs[1] + + +def test_error_handling_cascade(): + """Test that validation errors are properly cascaded.""" + # This should fail at the CustomRecognizerConfig level + with pytest.raises(ValidationError) as exc_info: + RecognizerRegistryConfig( + recognizers=[ + { + "name": "invalid_custom", + "type": "custom", + "supported_entity": "TEST", + "patterns": [ + { + "name": "test", + "regex": r"\d+", + "score": 2.0 # Invalid score > 1.0 + } + ] + } + ] + ) + assert "Pattern score should be between 0 and 1" in str(exc_info.value) + + +def test_predefined_recognizer_config_valid_recognizer(): + """Test predefined recognizer with valid recognizer name.""" + # Test with a common recognizer that should exist + config = PredefinedRecognizerConfig(name="CreditCardRecognizer") + assert config.name == "CreditCardRecognizer" + assert config.type == "predefined" + + +def test_predefined_recognizer_config_invalid_recognizer(): + """Test predefined recognizer with invalid recognizer name.""" + with pytest.raises(ValidationError) as exc_info: + PredefinedRecognizerConfig(name="NonExistentRecognizer") + + error_message = str(exc_info.value) + assert "Predefined recognizer 'NonExistentRecognizer' not found" in error_message + assert "Available predefined recognizers:" in error_message + + +def test_predefined_recognizer_config_case_sensitive(): + """Test that recognizer names are case sensitive.""" + with pytest.raises(ValidationError) as exc_info: + PredefinedRecognizerConfig(name="creditcardrecognizer") # lowercase + + error_message = str(exc_info.value) + assert "Predefined recognizer 'creditcardrecognizer' not found" in error_message + + +def test_predefined_recognizer_validation_with_import_error(): + """Test that validation gracefully handles import errors.""" + import sys + from unittest.mock import patch + + with patch.dict('sys.modules', {'presidio_analyzer.recognizer_registry.recognizers_loader_utils': None}): + config = PredefinedRecognizerConfig(name="SomeRecognizer") + assert config.name == "SomeRecognizer" + assert config.type == "predefined" + + +def test_custom_recognizer_config_predefined_name_error(): + """Test that using a predefined recognizer name for custom recognizer raises error.""" + with pytest.raises(ValidationError) as exc_info: + CustomRecognizerConfig( + name="CreditCardRecognizer", # This is a predefined recognizer + type="custom", + supported_entity="CREDIT_CARD", + patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] + ) + + error_message = str(exc_info.value) + assert "is a predefined recognizer but is marked as 'custom'" in error_message + assert "Either use type: 'predefined' or choose a different name" in error_message + + +def test_custom_recognizer_config_unique_name_valid(): + """Test that custom recognizers with unique names are valid.""" + config = CustomRecognizerConfig( + name="MyCustomRecognizer", # This should not exist as predefined + type="custom", + supported_entity="CUSTOM_ENTITY", + patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] + ) + assert config.name == "MyCustomRecognizer" + assert config.type == "custom" + + +def test_custom_recognizer_config_predefined_name_validation_with_import_error(): + """Test that validation gracefully handles import errors for predefined name checking.""" + from unittest.mock import patch + + # Mock the import to raise ImportError + with patch.dict('sys.modules', {'presidio_analyzer.recognizer_registry.recognizers_loader_utils': None}): + # This should not raise an error even if the import fails + config = CustomRecognizerConfig( + name="SomeRecognizer", + type="custom", + supported_entity="TEST", + patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] + ) + assert config.name == "SomeRecognizer" + assert config.type == "custom" + + +def test_custom_recognizer_with_language_no_global_languages(): + """Custom recognizer specifying its own language should pass without global languages.""" + registry_config = { + "recognizers": [ + { + "name": "my_custom_with_lang", + "type": "custom", + "supported_entity": "TEST", + "supported_language": "en", + "patterns": [ + {"name": "p", "regex": r"\d+", "score": 0.5} + ] + } + ] + } + config = RecognizerRegistryConfig(**registry_config) + assert len(config.recognizers) == 1 + assert isinstance(config.recognizers[0], CustomRecognizerConfig) + assert config.recognizers[0].supported_languages == ["en"] From cfc7c1b724cc1db3343a963f5773a7c014bd6356 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 12:49:37 +0200 Subject: [PATCH 02/30] Validation layer for YAML based configuration - cont'd --- .../analyzer_engine_provider.py | 14 +- .../input_validation/__init__.py | 2 - .../input_validation/schemas.py | 6 +- .../yaml_recognizer_models.py | 169 +-- .../nlp_engine/nlp_engine_provider.py | 160 +-- .../presidio_analyzer/pattern_recognizer.py | 17 + .../recognizer_registry_provider.py | 5 + .../recognizers_loader_utils.py | 130 +- presidio-analyzer/test-output.xml | 1274 ++++++++--------- .../conf/missing_global_regex_flags.yaml | 8 + .../tests/conf/missing_recognizers.yaml | 7 + .../tests/conf/test_analyzer_engine.yaml | 2 +- .../tests/test_nlp_engine_provider.py | 22 +- .../test_recognizer_registry_provider.py | 133 +- .../tests/test_yaml_recognizer_models.py | 208 +-- 15 files changed, 1050 insertions(+), 1107 deletions(-) create mode 100644 presidio-analyzer/tests/conf/missing_global_regex_flags.yaml create mode 100644 presidio-analyzer/tests/conf/missing_recognizers.yaml diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py index 2689133a42..6e6baff842 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py @@ -5,7 +5,6 @@ import yaml from presidio_analyzer import AnalyzerEngine, RecognizerRegistry -from presidio_analyzer.input_validation import ConfigurationValidator from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider @@ -60,17 +59,14 @@ def get_configuration( with open(self._get_full_conf_path()) as file: configuration = yaml.safe_load(file) except Exception: - print(f"Failed to parse file {conf_file}, resorting to default") + logger.warning(f"Failed to parse file {conf_file}, resorting to default") with open(self._get_full_conf_path()) as file: configuration = yaml.safe_load(file) - # Validate validation using enhanced validation - try: - ConfigurationValidator.validate_analyzer_configuration(configuration) - logger.debug("Analyzer validation validation passed") - except ValueError as e: - logger.error(f"Invalid analyzer validation: {e}") - raise ValueError(f"Configuration validation failed: {e}") + # Validate configuration using Pydantic-based ConfigurationValidator + from presidio_analyzer.input_validation import ConfigurationValidator + ConfigurationValidator.validate_analyzer_configuration(configuration) + logger.debug("Analyzer configuration validation passed") return configuration diff --git a/presidio-analyzer/presidio_analyzer/input_validation/__init__.py b/presidio-analyzer/presidio_analyzer/input_validation/__init__.py index 1e26821264..28b55fe8e0 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/__init__.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/__init__.py @@ -7,7 +7,6 @@ LanguageContextConfig, PredefinedRecognizerConfig, RecognizerRegistryConfig, - YamlRecognizerProcessor, ) __all__ = [ @@ -17,5 +16,4 @@ "LanguageContextConfig", "PredefinedRecognizerConfig", "RecognizerRegistryConfig", - "YamlRecognizerProcessor", ] diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 437be07d8b..bf59b86374 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -3,7 +3,7 @@ from typing import Any, Dict, List, Union from pydantic import ValidationError - +from .yaml_recognizer_models import RecognizerRegistryConfig class ConfigurationValidator: """Class for validating configurations using Pydantic-enabled classes.""" @@ -81,11 +81,11 @@ def validate_recognizer_registry_configuration( ) -> Dict[str, Any]: """Validate recognizer registry validation using Pydantic models.""" try: - from .yaml_recognizer_models import RecognizerRegistryConfig # Use Pydantic model for validation validated_config = RecognizerRegistryConfig(**config) - return validated_config.model_dump() + # Use model_dump() without exclude_unset to include default values + return validated_config.model_dump(exclude_unset=False) except ValidationError as e: raise ValueError(f"Invalid recognizer registry validation: {e}") except ImportError: diff --git a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py index ab0960fbd7..aad3fbf89b 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py @@ -1,10 +1,13 @@ """Pydantic models for YAML recognizer configurations.""" +import logging from typing import Any, Dict, List, Optional, Union import regex as re from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator +logger = logging.getLogger("presidio-analyzer") + class LanguageContextConfig(BaseModel): """Configuration for language-specific validation with context words. @@ -88,10 +91,6 @@ def validate_language_configuration(self): "Cannot specify both 'supported_language' and 'supported_languages'" ) - if self.supported_language: - self.supported_languages = [self.supported_language] - self.supported_language = None - # If neither is specified, this is allowed for # predefined recognizers (defaults will be used) return self @@ -99,17 +98,17 @@ def validate_language_configuration(self): @model_validator(mode="after") def validate_entity_configuration(self): """Ensure proper entity validation.""" - if self.supported_entity and self.supported_entities: + # Check if user provided both (before we modify them) + user_provided_both = ( + self.supported_entity is not None + and self.supported_entities is not None + ) + + if user_provided_both: raise ValueError( - "Cannot specify both 'supported_entity' and 'supported_entities'" + f"Recognizer {self.name} has both ""'supported_entity' and 'supported_entities' specified." ) - if self.supported_entity: - self.supported_entities = [self.supported_entity] - self.supported_entity = None - - # If neither is specified, this is allowed for - # predefined recognizers (defaults will be used) return self @model_validator(mode="after") @@ -212,9 +211,9 @@ def validate_patterns(cls, patterns: Optional[List[Dict]]) -> Optional[str]: raise ValueError(f"Pattern should contain a regex field: {pattern}") if "score" not in pattern: raise ValueError(f"Pattern should contain a score field: {pattern}") - if not isinstance(pattern["score"], float): + if not isinstance(pattern["score"], (int, float)): raise ValueError(f"Pattern score should be a float: {pattern}") - if pattern["score"] < 0 or pattern["score"] > 1: + if not (0.0 <= pattern["score"] <= 1.0): raise ValueError(f"Pattern score should be between 0 and 1: {pattern}") return patterns @@ -265,30 +264,64 @@ class RecognizerRegistryConfig(BaseModel): supported_languages: Optional[List[str]] = Field( default=None, description="List of supported languages" ) - global_regex_flags: int = Field(default=26, description="Global regex flags") + global_regex_flags: int = Field( + default=26, description="Global regex flags" + ) recognizers: List[ Union[PredefinedRecognizerConfig, CustomRecognizerConfig, str] ] = Field(default_factory=list, description="List of recognizer configurations") @field_validator("supported_languages") @classmethod - def validate_language_codes(cls, v: List[str]) -> List[str]: + def validate_language_codes(cls, v: Optional[List[str]]) -> Optional[List[str]]: """Validate language codes format.""" - if v is None or len(v) == 0: - # Allow empty languages, which will be filled later - # by the languages of the recognizers. - return v + # Allow None or empty list for cases where languages will be inferred + if v is None: + return None + + if len(v) == 0: + return [] for lang in v: if not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", lang): raise ValueError(f"Invalid language code format: {lang}") return v + @model_validator(mode="after") + def validate_languages_for_custom_recognizers(self): + """Validate that custom recognizers have language configuration.""" + # If we have custom recognizers, we need language configuration somewhere + for recognizer in self.recognizers: + if isinstance(recognizer, CustomRecognizerConfig): + # Check if this custom recognizer has its own language config + if not recognizer.supported_language and not recognizer.supported_languages: + # If no language config on recognizer, we need global languages + if not self.supported_languages: + raise ValueError( + f"Language configuration missing for custom recognizer '{recognizer.name}': " + "Either specify 'supported_languages' on the recognizer or provide " + "global 'supported_languages' in the registry configuration." + ) + + return self + + @field_validator("global_regex_flags") + @classmethod + def validate_global_regex_flags(cls, v: int) -> int: + """Validate global_regex_flags and warn if using default.""" + return v + @field_validator("recognizers", mode="before") @classmethod def parse_recognizers(cls, v): """Parse recognizers from various input formats without duplication.""" + if v is None: + raise ValueError( + "Configuration error: 'recognizers' is required. " + "Please provide a list of recognizers in the configuration." + ) + if not isinstance(v, list): raise ValueError("Recognizers must be a list") @@ -392,99 +425,3 @@ def validate_language_presence(self): "or specify languages for each custom recognizer." ) return self - - -class YamlRecognizerProcessor: - """Utility class to process YAML recognizer configurations.""" - - @staticmethod - def expand_recognizer_configs( - recognizer_config: Union[ - PredefinedRecognizerConfig, CustomRecognizerConfig, str - ], - registry_supported_languages: List[str], - ) -> List[Dict[str, Any]]: - """ - Expand a recognizer validation into multiple recognizer instances. - - This handles the logic where one YAML recognizer - can create multiple actual recognizers - based on language configurations. - """ - if isinstance(recognizer_config, str): - # Simple string name - create for all registry languages - return [ - { - "name": recognizer_config, - "supported_language": lang, - "type": "predefined", - } - for lang in registry_supported_languages - ] - - expanded_configs = [] - - # Handle language expansion - if recognizer_config.supported_language: - # Single language (legacy format) - config_dict = recognizer_config.model_dump() - config_dict["supported_language"] = recognizer_config.supported_language - if "supported_languages" in config_dict: - del config_dict["supported_languages"] - expanded_configs.append(config_dict) - - elif recognizer_config.supported_languages: - # Multiple languages - for lang_config in recognizer_config.supported_languages: - config_dict = recognizer_config.model_dump() - - config_dict["supported_language"] = lang_config - config_dict["context"] = recognizer_config.context # Use global context - - if "supported_languages" in config_dict: - del config_dict["supported_languages"] - expanded_configs.append(config_dict) - else: - # No language specified - use the default recognizer language - # (for predefined only) - # For custom, raise an exception. - if isinstance(recognizer_config, CustomRecognizerConfig): - # Custom recognizers must specify languages - raise ValueError( - f"Custom recognizer '{recognizer_config.name}' " - f"must specify supported languages" - ) - else: - config_dict = recognizer_config.model_dump(exclude_unset=True) - config_dict["type"] = recognizer_config.type - expanded_configs.append(config_dict) - - return expanded_configs - - @staticmethod - def create_pattern_recognizers_from_config( - custom_config: CustomRecognizerConfig, registry_supported_languages: List[str] - ) -> List[Dict[str, Any]]: - """Create PatternRecognizer configurations from CustomRecognizerConfig.""" - expanded_configs = YamlRecognizerProcessor.expand_recognizer_configs( - custom_config, registry_supported_languages - ) - - pattern_recognizer_configs = [] - for config in expanded_configs: - # Convert patterns to the format expected by PatternRecognizer.from_dict() - if "patterns" in config: - config["patterns"] = [ - pattern.model_dump() if hasattr(pattern, "model_dump") else pattern - for pattern in config["patterns"] - ] - - # Ensure supported_entities is a list with the single entity - if "supported_entity" in config: - if config["supported_entity"] is not None: - config["supported_entities"] = [config["supported_entity"]] - del config["supported_entity"] - - pattern_recognizer_configs.append(config) - - return pattern_recognizer_configs diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 997265105a..65901ddca3 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -39,11 +39,10 @@ def __init__( conf_file: Optional[Union[Path, str]] = None, nlp_configuration: Optional[Dict] = None, ): - if nlp_engines: - self._validate_nlp_engines(nlp_engines) - else: + if nlp_engines is None: nlp_engines = (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine) + # No legacy validation - just assign the engines self.nlp_engines = { engine.engine_name: engine for engine in nlp_engines if engine.is_available } @@ -57,11 +56,15 @@ def __init__( ) if nlp_configuration: - self._validate_nlp_configuration(nlp_configuration) + # Validate using ConfigurationValidator - let Pydantic errors propagate + ConfigurationValidator.validate_nlp_configuration(nlp_configuration) self.nlp_configuration = nlp_configuration if conf_file or conf_file == "": - self._validate_conf_file_path(conf_file) + if conf_file == "": + raise ValueError("conf_file is empty") + # Validate file path using ConfigurationValidator - let Pydantic errors propagate + ConfigurationValidator.validate_file_path(conf_file) self.nlp_configuration = self._read_nlp_conf(conf_file) if conf_file is None and nlp_configuration is None: @@ -69,138 +72,47 @@ def __init__( logger.debug(f"Reading default conf file from {conf_file}") self.nlp_configuration = self._read_nlp_conf(conf_file) - @staticmethod - def _validate_nlp_engines(nlp_engines: Tuple) -> None: - """ - Validate that all NLP engine classes have the required attributes. - - :param nlp_engines: Tuple of NLP engine classes to validate. - """ - - if not isinstance(nlp_engines, tuple): - raise ValueError(f"nlp_engines must be a tuple, got {type(nlp_engines)}") - - required_attributes = ["engine_name", "is_available"] - - for engine_class in nlp_engines: - missing_attributes = [] - - for attr in required_attributes: - if not hasattr(engine_class, attr): - missing_attributes.append(attr) - - if missing_attributes: - raise ValueError( - f"NLP engine class {engine_class} is missing required " - f"class attributes: {missing_attributes}. " - "All NLP engine classes must have 'engine_name' and 'is_available' " - "as class attributes." - ) - - if not isinstance(engine_class.engine_name, str): - raise ValueError( - f"NLP engine class {engine_class} has invalid " - f"'engine_name' attribute. Expected string, " - f"got {type(engine_class.engine_name)}." - ) - - if not isinstance(engine_class.is_available, bool): - raise ValueError( - f"NLP engine class {engine_class} has invalid " - f"'is_available' attribute. Expected boolean, " - f"got {type(engine_class.is_available)}." - ) - - @staticmethod - def _validate_nlp_configuration(nlp_configuration: Dict) -> None: - """ - Validate the NLP configuration structure and content. - - :param nlp_configuration: The configuration dictionary to validate - """ - try: - ConfigurationValidator.validate_nlp_configuration(nlp_configuration) - except ValueError as e: - raise ValueError(f"Invalid NLP configuration: {e}") - - @staticmethod - def _validate_conf_file_path(conf_file: Union[Path, str]) -> None: - """ - Validate the conf file path using enhanced validation. + # _validate_nlp_engines method removed - all validation is now Pydantic-based - :param conf_file: The conf file path to validate - """ - if conf_file == "": - raise ValueError("conf_file is empty") + def _read_nlp_conf(self, conf_file: Union[Path, str]) -> Dict: + """Read NLP configuration from a YAML file.""" + with open(conf_file) as file: + return yaml.safe_load(file) - try: - ConfigurationValidator.validate_file_path(conf_file) - except ValueError as e: - raise ValueError(str(e)) + def _get_full_conf_path( + self, default_conf_file: Union[Path, str] = "default.yaml" + ) -> Path: + """Return a Path to the default conf file.""" + return Path(Path(__file__).parent, "../conf", default_conf_file) def create_engine(self) -> NlpEngine: """Create an NLP engine instance.""" - if ( - not self.nlp_configuration - or not self.nlp_configuration.get("models") - or not self.nlp_configuration.get("nlp_engine_name") - ): - raise ValueError( - "Illegal nlp configuration. " - "Configuration should include nlp_engine_name and models " - "(list of model_name for each lang_code)." - ) + # Configuration is already validated by Pydantic in __init__ nlp_engine_name = self.nlp_configuration["nlp_engine_name"] if nlp_engine_name not in self.nlp_engines: raise ValueError( f"NLP engine '{nlp_engine_name}' is not available. " "Make sure you have all required packages installed" ) - try: - nlp_engine_class = self.nlp_engines[nlp_engine_name] - nlp_models = self.nlp_configuration["models"] - ner_model_configuration = self.nlp_configuration.get( - "ner_model_configuration" - ) - if ner_model_configuration: - ner_model_configuration = NerModelConfiguration.from_dict( - ner_model_configuration - ) + nlp_engine_class = self.nlp_engines[nlp_engine_name] + nlp_models = self.nlp_configuration["models"] - engine = nlp_engine_class( - models=nlp_models, ner_model_configuration=ner_model_configuration - ) - engine.load() - logger.info( - f"Created NLP engine: {engine.engine_name}. " - f"Loaded models: {list(engine.nlp.keys())}" - ) - return engine - except KeyError: - raise ValueError("Wrong NLP engine configuration") - - @staticmethod - def _read_nlp_conf(conf_file: Union[Path, str]) -> dict: - """ - Read the nlp configuration from a provided yaml file. - - :param conf_file: The conf file path to read - """ - - with open(conf_file) as file: - nlp_configuration = yaml.safe_load(file) - - if "ner_model_configuration" not in nlp_configuration: - logger.warning( - "configuration file is missing 'ner_model_configuration'. Using default" + ner_model_configuration = self.nlp_configuration.get( + "ner_model_configuration" + ) + if ner_model_configuration: + ner_model_configuration = NerModelConfiguration.from_dict( + ner_model_configuration ) - return nlp_configuration + engine = nlp_engine_class( + models=nlp_models, ner_model_configuration=ner_model_configuration + ) + engine.load() + logger.info( + f"Created NLP engine: {engine.engine_name}. " + f"Loaded models: {list(engine.nlp.keys())}" + ) + return engine - @staticmethod - def _get_full_conf_path( - default_conf_file: Union[Path, str] = "default.yaml", - ) -> Path: - """Return a Path to the default conf file.""" - return Path(Path(__file__).parent.parent, "conf", default_conf_file) diff --git a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py index 698293481e..a2c0012b2b 100644 --- a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py @@ -266,9 +266,26 @@ def to_dict(self) -> Dict: @classmethod def from_dict(cls, entity_recognizer_dict: Dict) -> "PatternRecognizer": """Create instance from a serialized dict.""" + # Make a copy to avoid mutating the input + entity_recognizer_dict = entity_recognizer_dict.copy() + patterns = entity_recognizer_dict.get("patterns") if patterns: patterns_list = [Pattern.from_dict(pat) for pat in patterns] entity_recognizer_dict["patterns"] = patterns_list + # Transform supported_entities (plural) to supported_entity (singular) + # PatternRecognizer only accepts supported_entity (singular) + if "supported_entity" in entity_recognizer_dict and "supported_entities" in entity_recognizer_dict: + raise ValueError( + "Both 'supported_entity' and 'supported_entities' are present in the input dictionary. " + "Only one should be provided." + ) + if "supported_entities" in entity_recognizer_dict: + supported_entities = entity_recognizer_dict.pop("supported_entities") + if supported_entities and len(supported_entities) > 0: + # Only set if not already present + if "supported_entity" not in entity_recognizer_dict: + entity_recognizer_dict["supported_entity"] = supported_entities[0] + return cls(**entity_recognizer_dict) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py index 79c0f2c3bf..15012d555e 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py @@ -13,6 +13,7 @@ RecognizerConfigurationLoader, RecognizerListLoader, ) +from presidio_analyzer.input_validation import ConfigurationValidator logger = logging.getLogger("presidio-analyzer") @@ -55,6 +56,9 @@ def __init__( self.configuration = RecognizerConfigurationLoader.get( conf_file=conf_file, registry_configuration=registry_configuration ) + # Validate configuration using Pydantic + + self.configuration = ConfigurationValidator.validate_recognizer_registry_configuration(self.configuration) self.nlp_engine = nlp_engine def create_recognizer_registry(self) -> RecognizerRegistry: @@ -232,3 +236,4 @@ def __remove_disabled_nlp_recognizers( f"Disabled {recognizer.__class__.__name__} " f"recognizer for language {language}." ) + diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py index c617dd7e9f..d4a297263d 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py @@ -84,6 +84,7 @@ def _get_recognizer_languages( if ( isinstance(recognizer_conf, str) or "supported_languages" not in recognizer_conf + or recognizer_conf["supported_languages"] is None ): return [ { @@ -142,9 +143,18 @@ def _create_custom_recognizers( supported_languages: Iterable[str], ) -> List[PatternRecognizer]: """Create a custom recognizer for each language, based on the provided conf.""" - # legacy recognizer - if "supported_language" in recognizer_conf: - return [PatternRecognizer.from_dict(recognizer_conf)] + # legacy recognizer (has supported_language set to a value, not None) + if recognizer_conf.get("supported_language"): + # Remove supported_languages field (plural) if present, as we're using supported_language (singular) + conf_copy = {k: v for k, v in recognizer_conf.items() if k != "supported_languages"} + + # Transform supported_entities -> supported_entity (PatternRecognizer expects singular) + if "supported_entities" in conf_copy: + supported_entities = conf_copy.pop("supported_entities") + if "supported_entity" not in conf_copy and supported_entities: + conf_copy["supported_entity"] = supported_entities[0] + + return [PatternRecognizer.from_dict(conf_copy)] recognizers = [] @@ -156,6 +166,13 @@ def _create_custom_recognizers( for k, v in recognizer_conf.items() if k not in ["enabled", "type", "supported_languages"] } + + # Transform supported_entities -> supported_entity (PatternRecognizer expects singular) + if "supported_entities" in copied_recognizer: + supported_entities = copied_recognizer.pop("supported_entities") + if "supported_entity" not in copied_recognizer and supported_entities: + copied_recognizer["supported_entity"] = supported_entities[0] + kwargs = {**copied_recognizer, **supported_language} recognizers.append(PatternRecognizer.from_dict(kwargs)) @@ -203,6 +220,58 @@ def get_existing_recognizer_cls(recognizer_name: str) -> Type[EntityRecognizer]: f"list of recognizers inheriting the EntityRecognizer class" ) + @staticmethod + def _is_pattern_recognizer(recognizer_cls: Type[EntityRecognizer]) -> bool: + """ + Check if a recognizer class inherits from PatternRecognizer. + + :param recognizer_cls: The recognizer class to check. + :return: True if the recognizer inherits from PatternRecognizer. + """ + try: + return issubclass(recognizer_cls, PatternRecognizer) + except TypeError: + return False + + @staticmethod + def _prepare_recognizer_kwargs( + recognizer_conf: Dict[str, Any], + language_conf: Dict[str, Any], + recognizer_cls: Type[EntityRecognizer], + ) -> Dict[str, Any]: + """ + Prepare kwargs for recognizer instantiation. + + Converts supported_entities to supported_entity for PatternRecognizer subclasses. + Removes both fields if they are None to allow recognizer defaults to be used. + + :param recognizer_conf: The recognizer configuration. + :param language_conf: The language configuration. + :param recognizer_cls: The recognizer class. + :return: Prepared kwargs for recognizer instantiation. + """ + kwargs = {**recognizer_conf, **language_conf} + + # If this is a PatternRecognizer, handle supported_entities/supported_entity + if RecognizerListLoader._is_pattern_recognizer(recognizer_cls): + # Convert supported_entities (plural) to supported_entity (singular) if present + if "supported_entities" in kwargs: + supported_entities = kwargs.pop("supported_entities") + # Only set supported_entity if we have valid entities and it's not already set + if supported_entities and len(supported_entities) > 0 and "supported_entity" not in kwargs: + kwargs["supported_entity"] = supported_entities[0] + + # Remove supported_entity if it's None to allow the recognizer's default to be used + if kwargs.get("supported_entity") is None: + kwargs.pop("supported_entity", None) + else: + # For non-PatternRecognizer classes, remove both fields + # as they may not accept these parameters + kwargs.pop("supported_entities", None) + kwargs.pop("supported_entity", None) + + return kwargs + @staticmethod def get( recognizers: Dict[str, Any], @@ -216,10 +285,12 @@ def get( """ recognizer_instances = [] predefined, custom = RecognizerListLoader._split_recognizers(recognizers) + # Exclude Pydantic-normalized fields that should not be passed to recognizer constructors + # Note: We exclude both supported_entity and supported_entities here because we'll handle + # the conversion in _prepare_recognizer_kwargs predefined_to_exclude = {"enabled", "type", "supported_languages", "name"} - # For custom recognizers we keep 'supported_languages' - # so we can create per-language - # instances with their specific context values. + # For custom recognizers, we keep 'supported_languages' and don't exclude 'supported_entity' + # because PatternRecognizer needs it custom_to_exclude = {"enabled", "type"} for recognizer_conf in predefined: for language_conf in RecognizerListLoader._get_recognizer_languages( @@ -230,13 +301,18 @@ def get( recognizer_conf, to_exclude=predefined_to_exclude ) - kwargs = {**new_conf, **language_conf} recognizer_name = RecognizerListLoader.get_recognizer_name( recognizer_conf=recognizer_conf ) recognizer_cls = RecognizerListLoader.get_existing_recognizer_cls( recognizer_name=recognizer_name ) + + # Prepare kwargs, converting supported_entities to supported_entity if needed + kwargs = RecognizerListLoader._prepare_recognizer_kwargs( + new_conf, language_conf, recognizer_cls + ) + recognizer_instances.append(recognizer_cls(**kwargs)) for recognizer_conf in custom: @@ -299,7 +375,6 @@ def _merge_configuration( :param registry_configuration: The configuration to update. :param config_from_file: The configuration coming from the conf file. """ - registry_configuration.update( { k: v @@ -308,14 +383,7 @@ def _merge_configuration( } ) - missing_keys = [ - key - for key in RecognizerConfigurationLoader.mandatory_keys - if key not in registry_configuration - ] - if len(missing_keys) > 0: - raise ValueError(f"Missing the following keys: {', '.join(missing_keys)}") - + # Validation is now handled by Pydantic via ConfigurationValidator return registry_configuration @staticmethod @@ -337,14 +405,23 @@ def get( ) configuration = {} + config_from_file = {} + use_defaults = True if registry_configuration: configuration = registry_configuration.copy() + # Check if registry_configuration has all mandatory keys + # Note: supported_languages is now optional, so we only check for recognizers + mandatory_keys_set = {"recognizers", "global_regex_flags"} + config_keys = set(configuration.keys()) + if mandatory_keys_set.issubset(config_keys): + use_defaults = False if conf_file: try: with open(conf_file) as file: config_from_file = yaml.safe_load(file) + use_defaults = False except OSError: logger.warning( @@ -353,12 +430,15 @@ def get( ) with open(RecognizerConfigurationLoader._get_full_conf_path()) as file: config_from_file = yaml.safe_load(file) + use_defaults = False except Exception as e: raise ValueError( f"Failed to parse file {conf_file}." f"Error: {str(e)}" ) - else: + + # Load defaults if needed (no config provided, or registry_configuration is incomplete) + if use_defaults: with open(RecognizerConfigurationLoader._get_full_conf_path()) as file: config_from_file = yaml.safe_load(file) @@ -374,9 +454,25 @@ def get( f"got {type(registry_configuration)}" ) + # Check if config_from_file has any invalid keys (keys that aren't mandatory or valid optional keys) + # If it has keys but none of them are mandatory keys, it's likely an invalid config + if config_from_file and conf_file: + config_keys = set(config_from_file.keys()) + mandatory_keys_set = {"recognizers"} # Only recognizers is truly mandatory + + # If config has keys but none are mandatory and it's from a conf_file, + # it's probably invalid - don't merge with defaults + if config_keys and not config_keys.intersection(mandatory_keys_set): + raise ValueError( + f"Configuration file {conf_file} does not contain any of the " + f"mandatory keys: {list(mandatory_keys_set)}. " + f"Found keys: {list(config_keys)}" + ) + configuration = RecognizerConfigurationLoader._merge_configuration( registry_configuration=configuration, config_from_file=config_from_file ) + return configuration @staticmethod diff --git a/presidio-analyzer/test-output.xml b/presidio-analyzer/test-output.xml index 168e5c7e4f..a8378c41de 100644 --- a/presidio-analyzer/test-output.xml +++ b/presidio-analyzer/test-output.xml @@ -1,234 +1,4 @@ -/Users/omri.mendels/Library/Caches/pypoetry/virtualenvs/presidio-evaluator-nCKHFi6i-py3.10/bin/pytest/Users/omri.mendels/Library/Caches/pypoetry/virtualenvs/presidio-evaluator-nCKHFi6i-py3.10/bin/pytest \ No newline at end of file +INFO  stanza:core.py:348 Done loading processors!]]> \ No newline at end of file diff --git a/presidio-analyzer/tests/conf/missing_global_regex_flags.yaml b/presidio-analyzer/tests/conf/missing_global_regex_flags.yaml new file mode 100644 index 0000000000..3b7c618fbd --- /dev/null +++ b/presidio-analyzer/tests/conf/missing_global_regex_flags.yaml @@ -0,0 +1,8 @@ +# Test configuration file with missing global_regex_flags field +# This should raise a warning and use default value + +supported_languages: + - en +recognizers: + - CreditCardRecognizer + - EmailRecognizer diff --git a/presidio-analyzer/tests/conf/missing_recognizers.yaml b/presidio-analyzer/tests/conf/missing_recognizers.yaml new file mode 100644 index 0000000000..d2543c5896 --- /dev/null +++ b/presidio-analyzer/tests/conf/missing_recognizers.yaml @@ -0,0 +1,7 @@ +# Test configuration file with missing recognizers field +# This should raise an exception + +supported_languages: + - en + - es +global_regex_flags: 26 \ No newline at end of file diff --git a/presidio-analyzer/tests/conf/test_analyzer_engine.yaml b/presidio-analyzer/tests/conf/test_analyzer_engine.yaml index ec9d528984..9d68dea705 100644 --- a/presidio-analyzer/tests/conf/test_analyzer_engine.yaml +++ b/presidio-analyzer/tests/conf/test_analyzer_engine.yaml @@ -4,7 +4,7 @@ recognizer_registry: - name: CreditCardRecognizer supported_languages: - en - supported_entity: IT_FISCAL_CODE + supported_entity: CREDIT_CARD type: predefined - name: ItFiscalCodeRecognizer diff --git a/presidio-analyzer/tests/test_nlp_engine_provider.py b/presidio-analyzer/tests/test_nlp_engine_provider.py index 285fe5784a..f10517c3ed 100644 --- a/presidio-analyzer/tests/test_nlp_engine_provider.py +++ b/presidio-analyzer/tests/test_nlp_engine_provider.py @@ -251,17 +251,22 @@ def test_when_valid_nlp_engines_then_return_default_configuration(): def test_when_nlp_engines_type_is_not_tuple_then_fail(): + """Test that nlp_engines accepts lists (not just tuples) after removing legacy validation.""" nlp_engines = [SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine] - with pytest.raises(ValueError): - NlpEngineProvider(nlp_engines) - - + # After removing legacy validation, lists are now accepted (they work just as well as tuples) + provider = NlpEngineProvider(nlp_engines=nlp_engines) + engine = provider.create_engine() + assert isinstance(engine, SpacyNlpEngine) + def test_when_invalid_nlp_engine_types_then_fail(): + """Test that invalid nlp_engine types will fail when accessing attributes.""" nlp_engines = (1, 2, 3) - with pytest.raises(ValueError): - NlpEngineProvider(nlp_engines) + # After removing legacy validation, this fails with AttributeError when accessing .is_available + with pytest.raises(AttributeError): + NlpEngineProvider(nlp_engines=nlp_engines) + def test_when_valid_nlp_configuration_then_return_default_configuration(): @@ -276,13 +281,16 @@ def test_when_valid_nlp_configuration_then_return_default_configuration(): def test_when_nlp_configuration_is_passed_instead_of_nlp_engines_then_fail(): + """Test that passing nlp_configuration as positional argument fails.""" nlp_configuration = { "nlp_engine_name": "stanza", "models": [{"lang_code": "en", "model_name": "en"}] } - with pytest.raises(ValueError): + # This fails because nlp_configuration is passed as positional arg (interpreted as nlp_engines) + with pytest.raises(AttributeError): NlpEngineProvider(nlp_configuration) + def test_when_nlp_configuration_is_not_dict_then_fail(): diff --git a/presidio-analyzer/tests/test_recognizer_registry_provider.py b/presidio-analyzer/tests/test_recognizer_registry_provider.py index 0493a2968f..33fc7634ba 100644 --- a/presidio-analyzer/tests/test_recognizer_registry_provider.py +++ b/presidio-analyzer/tests/test_recognizer_registry_provider.py @@ -3,6 +3,7 @@ from pathlib import Path from typing import List from inspect import signature +import pydantic from presidio_analyzer.predefined_recognizers import SpacyRecognizer from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider @@ -73,10 +74,12 @@ def test_recognizer_registry_provider_corrupt_conf_file_fail(mandatory_recognize def test_recognizer_registry_provider_conf_file_valid_missing_keys_fail(): + """Test that a config file with invalid keys (no mandatory keys) raises an error.""" this_path = Path(__file__).parent.absolute() test_yaml = Path(this_path, "conf/recognizer_configuration_missing_keys.yaml") - with pytest.raises(ValueError): + # Config file with no mandatory keys should raise ValueError + with pytest.raises(ValueError, match="does not contain any of the mandatory keys"): RecognizerRegistryProvider(conf_file=test_yaml) @@ -149,4 +152,130 @@ def test_default_attributes_equal_recognizer_registry_signature(): registry_provider = RecognizerRegistryProvider() provider_fields = set(RecognizerConfigurationLoader.mandatory_keys) - assert registry_fields == provider_fields \ No newline at end of file + assert registry_fields == provider_fields + + +def test_recognizer_registry_provider_missing_language_config_raises(): + """ + Test that a recognizer configuration without language info gets the default languages. + """ + from presidio_analyzer.recognizer_registry.recognizer_registry_provider import RecognizerRegistryProvider + # Configuration with no supported_languages and no recognizer language + registry_configuration = { + "recognizers": [ + { + "name": "CustomRecognizer", + "type": "custom", + "supported_entity": "CUSTOM_ENTITY", + "patterns": [ + {"name": "custom", "regex": "test", "score": 0.5} + ], + # No supported_language or supported_languages + } + ] + } + # When registry_configuration is passed, it gets merged with defaults + # so supported_languages gets filled in and recognizers get created for default languages + provider = RecognizerRegistryProvider(registry_configuration=registry_configuration) + # Verify that defaults were applied + assert provider.configuration.get("supported_languages") is not None + registry = provider.create_recognizer_registry() + # Verify registry was created successfully with default language + assert len(registry.recognizers) > 0 + + +# Tests for missing required and optional fields in YAML configuration + +def test_missing_recognizers_raises_exception(): + """Test that missing recognizers raises an exception.""" + this_path = Path(__file__).parent.absolute() + conf_file = Path(this_path, "conf/missing_recognizers.yaml") + + with pytest.raises(ValueError) as exc_info: + RecognizerRegistryProvider(conf_file=conf_file) + + assert "recognizers" in str(exc_info.value) + assert "mandatory" in str(exc_info.value).lower() + + +def test_missing_global_regex_flags_uses_default(): + """Test that missing global_regex_flags uses default value without error.""" + this_path = Path(__file__).parent.absolute() + conf_file = Path(this_path, "conf/missing_global_regex_flags.yaml") + + # Should not raise an exception + provider = RecognizerRegistryProvider(conf_file=conf_file) + registry = provider.create_recognizer_registry() + + # Check that default value was used (26 = re.DOTALL | re.MULTILINE | re.IGNORECASE) + assert registry.global_regex_flags == 26 + assert registry.supported_languages == ["en"] + + +def test_valid_configuration_passes(): + """Test that a valid configuration passes validation.""" + from presidio_analyzer.input_validation import ConfigurationValidator + + config = { + "supported_languages": ["en", "es"], + "recognizers": ["CreditCardRecognizer", "EmailRecognizer"], + "global_regex_flags": 26, + } + + validated = ConfigurationValidator.validate_recognizer_registry_configuration(config) + + assert validated is not None + assert validated["supported_languages"] == ["en", "es"] + assert validated["global_regex_flags"] == 26 + + +def test_valid_configuration_without_global_regex_flags(): + """Test that configuration without global_regex_flags uses default without error.""" + from presidio_analyzer.input_validation import ConfigurationValidator + + config = { + "supported_languages": ["en"], + "recognizers": ["CreditCardRecognizer"], + } + + # Should not raise an exception + validated = ConfigurationValidator.validate_recognizer_registry_configuration(config) + + # Check default value was set + assert validated["global_regex_flags"] == 26 + assert validated["supported_languages"] == ["en"] + + +def test_recognizers_none_raises_exception(): + """Test that recognizers explicitly set to None raises an exception.""" + from presidio_analyzer.input_validation import ConfigurationValidator + + config = { + "supported_languages": ["en"], + "recognizers": None, + "global_regex_flags": 26, + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_recognizer_registry_configuration(config) + + assert "recognizers" in str(exc_info.value) + assert "required" in str(exc_info.value).lower() + + + +def test_direct_validation_with_missing_global_regex_flags(): + """Test direct validation without global_regex_flags succeeds with default.""" + from presidio_analyzer.input_validation import ConfigurationValidator + + config = { + "supported_languages": ["en"], + "recognizers": ["CreditCardRecognizer"], + } + + # Should not raise an exception + validated = ConfigurationValidator.validate_recognizer_registry_configuration(config) + + # Verify default value and successful creation + assert validated["global_regex_flags"] == 26 + assert validated["supported_languages"] == ["en"] diff --git a/presidio-analyzer/tests/test_yaml_recognizer_models.py b/presidio-analyzer/tests/test_yaml_recognizer_models.py index 8d12225989..e5a6aebb82 100644 --- a/presidio-analyzer/tests/test_yaml_recognizer_models.py +++ b/presidio-analyzer/tests/test_yaml_recognizer_models.py @@ -9,7 +9,6 @@ LanguageContextConfig, PredefinedRecognizerConfig, RecognizerRegistryConfig, - YamlRecognizerProcessor, ) @@ -77,31 +76,31 @@ def test_base_recognizer_config_full(): assert config.name == "test_recognizer" assert config.enabled is False assert config.type == "custom" - assert config.supported_languages == ["en"] - assert config.supported_language is None # Should be normalized + assert config.supported_language == "en" # Preserved as-is + assert config.supported_languages is None assert config.context == ["test", "context"] - assert config.supported_entities == ["TEST_ENTITY"] - assert config.supported_entity is None # Should be normalized + assert config.supported_entity == "TEST_ENTITY" # Preserved as-is + assert config.supported_entities is None -def test_language_normalization_single_to_multiple(): - """Test that supported_language gets normalized to supported_languages.""" +def test_language_fields_preserved(): + """Test that supported_language is preserved as-is (not normalized).""" config = BaseRecognizerConfig( name="test", supported_language="en" ) - assert config.supported_languages == ["en"] - assert config.supported_language is None + assert config.supported_language == "en" + assert config.supported_languages is None -def test_entity_normalization_single_to_multiple(): - """Test that supported_entity gets normalized to supported_entities.""" +def test_entity_fields_preserved(): + """Test that supported_entity is preserved as-is (not normalized).""" config = BaseRecognizerConfig( name="test", supported_entity="PERSON" ) - assert config.supported_entities == ["PERSON"] - assert config.supported_entity is None + assert config.supported_entity == "PERSON" + assert config.supported_entities is None def test_cannot_specify_both_language_formats(): @@ -123,7 +122,7 @@ def test_cannot_specify_both_entity_formats(): supported_entity="PERSON", supported_entities=["LOCATION", "ORG"] ) - assert "Cannot specify both 'supported_entity' and 'supported_entities'" in str(exc_info.value) + assert "has both 'supported_entity' and 'supported_entities' specified" in str(exc_info.value) def test_invalid_single_language_format(): @@ -170,7 +169,8 @@ def test_predefined_recognizer_config_with_language(): name="CreditCardRecognizer", supported_language="en" ) - assert config.supported_languages == ["en"] + assert config.supported_language == "en" + assert config.supported_languages is None def test_custom_recognizer_config_with_patterns(): @@ -189,7 +189,8 @@ def test_custom_recognizer_config_with_patterns(): ) assert config.name == "custom_test" assert config.type == "custom" - assert config.supported_entities == ["CUSTOM_ENTITY"] + assert config.supported_entity == "CUSTOM_ENTITY" + assert config.supported_entities is None assert config.patterns == patterns @@ -411,156 +412,6 @@ def test_recognizer_registry_config_auto_detect_type(): assert config.recognizers[2].type == "predefined" -def test_expand_predefined_recognizer_single_language(): - """Test expanding predefined recognizer with single language.""" - config = PredefinedRecognizerConfig( - name="EmailRecognizer", - supported_language="en" - ) - - result = YamlRecognizerProcessor.expand_recognizer_configs( - config, ["en", "es"] - ) - - assert len(result) == 1 - assert result[0]["name"] == "EmailRecognizer" - assert result[0]["supported_language"] == "en" - - -def test_expand_predefined_recognizer_multiple_languages(): - """Test expanding predefined recognizer with multiple languages.""" - config = PredefinedRecognizerConfig( - name="PhoneRecognizer", - supported_languages=["en", "es"] - ) - - result = YamlRecognizerProcessor.expand_recognizer_configs( - config, ["en", "es", "fr"] - ) - - assert len(result) == 2 - assert result[0]["supported_language"] == "en" - assert result[1]["supported_language"] == "es" - - -def test_expand_predefined_recognizer_no_language(): - """Test that predefined recognizer with no language creates single config.""" - config = PredefinedRecognizerConfig( - name="ItFiscalCodeRecognizer" - ) - - result = YamlRecognizerProcessor.expand_recognizer_configs( - config, ["en", "es", "it"] - ) - - # Should create only one config, not one per registry language - assert len(result) == 1 - # Should not have supported_language set (let recognizer use its default) - assert "supported_language" not in result[0] - assert "supported_languages" not in result[0] - assert result[0]["name"] == "ItFiscalCodeRecognizer" - assert result[0]["type"] == "predefined" - - -def test_expand_custom_recognizer_no_language_error(): - """Test that custom recognizer without language raises error.""" - config = CustomRecognizerConfig( - name="custom_test", - supported_entity="TEST", - patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] - ) - - with pytest.raises(ValueError) as exc_info: - YamlRecognizerProcessor.expand_recognizer_configs( - config, ["en", "es"] - ) - assert "Custom recognizer 'custom_test' must specify supported languages" in str(exc_info.value) - - -def test_expand_custom_recognizer_with_language(): - """Test expanding custom recognizer with specified language.""" - config = CustomRecognizerConfig( - name="custom_test", - supported_entity="TEST", - supported_language="en", - patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] - ) - - result = YamlRecognizerProcessor.expand_recognizer_configs( - config, ["en", "es"] - ) - - assert len(result) == 1 - assert result[0]["name"] == "custom_test" - assert result[0]["supported_language"] == "en" - - -def test_expand_recognizer_with_global_context(): - """Test that global context is preserved during expansion.""" - config = PredefinedRecognizerConfig( - name="EmailRecognizer", - supported_languages=["en"], - context=["global", "context"] - ) - - result = YamlRecognizerProcessor.expand_recognizer_configs( - config, ["en", "es"] - ) - - assert len(result) == 1 - assert result[0]["context"] == ["global", "context"] - - -def test_create_pattern_recognizers_from_config(): - """Test creating PatternRecognizer configs from CustomRecognizerConfig.""" - patterns = [ - {"name": "test", "regex": r"\d+", "score": 0.5} - ] - config = CustomRecognizerConfig( - name="custom_test", - supported_entity="TEST", - supported_language="en", - patterns=patterns, - deny_list=["exclude"], - deny_list_score=0.1 - ) - - result = YamlRecognizerProcessor.create_pattern_recognizers_from_config( - config, ["en"] - ) - - assert len(result) == 1 - pattern_config = result[0] - assert pattern_config["name"] == "custom_test" - assert pattern_config["supported_entities"] == ["TEST"] - assert "supported_entity" not in pattern_config - assert pattern_config["patterns"] == patterns - assert pattern_config["deny_list"] == ["exclude"] - assert pattern_config["deny_list_score"] == 0.1 - - -def test_expand_language_context_config(): - """Test expanding recognizer with LanguageContextConfig.""" - lang_config = LanguageContextConfig( - language="es", - context=["tarjeta", "credito"] - ) - config = CustomRecognizerConfig( - name="credit_card_es", - supported_entity="CREDIT_CARD", - supported_languages=[lang_config], - patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] - ) - - result = YamlRecognizerProcessor.expand_recognizer_configs( - config, ["en", "es"] - ) - - assert len(result) == 1 - assert result[0]["supported_language"] == lang_config - # Global context should be preserved even with language-specific config - assert result[0]["context"] is None # No global context in this case - def test_complete_registry_scenario(): """Test a complete registry configuration scenario.""" @@ -596,27 +447,6 @@ def test_complete_registry_scenario(): assert isinstance(config.recognizers[2], CustomRecognizerConfig) -def test_language_context_integration(): - """Test integration with LanguageContextConfig.""" - lang_configs = [ - LanguageContextConfig(language="en", context=["credit", "card"]), - LanguageContextConfig(language="es", context=["tarjeta", "credito"]) - ] - - config = PredefinedRecognizerConfig( - name="CreditCardRecognizer", - supported_languages=lang_configs - ) - - # Test expansion - result = YamlRecognizerProcessor.expand_recognizer_configs( - config, ["en", "es", "fr"] - ) - - assert len(result) == 2 - assert result[0]["supported_language"] == lang_configs[0] - assert result[1]["supported_language"] == lang_configs[1] - def test_error_handling_cascade(): """Test that validation errors are properly cascaded.""" @@ -628,6 +458,7 @@ def test_error_handling_cascade(): "name": "invalid_custom", "type": "custom", "supported_entity": "TEST", + "supported_language": "en", # Add language to avoid that error "patterns": [ { "name": "test", @@ -741,4 +572,5 @@ def test_custom_recognizer_with_language_no_global_languages(): config = RecognizerRegistryConfig(**registry_config) assert len(config.recognizers) == 1 assert isinstance(config.recognizers[0], CustomRecognizerConfig) - assert config.recognizers[0].supported_languages == ["en"] + assert config.recognizers[0].supported_language == "en" + assert config.recognizers[0].supported_languages is None From 0fbd010b90c96a203c3335a92d89c42cb77eebd7 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 13:07:37 +0200 Subject: [PATCH 03/30] linting --- .../analyzer_engine_provider.py | 5 +- .../input_validation/schemas.py | 3 +- .../yaml_recognizer_models.py | 24 ++++---- .../nlp_engine/nlp_engine_provider.py | 6 +- .../presidio_analyzer/pattern_recognizer.py | 10 +++- .../recognizer_registry.py | 2 +- .../recognizer_registry_provider.py | 9 ++- .../recognizers_loader_utils.py | 59 +++++++++++++------ 8 files changed, 75 insertions(+), 43 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py index 6e6baff842..fc67516ef3 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py @@ -59,12 +59,15 @@ def get_configuration( with open(self._get_full_conf_path()) as file: configuration = yaml.safe_load(file) except Exception: - logger.warning(f"Failed to parse file {conf_file}, resorting to default") + logger.warning( + f"Failed to parse file {conf_file}, resorting to default" + ) with open(self._get_full_conf_path()) as file: configuration = yaml.safe_load(file) # Validate configuration using Pydantic-based ConfigurationValidator from presidio_analyzer.input_validation import ConfigurationValidator + ConfigurationValidator.validate_analyzer_configuration(configuration) logger.debug("Analyzer configuration validation passed") diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index bf59b86374..5ce672039d 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -3,8 +3,10 @@ from typing import Any, Dict, List, Union from pydantic import ValidationError + from .yaml_recognizer_models import RecognizerRegistryConfig + class ConfigurationValidator: """Class for validating configurations using Pydantic-enabled classes.""" @@ -81,7 +83,6 @@ def validate_recognizer_registry_configuration( ) -> Dict[str, Any]: """Validate recognizer registry validation using Pydantic models.""" try: - # Use Pydantic model for validation validated_config = RecognizerRegistryConfig(**config) # Use model_dump() without exclude_unset to include default values diff --git a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py index aad3fbf89b..66a1a918ba 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py @@ -100,13 +100,13 @@ def validate_entity_configuration(self): """Ensure proper entity validation.""" # Check if user provided both (before we modify them) user_provided_both = ( - self.supported_entity is not None - and self.supported_entities is not None + self.supported_entity is not None and self.supported_entities is not None ) if user_provided_both: raise ValueError( - f"Recognizer {self.name} has both ""'supported_entity' and 'supported_entities' specified." + f"Recognizer {self.name} has both " + "'supported_entity' and 'supported_entities' specified." ) return self @@ -264,9 +264,7 @@ class RecognizerRegistryConfig(BaseModel): supported_languages: Optional[List[str]] = Field( default=None, description="List of supported languages" ) - global_regex_flags: int = Field( - default=26, description="Global regex flags" - ) + global_regex_flags: int = Field(default=26, description="Global regex flags") recognizers: List[ Union[PredefinedRecognizerConfig, CustomRecognizerConfig, str] ] = Field(default_factory=list, description="List of recognizer configurations") @@ -295,13 +293,19 @@ def validate_languages_for_custom_recognizers(self): for recognizer in self.recognizers: if isinstance(recognizer, CustomRecognizerConfig): # Check if this custom recognizer has its own language config - if not recognizer.supported_language and not recognizer.supported_languages: + if ( + not recognizer.supported_language + and not recognizer.supported_languages + ): # If no language config on recognizer, we need global languages if not self.supported_languages: raise ValueError( - f"Language configuration missing for custom recognizer '{recognizer.name}': " - "Either specify 'supported_languages' on the recognizer or provide " - "global 'supported_languages' in the registry configuration." + f"Language configuration missing for custom recognizer " + f"'{recognizer.name}': " + "Either specify 'supported_languages' " + "on the recognizer or provide " + "global 'supported_languages' in the " + "registry configuration." ) return self diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 65901ddca3..850b1b4218 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -63,7 +63,6 @@ def __init__( if conf_file or conf_file == "": if conf_file == "": raise ValueError("conf_file is empty") - # Validate file path using ConfigurationValidator - let Pydantic errors propagate ConfigurationValidator.validate_file_path(conf_file) self.nlp_configuration = self._read_nlp_conf(conf_file) @@ -98,9 +97,7 @@ def create_engine(self) -> NlpEngine: nlp_engine_class = self.nlp_engines[nlp_engine_name] nlp_models = self.nlp_configuration["models"] - ner_model_configuration = self.nlp_configuration.get( - "ner_model_configuration" - ) + ner_model_configuration = self.nlp_configuration.get("ner_model_configuration") if ner_model_configuration: ner_model_configuration = NerModelConfiguration.from_dict( ner_model_configuration @@ -115,4 +112,3 @@ def create_engine(self) -> NlpEngine: f"Loaded models: {list(engine.nlp.keys())}" ) return engine - diff --git a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py index 66ce893f3c..df4a051ae7 100644 --- a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py @@ -198,7 +198,7 @@ def __analyze_patterns( logger.debug( "--- match_time[%s]: %.6f seconds", pattern.name, - match_time.total_seconds() + match_time.total_seconds(), ) for match in matches: @@ -276,9 +276,13 @@ def from_dict(cls, entity_recognizer_dict: Dict) -> "PatternRecognizer": # Transform supported_entities (plural) to supported_entity (singular) # PatternRecognizer only accepts supported_entity (singular) - if "supported_entity" in entity_recognizer_dict and "supported_entities" in entity_recognizer_dict: + if ( + "supported_entity" in entity_recognizer_dict + and "supported_entities" in entity_recognizer_dict + ): raise ValueError( - "Both 'supported_entity' and 'supported_entities' are present in the input dictionary. " + "Both 'supported_entity' and 'supported_entities' " + "are present in the input dictionary. " "Only one should be provided." ) if "supported_entities" in entity_recognizer_dict: diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py index 0acab3698e..8c00ad90be 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry.py @@ -56,7 +56,7 @@ def __init__( def _create_nlp_recognizer( self, nlp_engine: Optional[NlpEngine] = None, - supported_language: Optional[str] = None + supported_language: Optional[str] = None, ) -> SpacyRecognizer: nlp_recognizer = self.get_nlp_recognizer(nlp_engine) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py index 15012d555e..73c646227e 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py @@ -6,6 +6,7 @@ from typing import Any, Dict, List, Optional, Union from presidio_analyzer import EntityRecognizer +from presidio_analyzer.input_validation import ConfigurationValidator from presidio_analyzer.nlp_engine import NlpEngine from presidio_analyzer.predefined_recognizers import SpacyRecognizer from presidio_analyzer.recognizer_registry import RecognizerRegistry @@ -13,7 +14,6 @@ RecognizerConfigurationLoader, RecognizerListLoader, ) -from presidio_analyzer.input_validation import ConfigurationValidator logger = logging.getLogger("presidio-analyzer") @@ -58,7 +58,11 @@ def __init__( ) # Validate configuration using Pydantic - self.configuration = ConfigurationValidator.validate_recognizer_registry_configuration(self.configuration) + self.configuration = ( + ConfigurationValidator.validate_recognizer_registry_configuration( + self.configuration + ) + ) self.nlp_engine = nlp_engine def create_recognizer_registry(self) -> RecognizerRegistry: @@ -236,4 +240,3 @@ def __remove_disabled_nlp_recognizers( f"Disabled {recognizer.__class__.__name__} " f"recognizer for language {language}." ) - diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py index d4a297263d..36266ddb54 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py @@ -145,15 +145,19 @@ def _create_custom_recognizers( """Create a custom recognizer for each language, based on the provided conf.""" # legacy recognizer (has supported_language set to a value, not None) if recognizer_conf.get("supported_language"): - # Remove supported_languages field (plural) if present, as we're using supported_language (singular) - conf_copy = {k: v for k, v in recognizer_conf.items() if k != "supported_languages"} - - # Transform supported_entities -> supported_entity (PatternRecognizer expects singular) + # Remove supported_languages field (plural) if present, + # as we're using supported_language (singular) + conf_copy = { + k: v for k, v in recognizer_conf.items() if k != "supported_languages" + } + + # Transform supported_entities -> supported_entity + # (PatternRecognizer expects singular) if "supported_entities" in conf_copy: supported_entities = conf_copy.pop("supported_entities") if "supported_entity" not in conf_copy and supported_entities: conf_copy["supported_entity"] = supported_entities[0] - + return [PatternRecognizer.from_dict(conf_copy)] recognizers = [] @@ -167,7 +171,8 @@ def _create_custom_recognizers( if k not in ["enabled", "type", "supported_languages"] } - # Transform supported_entities -> supported_entity (PatternRecognizer expects singular) + # Transform supported_entities -> supported_entity + # (PatternRecognizer expects singular) if "supported_entities" in copied_recognizer: supported_entities = copied_recognizer.pop("supported_entities") if "supported_entity" not in copied_recognizer and supported_entities: @@ -242,7 +247,8 @@ def _prepare_recognizer_kwargs( """ Prepare kwargs for recognizer instantiation. - Converts supported_entities to supported_entity for PatternRecognizer subclasses. + Converts supported_entities to supported_entity + for PatternRecognizer subclasses. Removes both fields if they are None to allow recognizer defaults to be used. :param recognizer_conf: The recognizer configuration. @@ -254,14 +260,21 @@ def _prepare_recognizer_kwargs( # If this is a PatternRecognizer, handle supported_entities/supported_entity if RecognizerListLoader._is_pattern_recognizer(recognizer_cls): - # Convert supported_entities (plural) to supported_entity (singular) if present + # Convert supported_entities (plural) to supported_entity + # (singular) if present if "supported_entities" in kwargs: supported_entities = kwargs.pop("supported_entities") - # Only set supported_entity if we have valid entities and it's not already set - if supported_entities and len(supported_entities) > 0 and "supported_entity" not in kwargs: + # Only set supported_entity if we have valid entities + # and it's not already set + if ( + supported_entities + and len(supported_entities) > 0 + and "supported_entity" not in kwargs + ): kwargs["supported_entity"] = supported_entities[0] - # Remove supported_entity if it's None to allow the recognizer's default to be used + # Remove supported_entity if it's None + # to allow the recognizer's default to be used if kwargs.get("supported_entity") is None: kwargs.pop("supported_entity", None) else: @@ -285,11 +298,14 @@ def get( """ recognizer_instances = [] predefined, custom = RecognizerListLoader._split_recognizers(recognizers) - # Exclude Pydantic-normalized fields that should not be passed to recognizer constructors - # Note: We exclude both supported_entity and supported_entities here because we'll handle + # Exclude Pydantic-normalized fields that should not + # be passed to recognizer constructors + # Note: We exclude both supported_entity + # and supported_entities here because we'll handle # the conversion in _prepare_recognizer_kwargs predefined_to_exclude = {"enabled", "type", "supported_languages", "name"} - # For custom recognizers, we keep 'supported_languages' and don't exclude 'supported_entity' + # For custom recognizers, we keep 'supported_languages' + # and don't exclude 'supported_entity' # because PatternRecognizer needs it custom_to_exclude = {"enabled", "type"} for recognizer_conf in predefined: @@ -308,7 +324,8 @@ def get( recognizer_name=recognizer_name ) - # Prepare kwargs, converting supported_entities to supported_entity if needed + # Prepare kwargs, converting supported_entities + # to supported_entity if needed kwargs = RecognizerListLoader._prepare_recognizer_kwargs( new_conf, language_conf, recognizer_cls ) @@ -411,7 +428,8 @@ def get( if registry_configuration: configuration = registry_configuration.copy() # Check if registry_configuration has all mandatory keys - # Note: supported_languages is now optional, so we only check for recognizers + # Note: supported_languages is now optional, + # so we only check for recognizers mandatory_keys_set = {"recognizers", "global_regex_flags"} config_keys = set(configuration.keys()) if mandatory_keys_set.issubset(config_keys): @@ -437,7 +455,8 @@ def get( f"Failed to parse file {conf_file}." f"Error: {str(e)}" ) - # Load defaults if needed (no config provided, or registry_configuration is incomplete) + # Load defaults if needed (no config provided, + # or registry_configuration is incomplete) if use_defaults: with open(RecognizerConfigurationLoader._get_full_conf_path()) as file: config_from_file = yaml.safe_load(file) @@ -454,8 +473,10 @@ def get( f"got {type(registry_configuration)}" ) - # Check if config_from_file has any invalid keys (keys that aren't mandatory or valid optional keys) - # If it has keys but none of them are mandatory keys, it's likely an invalid config + # Check if config_from_file has any invalid keys + # (keys that aren't mandatory or valid optional keys) + # If it has keys but none of them are mandatory keys, + # it's likely an invalid config if config_from_file and conf_file: config_keys = set(config_from_file.keys()) mandatory_keys_set = {"recognizers"} # Only recognizers is truly mandatory From c4841b539e94fd749d189e71b5561f9d86713e17 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 22:24:21 +0200 Subject: [PATCH 04/30] Update presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../input_validation/yaml_recognizer_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py index 66a1a918ba..990dba0279 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py @@ -197,7 +197,7 @@ class CustomRecognizerConfig(BaseRecognizerConfig): @field_validator("patterns") @classmethod - def validate_patterns(cls, patterns: Optional[List[Dict]]) -> Optional[str]: + def validate_patterns(cls, patterns: Optional[List[Dict]]) -> Optional[List[Dict]]: """Validate single language code format.""" if patterns and not isinstance(patterns, list): raise ValueError(f"Patterns should be a list: {patterns}") From 3939fc93308ed99e9b62bc515e23b7a87f81d328 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 22:24:51 +0200 Subject: [PATCH 05/30] Update presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../input_validation/yaml_recognizer_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py index 990dba0279..fa21788e63 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py @@ -379,7 +379,7 @@ def parse_recognizers(cls, v): return parsed_recognizers @classmethod - def __check_if_predefined(cls, recognizer_name: Any | None): + def __check_if_predefined(cls, recognizer_name: Optional[Any]): try: from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( RecognizerListLoader, From d7cb69b963a11fe810e37f24a08ecaa0799d13bb Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 22:25:07 +0200 Subject: [PATCH 06/30] Update presidio-analyzer/tests/test_configuration_validator.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- presidio-analyzer/tests/test_configuration_validator.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/tests/test_configuration_validator.py b/presidio-analyzer/tests/test_configuration_validator.py index 36d74bc25e..c9dd3d4f88 100644 --- a/presidio-analyzer/tests/test_configuration_validator.py +++ b/presidio-analyzer/tests/test_configuration_validator.py @@ -1,4 +1,4 @@ -"""Tests for the Pydantic-based validation validation system using existing adapted classes.""" +"""Tests for the Pydantic-based validation system using existing adapted classes.""" import pytest from presidio_analyzer.input_validation import ConfigurationValidator From 3b61469123bf2c7ed5e614eafb7ae06520400681 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 22:25:33 +0200 Subject: [PATCH 07/30] Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- presidio-analyzer/presidio_analyzer/input_validation/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 5ce672039d..217e3def77 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -51,7 +51,7 @@ def validate_score_threshold(threshold: float) -> float: @staticmethod def validate_nlp_configuration(config: Dict[str, Any]) -> Dict[str, Any]: - """Validate NLP validation structure. + """Validate NLP configuration structure. :param config: NLP Configuration to validate. """ From 251cefc2cec8cedebab2ff93a5904e7177070405 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 22:28:02 +0200 Subject: [PATCH 08/30] Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- .../presidio_analyzer/input_validation/schemas.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 217e3def77..93867b0d44 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -56,13 +56,13 @@ def validate_nlp_configuration(config: Dict[str, Any]) -> Dict[str, Any]: :param config: NLP Configuration to validate. """ if not isinstance(config, dict): - raise ValueError("NLP validation must be a dictionary") + raise ValueError("NLP configuration must be a dictionary") required_fields = ["nlp_engine_name", "models"] missing_fields = [field for field in required_fields if field not in config] if missing_fields: raise ValueError( - f"NLP validation missing required fields: {missing_fields}" + f"NLP configuration missing required fields: {missing_fields}" ) # Validate models structure From 1420bd5c86237e6b54873343e0bb04a7d6eefde1 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 22:28:34 +0200 Subject: [PATCH 09/30] Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- presidio-analyzer/presidio_analyzer/input_validation/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 93867b0d44..5cbe03a87f 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -81,7 +81,7 @@ def validate_nlp_configuration(config: Dict[str, Any]) -> Dict[str, Any]: def validate_recognizer_registry_configuration( config: Dict[str, Any], ) -> Dict[str, Any]: - """Validate recognizer registry validation using Pydantic models.""" + """Validate recognizer registry configuration using Pydantic models.""" try: # Use Pydantic model for validation validated_config = RecognizerRegistryConfig(**config) From c677b793ae6fe635489e4bc60f760554e0e82c55 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 22:29:07 +0200 Subject: [PATCH 10/30] Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- presidio-analyzer/presidio_analyzer/input_validation/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 5cbe03a87f..26c67335bd 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -97,7 +97,7 @@ def validate_recognizer_registry_configuration( def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any]: """Validate recognizer registry config.""" if not isinstance(config, dict): - raise ValueError("Recognizer registry validation must be a dictionary") + raise ValueError("Recognizer registry configuration must be a dictionary") # Validate supported languages if "supported_languages" in config: From 421e47d6a2d7ccc5eb03c9dd982885aa6ed95a3d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 22:30:15 +0200 Subject: [PATCH 11/30] Update presidio-analyzer/tests/test_recognizer_registry_provider.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- presidio-analyzer/tests/test_recognizer_registry_provider.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/presidio-analyzer/tests/test_recognizer_registry_provider.py b/presidio-analyzer/tests/test_recognizer_registry_provider.py index 33fc7634ba..9bbf749bb0 100644 --- a/presidio-analyzer/tests/test_recognizer_registry_provider.py +++ b/presidio-analyzer/tests/test_recognizer_registry_provider.py @@ -3,8 +3,6 @@ from pathlib import Path from typing import List from inspect import signature -import pydantic - from presidio_analyzer.predefined_recognizers import SpacyRecognizer from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider from presidio_analyzer.recognizer_registry.recognizers_loader_utils import RecognizerConfigurationLoader From 2901b136caf00372ae8958efd1831d8cee5bba14 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 22:30:27 +0200 Subject: [PATCH 12/30] Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- presidio-analyzer/presidio_analyzer/input_validation/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 26c67335bd..5b302b3e39 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -115,7 +115,7 @@ def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: """Validate analyzer engine validation.""" if not isinstance(config, dict): - raise ValueError("Analyzer validation must be a dictionary") + raise ValueError("Analyzer configuration must be a dictionary") # Validate supported languages if present if "supported_languages" in config: From 8b84370b9db8d5a46877410eb471c217d5ae8f8c Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Tue, 11 Nov 2025 23:35:25 +0200 Subject: [PATCH 13/30] Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- presidio-analyzer/presidio_analyzer/input_validation/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 5b302b3e39..26ace2e4e3 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -88,7 +88,7 @@ def validate_recognizer_registry_configuration( # Use model_dump() without exclude_unset to include default values return validated_config.model_dump(exclude_unset=False) except ValidationError as e: - raise ValueError(f"Invalid recognizer registry validation: {e}") + raise ValueError(f"Invalid recognizer registry configuration: {e}") except ImportError: # Fallback to basic validation if models not available return ConfigurationValidator._validate_recognizer_registry_basic(config) From 108e3d02aba8534fd94d8aad92d33020be2c7ce4 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 19 Nov 2025 13:02:08 +0200 Subject: [PATCH 14/30] ruff on the entire analyzer codebase --- .../analyzer_engine_provider.py | 8 + .../conf/default_analyzer_full.yaml | 154 +++++++++++++++++ .../context_aware_enhancers/__init__.py | 1 + .../input_validation/schemas.py | 161 +++++++++++++++++- .../yaml_recognizer_models.py | 60 ++++++- .../nlp_engine/stanza_nlp_engine.py | 4 +- .../predefined_recognizers/__init__.py | 1 - .../india/in_gstin_recognizer.py | 6 +- .../thai/th_tnin_recognizer.py | 2 - .../third_party/__init__.py | 5 +- .../third_party/ahds_recognizer.py | 16 +- .../conf/test_minimal_registry_conf.yaml | 3 +- .../tests/test_configuration_validator.py | 35 ++++ .../test_recognizer_registry_provider.py | 33 +--- .../tests/test_yaml_recognizer_models.py | 57 +++++-- 15 files changed, 475 insertions(+), 71 deletions(-) create mode 100644 presidio-analyzer/presidio_analyzer/conf/default_analyzer_full.yaml diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py index fc67516ef3..c338b41544 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py @@ -5,6 +5,7 @@ import yaml from presidio_analyzer import AnalyzerEngine, RecognizerRegistry +from presidio_analyzer.input_validation import ConfigurationValidator from presidio_analyzer.nlp_engine import NlpEngine, NlpEngineProvider from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider @@ -29,6 +30,13 @@ def __init__( nlp_engine_conf_file: Optional[Union[Path, str]] = None, recognizer_registry_conf_file: Optional[Union[Path, str]] = None, ): + if analyzer_engine_conf_file: + ConfigurationValidator.validate_file_path(analyzer_engine_conf_file) + if nlp_engine_conf_file: + ConfigurationValidator.validate_file_path(nlp_engine_conf_file) + if recognizer_registry_conf_file: + ConfigurationValidator.validate_file_path(recognizer_registry_conf_file) + self.configuration = self.get_configuration(conf_file=analyzer_engine_conf_file) self.nlp_engine_conf_file = nlp_engine_conf_file self.recognizer_registry_conf_file = recognizer_registry_conf_file diff --git a/presidio-analyzer/presidio_analyzer/conf/default_analyzer_full.yaml b/presidio-analyzer/presidio_analyzer/conf/default_analyzer_full.yaml new file mode 100644 index 0000000000..a3603e3947 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/conf/default_analyzer_full.yaml @@ -0,0 +1,154 @@ +supported_languages: + - en +default_score_threshold: 0 +nlp_configuration: + nlp_engine_name: spacy + models: + - lang_code: en + model_name: en_core_web_lg + + ner_model_configuration: + model_to_presidio_entity_mapping: + PER: PERSON + PERSON: PERSON + NORP: NRP + FAC: LOCATION + LOC: LOCATION + LOCATION: LOCATION + GPE: LOCATION + ORG: ORGANIZATION + ORGANIZATION: ORGANIZATION + DATE: DATE_TIME + TIME: DATE_TIME + + low_confidence_score_multiplier: 0.4 + low_score_entity_names: + - + labels_to_ignore: + - ORG + - ORGANIZATION # has many false positives + - CARDINAL + - EVENT + - LANGUAGE + - LAW + - MONEY + - ORDINAL + - PERCENT + - PRODUCT + - QUANTITY + - WORK_OF_ART + + +recognizer_registry: + # global_regex_flags: 26 + recognizers: + # Recognizers listed here can either be loaded from the recognizers defined in code (type: predefined), + # or created based on the provided configuration (type: custom). + # For predefined: + # - If only a recognizer name is provided, a predefined recognizer with this name and default parameters will be loaded. + # - If a parameter isn't provided, the default one would be loaded. + # For custom: + # - See an example configuration here: https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/conf/example_recognizers.yaml + # - Custom pattern recognizers with this configuration can be added to this file, with type: custom + # For recognizers supporting more than one language, an instance of the recognizer for each language will be created. + # For example, see the CreditCardRecognizer definition below: + - name: CreditCardRecognizer + supported_languages: + - language: en + context: [credit, card, visa, mastercard, cc, amex, discover, jcb, diners, maestro, instapayment] + type: predefined + + - name: UsBankRecognizer + type: predefined + + - name: UsLicenseRecognizer + type: predefined + + - name: UsItinRecognizer + type: predefined + + - name: UsPassportRecognizer + type: predefined + + - name: UsSsnRecognizer + type: predefined + + - name: NhsRecognizer + type: predefined + + - name: UkNinoRecognizer + type: predefined + enabled: false + + - name: SgFinRecognizer + type: predefined + enabled: false + + - name: AuAbnRecognizer + type: predefined + enabled: false + + - name: AuAcnRecognizer + type: predefined + enabled: false + + - name: AuTfnRecognizer + type: predefined + enabled: false + + - name: AuMedicareRecognizer + type: predefined + enabled: false + + - name: InPanRecognizer + type: predefined + enabled: false + + - name: InAadhaarRecognizer + supported_languages: + - en + type: predefined + enabled: false + + - name: InVehicleRegistrationRecognizer + type: predefined + enabled: false + + - name: InPassportRecognizer + type: predefined + enabled: false + + - name: CryptoRecognizer + type: predefined + + - name: DateRecognizer + type: predefined + + - name: EmailRecognizer + type: predefined + + - name: IbanRecognizer + type: predefined + + - name: IpRecognizer + type: predefined + + - name: MedicalLicenseRecognizer + type: predefined + + - name: PhoneRecognizer + type: predefined + + - name: UrlRecognizer + type: predefined + + - name: InVoterRecognizer + type: predefined + enabled: false + + - name: InGstinRecognizer + type: predefined + enabled: false + + - name: SpacyRecognizer + type: predefined diff --git a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/__init__.py b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/__init__.py index a6667dc57d..fbcfdf91bc 100644 --- a/presidio-analyzer/presidio_analyzer/context_aware_enhancers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/context_aware_enhancers/__init__.py @@ -1,4 +1,5 @@ """Context awareness modules.""" + from .context_aware_enhancer import ContextAwareEnhancer from .lemma_context_aware_enhancer import LemmaContextAwareEnhancer diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 5ce672039d..682b0b1207 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -88,7 +88,11 @@ def validate_recognizer_registry_configuration( # Use model_dump() without exclude_unset to include default values return validated_config.model_dump(exclude_unset=False) except ValidationError as e: - raise ValueError(f"Invalid recognizer registry validation: {e}") + # Format the error in a human-readable way + formatted_error = ConfigurationValidator._format_custom_recognziers_errors( + e + ) + raise ValueError(formatted_error) except ImportError: # Fallback to basic validation if models not available return ConfigurationValidator._validate_recognizer_registry_basic(config) @@ -99,6 +103,18 @@ def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any if not isinstance(config, dict): raise ValueError("Recognizer registry validation must be a dictionary") + # Define valid top-level keys for recognizer registry configuration + valid_keys = {"supported_languages", "global_regex_flags", "recognizers"} + + # Check for unknown keys + unknown_keys = set(config.keys()) - valid_keys + if unknown_keys: + raise ValueError( + f"Unknown configuration key(s) in " + f"recognizer_registry: {sorted(unknown_keys)}. " + f"Valid keys are: {sorted(valid_keys)}" + ) + # Validate supported languages if "supported_languages" in config: ConfigurationValidator.validate_language_codes( @@ -117,6 +133,23 @@ def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: if not isinstance(config, dict): raise ValueError("Analyzer validation must be a dictionary") + # Define valid top-level keys for analyzer configuration + valid_keys = { + "supported_languages", + "default_score_threshold", + "nlp_configuration", + "recognizer_registry", + } + + # Check for unknown keys + unknown_keys = set(config.keys()) - valid_keys + if unknown_keys: + raise ValueError( + f"Unknown configuration key(s) in analyzer " + f"configuration: {sorted(unknown_keys)}. " + f"Valid keys are: {sorted(valid_keys)}" + ) + # Validate supported languages if present if "supported_languages" in config: ConfigurationValidator.validate_language_codes( @@ -136,8 +169,128 @@ def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: ) if "recognizer_registry" in config: - ConfigurationValidator.validate_recognizer_registry_configuration( - config["recognizer_registry"] - ) + if not isinstance(config["recognizer_registry"], dict): + raise ValueError("recognizer_registry must be a dictionary") return config + + @staticmethod + def _format_custom_recognziers_errors(error: ValidationError) -> str: + """Format Pydantic ValidationError into human-readable message. + + :param error: Pydantic ValidationError to format + :return: Human-readable error message + """ + messages = [] + messages.append("Configuration validation failed:\n") + + for err in error.errors(): + error_type = err.get("type", "") + location = " -> ".join(str(loc) for loc in err.get("loc", [])) + msg = err.get("msg", "") + input_value = err.get("input", None) + + # Build a more readable error message based on error type and location + if error_type == "missing": + # Handle missing required fields + field_name = err["loc"][-1] if err.get("loc") else "field" + + if "recognizers" in location and field_name == "supported_entity": + messages.append( + f" ✗ Missing required field '{field_name}' " + f"in custom recognizer\n" + f" Location: {location}\n" + f" Fix: Add 'supported_entity' to specify which " + f"entity type this recognizer detects.\n" + f" Example:\n" + f' supported_entity: "MY_ENTITY"\n' + ) + elif field_name == "patterns": + messages.append( + f" ✗ Custom recognizer is missing 'patterns' or 'deny_list'\n" + f" Location: {location}\n" + f" Fix: Add at least one of the following:\n" + f" - patterns: List of regex patterns to match\n" + f" - deny_list: List of words to detect\n" + f" Example:\n" + f' context: ["my", "entity", "keyword"]\n' + f" patterns:\n" + f' - name: "my_pattern"\n' + f' regex: "[A-Z]{{3}}-\\d{{4}}"\n' + f" score: 0.8\n" + ) + else: + messages.append( + f" ✗ Missing required field: '{field_name}'\n" + f" Location: {location}\n" + f" Fix: This field is required. " + f"Please add it to your configuration.\n" + ) + + elif error_type == "value_error": + # Handle custom validation errors + # Check if it's the missing patterns/deny_list error + if "patterns" in msg.lower() and "deny_list" in msg.lower(): + messages.append( + f" ✗ Custom recognizer is missing 'patterns' or 'deny_list'\n" + f" Location: {location}\n" + f" Error: {msg}\n" + f" Fix: Add at least one of the following:\n" + f" - patterns: List of regex patterns to match\n" + f" - deny_list: List of words to detect\n" + f" Example:\n" + f' context: ["my", "entity", "keyword"]\n' + f" patterns:\n" + f' - name: "my_pattern"\n' + f' regex: "[A-Z]{{3}}-\\d{{4}}"\n' + f" score: 0.8\n" + ) + else: + messages.append( + f" ✗ Validation error at: {location}\n" f" Error: {msg}\n" + ) + + elif error_type in ("string_type", "list_type", "dict_type"): + # Handle type errors + expected_type = error_type.replace("_type", "") + messages.append( + f" ✗ Type error at: {location}\n" + f" Expected: {expected_type}\n" + f" Got: {type(input_value).__name__ if input_value is not None else 'None'}\n" # noqa: E501 + f" Value: {input_value}\n" + ) + + elif "union" in error_type: + # Handle union type errors + messages.append( + f" ✗ Invalid value at: {location}\n" f" Error: {msg}\n" + ) + + else: + # Generic error message + messages.append(f" ✗ Error at: {location}\n" f" {msg}\n") + + # Add helpful tips at the end + messages.append("\n💡 Common fixes for custom recognizers:") + messages.append(" • Ensure 'type: custom' is set") + messages.append(" • Add 'supported_entity' (e.g., 'MY_ENTITY')") + messages.append(" • Define 'patterns' or 'deny_list'") + messages.append( + " • Specify language(s) via 'supported_language' or 'supported_languages'" + ) + messages.append( + " • Optionally add 'context' words to improve detection accuracy" + ) + messages.append("\n Example custom recognizer:") + messages.append(" recognizers:") + messages.append(' - name: "MyRecognizer"') + messages.append(' type: "custom"') + messages.append(' supported_entity: "MY_ENTITY"') + messages.append(' supported_language: "en"') + messages.append(' context: ["my", "entity", "keyword"]') + messages.append(" patterns:") + messages.append(' - name: "my_pattern"') + messages.append(' regex: "[A-Z]{3}-\\d{4}"') + messages.append(" score: 0.8") + + return "\n".join(messages) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py index 66a1a918ba..b459209ea6 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py @@ -156,7 +156,9 @@ def validate_predefined_recognizer_exists(self): ] raise ValueError( f"Predefined recognizer '{self.name}' not found. " - f"Available predefined recognizers: " + f"If you want to add your own custom recognizer, " + f"mark is as type: 'custom'. " + f"The available predefined recognizers are: " f"{', '.join(sorted(available_recognizers))}" ) from e return self @@ -195,6 +197,40 @@ class CustomRecognizerConfig(BaseRecognizerConfig): model_config = ConfigDict(arbitrary_types_allowed=True) + @model_validator(mode="before") + @classmethod + def check_predefined_name_conflict(cls, data: Any) -> Any: + """Check if custom recognizer name conflicts with predefined recognizer. + + This validation runs BEFORE field validation to provide a clearer error message + when someone tries to use a predefined recognizer name for a custom recognizer. + """ + if isinstance(data, dict): + name = data.get("name") + if name: + try: + # Lazy import to avoid circular dependency + from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( # noqa + RecognizerListLoader, + ) + + try: + RecognizerListLoader.get_existing_recognizer_cls(name) + raise ValueError( + f"Recognizer '{name}' conflicts with a predefined " + f"recognizer. " + f"Custom recognizers cannot use the same name " + f"as predefined recognizers. " + f"Either use type: 'predefined' or choose a different name " + f"for your custom recognizer." + ) + except ValueError as e: + if "was not found" not in str(e): + raise + except (ImportError, ModuleNotFoundError): + pass + return data + @field_validator("patterns") @classmethod def validate_patterns(cls, patterns: Optional[List[Dict]]) -> Optional[str]: @@ -229,6 +265,8 @@ def validate_single_language(cls, v: Optional[str]) -> Optional[str]: def validate_configuration(self): """Ensure configuration is valid.""" # Check if user accidentally marked a predefined recognizer as custom + # This check should happen BEFORE checking patterns/deny_list + # to give a more specific error message try: # Lazy import to avoid circular dependency from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( @@ -249,7 +287,7 @@ def validate_configuration(self): except (ImportError, ModuleNotFoundError): pass - # Validate patterns or deny_list + # Validate patterns or deny_list only after name check if not self.patterns and not self.deny_list: raise ValueError( "Custom recognizer must have at least one " @@ -269,6 +307,8 @@ class RecognizerRegistryConfig(BaseModel): Union[PredefinedRecognizerConfig, CustomRecognizerConfig, str] ] = Field(default_factory=list, description="List of recognizer configurations") + model_config = ConfigDict(extra="forbid") + @field_validator("supported_languages") @classmethod def validate_language_codes(cls, v: Optional[List[str]]) -> Optional[List[str]]: @@ -316,6 +356,16 @@ def validate_global_regex_flags(cls, v: int) -> int: """Validate global_regex_flags and warn if using default.""" return v + @model_validator(mode="after") + def validate_recognizers_not_empty(self): + """Ensure recognizers list is not empty after all defaults are applied.""" + if not self.recognizers: + raise ValueError( + "The 'recognizers' field must contain at least one recognizer. " + "Found an empty recognizers list." + ) + return self + @field_validator("recognizers", mode="before") @classmethod def parse_recognizers(cls, v): @@ -329,6 +379,12 @@ def parse_recognizers(cls, v): if not isinstance(v, list): raise ValueError("Recognizers must be a list") + if len(v) == 0: + raise ValueError( + "The 'recognizers' field must contain at least one recognizer. " + "Found an empty recognizers list." + ) + parsed_recognizers = [] for recognizer in v: if isinstance(recognizer, str): diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py index fc39e27d40..079587a8c2 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/stanza_nlp_engine.py @@ -298,7 +298,7 @@ def __call__(self, text): f"expansion or because the character offsets don't map to " f"valid tokens produced by the Stanza tokenizer:\n" f"Words: {words}\n" - f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}", # noqa + f"Entities: {[(e.text, e.type, e.start_char, e.end_char) for e in snlp_doc.entities]}", # noqa stacklevel=4, ) else: @@ -375,7 +375,7 @@ def __get_words_and_spaces(words, text): text_spaces.append(False) return text_words, text_spaces - def token_vector(self, token:Token): + def token_vector(self, token: Token): """Get Stanza's pretrained word embedding for given token. :param token: The token whose embedding will be returned diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py index ef9f2d2272..acaccaa538 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/__init__.py @@ -1,6 +1,5 @@ """Predefined recognizers package. Holds all the default recognizers.""" - # Australia recognizers from presidio_analyzer.predefined_recognizers.nlp_engine_recognizers.transformers_recognizer import ( # noqa: E501 TransformersRecognizer, diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py index 13a381551f..14662e609b 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/india/in_gstin_recognizer.py @@ -90,8 +90,8 @@ def _sanitize_value(self, text: str) -> str: # First, try to extract GSTIN pattern from the text gstin_pattern = ( - r'\b((?:0[1-9]|[1-3][0-7])[A-Za-z]{5}[0-9]{4}[A-Za-z]{1}' - r'[0-9A-Za-z]{1}Z[0-9A-Za-z]{1})\b' + r"\b((?:0[1-9]|[1-3][0-7])[A-Za-z]{5}[0-9]{4}[A-Za-z]{1}" + r"[0-9A-Za-z]{1}Z[0-9A-Za-z]{1})\b" ) match = re.search(gstin_pattern, text.upper()) if match: @@ -129,7 +129,7 @@ def _validate_gstin(self, gstin: str) -> bool: return False # Check 14th character should be 'Z' - if gstin[13] != 'Z': + if gstin[13] != "Z": return False # Check 15th character (checksum) diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/thai/th_tnin_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/thai/th_tnin_recognizer.py index e7b33f41ed..39a46eff75 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/thai/th_tnin_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/country_specific/thai/th_tnin_recognizer.py @@ -61,7 +61,6 @@ class ThTninRecognizer(PatternRecognizer): "รหัสปชช", ] - def __init__( self, patterns: Optional[List[Pattern]] = None, @@ -105,7 +104,6 @@ def validate_result(self, pattern_text: str) -> Union[bool, None]: # Validate TNIN checksum (format validation is handled by regex) return self._validate_checksum(sanitized_value) - def _validate_checksum(self, tnin: str) -> bool: """ Validate the checksum of Thai TNIN. diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/__init__.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/__init__.py index 9714fbaa04..da1d0abc28 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/__init__.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/__init__.py @@ -3,7 +3,4 @@ from .ahds_recognizer import AzureHealthDeidRecognizer from .azure_ai_language import AzureAILanguageRecognizer -__all__ = [ - "AzureAILanguageRecognizer", - "AzureHealthDeidRecognizer" -] +__all__ = ["AzureAILanguageRecognizer", "AzureHealthDeidRecognizer"] diff --git a/presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/ahds_recognizer.py b/presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/ahds_recognizer.py index 277036aceb..fdf2671474 100644 --- a/presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/ahds_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/ahds_recognizer.py @@ -38,7 +38,7 @@ def __init__( supported_entities: Optional[List[str]] = None, supported_language: str = "en", client: Optional[DeidentificationClient] = None, - **kwargs + **kwargs, ): """ Wrap PHI detection using Azure Health Data Services de-identification. @@ -53,10 +53,9 @@ def __init__( supported_language=supported_language, name="Azure Health Data Services Deidentification", version="1.0.0", - **kwargs + **kwargs, ) - endpoint = os.getenv("AHDS_ENDPOINT", None) if client is None: @@ -75,13 +74,13 @@ def __init__( # Use ChainedTokenCredential for production (secure by default) # Only use DefaultAzureCredential in development mode - if os.getenv('ENV') == 'development': + if os.getenv("ENV") == "development": credential = DefaultAzureCredential() # CodeQL [SM05139] OK for dev else: credential = ChainedTokenCredential( EnvironmentCredential(), WorkloadIdentityCredential(), - ManagedIdentityCredential() + ManagedIdentityCredential(), ) client = DeidentificationClient(endpoint, credential) @@ -123,8 +122,7 @@ def analyze( entities = self.supported_entities body = DeidentificationContent( - input_text=text, - operation_type=DeidentificationOperationType.TAG + input_text=text, operation_type=DeidentificationOperationType.TAG ) result = self.deid_client.deidentify_text(body) @@ -154,8 +152,8 @@ def _build_explanation(entity_type: str) -> AnalysisExplanation: recognizer=AzureHealthDeidRecognizer.__class__.__name__, original_score=1.0, textual_explanation=( - f"Identified as {entity_type} by Azure Health Data Services " - "Deidentification" + f"Identified as {entity_type} by Azure Health Data Services " + "Deidentification" ), ) return explanation diff --git a/presidio-analyzer/tests/conf/test_minimal_registry_conf.yaml b/presidio-analyzer/tests/conf/test_minimal_registry_conf.yaml index 18a2bd24fa..6cfd0c3898 100644 --- a/presidio-analyzer/tests/conf/test_minimal_registry_conf.yaml +++ b/presidio-analyzer/tests/conf/test_minimal_registry_conf.yaml @@ -1,4 +1,5 @@ global_regex_flags: 26 -recognizers: [] +recognizers: + - name: "CreditCardRecognizer" supported_languages: - en \ No newline at end of file diff --git a/presidio-analyzer/tests/test_configuration_validator.py b/presidio-analyzer/tests/test_configuration_validator.py index 36d74bc25e..0112d17c68 100644 --- a/presidio-analyzer/tests/test_configuration_validator.py +++ b/presidio-analyzer/tests/test_configuration_validator.py @@ -83,3 +83,38 @@ def test_file_path_validation_nonexistent(): ConfigurationValidator.validate_file_path("/nonexistent/file.yaml") assert "does not exist" in str(exc_info.value) + +def test_configuration_validator_analyzer_config_unknown_keys(): + """Test ConfigurationValidator rejects analyzer config with unknown keys.""" + invalid_config = { + "supported_languages": ["en"], + "default_score_threshold": 0.5, + "unknown_key": "some_value", + "another_typo": 123 + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_analyzer_configuration(invalid_config) + + error_message = str(exc_info.value) + assert "Unknown configuration key(s)" in error_message + assert "unknown_key" in error_message or "another_typo" in error_message + assert "Valid keys are" in error_message + +def test_configuration_validator_recognizer_registry_unknown_keys(): + """Test ConfigurationValidator rejects recognizer registry config with unknown keys.""" + invalid_config = { + "supported_languages": ["en"], + "global_regex_flags": 26, + "recognizers": [], + "invalid_field": "value", + "typo_key": 456 + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_recognizer_registry_configuration(invalid_config) + + error_message = str(exc_info.value) + # Pydantic will raise an error about extra fields + assert "extra" in error_message.lower() or "unexpected" in error_message.lower() or "invalid_field" in error_message or "typo_key" in error_message + diff --git a/presidio-analyzer/tests/test_recognizer_registry_provider.py b/presidio-analyzer/tests/test_recognizer_registry_provider.py index 33fc7634ba..13dd226148 100644 --- a/presidio-analyzer/tests/test_recognizer_registry_provider.py +++ b/presidio-analyzer/tests/test_recognizer_registry_provider.py @@ -83,37 +83,6 @@ def test_recognizer_registry_provider_conf_file_valid_missing_keys_fail(): RecognizerRegistryProvider(conf_file=test_yaml) -# def test_recognizer_registry_provider_with_registry_configuration(): -# registry_configuration = { -# "supported_languages": ["de", "es", "en"], -# "recognizers": [ -# { -# "name": "Zip code Recognizer", -# "supported_language": "en", -# "patterns": [ -# { -# "name": "zip code (weak)", -# "regex": "(\\b\\d{5}(?:\\-\\d{4})?\\b)", -# "score": 0.01, -# } -# ], -# "context": ["zip", "code"], -# "supported_entity": "ZIP", -# } -# ] -# } - - # provider = RecognizerRegistryProvider(registry_configuration=registry_configuration) - # recognizer_registry = provider.create_recognizer_registry() - # assert recognizer_registry.supported_languages == ["de", "es", "en"] - # assert recognizer_registry.global_regex_flags == re.DOTALL | re.MULTILINE | re.IGNORECASE - # assert len(recognizer_registry.recognizers) == 1 - # recognizer = recognizer_registry.recognizers[0] - # assert recognizer.name == "Zip code Recognizer" - # assert recognizer.supported_language == "en" - # assert recognizer.supported_entities == ["ZIP"] - # assert len(recognizer.patterns) == 1 - def test_recognizer_registry_provider_when_conf_file_and_registry_configuration_fail(): this_path = Path(__file__).parent.absolute() @@ -132,7 +101,7 @@ def test_recognizer_provider_with_minimal_creates_empty_registry(): provider = RecognizerRegistryProvider(conf_file=minimal_yaml) registry = provider.create_recognizer_registry() - assert len(registry.recognizers) == 0 + assert len(registry.recognizers) == 1 def test_recognizer_provider_with_nlp_reco_only_creates_nlp_recognizer(): diff --git a/presidio-analyzer/tests/test_yaml_recognizer_models.py b/presidio-analyzer/tests/test_yaml_recognizer_models.py index e5a6aebb82..27ca997e02 100644 --- a/presidio-analyzer/tests/test_yaml_recognizer_models.py +++ b/presidio-analyzer/tests/test_yaml_recognizer_models.py @@ -304,17 +304,18 @@ def test_custom_recognizer_config_invalid_deny_list_score(): def test_recognizer_registry_config_defaults(): - """Test registry config with defaults.""" - config = RecognizerRegistryConfig() + """Test registry config with defaults (requires at least one recognizer).""" + config = RecognizerRegistryConfig(recognizers=["CreditCardRecognizer"]) assert config.supported_languages is None assert config.global_regex_flags == 26 - assert config.recognizers == [] + assert len(config.recognizers) == 1 def test_recognizer_registry_config_valid_languages(): """Test registry with valid languages.""" config = RecognizerRegistryConfig( - supported_languages=["en", "es", "fr-CA"] + supported_languages=["en", "es", "fr-CA"], + recognizers=["CreditCardRecognizer"] ) assert config.supported_languages == ["en", "es", "fr-CA"] @@ -323,16 +324,40 @@ def test_recognizer_registry_config_invalid_language(): """Test registry with invalid language codes.""" with pytest.raises(ValidationError): RecognizerRegistryConfig( - supported_languages=["en", "invalid", "es"] + supported_languages=["en", "invalid", "es"], + recognizers=["CreditCardRecognizer"] ) def test_recognizer_registry_config_empty_languages(): """Test registry with empty languages list.""" - config = RecognizerRegistryConfig(supported_languages=[]) + config = RecognizerRegistryConfig( + supported_languages=[], + recognizers=["CreditCardRecognizer"] + ) assert config.supported_languages == [] +def test_recognizer_registry_config_empty_recognizers(): + """Test that empty recognizers list raises a validation error.""" + with pytest.raises(ValidationError) as exc_info: + RecognizerRegistryConfig( + recognizers=[], + global_regex_flags=26 + ) + assert "empty recognizers list" in str(exc_info.value).lower() + + +def test_recognizer_registry_config_missing_recognizers(): + """Test that missing recognizers field raises a validation error.""" + with pytest.raises(ValidationError) as exc_info: + RecognizerRegistryConfig( + supported_languages=["en"], + global_regex_flags=26 + ) + assert "empty recognizers list" in str(exc_info.value).lower() + + def test_recognizer_registry_config_string_recognizers(): """Test registry with string recognizers.""" config = RecognizerRegistryConfig( @@ -485,10 +510,6 @@ def test_predefined_recognizer_config_invalid_recognizer(): with pytest.raises(ValidationError) as exc_info: PredefinedRecognizerConfig(name="NonExistentRecognizer") - error_message = str(exc_info.value) - assert "Predefined recognizer 'NonExistentRecognizer' not found" in error_message - assert "Available predefined recognizers:" in error_message - def test_predefined_recognizer_config_case_sensitive(): """Test that recognizer names are case sensitive.""" @@ -521,10 +542,24 @@ def test_custom_recognizer_config_predefined_name_error(): ) error_message = str(exc_info.value) - assert "is a predefined recognizer but is marked as 'custom'" in error_message + assert "Recognizer 'CreditCardRecognizer' conflicts with a predefined" in error_message assert "Either use type: 'predefined' or choose a different name" in error_message +def test_custom_recognizer_config_predefined_name_error_without_required_fields(): + """Test that predefined name conflict is caught even when missing required fields.""" + with pytest.raises(ValidationError) as exc_info: + CustomRecognizerConfig( + name="UrlRecognizer", # This is a predefined recognizer + type="custom" + # Intentionally missing supported_entity, patterns, and deny_list + ) + + error_message = str(exc_info.value) + assert "conflicts with a predefined recognizer" in error_message or \ + "is a predefined recognizer but is marked as 'custom'" in error_message + + def test_custom_recognizer_config_unique_name_valid(): """Test that custom recognizers with unique names are valid.""" config = CustomRecognizerConfig( From bfd067bb817681cc708b81fc9807c4bc8f33ae4a Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 19 Nov 2025 13:41:09 +0200 Subject: [PATCH 15/30] Update presidio-analyzer/presidio_analyzer/input_validation/schemas.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- presidio-analyzer/presidio_analyzer/input_validation/schemas.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 26ace2e4e3..b1d8dc5a58 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -1,4 +1,4 @@ -import re +import regex as re from pathlib import Path from typing import Any, Dict, List, Union From f2e7fd9d707b4d94403198c1357d80fde2acfe36 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 19 Nov 2025 14:07:55 +0200 Subject: [PATCH 16/30] ruff and copilot review fixes --- .../presidio_analyzer/input_validation/schemas.py | 2 +- .../presidio_analyzer/nlp_engine/nlp_engine_provider.py | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 26ace2e4e3..c30afe2aba 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -1,7 +1,7 @@ -import re from pathlib import Path from typing import Any, Dict, List, Union +import regex as re from pydantic import ValidationError from .yaml_recognizer_models import RecognizerRegistryConfig diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 850b1b4218..dde4dcd135 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -73,13 +73,16 @@ def __init__( # _validate_nlp_engines method removed - all validation is now Pydantic-based - def _read_nlp_conf(self, conf_file: Union[Path, str]) -> Dict: + @staticmethod + def _read_nlp_conf(conf_file: Union[Path, str]) -> Dict: """Read NLP configuration from a YAML file.""" with open(conf_file) as file: return yaml.safe_load(file) + + @staticmethod def _get_full_conf_path( - self, default_conf_file: Union[Path, str] = "default.yaml" + default_conf_file: Union[Path, str] = "default.yaml" ) -> Path: """Return a Path to the default conf file.""" return Path(Path(__file__).parent, "../conf", default_conf_file) From 41328cc80f0da6a0fdf7e49e65d1bfee64435574 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 19 Nov 2025 14:13:40 +0200 Subject: [PATCH 17/30] Delete presidio-analyzer/test-output.xml --- presidio-analyzer/test-output.xml | 831 ------------------------------ 1 file changed, 831 deletions(-) delete mode 100644 presidio-analyzer/test-output.xml diff --git a/presidio-analyzer/test-output.xml b/presidio-analyzer/test-output.xml deleted file mode 100644 index a8378c41de..0000000000 --- a/presidio-analyzer/test-output.xml +++ /dev/null @@ -1,831 +0,0 @@ -/Users/omri.mendels/Library/Caches/pypoetry/virtualenvs/presidio-evaluator-nCKHFi6i-py3.10/bin/pytest \ No newline at end of file From bd2d0459be26ebf039499bdbc308104b1bdf7c8a Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 19 Nov 2025 22:15:13 +0200 Subject: [PATCH 18/30] fixed bad test --- .../input_validation/schemas.py | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index c30afe2aba..84669adaf2 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -99,6 +99,21 @@ def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any if not isinstance(config, dict): raise ValueError("Recognizer registry configuration must be a dictionary") + # Define valid top-level keys for recognizer registry configuration + valid_keys = { + "supported_languages", + "global_regex_flags", + "recognizers" + } + + # Check for unknown keys + unknown_keys = set(config.keys()) - valid_keys + if unknown_keys: + raise ValueError( + f"Unknown configuration key(s) in recognizer_registry: {sorted(unknown_keys)}. " + f"Valid keys are: {sorted(valid_keys)}" + ) + # Validate supported languages if "supported_languages" in config: ConfigurationValidator.validate_language_codes( @@ -117,6 +132,22 @@ def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: if not isinstance(config, dict): raise ValueError("Analyzer configuration must be a dictionary") + # Define valid top-level keys for analyzer configuration + valid_keys = { + "supported_languages", + "default_score_threshold", + "nlp_configuration", + "recognizer_registry" + } + + # Check for unknown keys + unknown_keys = set(config.keys()) - valid_keys + if unknown_keys: + raise ValueError( + f"Unknown configuration key(s) in analyzer configuration: {sorted(unknown_keys)}. " + f"Valid keys are: {sorted(valid_keys)}" + ) + # Validate supported languages if present if "supported_languages" in config: ConfigurationValidator.validate_language_codes( From 11a8169b6448a70e183d76c1fb7dab060c06dc1c Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Wed, 19 Nov 2025 22:16:26 +0200 Subject: [PATCH 19/30] ruff --- .../presidio_analyzer/input_validation/schemas.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 84669adaf2..661b8fe4fd 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -110,7 +110,8 @@ def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any unknown_keys = set(config.keys()) - valid_keys if unknown_keys: raise ValueError( - f"Unknown configuration key(s) in recognizer_registry: {sorted(unknown_keys)}. " + f"Unknown configuration key(s) in " + f"recognizer_registry: {sorted(unknown_keys)}. " f"Valid keys are: {sorted(valid_keys)}" ) @@ -144,7 +145,8 @@ def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: unknown_keys = set(config.keys()) - valid_keys if unknown_keys: raise ValueError( - f"Unknown configuration key(s) in analyzer configuration: {sorted(unknown_keys)}. " + f"Unknown configuration key(s) in " + f"analyzer configuration: {sorted(unknown_keys)}. " f"Valid keys are: {sorted(valid_keys)}" ) From 8054750098d70c2638a0a53423bfe48f42945bf8 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 20 Nov 2025 11:29:58 +0200 Subject: [PATCH 20/30] removed wrong test which assumes defaults --- .../test_analyzer_engine_missing_values.yaml | 7 ------- .../tests/test_analyzer_engine_provider.py | 21 ------------------- 2 files changed, 28 deletions(-) delete mode 100644 presidio-analyzer/tests/conf/test_analyzer_engine_missing_values.yaml diff --git a/presidio-analyzer/tests/conf/test_analyzer_engine_missing_values.yaml b/presidio-analyzer/tests/conf/test_analyzer_engine_missing_values.yaml deleted file mode 100644 index 3abaf9b003..0000000000 --- a/presidio-analyzer/tests/conf/test_analyzer_engine_missing_values.yaml +++ /dev/null @@ -1,7 +0,0 @@ -recognizer_registry: - global_regex_flags: 26 - -supported_languages: - - de - - en - - es \ No newline at end of file diff --git a/presidio-analyzer/tests/test_analyzer_engine_provider.py b/presidio-analyzer/tests/test_analyzer_engine_provider.py index ba45e00a31..6e7169a011 100644 --- a/presidio-analyzer/tests/test_analyzer_engine_provider.py +++ b/presidio-analyzer/tests/test_analyzer_engine_provider.py @@ -93,27 +93,6 @@ def test_analyzer_engine_provider_configuration_file(): assert engine.nlp_engine.engine_name == "spacy" -def test_analyzer_engine_provider_configuration_file_missing_values_expect_defaults( - mandatory_recognizers, -): - test_yaml, _, _ = get_full_paths("conf/test_analyzer_engine_missing_values.yaml") - provider = AnalyzerEngineProvider(test_yaml) - engine = provider.create_engine() - assert engine.supported_languages == ["de", "en", "es"] - assert engine.default_score_threshold == 0 - recognizer_registry = engine.registry - assert ( - recognizer_registry.global_regex_flags - == re.DOTALL | re.MULTILINE | re.IGNORECASE - ) - assert recognizer_registry.supported_languages == ["de", "en", "es"] - names = [recognizer.name for recognizer in recognizer_registry.recognizers] - for predefined_recognizer in mandatory_recognizers: - assert predefined_recognizer in names - assert isinstance(engine.nlp_engine, SpacyNlpEngine) - assert engine.nlp_engine.engine_name == "spacy" - - def test_analyzer_engine_provider_defaults(mandatory_recognizers): provider = AnalyzerEngineProvider() engine = provider.create_engine() From 86baa12b241385c69d68c1a5a005e76fe13c154d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 20 Nov 2025 12:15:07 +0200 Subject: [PATCH 21/30] Clean up comments in recognizers_loader_utils.py Removed comments about excluded fields in recognizer initialization. --- .../recognizer_registry/recognizers_loader_utils.py | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py index 36266ddb54..b03db674cd 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py @@ -298,12 +298,9 @@ def get( """ recognizer_instances = [] predefined, custom = RecognizerListLoader._split_recognizers(recognizers) - # Exclude Pydantic-normalized fields that should not - # be passed to recognizer constructors - # Note: We exclude both supported_entity - # and supported_entities here because we'll handle - # the conversion in _prepare_recognizer_kwargs + predefined_to_exclude = {"enabled", "type", "supported_languages", "name"} + # For custom recognizers, we keep 'supported_languages' # and don't exclude 'supported_entity' # because PatternRecognizer needs it From 08c15ac3fb5f3f771116bdc04746fd696271d379 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 30 Nov 2025 14:53:53 +0200 Subject: [PATCH 22/30] updates to PR following review --- .../analyzer_engine_provider.py | 3 -- .../input_validation/__init__.py | 2 ++ .../input_validation/language_validation.py | 18 ++++++++++ .../input_validation/schemas.py | 20 ++++------- .../yaml_recognizer_models.py | 36 +++++-------------- .../nlp_engine/nlp_engine_provider.py | 4 +-- .../recognizers_loader_utils.py | 28 ++++++--------- .../tests/test_configuration_validator.py | 24 ------------- .../tests/test_language_validation.py | 18 ++++++++++ .../test_recognizer_registry_provider.py | 4 --- 10 files changed, 63 insertions(+), 94 deletions(-) create mode 100644 presidio-analyzer/presidio_analyzer/input_validation/language_validation.py create mode 100644 presidio-analyzer/tests/test_language_validation.py diff --git a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py index c338b41544..1ca9c4665e 100644 --- a/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/analyzer_engine_provider.py @@ -73,9 +73,6 @@ def get_configuration( with open(self._get_full_conf_path()) as file: configuration = yaml.safe_load(file) - # Validate configuration using Pydantic-based ConfigurationValidator - from presidio_analyzer.input_validation import ConfigurationValidator - ConfigurationValidator.validate_analyzer_configuration(configuration) logger.debug("Analyzer configuration validation passed") diff --git a/presidio-analyzer/presidio_analyzer/input_validation/__init__.py b/presidio-analyzer/presidio_analyzer/input_validation/__init__.py index 28b55fe8e0..aadea64675 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/__init__.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/__init__.py @@ -1,5 +1,6 @@ """Configuration validation module for Presidio.""" +from .language_validation import validate_language_codes from .schemas import ConfigurationValidator from .yaml_recognizer_models import ( BaseRecognizerConfig, @@ -10,6 +11,7 @@ ) __all__ = [ + "validate_language_codes", "ConfigurationValidator", "BaseRecognizerConfig", "CustomRecognizerConfig", diff --git a/presidio-analyzer/presidio_analyzer/input_validation/language_validation.py b/presidio-analyzer/presidio_analyzer/input_validation/language_validation.py new file mode 100644 index 0000000000..c5f0171624 --- /dev/null +++ b/presidio-analyzer/presidio_analyzer/input_validation/language_validation.py @@ -0,0 +1,18 @@ +from typing import List + +import regex as re + + +def validate_language_codes(languages: List[str]) -> None: + """Validate language codes format. + + :param languages: List of languages to validate. + """ + language_code_regex = re.compile(r"^[a-z]{2}(-[A-Z]{2})?$") + + for lang in languages: + if not re.match(language_code_regex, lang): + raise ValueError( + f"Invalid language code format: {lang}. " + f"Expected format: 'en' or 'en-US'" + ) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index 661b8fe4fd..c78c139687 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -1,9 +1,9 @@ from pathlib import Path from typing import Any, Dict, List, Union -import regex as re from pydantic import ValidationError +from . import validate_language_codes from .yaml_recognizer_models import RecognizerRegistryConfig @@ -16,12 +16,7 @@ def validate_language_codes(languages: List[str]) -> List[str]: :param languages: List of languages to validate. """ - for lang in languages: - if not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", lang): - raise ValueError( - f"Invalid language code format: {lang}. " - f"Expected format: 'en' or 'en-US'" - ) + validate_language_codes(languages) return languages @staticmethod @@ -88,10 +83,7 @@ def validate_recognizer_registry_configuration( # Use model_dump() without exclude_unset to include default values return validated_config.model_dump(exclude_unset=False) except ValidationError as e: - raise ValueError(f"Invalid recognizer registry configuration: {e}") - except ImportError: - # Fallback to basic validation if models not available - return ConfigurationValidator._validate_recognizer_registry_basic(config) + raise ValueError("Invalid recognizer registry configuration") from e @staticmethod def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any]: @@ -117,7 +109,7 @@ def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any # Validate supported languages if "supported_languages" in config: - ConfigurationValidator.validate_language_codes( + validate_language_codes( config["supported_languages"] ) @@ -129,7 +121,7 @@ def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any @staticmethod def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: - """Validate analyzer engine validation.""" + """Validate analyzer engine configuration.""" if not isinstance(config, dict): raise ValueError("Analyzer configuration must be a dictionary") @@ -152,7 +144,7 @@ def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: # Validate supported languages if present if "supported_languages" in config: - ConfigurationValidator.validate_language_codes( + validate_language_codes( config["supported_languages"] ) diff --git a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py index e6f4a31e9c..0f5da5bee2 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py @@ -1,12 +1,10 @@ """Pydantic models for YAML recognizer configurations.""" -import logging from typing import Any, Dict, List, Optional, Union -import regex as re from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator -logger = logging.getLogger("presidio-analyzer") +from presidio_analyzer.input_validation import validate_language_codes class LanguageContextConfig(BaseModel): @@ -25,10 +23,7 @@ class LanguageContextConfig(BaseModel): @classmethod def validate_language_code(cls, v: str) -> str: """Validate language code format.""" - if not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", v): - raise ValueError( - f"Invalid language code format: {v}. Expected format: 'en' or 'en-US'" - ) + validate_language_codes([v]) return v @@ -79,8 +74,7 @@ class BaseRecognizerConfig(BaseModel): @classmethod def validate_single_language(cls, v: Optional[str]) -> Optional[str]: """Validate single language code format.""" - if v and not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", v): - raise ValueError(f"Invalid language code format: {v}") + validate_language_codes([v]) return v @model_validator(mode="after") @@ -91,8 +85,6 @@ def validate_language_configuration(self): "Cannot specify both 'supported_language' and 'supported_languages'" ) - # If neither is specified, this is allowed for - # predefined recognizers (defaults will be used) return self @model_validator(mode="after") @@ -234,7 +226,10 @@ def check_predefined_name_conflict(cls, data: Any) -> Any: @field_validator("patterns") @classmethod def validate_patterns(cls, patterns: Optional[List[Dict]]) -> Optional[List[Dict]]: - """Validate single language code format.""" + """Validate single language code format. + + :param patterns: List of patterns + """ if patterns and not isinstance(patterns, list): raise ValueError(f"Patterns should be a list: {patterns}") @@ -253,13 +248,6 @@ def validate_patterns(cls, patterns: Optional[List[Dict]]) -> Optional[List[Dict raise ValueError(f"Pattern score should be between 0 and 1: {pattern}") return patterns - @field_validator("supported_language") - @classmethod - def validate_single_language(cls, v: Optional[str]) -> Optional[str]: - """Validate single language code format.""" - if v and not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", v): - raise ValueError(f"Invalid language code format: {v}") - return v @model_validator(mode="after") def validate_configuration(self): @@ -321,9 +309,7 @@ def validate_language_codes(cls, v: Optional[List[str]]) -> Optional[List[str]]: if len(v) == 0: return [] - for lang in v: - if not re.match(r"^[a-z]{2}(-[A-Z]{2})?$", lang): - raise ValueError(f"Invalid language code format: {lang}") + validate_language_codes(v) return v @model_validator(mode="after") @@ -350,12 +336,6 @@ def validate_languages_for_custom_recognizers(self): return self - @field_validator("global_regex_flags") - @classmethod - def validate_global_regex_flags(cls, v: int) -> int: - """Validate global_regex_flags and warn if using default.""" - return v - @model_validator(mode="after") def validate_recognizers_not_empty(self): """Ensure recognizers list is not empty after all defaults are applied.""" diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index dde4dcd135..8ad0b336ae 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -42,7 +42,6 @@ def __init__( if nlp_engines is None: nlp_engines = (SpacyNlpEngine, StanzaNlpEngine, TransformersNlpEngine) - # No legacy validation - just assign the engines self.nlp_engines = { engine.engine_name: engine for engine in nlp_engines if engine.is_available } @@ -70,8 +69,7 @@ def __init__( conf_file = self._get_full_conf_path() logger.debug(f"Reading default conf file from {conf_file}") self.nlp_configuration = self._read_nlp_conf(conf_file) - - # _validate_nlp_engines method removed - all validation is now Pydantic-based + ConfigurationValidator.validate_nlp_configuration(self.nlp_configuration) @staticmethod def _read_nlp_conf(conf_file: Union[Path, str]) -> Dict: diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py index b03db674cd..5905e4e340 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py @@ -120,6 +120,13 @@ def get_recognizer_name(recognizer_conf: Union[Dict[str, Any], str]) -> str: return recognizer_conf return recognizer_conf["name"] + @staticmethod + def _convert_supported_entities_to_entity(conf: Dict[str, Any]) -> None: + if "supported_entities" in conf: + supported_entities = conf.pop("supported_entities") + if "supported_entity" not in conf and supported_entities: + conf["supported_entity"] = supported_entities[0] + @staticmethod def _is_language_supported_globally( recognizer: EntityRecognizer, @@ -153,10 +160,7 @@ def _create_custom_recognizers( # Transform supported_entities -> supported_entity # (PatternRecognizer expects singular) - if "supported_entities" in conf_copy: - supported_entities = conf_copy.pop("supported_entities") - if "supported_entity" not in conf_copy and supported_entities: - conf_copy["supported_entity"] = supported_entities[0] + RecognizerListLoader._convert_supported_entities_to_entity(conf_copy) return [PatternRecognizer.from_dict(conf_copy)] @@ -173,10 +177,7 @@ def _create_custom_recognizers( # Transform supported_entities -> supported_entity # (PatternRecognizer expects singular) - if "supported_entities" in copied_recognizer: - supported_entities = copied_recognizer.pop("supported_entities") - if "supported_entity" not in copied_recognizer and supported_entities: - copied_recognizer["supported_entity"] = supported_entities[0] + RecognizerListLoader._convert_supported_entities_to_entity(copied_recognizer) kwargs = {**copied_recognizer, **supported_language} recognizers.append(PatternRecognizer.from_dict(kwargs)) @@ -262,16 +263,7 @@ def _prepare_recognizer_kwargs( if RecognizerListLoader._is_pattern_recognizer(recognizer_cls): # Convert supported_entities (plural) to supported_entity # (singular) if present - if "supported_entities" in kwargs: - supported_entities = kwargs.pop("supported_entities") - # Only set supported_entity if we have valid entities - # and it's not already set - if ( - supported_entities - and len(supported_entities) > 0 - and "supported_entity" not in kwargs - ): - kwargs["supported_entity"] = supported_entities[0] + RecognizerListLoader._convert_supported_entities_to_entity(kwargs) # Remove supported_entity if it's None # to allow the recognizer's default to be used diff --git a/presidio-analyzer/tests/test_configuration_validator.py b/presidio-analyzer/tests/test_configuration_validator.py index 79dd056b61..f84e0e611e 100644 --- a/presidio-analyzer/tests/test_configuration_validator.py +++ b/presidio-analyzer/tests/test_configuration_validator.py @@ -4,21 +4,6 @@ from presidio_analyzer.input_validation import ConfigurationValidator -def test_configuration_validator_language_codes_valid(): - """Test ConfigurationValidator accepts valid language codes.""" - valid_languages = ["en", "es", "fr", "en-US", "es-ES"] - validated = ConfigurationValidator.validate_language_codes(valid_languages) - assert validated == valid_languages - -def test_configuration_validator_language_codes_invalid(): - """Test ConfigurationValidator rejects invalid language codes.""" - invalid_languages = ["invalid_lang"] - - with pytest.raises(ValueError) as exc_info: - ConfigurationValidator.validate_language_codes(invalid_languages) - - assert "Invalid language code format" in str(exc_info.value) - def test_configuration_validator_nlp_config_valid(): """Test ConfigurationValidator accepts valid NLP validation.""" valid_config = { @@ -96,11 +81,6 @@ def test_configuration_validator_analyzer_config_unknown_keys(): with pytest.raises(ValueError) as exc_info: ConfigurationValidator.validate_analyzer_configuration(invalid_config) - error_message = str(exc_info.value) - assert "Unknown configuration key(s)" in error_message - assert "unknown_key" in error_message or "another_typo" in error_message - assert "Valid keys are" in error_message - def test_configuration_validator_recognizer_registry_unknown_keys(): """Test ConfigurationValidator rejects recognizer registry config with unknown keys.""" invalid_config = { @@ -114,7 +94,3 @@ def test_configuration_validator_recognizer_registry_unknown_keys(): with pytest.raises(ValueError) as exc_info: ConfigurationValidator.validate_recognizer_registry_configuration(invalid_config) - error_message = str(exc_info.value) - # Pydantic will raise an error about extra fields - assert "extra" in error_message.lower() or "unexpected" in error_message.lower() or "invalid_field" in error_message or "typo_key" in error_message - diff --git a/presidio-analyzer/tests/test_language_validation.py b/presidio-analyzer/tests/test_language_validation.py new file mode 100644 index 0000000000..7cc22909ff --- /dev/null +++ b/presidio-analyzer/tests/test_language_validation.py @@ -0,0 +1,18 @@ +import pytest + +from presidio_analyzer.input_validation import validate_language_codes + + +def test_configuration_validator_language_codes_no_exception(): + """Test ConfigurationValidator accepts valid language codes.""" + valid_languages = ["en", "es", "fr", "en-US", "es-ES"] + validate_language_codes(valid_languages) + +def test_configuration_validator_language_codes_invalid(): + """Test ConfigurationValidator rejects invalid language codes.""" + invalid_languages = ["invalid_lang"] + + with pytest.raises(ValueError) as exc_info: + validate_language_codes(invalid_languages) + + assert "Invalid language code format" in str(exc_info.value) diff --git a/presidio-analyzer/tests/test_recognizer_registry_provider.py b/presidio-analyzer/tests/test_recognizer_registry_provider.py index 042793f636..17771a1d3e 100644 --- a/presidio-analyzer/tests/test_recognizer_registry_provider.py +++ b/presidio-analyzer/tests/test_recognizer_registry_provider.py @@ -226,10 +226,6 @@ def test_recognizers_none_raises_exception(): with pytest.raises(ValueError) as exc_info: ConfigurationValidator.validate_recognizer_registry_configuration(config) - assert "recognizers" in str(exc_info.value) - assert "required" in str(exc_info.value).lower() - - def test_direct_validation_with_missing_global_regex_flags(): """Test direct validation without global_regex_flags succeeds with default.""" From 4dd06756633a6c8292b3956e70488fa782dde168 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 1 Dec 2025 14:16:03 +0200 Subject: [PATCH 23/30] added more tests --- .../tests/test_configuration_validator.py | 409 +++++++++++++++++- 1 file changed, 391 insertions(+), 18 deletions(-) diff --git a/presidio-analyzer/tests/test_configuration_validator.py b/presidio-analyzer/tests/test_configuration_validator.py index f84e0e611e..ad5ed5687a 100644 --- a/presidio-analyzer/tests/test_configuration_validator.py +++ b/presidio-analyzer/tests/test_configuration_validator.py @@ -4,6 +4,110 @@ from presidio_analyzer.input_validation import ConfigurationValidator +# ========== Language Code Validation Tests ========== + +def test_validate_language_codes_valid(): + """Test valid language codes.""" + valid_languages = ["en", "es", "fr", "de"] + result = ConfigurationValidator.validate_language_codes(valid_languages) + assert result == valid_languages + + +def test_validate_language_codes_valid_with_country(): + """Test valid language codes with country codes.""" + valid_languages = ["en-US", "en-GB", "es-ES"] + result = ConfigurationValidator.validate_language_codes(valid_languages) + assert result == valid_languages + + +def test_validate_language_codes_invalid_format(): + """Test invalid language code format.""" + invalid_languages = ["english", "EN", "e", "en-us"] + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_language_codes(invalid_languages) + assert "Invalid language code format" in str(exc_info.value) + + +def test_validate_language_codes_mixed_invalid(): + """Test mixed valid and invalid language codes.""" + mixed_languages = ["en", "invalid_lang", "es"] + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_language_codes(mixed_languages) + assert "Invalid language code format" in str(exc_info.value) + + +# ========== File Path Validation Tests ========== + +def test_file_path_validation_success(tmp_path): + """Test file path validation with existing file.""" + test_file = tmp_path / "test.yaml" + test_file.write_text("test: content") + + validated_path = ConfigurationValidator.validate_file_path(str(test_file)) + assert validated_path == test_file + + +def test_file_path_validation_with_path_object(tmp_path): + """Test file path validation with Path object.""" + test_file = tmp_path / "test.yaml" + test_file.write_text("test: content") + + validated_path = ConfigurationValidator.validate_file_path(test_file) + assert validated_path == test_file + + +def test_file_path_validation_nonexistent(): + """Test file path validation with non-existent file.""" + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_file_path("/nonexistent/file.yaml") + + assert "does not exist" in str(exc_info.value) + + +def test_file_path_validation_directory(tmp_path): + """Test file path validation with directory instead of file.""" + test_dir = tmp_path / "test_directory" + test_dir.mkdir() + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_file_path(test_dir) + + assert "not a file" in str(exc_info.value) + + +# ========== Score Threshold Validation Tests ========== + +def test_validate_score_threshold_valid(): + """Test valid score thresholds.""" + valid_thresholds = [0.0, 0.5, 1.0, 0.25, 0.75] + for threshold in valid_thresholds: + result = ConfigurationValidator.validate_score_threshold(threshold) + assert result == threshold + + +def test_validate_score_threshold_above_one(): + """Test score threshold above 1.0.""" + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_score_threshold(1.5) + assert "must be between 0.0 and 1.0" in str(exc_info.value) + + +def test_validate_score_threshold_negative(): + """Test negative score threshold.""" + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_score_threshold(-0.1) + assert "must be between 0.0 and 1.0" in str(exc_info.value) + + +def test_validate_score_threshold_way_above(): + """Test score threshold far above valid range.""" + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_score_threshold(100.0) + assert "must be between 0.0 and 1.0" in str(exc_info.value) + + +# ========== NLP Configuration Validation Tests ========== + def test_configuration_validator_nlp_config_valid(): """Test ConfigurationValidator accepts valid NLP validation.""" valid_config = { @@ -16,6 +120,21 @@ def test_configuration_validator_nlp_config_valid(): validated = ConfigurationValidator.validate_nlp_configuration(valid_config) assert validated == valid_config + +def test_nlp_config_multiple_models(): + """Test NLP configuration with multiple models.""" + valid_config = { + "nlp_engine_name": "spacy", + "models": [ + {"lang_code": "en", "model_name": "en_core_web_lg"}, + {"lang_code": "es", "model_name": "es_core_news_lg"} + ] + } + + validated = ConfigurationValidator.validate_nlp_configuration(valid_config) + assert validated == valid_config + + def test_configuration_validator_nlp_config_missing_fields(): """Test ConfigurationValidator rejects NLP config with missing required fields.""" invalid_config = { @@ -28,6 +147,162 @@ def test_configuration_validator_nlp_config_missing_fields(): assert "missing required fields" in str(exc_info.value) + +def test_nlp_config_missing_nlp_engine_name(): + """Test NLP config missing nlp_engine_name.""" + invalid_config = { + "models": [{"lang_code": "en", "model_name": "en_core_web_lg"}] + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_nlp_configuration(invalid_config) + assert "missing required fields" in str(exc_info.value) + + +def test_nlp_config_not_dict(): + """Test NLP configuration that is not a dictionary.""" + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_nlp_configuration("not a dict") + assert "must be a dictionary" in str(exc_info.value) + + +def test_nlp_config_models_not_list(): + """Test NLP configuration with models not as list.""" + invalid_config = { + "nlp_engine_name": "spacy", + "models": {"lang_code": "en", "model_name": "en_core_web_lg"} + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_nlp_configuration(invalid_config) + assert "Models must be a non-empty list" in str(exc_info.value) + + +def test_nlp_config_models_empty_list(): + """Test NLP configuration with empty models list.""" + invalid_config = { + "nlp_engine_name": "spacy", + "models": [] + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_nlp_configuration(invalid_config) + assert "Models must be a non-empty list" in str(exc_info.value) + + +def test_nlp_config_model_not_dict(): + """Test NLP configuration with model that is not a dict.""" + invalid_config = { + "nlp_engine_name": "spacy", + "models": ["en_core_web_lg"] + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_nlp_configuration(invalid_config) + assert "Each model must be a dictionary" in str(exc_info.value) + + +def test_nlp_config_model_missing_lang_code(): + """Test NLP configuration with model missing lang_code.""" + invalid_config = { + "nlp_engine_name": "spacy", + "models": [{"model_name": "en_core_web_lg"}] + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_nlp_configuration(invalid_config) + assert "must have 'lang_code' and 'model_name'" in str(exc_info.value) + + +def test_nlp_config_model_missing_model_name(): + """Test NLP configuration with model missing model_name.""" + invalid_config = { + "nlp_engine_name": "spacy", + "models": [{"lang_code": "en"}] + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_nlp_configuration(invalid_config) + assert "must have 'lang_code' and 'model_name'" in str(exc_info.value) + + +# ========== Recognizer Registry Configuration Tests ========== + +def test_recognizer_registry_valid_custom_recognizer(): + """Test valid recognizer registry configuration with custom recognizer.""" + valid_config = { + "supported_languages": ["en"], + "recognizers": [ + { + "name": "CustomRecognizer", + "type": "custom", + "supported_entity": "CUSTOM_ENTITY", + "patterns": [ + { + "name": "pattern1", + "regex": "test", + "score": 0.5 + } + ] + } + ] + } + + result = ConfigurationValidator.validate_recognizer_registry_configuration(valid_config) + assert result is not None + assert "recognizers" in result + + +def test_recognizer_registry_valid_predefined_recognizer(): + """Test valid recognizer registry configuration with predefined recognizer.""" + valid_config = { + "supported_languages": ["en"], + "recognizers": [ + { + "name": "CreditCardRecognizer", + "type": "predefined" + } + ] + } + + result = ConfigurationValidator.validate_recognizer_registry_configuration(valid_config) + assert result is not None + + +def test_recognizer_registry_empty_recognizers_list(): + """Test recognizer registry with empty recognizers list.""" + invalid_config = { + "supported_languages": ["en"], + "recognizers": [] + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_recognizer_registry_configuration(invalid_config) + assert "Invalid recognizer registry configuration" in str(exc_info.value) + + +def test_configuration_validator_recognizer_registry_unknown_keys(): + """Test ConfigurationValidator rejects recognizer registry config with unknown keys.""" + invalid_config = { + "supported_languages": ["en"], + "global_regex_flags": 26, + "recognizers": [ + { + "name": "CreditCardRecognizer", + "type": "predefined" + } + ], + "invalid_field": "value", + "typo_key": 456 + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_recognizer_registry_configuration(invalid_config) + assert "Invalid recognizer registry configuration" in str(exc_info.value) + + +# ========== Analyzer Configuration Tests ========== + def test_configuration_validator_analyzer_config_valid(): """Test ConfigurationValidator accepts valid analyzer validation.""" valid_config = { @@ -42,6 +317,36 @@ def test_configuration_validator_analyzer_config_valid(): validated = ConfigurationValidator.validate_analyzer_configuration(valid_config) assert validated == valid_config + +def test_analyzer_config_minimal(): + """Test minimal valid analyzer configuration.""" + valid_config = { + "supported_languages": ["en"] + } + + validated = ConfigurationValidator.validate_analyzer_configuration(valid_config) + assert validated == valid_config + + +def test_analyzer_config_with_recognizer_registry(): + """Test analyzer configuration with recognizer registry.""" + valid_config = { + "supported_languages": ["en"], + "recognizer_registry": { + "supported_languages": ["en"], + "recognizers": [ + { + "name": "CreditCardRecognizer", + "type": "predefined" + } + ] + } + } + + validated = ConfigurationValidator.validate_analyzer_configuration(valid_config) + assert validated is not None + + def test_configuration_validator_analyzer_config_invalid_threshold(): """Test ConfigurationValidator rejects invalid score threshold.""" invalid_config = { @@ -54,20 +359,13 @@ def test_configuration_validator_analyzer_config_invalid_threshold(): assert "must be between 0.0 and 1.0" in str(exc_info.value) -def test_file_path_validation_success(tmp_path): - """Test file path validation with existing file.""" - test_file = tmp_path / "test.yaml" - test_file.write_text("test: content") - - validated_path = ConfigurationValidator.validate_file_path(str(test_file)) - assert validated_path == test_file -def test_file_path_validation_nonexistent(): - """Test file path validation with non-existent file.""" +def test_analyzer_config_not_dict(): + """Test analyzer configuration that is not a dictionary.""" with pytest.raises(ValueError) as exc_info: - ConfigurationValidator.validate_file_path("/nonexistent/file.yaml") + ConfigurationValidator.validate_analyzer_configuration("not a dict") + assert "must be a dictionary" in str(exc_info.value) - assert "does not exist" in str(exc_info.value) def test_configuration_validator_analyzer_config_unknown_keys(): """Test ConfigurationValidator rejects analyzer config with unknown keys.""" @@ -80,17 +378,92 @@ def test_configuration_validator_analyzer_config_unknown_keys(): with pytest.raises(ValueError) as exc_info: ConfigurationValidator.validate_analyzer_configuration(invalid_config) + assert "Unknown configuration key" in str(exc_info.value) -def test_configuration_validator_recognizer_registry_unknown_keys(): - """Test ConfigurationValidator rejects recognizer registry config with unknown keys.""" + +def test_analyzer_config_invalid_languages(): + """Test analyzer configuration with invalid language codes.""" + invalid_config = { + "supported_languages": ["invalid_lang"] + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_analyzer_configuration(invalid_config) + assert "Invalid language code format" in str(exc_info.value) + + +def test_analyzer_config_invalid_nlp_nested(): + """Test analyzer configuration with invalid nested NLP config.""" invalid_config = { "supported_languages": ["en"], - "global_regex_flags": 26, - "recognizers": [], - "invalid_field": "value", - "typo_key": 456 + "nlp_configuration": { + "nlp_engine_name": "spacy" + # Missing models + } } with pytest.raises(ValueError) as exc_info: - ConfigurationValidator.validate_recognizer_registry_configuration(invalid_config) + ConfigurationValidator.validate_analyzer_configuration(invalid_config) + assert "missing required fields" in str(exc_info.value) + + +def test_analyzer_config_invalid_recognizer_registry_nested(): + """Test analyzer configuration with invalid nested recognizer registry.""" + invalid_config = { + "supported_languages": ["en"], + "recognizer_registry": { + "recognizers": [] # Empty list not allowed + } + } + + with pytest.raises(ValueError) as exc_info: + ConfigurationValidator.validate_analyzer_configuration(invalid_config) + assert "Invalid recognizer registry configuration" in str(exc_info.value) + + +def test_analyzer_config_threshold_at_boundaries(): + """Test analyzer configuration with threshold at boundaries.""" + # Test 0.0 + config_zero = { + "supported_languages": ["en"], + "default_score_threshold": 0.0 + } + validated = ConfigurationValidator.validate_analyzer_configuration(config_zero) + assert validated["default_score_threshold"] == 0.0 + + # Test 1.0 + config_one = { + "supported_languages": ["en"], + "default_score_threshold": 1.0 + } + validated = ConfigurationValidator.validate_analyzer_configuration(config_one) + assert validated["default_score_threshold"] == 1.0 + + +def test_analyzer_config_all_fields(): + """Test analyzer configuration with all fields populated.""" + valid_config = { + "supported_languages": ["en", "es"], + "default_score_threshold": 0.7, + "nlp_configuration": { + "nlp_engine_name": "spacy", + "models": [ + {"lang_code": "en", "model_name": "en_core_web_lg"}, + {"lang_code": "es", "model_name": "es_core_news_lg"} + ] + }, + "recognizer_registry": { + "supported_languages": ["en"], + "global_regex_flags": 26, + "recognizers": [ + { + "name": "CreditCardRecognizer", + "type": "predefined" + } + ] + } + } + + validated = ConfigurationValidator.validate_analyzer_configuration(valid_config) + assert validated == valid_config From 3bf8e7b0f4c54dd87ca9eeceb9455e34c63ac70d Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Mon, 1 Dec 2025 14:22:58 +0200 Subject: [PATCH 24/30] removed bandit from defender-for-devops Removed 'bandit' from the tools list in the workflow. --- .github/workflows/defender-for-devops.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/defender-for-devops.yml b/.github/workflows/defender-for-devops.yml index 7bda40e720..d168936fdf 100644 --- a/.github/workflows/defender-for-devops.yml +++ b/.github/workflows/defender-for-devops.yml @@ -50,7 +50,7 @@ jobs: env: GDN_CHECKOV_SKIPPATH: 'docs' with: - tools: bandit, checkov, templateanalyzer, trivy + tools: checkov, templateanalyzer, trivy - name: Upload results to Security tab uses: github/codeql-action/upload-sarif@v4 From 8f98b645235f234c4fe59990f765f4c15d3a2a32 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 4 Dec 2025 10:24:53 +0200 Subject: [PATCH 25/30] more unit tests --- .../tests/test_analyzer_engine_provider.py | 231 +++++++++++++++ .../tests/test_pattern_recognizer.py | 279 ++++++++++++++++++ 2 files changed, 510 insertions(+) diff --git a/presidio-analyzer/tests/test_analyzer_engine_provider.py b/presidio-analyzer/tests/test_analyzer_engine_provider.py index 6e7169a011..3060ecd549 100644 --- a/presidio-analyzer/tests/test_analyzer_engine_provider.py +++ b/presidio-analyzer/tests/test_analyzer_engine_provider.py @@ -330,3 +330,234 @@ def test_analyzer_engine_provider_one_custom_recognizer(): assert len(analyzer_engine.get_recognizers()) == 1 assert analyzer_engine.analyze("My zip code is 12345", language="en")[0].score == pytest.approx(0.4) + +def test_analyzer_engine_provider_invalid_analyzer_conf_file(): + """Test that invalid analyzer configuration file path raises error.""" + with pytest.raises(ValueError): + AnalyzerEngineProvider(analyzer_engine_conf_file="/nonexistent/path/file.yaml") + + +def test_analyzer_engine_provider_invalid_nlp_conf_file(): + """Test that invalid NLP engine configuration file path raises error.""" + with pytest.raises(ValueError): + AnalyzerEngineProvider(nlp_engine_conf_file="/nonexistent/path/file.yaml") + + +def test_analyzer_engine_provider_invalid_registry_conf_file(): + """Test that invalid recognizer registry configuration file path raises error.""" + with pytest.raises(ValueError): + AnalyzerEngineProvider(recognizer_registry_conf_file="/nonexistent/path/file.yaml") + + +def test_analyzer_engine_provider_get_configuration_with_nonexistent_file(): + """Test get_configuration falls back to default when file doesn't exist.""" + provider = AnalyzerEngineProvider() + + # Test with nonexistent file - should fall back to default + config = provider.get_configuration("/tmp/nonexistent_config_file_12345.yaml") + + # Should return a valid configuration (the default one) + assert config is not None + assert isinstance(config, dict) + + +def test_analyzer_engine_provider_get_configuration_with_invalid_yaml(): + """Test get_configuration handles invalid YAML gracefully.""" + import tempfile + import os + + # Create a temporary file with invalid YAML + with tempfile.NamedTemporaryFile(mode='w', suffix='.yaml', delete=False) as f: + f.write("invalid: yaml: content: [[[") + temp_file = f.name + + try: + provider = AnalyzerEngineProvider() + config = provider.get_configuration(temp_file) + + # Should fall back to default configuration + assert config is not None + assert isinstance(config, dict) + finally: + os.unlink(temp_file) + + +def test_analyzer_engine_provider_get_full_conf_path(): + """Test _get_full_conf_path static method.""" + from pathlib import Path + + path = AnalyzerEngineProvider._get_full_conf_path() + + assert isinstance(path, Path) + assert path.name == "default_analyzer.yaml" + assert path.exists() + + +def test_analyzer_engine_provider_get_full_conf_path_custom_file(): + """Test _get_full_conf_path with custom filename.""" + from pathlib import Path + + path = AnalyzerEngineProvider._get_full_conf_path("custom_file.yaml") + + assert isinstance(path, Path) + assert path.name == "custom_file.yaml" + + +def test_analyzer_engine_provider_configuration_property(): + """Test that configuration property is set correctly.""" + provider = AnalyzerEngineProvider() + + assert provider.configuration is not None + assert isinstance(provider.configuration, dict) + + +def test_analyzer_engine_provider_nlp_engine_conf_file_property(): + """Test that nlp_engine_conf_file property is stored correctly.""" + test_yaml, nlp_yaml, _ = get_full_paths( + "conf/simple_analyzer_engine.yaml", + "conf/default.yaml", + ) + + provider = AnalyzerEngineProvider( + analyzer_engine_conf_file=test_yaml, + nlp_engine_conf_file=nlp_yaml, + ) + + assert provider.nlp_engine_conf_file == nlp_yaml + + +def test_analyzer_engine_provider_recognizer_registry_conf_file_property(): + """Test that recognizer_registry_conf_file property is stored correctly.""" + test_yaml, _, registry_yaml = get_full_paths( + "conf/simple_analyzer_engine.yaml", + None, + "conf/test_recognizer_registry.yaml", + ) + + provider = AnalyzerEngineProvider( + analyzer_engine_conf_file=test_yaml, + recognizer_registry_conf_file=registry_yaml, + ) + + assert provider.recognizer_registry_conf_file == registry_yaml + + +def test_analyzer_engine_provider_load_nlp_engine_from_conf(): + """Test _load_nlp_engine with nlp_configuration in analyzer config.""" + analyzer_yaml, _, _ = get_full_paths("conf/test_analyzer_engine.yaml") + + provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_yaml) + nlp_engine = provider._load_nlp_engine() + + assert nlp_engine is not None + assert nlp_engine.engine_name == "spacy" + + +def test_analyzer_engine_provider_load_nlp_engine_default(): + """Test _load_nlp_engine falls back to default when no config provided.""" + provider = AnalyzerEngineProvider() + nlp_engine = provider._load_nlp_engine() + + assert nlp_engine is not None + assert isinstance(nlp_engine, SpacyNlpEngine) + + +def test_analyzer_engine_provider_load_recognizer_registry_from_embedded_config(): + """Test _load_recognizer_registry with embedded recognizer_registry in config.""" + analyzer_yaml, _, _ = get_full_paths("conf/test_analyzer_engine.yaml") + + provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_yaml) + nlp_engine = provider._load_nlp_engine() + + registry = provider._load_recognizer_registry( + supported_languages=["en"], + nlp_engine=nlp_engine, + ) + + assert registry is not None + assert len(registry.recognizers) > 0 + + +def test_analyzer_engine_provider_load_recognizer_registry_default(): + """Test _load_recognizer_registry uses default when no config provided.""" + provider = AnalyzerEngineProvider() + nlp_engine = provider._load_nlp_engine() + + registry = provider._load_recognizer_registry( + supported_languages=["en"], + nlp_engine=nlp_engine, + ) + + assert registry is not None + assert len(registry.recognizers) > 0 + + +def test_analyzer_engine_provider_create_engine_with_all_params(): + """Test create_engine with all configuration parameters.""" + analyzer_yaml, nlp_yaml, registry_yaml = get_full_paths( + "conf/simple_analyzer_engine.yaml", + "conf/default.yaml", + "conf/test_recognizer_registry.yaml", + ) + + provider = AnalyzerEngineProvider( + analyzer_engine_conf_file=analyzer_yaml, + nlp_engine_conf_file=nlp_yaml, + recognizer_registry_conf_file=registry_yaml, + ) + + engine = provider.create_engine() + + assert engine is not None + assert engine.nlp_engine is not None + assert engine.registry is not None + assert len(engine.supported_languages) > 0 + + +def test_analyzer_engine_provider_multiple_languages_support(): + """Test analyzer engine with multiple language support.""" + analyzer_yaml, _, _ = get_full_paths("conf/test_analyzer_engine.yaml") + + provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_yaml) + engine = provider.create_engine() + + assert "en" in engine.supported_languages + assert "de" in engine.supported_languages + assert "es" in engine.supported_languages + + +def test_analyzer_engine_provider_default_score_threshold(): + """Test that default_score_threshold is properly set.""" + analyzer_yaml, _, _ = get_full_paths("conf/test_analyzer_engine.yaml") + + provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_yaml) + engine = provider.create_engine() + + assert engine.default_score_threshold == 0.7 + + +def test_analyzer_engine_provider_with_pathlib_path(): + """Test AnalyzerEngineProvider works with pathlib.Path objects.""" + from pathlib import Path + + analyzer_yaml, _, _ = get_full_paths("conf/simple_analyzer_engine.yaml") + analyzer_path = Path(analyzer_yaml) + + provider = AnalyzerEngineProvider(analyzer_engine_conf_file=analyzer_path) + engine = provider.create_engine() + + assert engine is not None + + +def test_analyzer_engine_provider_configuration_logging(caplog): + """Test that configuration loading logs appropriate messages.""" + import logging + + with caplog.at_level(logging.INFO): + provider = AnalyzerEngineProvider() + _ = provider.create_engine() + + # Check that some logging occurred + assert len(caplog.records) > 0 + + diff --git a/presidio-analyzer/tests/test_pattern_recognizer.py b/presidio-analyzer/tests/test_pattern_recognizer.py index b26d6cc278..2c39f05ce9 100644 --- a/presidio-analyzer/tests/test_pattern_recognizer.py +++ b/presidio-analyzer/tests/test_pattern_recognizer.py @@ -219,3 +219,282 @@ def test_global_regex_flag_deny_list_returns_right_result(global_flag, expected_ results = recognizer_ignore_case.analyze(text=text, entities=["TITLE"]) assert len(results) == expected_len + + +def test_pattern_recognizer_with_invalidate_result(): + """Test PatternRecognizer with invalidate_result returning True.""" + class InvalidatingRecognizer(PatternRecognizer): + def invalidate_result(self, pattern_text): + # Invalidate if pattern starts with '0' + return pattern_text.startswith('0') + + patterns = [Pattern(name="test_pattern", regex=r"\d{3}", score=0.8)] + recognizer = InvalidatingRecognizer( + supported_entity="TEST", + patterns=patterns, + name="InvalidatingTest", + ) + + # Test with valid pattern (doesn't start with 0) + results = recognizer.analyze("Test 123 and 456", ["TEST"]) + assert len(results) == 2 + assert all(r.score == 0.8 for r in results) + + # Test with invalidated pattern (starts with 0) + results = recognizer.analyze("Test 012 and 098", ["TEST"]) + assert len(results) == 0 # Should be filtered out due to MIN_SCORE + + +def test_pattern_recognizer_with_validate_result_false(): + """Test PatternRecognizer with validate_result returning False.""" + class ValidatingRecognizer(PatternRecognizer): + def validate_result(self, pattern_text): + # Only validate if it contains digit '5' + return '5' in pattern_text + + patterns = [Pattern(name="test_pattern", regex=r"\d{3}", score=0.5)] + recognizer = ValidatingRecognizer( + supported_entity="TEST", + patterns=patterns, + name="ValidatingTest", + ) + + # Test with valid pattern (contains 5) + results = recognizer.analyze("Test 456", ["TEST"]) + assert len(results) == 1 + assert results[0].score == 1.0 # MAX_SCORE + + # Test with invalid pattern (no 5) + results = recognizer.analyze("Test 123", ["TEST"]) + assert len(results) == 0 # Filtered due to MIN_SCORE + + +def test_pattern_recognizer_with_both_validate_and_invalidate(): + """Test PatternRecognizer with both validate and invalidate logic.""" + class BothRecognizer(PatternRecognizer): + def validate_result(self, pattern_text): + return len(pattern_text) == 3 + + def invalidate_result(self, pattern_text): + return pattern_text == "000" + + patterns = [Pattern(name="test_pattern", regex=r"\d{3}", score=0.5)] + recognizer = BothRecognizer( + supported_entity="TEST", + patterns=patterns, + name="BothTest", + ) + + # Test with valid and not invalidated + results = recognizer.analyze("Test 123", ["TEST"]) + assert len(results) == 1 + assert results[0].score == 1.0 + + # Test with invalidated + results = recognizer.analyze("Test 000", ["TEST"]) + assert len(results) == 0 + + +def test_pattern_recognizer_empty_match_skipped(): + """Test that empty regex matches are skipped.""" + patterns = [Pattern(name="test_pattern", regex=r"\d*", score=0.5)] + recognizer = PatternRecognizer( + supported_entity="TEST", + patterns=patterns, + name="EmptyMatchTest", + ) + + # This regex can match empty strings + results = recognizer.analyze("abc", ["TEST"]) + # Empty matches should be filtered out + assert len(results) == 0 + + +def test_pattern_recognizer_to_dict(): + """Test serialization of PatternRecognizer to dict.""" + patterns = [Pattern(name="p1", regex=r"\d+", score=0.8)] + deny_list = ["word1", "word2"] + context = ["context1", "context2"] + + recognizer = PatternRecognizer( + supported_entity="TEST_ENTITY", + patterns=patterns, + deny_list=deny_list, + context=context, + name="TestRecognizer", + version="1.0.0", + ) + + result_dict = recognizer.to_dict() + + assert result_dict["supported_entity"] == "TEST_ENTITY" + assert "supported_entities" not in result_dict + assert len(result_dict["patterns"]) == 2 # 1 pattern + 1 deny_list pattern + assert result_dict["deny_list"] == deny_list + assert result_dict["context"] == context + assert result_dict["name"] == "TestRecognizer" + assert result_dict["version"] == "1.0.0" + + +def test_pattern_recognizer_from_dict_with_both_supported_entity_and_entities(): + """Test from_dict raises error when both supported_entity and supported_entities present.""" + recognizer_dict = { + "supported_entity": "ENTITY_A", + "supported_entities": ["ENTITY_B"], + "patterns": [{"name": "p1", "score": 0.5, "regex": r"\d+"}], + } + + with pytest.raises(ValueError, match="Both 'supported_entity' and 'supported_entities'"): + PatternRecognizer.from_dict(recognizer_dict) + + +def test_pattern_recognizer_from_dict_with_supported_entities_only(): + """Test from_dict uses first element of supported_entities.""" + recognizer_dict = { + "supported_entities": ["ENTITY_A", "ENTITY_B"], + "patterns": [{"name": "p1", "score": 0.5, "regex": r"\d+"}], + } + + recognizer = PatternRecognizer.from_dict(recognizer_dict) + assert recognizer.supported_entities == ["ENTITY_A"] + + +def test_pattern_recognizer_from_dict_with_empty_supported_entities(): + """Test from_dict with empty supported_entities list.""" + recognizer_dict = { + "supported_entities": [], + "patterns": [{"name": "p1", "score": 0.5, "regex": r"\d+"}], + } + + # Should raise TypeError because supported_entity parameter is missing + with pytest.raises(TypeError): + PatternRecognizer.from_dict(recognizer_dict) + + +def test_pattern_recognizer_analyze_with_custom_regex_flags(): + """Test analyze with custom regex flags.""" + patterns = [Pattern(name="test_pattern", regex=r"test", score=0.8)] + recognizer = PatternRecognizer( + supported_entity="TEST", + patterns=patterns, + name="FlagTest", + global_regex_flags=0, # No flags by default + ) + + # Should not match with default flags (case-sensitive) + results = recognizer.analyze("TEST", ["TEST"]) + assert len(results) == 0 + + # Should match with IGNORECASE flag + results = recognizer.analyze("TEST", ["TEST"], regex_flags=re.IGNORECASE) + assert len(results) == 1 + + +def test_pattern_recognizer_multiple_patterns(): + """Test recognizer with multiple patterns.""" + patterns = [ + Pattern(name="pattern1", regex=r"\b\d{3}\b", score=0.6), + Pattern(name="pattern2", regex=r"\b[A-Z]{4}\b", score=0.7), + ] + recognizer = PatternRecognizer( + supported_entity="TEST", + patterns=patterns, + name="MultiPatternTest", + global_regex_flags=re.DOTALL | re.MULTILINE + ) + + results = recognizer.analyze("Number 123 and CAPS word", ["TEST"]) + # Should find exactly 2 results (digits and capitals) + assert len(results) == 2 + + # Check that both patterns were matched with correct scores + scores = sorted([r.score for r in results]) + assert scores == [0.6, 0.7] + + +def test_pattern_recognizer_build_regex_explanation(): + """Test build_regex_explanation static method.""" + explanation = PatternRecognizer.build_regex_explanation( + recognizer_name="TestRecognizer", + pattern_name="TestPattern", + pattern=r"\d+", + original_score=0.85, + validation_result=True, + regex_flags=re.IGNORECASE, + ) + + assert explanation.recognizer == "TestRecognizer" + assert explanation.pattern_name == "TestPattern" + assert explanation.pattern == r"\d+" + assert explanation.original_score == 0.85 + assert explanation.validation_result == True + assert explanation.regex_flags == re.IGNORECASE + assert "TestRecognizer" in explanation.textual_explanation + assert "TestPattern" in explanation.textual_explanation + + +def test_pattern_recognizer_load_method(): + """Test that load method can be called without error.""" + recognizer = PatternRecognizer( + supported_entity="TEST", + deny_list=["test"], + ) + + # load() should not raise any exception + recognizer.load() + + +def test_pattern_recognizer_with_zero_global_regex_flags(): + """Test PatternRecognizer with 0 as global_regex_flags.""" + patterns = [Pattern(name="test", regex=r"test", score=0.5)] + recognizer = PatternRecognizer( + supported_entity="TEST", + patterns=patterns, + global_regex_flags=0, + ) + + # Should work with 0 flags (case-sensitive) + results = recognizer.analyze("test", ["TEST"]) + assert len(results) == 1 + + # Should not match different case + results = recognizer.analyze("TEST", ["TEST"]) + assert len(results) == 0 + + +def test_pattern_recognizer_recompiles_regex_on_flag_change(): + """Test that regex is recompiled when flags change.""" + patterns = [Pattern(name="test", regex=r"test", score=0.5)] + recognizer = PatternRecognizer( + supported_entity="TEST", + patterns=patterns, + global_regex_flags=0, + ) + + # First analysis with no flags + results = recognizer.analyze("TEST", ["TEST"], regex_flags=0) + assert len(results) == 0 + + # Second analysis with IGNORECASE flag (should recompile) + results = recognizer.analyze("TEST", ["TEST"], regex_flags=re.IGNORECASE) + assert len(results) == 1 + + +def test_pattern_recognizer_recognizer_metadata(): + """Test that recognition_metadata is properly set in results.""" + patterns = [Pattern(name="test", regex=r"\d+", score=0.5)] + recognizer = PatternRecognizer( + supported_entity="TEST", + patterns=patterns, + name="MetadataTest", + ) + + results = recognizer.analyze("Test 123", ["TEST"]) + assert len(results) == 1 + + metadata = results[0].recognition_metadata + assert "recognizer_name" in metadata + assert metadata["recognizer_name"] == "MetadataTest" + assert "recognizer_identifier" in metadata + + From 4ebee59e41e87cb2c1588f39400e890e021e2885 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 4 Dec 2025 10:49:18 +0200 Subject: [PATCH 26/30] more unit tests --- .../input_validation/schemas.py | 40 +-- .../tests/test_analyzer_request.py | 279 ++++++++++++++++++ 2 files changed, 281 insertions(+), 38 deletions(-) create mode 100644 presidio-analyzer/tests/test_analyzer_request.py diff --git a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py index c78c139687..a256ca2a58 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/schemas.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/schemas.py @@ -85,40 +85,6 @@ def validate_recognizer_registry_configuration( except ValidationError as e: raise ValueError("Invalid recognizer registry configuration") from e - @staticmethod - def _validate_recognizer_registry_basic(config: Dict[str, Any]) -> Dict[str, Any]: - """Validate recognizer registry config.""" - if not isinstance(config, dict): - raise ValueError("Recognizer registry configuration must be a dictionary") - - # Define valid top-level keys for recognizer registry configuration - valid_keys = { - "supported_languages", - "global_regex_flags", - "recognizers" - } - - # Check for unknown keys - unknown_keys = set(config.keys()) - valid_keys - if unknown_keys: - raise ValueError( - f"Unknown configuration key(s) in " - f"recognizer_registry: {sorted(unknown_keys)}. " - f"Valid keys are: {sorted(valid_keys)}" - ) - - # Validate supported languages - if "supported_languages" in config: - validate_language_codes( - config["supported_languages"] - ) - - # Validate recognizers list - if "recognizers" in config and not isinstance(config["recognizers"], list): - raise ValueError("Recognizers must be a list") - - return config - @staticmethod def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: """Validate analyzer engine configuration.""" @@ -130,7 +96,7 @@ def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: "supported_languages", "default_score_threshold", "nlp_configuration", - "recognizer_registry" + "recognizer_registry", } # Check for unknown keys @@ -144,9 +110,7 @@ def validate_analyzer_configuration(config: Dict[str, Any]) -> Dict[str, Any]: # Validate supported languages if present if "supported_languages" in config: - validate_language_codes( - config["supported_languages"] - ) + validate_language_codes(config["supported_languages"]) # Validate score threshold if present if "default_score_threshold" in config: diff --git a/presidio-analyzer/tests/test_analyzer_request.py b/presidio-analyzer/tests/test_analyzer_request.py new file mode 100644 index 0000000000..c704ac9d57 --- /dev/null +++ b/presidio-analyzer/tests/test_analyzer_request.py @@ -0,0 +1,279 @@ +import regex as re +from presidio_analyzer import AnalyzerRequest, PatternRecognizer + + +class TestAnalyzerRequest: + """Tests for AnalyzerRequest class.""" + + def test_analyzer_request_basic_fields(self): + """Test basic field initialization.""" + req_data = { + "text": "My phone number is 555-1234", + "language": "en", + "entities": ["PHONE_NUMBER"], + "correlation_id": "test-123", + "score_threshold": 0.5, + "return_decision_process": True, + } + + request = AnalyzerRequest(req_data) + + assert request.text == "My phone number is 555-1234" + assert request.language == "en" + assert request.entities == ["PHONE_NUMBER"] + assert request.correlation_id == "test-123" + assert request.score_threshold == 0.5 + assert request.return_decision_process is True + + def test_analyzer_request_with_context(self): + """Test context field initialization (line 37).""" + req_data = { + "text": "Test text", + "language": "en", + "context": ["previous message", "current message"] + } + + request = AnalyzerRequest(req_data) + + assert request.context == ["previous message", "current message"] + + def test_analyzer_request_with_allow_list(self): + """Test allow_list field initialization (line 38).""" + req_data = { + "text": "Test text", + "language": "en", + "allow_list": ["John", "Microsoft", "Seattle"] + } + + request = AnalyzerRequest(req_data) + + assert request.allow_list == ["John", "Microsoft", "Seattle"] + + def test_analyzer_request_with_allow_list_match_default(self): + """Test allow_list_match field with default value (line 39).""" + req_data = { + "text": "Test text", + "language": "en", + } + + request = AnalyzerRequest(req_data) + + # Should default to "exact" + assert request.allow_list_match == "exact" + + def test_analyzer_request_with_allow_list_match_custom(self): + """Test allow_list_match field with custom value (line 39).""" + req_data = { + "text": "Test text", + "language": "en", + "allow_list_match": "partial" + } + + request = AnalyzerRequest(req_data) + + assert request.allow_list_match == "partial" + + def test_analyzer_request_with_regex_flags_default(self): + """Test regex_flags field with default value (line 40).""" + req_data = { + "text": "Test text", + "language": "en", + } + + request = AnalyzerRequest(req_data) + + # Should default to DOTALL | MULTILINE | IGNORECASE + expected_flags = re.DOTALL | re.MULTILINE | re.IGNORECASE + assert request.regex_flags == expected_flags + + def test_analyzer_request_with_regex_flags_custom(self): + """Test regex_flags field with custom value (line 40).""" + custom_flags = re.IGNORECASE | re.UNICODE + req_data = { + "text": "Test text", + "language": "en", + "regex_flags": custom_flags + } + + request = AnalyzerRequest(req_data) + + assert request.regex_flags == custom_flags + + def test_analyzer_request_without_context(self): + """Test that context is None when not provided.""" + req_data = { + "text": "Test text", + "language": "en", + } + + request = AnalyzerRequest(req_data) + + assert request.context is None + + def test_analyzer_request_without_allow_list(self): + """Test that allow_list is None when not provided.""" + req_data = { + "text": "Test text", + "language": "en", + } + + request = AnalyzerRequest(req_data) + + assert request.allow_list is None + + def test_analyzer_request_all_fields(self): + """Test initialization with all fields including lines 37-40.""" + req_data = { + "text": "My name is John and my email is john@example.com", + "language": "en", + "entities": ["PERSON", "EMAIL_ADDRESS"], + "correlation_id": "full-test-456", + "score_threshold": 0.7, + "return_decision_process": False, + "ad_hoc_recognizers": [ + { + "supported_entity": "CUSTOM_ENTITY", + "supported_language": "en", + "patterns": [ + { + "name": "custom_pattern", + "regex": r"\d{3}-\d{3}", + "score": 0.5 + } + ] + } + ], + "context": ["user profile", "chat history"], + "allow_list": ["John", "Microsoft"], + "allow_list_match": "fuzzy", + "regex_flags": re.IGNORECASE + } + + request = AnalyzerRequest(req_data) + + assert request.text == "My name is John and my email is john@example.com" + assert request.language == "en" + assert request.entities == ["PERSON", "EMAIL_ADDRESS"] + assert request.correlation_id == "full-test-456" + assert request.score_threshold == 0.7 + assert request.return_decision_process is False + assert len(request.ad_hoc_recognizers) == 1 + assert isinstance(request.ad_hoc_recognizers[0], PatternRecognizer) + assert request.context == ["user profile", "chat history"] + assert request.allow_list == ["John", "Microsoft"] + assert request.allow_list_match == "fuzzy" + assert request.regex_flags == re.IGNORECASE + + def test_analyzer_request_with_ad_hoc_recognizers(self): + """Test ad_hoc_recognizers field initialization.""" + req_data = { + "text": "Test text", + "language": "en", + "ad_hoc_recognizers": [ + { + "supported_entity": "CUSTOM_ID", + "supported_language": "en", + "patterns": [ + { + "name": "id_pattern", + "regex": r"ID-\d{5}", + "score": 0.8 + } + ] + } + ] + } + + request = AnalyzerRequest(req_data) + + assert len(request.ad_hoc_recognizers) == 1 + assert isinstance(request.ad_hoc_recognizers[0], PatternRecognizer) + assert request.ad_hoc_recognizers[0].supported_entities == ["CUSTOM_ID"] + + def test_analyzer_request_without_ad_hoc_recognizers(self): + """Test that ad_hoc_recognizers is empty list when not provided.""" + req_data = { + "text": "Test text", + "language": "en", + } + + request = AnalyzerRequest(req_data) + + assert request.ad_hoc_recognizers == [] + + def test_analyzer_request_empty_dict(self): + """Test initialization with empty dictionary.""" + req_data = {} + + request = AnalyzerRequest(req_data) + + assert request.text is None + assert request.language is None + assert request.entities is None + assert request.correlation_id is None + assert request.score_threshold is None + assert request.return_decision_process is None + assert request.ad_hoc_recognizers == [] + assert request.context is None + assert request.allow_list is None + assert request.allow_list_match == "exact" + assert request.regex_flags == (re.DOTALL | re.MULTILINE | re.IGNORECASE) + + def test_analyzer_request_with_complex_context(self): + """Test context field with various data types.""" + req_data = { + "text": "Test text", + "language": "en", + "context": { + "user_id": "12345", + "session": "abc", + "metadata": {"key": "value"} + } + } + + request = AnalyzerRequest(req_data) + + assert request.context == { + "user_id": "12345", + "session": "abc", + "metadata": {"key": "value"} + } + + def test_analyzer_request_with_multiple_regex_flags(self): + """Test regex_flags with multiple combined flags.""" + custom_flags = re.IGNORECASE | re.MULTILINE | re.DOTALL | re.VERBOSE + req_data = { + "text": "Test text", + "language": "en", + "regex_flags": custom_flags + } + + request = AnalyzerRequest(req_data) + + assert request.regex_flags == custom_flags + # Verify individual flags are present + assert request.regex_flags & re.IGNORECASE + assert request.regex_flags & re.MULTILINE + assert request.regex_flags & re.DOTALL + assert request.regex_flags & re.VERBOSE + + def test_analyzer_request_allow_list_match_variations(self): + """Test various allow_list_match values.""" + test_cases = [ + "exact", + "partial", + "fuzzy", + "regex", + "custom_match_type" + ] + + for match_type in test_cases: + req_data = { + "text": "Test text", + "language": "en", + "allow_list_match": match_type + } + + request = AnalyzerRequest(req_data) + assert request.allow_list_match == match_type + From c17ee634ac018641659af5f59acb0d359b66cf88 Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Thu, 4 Dec 2025 13:17:23 +0200 Subject: [PATCH 27/30] Update presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py Co-authored-by: Dor Lugasi-Gal --- .../recognizer_registry/recognizer_registry_provider.py | 1 - 1 file changed, 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py index 73c646227e..143f825133 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizer_registry_provider.py @@ -56,7 +56,6 @@ def __init__( self.configuration = RecognizerConfigurationLoader.get( conf_file=conf_file, registry_configuration=registry_configuration ) - # Validate configuration using Pydantic self.configuration = ( ConfigurationValidator.validate_recognizer_registry_configuration( From ea593d69330f8fe1ced6c1cd97dc650b17769dbe Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 7 Dec 2025 18:52:08 +0200 Subject: [PATCH 28/30] updates based on PR comments --- .../presidio_analyzer/entity_recognizer.py | 10 +- .../yaml_recognizer_models.py | 166 +++++++----------- .../presidio_analyzer/lm_recognizer.py | 8 +- .../presidio_analyzer/pattern_recognizer.py | 8 +- .../recognizers_loader_utils.py | 7 +- .../presidio_analyzer/remote_recognizer.py | 8 +- .../tests/test_yaml_recognizer_models.py | 38 ++-- 7 files changed, 101 insertions(+), 144 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/entity_recognizer.py b/presidio-analyzer/presidio_analyzer/entity_recognizer.py index 07fccc0727..9f05e9b6e8 100644 --- a/presidio-analyzer/presidio_analyzer/entity_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/entity_recognizer.py @@ -1,9 +1,11 @@ import logging from abc import abstractmethod -from typing import Dict, List, Optional, Tuple +from typing import TYPE_CHECKING, Dict, List, Optional, Tuple from presidio_analyzer import RecognizerResult -from presidio_analyzer.nlp_engine import NlpArtifacts + +if TYPE_CHECKING: + from presidio_analyzer.nlp_engine import NlpArtifacts logger = logging.getLogger("presidio-analyzer") @@ -74,7 +76,7 @@ def load(self) -> None: @abstractmethod def analyze( - self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts + self, text: str, entities: List[str], nlp_artifacts: "NlpArtifacts" ) -> List[RecognizerResult]: """ Analyze text to identify entities. @@ -92,7 +94,7 @@ def enhance_using_context( text: str, raw_recognizer_results: List[RecognizerResult], other_raw_recognizer_results: List[RecognizerResult], - nlp_artifacts: NlpArtifacts, + nlp_artifacts: "NlpArtifacts", context: Optional[List[str]] = None, ) -> List[RecognizerResult]: """Enhance confidence score using context of the entity. diff --git a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py index 0f5da5bee2..ea7c2c81f4 100644 --- a/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py +++ b/presidio-analyzer/presidio_analyzer/input_validation/yaml_recognizer_models.py @@ -5,6 +5,10 @@ from pydantic import BaseModel, ConfigDict, Field, field_validator, model_validator from presidio_analyzer.input_validation import validate_language_codes +from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( + PredefinedRecognizerNotFoundError, + RecognizerListLoader, +) class LanguageContextConfig(BaseModel): @@ -133,25 +137,10 @@ class PredefinedRecognizerConfig(BaseRecognizerConfig): def validate_predefined_recognizer_exists(self): """Validate that the predefined recognizer class actually exists.""" try: - # Lazy import to avoid circular dependency - from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( - RecognizerListLoader, - ) - RecognizerListLoader.get_existing_recognizer_cls(self.name) - except (ImportError, ModuleNotFoundError): - return self - except ValueError as e: - available_recognizers = [ - cls.__name__ - for cls in RecognizerListLoader.get_all_existing_recognizers() - ] + except PredefinedRecognizerNotFoundError as e: raise ValueError( - f"Predefined recognizer '{self.name}' not found. " - f"If you want to add your own custom recognizer, " - f"mark is as type: 'custom'. " - f"The available predefined recognizers are: " - f"{', '.join(sorted(available_recognizers))}" + f"Predefined recognizer '{self.name}' not found: {str(e)}" ) from e return self @@ -201,25 +190,19 @@ def check_predefined_name_conflict(cls, data: Any) -> Any: name = data.get("name") if name: try: - # Lazy import to avoid circular dependency - from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( # noqa - RecognizerListLoader, + RecognizerListLoader.get_existing_recognizer_cls(name) + # If we reach here, the recognizer IS predefined, so raise an error + raise ValueError( + f"Recognizer '{name}' conflicts with a predefined " + f"recognizer. " + f"Custom recognizers cannot use the same name " + f"as predefined recognizers. " + f"Either use type: 'predefined' or choose a different name " + f"for your custom recognizer." ) - - try: - RecognizerListLoader.get_existing_recognizer_cls(name) - raise ValueError( - f"Recognizer '{name}' conflicts with a predefined " - f"recognizer. " - f"Custom recognizers cannot use the same name " - f"as predefined recognizers. " - f"Either use type: 'predefined' or choose a different name " - f"for your custom recognizer." - ) - except ValueError as e: - if "was not found" not in str(e): - raise - except (ImportError, ModuleNotFoundError): + except PredefinedRecognizerNotFoundError: + # Name is not a predefined recognizer, + # which is fine for custom recognizers pass return data @@ -248,34 +231,9 @@ def validate_patterns(cls, patterns: Optional[List[Dict]]) -> Optional[List[Dict raise ValueError(f"Pattern score should be between 0 and 1: {pattern}") return patterns - @model_validator(mode="after") - def validate_configuration(self): - """Ensure configuration is valid.""" - # Check if user accidentally marked a predefined recognizer as custom - # This check should happen BEFORE checking patterns/deny_list - # to give a more specific error message - try: - # Lazy import to avoid circular dependency - from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( - RecognizerListLoader, - ) - - try: - RecognizerListLoader.get_existing_recognizer_cls(self.name) - raise ValueError( - f"Recognizer '{self.name}' is a predefined recognizer " - f"but is marked as 'custom'. " - f"Either use type: 'predefined' or choose a different " - f"name for your custom recognizer." - ) - except ValueError as e: - if "was not found" not in str(e): - raise - except (ImportError, ModuleNotFoundError): - pass - - # Validate patterns or deny_list only after name check + def validate_patterns_or_deny_list(self): + """Ensure custom recognizer has at least patterns or deny_list.""" if not self.patterns and not self.deny_list: raise ValueError( "Custom recognizer must have at least one " @@ -299,40 +257,40 @@ class RecognizerRegistryConfig(BaseModel): @field_validator("supported_languages") @classmethod - def validate_language_codes(cls, v: Optional[List[str]]) -> Optional[List[str]]: + def validate_language_codes( + cls, languages: Optional[List[str]] + ) -> Optional[List[str]]: """Validate language codes format.""" # Allow None or empty list for cases where languages will be inferred - if v is None: + if languages is None: return None - if len(v) == 0: + if len(languages) == 0: return [] - validate_language_codes(v) - return v + validate_language_codes(languages) + return languages @model_validator(mode="after") def validate_languages_for_custom_recognizers(self): """Validate that custom recognizers have language configuration.""" # If we have custom recognizers, we need language configuration somewhere - for recognizer in self.recognizers: - if isinstance(recognizer, CustomRecognizerConfig): - # Check if this custom recognizer has its own language config - if ( - not recognizer.supported_language - and not recognizer.supported_languages - ): - # If no language config on recognizer, we need global languages - if not self.supported_languages: - raise ValueError( - f"Language configuration missing for custom recognizer " - f"'{recognizer.name}': " - "Either specify 'supported_languages' " - "on the recognizer or provide " - "global 'supported_languages' in the " - "registry configuration." - ) + custom_recognizers = [ + rec for rec in self.recognizers if isinstance(rec, CustomRecognizerConfig) + ] + for recognizer in custom_recognizers: + if not recognizer.supported_language and not recognizer.supported_languages: + # If no language config on recognizer, we need global languages + if not self.supported_languages: + raise ValueError( + f"Language configuration missing for custom recognizer " + f"'{recognizer.name}': " + "Either specify 'supported_languages' " + "on the recognizer or provide " + "global 'supported_languages' in the " + "registry configuration." + ) return self @@ -348,25 +306,27 @@ def validate_recognizers_not_empty(self): @field_validator("recognizers", mode="before") @classmethod - def parse_recognizers(cls, v): + def parse_recognizers( + cls, recognizers: List[Union[Dict[str, Any], str]] + ) -> List[BaseRecognizerConfig]: """Parse recognizers from various input formats without duplication.""" - if v is None: + if recognizers is None: raise ValueError( "Configuration error: 'recognizers' is required. " "Please provide a list of recognizers in the configuration." ) - if not isinstance(v, list): + if not isinstance(recognizers, list): raise ValueError("Recognizers must be a list") - if len(v) == 0: + if len(recognizers) == 0: raise ValueError( "The 'recognizers' field must contain at least one recognizer. " "Found an empty recognizers list." ) parsed_recognizers = [] - for recognizer in v: + for recognizer in recognizers: if isinstance(recognizer, str): # Simple string recognizer name - treat as predefined parsed_recognizers.append(recognizer) @@ -415,24 +375,20 @@ def parse_recognizers(cls, v): return parsed_recognizers @classmethod - def __check_if_predefined(cls, recognizer_name: Optional[Any]): + def __check_if_predefined(cls, recognizer_name: Optional[Any]) -> None: try: - from presidio_analyzer.recognizer_registry.recognizers_loader_utils import ( - RecognizerListLoader, + RecognizerListLoader.get_existing_recognizer_cls(recognizer_name) + # If we reach here, it IS a predefined recognizer, so raise an error + raise ValueError( + f"Recognizer '{recognizer_name}' conflicts with a predefined " + f"recognizer. " + f"Custom recognizers cannot use the same name " + f"as predefined recognizers. " + f"Either use type: 'predefined' or choose a different name " + f"for your custom recognizer." ) - - try: - RecognizerListLoader.get_existing_recognizer_cls(recognizer_name) - raise ValueError( - f"Recognizer '{recognizer_name}' is a recognizer predefined in " - f"code but has 'patterns' or 'deny_list' defined. " - f"Either use type: 'predefined' " - f"or choose a different name for your custom recognizer." - ) - except ValueError as e: - if "was not found" not in str(e): - raise - except ImportError: + except PredefinedRecognizerNotFoundError: + # Name is not a predefined recognizer, which is fine for custom recognizers pass @model_validator(mode="after") diff --git a/presidio-analyzer/presidio_analyzer/lm_recognizer.py b/presidio-analyzer/presidio_analyzer/lm_recognizer.py index 64adfdb044..7a79f0b163 100644 --- a/presidio-analyzer/presidio_analyzer/lm_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/lm_recognizer.py @@ -1,6 +1,6 @@ import logging from abc import ABC, abstractmethod -from typing import List, Optional +from typing import TYPE_CHECKING, List, Optional from presidio_analyzer import RecognizerResult, RemoteRecognizer from presidio_analyzer.llm_utils import ( @@ -12,7 +12,9 @@ skip_unmapped_entities, validate_result_positions, ) -from presidio_analyzer.nlp_engine import NlpArtifacts + +if TYPE_CHECKING: + from presidio_analyzer.nlp_engine import NlpArtifacts logger = logging.getLogger("presidio-analyzer") @@ -90,7 +92,7 @@ def analyze( self, text: str, entities: Optional[List[str]] = None, - nlp_artifacts: Optional[NlpArtifacts] = None + nlp_artifacts: Optional["NlpArtifacts"] = None ) -> List[RecognizerResult]: """Analyze text for PII/PHI using LLM.""" if not text or not text.strip(): diff --git a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py index df4a051ae7..ad6053355b 100644 --- a/presidio-analyzer/presidio_analyzer/pattern_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/pattern_recognizer.py @@ -1,6 +1,6 @@ import datetime import logging -from typing import Dict, List, Optional +from typing import TYPE_CHECKING, Dict, List, Optional import regex as re @@ -11,7 +11,9 @@ Pattern, RecognizerResult, ) -from presidio_analyzer.nlp_engine import NlpArtifacts + +if TYPE_CHECKING: + from presidio_analyzer.nlp_engine import NlpArtifacts logger = logging.getLogger("presidio-analyzer") @@ -79,7 +81,7 @@ def analyze( self, text: str, entities: List[str], - nlp_artifacts: Optional[NlpArtifacts] = None, + nlp_artifacts: Optional["NlpArtifacts"] = None, regex_flags: Optional[int] = None, ) -> List[RecognizerResult]: """ diff --git a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py index 5905e4e340..e73f64398f 100644 --- a/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py +++ b/presidio-analyzer/presidio_analyzer/recognizer_registry/recognizers_loader_utils.py @@ -12,6 +12,11 @@ logger = logging.getLogger("presidio-analyzer") +class PredefinedRecognizerNotFoundError(Exception): + """Exception raised when a predefined recognizer is not found.""" + + pass + class RecognizerListLoader: """A utility class that initializes recognizers based on configuration.""" @@ -221,7 +226,7 @@ def get_existing_recognizer_cls(recognizer_name: str) -> Type[EntityRecognizer]: if recognizer_name == recognizer.__name__: return recognizer - raise ValueError( + raise PredefinedRecognizerNotFoundError( f"Recognizer of name {recognizer_name} was not found in the " f"list of recognizers inheriting the EntityRecognizer class" ) diff --git a/presidio-analyzer/presidio_analyzer/remote_recognizer.py b/presidio-analyzer/presidio_analyzer/remote_recognizer.py index c734b3c4a8..0350e2f5ae 100644 --- a/presidio-analyzer/presidio_analyzer/remote_recognizer.py +++ b/presidio-analyzer/presidio_analyzer/remote_recognizer.py @@ -1,8 +1,10 @@ from abc import ABC, abstractmethod -from typing import List, Optional +from typing import TYPE_CHECKING, List, Optional from presidio_analyzer import EntityRecognizer -from presidio_analyzer.nlp_engine import NlpArtifacts + +if TYPE_CHECKING: + from presidio_analyzer.nlp_engine import NlpArtifacts class RemoteRecognizer(ABC, EntityRecognizer): @@ -35,7 +37,7 @@ def load(self): # noqa: D102 pass @abstractmethod - def analyze(self, text: str, entities: List[str], nlp_artifacts: NlpArtifacts): + def analyze(self, text: str, entities: List[str], nlp_artifacts: "NlpArtifacts"): """ Call an external service for PII detection. diff --git a/presidio-analyzer/tests/test_yaml_recognizer_models.py b/presidio-analyzer/tests/test_yaml_recognizer_models.py index 27ca997e02..425c49105f 100644 --- a/presidio-analyzer/tests/test_yaml_recognizer_models.py +++ b/presidio-analyzer/tests/test_yaml_recognizer_models.py @@ -520,17 +520,6 @@ def test_predefined_recognizer_config_case_sensitive(): assert "Predefined recognizer 'creditcardrecognizer' not found" in error_message -def test_predefined_recognizer_validation_with_import_error(): - """Test that validation gracefully handles import errors.""" - import sys - from unittest.mock import patch - - with patch.dict('sys.modules', {'presidio_analyzer.recognizer_registry.recognizers_loader_utils': None}): - config = PredefinedRecognizerConfig(name="SomeRecognizer") - assert config.name == "SomeRecognizer" - assert config.type == "predefined" - - def test_custom_recognizer_config_predefined_name_error(): """Test that using a predefined recognizer name for custom recognizer raises error.""" with pytest.raises(ValidationError) as exc_info: @@ -573,20 +562,19 @@ def test_custom_recognizer_config_unique_name_valid(): def test_custom_recognizer_config_predefined_name_validation_with_import_error(): - """Test that validation gracefully handles import errors for predefined name checking.""" - from unittest.mock import patch - - # Mock the import to raise ImportError - with patch.dict('sys.modules', {'presidio_analyzer.recognizer_registry.recognizers_loader_utils': None}): - # This should not raise an error even if the import fails - config = CustomRecognizerConfig( - name="SomeRecognizer", - type="custom", - supported_entity="TEST", - patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] - ) - assert config.name == "SomeRecognizer" - assert config.type == "custom" + """Test that custom recognizers with unique names (not predefined) are valid. + + This test verifies that a custom recognizer can use a name that doesn't + conflict with any predefined recognizers. + """ + config = CustomRecognizerConfig( + name="SomeUniqueRecognizer", + type="custom", + supported_entity="TEST", + patterns=[{"name": "test", "regex": r"\d+", "score": 0.5}] + ) + assert config.name == "SomeUniqueRecognizer" + assert config.type == "custom" def test_custom_recognizer_with_language_no_global_languages(): From 18084f8273728fa9ab2ac9fe3510082a687e969b Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 7 Dec 2025 18:55:31 +0200 Subject: [PATCH 29/30] updates based on PR comments --- .../nlp_engine/ner_model_configuration.py | 24 +++++++++---------- .../presidio_analyzer/pattern.py | 4 ++-- 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py index 1e98697159..3f813f015e 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/ner_model_configuration.py @@ -79,39 +79,39 @@ class NerModelConfiguration(BaseModel): @field_validator("aggregation_strategy") @classmethod - def validate_aggregation_strategy(cls, v: str) -> str: + def validate_aggregation_strategy(cls, agg_strategy: str) -> str: """Validate aggregation strategy.""" valid_strategies = ["simple", "first", "average", "max"] - if v not in valid_strategies: + if agg_strategy not in valid_strategies: logger.warning( - f"Aggregation strategy '{v}' might not be supported. " + f"Aggregation strategy '{agg_strategy}' might not be supported. " f"Valid options: {valid_strategies}" ) - return v + return agg_strategy @field_validator("stride") @classmethod - def validate_stride(cls, v: Optional[int]) -> int: + def validate_stride(cls, stride: Optional[int]) -> int: """Validate stride and handle None values.""" - if v is None: + if stride is None: # Get the default value from the field definition return cls.model_fields["stride"].default - return v + return stride @field_validator("alignment_mode") @classmethod - def validate_alignment_mode(cls, v: Optional[str]) -> str: + def validate_alignment_mode(cls, alignment: Optional[str]) -> str: """Validate alignment mode and handle None values.""" - if v is None: + if alignment is None: # Get the default value from the field definition return cls.model_fields["alignment_mode"].default valid_modes = ["strict", "contract", "expand"] - if v not in valid_modes: + if alignment not in valid_modes: logger.warning( - f"Alignment mode '{v}' might not be supported. " + f"Alignment mode '{alignment}' might not be supported. " f"Valid options: {valid_modes}" ) - return v + return alignment @classmethod def from_dict(cls, ner_model_configuration_dict: Dict) -> "NerModelConfiguration": diff --git a/presidio-analyzer/presidio_analyzer/pattern.py b/presidio-analyzer/presidio_analyzer/pattern.py index dedf61da12..a4a3909bb4 100644 --- a/presidio-analyzer/presidio_analyzer/pattern.py +++ b/presidio-analyzer/presidio_analyzer/pattern.py @@ -24,10 +24,10 @@ def __init__(self, name: str, regex: str, score: float): self.__validate_score(self.score) @staticmethod - def __validate_regex(v: str) -> None: + def __validate_regex(pattern: str) -> None: """Validate that the regex pattern is valid.""" try: - re.compile(v) + re.compile(pattern) except re.error as e: raise ValueError(f"Invalid regex pattern: {e}") From 43ce2849788370fa11edc5877afeedf870b46c3b Mon Sep 17 00:00:00 2001 From: Omri Mendels Date: Sun, 7 Dec 2025 21:17:53 +0200 Subject: [PATCH 30/30] Update presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py Co-authored-by: Dor Lugasi-Gal --- .../presidio_analyzer/nlp_engine/nlp_engine_provider.py | 1 - 1 file changed, 1 deletion(-) diff --git a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py index 8ad0b336ae..74713c6f14 100644 --- a/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py +++ b/presidio-analyzer/presidio_analyzer/nlp_engine/nlp_engine_provider.py @@ -55,7 +55,6 @@ def __init__( ) if nlp_configuration: - # Validate using ConfigurationValidator - let Pydantic errors propagate ConfigurationValidator.validate_nlp_configuration(nlp_configuration) self.nlp_configuration = nlp_configuration