Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
17 changes: 17 additions & 0 deletions docs/supported_entities.md
Original file line number Diff line number Diff line change
Expand Up @@ -110,6 +110,23 @@ For more information, refer to the [adding new recognizers documentation](analyz
| KR_RRN | The Korean Resident Registration Number (RRN) is a 13-digit number issued to all Korean residents. | Pattern match, context and custom logic. |


### Germany
| FieldType | Description | Detection Method |
|------------|---------------------------------------------------------------------------------------------------------|------------------------------------------|
| DE_BSNR | The German Betriebsstättennummer (BSNR) is a 9-digit facility identifier for healthcare providers. | Pattern match, context and regional code validation. |
| DE_COMMERCIAL_REGISTER | German commercial register number (Handelsregisternummer) in formats like HRA/HRB 12345 for registered businesses. | Pattern match and context. |
| DE_DRIVER_LICENSE | German driver license number (Führerscheinnummer), an 11-character alphanumeric identifier. | Pattern match and context. |
| DE_KVNR | The German Krankenversichertennummer (KVNR) is a 10-character health insurance number starting with a letter. | Pattern match, context and checksum. |
| DE_LANR | The German Lebenslange Arztnummer (LANR) is a 9-digit lifelong physician identifier. | Pattern match, context and checksum. |
| DE_LICENSE_PLATE | German vehicle license plate (Kfz-Kennzeichen) in standard format like B-AB 1234. | Pattern match and context. |
| DE_PASSPORT | German passport number (Reisepassnummer), a 9-character alphanumeric identifier. | Pattern match, context and checksum. |
| DE_PERSONAL_ID | German personal identity card number (Personalausweisnummer), a 9 or 10-character alphanumeric identifier. | Pattern match, context and checksum. |
| DE_POSTAL_CODE | German postal code (Postleitzahl/PLZ), a 5-digit number from 01001 to 99998. | Pattern match and context. |
| DE_SOCIAL_SECURITY | German social security number (Sozialversicherungsnummer/SVNR), a 12-character identifier with area code and birthdate. | Pattern match, context and checksum. |
| DE_TAX_ID | German tax identification number (Steuer-Identifikationsnummer), an 11-digit unique tax identifier. | Pattern match, context and checksum. |
| DE_TELEMATIK_ID | German Telematik-ID used in the healthcare telematics infrastructure (Telematikinfrastruktur) to identify healthcare professionals and institutions. | Pattern match and context. |
| DE_VAT_CODE | German VAT identification number (Umsatzsteuer-Identifikationsnummer), format DE followed by 9 digits. | Pattern match and context. |

### Nigeria
| FieldType | Description | Detection Method |
|------------|---------------------------------------------------------------------------------------------------------|------------------------------------------|
Expand Down
81 changes: 81 additions & 0 deletions presidio-analyzer/presidio_analyzer/conf/default_recognizers.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -261,6 +261,87 @@ recognizers:
type: predefined
enabled: false

# Germany (DE) recognizers.
# Every entry below is a predefined recognizer registered for the "de"
# language only and ships disabled (enabled: false).
- name: DeBsnrRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DeCommercialRegisterRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DeDriverLicenseRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DeKvnrRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DeLanrRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DeLicensePlateRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DePassportRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DePersonalIdRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DePostalCodeRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DeSocialSecurityRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DeTaxIdRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DeTelematikIdRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: DeVatCodeRecognizer
supported_languages:
- de
type: predefined
enabled: false

- name: OllamaRecognizer
class_name: OllamaLangExtractRecognizer

- name: BasicLangExtractRecognizer
supported_languages:
- en
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,34 @@
from .country_specific.finland.fi_personal_identity_code_recognizer import (
FiPersonalIdentityCodeRecognizer,
)
from .country_specific.india import (
InVehicleRegistrationRecognizer,

# Germany recognizers
from .country_specific.germany.de_bsnr_recognizer import DeBsnrRecognizer
from .country_specific.germany.de_commercial_register_recognizer import (
DeCommercialRegisterRecognizer,
)
from .country_specific.germany.de_driver_license_recognizer import (
DeDriverLicenseRecognizer,
)
from .country_specific.germany.de_kvnr_recognizer import DeKvnrRecognizer
from .country_specific.germany.de_lanr_recognizer import DeLanrRecognizer
from .country_specific.germany.de_license_plate_recognizer import (
DeLicensePlateRecognizer,
)
from .country_specific.germany.de_passport_recognizer import DePassportRecognizer
from .country_specific.germany.de_personal_id_recognizer import DePersonalIdRecognizer
from .country_specific.germany.de_postal_code_recognizer import DePostalCodeRecognizer
from .country_specific.germany.de_social_security_recognizer import (
DeSocialSecurityRecognizer,
)
from .country_specific.germany.de_tax_id_recognizer import DeTaxIdRecognizer
from .country_specific.germany.de_telematik_id_recognizer import DeTelematikIdRecognizer
from .country_specific.germany.de_vat_code_recognizer import DeVatCodeRecognizer

# India recognizers
from .country_specific.india import (
InVehicleRegistrationRecognizer,
)
from .country_specific.india.in_aadhaar_recognizer import InAadhaarRecognizer
from .country_specific.india.in_gstin_recognizer import InGstinRecognizer
from .country_specific.india.in_pan_recognizer import InPanRecognizer
Expand Down Expand Up @@ -129,6 +152,19 @@
__all__ = [
"AbaRoutingRecognizer",
"CreditCardRecognizer",
"DeBsnrRecognizer",
"DeCommercialRegisterRecognizer",
"DeDriverLicenseRecognizer",
"DeKvnrRecognizer",
"DeLanrRecognizer",
"DeLicensePlateRecognizer",
"DePassportRecognizer",
"DePersonalIdRecognizer",
"DePostalCodeRecognizer",
"DeSocialSecurityRecognizer",
"DeTaxIdRecognizer",
"DeTelematikIdRecognizer",
"DeVatCodeRecognizer",
"CryptoRecognizer",
"DateRecognizer",
"EmailRecognizer",
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
"""Germany-specific recognizers."""

# Re-export each recognizer class from its own module so callers can import
# it directly from this package.
from .de_bsnr_recognizer import DeBsnrRecognizer
from .de_commercial_register_recognizer import DeCommercialRegisterRecognizer
from .de_driver_license_recognizer import DeDriverLicenseRecognizer
from .de_kvnr_recognizer import DeKvnrRecognizer
from .de_lanr_recognizer import DeLanrRecognizer
from .de_license_plate_recognizer import DeLicensePlateRecognizer
from .de_passport_recognizer import DePassportRecognizer
from .de_personal_id_recognizer import DePersonalIdRecognizer
from .de_postal_code_recognizer import DePostalCodeRecognizer
from .de_social_security_recognizer import DeSocialSecurityRecognizer
from .de_tax_id_recognizer import DeTaxIdRecognizer
from .de_telematik_id_recognizer import DeTelematikIdRecognizer
from .de_vat_code_recognizer import DeVatCodeRecognizer

# Public API of the package: one entry per class imported above, kept in the
# same (alphabetical) order as the imports.
__all__ = [
"DeBsnrRecognizer",
"DeCommercialRegisterRecognizer",
"DeDriverLicenseRecognizer",
"DeKvnrRecognizer",
"DeLanrRecognizer",
"DeLicensePlateRecognizer",
"DePassportRecognizer",
"DePersonalIdRecognizer",
"DePostalCodeRecognizer",
"DeSocialSecurityRecognizer",
"DeTaxIdRecognizer",
"DeTelematikIdRecognizer",
"DeVatCodeRecognizer",
]
Original file line number Diff line number Diff line change
@@ -0,0 +1,142 @@
from typing import List, Optional, Tuple

from presidio_analyzer import EntityRecognizer, Pattern, PatternRecognizer


class DeBsnrRecognizer(PatternRecognizer):
    """
    Recognize German BSNR (Betriebsstättennummer) using regex and validation.

    The BSNR is a facility number that uniquely identifies the location of
    service provision in the German statutory health insurance system:
    - 9 digits total
    - Digits 1-2: KV state/regional association code (see VALID_KV_CODES)
    - Digits 3-7: Facility identifier (assigned by KV)
    - Digits 8-9: Additional digits (often "00" for older BSNRs)

    The BSNR appears in prescriptions, discharge letters, and billing documents,
    identifying the treatment facility. This is quasi-PII as it can narrow down
    where a patient received treatment.

    Legal basis: §75 Abs. 7 SGB V
    Issuing authority: Kassenärztliche Vereinigungen (KV)
    Source: KBV Arztnummern-Richtlinie Anlage 1

    :param patterns: List of patterns to be used by this recognizer
    :param context: List of context words to increase confidence in detection
    :param supported_language: Language this recognizer supports
    :param supported_entity: The entity this recognizer can detect
    :param replacement_pairs: List of tuples with potential replacement values
        applied to the matched text before validation
    :param name: Optional recognizer name, forwarded to the base class
    """

    # Pattern source: https://wiki.hl7.de/index.php/LANR_und_BSNR

    # Length of a BSNR in digits.
    _BSNR_LENGTH = 9

    # Valid KV region codes per KBV Arztnummern-Richtlinie Anlage 1
    VALID_KV_CODES = {
        # Standard KV regions
        "01",  # Schleswig-Holstein
        "02",  # Hamburg
        "03",  # Bremen
        "17",  # Niedersachsen
        "20",  # Westfalen-Lippe
        "38",  # Nordrhein
        "46",  # Hessen
        "51",  # Rheinland-Pfalz
        "52",  # Baden-Württemberg
        "71",  # Bayern
        "72",  # Berlin
        "73",  # Saarland
        "74",  # KBV (Kassenärztliche Bundesvereinigung)
        "78",  # Mecklenburg-Vorpommern
        "83",  # Brandenburg
        "88",  # Sachsen-Anhalt
        "93",  # Thüringen
        "98",  # Sachsen
        # Special codes for hospitals (Anlage 8 BMV-Ä)
        "35",  # Krankenhäuser
    }

    PATTERNS = [
        Pattern(
            "BSNR (9 digits)",
            r"\b[0-9]{9}\b",
            0.05,  # Very low score - requires context or valid KV code
        ),
        # NOTE(review): this pattern matches the label together with the
        # number; validate_result() therefore has to tolerate the label
        # prefix. The reported span presumably includes the label as well -
        # confirm against PatternRecognizer's handling of capture groups.
        Pattern(
            "BSNR (with context)",
            r"(?i)(?:bsnr|betriebsstättennummer|betriebsstaetten-nr|betriebsstätten-nr)"
            r"[\s:]*([0-9]{9})\b",
            0.5,
        ),
    ]

    CONTEXT = [
        "bsnr",
        "betriebsstaettennummer",  # ASCII transliteration
        "betriebsstättennummer",  # With umlaut
        "betriebsstaetten-nr",  # ASCII transliteration
        "betriebsstätten-nr",  # With umlaut
        "facility number",
        "praxis",
        "praxisnummer",
        "behandlungsort",
        "einrichtung",
        "klinik",
        "krankenhaus",
        "behandlungsstelle",
    ]

    def __init__(
        self,
        patterns: Optional[List[Pattern]] = None,
        context: Optional[List[str]] = None,
        supported_language: str = "de",
        supported_entity: str = "DE_BSNR",
        replacement_pairs: Optional[List[Tuple[str, str]]] = None,
        name: Optional[str] = None,
    ):
        # Separators stripped from a match before it is validated.
        self.replacement_pairs = (
            replacement_pairs
            if replacement_pairs
            else [("-", ""), (" ", ""), (".", "")]
        )
        patterns = patterns if patterns else self.PATTERNS
        context = context if context else self.CONTEXT
        super().__init__(
            supported_entity=supported_entity,
            patterns=patterns,
            context=context,
            supported_language=supported_language,
            name=name,
        )

    def validate_result(self, pattern_text: str) -> Optional[bool]:
        """
        Validate the BSNR format using KV regional code validation.

        Validates that the first 2 digits match a valid KV region code
        per KBV Arztnummern-Richtlinie Anlage 1.

        The matched text may carry a label in front of the number (as produced
        by the "BSNR (with context)" pattern, e.g. ``"BSNR: 123456789"``).
        Only the trailing 9 characters are treated as the BSNR candidate; a
        digit-free prefix is ignored, a prefix containing digits is rejected.

        :param pattern_text: Text detected as pattern by regex
        :return: True if valid KV code, False if invalid, None if uncertain
        """
        sanitized = EntityRecognizer.sanitize_value(
            pattern_text, self.replacement_pairs
        )

        # Split into an optional label prefix and the 9-character candidate.
        # For inputs shorter than 9 characters the candidate is the whole
        # string and fails the length check below.
        prefix = sanitized[: -self._BSNR_LENGTH]
        candidate = sanitized[-self._BSNR_LENGTH :]

        if len(candidate) != self._BSNR_LENGTH or not candidate.isdigit():
            return False

        # A digit in the prefix means the number has more than 9 digits.
        if any(char.isdigit() for char in prefix):
            return False

        # Basic validation: BSNR should not be all zeros
        if candidate == "0" * self._BSNR_LENGTH:
            return False

        # Validate KV regional code (digits 1-2)
        if candidate[:2] in self.VALID_KV_CODES:
            # Valid KV code - increase confidence
            return True

        # Unknown KV code - could be valid (historic or special cases),
        # so leave the pattern score untouched by returning None
        return None
Loading
Loading