# Language models integration (LangExtract) #1775
**Merged**

Changes from 87 commits (114 commits total):
All commits are by RonShakutai unless noted otherwise.

- `8c0e19a` Add LangExtract recognizer for PII extraction
- `e3f68d1` refine the docs
- `811c639` narrow support for oollama only
- `3640527` Refactor LangExtract tests to use Ollama; remove API key dependency
- `9a99a0d` adding first draft of docker compose
- `3c88dd9` Update model_id in tests to use 'gemma2:2b' instead of 'gemini-2.5-fl…
- `feaee53` Refactor LangExtract documentation to focus on Ollama support; remove…
- `013198e` Update README to remove Ollama setup instructions and clarify integra…
- `f8326f9` Enhance Ollama installation script with progress messages and error h…
- `f8b7d03` auto ruff fixes
- `dcc1450` Enhance LangExtractRecognizer tests with real Ollama integration
- `64d0f09` Remove unnecessary line breaks in LLM-based PII detection section of …
- `0905619` Add LangExtract LLM-based PII detection test and configuration
- `d5dde47` Improve Ollama availability check with setup attempt message
- `0afa506` Increase wait time for services and update healthcheck parameters for…
- `7fbf111` Add Ollama setup for Analyzer tests and improve availability check
- `31c0f93` Set timeout for Ollama setup in Analyzer tests to 8 minutes
- `f087983` Enhance Ollama setup for Analyzer tests with improved installation an…
- `ed9917f` Update Ollama model references from gemma2:2b to llama3.2:1b across c…
- `ffa44e1` Update model references from llama3.2:1b to gemma2:2b across configur…
- `1e0d41b` Remove 'enabled' configuration from LangExtract settings in YAML file…
- `574312f` Update Ollama service configuration: change port mapping and modify h…
- `304dbf3` Refactor logging in LangExtractRecognizer: reduce verbosity and impro…
- `f808daf` Update Ollama service configuration: modify port mapping and healthch…
- `27ee08b` Update CI workflow and tests: reduce sleep duration and add environme…
- `1b9e97d` Reduce sleep duration in CI workflow from 150 to 60 seconds
- `f9e5435` Update LangExtract model references from gemma2:2b to gemma3:1b and r…
- `299f0ea` docs and prompt fixes
- `2e593b9` finalizing the pr
- `109a25e` Update Ollama image to latest version and add LangExtract PII/PHI ext…
- `054ea1c` fix bad example
- `fe10361` fix unit-tests
- `af51ceb` refactor: clean up .env file and simplify skip_engine logic in tests
- `0e866dd` Merge branch 'main' of https://github.com/microsoft/presidio into fea…
- `159c955` chore: add a new line to .env file for better readability
- `a4a63c4` chore: remove unnecessary blank line from .env file
- `320fe9e` revert .env
- `d77958a` intial commit
- `286b78a` Remove skip marker for spacy_nlp_engine fixture
- `b50a6be` Remove skip markers for stanza and transformers NLP engine fixtures
- `d7a91c2` Merge branch 'fix-ci-unit-tests' of https://github.com/microsoft/pres…
- `c51a695` Remove Ollama recognizer test and update default recognizers configur…
- `f56b97e` Remove unused Ollama recognizer configuration and update prompt file …
- `92b1136` Add end-to-end tests for API anonymization and redaction features
- `253c220` Merge branch 'main' of https://github.com/microsoft/presidio into fea…
- `4b82e84` Remove unused Ollama recognizer imports and related tests
- `8c81ca7` Update requirements and improve Ollama recognizer availability checks…
- `d205959` Fix formatting in requirements.txt for analyzer and anonymizer depend…
- `2ee5179` Update Ollama model ID from gemma3:1b to gemma2:2b in configuration a…
- `1dc2328` gemma2:2b
- `26d5097` finalizing the pr
- `7684c79` Remove unused ABC import from lm_recognizer.py
- `097ef84` Fix indentation in docker-compose.yml for volumes section
- `e693ca6` Fix line break for clarity in adding_recognizers.md
- `95e559a` Add timeout settings for Ollama recognizer and test cases
- `7b7f8fa` Refactor timeout comment for clarity in OllamaLangExtractRecognizer
- `f10a103` Update Ollama model version and add configuration for LangExtract rec…
- `703aab6` Update examples_file path in configuration for Ollama recognizer
- `034a1bf` Remove timeout decorator from Ollama recognizer
- `768cb0e` Add rerun settings to unit and E2E tests for improved stability
- `0f8fad6` Remove rerun settings from unit and E2E test commands for simplification
- `aed8a2d` Set max-parallel to 2 for local build and E2E tests
- `5e4e0a0` Remove max-parallel setting from local build and E2E tests
- `0220c5b` move poetry cache dir (tamirkamara)
- `f30bb7a` Merge branch 'tamirkamara/fix-disk-space' of https://github.com/micro…
- `7b5f75a` Merge branch 'main' into feature-langextract (SharonHart)
- `2e8165c` pr changes
- `0ef9d31` code review changes
- `be7ee67` ruff check
- `1e1e107` remove unused json import in test_ollama_recognizer.py
- `083e853` refactor test names for clarity and consistency in test_ollama_recogn…
- `3a339b7` finalizing the PR
- `43a196e` Merge branch 'main' into feature-langextract
- `d377a56` self code review fixes
- `e27faa7` Merge branch 'feature-langextract' of https://github.com/microsoft/pr…
- `f9097ff` Refactor LangExtractRecognizer to use yaml for configuration loading
- `68275e3` ruff fixes
- `21fb33c` Update error messages in OllamaLangExtractRecognizer tests for clarity
- `679b363` CR comment addressed
- `73cb9d1` Remove unused variables from Jinja2 prompt rendering in LangExtractRe…
- `da1c563` exporting functionality to helpers enlarging composition
- `974eff6` composition
- `5039eb2` Refactor entity mapper and langextract utilities for improved clarity…
- `f78ec8e` Refactor tests to use get_langextract_module for mocking LangExtract …
- `b27b09c` Refactor langextract utilities to improve clarity and error handling;…
- `bb58002` Refactor LLM utilities by simplifying docstrings and consolidating im…
- `08ef556` Update error message for missing Jinja2 installation to include poetr…
- `a2d4db7` Add Ollama recognizer configuration and tests for YAML integration
- `09a02f4` Refactor docstrings in OllamaLangExtractRecognizer for improved clari…
- `da16ede` Enhance OllamaLangExtractRecognizer initialization docstring to clari…
- `6f72919` Merge branch 'main' into feature-langextract
- `7185449` pr comments
- `d45ff0c` Refactor OllamaLangExtractRecognizer to streamline config path handli…
- `6b027bb` Merge branch 'feature-langextract' of https://github.com/microsoft/pr…
- `d5e05e0` Refactor Ollama recognizer test to improve clarity and enhance entity…
- `651ecce` Refactor tests for Ollama recognizer and LMRecognizer to improve exce…
- `95f5a6a` Update config path for Ollama recognizer in test configuration
- `85f1c7a` Update config paths for Ollama recognizer and add test configuration …
- `f87ae78` Remove test configuration for Ollama LangExtract
- `8158cfd` Remove test configuration for Ollama LangExtract recognizer
- `025a5b0` Fix formatting in resolve_config_path function for improved readability
- `0b6c62f` Enable UsLangExtractRecognizer and update its config path
- `1fe8a2b` change all configs to use gemma3:1b
- `f492cf5` Disable Ollama LangExtract recognizer and update its configuration path
- `94320e0` Update langextract configuration paths to use absolute paths for prom…
- `4024901` Remove test script for Ollama recognizer configuration loading
- `8397d43` Refactor config loading in examples and prompt loaders to use resolve…
- `2b42d6f` Refactor parameter description in load_yaml_examples and clean up imp…
- `6af3bb9` Update langextract paths to use repo-root-relative paths in tests and…
- `4dfe7fd` Enhance documentation for Ollama setup and improve __init__.py import…
- `54418fb` code review changes
- `dcb9f8c` Merge branch 'main' of https://github.com/microsoft/presidio into fea…
- `e682f60` pr comments & align to main
- `c43dd92` Merge branch 'main' into feature-langextract
# Language Model-based PII/PHI Detection

## Introduction
Presidio supports language model-based PII/PHI detection for flexible entity recognition using language models (LLMs, SLMs, etc.). This approach enables detection of both:

- **PII (Personally Identifiable Information)**: names, emails, phone numbers, SSNs, credit cards, etc.
- **PHI (Protected Health Information)**: medical records, health identifiers, etc.

The current implementation uses [LangExtract](https://github.com/google/langextract) with **Ollama** for local model deployment. Additional provider integrations are planned.
## Entity Detection Capabilities

Unlike pattern-based recognizers, language model-based detection is flexible and depends on:

- The language model being used
- The prompt description provided
- The few-shot examples configured

The default configuration includes examples for common PII/PHI entities such as PERSON, EMAIL_ADDRESS, PHONE_NUMBER, US_SSN, CREDIT_CARD, MEDICAL_LICENSE, and more. **You can customize the prompts and examples to detect any entity types relevant to your use case.**

For the default entity mappings and examples, see the [default configuration](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/conf/langextract_config_ollama.yaml).
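Few-shot examples are what anchor the model to your entity types. The actual schema lives in the examples YAML file shipped with the recognizer; the dict layout below is an illustrative sketch, not LangExtract's real API, but it shows the one invariant that matters: every annotated span must occur verbatim in its example text.

```python
# Hedged sketch of a few-shot example entry; field names ("class", "span")
# are illustrative, not the actual LangExtract schema.
few_shot_examples = [
    {
        "text": "Contact Jane Doe at jane@example.com or 555-0100.",
        "extractions": [
            {"class": "person", "span": "Jane Doe"},
            {"class": "email", "span": "jane@example.com"},
            {"class": "phone", "span": "555-0100"},
        ],
    },
]

# Sanity check: a span that does not literally appear in the text would show
# the model an impossible extraction and degrade its output.
for ex in few_shot_examples:
    for e in ex["extractions"]:
        assert e["span"] in ex["text"]
print("examples consistent")
```

The same check is worth running against any custom examples file before pointing the recognizer at it.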
## Prerequisites

### Setting up Ollama

You have two options for setting up Ollama:

**Option 1: Docker Compose** (recommended)

```bash
# Start the Ollama service
docker compose up -d ollama

# Pull the language model (required; takes ~1-2 minutes)
docker exec -it presidio-ollama-1 ollama pull gemma2:2b
```

**Option 2: Manual setup**

Follow the [official LangExtract Ollama guide](https://github.com/google/langextract?tab=readme-ov-file#using-local-llms-with-ollama).

!!! note "Note"
    The model must be pulled before using the recognizer. The default model is `gemma2:2b` (~1.6GB).
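Before wiring the recognizer into an analyzer, it can help to confirm the server is actually reachable. A minimal stdlib check, assuming Ollama's default URL (`http://localhost:11434`):

```python
import urllib.request
import urllib.error

def is_ollama_reachable(url: str = "http://localhost:11434", timeout: float = 2.0) -> bool:
    """Return True if a server answers HTTP 200 at `url` within `timeout` seconds."""
    try:
        with urllib.request.urlopen(url, timeout=timeout) as resp:
            return resp.status == 200
    except (urllib.error.URLError, OSError):
        # Covers connection refused, DNS failure, and timeouts.
        return False

if __name__ == "__main__":
    print("Ollama reachable:", is_ollama_reachable())
```

A plain Ollama install responds to `GET /` on its root URL, which is enough for a liveness probe; adjust the URL if you remapped the port in Docker Compose.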
## Language Model-based Recognizer Implementation

Presidio provides a hierarchy of recognizers for language model-based PII/PHI detection:

- **`LMRecognizer`**: abstract base class for all language model recognizers (LLMs, SLMs, etc.)
- **`LangExtractRecognizer`**: abstract base class for the LangExtract library integration (model-agnostic)
- **`OllamaLangExtractRecognizer`**: concrete implementation for Ollama local language models

[The implementation can be found here](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/predefined_recognizers/third_party/ollama_langextract_recognizer.py).
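The hierarchy above can be sketched with abstract base classes. This is a simplified illustration of the layering, not Presidio's actual signatures; only the three class names come from the document, and the method names here are hypothetical.

```python
from abc import ABC, abstractmethod

class LMRecognizer(ABC):
    """Base for all language-model-backed recognizers (LLMs, SLMs, etc.)."""

    @abstractmethod
    def analyze_text(self, text: str) -> list:
        """Return detected entities for `text`."""

class LangExtractRecognizer(LMRecognizer, ABC):
    """Adds LangExtract-specific plumbing: prompts, examples, entity mappings."""

    @abstractmethod
    def call_model(self, prompt: str) -> str:
        """Send a rendered prompt to the underlying model."""

class OllamaLangExtractRecognizer(LangExtractRecognizer):
    """Concrete recognizer targeting a local Ollama server."""

    def call_model(self, prompt: str) -> str:
        return ""  # the real class would POST to the Ollama API here

    def analyze_text(self, text: str) -> list:
        return []  # the real class runs LangExtract and maps entities
```

The split keeps provider-specific transport (`OllamaLangExtractRecognizer`) separate from LangExtract orchestration, so further providers can be added by subclassing `LangExtractRecognizer`.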
## How to integrate Language Model-based detection into Presidio

### Option 1: Enable in Configuration (Recommended)

1. Install with LangExtract support and set up Ollama (see Prerequisites above):

```sh
pip install presidio-analyzer[langextract]
```

2. Enable the recognizer in `default_recognizers.yaml`:

```yaml
- name: OllamaLangExtractRecognizer
  enabled: true  # Change from false to true
```
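If you manage that YAML file from a deployment script, the flag can be flipped mechanically. A stdlib-only sketch, assuming the file layout matches the snippet above (a `- name:` line followed by an `enabled:` line); the function name is illustrative:

```python
def enable_recognizer(yaml_text: str, name: str) -> str:
    """Flip `enabled: false` to `enabled: true` for the recognizer named `name`."""
    out, in_target = [], False
    for line in yaml_text.splitlines():
        stripped = line.strip()
        if stripped.startswith("- name:"):
            # Track whether we are inside the target recognizer's entry.
            in_target = stripped.endswith(name)
        if in_target and stripped.startswith("enabled:"):
            line = line.replace("enabled: false", "enabled: true")
        out.append(line)
    return "\n".join(out)

sample = """\
- name: OllamaLangExtractRecognizer
  enabled: false
- name: OtherRecognizer
  enabled: false
"""
print(enable_recognizer(sample, "OllamaLangExtractRecognizer"))
```

A real deployment would more likely round-trip the file with a YAML library, but the string approach preserves comments and ordering, which YAML dumpers usually discard.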
### Option 2: Add Programmatically

```python
from presidio_analyzer import AnalyzerEngine
from presidio_analyzer.predefined_recognizers.third_party.ollama_langextract_recognizer import OllamaLangExtractRecognizer

analyzer = AnalyzerEngine()
analyzer.registry.add_recognizer(OllamaLangExtractRecognizer())

results = analyzer.analyze(text="My email is john.doe@example.com", language="en")
```
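The call above needs a running Ollama server, so this sketch uses a stand-in result object to show how the returned results are typically consumed. Presidio's `RecognizerResult` exposes `entity_type`, `start`, `end`, and `score`; the dataclass below only mimics those fields and the sample score is invented for illustration.

```python
from dataclasses import dataclass

@dataclass
class FakeResult:
    """Stand-in mimicking the fields of presidio_analyzer.RecognizerResult."""
    entity_type: str
    start: int
    end: int
    score: float

text = "My email is john.doe@example.com"
# What a detection for the text above might look like (score is hypothetical).
results = [FakeResult("EMAIL_ADDRESS", 12, 32, 0.85)]

for r in results:
    # start/end are character offsets into the analyzed text.
    print(f"{r.entity_type} [{r.start}:{r.end}] -> {text[r.start:r.end]} (score={r.score})")
```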
### Custom Configuration

To use a custom configuration file:

```python
analyzer.registry.add_recognizer(
    OllamaLangExtractRecognizer(config_path="/path/to/custom_config.yaml")
)
```

!!! note "Note"
    The recognizer is disabled by default in `default_recognizers.yaml` to avoid requiring Ollama for basic Presidio usage. Enable it once Ollama is set up and running.
## Configuration Options

The `langextract_config_ollama.yaml` file supports the following options:

- **`model_id`**: the Ollama model to use (default: `"gemma2:2b"`)
- **`model_url`**: Ollama server URL (default: `"http://localhost:11434"`)
- **`temperature`**: model temperature for generation (default: `null`, i.e. the model's own default)
- **`supported_entities`**: PII/PHI entity types to detect
- **`entity_mappings`**: map LangExtract entity classes to Presidio entity names
- **`min_score`**: minimum confidence score (default: `0.5`)

See the [default configuration](https://github.com/microsoft/presidio/blob/main/presidio-analyzer/presidio_analyzer/conf/langextract_config_ollama.yaml) for complete examples.
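The interplay of `entity_mappings` and `min_score` can be sketched as a post-processing step: raw extraction classes are translated to Presidio entity names, and unmapped or low-confidence hits are dropped. This is a hedged illustration of the likely behavior, not Presidio's actual code; the mapping values mirror entries from the default configuration.

```python
# Subset of the default entity_mappings, plus the default min_score.
ENTITY_MAPPINGS = {"email": "EMAIL_ADDRESS", "phone": "PHONE_NUMBER", "person": "PERSON"}
MIN_SCORE = 0.5

def map_and_filter(raw_results: list[dict]) -> list[dict]:
    """Translate raw extraction classes to Presidio names; drop weak/unmapped hits."""
    mapped = []
    for r in raw_results:
        presidio_name = ENTITY_MAPPINGS.get(r["class"])
        if presidio_name and r["score"] >= MIN_SCORE:
            mapped.append({**r, "entity_type": presidio_name})
    return mapped

raw = [
    {"class": "email", "text": "a@b.com", "score": 0.9},
    {"class": "person", "text": "Jo", "score": 0.3},           # below min_score
    {"class": "payment_status", "text": "paid", "score": 0.8}, # no mapping
]
print(map_and_filter(raw))  # only the email survives
```

Raising `min_score` trades recall for precision; unmapped classes (like `payment_status`, which the config lists under `labels_to_ignore`) never reach the caller.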
## Troubleshooting

**ConnectionError: "Ollama server not reachable"**

- Ensure Ollama is running: `docker ps`, or check `http://localhost:11434`
- Verify that the `model_url` in your configuration matches your Ollama server address

**RuntimeError: "Model 'gemma2:2b' not found"**

- Pull the model: `docker exec -it presidio-ollama-1 ollama pull gemma2:2b`
- Or, for a manual setup: `ollama pull gemma2:2b`
- Verify that the model name matches the `model_id` in your configuration
```diff
@@ -1,4 +1,4 @@
 requests>=2.32.4
 pytest
-file:../presidio-analyzer
-file:../presidio-anonymizer
+-e ../presidio-analyzer[langextract]
+-e ../presidio-anonymizer
```
```yaml
# LMRecognizer base configuration
lm_recognizer:
  supported_entities:
    - PERSON
    - LOCATION
    - ORGANIZATION
    - PHONE_NUMBER
    - EMAIL_ADDRESS
    - DATE_TIME
    - US_SSN
    - CREDIT_CARD
    - MEDICAL_LICENSE
    - IP_ADDRESS
    - URL
    - IBAN_CODE

  labels_to_ignore:
    - payment_status

  enable_generic_consolidation: true
  min_score: 0.5

langextract:
  prompt_file: langextract_prompts/default_pii_phi_prompt.j2
  examples_file: langextract_prompts/default_pii_phi_examples.yaml

  entity_mappings:
    person: PERSON
    full_name: PERSON
    name_first: PERSON
    name_last: PERSON
    name_middle: PERSON
    location: LOCATION
    address: LOCATION
    organization: ORGANIZATION
    phone: PHONE_NUMBER
    phone_number: PHONE_NUMBER
    email: EMAIL_ADDRESS
    date: DATE_TIME
    ssn: US_SSN
    identification_number: US_SSN
    credit_card: CREDIT_CARD
    medical_record: MEDICAL_LICENSE
    ip_address: IP_ADDRESS
    url: URL
    iban: IBAN_CODE

  model:
    model_id: gemma3:1b
    model_url: http://localhost:11434
    temperature: 0.0
```
File renamed without changes.
File renamed without changes.