diff --git a/evaluation/ai-assistant/backend/models.py b/evaluation/ai-assistant/backend/models.py index 03c91219d..f4942d1d2 100644 --- a/evaluation/ai-assistant/backend/models.py +++ b/evaluation/ai-assistant/backend/models.py @@ -109,8 +109,15 @@ class SetupConfig(BaseModel): run_llm: bool = True +class SamplingMethod(str, Enum): + random = "random" + length = "length" + + class SamplingConfig(BaseModel): + dataset_id: str sample_size: int = 500 + method: SamplingMethod = SamplingMethod.random class AnalysisStatus(BaseModel): diff --git a/evaluation/ai-assistant/backend/pyproject.toml b/evaluation/ai-assistant/backend/pyproject.toml index fb1e2001b..f0bb6035d 100644 --- a/evaluation/ai-assistant/backend/pyproject.toml +++ b/evaluation/ai-assistant/backend/pyproject.toml @@ -10,6 +10,8 @@ fastapi = ">=0.115.0" uvicorn = { version = ">=0.32.0", extras = ["standard"] } pydantic = ">=2.0.0" python-multipart = ">=0.0.9" +pandas = ">=2.0.0" +scikit-learn = ">=1.3.0" [build-system] requires = ["poetry-core"] diff --git a/evaluation/ai-assistant/backend/routers/sampling.py b/evaluation/ai-assistant/backend/routers/sampling.py index 50ffe2bcf..d32dfb53b 100644 --- a/evaluation/ai-assistant/backend/routers/sampling.py +++ b/evaluation/ai-assistant/backend/routers/sampling.py @@ -1,14 +1,90 @@ -from fastapi import APIRouter -from models import SamplingConfig +import pandas as pd +from fastapi import APIRouter, HTTPException +from models import Record, SamplingConfig, SamplingMethod +from routers.upload import _records as uploaded_records router = APIRouter(prefix="/api/sampling", tags=["sampling"]) +# Sampled records available for downstream steps +sampled_records: list[Record] = [] + + +def _sample_random(df: pd.DataFrame, n: int) -> pd.DataFrame: + return df.sample(n=n, random_state=42) + + +def _sample_length(df: pd.DataFrame, n: int) -> pd.DataFrame: + """Stratified sampling by text length buckets (short / medium / long).""" + lengths = df["text"].str.len() + terciles = lengths.quantile([1 / 3, 2 / 3]) + df = df.copy() + df["_len_bucket"] = pd.cut( + lengths, + bins=[-1, terciles.iloc[0], terciles.iloc[1], lengths.max() + 1], + labels=["short", "medium", "long"], + ) + per_bucket = max(1, n // 3) + remainder = n - per_bucket * 3 + parts: list[pd.DataFrame] = [] + for bucket in ["short", "medium", "long"]: + group = df[df["_len_bucket"] == bucket] + take = min(per_bucket, len(group)) + parts.append(group.sample(n=take, random_state=42)) + collected = pd.concat(parts) + # fill any remaining quota from the full set + if len(collected) < n: + remaining = df.drop(collected.index) + extra = min(n - len(collected), len(remaining)) + if extra > 0: + collected = pd.concat( + [collected, remaining.sample(n=extra, random_state=42)] + ) + return collected.drop(columns=["_len_bucket"]) + + +_SAMPLERS = { + SamplingMethod.random: _sample_random, + SamplingMethod.length: _sample_length, +} + @router.post("") async def configure_sampling(config: SamplingConfig): - """Accept sampling configuration and return a summary.""" + """Sample records from the loaded dataset.""" + global sampled_records + + records = uploaded_records.get(config.dataset_id) + if not records: + raise HTTPException( + status_code=404, + detail=f"Dataset '{config.dataset_id}' not found.", + ) + + total = len(records) + sample_size = min(config.sample_size, total) + + if sample_size <= 0: + raise HTTPException( + status_code=400, + detail="Sample size must be greater than 0.", + ) + + df = pd.DataFrame([r.model_dump() for r in records]) + sampler = _SAMPLERS[config.method] + sampled_df = sampler(df, sample_size) + sampled_records = [ + Record(**row) for row in sampled_df.to_dict(orient="records") + ] + return { - "sample_size": config.sample_size, - "method": "stratified_random", + "sample_size": len(sampled_records), + "total_records": total, + "method": config.method.value, "status": "ready", } + + +@router.get("/records") +async def get_sampled_records(): + """Return the current set of sampled records.""" + return sampled_records diff --git a/evaluation/ai-assistant/src/app/pages/Anonymization.tsx b/evaluation/ai-assistant/src/app/pages/Anonymization.tsx index 2077ab2c4..12eda26a9 100644 --- a/evaluation/ai-assistant/src/app/pages/Anonymization.tsx +++ b/evaluation/ai-assistant/src/app/pages/Anonymization.tsx @@ -1,11 +1,9 @@ -import { useState, useEffect, useMemo } from 'react'; +import { useMemo } from 'react'; import { useNavigate } from 'react-router'; import { Card } from '../components/ui/card'; import { Button } from '../components/ui/button'; -import { Progress } from '../components/ui/progress'; -import { Badge } from '../components/ui/badge'; import { Alert, AlertDescription } from '../components/ui/alert'; -import { ArrowRight, Loader2, CheckCircle, Shield, Sparkles, AlertTriangle, Database } from 'lucide-react'; +import { ArrowRight, Shield, Sparkles, Database } from 'lucide-react'; import type { SetupConfig } from '../types'; export function Anonymization() { @@ -20,52 +18,8 @@ export function Anonymization() { } }, []); - const runPresidio = setupConfig?.runPresidio ?? true; - const runLlm = setupConfig?.runLlm ?? true; const hasDatasetEntities = setupConfig?.hasDatasetEntities ?? false; - const [presidioProgress, setPresidioProgress] = useState(runPresidio ? 0 : 100); - const [llmProgress, setLlmProgress] = useState(runLlm ? 0 : 100); - const [presidioComplete, setPresidioComplete] = useState(!runPresidio); - const [llmComplete, setLlmComplete] = useState(!runLlm); - - const isComplete = presidioComplete && llmComplete; - - useEffect(() => { - if (!runPresidio && !runLlm) return; // nothing to simulate - - if (runPresidio) { - const presidioInterval = setInterval(() => { - setPresidioProgress((prev) => { - if (prev >= 100) { - clearInterval(presidioInterval); - setPresidioComplete(true); - return 100; - } - return prev + 2; - }); - }, 50); - return () => clearInterval(presidioInterval); - } - }, [runPresidio]); - - useEffect(() => { - if (!runLlm) return; - const timer = setTimeout(() => { - const llmInterval = setInterval(() => { - setLlmProgress((prev) => { - if (prev >= 100) { - clearInterval(llmInterval); - setLlmComplete(true); - return 100; - } - return prev + 1.5; - }); - }, 80); - }, runPresidio ? 500 : 0); - return () => clearTimeout(timer); - }, [runLlm, runPresidio]); - const handleContinue = () => { navigate('/human-review'); }; @@ -75,13 +29,7 @@ export function Anonymization() {

PII Detection Analysis

- {runPresidio && runLlm - ? 'Running Presidio and LLM analysis in parallel to detect PII entities across sampled records.' - : runPresidio - ? 'Running Presidio analysis to detect PII entities across sampled records.' - : runLlm - ? 'Running LLM analysis to detect PII entities across sampled records.' - : 'Using dataset-provided entities. No additional detection selected.'} + Automated PII detection engines will run here once implemented.

@@ -100,249 +48,54 @@ export function Anonymization() { )} - {/* Important Notice */} - {runLlm && ( - - - -
-
LLM is Assistive Only
-
- The AI Judge may miss entities or lack exact character spans. Its suggestions will be - combined with Presidio results for human review - it does not have final authority. -
-
-
-
- )} - - {/* Side-by-Side Processing */} - {(runPresidio || runLlm) ? ( -
- {/* Presidio Processing */} - {runPresidio && ( - + {/* Side-by-Side Cards — greyed out / coming soon */} +
+ {/* Presidio Processing — not implemented */} +
-
- -
-

Presidio Anonymization

-

Baseline configuration v1.2

-
-
- -
- {!presidioComplete ? ( - - ) : ( - - )} -
- -
-

- {!presidioComplete ? 'Processing Records...' : 'Complete'} -

-
- -
-
- Progress - {presidioProgress}% +
+
+ +
+

Presidio Analysis

+

Baseline PII detection

+
- + Coming soon
- {presidioComplete && ( -
-
-
-
500
-
Records
-
-
-
1,247
-
Entities
-
-
-
12
-
Types
-
-
-
91%
-
Avg. Conf.
-
-
-
- )} +

+ Run Presidio's rule-based and NLP detection to identify PII entities with precise character spans and confidence scores. +

- )} - {/* LLM Processing */} - {runLlm && ( - + {/* LLM Processing — not implemented */} +
-
- -
-

LLM-based PII Judging

-

Azure OpenAI - GPT-4

-
-
- -
- {!llmComplete ? ( - - ) : ( - - )} -
- -
-

- {!llmComplete ? 'AI Judge Analyzing...' : 'Complete'} -

-
- -
-
- Progress - {llmProgress}% +
+
+ +
+

LLM Judge

+

AI-assisted entity detection

+
- + Coming soon
- {llmComplete && ( -
-
-
-
500
-
Records
-
-
-
1,312
-
Entities
-
-
-
65
-
Additional
-
-
-
87%
-
Avg. Conf.
-
-
-
- )} +

+ Use an LLM to suggest additional PII entities and validate detections. Results will be combined with Presidio output for human review. +

- )}
- ) : ( - -
- -
-

No additional detection needed

-

- Proceeding with dataset-provided entities only. Continue to human review. -

-
-
-
- )} - - {/* Combined Results */} - {isComplete && ( - <> - -
-
- -

Analysis Complete - Ready for Human Review

-
- -
-
Comparison Summary:
-
- - ✓ 1,182 Matches - - - ⚠ 47 Conflicts - - - + 65 LLM-only - - - − 18 Presidio-only - -
-
-
-
- - -
-

Detected Entity Types

-
- {['PERSON', 'EMAIL', 'PHONE_NUMBER', 'SSN', 'CREDIT_CARD', 'DATE_OF_BIRTH', - 'MEDICAL_RECORD', 'IP_ADDRESS', 'EMPLOYEE_ID', 'ADDRESS', 'ORGANIZATION', 'DATE'].map(type => ( - - {type} - - ))} -
-
-
- - -
-

Output Generated

-
-
-
Presidio Output:
-
- -
Detected entities with precise character positions
-
-
- -
Anonymized text with PII replaced
-
-
- -
Confidence scores for each detection
-
-
-
-
LLM Output:
-
- -
Suggested entities and types
-
-
- -
Additional detections for review
-
-
- -
Approximate spans (may need correction)
-
-
-
-
-
- - )} {/* Actions */}
@@ -57,8 +112,7 @@ export function Sampling() {
- Larger samples provide more accurate evaluation metrics but require more manual review time. - We recommend 500-800 records for balanced accuracy and efficiency. + Larger samples provide more accurate evaluation metrics but require more manual review time. Choose a size that balances statistical confidence with your available review capacity.
@@ -73,11 +127,45 @@ export function Sampling() {

Sampling Method

-
-
Stratified Random Sampling
-
- Records are randomly selected while maintaining proportional representation across data segments. - This ensures statistical validity and repeatability. + setSamplingMethod(v as SamplingMethod)} + className="space-y-3" + > + {METHODS.map(({ value, label, description, icon: Icon }) => ( + + ))} + + + {/* Semantic — coming soon */} +
+
+
+
+ + Semantic Diversity Sampling + Coming soon +
+

+ TF-IDF vectorization + greedy max-min distance to maximise topical diversity in the sample. +

@@ -109,13 +197,17 @@ export function Sampling() {
Sample Summary
+
+ Dataset: + {datasetRecordCount.toLocaleString()} total records +
Sample Size: {sampleSize} records
Method: - Stratified Random + {METHODS.find(m => m.value === samplingMethod)?.label}
@@ -123,7 +215,8 @@ export function Sampling() { {/* Actions */}
- diff --git a/evaluation/ai-assistant/src/app/pages/Setup.tsx b/evaluation/ai-assistant/src/app/pages/Setup.tsx index cc0689c43..c752cc387 100644 --- a/evaluation/ai-assistant/src/app/pages/Setup.tsx +++ b/evaluation/ai-assistant/src/app/pages/Setup.tsx @@ -104,6 +104,7 @@ export function Setup() { hasDatasetEntities: selectedDataset.has_entities, }; sessionStorage.setItem('setupConfig', JSON.stringify(config)); + sessionStorage.setItem('datasetRecordCount', String(selectedDataset.record_count)); navigate('/sampling'); } }; @@ -277,42 +278,40 @@ export function Setup() {
)} - {/* Detection Options — only when dataset has entities */} + {/* Detection Options — not implemented yet */} {selectedDataset.has_entities && ( -
- -

- Your dataset includes pre-identified entities. Choose which additional detection to run: +

+
+ + Coming soon +
+

+ Your dataset includes pre-identified entities. Additional detection engines will be available soon:

setRunPresidio(checked === true)} + checked={false} + disabled /> -
setRunLlm(checked === true)} + checked={false} + disabled /> -
- {!runPresidio && !runLlm && ( -

- Only dataset-provided entities will be used for tagging. -

- )}
)}