diff --git a/evaluation/ai-assistant/backend/models.py b/evaluation/ai-assistant/backend/models.py
index 03c91219d..f4942d1d2 100644
--- a/evaluation/ai-assistant/backend/models.py
+++ b/evaluation/ai-assistant/backend/models.py
@@ -109,8 +109,15 @@ class SetupConfig(BaseModel):
     run_llm: bool = True
 
 
+class SamplingMethod(str, Enum):
+    random = "random"
+    length = "length"
+
+
 class SamplingConfig(BaseModel):
+    dataset_id: str
     sample_size: int = 500
+    method: SamplingMethod = SamplingMethod.random
 
 
 class AnalysisStatus(BaseModel):
diff --git a/evaluation/ai-assistant/backend/pyproject.toml b/evaluation/ai-assistant/backend/pyproject.toml
index fb1e2001b..f0bb6035d 100644
--- a/evaluation/ai-assistant/backend/pyproject.toml
+++ b/evaluation/ai-assistant/backend/pyproject.toml
@@ -10,6 +10,8 @@ fastapi = ">=0.115.0"
 uvicorn = { version = ">=0.32.0", extras = ["standard"] }
 pydantic = ">=2.0.0"
 python-multipart = ">=0.0.9"
+pandas = ">=2.0.0"
+scikit-learn = ">=1.3.0"
 
 [build-system]
 requires = ["poetry-core"]
diff --git a/evaluation/ai-assistant/backend/routers/sampling.py b/evaluation/ai-assistant/backend/routers/sampling.py
index 50ffe2bcf..d32dfb53b 100644
--- a/evaluation/ai-assistant/backend/routers/sampling.py
+++ b/evaluation/ai-assistant/backend/routers/sampling.py
@@ -1,14 +1,90 @@
-from fastapi import APIRouter
-from models import SamplingConfig
+import pandas as pd
+from fastapi import APIRouter, HTTPException
+from models import Record, SamplingConfig, SamplingMethod
+from routers.upload import _records as uploaded_records
 
 router = APIRouter(prefix="/api/sampling", tags=["sampling"])
 
+# Sampled records available for downstream steps
+sampled_records: list[Record] = []
+
+
+def _sample_random(df: pd.DataFrame, n: int) -> pd.DataFrame:
+    return df.sample(n=n, random_state=42)
+
+
+def _sample_length(df: pd.DataFrame, n: int) -> pd.DataFrame:
+    """Stratified sampling by text length buckets (short / medium / long)."""
+    lengths = df["text"].str.len()
+    terciles = lengths.quantile([1 / 3, 2 / 3])
+    df = df.copy()
+    df["_len_bucket"] = pd.cut(
+        lengths,
+        bins=[-1, terciles.iloc[0], terciles.iloc[1], lengths.max() + 1],
+        labels=["short", "medium", "long"],
+    )
+    per_bucket = max(1, n // 3)
+    remainder = n - per_bucket * 3
+    parts: list[pd.DataFrame] = []
+    for bucket in ["short", "medium", "long"]:
+        group = df[df["_len_bucket"] == bucket]
+        take = min(per_bucket, len(group))
+        parts.append(group.sample(n=take, random_state=42))
+    collected = pd.concat(parts)
+    # fill any remaining quota from the full set
+    if len(collected) < n:
+        remaining = df.drop(collected.index)
+        extra = min(n - len(collected), len(remaining))
+        if extra > 0:
+            collected = pd.concat(
+                [collected, remaining.sample(n=extra, random_state=42)]
+            )
+    return collected.drop(columns=["_len_bucket"])
+
+
+_SAMPLERS = {
+    SamplingMethod.random: _sample_random,
+    SamplingMethod.length: _sample_length,
+}
+
 
 @router.post("")
 async def configure_sampling(config: SamplingConfig):
-    """Accept sampling configuration and return a summary."""
+    """Sample records from the loaded dataset."""
+    global sampled_records
+
+    records = uploaded_records.get(config.dataset_id)
+    if not records:
+        raise HTTPException(
+            status_code=404,
+            detail=f"Dataset '{config.dataset_id}' not found.",
+        )
+
+    total = len(records)
+    sample_size = min(config.sample_size, total)
+
+    if sample_size <= 0:
+        raise HTTPException(
+            status_code=400,
+            detail="Sample size must be greater than 0.",
+        )
+
+    df = pd.DataFrame([r.model_dump() for r in records])
+    sampler = _SAMPLERS[config.method]
+    sampled_df = sampler(df, sample_size)
+    sampled_records = [
+        Record(**row) for row in sampled_df.to_dict(orient="records")
+    ]
+
     return {
-        "sample_size": config.sample_size,
-        "method": "stratified_random",
+        "sample_size": len(sampled_records),
+        "total_records": total,
+        "method": config.method.value,
         "status": "ready",
     }
+
+
+@router.get("/records")
+async def get_sampled_records():
+    """Return the 
current set of sampled records.""" + return sampled_records diff --git a/evaluation/ai-assistant/src/app/pages/Anonymization.tsx b/evaluation/ai-assistant/src/app/pages/Anonymization.tsx index 2077ab2c4..12eda26a9 100644 --- a/evaluation/ai-assistant/src/app/pages/Anonymization.tsx +++ b/evaluation/ai-assistant/src/app/pages/Anonymization.tsx @@ -1,11 +1,9 @@ -import { useState, useEffect, useMemo } from 'react'; +import { useMemo } from 'react'; import { useNavigate } from 'react-router'; import { Card } from '../components/ui/card'; import { Button } from '../components/ui/button'; -import { Progress } from '../components/ui/progress'; -import { Badge } from '../components/ui/badge'; import { Alert, AlertDescription } from '../components/ui/alert'; -import { ArrowRight, Loader2, CheckCircle, Shield, Sparkles, AlertTriangle, Database } from 'lucide-react'; +import { ArrowRight, Shield, Sparkles, Database } from 'lucide-react'; import type { SetupConfig } from '../types'; export function Anonymization() { @@ -20,52 +18,8 @@ export function Anonymization() { } }, []); - const runPresidio = setupConfig?.runPresidio ?? true; - const runLlm = setupConfig?.runLlm ?? true; const hasDatasetEntities = setupConfig?.hasDatasetEntities ?? false; - const [presidioProgress, setPresidioProgress] = useState(runPresidio ? 0 : 100); - const [llmProgress, setLlmProgress] = useState(runLlm ? 
0 : 100); - const [presidioComplete, setPresidioComplete] = useState(!runPresidio); - const [llmComplete, setLlmComplete] = useState(!runLlm); - - const isComplete = presidioComplete && llmComplete; - - useEffect(() => { - if (!runPresidio && !runLlm) return; // nothing to simulate - - if (runPresidio) { - const presidioInterval = setInterval(() => { - setPresidioProgress((prev) => { - if (prev >= 100) { - clearInterval(presidioInterval); - setPresidioComplete(true); - return 100; - } - return prev + 2; - }); - }, 50); - return () => clearInterval(presidioInterval); - } - }, [runPresidio]); - - useEffect(() => { - if (!runLlm) return; - const timer = setTimeout(() => { - const llmInterval = setInterval(() => { - setLlmProgress((prev) => { - if (prev >= 100) { - clearInterval(llmInterval); - setLlmComplete(true); - return 100; - } - return prev + 1.5; - }); - }, 80); - }, runPresidio ? 500 : 0); - return () => clearTimeout(timer); - }, [runLlm, runPresidio]); - const handleContinue = () => { navigate('/human-review'); }; @@ -75,13 +29,7 @@ export function Anonymization() {
- {runPresidio && runLlm - ? 'Running Presidio and LLM analysis in parallel to detect PII entities across sampled records.' - : runPresidio - ? 'Running Presidio analysis to detect PII entities across sampled records.' - : runLlm - ? 'Running LLM analysis to detect PII entities across sampled records.' - : 'Using dataset-provided entities. No additional detection selected.'} + Automated PII detection engines will run here once implemented.
Baseline configuration v1.2
-- {!presidioComplete ? 'Processing Records...' : 'Complete'} -
-Baseline PII detection
++ Run Presidio's rule-based and NLP detection to identify PII entities with precise character spans and confidence scores. +
Azure OpenAI - GPT-4
-- {!llmComplete ? 'AI Judge Analyzing...' : 'Complete'} -
-AI-assisted entity detection
++ Use an LLM to suggest additional PII entities and validate detections. Results will be combined with Presidio output for human review. +
- Proceeding with dataset-provided entities only. Continue to human review. -
-+ TF-IDF vectorization + greedy max-min distance to maximise topical diversity in the sample. +
- Your dataset includes pre-identified entities. Choose which additional detection to run: +
+ Your dataset includes pre-identified entities. Additional detection engines will be available soon:
- Only dataset-provided entities will be used for tagging. -
- )}