Skip to content

Commit 9ef83c3

Browse files
Copilot and SharonHart authored
Add configurable timeouts to regex execution (default 60 seconds) (#1904)
* Initial plan
* Add 60-second timeouts to regex operations to prevent ReDoS
  Co-authored-by: SharonHart <15013757+SharonHart@users.noreply.github.com>
* Narrow TimeoutError scope to regex calls only; add exc_info to warnings
  Co-authored-by: SharonHart <15013757+SharonHart@users.noreply.github.com>
* Revert pattern_recognizer.py logic changes; keep original structure with timeout only
  Co-authored-by: SharonHart <15013757+SharonHart@users.noreply.github.com>
* Revert iban_recognizer.py to original lazy iterator structure; add exc_info=True to warnings
  Co-authored-by: SharonHart <15013757+SharonHart@users.noreply.github.com>
* Add IBAN recognizer timeout and empty-match tests to fix CI coverage failure
  Co-authored-by: SharonHart <15013757+SharonHart@users.noreply.github.com>
* Allow REGEX_TIMEOUT_SECONDS to be overridden via environment variable
  Co-authored-by: SharonHart <15013757+SharonHart@users.noreply.github.com>
---------
Co-authored-by: copilot-swe-agent[bot] <198982749+Copilot@users.noreply.github.com>
Co-authored-by: SharonHart <15013757+SharonHart@users.noreply.github.com>
Co-authored-by: Sharon Hart <sharonh.dev@gmail.com>
1 parent 6111d16 commit 9ef83c3

File tree

6 files changed

+286
-109
lines changed

6 files changed

+286
-109
lines changed

presidio-analyzer/presidio_analyzer/analyzer_engine.py

Lines changed: 13 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import json
22
import logging
3+
import os
34
from collections import Counter
45
from typing import List, Optional
56

@@ -22,6 +23,7 @@
2223

2324
logger = logging.getLogger("presidio-analyzer")
2425

26+
REGEX_TIMEOUT_SECONDS = int(os.environ.get("REGEX_TIMEOUT_SECONDS", 60))
2527

2628
class AnalyzerEngine:
2729
"""
@@ -371,7 +373,17 @@ def _remove_allow_list(
371373
word = text[result.start : result.end]
372374

373375
# if the word is not specified to be allowed, keep in the PII entities
374-
if not re_compiled.search(word):
376+
try:
377+
if not re_compiled.search(word, timeout=REGEX_TIMEOUT_SECONDS):
378+
new_results.append(result)
379+
except TimeoutError:
380+
logger.warning(
381+
"Allow list regex timed out after %s seconds"
382+
" (word length: %d), keeping result.",
383+
REGEX_TIMEOUT_SECONDS,
384+
len(word),
385+
exc_info=True,
386+
)
375387
new_results.append(result)
376388
elif allow_list_match == "exact":
377389
for result in results:

presidio-analyzer/presidio_analyzer/pattern_recognizer.py

Lines changed: 61 additions & 48 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
11
import datetime
22
import logging
3+
import os
34
from typing import TYPE_CHECKING, Dict, List, Optional
45

56
import regex as re
@@ -17,6 +18,8 @@
1718

1819
logger = logging.getLogger("presidio-analyzer")
1920

21+
REGEX_TIMEOUT_SECONDS = int(os.environ.get("REGEX_TIMEOUT_SECONDS", 60))
22+
2023

2124
class PatternRecognizer(LocalRecognizer):
2225
"""
@@ -195,60 +198,70 @@ def __analyze_patterns(
195198
pattern.compiled_with_flags = flags
196199
pattern.compiled_regex = re.compile(pattern.regex, flags=flags)
197200

198-
matches = pattern.compiled_regex.finditer(text)
199-
match_time = datetime.datetime.now() - match_start_time
200-
logger.debug(
201-
"--- match_time[%s]: %.6f seconds",
202-
pattern.name,
203-
match_time.total_seconds(),
204-
)
205-
206-
for match in matches:
207-
start, end = match.span()
208-
current_match = text[start:end]
209-
210-
# Skip empty results
211-
if current_match == "":
212-
continue
213-
214-
score = pattern.score
215-
216-
validation_result = self.validate_result(current_match)
217-
description = self.build_regex_explanation(
218-
self.name,
219-
pattern.name,
220-
pattern.regex,
221-
score,
222-
validation_result,
223-
flags,
201+
try:
202+
matches = pattern.compiled_regex.finditer(
203+
text, timeout=REGEX_TIMEOUT_SECONDS
224204
)
225-
pattern_result = RecognizerResult(
226-
entity_type=self.supported_entities[0],
227-
start=start,
228-
end=end,
229-
score=score,
230-
analysis_explanation=description,
231-
recognition_metadata={
232-
RecognizerResult.RECOGNIZER_NAME_KEY: self.name,
233-
RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id,
234-
},
205+
match_time = datetime.datetime.now() - match_start_time
206+
logger.debug(
207+
"--- match_time[%s]: %.6f seconds",
208+
pattern.name,
209+
match_time.total_seconds(),
235210
)
236211

237-
if validation_result is not None:
238-
if validation_result:
239-
pattern_result.score = EntityRecognizer.MAX_SCORE
240-
else:
212+
for match in matches:
213+
start, end = match.span()
214+
current_match = text[start:end]
215+
216+
# Skip empty results
217+
if current_match == "":
218+
continue
219+
220+
score = pattern.score
221+
222+
validation_result = self.validate_result(current_match)
223+
description = self.build_regex_explanation(
224+
self.name,
225+
pattern.name,
226+
pattern.regex,
227+
score,
228+
validation_result,
229+
flags,
230+
)
231+
pattern_result = RecognizerResult(
232+
entity_type=self.supported_entities[0],
233+
start=start,
234+
end=end,
235+
score=score,
236+
analysis_explanation=description,
237+
recognition_metadata={
238+
RecognizerResult.RECOGNIZER_NAME_KEY: self.name,
239+
RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id,
240+
},
241+
)
242+
243+
if validation_result is not None:
244+
if validation_result:
245+
pattern_result.score = EntityRecognizer.MAX_SCORE
246+
else:
247+
pattern_result.score = EntityRecognizer.MIN_SCORE
248+
249+
invalidation_result = self.invalidate_result(current_match)
250+
if invalidation_result is not None and invalidation_result:
241251
pattern_result.score = EntityRecognizer.MIN_SCORE
242252

243-
invalidation_result = self.invalidate_result(current_match)
244-
if invalidation_result is not None and invalidation_result:
245-
pattern_result.score = EntityRecognizer.MIN_SCORE
246-
247-
if pattern_result.score > EntityRecognizer.MIN_SCORE:
248-
results.append(pattern_result)
253+
if pattern_result.score > EntityRecognizer.MIN_SCORE:
254+
results.append(pattern_result)
249255

250-
# Update analysis explanation score following validation or invalidation
251-
description.score = pattern_result.score
256+
# Update analysis explanation score after validation or invalidation
257+
description.score = pattern_result.score
258+
except TimeoutError:
259+
logger.warning(
260+
"Regex pattern '%s' timed out after %s seconds, skipping.",
261+
pattern.name,
262+
REGEX_TIMEOUT_SECONDS,
263+
exc_info=True,
264+
)
252265

253266
results = EntityRecognizer.remove_duplicates(results)
254267
return results

presidio-analyzer/presidio_analyzer/predefined_recognizers/generic/iban_recognizer.py

Lines changed: 71 additions & 49 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,5 @@
11
import logging
2+
import os
23
import string
34
from typing import Dict, List, Optional, Tuple
45

@@ -19,6 +20,7 @@
1920

2021
logger = logging.getLogger("presidio-analyzer")
2122

23+
REGEX_TIMEOUT_SECONDS = int(os.environ.get("REGEX_TIMEOUT_SECONDS", 60))
2224

2325
class IbanRecognizer(PatternRecognizer):
2426
"""
@@ -144,54 +146,64 @@ def __analyze_patterns(self, text: str, flags: int = None):
144146
flags = flags if flags else self.global_regex_flags
145147
results = []
146148
for pattern in self.patterns:
147-
matches = re.finditer(pattern.regex, text, flags=flags)
148-
149-
for match in matches:
150-
for grp_num in reversed(range(1, len(match.groups()) + 1)):
151-
start = match.span(0)[0]
152-
end = (
153-
match.span(grp_num)[1]
154-
if match.span(grp_num)[1] > 0
155-
else match.span(0)[1]
156-
)
157-
current_match = text[start:end]
158-
159-
# Skip empty results
160-
if current_match == "":
161-
continue
162-
163-
score = pattern.score
164-
165-
validation_result = self.validate_result(current_match)
166-
description = PatternRecognizer.build_regex_explanation(
167-
self.name,
168-
pattern.name,
169-
pattern.regex,
170-
score,
171-
validation_result,
172-
flags,
173-
)
174-
pattern_result = RecognizerResult(
175-
entity_type=self.supported_entities[0],
176-
start=start,
177-
end=end,
178-
score=score,
179-
analysis_explanation=description,
180-
recognition_metadata={
181-
RecognizerResult.RECOGNIZER_NAME_KEY: self.name,
182-
RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id,
183-
},
184-
)
185-
186-
if validation_result is not None:
187-
if validation_result:
188-
pattern_result.score = EntityRecognizer.MAX_SCORE
189-
else:
190-
pattern_result.score = EntityRecognizer.MIN_SCORE
191-
192-
if pattern_result.score > EntityRecognizer.MIN_SCORE:
193-
results.append(pattern_result)
194-
break
149+
try:
150+
matches = re.finditer(
151+
pattern.regex, text, flags=flags, timeout=REGEX_TIMEOUT_SECONDS
152+
)
153+
154+
for match in matches:
155+
for grp_num in reversed(range(1, len(match.groups()) + 1)):
156+
start = match.span(0)[0]
157+
end = (
158+
match.span(grp_num)[1]
159+
if match.span(grp_num)[1] > 0
160+
else match.span(0)[1]
161+
)
162+
current_match = text[start:end]
163+
164+
# Skip empty results
165+
if current_match == "":
166+
continue
167+
168+
score = pattern.score
169+
170+
validation_result = self.validate_result(current_match)
171+
description = PatternRecognizer.build_regex_explanation(
172+
self.name,
173+
pattern.name,
174+
pattern.regex,
175+
score,
176+
validation_result,
177+
flags,
178+
)
179+
pattern_result = RecognizerResult(
180+
entity_type=self.supported_entities[0],
181+
start=start,
182+
end=end,
183+
score=score,
184+
analysis_explanation=description,
185+
recognition_metadata={
186+
RecognizerResult.RECOGNIZER_NAME_KEY: self.name,
187+
RecognizerResult.RECOGNIZER_IDENTIFIER_KEY: self.id,
188+
},
189+
)
190+
191+
if validation_result is not None:
192+
if validation_result:
193+
pattern_result.score = EntityRecognizer.MAX_SCORE
194+
else:
195+
pattern_result.score = EntityRecognizer.MIN_SCORE
196+
197+
if pattern_result.score > EntityRecognizer.MIN_SCORE:
198+
results.append(pattern_result)
199+
break
200+
except TimeoutError:
201+
logger.warning(
202+
"Regex pattern '%s' timed out after %s seconds, skipping.",
203+
pattern.name,
204+
REGEX_TIMEOUT_SECONDS,
205+
exc_info=True,
206+
)
195207

196208
return results
197209

@@ -216,6 +228,16 @@ def __is_valid_format(
216228
country_regex = regex_per_country.get(country_code, "")
217229
if bos_eos and country_regex:
218230
country_regex = bos_eos[0] + country_regex + bos_eos[1]
219-
return country_regex and re.match(country_regex, iban, flags=flags)
231+
try:
232+
return country_regex and re.match(
233+
country_regex, iban, flags=flags, timeout=REGEX_TIMEOUT_SECONDS
234+
)
235+
except TimeoutError:
236+
logger.warning(
237+
"IBAN format validation regex timed out after %s seconds.",
238+
REGEX_TIMEOUT_SECONDS,
239+
exc_info=True,
240+
)
241+
return False
220242

221243
return False

presidio-analyzer/tests/test_analyzer_engine.py

Lines changed: 31 additions & 8 deletions
Original file line number | Diff line number | Diff line change
@@ -1,30 +1,28 @@
11
import copy
2+
import re
23
from abc import ABC
34
from contextlib import nullcontext
45
from typing import List, Optional
5-
import re
6+
from unittest.mock import patch
67

78
import pytest
8-
99
from presidio_analyzer import (
1010
AnalyzerEngine,
11-
PatternRecognizer,
11+
EntityRecognizer,
1212
Pattern,
13+
PatternRecognizer,
1314
RecognizerRegistry,
14-
EntityRecognizer,
1515
RecognizerResult,
1616
)
1717
from presidio_analyzer.nlp_engine import (
1818
NlpArtifacts,
1919
SpacyNlpEngine,
2020
)
21-
from presidio_analyzer.recognizer_registry import (
22-
RecognizerRegistryProvider
23-
)
21+
from presidio_analyzer.recognizer_registry import RecognizerRegistryProvider
2422

2523
# noqa: F401
2624
from tests import assert_result
27-
from tests.mocks import NlpEngineMock, AppTracerMock, RecognizerRegistryMock
25+
from tests.mocks import AppTracerMock, NlpEngineMock, RecognizerRegistryMock
2826

2927

3028
@pytest.fixture(scope="module")
@@ -935,3 +933,28 @@ def test_when_multiple_nameless_recognizers_context_is_correct(spacy_nlp_engine)
935933

936934
for recognizer_result in recognizer_results:
937935
assert recognizer_result.score > 0.3
936+
937+
938+
def test_when_regex_allow_list_times_out_then_result_is_kept(loaded_analyzer_engine):
939+
"""Test that a timed-out allow list regex keeps the result (conservative behavior)."""
940+
text = "bing.com is his favorite website"
941+
942+
with patch(
943+
"presidio_analyzer.analyzer_engine.REGEX_TIMEOUT_SECONDS", 0.001
944+
):
945+
with patch(
946+
"presidio_analyzer.analyzer_engine.re.compile"
947+
) as mock_compile:
948+
mock_compiled = mock_compile.return_value
949+
mock_compiled.search.side_effect = TimeoutError("regex timed out")
950+
951+
results = loaded_analyzer_engine.analyze(
952+
text=text,
953+
language="en",
954+
allow_list=["bing"],
955+
allow_list_match="regex",
956+
)
957+
958+
# Result should be kept on timeout (not filtered out)
959+
assert any(r.entity_type == "URL" for r in results)
960+

0 commit comments

Comments
 (0)