apache · lewismc · Feb 26, 2026 · Feb 26, 2026 · Feb 26, 2026 · sebastian-nagel
diff --git a/sonar-project.properties b/sonar-project.properties
@@ -27,7 +27,8 @@ sonar.links.ci=https://github.com/apache/nutch/actions
 sonar.sources=src/java,src/plugin
 sonar.tests=src/test,src/plugin
 sonar.test.inclusions=**/src/test/**/*.java,**/Test*.java,**/*IT.java
-sonar.exclusions=**/build.xml,**/build-ivy.xml,**/build-plugin.xml,**/ivy.xml,**/plugin.xml
+# Exclude build/config files and plugin resource directories (no Java code in conf, data, sample)
+sonar.exclusions=**/build.xml,**/build-ivy.xml,**/build-plugin.xml,**/ivy.xml,**/plugin.xml,**/src/plugin/**/conf/**,**/src/plugin/**/data/**,**/src/plugin/**/sample/**
 sonar.source.encoding=UTF-8
 sonar.java.source=17
 

diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java
@@ -57,6 +57,7 @@
 import java.net.MalformedURLException;
 import java.net.URL;
 import java.util.ArrayList;
+import java.util.Arrays;
 import java.util.List;
 import java.util.Map.Entry;
 
@@ -73,7 +74,23 @@ public class ParseOutputFormat extends OutputFormat<Text, Parse> {
     NUMBER_FORMAT.setMinimumIntegerDigits(5);
     NUMBER_FORMAT.setGroupingUsed(false);
   }
-
+
+  /**
+   * Splits the comma-separated value of db.parsemeta.to.crawldb into keys.
+   * Uses a plain comma split and trim, which avoids ReDoS from regex backtracking.
+   * @param value raw configuration value, null and empty are allowed
+   * @return the trimmed keys with blank entries dropped, never null
+   */
+  static String[] getParseMetaToCrawlDBKeys(String value) {
+    String raw = (value == null) ? "" : value;
+    // limit -1 keeps trailing empty fields; the filter below discards them
+    String[] parts = raw.split(",", -1);
+    return Arrays.stream(parts)
+        .map(String::trim)
+        .filter(key -> !key.isEmpty())
+        .toArray(String[]::new);
+  }
+
   private static class SimpleEntry implements Entry<Text, CrawlDatum> {
     private Text key;
     private CrawlDatum value;
@@ -177,8 +194,8 @@ public RecordWriter<Text, Parse> getRecordWriter(TaskAttemptContext context)
     Path data = new Path(new Path(out, ParseData.DIR_NAME), name);
     Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name);
 
-    final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "")
-        .split(" *, *");
+    final String[] parseMDtoCrawlDB = getParseMetaToCrawlDBKeys(
+        conf.get("db.parsemeta.to.crawldb", ""));
 
     // textOut Options
     final MapFile.Writer textOut;

diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java
@@ -24,8 +24,6 @@
 import java.lang.invoke.MethodHandles;
 import java.util.ArrayList;
 import java.util.Map;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 import java.net.URL;
 import java.net.MalformedURLException;
 import java.nio.charset.StandardCharsets;
@@ -64,15 +62,10 @@ public class HtmlParser implements Parser {
   // NUTCH-2042 (cf. TIKA-357): increased to 8 kB
   private static final int CHUNK_SIZE = 8192;
 
-  // NUTCH-1006 Meta equiv with single quotes not accepted
-  private static Pattern metaPattern = Pattern.compile(
-      "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>",
-      Pattern.CASE_INSENSITIVE);
-  private static Pattern charsetPattern = Pattern.compile(
-      "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE);
-  private static Pattern charsetPatternHTML5 = Pattern.compile(
-      "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>",
-      Pattern.CASE_INSENSITIVE);
+  private static final String META_TAG_START = "<meta"; // start of a meta tag, lower case
+  private static final String CHARSET_EQ = "charset="; // key that precedes the charset value
+  private static final String HTTP_EQUIV = "http-equiv"; // attribute name, lower case
+  private static final String CONTENT_TYPE = "content-type"; // http-equiv value, lower case
 
   private String parserImpl;
 
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser {
    *          <code>byte[]</code> representation of an html file
    */
 
+  /**
+   * Extracts the charset value from a string like "charset=utf-8",
+   * "CHARSET= utf-8" or "charset='utf-8'". The "charset=" key is matched in any
+   * letter case and every occurrence is tried. Uses a linear scan to avoid
+   * ReDoS. The value must start with [a-zA-Z] and contain only [a-zA-Z0-9_-];
+   * returns null if no such value is found at or after fromIndex.
+   */
+  private static String extractCharsetValue(String s, int fromIndex) {
+    final int keyLen = CHARSET_EQ.length();
+    for (int i = Math.max(fromIndex, 0); i + keyLen < s.length(); i++) {
+      // HTML attribute names are case-insensitive, so accept CHARSET=, Charset=, ...
+      if (!s.regionMatches(true, i, CHARSET_EQ, 0, keyLen)) {
+        continue;
+      }
+      int start = i + keyLen;
+      while (start < s.length() && Character.isWhitespace(s.charAt(start))) {
+        start++;
+      }
+      if (start < s.length() && (s.charAt(start) == '"' || s.charAt(start) == '\'')) {
+        start++; // skip the opening quote
+      }
+      int end = start;
+      while (end < s.length()) {
+        char c = s.charAt(end);
+        boolean letter = (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z');
+        boolean other = (c >= '0' && c <= '9') || c == '_' || c == '-';
+        if (!letter && !(other && end > start)) {
+          break; // only letters may start the value
+        }
+        end++;
+      }
+      if (end > start) {
+        return s.substring(start, end);
+      }
+    }
+    return null;
+  }
+
+  /**
+   * Finds the charset declared by the first meta tag (in document order) that
+   * carries one, accepting the HTML4 http-equiv Content-Type form and the HTML5
+   * meta charset form. Linear scans only (no regex). Package-private for tests.
+   */
+  static String extractCharsetFromMeta(String str) {
+    // Locale.ROOT: tag and attribute names are ASCII, independent of the default locale
+    String lower = str.toLowerCase(java.util.Locale.ROOT);
+    int pos = 0;
+    while (true) {
+      int metaStart = lower.indexOf(META_TAG_START, pos);
+      if (metaStart < 0) {
+        return null;
+      }
+      int nameEnd = metaStart + META_TAG_START.length();
+      int tagEnd = str.indexOf('>', nameEnd);
+      if (tagEnd < 0) {
+        return null;
+      }
+      char next = str.charAt(nameEnd);
+      if (next != '>' && next != '/' && !Character.isWhitespace(next)) {
+        pos = nameEnd; // a longer element name such as <metadata>, not a meta tag
+        continue;
+      }
+      pos = tagEnd + 1;
+      String tagContent = str.substring(metaStart, tagEnd);
+      String tagLower = tagContent.toLowerCase(java.util.Locale.ROOT);
+      // HTML4: meta http-equiv=Content-Type ... charset=...; HTML5: <meta charset="utf-8">
+      boolean contentType = tagLower.contains(HTTP_EQUIV) && tagLower.contains(CONTENT_TYPE);
+      if (contentType || tagLower.contains(CHARSET_EQ)) {
+        String charset = extractCharsetValue(tagContent, 0);
+        if (charset != null) {
+          return charset;
+        }
+      }
+    }
+  }
+
   private static String sniffCharacterEncoding(byte[] content) {
     int length = content.length < CHUNK_SIZE ? content.length : CHUNK_SIZE;
 
@@ -102,20 +171,7 @@ private static String sniffCharacterEncoding(byte[] content) {
     // {U+0041, U+0082, U+00B7}.
     String str = new String(content, 0, length, StandardCharsets.US_ASCII);
 
-    Matcher metaMatcher = metaPattern.matcher(str);
-    String encoding = null;
-    if (metaMatcher.find()) {
-      Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1));
-      if (charsetMatcher.find())
-        encoding = charsetMatcher.group(1);
-    }
-    if (encoding == null) {
-      // check for HTML5 meta charset
-      metaMatcher = charsetPatternHTML5.matcher(str);
-      if (metaMatcher.find()) {
-        encoding = metaMatcher.group(1);
-      }
-    }
+    String encoding = extractCharsetFromMeta(str);
     if (encoding == null) {
       // check for BOM
       if (content.length >= 3 && content[0] == (byte) 0xEF

diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java
@@ -33,7 +33,7 @@
 
 import static org.junit.jupiter.api.Assertions.*;
 
-public class TestHtmlParser {
+class TestHtmlParser {
 
   private static final Logger LOG = LoggerFactory
       .getLogger(MethodHandles.lookup().lookupClass());
@@ -105,7 +105,7 @@ protected Parse parse(byte[] contentBytes) {
   }
 
   @Test
-  public void testEncodingDetection() {
+  void testEncodingDetection() {
     for (String[] testPage : encodingTestPages) {
       String name = testPage[0];
       Charset charset = Charset.forName(testPage[1]);
@@ -131,7 +131,7 @@ public void testEncodingDetection() {
   }
 
   @Test
-  public void testResolveBaseUrl() {
+  void testResolveBaseUrl() {
     byte[] contentBytes = resolveBaseUrlTestContent
         .getBytes(StandardCharsets.UTF_8);
     // parse using http://example.com/ as "fetch" URL
@@ -143,4 +143,30 @@ public void testResolveBaseUrl() {
         outlinks[0].getToUrl());
   }
 
+  /** Tests charset extraction from http-equiv Content-Type and HTML5 meta charset tags. */
+  @Test
+  void testExtractCharsetFromMeta() {
+    assertNull(HtmlParser.extractCharsetFromMeta(""));
+    assertNull(HtmlParser.extractCharsetFromMeta("<html><head></head></html>"));
+
+    assertEquals("utf-8", HtmlParser.extractCharsetFromMeta(
+        "<meta http-equiv=\"Content-Type\" content=\"text/html; charset=utf-8\" />"));
+    assertEquals("utf-8", HtmlParser.extractCharsetFromMeta(
+        "<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />"));
+    assertEquals("ISO-8859-1", HtmlParser.extractCharsetFromMeta(
+        "<meta http-equiv=Content-Type content=\"text/html; charset=ISO-8859-1\">"));
+
+    assertEquals("utf-8", HtmlParser.extractCharsetFromMeta(
+        "<meta charset=\"utf-8\">"));
+    assertEquals("utf-8", HtmlParser.extractCharsetFromMeta(
+        "<meta charset='utf-8'>"));
+    assertEquals("utf-8", HtmlParser.extractCharsetFromMeta(
+        "<meta charset=utf-8>"));
+
+    // The meta tag that comes first wins; here the http-equiv one precedes meta charset
+    String both = "<meta http-equiv=\"Content-Type\" content=\"charset=windows-1252\">"
+        + "<meta charset=\"utf-8\">";
+    assertEquals("windows-1252", HtmlParser.extractCharsetFromMeta(both));
+  }
+
 }
diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java
@@ -26,8 +26,6 @@
 import java.util.ArrayList;
 import java.util.Arrays;
 import java.util.List;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
 
 import org.apache.nutch.parse.HTMLMetaTags;
 import org.apache.nutch.parse.HtmlParseFilter;
@@ -188,13 +186,72 @@ public ParseResult getParse(Content c) {
     return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd));
   }
 
-  private static final Pattern STRING_PATTERN = Pattern.compile(
-      "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)",
-      Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
-  // A simple pattern. This allows also invalid URL characters.
-  private static final Pattern URI_PATTERN = Pattern.compile(
-      "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)",
-      Pattern.CASE_INSENSITIVE | Pattern.MULTILINE);
+  /**
+   * Extracts content of quoted strings (single or double) from JavaScript.
+   * A backslash escapes the next character. Runs in linear time to avoid
+   * ReDoS-like blowups: a quote kind found unclosed once is not rescanned.
+   * Package-private for unit testing.
+   * @param plainText JavaScript source text
+   * @return trimmed, non-empty contents of the closed quoted strings, in order
+   */
+  static List<String> extractQuotedStrings(String plainText) {
+    List<String> result = new ArrayList<>();
+    final int len = plainText.length();
+    // Quote characters already known to stay unclosed up to the end of the text.
+    // A failed scan passes over every later quote of the same kind as an escaped
+    // character and proceeds identically from there, so rescanning cannot succeed.
+    String unclosed = "";
+    int i = 0;
+    while (i < len) {
+      char q = plainText.charAt(i++);
+      if ((q != '"' && q != '\'') || unclosed.indexOf(q) >= 0) {
+        continue;
+      }
+      StringBuilder content = new StringBuilder();
+      boolean closed = false;
+      int j = i;
+      while (j < len && !closed) {
+        char c = plainText.charAt(j++);
+        if (c == '\\') {
+          if (j < len) {
+            content.append(plainText.charAt(j++));
+          }
+        } else if (c == q) {
+          closed = true;
+        } else {
+          content.append(c);
+        }
+      }
+      if (!closed) {
+        unclosed += q;
+        continue;
+      }
+      i = j;
+      String s = content.toString().trim();
+      if (!s.isEmpty()) {
+        result.add(s);
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Checks if the string looks like a URI/path: it contains '.' or '/' and,
+   * once trimmed, no whitespace of any kind (blank, tab, line break, ...).
+   * Linear check to avoid ReDoS. Package-private for unit testing.
+   * @param s candidate string, may be null (then false is returned)
+   */
+  static boolean looksLikeUri(String s) {
+    if (s == null) {
+      return false;
+    }
+    String candidate = s.trim();
+    // whitespace remaining after trim is internal (e.g. a line break inside the string)
+    if (candidate.isEmpty() || candidate.chars().anyMatch(Character::isWhitespace)) {
+      return false;
+    }
+    return candidate.indexOf('.') >= 0 || candidate.indexOf('/') >= 0;
+  }
 
   // Alternative pattern, which limits valid url characters.
   // private static final String URI_PATTERN =
@@ -216,14 +273,10 @@ private Outlink[] getJSLinks(String plainText, String anchor, String base) {
 
     try {
 
-      Matcher matcher = STRING_PATTERN.matcher(plainText);
-
-      String url;
+      List<String> quotedStrings = extractQuotedStrings(plainText);
 
-      while (matcher.find()) {
-        url = matcher.group(2);
-        Matcher matcherUri = URI_PATTERN.matcher(url);
-        if (!matcherUri.matches()) {
+      for (String url : quotedStrings) {
+        if (!looksLikeUri(url)) {
           continue;
         }
         if (url.startsWith("www.")) {