diff --git a/sonar-project.properties b/sonar-project.properties index c8d7c80249..0822294f0f 100644 --- a/sonar-project.properties +++ b/sonar-project.properties @@ -27,7 +27,8 @@ sonar.links.ci=https://github.com/apache/nutch/actions sonar.sources=src/java,src/plugin sonar.tests=src/test,src/plugin sonar.test.inclusions=**/src/test/**/*.java,**/Test*.java,**/*IT.java -sonar.exclusions=**/build.xml,**/build-ivy.xml,**/build-plugin.xml,**/ivy.xml,**/plugin.xml +# Exclude build/config files and plugin resource directories (no Java code in conf, data, sample) +sonar.exclusions=**/build.xml,**/build-ivy.xml,**/build-plugin.xml,**/ivy.xml,**/plugin.xml,**/src/plugin/**/conf/**,**/src/plugin/**/data/**,**/src/plugin/**/sample/** sonar.source.encoding=UTF-8 sonar.java.source=17 diff --git a/src/java/org/apache/nutch/parse/ParseOutputFormat.java b/src/java/org/apache/nutch/parse/ParseOutputFormat.java index 295c5e853d..4774d3e20b 100644 --- a/src/java/org/apache/nutch/parse/ParseOutputFormat.java +++ b/src/java/org/apache/nutch/parse/ParseOutputFormat.java @@ -57,6 +57,7 @@ import java.net.MalformedURLException; import java.net.URL; import java.util.ArrayList; +import java.util.Arrays; import java.util.List; import java.util.Map.Entry; @@ -73,7 +74,23 @@ public class ParseOutputFormat extends OutputFormat { NUMBER_FORMAT.setMinimumIntegerDigits(5); NUMBER_FORMAT.setGroupingUsed(false); } - + + /** + * Parses the comma-separated db.parsemeta.to.crawldb config value. + * Uses comma-split and trim to avoid ReDoS from regex backtracking. + * @param value config value (may be null or empty) + * @return array of trimmed, non-empty metadata keys (never null) + */ + static String[] getParseMetaToCrawlDBKeys(String value) { + if (value == null || value.isEmpty()) { + return new String[0]; + } + return Arrays.stream(value.split(",", -1)) + .map(String::trim) + .filter(s -> !s.isEmpty()) + .toArray(String[]::new); + } + private static class SimpleEntry implements Entry { private Text key; private CrawlDatum value; @@ -177,8 +194,8 @@ public RecordWriter getRecordWriter(TaskAttemptContext context) Path data = new Path(new Path(out, ParseData.DIR_NAME), name); Path crawl = new Path(new Path(out, CrawlDatum.PARSE_DIR_NAME), name); - final String[] parseMDtoCrawlDB = conf.get("db.parsemeta.to.crawldb", "") - .split(" *, *"); + final String[] parseMDtoCrawlDB = getParseMetaToCrawlDBKeys( + conf.get("db.parsemeta.to.crawldb", "")); // textOut Options final MapFile.Writer textOut; diff --git a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java index 73d42d0dc6..428eb0b709 100644 --- a/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java +++ b/src/plugin/parse-html/src/java/org/apache/nutch/parse/html/HtmlParser.java @@ -24,8 +24,6 @@ import java.lang.invoke.MethodHandles; import java.util.ArrayList; import java.util.Map; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import java.net.URL; import java.net.MalformedURLException; import java.nio.charset.StandardCharsets; @@ -64,15 +62,10 @@ public class HtmlParser implements Parser { // NUTCH-2042 (cf. TIKA-357): increased to 8 kB private static final int CHUNK_SIZE = 8192; - // NUTCH-1006 Meta equiv with single quotes not accepted - private static Pattern metaPattern = Pattern.compile( - "]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", - Pattern.CASE_INSENSITIVE); - private static Pattern charsetPattern = Pattern.compile( - "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); - private static Pattern charsetPatternHTML5 = Pattern.compile( - "]*>", - Pattern.CASE_INSENSITIVE); + private static final String META_TAG_START = "byte[] representation of an html file */ + /** + * Extracts charset value from a string like "charset=utf-8" or "charset = utf-8". + * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain only [a-z0-9_-]. + */ + private static String extractCharsetValue(String s, int fromIndex) { + int idx = s.indexOf(CHARSET_EQ, fromIndex); + if (idx < 0) { + return null; + } + int start = idx + CHARSET_EQ.length(); + while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == '\t')) { + start++; + } + if (start >= s.length()) { + return null; + } + char first = s.charAt(start); + if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && (first < 'A' || first > 'Z')) { + return null; + } + if (first == '"' || first == '\'') { + start++; + } + int end = start; + while (end < s.length()) { + char c = s.charAt(end); + if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == '>') { + break; + } + if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-') { + end++; + } else { + break; + } + } + return end > start ? s.substring(start, end) : null; + } + + /** + * Finds charset from HTML string using linear scans only (no backtracking regex). + * Checks meta http-equiv Content-Type then HTML5 meta charset. + * Package-private for unit testing. + */ + static String extractCharsetFromMeta(String str) { + String lower = str.toLowerCase(); + int pos = 0; + while (true) { + int metaStart = lower.indexOf(META_TAG_START, pos); + if (metaStart < 0) { + break; + } + int tagEnd = str.indexOf('>', metaStart); + if (tagEnd < 0) { + break; + } + String tagContent = str.substring(metaStart, tagEnd); + String tagLower = tagContent.toLowerCase(); + // HTML4: meta http-equiv=Content-Type ... charset=... + if (tagLower.contains(HTTP_EQUIV) && tagLower.contains(CONTENT_TYPE)) { + String charset = extractCharsetValue(tagContent, 0); + if (charset != null) { + return charset; + } + } + // HTML5: + if (tagLower.contains(CHARSET_EQ)) { + String charset = extractCharsetValue(tagContent, 0); + if (charset != null) { + return charset; + } + } + pos = tagEnd + 1; + } + return null; + } + private static String sniffCharacterEncoding(byte[] content) { int length = content.length < CHUNK_SIZE ? content.length : CHUNK_SIZE; @@ -102,20 +171,7 @@ private static String sniffCharacterEncoding(byte[] content) { // {U+0041, U+0082, U+00B7}. String str = new String(content, 0, length, StandardCharsets.US_ASCII); - Matcher metaMatcher = metaPattern.matcher(str); - String encoding = null; - if (metaMatcher.find()) { - Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1)); - if (charsetMatcher.find()) - encoding = charsetMatcher.group(1); - } - if (encoding == null) { - // check for HTML5 meta charset - metaMatcher = charsetPatternHTML5.matcher(str); - if (metaMatcher.find()) { - encoding = metaMatcher.group(1); - } - } + String encoding = extractCharsetFromMeta(str); if (encoding == null) { // check for BOM if (content.length >= 3 && content[0] == (byte) 0xEF diff --git a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java index 47beff5a0f..5b66870250 100644 --- a/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java +++ b/src/plugin/parse-html/src/test/org/apache/nutch/parse/html/TestHtmlParser.java @@ -33,7 +33,7 @@ import static org.junit.jupiter.api.Assertions.*; -public class TestHtmlParser { +class TestHtmlParser { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); @@ -105,7 +105,7 @@ protected Parse parse(byte[] contentBytes) { } @Test - public void testEncodingDetection() { + void testEncodingDetection() { for (String[] testPage : encodingTestPages) { String name = testPage[0]; Charset charset = Charset.forName(testPage[1]); @@ -131,7 +131,7 @@ public void testEncodingDetection() { } @Test - public void testResolveBaseUrl() { + void testResolveBaseUrl() { byte[] contentBytes = resolveBaseUrlTestContent .getBytes(StandardCharsets.UTF_8); // parse using http://example.com/ as "fetch" URL @@ -143,4 +143,30 @@ public void testResolveBaseUrl() { outlinks[0].getToUrl()); } + /** Tests charset extraction from meta tags (ReDoS-safe parsing). */ + @Test + void testExtractCharsetFromMeta() { + assertNull(HtmlParser.extractCharsetFromMeta("")); + assertNull(HtmlParser.extractCharsetFromMeta("")); + + assertEquals("utf-8", HtmlParser.extractCharsetFromMeta( + "")); + assertEquals("utf-8", HtmlParser.extractCharsetFromMeta( + "")); + assertEquals("ISO-8859-1", HtmlParser.extractCharsetFromMeta( + "")); + + assertEquals("utf-8", HtmlParser.extractCharsetFromMeta( + "")); + assertEquals("utf-8", HtmlParser.extractCharsetFromMeta( + "")); + assertEquals("utf-8", HtmlParser.extractCharsetFromMeta( + "")); + + // First content-type meta wins when both appear + String both = "" + + ""; + assertEquals("windows-1252", HtmlParser.extractCharsetFromMeta(both)); + } + } diff --git a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java index 194ef915e8..664d0c30e7 100644 --- a/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java +++ b/src/plugin/parse-js/src/java/org/apache/nutch/parse/js/JSParseFilter.java @@ -26,8 +26,6 @@ import java.util.ArrayList; import java.util.Arrays; import java.util.List; -import java.util.regex.Matcher; -import java.util.regex.Pattern; import org.apache.nutch.parse.HTMLMetaTags; import org.apache.nutch.parse.HtmlParseFilter; @@ -188,13 +186,72 @@ public ParseResult getParse(Content c) { return ParseResult.createParseResult(c.getUrl(), new ParseImpl(script, pd)); } - private static final Pattern STRING_PATTERN = Pattern.compile( - "(\\\\*(?:\"|\'))([^\\s\"\']+?)(?:\\1)", - Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); - // A simple pattern. This allows also invalid URL characters. - private static final Pattern URI_PATTERN = Pattern.compile( - "(^|\\s*?)/?\\S+?[/\\.]\\S+($|\\s*)", - Pattern.CASE_INSENSITIVE | Pattern.MULTILINE); + /** + * Extracts content of quoted strings (single or double) from JavaScript. + * Uses linear scan to avoid ReDoS. Backslash escapes the next character. + * Package-private for unit testing. + */ + static List extractQuotedStrings(String plainText) { + List result = new ArrayList<>(); + int i = 0; + while (i < plainText.length()) { + char q = 0; + int start = -1; + if (plainText.charAt(i) == '"' || plainText.charAt(i) == '\'') { + q = plainText.charAt(i); + start = i + 1; + } + if (start > 0) { + StringBuilder content = new StringBuilder(); + int j = start; + while (j < plainText.length()) { + char c = plainText.charAt(j); + if (c == '\\') { + j++; + if (j < plainText.length()) { + content.append(plainText.charAt(j)); + j++; + } + continue; + } + if (c == q) { + String s = content.toString().trim(); + if (s.length() > 0) { + result.add(s); + } + i = j + 1; + break; + } + content.append(c); + j++; + } + if (j >= plainText.length()) { + i++; + } + } else { + i++; + } + } + return result; + } + + /** + * Checks if the string looks like a URI/path (contains . or /, no internal whitespace). + * Linear check to avoid ReDoS. Package-private for unit testing. + */ + static boolean looksLikeUri(String s) { + if (s == null) { + return false; + } + s = s.trim(); + if (s.isEmpty()) { + return false; + } + if (s.indexOf(' ') >= 0 || s.indexOf('\t') >= 0) { + return false; + } + return s.contains(".") || s.contains("/"); + } // Alternative pattern, which limits valid url characters. // private static final String URI_PATTERN = @@ -216,14 +273,10 @@ private Outlink[] getJSLinks(String plainText, String anchor, String base) { try { - Matcher matcher = STRING_PATTERN.matcher(plainText); - - String url; + List quotedStrings = extractQuotedStrings(plainText); - while (matcher.find()) { - url = matcher.group(2); - Matcher matcherUri = URI_PATTERN.matcher(url); - if (!matcherUri.matches()) { + for (String url : quotedStrings) { + if (!looksLikeUri(url)) { continue; } if (url.startsWith("www.")) { diff --git a/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java b/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java index fb22a438e1..1a1cbb6a50 100644 --- a/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java +++ b/src/plugin/parse-js/src/test/org/apache/nutch/parse/js/TestJSParseFilter.java @@ -17,11 +17,13 @@ package org.apache.nutch.parse.js; import static org.junit.jupiter.api.Assertions.assertEquals; +import static org.junit.jupiter.api.Assertions.assertFalse; import static org.junit.jupiter.api.Assertions.assertTrue; import java.io.File; import java.io.IOException; import java.lang.invoke.MethodHandles; +import java.util.List; import java.util.Set; import java.util.TreeSet; @@ -51,7 +53,7 @@ * temporarily disabled) * */ -public class TestJSParseFilter { +class TestJSParseFilter { private static final Logger LOG = LoggerFactory .getLogger(MethodHandles.lookup().lookupClass()); @@ -67,7 +69,7 @@ public class TestJSParseFilter { private Configuration conf; @BeforeEach - public void setUp() { + void setUp() { conf = NutchConfiguration.create(); conf.set("file.content.limit", "-1"); conf.set("plugin.includes", "protocol-file|parse-(html|js)"); @@ -88,8 +90,43 @@ public Outlink[] getOutlinks(String sampleFile) return parse.getData().getOutlinks(); } + /** Tests quoted string extraction (ReDoS-safe, no regex backtracking). */ @Test - public void testJavaScriptOutlinkExtraction() + void testExtractQuotedStrings() { + List empty = JSParseFilter.extractQuotedStrings("no quotes here"); + assertTrue(empty.isEmpty()); + + List one = JSParseFilter.extractQuotedStrings("var x = \"http://example.com/\""); + assertEquals(1, one.size()); + assertEquals("http://example.com/", one.get(0)); + + List two = JSParseFilter.extractQuotedStrings("a=\"foo\" b='bar'"); + assertEquals(2, two.size()); + assertEquals("foo", two.get(0)); + assertEquals("bar", two.get(1)); + + List escaped = JSParseFilter.extractQuotedStrings("\"say \\\"hi\\\"\""); + assertEquals(1, escaped.size()); + assertEquals("say \"hi\"", escaped.get(0)); + } + + /** Tests URI shape check (ReDoS-safe). */ + @Test + void testLooksLikeUri() { + assertFalse(JSParseFilter.looksLikeUri(null)); + assertFalse(JSParseFilter.looksLikeUri("")); + assertFalse(JSParseFilter.looksLikeUri(" ")); + assertFalse(JSParseFilter.looksLikeUri("no-dot-or-slash")); + assertFalse(JSParseFilter.looksLikeUri("has space in it.com")); + + assertTrue(JSParseFilter.looksLikeUri("http://example.com/")); + assertTrue(JSParseFilter.looksLikeUri("example.com/path")); + assertTrue(JSParseFilter.looksLikeUri("/relative/path")); + assertTrue(JSParseFilter.looksLikeUri(" https://foo.bar ")); + } + + @Test + void testJavaScriptOutlinkExtraction() throws ProtocolException, ParseException, IOException { String[] filenames = new File(sampleDir).list(); for (int i = 0; i < filenames.length; i++) { diff --git a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java index 14fed8a772..6ec4568de7 100644 --- a/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java +++ b/src/plugin/urlfilter-validator/src/java/org/apache/nutch/urlfilter/validator/UrlValidator.java @@ -16,6 +16,8 @@ */ package org.apache.nutch.urlfilter.validator; +import java.net.URI; +import java.net.URISyntaxException; import java.util.regex.Matcher; import java.util.regex.Pattern; @@ -57,57 +59,24 @@ public class UrlValidator implements URLFilter { private static final String ALPHA_CHARS = "a-zA-Z"; - private static final String ALPHA_NUMERIC_CHARS = ALPHA_CHARS + "\\d"; - private static final String SPECIAL_CHARS = ";/@&=,.?:+$"; private static final String VALID_CHARS = "[^\\s" + SPECIAL_CHARS + "]"; private static final String SCHEME_CHARS = ALPHA_CHARS; - // Drop numeric, and "+-." for now - private static final String AUTHORITY_CHARS = ALPHA_NUMERIC_CHARS + "\\-\\."; - private static final String ATOM = VALID_CHARS + '+'; - /** - * This expression derived/taken from the BNF for URI (RFC2396). - */ - private static final Pattern URL_PATTERN = Pattern - .compile("^(([^:/?#]+):)?(//([^/?#]*))?([^?#]*)" - + "(\\?([^#]*))?(#(.*))?"); - - /** - * Schema/Protocol (ie. http:, ftp:, file:, etc). - */ - private static final int PARSE_URL_SCHEME = 2; - - /** - * Includes hostname/ip and port number. - */ - private static final int PARSE_URL_AUTHORITY = 4; - - private static final int PARSE_URL_PATH = 5; - - private static final int PARSE_URL_QUERY = 7; - /** * Protocol (ie. http:, ftp:,https:). */ private static final Pattern SCHEME_PATTERN = Pattern.compile("^[" + SCHEME_CHARS + "]+"); - private static final Pattern AUTHORITY_PATTERN = Pattern.compile("^([" - + AUTHORITY_CHARS + "]*)(:\\d*)?(.*)?"); - - private static final int PARSE_AUTHORITY_HOST_IP = 1; - - private static final int PARSE_AUTHORITY_PORT = 2; - - /** - * Should always be empty. - */ - private static final int PARSE_AUTHORITY_EXTRA = 3; + /** Index for host/IP in parseAuthority result. */ + private static final int PARSE_AUTHORITY_HOST_IP = 0; + /** Index for port string (e.g. ":80") in parseAuthority result, or null. */ + private static final int PARSE_AUTHORITY_PORT = 1; private static final Pattern PATH_PATTERN = Pattern .compile("^(/[-\\w:@&?=+,.!/~*'%$_;\\(\\)]*)?$"); @@ -157,38 +126,72 @@ public void setConf(Configuration conf) { * value is considered invalid. * @return true if the url is valid. */ + /** + * Parse authority "host" or "host:port" using linear scan (avoids ReDoS). + * @return String[2]: { hostOrIp, portOrNull } where port is e.g. ":80" or null + */ + /** Package-private for unit testing. */ + static String[] parseAuthority(String authority) { + if (authority == null || authority.isEmpty()) { + return new String[] { "", null }; + } + int lastColon = authority.lastIndexOf(':'); + if (lastColon < 0) { + return new String[] { authority, null }; + } + String portPart = authority.substring(lastColon + 1); + boolean allDigits = true; + for (int i = 0; i < portPart.length(); i++) { + if (!Character.isDigit(portPart.charAt(i))) { + allDigits = false; + break; + } + } + if (allDigits && !portPart.isEmpty()) { + return new String[] { authority.substring(0, lastColon), ":" + portPart }; + } + return new String[] { authority, null }; + } + private boolean isValid(String value) { if (value == null) { return false; } - Matcher matchUrlPat = URL_PATTERN.matcher(value); if (!LEGAL_ASCII_PATTERN.matcher(value).matches()) { return false; } - // Check the whole url address structure - if (!matchUrlPat.matches()) { - return false; - } - - if (!isValidScheme(matchUrlPat.group(PARSE_URL_SCHEME))) { + String scheme; + String authority; + String path; + String query; + try { + URI uri = new URI(value); + scheme = uri.getScheme(); + authority = uri.getRawAuthority(); + path = uri.getPath(); + query = uri.getRawQuery(); + if (path == null) { + path = ""; + } + } catch (URISyntaxException e) { return false; } - if (!isValidAuthority(matchUrlPat.group(PARSE_URL_AUTHORITY))) { + if (!isValidScheme(scheme)) { return false; } - if (!isValidPath(matchUrlPat.group(PARSE_URL_PATH))) { + if (!isValidAuthority(authority)) { return false; } - if (!isValidQuery(matchUrlPat.group(PARSE_URL_QUERY))) { + if (!isValidPath(path)) { return false; } - return true; + return isValidQuery(query); } /** @@ -223,15 +226,13 @@ private boolean isValidAuthority(String authority) { return false; } - Matcher authorityMatcher = AUTHORITY_PATTERN.matcher(authority); - if (!authorityMatcher.matches()) { - return false; - } + String[] parsed = parseAuthority(authority); + String hostIP = parsed[PARSE_AUTHORITY_HOST_IP]; + String port = parsed[PARSE_AUTHORITY_PORT]; boolean ipV4Address = false; boolean hostname = false; // check if authority is IP address or hostname - String hostIP = authorityMatcher.group(PARSE_AUTHORITY_HOST_IP); Matcher matchIPV4Pat = IP_V4_DOMAIN_PATTERN.matcher(hostIP); ipV4Address = matchIPV4Pat.matches(); @@ -299,29 +300,13 @@ private boolean isValidAuthority(String authority) { return false; } - String port = authorityMatcher.group(PARSE_AUTHORITY_PORT); if (port != null) { if (!PORT_PATTERN.matcher(port).matches()) { return false; } } - String extra = authorityMatcher.group(PARSE_AUTHORITY_EXTRA); - return isBlankOrNull(extra); - } - - /** - *

- * Checks if the field isn't null and length of the field is greater than zero - * not including whitespace. - *

- * - * @param value - * The value validation is being performed on. - * @return true if blank or null. - */ - private boolean isBlankOrNull(String value) { - return ((value == null) || (value.trim().length() == 0)); + return true; } /** diff --git a/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java b/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java index d815486de6..6f1760bbcb 100644 --- a/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java +++ b/src/plugin/urlfilter-validator/src/test/org/apache/nutch/urlfilter/validator/TestUrlValidator.java @@ -18,6 +18,7 @@ import org.junit.jupiter.api.Test; +import static org.junit.jupiter.api.Assertions.assertArrayEquals; import static org.junit.jupiter.api.Assertions.assertNotNull; import static org.junit.jupiter.api.Assertions.assertNull; @@ -30,7 +31,7 @@ * */ -public class TestUrlValidator { +class TestUrlValidator { /** * Test method for @@ -38,7 +39,7 @@ public class TestUrlValidator { * . */ @Test - public void testFilter() { + void testFilter() { UrlValidator url_validator = new UrlValidator(); assertNotNull(url_validator); @@ -73,4 +74,22 @@ public void testFilter() { "Valid url: ftp://alfa.bravo.pi/foo/bar/plan.pdf"); } + + /** Tests authority parsing (ReDoS-safe, no regex backtracking). */ + @Test + void testParseAuthority() { + assertArrayEquals(new String[] { "", null }, UrlValidator.parseAuthority(null)); + assertArrayEquals(new String[] { "", null }, UrlValidator.parseAuthority("")); + + assertArrayEquals(new String[] { "example.com", null }, + UrlValidator.parseAuthority("example.com")); + assertArrayEquals(new String[] { "example.com", ":80" }, + UrlValidator.parseAuthority("example.com:80")); + assertArrayEquals(new String[] { "192.168.1.1", ":8080" }, + UrlValidator.parseAuthority("192.168.1.1:8080")); + + // Port part non-numeric: entire string is host + assertArrayEquals(new String[] { "host:port", null }, + UrlValidator.parseAuthority("host:port")); + } } diff --git a/src/test/org/apache/nutch/parse/TestParseOutputFormat.java b/src/test/org/apache/nutch/parse/TestParseOutputFormat.java new file mode 100644 index 0000000000..6b5cf1a744 --- /dev/null +++ b/src/test/org/apache/nutch/parse/TestParseOutputFormat.java @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.nutch.parse; + +import org.junit.jupiter.api.Test; + +import static org.junit.jupiter.api.Assertions.assertArrayEquals; +import static org.junit.jupiter.api.Assertions.assertNotNull; + +/** Unit tests for ParseOutputFormat. */ +class TestParseOutputFormat { + + @Test + void testGetParseMetaToCrawlDBKeysEmpty() { + assertArrayEquals(new String[0], ParseOutputFormat.getParseMetaToCrawlDBKeys("")); + assertArrayEquals(new String[0], ParseOutputFormat.getParseMetaToCrawlDBKeys(null)); + } + + @Test + void testGetParseMetaToCrawlDBKeysSingle() { + assertArrayEquals(new String[] { "lang" }, + ParseOutputFormat.getParseMetaToCrawlDBKeys("lang")); + assertArrayEquals(new String[] { "lang" }, + ParseOutputFormat.getParseMetaToCrawlDBKeys(" lang ")); + } + + @Test + void testGetParseMetaToCrawlDBKeysCommaSeparated() { + assertArrayEquals(new String[] { "a", "b" }, + ParseOutputFormat.getParseMetaToCrawlDBKeys("a,b")); + assertArrayEquals(new String[] { "a", "b", "c" }, + ParseOutputFormat.getParseMetaToCrawlDBKeys("a,b,c")); + } + + @Test + void testGetParseMetaToCrawlDBKeysTrimSpacesAroundCommas() { + assertArrayEquals(new String[] { "a", "b" }, + ParseOutputFormat.getParseMetaToCrawlDBKeys(" a , b ")); + assertArrayEquals(new String[] { "lang", "Content-Type" }, + ParseOutputFormat.getParseMetaToCrawlDBKeys(" lang , Content-Type ")); + } + + @Test + void testGetParseMetaToCrawlDBKeysEmptySegmentsFiltered() { + assertArrayEquals(new String[] { "a", "b" }, + ParseOutputFormat.getParseMetaToCrawlDBKeys("a,,b")); + assertArrayEquals(new String[] { "a" }, + ParseOutputFormat.getParseMetaToCrawlDBKeys("a,,,")); + assertArrayEquals(new String[0], + ParseOutputFormat.getParseMetaToCrawlDBKeys(", , ,")); + } + + @Test + void testGetParseMetaToCrawlDBKeysNeverNull() { + assertNotNull(ParseOutputFormat.getParseMetaToCrawlDBKeys(null)); + assertNotNull(ParseOutputFormat.getParseMetaToCrawlDBKeys("")); + } +}