-
Notifications
You must be signed in to change notification settings - Fork 1.3k
NUTCH-3161 Address Sonarcloud High and Medium Security Hotspots #904
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -24,8 +24,6 @@ | |
| import java.lang.invoke.MethodHandles; | ||
| import java.util.ArrayList; | ||
| import java.util.Map; | ||
| import java.util.regex.Matcher; | ||
| import java.util.regex.Pattern; | ||
| import java.net.URL; | ||
| import java.net.MalformedURLException; | ||
| import java.nio.charset.StandardCharsets; | ||
|
|
@@ -64,15 +62,10 @@ public class HtmlParser implements Parser { | |
| // NUTCH-2042 (cf. TIKA-357): increased to 8 kB | ||
| private static final int CHUNK_SIZE = 8192; | ||
|
|
||
| // NUTCH-1006 Meta equiv with single quotes not accepted | ||
| private static Pattern metaPattern = Pattern.compile( | ||
| "<meta\\s+([^>]*http-equiv=(\"|')?content-type(\"|')?[^>]*)>", | ||
| Pattern.CASE_INSENSITIVE); | ||
| private static Pattern charsetPattern = Pattern.compile( | ||
| "charset=\\s*([a-z][_\\-0-9a-z]*)", Pattern.CASE_INSENSITIVE); | ||
| private static Pattern charsetPatternHTML5 = Pattern.compile( | ||
| "<meta\\s+charset\\s*=\\s*[\"']?([a-z][_\\-0-9a-z]*)[^>]*>", | ||
| Pattern.CASE_INSENSITIVE); | ||
| private static final String META_TAG_START = "<meta"; | ||
| private static final String CHARSET_EQ = "charset="; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The regex allows white space between "charset" and "=". |
||
| private static final String HTTP_EQUIV = "http-equiv"; | ||
| private static final String CONTENT_TYPE = "content-type"; | ||
|
|
||
| private String parserImpl; | ||
|
|
||
|
|
@@ -93,6 +86,82 @@ public class HtmlParser implements Parser { | |
| * <code>byte[]</code> representation of an html file | ||
| */ | ||
|
|
||
| /** | ||
| * Extracts charset value from a string like "charset=utf-8" or "charset = utf-8". | ||
| * Uses linear scan to avoid ReDoS. Value must start with [a-z] and contain only [a-z0-9_-]. | ||
| */ | ||
| private static String extractCharsetValue(String s, int fromIndex) { | ||
| int idx = s.indexOf(CHARSET_EQ, fromIndex); | ||
| if (idx < 0) { | ||
| return null; | ||
| } | ||
| int start = idx + CHARSET_EQ.length(); | ||
| while (start < s.length() && (s.charAt(start) == ' ' || s.charAt(start) == '\t')) { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
| start++; | ||
| } | ||
| if (start >= s.length()) { | ||
| return null; | ||
| } | ||
| char first = s.charAt(start); | ||
| if (first != '"' && first != '\'' && (first < 'a' || first > 'z') && (first < 'A' || first > 'Z')) { | ||
| return null; | ||
| } | ||
| if (first == '"' || first == '\'') { | ||
| start++; | ||
| } | ||
| int end = start; | ||
| while (end < s.length()) { | ||
| char c = s.charAt(end); | ||
| if (c == ' ' || c == '\t' || c == ';' || c == '"' || c == '\'' || c == '>') { | ||
| break; | ||
| } | ||
| if ((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-') { | ||
| end++; | ||
| } else { | ||
| break; | ||
| } | ||
| } | ||
| return end > start ? s.substring(start, end) : null; | ||
| } | ||
|
|
||
| /** | ||
| * Finds charset from HTML string using linear scans only (no backtracking regex). | ||
| * Checks meta http-equiv Content-Type then HTML5 meta charset. | ||
| * Package-private for unit testing. | ||
| */ | ||
| static String extractCharsetFromMeta(String str) { | ||
| String lower = str.toLowerCase(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should add |
||
| int pos = 0; | ||
| while (true) { | ||
| int metaStart = lower.indexOf(META_TAG_START, pos); | ||
| if (metaStart < 0) { | ||
| break; | ||
| } | ||
| int tagEnd = str.indexOf('>', metaStart); | ||
| if (tagEnd < 0) { | ||
| break; | ||
| } | ||
| String tagContent = str.substring(metaStart, tagEnd); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Assuming that indexes in the lower-case string and the original one are the same might be dangerous, for example, in German there is the pair "ß" <> "SS" (would held for uppercasing the string). See Unicode TR #21 Case Mappings: "Case mappings may produce strings of different length than the original."
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Of course, if we strictly ensure that the method is used on String only including ASCII characters and using always the root locale, this should be safe. |
||
| String tagLower = tagContent.toLowerCase(); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Same: must use |
||
| // HTML4: meta http-equiv=Content-Type ... charset=... | ||
| if (tagLower.contains(HTTP_EQUIV) && tagLower.contains(CONTENT_TYPE)) { | ||
| String charset = extractCharsetValue(tagContent, 0); | ||
| if (charset != null) { | ||
| return charset; | ||
| } | ||
| } | ||
| // HTML5: <meta charset="utf-8"> | ||
| if (tagLower.contains(CHARSET_EQ)) { | ||
| String charset = extractCharsetValue(tagContent, 0); | ||
| if (charset != null) { | ||
| return charset; | ||
| } | ||
| } | ||
| pos = tagEnd + 1; | ||
| } | ||
| return null; | ||
| } | ||
|
|
||
| private static String sniffCharacterEncoding(byte[] content) { | ||
| int length = content.length < CHUNK_SIZE ? content.length : CHUNK_SIZE; | ||
|
|
||
|
|
@@ -102,20 +171,7 @@ private static String sniffCharacterEncoding(byte[] content) { | |
| // {U+0041, U+0082, U+00B7}. | ||
| String str = new String(content, 0, length, StandardCharsets.US_ASCII); | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The content bytes are converted into a String assuming ASCII encoding. If a "hand-crafted" scanner is used, could directly operate on the bytes avoiding the conversion to a string. Alternatively, could use a CharSequence wrapping the bytes, cf. ByteArrayCharSequence. But this can be an improvement for later. |
||
|
|
||
| Matcher metaMatcher = metaPattern.matcher(str); | ||
| String encoding = null; | ||
| if (metaMatcher.find()) { | ||
| Matcher charsetMatcher = charsetPattern.matcher(metaMatcher.group(1)); | ||
| if (charsetMatcher.find()) | ||
| encoding = charsetMatcher.group(1); | ||
| } | ||
| if (encoding == null) { | ||
| // check for HTML5 meta charset | ||
| metaMatcher = charsetPatternHTML5.matcher(str); | ||
| if (metaMatcher.find()) { | ||
| encoding = metaMatcher.group(1); | ||
| } | ||
| } | ||
| String encoding = extractCharsetFromMeta(str); | ||
| if (encoding == null) { | ||
| // check for BOM | ||
| if (content.length >= 3 && content[0] == (byte) 0xEF | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
It's a string from the configuration, it's controlled by the user and should be short. It's parsed once, so this is for sure not critical.
Apart from that should simply rely on getTrimmedStrings.