Skip to content
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ object DateTimeParserConfig
"ga" -> Map("eanáir"->1,"feabhra"->2,"marta"->3,"aibreán"->4,"bealtaine"->5,"meitheamh"->6,"iúil"->7,"lúnasa"->8,"meán fómhair"->9,"deireadh fómhair"->10,"samhain"->11,"nollaig"->12),
"gl" -> Map("xaneiro"->1,"febreiro"->2,"marzo"->3,"abril"->4,"maio"->5,"xuño"->6,"xullo"->7,"agosto"->8,"setembro"->9,"outubro"->10,"novembro"->11,"decembro"->12,
"xan"->1,"feb"->2,"mar"->3,"abr"->4,"mai"->5,"xuñ"->6,"xul"->7,"ago"->8,"set"->9,"out"->10,"nov"->11,"dec"->12),
"hi" -> Map("जनवरी"->1, "फरवरी"->2, "मार्च"->3, "अप्रैल"->4, "मई"->5, "जून"->6, "जुलाई"->7, "अगस्त"->8, "सितम्बर"->9, "अक्टूबर"->10, "नवंबर"->11, "दिसंबर"->12),
"hr" -> Map("siječanj"->1,"veljača"->2,"ožujak"->3,"travanj"->4,"svibanj"->5,"lipanj"->6,"srpanj"->7,"kolovoz"->8,"rujan"->9,"listopad"->10,"studeni"->11,"prosinac"->12),
"id" -> Map("januari"->1,"februari"->2,"maret"->3,"april"->4,"mei"->5,"juni"->6,"juli"->7,"agustus"->8,"september"->9,"oktober"->10,"november"->11,"desember"->12),
"it" -> Map("gennaio"->1,"febbraio"->2,"marzo"->3,"aprile"->4,"maggio"->5,"giugno"->6,"luglio"->7,"agosto"->8,"settembre"->9,"ottobre"->10,"novembre"->11,"dicembre"->12),
Expand Down Expand Up @@ -62,6 +63,7 @@ object DateTimeParserConfig
"fr" -> Map("av\\. J\\.-C\\."-> -1, "ap\\. J\\.-C\\." -> 1),
"ga" -> Map("B\\.C\\." -> -1, "R\\.C\\." -> -1, "r\\. Chr\\." -> -1, "BC" -> -1, "RC" -> -1, "A\\.D\\." -> 1, "AD" -> 1, "I\\.C\\." -> 1, "IC" -> 1),
"gl" -> Map("AC"-> -1, "A\\.C\\."-> -1, "DC"-> 1, "D\\.C\\."-> 1, "aC"-> -1, "a\\.C\\."-> -1, "dC"-> 1, "d\\.C\\."-> 1, "AEC"-> -1, "A\\.E\\.C\\."-> -1 , "EC"-> 1, "E\\.C\\."-> 1),
"hi" -> Map("ई\\.पू\\." -> -1, "BC" -> -1, "ई॰" -> 1, "CE"-> 1, "AD"-> 1, "AC"-> -1),
"it" -> Map("AC"-> -1, "A\\.C\\."-> -1, "DC"-> 1, "D\\.C\\."-> 1, "AD"-> 1, "A\\.D\\."-> 1, "PEV"-> -1, "P\\.E\\.V\\."-> -1, "EV"-> 1, "E\\.V\\." -> 1),
"ja" -> Map("前"-> -1, "B\\.C\\."-> -1, "西暦"-> 1),
"nl" -> Map("v\\.Chr\\." -> -1, "n\\.C\\."-> 1, "v\\.C\\." -> -1, "n\\.Chr\\."-> 1, "voor Chr\\." -> -1, "na Chr\\."-> 1),
Expand All @@ -81,6 +83,7 @@ object DateTimeParserConfig
"fr" -> "er|nd|ème",
"ga" -> "ú",
"gl" -> "°|\\.°|°\\.|ª|\\.ª|ª\\.|º|\\.º|º\\.",
"hi" -> "वॉ",
"it" -> "°|\\.°|°\\.",
"nl" -> "ste|de|e",
"pt" -> "°|\\.°|°\\.",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -379,6 +379,19 @@ object DurationParserConfig
"año" -> "year",
"años" -> "year"
),
"hi" -> Map(
"सेकेंड" -> "second",
"सेकेण्ड" -> "second",
"s" -> "second",
"\"" -> "second",
"मिनट" -> "minute",
"\\’" -> "minute",
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

🧩 Analysis chain

🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "Inspect regex construction:"
sed -n '19,28p' core/src/main/scala/org/dbpedia/extraction/dataparser/DurationParser.scala

echo
echo "Inspect unit cleanup + map lookup:"
sed -n '129,137p' core/src/main/scala/org/dbpedia/extraction/dataparser/DurationParser.scala

echo
echo "Inspect upstream string normalization:"
sed -n '46,53p' core/src/main/scala/org/dbpedia/extraction/dataparser/StringParser.scala

Repository: dbpedia/extraction-framework

Length of output: 1712


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Line 388 context with surrounding lines and hex dump ==="
sed -n '385,391p' core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala | cat -A

echo
echo "=== Hex dump of the key at line 388 ==="
sed -n '388p' core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala | od -c

echo
echo "=== Check if this is part of a map initialization ==="
sed -n '380,395p' core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala

echo
echo "=== Verify the regex character class behavior ==="
echo 'Testing character class [^\'"'"'"a-zA-Z]'
python3 << 'EOF'
import re
# Test the regex pattern used in cleanup
pattern = r"[^'\"a-zA-Z]"
test_cases = [
    ("'", "ASCII apostrophe"),
    ("\u2019", "U+2019 right single quotation mark"),
    ("\\", "backslash"),
    ("a", "letter a"),
    ('"', "ASCII quote"),
]
for char, desc in test_cases:
    removed = re.sub(pattern, "", char)
    print(f"{desc:40} | Input: {repr(char):10} | After cleanup: {repr(removed):10} | Removed: {removed == ''}")
EOF

Repository: dbpedia/extraction-framework

Length of output: 587


🏁 Script executed:

#!/bin/bash
set -euo pipefail

echo "=== Full context around line 388 with line numbers ==="
sed -n '380,395p' core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala | nl -v 380

echo
echo "=== Hex/bytes of line 388 using Python ==="
python3 << 'EOF'
with open('core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    line_388 = lines[387]  # 0-indexed
    print(f"Line 388 content: {repr(line_388)}")
    print(f"Bytes (hex): {line_388.encode('utf-8').hex()}")
    
    # Extract the key part
    import re
    match = re.search(r'"([^"]*)"', line_388)
    if match:
        key = match.group(1)
        print(f"Key: {repr(key)}")
        print(f"Key bytes (hex): {key.encode('utf-8').hex()}")
        for i, char in enumerate(key):
            print(f"  [{i}] {repr(char)} (U+{ord(char):04X})")
EOF

echo
echo "=== Check surrounding lines for similar patterns ==="
python3 << 'EOF'
with open('core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala', 'r', encoding='utf-8') as f:
    lines = f.readlines()
    for i in range(380, 400):
        line = lines[i] if i < len(lines) else ""
        if line.strip():
            match_obj = __import__('re').search(r'"([^"]*)"', line)
            if match_obj:
                key = match_obj.group(1)
                print(f"Line {i+1:3d}: key={repr(key):20s} value_start={lines[i].split('->')[1][:20].strip() if '->' in lines[i] else 'N/A'}")
EOF

Repository: dbpedia/extraction-framework

Length of output: 186


🏁 Script executed:

#!/bin/bash
python3 << 'EOFPYTHON'
import re

with open('core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala', 'r', encoding='utf-8') as f:
    lines = f.readlines()

print("=== Lines 380-395 with key analysis ===")
for i in range(379, min(395, len(lines))):
    line = lines[i]
    line_num = i + 1
    print(f"Line {line_num}: {line.rstrip()}")
    
    # Extract key from quoted string
    match = re.search(r'"([^"]*)"', line)
    if match:
        key = match.group(1)
        key_hex = key.encode('utf-8').hex()
        print(f"  Key: {repr(key)} | Hex: {key_hex}")
        for j, char in enumerate(key):
            print(f"    [{j}] = {repr(char):8s} (U+{ord(char):04X})")

print("\n=== Analyzing the cleanup regex behavior ===")
pattern = r"[^'\"a-zA-Z]"

# Test cases from the actual keys found
test_keys = [
    "\\M-bM-^@M-^Y",  # How it appeared in cat -A
    "\\'",            # As shown in review snippet  
    "'",              # ASCII apostrophe
]

# Actually read what's in the file
print("\nActual keys from file and cleanup result:")
for i in [387, 388, 389]:  # Lines around the area in question
    if i < len(lines):
        line = lines[i]
        match = re.search(r'"([^"]*)"', line)
        if match:
            key = match.group(1)
            cleaned = re.sub(pattern, "", key)
            print(f"Line {i+1}: Original key={repr(key):20s} | After cleanup={repr(cleaned):15s} | Matches itself? {key == cleaned}")
EOFPYTHON

Repository: dbpedia/extraction-framework

Length of output: 3075


🏁 Script executed:

#!/bin/bash
python3 << 'EOFPYTHON'
import re

with open('core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala', 'r', encoding='utf-8') as f:
    lines = f.readlines()

print("=== Context around line 388 with comments ===")
for i in range(375, min(400, len(lines))):
    line = lines[i].rstrip()
    if line.strip():
        print(f"Line {i+1:3d}: {line}")

print("\n=== Summary of the minute keys ===")
print("Line 387: 'मिनट' (Hindi 'minute') -> 'minute'")
print("Line 388: '\\\\\\'' (backslash + backslash + U+2019) -> 'minute'")
print("Line 389: \"'\" (ASCII apostrophe U+0027) -> 'minute' [copied from english]")
print("Line 390: '′' (prime symbol U+2032) -> 'minute'")
print("Line 391: 'm' (ASCII m) -> 'minute'")

print("\n=== Testing what cleanup does to each ===")
pattern = r"[^'\"a-zA-Z]"
test_cases = [
    ("मिनट", "Hindi minute"),
    ("\\\\'" , "Line 388 key"),
    ("'", "ASCII apostrophe"),
    ("′", "Prime symbol"),
    ("m", "ASCII m"),
]

for key, desc in test_cases:
    cleaned = re.sub(pattern, "", key)
    matches = "✓" if key == cleaned else "✗"
    print(f"{matches} {desc:25s}: {repr(key):20s} -> {repr(cleaned):20s}")
EOFPYTHON

Repository: dbpedia/extraction-framework

Length of output: 1879


🏁 Script executed:

#!/bin/bash

echo "=== Searching for tests related to Hindi duration parsing ==="
find . -type f -name "*.scala" -o -name "*.java" | head -20

echo
echo "=== Search for DurationParser tests ==="
find . -type f \( -name "*Test*.scala" -o -name "*test*.scala" \) | xargs grep -l "DurationParser" 2>/dev/null | head -5

echo
echo "=== Search for 'hi' language or Hindi in tests ==="
find . -type f \( -name "*Test*.scala" -o -name "*test*.scala" \) | xargs grep -l "\"hi\"" 2>/dev/null | head -5

echo
echo "=== Check if there are any test inputs with the problematic characters ==="
find . -type f -name "*.scala" | xargs grep -E "(मिनट|U\+2019|\\\\\\\\'" 2>/dev/null | head -10

Repository: dbpedia/extraction-framework

Length of output: 1699


🏁 Script executed:

#!/bin/bash

echo "=== DurationParser test file ==="
cat core/src/test/scala/org/dbpedia/extraction/dataparser/DurationParserTest.scala

echo
echo "=== Check if test covers Hindi language ==="
grep -n "hi\|मिनट\|hindi\|Hindi" core/src/test/scala/org/dbpedia/extraction/dataparser/DurationParserTest.scala

Repository: dbpedia/extraction-framework

Length of output: 7512


This alias won't resolve during parsing.

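The key shown as "\\’" is written in the source file as two backslash characters followed by U+2019 (right single quotation mark); as a Scala string literal that is one backslash plus U+2019. The cleanup step at DurationParser.scala:129-137 applies replaceAll("""[^\'\"a-zA-Z]""", "") to the parsed unit, which removes every character other than the ASCII apostrophe, the ASCII double quote, and ASCII letters. A backslash or U+2019 in the unit is therefore stripped (leaving an empty string) before the map lookup, so the cleaned unit can never equal this key and this entry cannot produce "minute". The ASCII apostrophe entry at line 389 works correctly because that character is in the allowed set.

Either rely on the ASCII apostrophe key from line 389 instead of this entry, or adjust the cleanup regex to preserve the intended character (U+2019) before lookup. In the latter case, note that the backslash in the key is also outside the allowed set, so confirm the key still matches after cleanup.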

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In
`@core/src/main/scala/org/dbpedia/extraction/config/dataparser/DurationParserConfig.scala`
at line 388, the alias entry "\\’" -> "minute" uses a backslash plus U+2019.
The cleanup in DurationParser.scala (the replaceAll("""[^\'\"a-zA-Z]""", "")
call around lines 129-137) strips both characters, so lookups never match this
key. Fix it in one of two ways: (a) in DurationParserConfig.scala, replace the
problematic key with the ASCII apostrophe variant used on the next line (use
"'" as the map key); or (b) in DurationParser.scala, expand the cleanup
character class to include \u2019 so the intended character survives
normalization and map lookups succeed.

"m" -> "minute",
"घंटा" -> "hour",
"दिन" -> "day",
"महीना" -> "month",
"वर्ष" -> "year"
),
"it" -> Map(
"secondo" -> "second",
"secondi" -> "second",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ object DateIntervalMappingConfig
"eu" -> Set("gaur egun", "gaur egun arte", "egun"),
"fr" -> Set("aujourd'hui", "en cours"),
"ga" -> Set("inniu"),
"hi" -> Set("अबतक"),
"hr" -> Set("danas"),
"hu" -> Set("napjainkig"),
"id" -> Set("sekarang"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@ object DisambiguationExtractorConfig
"ga" -> " (idirdhealáin)",
"gl" -> " (homónimos)",
"he" -> " (פירושונים)",
"hi" -> " (बहुविकल्पी)", // eg. https://hi.wikipedia.org/wiki/आयरलैण्ड_(बहुविकल्पी)
"hu" -> " (egyértelműsítő lap)",
"id" -> " (disambig)",
"it" -> " (disambigua)",
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ object HomepageExtractorConfig
"eu" -> Set("webgunea"),
"fr" -> Set("website", "homepage", "web", "site", "siteweb", "site web"),/*cleanup*/
"ga" -> Set("suíomh"),
"hi" -> Set("वेबसाइट"),
"it" -> Set("homepage", "sito", "sito web"),
"ja" -> Set("homepage", "website", "web", "siteweb", "HP", "ホームページ", "ウェブ", "サイト", "ウェブサイト", "公式サイト"),
"mk" -> Set("Портал", "Мреж. место"),
Expand Down Expand Up @@ -68,6 +69,7 @@ object HomepageExtractorConfig
"eu" -> "Kanpo loturak?",
"fr" -> "(?:Lien externe|Liens externes|Liens et documents externes)",
"ga" -> "(?:Naisc sheachtracha|Nasc sheachtrach)",
"hi" -> "बाहरी कड़ियाँ",
"it" -> "Collegamenti esterni",
"ja" -> "外部リンク",
"mk" -> "Надворешни врски",
Expand Down Expand Up @@ -96,6 +98,7 @@ object HomepageExtractorConfig
"eu" -> "ofiziala?",
"fr" -> "officiel",
"ga" -> "oifigiúil",
"hi" -> "आधिकारिक",
"it" -> "ufficiale",
"ja" -> "(?:公式|オフィシャル)",
"mk" -> "официјален",
Expand All @@ -121,6 +124,7 @@ object HomepageExtractorConfig
"es" -> Map("Página_web" -> "1"),
"fr" -> Map("Site_officiel" -> "url"),
"ga" -> Map("Páxina_web" -> "1"),
"hi" -> Map("आधिकारिक वेबसाइट" -> "1"),
"ja" -> Map("Official website" -> "1"),
"pt" -> Map("Oficial" -> "1"),
"ru" -> Map("Официальный сайт" -> "1"),
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ object ImageExtractorConfig
"eu" -> """(?i)\{\{\s?(Cc-by-nc-sa-2.5|Wikimedia_logoa|Copyrightdun_logoa|Lizentzia_gabea|Album_azala|Aldizkari_azala|Fair_use|Bideo-zinta_azala|Dirua|DVD_azala|Egunkari_azala|Film_pantaila_irudia|Film_posterra|HQFL_logotipoa|Ikonoa|Ikurra|Irrati_logotipoa|Jatetxe_logotipoa|Joku_azala|Joku_pantaila_irudia|Kirol_logotipoa|Komiki_azala|Liburu_azala|Logotipoa|Mahai-joku_azala|Olinpiada_logotipoa|Politika_posterra|Propaganda|Software_azala|Software_pantaila_irudia|Zigilua|TB_pantaila_irudia|Web_pantaila_irudia)\s?\}\}""".r,
"fr" -> """(?iu)\{\{\s?(Copyright by Wikimedia|Copyvio|Logo|Screenshot|Ordnance Survey Copyright|Fairuse|Noncommercial|PolandGov|nonderivative|NZCrownCopyright|PD-IndiaGov|ADRM2|Marque déposée)\s?\}\}""".r,
"gl" -> """(?iu)\{\{\s?(non-free|Copyright by Wikimedia|Copyvio|Logo|Screenshot|PD-CAGov|Fairuse|Noncommercial|Nonderivative|NZCrownCopyright|PolandGov|PD-IndiaGov|ADRM2)\s?\}\}""".r,
"hi" -> """(?i)\{\{\s?non-free""".r,
"id" -> """(?i)\{\{\s?(non-free|Fairuse|Logo|LogoOlahraga|LogoTV|FotoHistoris|GambarKarakter|Promophoto|Smithsonian|TampilanFilm|TampilanVideo|TampilanSitus|TampilanPermainan|GambarUang|GambarPerangko|SampulVideo|SampulPermainan|SampulAlbum|SampulBuku|Poster|GambarBerhakTayangBersyarat)\s?\}\}""".r,
"it" -> """(?iu)\{\{\s?(Sconosciuto|Riservato|NonCommerciale|Unknown|Noncommercial|Nonderivative|Copyrighted|Screenshot|Ordinance Survey Copyright|Fairuse|Cc-nc|cc-by-nc|cc-by-nc-2.0|cc-nc-sa|cc-by-nc-sa|Cc-by-nc-sa-1.0|cc-by-nc-sa-2.0|cc-nd-nc|cc-by-nd-nc|cc-by-nd-nc-2.0|cc-nd|cc-by-nd|cc-by-nd-2.0|TW-cc-by-nc-nd-2.0|TW-cc-by-nc-sa-2.0|Copyright by Wikimedia|CopyrightbyWikimedia)\s?\}\}""".r,
"ja" -> """(?iu)\{\{\s?(Copyright by Wikimedia|Copyvio|Logo|Screenshot|PD-CAGov|Fair use|Noncommercial|PolandGov|Nonderivative|NZCrownCopyright|PD-IndiaGov|ADRM2|RomanianGovernmentCopyright|FrenchMinistryOfForeignAffairs|IRFCA|Members of the Riksdag|Attribution-Ubisoft)\s?\}\}""".r,
Expand Down
2 changes: 1 addition & 1 deletion dump/extraction.default.properties
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ extractors.ga=.MappingExtractor,.HomepageExtractor

extractors.gl=.MappingExtractor

extractors.hi=.MappingExtractor
extractors.hi=.MappingExtractor,.HomepageExtractor,.DisambiguationExtractor,.TopicalConceptsExtractor,.ImageExtractorNew,.AnchorTextExtractor,.CommonsResourceExtractor

extractors.hr=.MappingExtractor

Expand Down
2 changes: 1 addition & 1 deletion dump/extraction.spark.properties
Original file line number Diff line number Diff line change
Expand Up @@ -68,7 +68,7 @@ extractors.ga=.HomepageExtractor

extractors.gl=

extractors.hi=
extractors.hi=.HomepageExtractor,.DisambiguationExtractor,.TopicalConceptsExtractor,.AnchorTextExtractor,.CommonsResourceExtractor

extractors.hr=

Expand Down
Loading
Loading