Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,30 @@ import scala.language.reflectiveCalls
/**
* Extracts the grammatical gender of people using a heuristic.
*/
class GenderExtractor(
class GenderExtractor(
context : {
def mappings : Mappings
def ontology : Ontology
def language : Language
def redirects : Redirects
}
)
def redirects : Redirects
}
)
extends MappingExtractor(context)
{
private val language = context.language.wikiCode

private val pronounMap: Map[String, String] = GenderExtractorConfig.pronounsMap(language)
private val pronounMap: Map[String, String] =
GenderExtractorConfig.pronounsMap(language)

// ✅ Use ontology instead of hardcoded URIs
private val genderProperty =
context.ontology.properties("foaf:gender").uri

private val typeProperty =
context.ontology.properties("rdf:type").uri

// FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.FOAF)
private val genderProperty = "http://xmlns.com/foaf/0.1/gender"
// FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.RDF)
private val typeProperty = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
// FIXME: don't use string constant, use context.ontology (or at least DBpediaNamespace.ONTOLOGY)
private val personUri = "http://dbpedia.org/ontology/Person"
private val personUri =
context.ontology.classes("Person").uri

override val datasets = Set(DBpediaDatasets.Genders)

Expand All @@ -45,14 +49,17 @@ extends MappingExtractor(context)
// Even better: in the first extraction pass, extract all types. Use them in the second pass.
val mappingGraph = super.extract(node, subjectUri)

// if this page is mapped onto Person
if (mappingGraph.exists(q => q.predicate == typeProperty && q.value == personUri))
// check if page is typed as Person
if (mappingGraph.exists(q =>
q.predicate == typeProperty && q.value == personUri
))
{
// get the page text
val wikiText: String = node.toWikiText

// count gender pronouns
var genderCounts: Map[String, Int] = Map()

for ((pronoun, gender) <- pronounMap)
{
val regex = new Regex("\\W" + pronoun + "\\W")
Expand All @@ -65,6 +72,7 @@ extends MappingExtractor(context)
var maxGender = ""
var maxCount = 0
var secondCount = 0.0

for ((gender, count) <- genderCounts)
{
if (count > maxCount)
Expand All @@ -75,10 +83,24 @@ extends MappingExtractor(context)
}
}

// output triple for maximum gender
if (maxGender != "" && maxCount > GenderExtractorConfig.minCount && maxCount/secondCount > GenderExtractorConfig.minDifference)
if (
maxGender != "" &&
maxCount > GenderExtractorConfig.minCount &&
secondCount > 0 &&
maxCount / secondCount > GenderExtractorConfig.minDifference
)
Comment on lines +86 to +91
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Behavior change: single-gender pages will no longer be extracted.

The added secondCount > 0 guard (Line 89) changes semantics. Previously, when only one gender's pronouns were found (secondCount == 0.0), the division maxCount / 0.0 yielded Infinity, which exceeded minDifference, so the gender was reported. Now those cases are silently skipped.

This could suppress valid extractions for pages where pronouns of only one gender appear. If intentional, please add a comment explaining the rationale. If not, consider an alternative like:

      if (
        maxGender != "" &&
        maxCount > GenderExtractorConfig.minCount &&
-       secondCount > 0 &&
-       maxCount / secondCount > GenderExtractorConfig.minDifference
+       (secondCount == 0 || maxCount / secondCount > GenderExtractorConfig.minDifference)
      )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if (
maxGender != "" &&
maxCount > GenderExtractorConfig.minCount &&
secondCount > 0 &&
maxCount / secondCount > GenderExtractorConfig.minDifference
)
if (
maxGender != "" &&
maxCount > GenderExtractorConfig.minCount &&
(secondCount == 0 || maxCount / secondCount > GenderExtractorConfig.minDifference)
)
🤖 Prompt for AI Agents
In `@core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala`
around lines 86 - 91, The new guard `secondCount > 0` in the extraction
condition changes semantics and prevents single‑gender pages from being
extracted; either remove that guard and instead explicitly handle a zero
`secondCount` (e.g., treat `secondCount == 0` as an automatic pass when
`maxCount > GenderExtractorConfig.minCount`), or keep the guard but add a clear
comment explaining the intentional behavior change; update the condition around
`maxGender`, `maxCount`, `secondCount`, `GenderExtractorConfig.minCount` and
`GenderExtractorConfig.minDifference` so that zero `secondCount` is handled
deterministically (avoid silent skipping) and document the chosen rationale
adjacent to the `secondCount` check.

{
return Seq(new Quad(context.language, DBpediaDatasets.Genders, subjectUri, genderProperty, maxGender, node.sourceIri, new Datatype("rdf:langString")))
return Seq(
new Quad(
context.language,
DBpediaDatasets.Genders,
subjectUri,
genderProperty,
maxGender,
node.sourceIri,
new Datatype("rdf:langString")
)
)
}
}

Expand Down
103 changes: 62 additions & 41 deletions core/src/main/scala/org/dbpedia/extraction/util/WikiInfo.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,68 +20,89 @@ class WikiInfo(val wikicode: String, val pages: Int)
object WikiInfo
{
val logger = Logger.getLogger(WikiInfo.getClass.getName)

// hard-coded - there probably is no mirror, and the format is very specific.
// TODO: user might want to use a local file...
// TODO: maybe change this to XML serialization
val URL = new URL("http://wikistats.wmflabs.org/api.php?action=dump&table=wikipedias&format=csv")
// Most browsers would save the file with this name, because s23.org returns it in an HTTP header.

// Most browsers would save the file with this name
val FileName = "wikipedias.csv"

/**
 * Reads WikiInfo entries from a local CSV file.
 *
 * @param file  CSV file in the wikistats dump format
 * @param codec character encoding used to decode the file
 * @return parsed WikiInfo entries, one per valid CSV line
 */
def fromFile(file: File, codec: Codec): Seq[WikiInfo] = {
  val input = Source.fromFile(file)(codec)
  try {
    fromSource(input)
  } finally {
    // always release the file handle, even if parsing throws
    input.close()
  }
}

/**
 * Downloads and parses WikiInfo entries from a remote CSV resource.
 *
 * @param url   location of the CSV file (e.g. the wikistats dump endpoint)
 * @param codec character encoding used to decode the response body
 * @return parsed WikiInfo entries, one per valid CSV line
 */
def fromURL(url: URL, codec: Codec): Seq[WikiInfo] = {
  val input = Source.fromURL(url)(codec)
  try {
    fromSource(input)
  } finally {
    // always release the underlying connection/stream, even on failure
    input.close()
  }
}
def fromSource(source: Source): Seq[WikiInfo] = {

/**
 * Parses WikiInfo entries from an already-open source.
 * The caller retains ownership of the source and is responsible for closing it.
 */
def fromSource(source: Source): Seq[WikiInfo] =
  fromLines(source.getLines)

/**
* Retrieves a list of all available Wikipedias from a CSV file like http://s23.org/wikistats/wikipedias_csv.php
*
*/
def fromLines(lines: Iterator[String]): Seq[WikiInfo] = {
* Retrieves a list of all available Wikipedias from a CSV file.
*/
def fromLines(lines: Iterator[String]): Seq[WikiInfo] = {
val info = new ArrayBuffer[WikiInfo]

if (! lines.hasNext) throw new Exception("empty file")
lines.next // skip first line (headers)

for (line <- lines)
if (line.nonEmpty)
fromLine(line) match{
case Some(x) => info += x
case None =>

if (!lines.hasNext) {
logger.warning("wikipedias.csv is empty")
return info
}

lines.next() // skip header

for (line <- lines) {
if (line.nonEmpty) {
fromLine(line) match {
case Some(wikiInfo) => info += wikiInfo
case None => // skip malformed line
}

}
}

info
}

/**
* Reads a WikiInfo object from a single CSV line.
* Malformed lines are logged and skipped.
*/
def fromLine(line: String): Option[WikiInfo] = {
val fields = line.split(",", -1)

if (fields.length < 15) throw new Exception("expected [15] fields, found ["+fields.length+"] in line ["+line+"]")

val pages = try fields(4).toInt
catch { case nfe: NumberFormatException => 0 }

val wikiCode = fields(2)
if (! ConfigUtils.LanguageRegex.pattern.matcher(fields(2)).matches) throw new Exception("expected language code in field with index [2], found line ["+line+"]")

//if(Language.map.keySet.contains(wikiCode))
Option(new WikiInfo(wikiCode, pages))
//else
//{
// logger.log(Level.WARNING, "Language: " + wikiCode + " will be ignored. Add this language to the addonlangs.json file to extract it.")
// None
//}

val fields = line.split(",", -1)

// 1️⃣ Validate field count
if (fields.length < 15) {
logger.warning(
s"Skipping malformed CSV line: expected 15 fields, found ${fields.length}. Line: [$line]"
)
return None
}

// 2️⃣ Parse pages safely
val pages =
try fields(4).toInt
catch {
case _: NumberFormatException =>
logger.warning(
s"Invalid page count in CSV line, defaulting to 0. Line: [$line]"
)
0
}

// 3️⃣ Validate language code
val wikiCode = fields(2)
if (!ConfigUtils.LanguageRegex.pattern.matcher(wikiCode).matches) {
logger.warning(
s"Invalid language code [$wikiCode] in CSV line, skipping. Line: [$line]"
)
return None
}

// 4️⃣ Valid line → create WikiInfo
Some(new WikiInfo(wikiCode, pages))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -148,31 +148,50 @@ abstract class ExtractionManager(

protected def loadMappingPages(language : Language) : Map[WikiTitle, WikiPage] =
{
val namespace = language.wikiCode match {
case "wikidata" =>
Namespace.mappings(Language.English)
case _ =>
Namespace.mappings.getOrElse(language, throw new NoSuchElementException("no mapping namespace for language "+language.wikiCode))
}
val namespaceOpt = language.wikiCode match {
case "wikidata" =>
Some(Namespace.mappings(Language.English))
case _ =>
Namespace.mappings.get(language)
}

if (namespaceOpt.isEmpty) {
logger.warning(
s"No mapping namespace for language ${language.wikiCode} – skipping mapping pages."
)
return Map.empty
}

val source = if (paths.mappingsDir != null && paths.mappingsDir.isDirectory)
{
val file = new File(paths.mappingsDir, namespace.name(Language.Mappings).replace(' ','_')+".xml")
if(!file.exists()) {
logger.warning("MAPPING FILE [" + file + "] DOES NOT EXIST! WILL BE IGNORED")
return Map[WikiTitle, WikiPage]()
}
logger.warning("LOADING MAPPINGS NOT FROM SERVER, BUT FROM LOCAL FILE ["+file+"] - MAY BE OUTDATED - ONLY FOR TESTING!")
XMLSource.fromFile(file, language) // TODO: use Language.Mappings?
}
else
{
val url = paths.apiUrl
WikiSource.fromNamespaces(Set(namespace), url, language) // TODO: use Language.Mappings?
val namespace = namespaceOpt.get

val source = if (paths.mappingsDir != null && paths.mappingsDir.isDirectory)
{
val file = new File(
paths.mappingsDir,
namespace.name(Language.Mappings).replace(' ','_') + ".xml"
)

if (!file.exists()) {
logger.warning(
"MAPPING FILE [" + file + "] DOES NOT EXIST! WILL BE IGNORED"
)
return Map.empty
}

source.map(page => (page.title, page)).toMap
logger.warning(
"LOADING MAPPINGS NOT FROM SERVER, BUT FROM LOCAL FILE [" + file +
"] - MAY BE OUTDATED - ONLY FOR TESTING!"
)

XMLSource.fromFile(file, language)
}
else
{
val url = paths.apiUrl
WikiSource.fromNamespaces(Set(namespace), url, language)
}

source.map(page => (page.title, page)).toMap
}

protected def loadOntology() : Ontology =
Expand Down
Loading