Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -13,26 +13,30 @@ import scala.language.reflectiveCalls
/**
* Extracts the grammatical gender of people using a heuristic.
*/
class GenderExtractor(
class GenderExtractor(
context : {
def mappings : Mappings
def ontology : Ontology
def language : Language
def redirects : Redirects
}
)
def redirects : Redirects
}
)
extends MappingExtractor(context)
{
private val language = context.language.wikiCode

private val pronounMap: Map[String, String] = GenderExtractorConfig.pronounsMap(language)
private val pronounMap: Map[String, String] =
GenderExtractorConfig.pronounsMap(language)

// ✅ Use ontology instead of hardcoded URIs
private val genderProperty =
context.ontology.properties("foaf:gender").uri

private val typeProperty =
context.ontology.properties("rdf:type").uri

// FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.FOAF)
private val genderProperty = "http://xmlns.com/foaf/0.1/gender"
// FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.RDF)
private val typeProperty = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type"
// FIXME: don't use string constant, use context.ontology (or at least DBpediaNamespace.ONTOLOGY)
private val personUri = "http://dbpedia.org/ontology/Person"
private val personUri =
context.ontology.classes("Person").uri

override val datasets = Set(DBpediaDatasets.Genders)

Expand All @@ -45,14 +49,17 @@ extends MappingExtractor(context)
// Even better: in the first extraction pass, extract all types. Use them in the second pass.
val mappingGraph = super.extract(node, subjectUri)

// if this page is mapped onto Person
if (mappingGraph.exists(q => q.predicate == typeProperty && q.value == personUri))
// check if page is typed as Person
if (mappingGraph.exists(q =>
q.predicate == typeProperty && q.value == personUri
))
{
// get the page text
val wikiText: String = node.toWikiText

// count gender pronouns
var genderCounts: Map[String, Int] = Map()

for ((pronoun, gender) <- pronounMap)
{
val regex = new Regex("\\W" + pronoun + "\\W")
Expand All @@ -65,6 +72,7 @@ extends MappingExtractor(context)
var maxGender = ""
var maxCount = 0
var secondCount = 0.0

for ((gender, count) <- genderCounts)
{
if (count > maxCount)
Expand All @@ -75,10 +83,24 @@ extends MappingExtractor(context)
}
}

// output triple for maximum gender
if (maxGender != "" && maxCount > GenderExtractorConfig.minCount && maxCount/secondCount > GenderExtractorConfig.minDifference)
if (
maxGender != "" &&
maxCount > GenderExtractorConfig.minCount &&
secondCount > 0 &&
maxCount / secondCount > GenderExtractorConfig.minDifference
)
Comment on lines +86 to +91
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Behavior change: single-gender pages will no longer be extracted.

The added secondCount > 0 guard (Line 89) changes semantics. Previously, when only one gender's pronouns were found (secondCount == 0.0), the division maxCount / 0.0 yielded Infinity, which exceeded minDifference, so the gender was reported. Now those cases are silently skipped.

This could suppress valid extractions for pages where pronouns of only one gender appear. If intentional, please add a comment explaining the rationale. If not, consider an alternative like:

      if (
        maxGender != "" &&
        maxCount > GenderExtractorConfig.minCount &&
-       secondCount > 0 &&
-       maxCount / secondCount > GenderExtractorConfig.minDifference
+       (secondCount == 0 || maxCount / secondCount > GenderExtractorConfig.minDifference)
      )
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if (
maxGender != "" &&
maxCount > GenderExtractorConfig.minCount &&
secondCount > 0 &&
maxCount / secondCount > GenderExtractorConfig.minDifference
)
if (
maxGender != "" &&
maxCount > GenderExtractorConfig.minCount &&
(secondCount == 0 || maxCount / secondCount > GenderExtractorConfig.minDifference)
)
🤖 Prompt for AI Agents
In `@core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala`
around lines 86 - 91, The new guard `secondCount > 0` in the extraction
condition changes semantics and prevents single‑gender pages from being
extracted; either remove that guard and instead explicitly handle a zero
`secondCount` (e.g., treat `secondCount == 0` as an automatic pass when
`maxCount > GenderExtractorConfig.minCount`), or keep the guard but add a clear
comment explaining the intentional behavior change; update the condition around
`maxGender`, `maxCount`, `secondCount`, `GenderExtractorConfig.minCount` and
`GenderExtractorConfig.minDifference` so that zero `secondCount` is handled
deterministically (avoid silent skipping) and document the chosen rationale
adjacent to the `secondCount` check.

{
return Seq(new Quad(context.language, DBpediaDatasets.Genders, subjectUri, genderProperty, maxGender, node.sourceIri, new Datatype("rdf:langString")))
return Seq(
new Quad(
context.language,
DBpediaDatasets.Genders,
subjectUri,
genderProperty,
maxGender,
node.sourceIri,
new Datatype("rdf:langString")
)
)
}
}

Expand Down
103 changes: 62 additions & 41 deletions core/src/main/scala/org/dbpedia/extraction/util/WikiInfo.scala
Original file line number Diff line number Diff line change
Expand Up @@ -20,68 +20,89 @@ class WikiInfo(val wikicode: String, val pages: Int)
object WikiInfo
{
val logger = Logger.getLogger(WikiInfo.getClass.getName)

// hard-coded - there probably is no mirror, and the format is very specific.
// TODO: user might want to use a local file...
// TODO: maybe change this to XML serialization
val URL = new URL("http://wikistats.wmflabs.org/api.php?action=dump&table=wikipedias&format=csv")
// Most browsers would save the file with this name, because s23.org returns it in an HTTP header.

// Most browsers would save the file with this name
val FileName = "wikipedias.csv"

/**
 * Reads WikiInfo entries from a local CSV file.
 *
 * @param file  CSV file in the wikistats dump format
 * @param codec character encoding used to decode the file
 * @return parsed WikiInfo entries, one per valid CSV line
 */
def fromFile(file: File, codec: Codec): Seq[WikiInfo] = {
  val input = Source.fromFile(file)(codec)
  try {
    fromSource(input)
  } finally {
    // always release the file handle, even if parsing throws
    input.close()
  }
}

/**
 * Downloads and parses WikiInfo entries from a remote CSV resource.
 *
 * @param url   location of the CSV file (e.g. the wikistats dump endpoint)
 * @param codec character encoding used to decode the response body
 * @return parsed WikiInfo entries, one per valid CSV line
 */
def fromURL(url: URL, codec: Codec): Seq[WikiInfo] = {
  val input = Source.fromURL(url)(codec)
  try {
    fromSource(input)
  } finally {
    // always release the underlying connection/stream, even on failure
    input.close()
  }
}
def fromSource(source: Source): Seq[WikiInfo] = {

/**
 * Parses WikiInfo entries from an already-open source.
 * The caller retains ownership of the source and is responsible for closing it.
 */
def fromSource(source: Source): Seq[WikiInfo] =
  fromLines(source.getLines)

/**
* Retrieves a list of all available Wikipedias from a CSV file like http://s23.org/wikistats/wikipedias_csv.php
*
*/
def fromLines(lines: Iterator[String]): Seq[WikiInfo] = {
* Retrieves a list of all available Wikipedias from a CSV file.
*/
def fromLines(lines: Iterator[String]): Seq[WikiInfo] = {
val info = new ArrayBuffer[WikiInfo]

if (! lines.hasNext) throw new Exception("empty file")
lines.next // skip first line (headers)

for (line <- lines)
if (line.nonEmpty)
fromLine(line) match{
case Some(x) => info += x
case None =>

if (!lines.hasNext) {
logger.warning("wikipedias.csv is empty")
return info
}

lines.next() // skip header

for (line <- lines) {
if (line.nonEmpty) {
fromLine(line) match {
case Some(wikiInfo) => info += wikiInfo
case None => // skip malformed line
}

}
}

info
}

/**
* Reads a WikiInfo object from a single CSV line.
* Malformed lines are logged and skipped.
*/
def fromLine(line: String): Option[WikiInfo] = {
val fields = line.split(",", -1)

if (fields.length < 15) throw new Exception("expected [15] fields, found ["+fields.length+"] in line ["+line+"]")

val pages = try fields(4).toInt
catch { case nfe: NumberFormatException => 0 }

val wikiCode = fields(2)
if (! ConfigUtils.LanguageRegex.pattern.matcher(fields(2)).matches) throw new Exception("expected language code in field with index [2], found line ["+line+"]")

//if(Language.map.keySet.contains(wikiCode))
Option(new WikiInfo(wikiCode, pages))
//else
//{
// logger.log(Level.WARNING, "Language: " + wikiCode + " will be ignored. Add this language to the addonlangs.json file to extract it.")
// None
//}

val fields = line.split(",", -1)

// 1️⃣ Validate field count
if (fields.length < 15) {
logger.warning(
s"Skipping malformed CSV line: expected 15 fields, found ${fields.length}. Line: [$line]"
)
return None
}

// 2️⃣ Parse pages safely
val pages =
try fields(4).toInt
catch {
case _: NumberFormatException =>
logger.warning(
s"Invalid page count in CSV line, defaulting to 0. Line: [$line]"
)
0
}

// 3️⃣ Validate language code
val wikiCode = fields(2)
if (!ConfigUtils.LanguageRegex.pattern.matcher(wikiCode).matches) {
logger.warning(
s"Invalid language code [$wikiCode] in CSV line, skipping. Line: [$line]"
)
return None
}

// 4️⃣ Valid line → create WikiInfo
Some(new WikiInfo(wikiCode, pages))
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -148,31 +148,50 @@ abstract class ExtractionManager(

protected def loadMappingPages(language : Language) : Map[WikiTitle, WikiPage] =
{
val namespace = language.wikiCode match {
case "wikidata" =>
Namespace.mappings(Language.English)
case _ =>
Namespace.mappings.getOrElse(language, throw new NoSuchElementException("no mapping namespace for language "+language.wikiCode))
}
val namespaceOpt = language.wikiCode match {
case "wikidata" =>
Some(Namespace.mappings(Language.English))
case _ =>
Namespace.mappings.get(language)
}

if (namespaceOpt.isEmpty) {
logger.warning(
s"No mapping namespace for language ${language.wikiCode} – skipping mapping pages."
)
return Map.empty
}

val source = if (paths.mappingsDir != null && paths.mappingsDir.isDirectory)
{
val file = new File(paths.mappingsDir, namespace.name(Language.Mappings).replace(' ','_')+".xml")
if(!file.exists()) {
logger.warning("MAPPING FILE [" + file + "] DOES NOT EXIST! WILL BE IGNORED")
return Map[WikiTitle, WikiPage]()
}
logger.warning("LOADING MAPPINGS NOT FROM SERVER, BUT FROM LOCAL FILE ["+file+"] - MAY BE OUTDATED - ONLY FOR TESTING!")
XMLSource.fromFile(file, language) // TODO: use Language.Mappings?
}
else
{
val url = paths.apiUrl
WikiSource.fromNamespaces(Set(namespace), url, language) // TODO: use Language.Mappings?
val namespace = namespaceOpt.get

val source = if (paths.mappingsDir != null && paths.mappingsDir.isDirectory)
{
val file = new File(
paths.mappingsDir,
namespace.name(Language.Mappings).replace(' ','_') + ".xml"
)

if (!file.exists()) {
logger.warning(
"MAPPING FILE [" + file + "] DOES NOT EXIST! WILL BE IGNORED"
)
return Map.empty
}

source.map(page => (page.title, page)).toMap
logger.warning(
"LOADING MAPPINGS NOT FROM SERVER, BUT FROM LOCAL FILE [" + file +
"] - MAY BE OUTDATED - ONLY FOR TESTING!"
)

XMLSource.fromFile(file, language)
}
else
{
val url = paths.apiUrl
WikiSource.fromNamespaces(Set(namespace), url, language)
}

source.map(page => (page.title, page)).toMap
}

protected def loadOntology() : Ontology =
Expand Down
Loading