From d5867cd664577e898a1427f52ca51b17ce8b0e41 Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Sat, 24 Jan 2026 18:09:20 +0000 Subject: [PATCH 1/8] updated chnages --- .../extraction/mappings/GenderExtractor.scala | 177 ++++++++++-------- 1 file changed, 104 insertions(+), 73 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index 32b1221e59..d75130b18a 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -1,88 +1,119 @@ package org.dbpedia.extraction.mappings -import org.dbpedia.extraction.config.provenance.DBpediaDatasets -import org.dbpedia.extraction.transform.Quad -import org.dbpedia.extraction.wikiparser._ import org.dbpedia.extraction.config.mappings.GenderExtractorConfig +import org.dbpedia.extraction.config.provenance.DBpediaDatasets import org.dbpedia.extraction.ontology.Ontology +import org.dbpedia.extraction.ontology.datatypes.Datatype +import org.dbpedia.extraction.transform.Quad import org.dbpedia.extraction.util.Language +import org.dbpedia.extraction.wikiparser._ import util.matching.Regex -import org.dbpedia.extraction.ontology.datatypes.Datatype import scala.language.reflectiveCalls - /** - * Extracts the grammatical gender of people using a heuristic. + * Extracts the grammatical gender of people using a pronoun-based heuristic. */ -class GenderExtractor( - context : { - def mappings : Mappings - def ontology : Ontology - def language : Language - def redirects : Redirects - } -) -extends MappingExtractor(context) -{ - private val language = context.language.wikiCode - - private val pronounMap: Map[String, String] = GenderExtractorConfig.pronounsMap(language) - - // FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.FOAF) - private val genderProperty = "http://xmlns.com/foaf/0.1/gender" - // FIXME: don't use string constant, use context.ontology (or at least RdfNamespace.RDF) - private val typeProperty = "http://www.w3.org/1999/02/22-rdf-syntax-ns#type" - // FIXME: don't use string constant, use context.ontology (or at least DBpediaNamespace.ONTOLOGY) - private val personUri = "http://dbpedia.org/ontology/Person" +class GenderExtractor( + context: { + def mappings: Mappings + def ontology: Ontology + def language: Language + def redirects: Redirects + } +) extends MappingExtractor(context) { + + /** Language code (en, de, fr, etc.) */ + private val language: String = + context.language.wikiCode + /** Pronoun → gender map (from config) */ + private val pronounMap: Map[String, String] = + GenderExtractorConfig.pronounsMap(language) + + /** Ontology-based properties & classes */ + private val genderProperty = + context.ontology.properties("foaf:gender") + + private val typeProperty = + context.ontology.properties("rdf:type") + + private val personClass = + context.ontology.classes("Person") + + private val langStringDatatype = + new Datatype("rdf:langString") override val datasets = Set(DBpediaDatasets.Genders) - override def extract(node : PageNode, subjectUri : String) : Seq[Quad] = - { - // apply mappings - // FIXME: To find out if it's a person, we extract all mapped properties a second time and throw them away. - // Find a better solution. For example: Make sure that this extractor runs after the - // MappingExtractor. In the MappingExtractor, set the page type as an attriute. - // Even better: in the first extraction pass, extract all types. Use them in the second pass. - val mappingGraph = super.extract(node, subjectUri) - - // if this page is mapped onto Person - if (mappingGraph.exists(q => q.predicate == typeProperty && q.value == personUri)) - { - // get the page text - val wikiText: String = node.toWikiText - - // count gender pronouns - var genderCounts: Map[String, Int] = Map() - for ((pronoun, gender) <- pronounMap) - { - val regex = new Regex("\\W" + pronoun + "\\W") - val count = regex.findAllIn(wikiText).size - val oldCount = genderCounts.getOrElse(gender, 0) - genderCounts = genderCounts.updated(gender, oldCount + count) - } - - // get maximum gender - var maxGender = "" - var maxCount = 0 - var secondCount = 0.0 - for ((gender, count) <- genderCounts) - { - if (count > maxCount) - { - secondCount = maxCount.toDouble - maxCount = count - maxGender = gender - } - } - - // output triple for maximum gender - if (maxGender != "" && maxCount > GenderExtractorConfig.minCount && maxCount/secondCount > GenderExtractorConfig.minDifference) - { - return Seq(new Quad(context.language, DBpediaDatasets.Genders, subjectUri, genderProperty, maxGender, node.sourceIri, new Datatype("rdf:langString"))) - } + override def extract(node: PageNode, subjectUri: String): Seq[Quad] = { + + /** First pass: extract mappings to detect rdf:type */ + val mappingGraph: Seq[Quad] = + super.extract(node, subjectUri) + + /** Check if entity is a dbo:Person */ + val isPerson: Boolean = + mappingGraph.exists(q => + q.predicate == typeProperty.uri && + q.value == personClass.uri + ) + + if (!isPerson) return Seq.empty + + /** Get full wiki text */ + val wikiText: String = + node.toWikiText + + /** Count pronouns by gender */ + var genderCounts: Map[String, Int] = + Map.empty.withDefaultValue(0) + + for ((pronoun, gender) <- pronounMap) { + val regex = + new Regex("(?i)\\b" + Regex.quote(pronoun) + "\\b") + + val count = + regex.findAllIn(wikiText).size + + genderCounts = + genderCounts.updated(gender, genderCounts(gender) + count) } - Seq.empty - } + if (genderCounts.isEmpty) return Seq.empty + + /** Find dominant gender */ + val sorted = + genderCounts.toSeq.sortBy(-_._2) + + val (maxGender, maxCount) = + sorted.head + + val secondCount: Double = + if (sorted.size > 1) sorted(1)._2.toDouble else 0.0 -} + /** Avoid division-by-zero */ + val differenceOk: Boolean = + secondCount == 0.0 || + (maxCount.toDouble / secondCount) > + GenderExtractorConfig.minDifference + + /** Threshold checks */ + if ( + maxGender.nonEmpty && + maxCount > GenderExtractorConfig.minCount && + differenceOk + ) { + Seq( + new Quad( + context.language, + DBpediaDatasets.Genders, + subjectUri, + genderProperty, + maxGender, + node.sourceIri, + langStringDatatype + ) + ) + } else { + Seq.empty + } + } +} \ No newline at end of file From 693d5f9ee1606e967af19b952d71d7290b6443d4 Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Sun, 8 Feb 2026 17:35:54 +0000 Subject: [PATCH 2/8] smallchnage --- .../extraction/mappings/GenderExtractor.scala | 42 ++++++++----------- 1 file changed, 17 insertions(+), 25 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index d75130b18a..8ac56def08 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -1,53 +1,45 @@ package org.dbpedia.extraction.mappings -import org.dbpedia.extraction.config.mappings.GenderExtractorConfig import org.dbpedia.extraction.config.provenance.DBpediaDatasets -import org.dbpedia.extraction.ontology.Ontology -import org.dbpedia.extraction.ontology.datatypes.Datatype import org.dbpedia.extraction.transform.Quad -import org.dbpedia.extraction.util.Language import org.dbpedia.extraction.wikiparser._ +import org.dbpedia.extraction.config.mappings.GenderExtractorConfig +import org.dbpedia.extraction.ontology.Ontology +import org.dbpedia.extraction.util.Language import util.matching.Regex +import org.dbpedia.extraction.ontology.datatypes.Datatype import scala.language.reflectiveCalls /** * Extracts the grammatical gender of people using a pronoun-based heuristic. */ class GenderExtractor( context: { - def mappings: Mappings - def ontology: Ontology - def language: Language - def redirects: Redirects + def mappings: Mappings + def ontology: Ontology + def language: Language + def redirects: Redirects } ) extends MappingExtractor(context) { - /** Language code (en, de, fr, etc.) */ private val language: String = - context.language.wikiCode + context.language.wikiCode /** Pronoun → gender map (from config) */ private val pronounMap: Map[String, String] = - GenderExtractorConfig.pronounsMap(language) - + GenderExtractorConfig.pronounsMap(language) /** Ontology-based properties & classes */ private val genderProperty = - context.ontology.properties("foaf:gender") - + context.ontology.properties("foaf:gender") private val typeProperty = - context.ontology.properties("rdf:type") - + context.ontology.properties("rdf:type") private val personClass = - context.ontology.classes("Person") - + context.ontology.classes("Person") private val langStringDatatype = - new Datatype("rdf:langString") - + new Datatype("rdf:langString") override val datasets = Set(DBpediaDatasets.Genders) - override def extract(node: PageNode, subjectUri: String): Seq[Quad] = { - - /** First pass: extract mappings to detect rdf:type */ - val mappingGraph: Seq[Quad] = - super.extract(node, subjectUri) + /** First pass: extract mappings to detect rdf:type */ + val mappingGraph: Seq[Quad] = + super.extract(node, subjectUri) /** Check if entity is a dbo:Person */ val isPerson: Boolean = From c533fec37a0cc3ee1aa4af012f9e09490d36a3fa Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Sun, 8 Feb 2026 17:45:50 +0000 Subject: [PATCH 3/8] change --- .../extraction/mappings/GenderExtractor.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index 8ac56def08..28414f9df4 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -14,12 +14,14 @@ import scala.language.reflectiveCalls */ class GenderExtractor( context: { - def mappings: Mappings - def ontology: Ontology - def language: Language - def redirects: Redirects + def mappings: Mappings + def ontology: Ontology + def language: Language + def redirects: Redirects } -) extends MappingExtractor(context) { +) +extends MappingExtractor(context) +{ /** Language code (en, de, fr, etc.) */ private val language: String = context.language.wikiCode From d5cc749aa999c4f0557bacf3e02f4855b6cd4166 Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Sun, 8 Feb 2026 17:49:45 +0000 Subject: [PATCH 4/8] indentation --- .../extraction/mappings/GenderExtractor.scala | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index 28414f9df4..559bdfb10a 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -14,11 +14,13 @@ import scala.language.reflectiveCalls */ class GenderExtractor( context: { - def mappings: Mappings - def ontology: Ontology - def language: Language - def redirects: Redirects - } + def mappings : Mappings + def ontology : Ontology + def language : Language + def redirects : Redirects + } + + ) extends MappingExtractor(context) { From e6312c356433c9144b3ba75e64130ac3453abbb2 Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Sun, 8 Feb 2026 17:53:37 +0000 Subject: [PATCH 5/8] schange --- .../org/dbpedia/extraction/mappings/GenderExtractor.scala | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index 559bdfb10a..dfd05c6e16 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -13,12 +13,14 @@ import scala.language.reflectiveCalls * Extracts the grammatical gender of people using a pronoun-based heuristic. */ class GenderExtractor( - context: { + context : { def mappings : Mappings def ontology : Ontology def language : Language def redirects : Redirects - } + } + + ) From 436543b99322afeb54cc011b282eaff0277d4b5f Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Sun, 8 Feb 2026 17:58:11 +0000 Subject: [PATCH 6/8] sschnage --- .../org/dbpedia/extraction/mappings/GenderExtractor.scala | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index dfd05c6e16..9770abd984 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -9,15 +9,16 @@ import org.dbpedia.extraction.util.Language import util.matching.Regex import org.dbpedia.extraction.ontology.datatypes.Datatype import scala.language.reflectiveCalls + /** * Extracts the grammatical gender of people using a pronoun-based heuristic. */ -class GenderExtractor( +class GenderExtractor( context : { def mappings : Mappings def ontology : Ontology def language : Language - def redirects : Redirects + def redirects : Redirects } From a9017ca28fba40af9530c953a4c4bc1e8fac092c Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Wed, 11 Feb 2026 14:45:06 +0000 Subject: [PATCH 7/8] removedspaces --- .../org/dbpedia/extraction/mappings/GenderExtractor.scala | 4 ---- 1 file changed, 4 deletions(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index 9770abd984..b81d97a436 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -20,10 +20,6 @@ class GenderExtractor( def language : Language def redirects : Redirects } - - - - ) extends MappingExtractor(context) { From 62cdd3d134a16b2a9bd96d070623fd674384e272 Mon Sep 17 00:00:00 2001 From: vaibhav45sktech Date: Wed, 11 Feb 2026 14:46:27 +0000 Subject: [PATCH 8/8] spaces --- .../scala/org/dbpedia/extraction/mappings/GenderExtractor.scala | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala index b81d97a436..014346bea8 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/GenderExtractor.scala @@ -21,7 +21,7 @@ class GenderExtractor( def redirects : Redirects } ) -extends MappingExtractor(context) +extends MappingExtractor(context) { /** Language code (en, de, fr, etc.) */ private val language: String =