diff --git a/.DS_Store b/.DS_Store
new file mode 100644
index 0000000000..50cdeb7d03
Binary files /dev/null and b/.DS_Store differ
diff --git a/.gitignore b/.gitignore
index 0733ddae0a..beedddcb6f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,4 +6,4 @@ target/
*.lck
*.tmp
java_pid*
-dump/test-basedir
+dump/test-basedir
\ No newline at end of file
diff --git a/core/.project b/core/.project
index a993a4c099..595e9e19b4 100644
--- a/core/.project
+++ b/core/.project
@@ -1,13 +1,35 @@
- core
-
-
- org.scala-ide.sdt.core.scalabuilder
-
-
-
- org.scala-ide.sdt.core.scalanature
- org.eclipse.jdt.core.javanature
-
-
\ No newline at end of file
+ core
+
+
+
+
+
+ org.scala-ide.sdt.core.scalabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.m2e.core.maven2Nature
+ org.scala-ide.sdt.core.scalanature
+ org.eclipse.jdt.core.javanature
+
+
+
+ 1765562241624
+
+ 30
+
+ org.eclipse.core.resources.regexFilterMatcher
+ node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__
+
+
+
+
diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/HomepageExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/HomepageExtractor.scala
index 2d714ce43e..87e80f3a6d 100644
--- a/core/src/main/scala/org/dbpedia/extraction/mappings/HomepageExtractor.scala
+++ b/core/src/main/scala/org/dbpedia/extraction/mappings/HomepageExtractor.scala
@@ -6,7 +6,7 @@ import org.dbpedia.extraction.transform.Quad
import org.dbpedia.extraction.wikiparser._
import org.dbpedia.extraction.config.mappings.HomepageExtractorConfig
import org.dbpedia.extraction.ontology.Ontology
-import org.dbpedia.extraction.util.Language
+import org.dbpedia.extraction.util.{Language, DataQualityMonitor}
import org.dbpedia.iri.{IRISyntaxException, UriUtils}
import scala.language.reflectiveCalls
@@ -26,6 +26,9 @@ extends PageNodeExtractor
{
private val language = context.language.wikiCode
+ // Extraction quality monitor for logging and metrics
+ private val monitor = DataQualityMonitor.forExtractor("HomepageExtractor")
+
private val propertyNames = HomepageExtractorConfig.propertyNames(language)
private val official = HomepageExtractorConfig.official(language)
@@ -48,7 +51,10 @@ extends PageNodeExtractor
override def extract(page: PageNode, subjectUri: String): Seq[Quad] =
{
- if(page.title.namespace != Namespace.Main) return Seq.empty
+ if(page.title.namespace != Namespace.Main) {
+ monitor.logSkipped(page.title.encoded, s"Not in main namespace: ${page.title.namespace}")
+ return Seq.empty
+ }
val list = collectProperties(page).filter(p => propertyNames.contains(p.key.toLowerCase)).flatMap {
NodeUtil.splitPropertyNode(_, splitPropertyNodeLinkStrict, true)
@@ -118,12 +124,34 @@ extends PageNodeExtractor
{
UriUtils.createURI(url) match{
case Success(u) => UriUtils.cleanLink(u) match{
- case Some(c) => Seq(new Quad(context.language, DBpediaDatasets.Homepages, subjectUri, homepageProperty, c , node.sourceIri))
- case None => Seq()
+ case Some(c) =>
+ monitor.logSuccess(subjectUri, 1)
+ Seq(new Quad(context.language, DBpediaDatasets.Homepages, subjectUri, homepageProperty, c , node.sourceIri))
+ case None =>
+ monitor.logInvalidData(
+ subjectUri,
+ "URL could not be cleaned",
+ data = Some(url)
+ )
+ Seq()
}
case Failure(f) => f match{
- case _ : IRISyntaxException => Seq() // TODO: log
- case _ => Seq()
+ case ex: IRISyntaxException =>
+ monitor.logInvalidData(
+ subjectUri,
+ "Malformed IRI syntax",
+ exception = Some(ex),
+ data = Some(url)
+ )
+ Seq()
+ case ex =>
+ monitor.logInvalidData(
+ subjectUri,
+ "Unexpected error creating URI",
+ exception = Some(ex),
+ data = Some(url)
+ )
+ Seq()
}
}
}
diff --git a/core/src/main/scala/org/dbpedia/extraction/util/DataQualityMonitor.scala b/core/src/main/scala/org/dbpedia/extraction/util/DataQualityMonitor.scala
new file mode 100644
index 0000000000..aa04afb4cd
--- /dev/null
+++ b/core/src/main/scala/org/dbpedia/extraction/util/DataQualityMonitor.scala
@@ -0,0 +1,131 @@
+package org.dbpedia.extraction.util
+
+import java.util.logging.{Level, Logger}
+import java.util.concurrent.atomic.AtomicLong
+import scala.collection.concurrent.TrieMap
+
+/**
+ * Monitors data quality issues during extraction.
+ * Tracks errors per extractor and provides export capabilities.
+ */
+object DataQualityMonitor {
+
+ private val logger = Logger.getLogger(classOf[DataQualityMonitor].getName)
+ private val errorCounts = new TrieMap[String, AtomicLong]()
+ private val errorDetails = new TrieMap[String, collection.mutable.ListBuffer[ExtractionError]]()
+
+ def forExtractor(extractorName: String): DataQualityMonitor = {
+ new DataQualityMonitor(extractorName)
+ }
+
+ def getGlobalMetrics(): Map[String, Long] = {
+ errorCounts.map { case (key, counter) => (key, counter.get()) }.toMap
+ }
+
+ def getErrorDetails(errorType: String, limit: Int = 100): List[ExtractionError] = {
+ errorDetails.get(errorType) match {
+ case Some(errors) => errors.take(limit).toList
+ case None => List.empty
+ }
+ }
+
+ def exportToCsv(errorType: String, limit: Int = 1000): String = {
+ val errors = getErrorDetails(errorType, limit)
+ val header = "Extractor,PageTitle,ErrorMessage,Timestamp\n"
+ val rows = errors.map(e =>
+ s"${e.extractorName},${e.pageTitle},${e.message.replaceAll(",", ";")},${e.timestamp}"
+ ).mkString("\n")
+ header + rows
+ }
+
+ def reset(): Unit = {
+ errorCounts.clear()
+ errorDetails.clear()
+ }
+}
+
+class DataQualityMonitor(val extractorName: String) {
+
+ private val logger = Logger.getLogger(s"org.dbpedia.extraction.monitor.$extractorName")
+
+ def logInvalidData(
+ pageTitle: String,
+ reason: String,
+ exception: Option[Throwable] = None,
+ data: Option[String] = None
+ ): Unit = {
+ val message = buildMessage(pageTitle, reason, data)
+ exception match {
+ case Some(ex) => logger.log(Level.WARNING, message, ex)
+ case None => logger.warning(message)
+ }
+ recordError(pageTitle, reason, exception)
+ }
+
+ def logSkipped(pageTitle: String, reason: String): Unit = {
+ logger.fine(s"[$extractorName] Skipped '$pageTitle': $reason")
+ }
+
+ def logSuccess(pageTitle: String, triplesCount: Int): Unit = {
+ logger.fine(s"[$extractorName] Extracted $triplesCount triples from '$pageTitle'")
+ }
+
+ def getMetrics(): Map[String, Long] = {
+ DataQualityMonitor.errorCounts
+ .filter { case (key, _) => key.startsWith(s"$extractorName:") }
+ .map { case (key, counter) => (key, counter.get()) }
+ .toMap
+ }
+
+ def getTotalErrors(): Long = getMetrics().values.sum
+
+ private def buildMessage(pageTitle: String, reason: String, data: Option[String]): String = {
+ val dataStr = data.map(d => s" | Data: ${truncate(d, 200)}").getOrElse("")
+ s"[$extractorName] Invalid data in '$pageTitle': $reason$dataStr"
+ }
+
+ private def recordError(pageTitle: String, reason: String, exception: Option[Throwable]): Unit = {
+ val errorType = s"$extractorName:${categorizeError(reason, exception)}"
+
+ DataQualityMonitor.errorCounts
+ .getOrElseUpdate(errorType, new AtomicLong(0))
+ .incrementAndGet()
+
+ val errorDetail = ExtractionError(
+ extractorName = extractorName,
+ pageTitle = pageTitle,
+ message = reason,
+ exceptionType = exception.map(_.getClass.getSimpleName),
+ timestamp = System.currentTimeMillis()
+ )
+
+ DataQualityMonitor.errorDetails.synchronized {
+ val buffer = DataQualityMonitor.errorDetails
+ .getOrElseUpdate(errorType, collection.mutable.ListBuffer.empty)
+ if (buffer.size < 10000) buffer += errorDetail
+ }
+ }
+
+ private def categorizeError(reason: String, exception: Option[Throwable]): String = {
+ exception match {
+ case Some(ex) => ex.getClass.getSimpleName
+ case None if reason.toLowerCase.contains("invalid") => "InvalidData"
+ case None if reason.toLowerCase.contains("malformed") => "MalformedData"
+ case None if reason.toLowerCase.contains("missing") => "MissingData"
+ case None => "Other"
+ }
+ }
+
+ private def truncate(str: String, maxLength: Int): String = {
+ if (str.length <= maxLength) str
+ else str.substring(0, maxLength) + "..."
+ }
+}
+
+case class ExtractionError(
+ extractorName: String,
+ pageTitle: String,
+ message: String,
+ exceptionType: Option[String],
+ timestamp: Long
+)
diff --git a/core/src/test/scala/org/dbpedia/extraction/util/DataQualityMonitorTest.scala b/core/src/test/scala/org/dbpedia/extraction/util/DataQualityMonitorTest.scala
new file mode 100644
index 0000000000..ee34953def
--- /dev/null
+++ b/core/src/test/scala/org/dbpedia/extraction/util/DataQualityMonitorTest.scala
@@ -0,0 +1,145 @@
+package org.dbpedia.extraction.util
+
+import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}
+
+class DataQualityMonitorTest extends FlatSpec with Matchers with BeforeAndAfter {
+
+ before {
+ DataQualityMonitor.reset()
+ }
+
+ "DataQualityMonitor" should "create monitor" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ monitor should not be null
+ monitor.extractorName should be("TestExtractor")
+ }
+
+ it should "record errors" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ monitor.logInvalidData(
+ "Albert_Einstein",
+ "Invalid IRI syntax",
+ Some(new IllegalArgumentException("Test exception")),
+ Some("http://malformed url")
+ )
+
+ monitor.getMetrics() should not be empty
+ monitor.getTotalErrors() should be(1)
+ }
+
+ it should "categorize errors" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+
+ monitor.logInvalidData("Page1", "Invalid data", exception = Some(new IllegalArgumentException()))
+ monitor.logInvalidData("Page2", "Malformed URL", exception = None)
+ monitor.logInvalidData("Page3", "Missing property", exception = None)
+
+ monitor.getMetrics().size should be >= 1
+ monitor.getTotalErrors() should be(3)
+ }
+
+ it should "track extractors independently" in {
+ val m1 = DataQualityMonitor.forExtractor("ExtractorA")
+ val m2 = DataQualityMonitor.forExtractor("ExtractorB")
+
+ m1.logInvalidData("Page1", "Error in A")
+ m1.logInvalidData("Page2", "Another error in A")
+ m2.logInvalidData("Page3", "Error in B")
+
+ m1.getTotalErrors() should be(2)
+ m2.getTotalErrors() should be(1)
+ }
+
+ it should "provide global metrics" in {
+ val m1 = DataQualityMonitor.forExtractor("ExtractorA")
+ val m2 = DataQualityMonitor.forExtractor("ExtractorB")
+
+ m1.logInvalidData("Page1", "Error 1")
+ m2.logInvalidData("Page2", "Error 2")
+ m2.logInvalidData("Page3", "Error 3")
+
+ DataQualityMonitor.getGlobalMetrics().values.sum should be(3)
+ }
+
+ it should "retrieve error details" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+
+ monitor.logInvalidData("Einstein", "Invalid IRI", data = Some("malformed://url"))
+ monitor.logInvalidData("Tesla", "Invalid IRI", data = Some("another://bad"))
+
+ val details = DataQualityMonitor.getErrorDetails("TestExtractor:InvalidData", limit = 10)
+ details.size should be(2)
+ details.head.extractorName should be("TestExtractor")
+ }
+
+ it should "export to CSV" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+
+ monitor.logInvalidData("Einstein", "Invalid IRI")
+ monitor.logInvalidData("Tesla", "Malformed URL")
+
+ val csv = DataQualityMonitor.exportToCsv("TestExtractor:InvalidData", limit = 100)
+ csv should include("Extractor,PageTitle,ErrorMessage")
+ csv should include("Einstein")
+ }
+
+ it should "limit stored details" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ (1 to 11000).foreach(i => monitor.logInvalidData(s"Page$i", "Error"))
+
+ monitor.getTotalErrors() should be(11000)
+ val details = DataQualityMonitor.getErrorDetails("TestExtractor:InvalidData", limit = 20000)
+ details.size should be <= 10000
+ }
+
+ it should "log skipped extractions" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ monitor.logSkipped("TestPage", "Not in main namespace")
+ monitor.getTotalErrors() should be(0)
+ }
+
+ it should "log successful extractions" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ monitor.logSuccess("Einstein", 5)
+ monitor.getTotalErrors() should be(0)
+ }
+
+ it should "handle concurrent logging" in {
+ val monitor = DataQualityMonitor.forExtractor("ConcurrentExtractor")
+
+ val threads = (1 to 10).map { i =>
+ new Thread {
+ override def run(): Unit = {
+ for (j <- 1 to 100) {
+ monitor.logInvalidData(s"Page_${i}_$j", s"Error from thread $i")
+ }
+ }
+ }
+ }
+
+ threads.foreach(_.start())
+ threads.foreach(_.join())
+ monitor.getTotalErrors() should be(1000)
+ }
+
+ "Error details" should "include required fields" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+
+ monitor.logInvalidData(
+ "TestPage",
+ "Test error message",
+ Some(new IllegalArgumentException("Test")),
+ Some("test data")
+ )
+
+ val details = DataQualityMonitor.getErrorDetails("TestExtractor:IllegalArgumentException", limit = 1)
+ details should have size 1
+
+ val error = details.head
+ error.extractorName should be("TestExtractor")
+ error.pageTitle should be("TestPage")
+ error.message should be("Test error message")
+ error.exceptionType should be(Some("IllegalArgumentException"))
+ error.timestamp should be > 0L
+ }
+}
diff --git a/documentation/data-quality-monitoring.md b/documentation/data-quality-monitoring.md
new file mode 100644
index 0000000000..aea6e02419
--- /dev/null
+++ b/documentation/data-quality-monitoring.md
@@ -0,0 +1,129 @@
+# Data Quality Monitoring
+
+A utility for tracking extraction errors and data quality issues in DBpedia extractors.
+
+## Why?
+
+Many extractors silently drop invalid data without logging, which makes debugging difficult and leaves no visibility into what is failing. This monitor closes that gap.
+
+## Basic Usage
+
+```scala
+import org.dbpedia.extraction.util.DataQualityMonitor
+
+// Create a monitor for your extractor
+val monitor = DataQualityMonitor.forExtractor("MyExtractor")
+
+// Log an error
+monitor.logInvalidData(
+ pageTitle = "Some_Page",
+ reason = "Malformed URL",
+ exception = Some(ex),
+ data = Some(badUrl)
+)
+
+// Log success (optional)
+monitor.logSuccess(pageTitle, triplesCount)
+
+// Log skipped page (optional)
+monitor.logSkipped(pageTitle, "Not in main namespace")
+```
+
+## Getting Metrics
+
+```scala
+// Metrics for one extractor
+val metrics = monitor.getMetrics()
+// => Map("MyExtractor:IRISyntaxException" -> 42, "MyExtractor:InvalidData" -> 15)
+
+// Total errors
+val total = monitor.getTotalErrors()
+
+// Global metrics (all extractors)
+val all = DataQualityMonitor.getGlobalMetrics()
+```
+
+## Error Details
+
+```scala
+// Get detailed error info
+val errors = DataQualityMonitor.getErrorDetails("MyExtractor:IRISyntaxException", limit = 100)
+
+errors.foreach { e =>
+ println(s"${e.pageTitle}: ${e.message}")
+}
+
+// Export to CSV
+val csv = DataQualityMonitor.exportToCsv("MyExtractor:IRISyntaxException", limit = 1000)
+```
+
+## Web Dashboard
+
+When running the server, access the dashboard at:
+
+```
+http://localhost:9999/server/quality/
+```
+
+Endpoints (relative to the server's base path, e.g. `/server`):
+- `/quality/` - Dashboard with error counts
+- `/quality/metrics` - JSON metrics
+- `/quality/errors?type=X&limit=10` - Error details
+- `/quality/export?type=X` - CSV download
+
+## Example: HomepageExtractor Integration
+
+```scala
+class HomepageExtractor(...) extends PageNodeExtractor {
+
+ private val monitor = DataQualityMonitor.forExtractor("HomepageExtractor")
+
+ override def extract(page: PageNode, subjectUri: String): Seq[Quad] = {
+ if (page.title.namespace != Namespace.Main) {
+ monitor.logSkipped(page.title.encoded, "Wrong namespace")
+ return Seq.empty
+ }
+ // ... extraction logic
+ }
+
+ private def generateStatement(...): Seq[Quad] = {
+ UriUtils.createURI(url) match {
+ case Success(u) =>
+ monitor.logSuccess(subjectUri, 1)
+ Seq(new Quad(...))
+
+ case Failure(ex: IRISyntaxException) =>
+ monitor.logInvalidData(subjectUri, "Bad IRI", Some(ex), Some(url))
+ Seq()
+ }
+ }
+}
+```
+
+## Error Categorization
+
+Errors are auto-categorized based on:
+- Exception type (if provided): `IRISyntaxException`, `RuntimeException`, etc.
+- Message keywords: "invalid" → `InvalidData`, "malformed" → `MalformedData`, "missing" → `MissingData`
+- Default: `Other`
+
+## Notes
+
+- Thread-safe counters (`TrieMap` and `AtomicLong`); error-detail buffers are synchronized on write, while reads are best-effort
+- Memory bounded (max 10K errors per type)
+- Minimal overhead
+- Call `DataQualityMonitor.reset()` to clear metrics (useful for testing)
+
+## Adding to Other Extractors
+
+Find code like this:
+```scala
+case _ : SomeException => Seq() // TODO: log
+```
+
+Replace with:
+```scala
+case ex: SomeException =>
+ monitor.logInvalidData(page, "reason", Some(ex), Some(data))
+ Seq()
+```
\ No newline at end of file
diff --git a/dump/.project b/dump/.project
index cef50fe4d3..48578e6b99 100644
--- a/dump/.project
+++ b/dump/.project
@@ -1,16 +1,36 @@
- dump
-
- core
-
-
-
- org.scala-ide.sdt.core.scalabuilder
-
-
-
- org.scala-ide.sdt.core.scalanature
- org.eclipse.jdt.core.javanature
-
-
\ No newline at end of file
+ dump
+
+
+ core
+
+
+
+ org.scala-ide.sdt.core.scalabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.m2e.core.maven2Nature
+ org.scala-ide.sdt.core.scalanature
+ org.eclipse.jdt.core.javanature
+
+
+
+ 1765562241631
+
+ 30
+
+ org.eclipse.core.resources.regexFilterMatcher
+ node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__
+
+
+
+
diff --git a/scripts/.project b/scripts/.project
index 3698e53539..edd63b409a 100644
--- a/scripts/.project
+++ b/scripts/.project
@@ -1,16 +1,36 @@
- scripts
-
- core
-
-
-
- org.scala-ide.sdt.core.scalabuilder
-
-
-
- org.scala-ide.sdt.core.scalanature
- org.eclipse.jdt.core.javanature
-
-
\ No newline at end of file
+ scripts
+
+
+ core
+
+
+
+ org.scala-ide.sdt.core.scalabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.m2e.core.maven2Nature
+ org.scala-ide.sdt.core.scalanature
+ org.eclipse.jdt.core.javanature
+
+
+
+ 1765562241638
+
+ 30
+
+ org.eclipse.core.resources.regexFilterMatcher
+ node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__
+
+
+
+
diff --git a/server/.project b/server/.project
index f845f76f39..de69b57a70 100644
--- a/server/.project
+++ b/server/.project
@@ -1,16 +1,36 @@
- server
-
- core
-
-
-
- org.scala-ide.sdt.core.scalabuilder
-
-
-
- org.scala-ide.sdt.core.scalanature
- org.eclipse.jdt.core.javanature
-
-
\ No newline at end of file
+ server
+
+
+ core
+
+
+
+ org.scala-ide.sdt.core.scalabuilder
+
+
+
+
+ org.eclipse.m2e.core.maven2Builder
+
+
+
+
+
+ org.eclipse.m2e.core.maven2Nature
+ org.scala-ide.sdt.core.scalanature
+ org.eclipse.jdt.core.javanature
+
+
+
+ 1765562241656
+
+ 30
+
+ org.eclipse.core.resources.regexFilterMatcher
+ node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__
+
+
+
+
diff --git a/server/src/main/scala/org/dbpedia/extraction/server/resources/DataQuality.scala b/server/src/main/scala/org/dbpedia/extraction/server/resources/DataQuality.scala
new file mode 100644
index 0000000000..fc467778e7
--- /dev/null
+++ b/server/src/main/scala/org/dbpedia/extraction/server/resources/DataQuality.scala
@@ -0,0 +1,219 @@
+package org.dbpedia.extraction.server.resources
+
+import javax.ws.rs.{GET, Path, PathParam, Produces, QueryParam}
+import org.dbpedia.extraction.util.DataQualityMonitor
+import scala.xml.Elem
+
+/**
+ * REST resource for viewing Data Quality Monitor metrics
+ *
+ * Endpoints:
+ * - /quality/ - Overview of all metrics
+ * - /quality/metrics - JSON format metrics
+ * - /quality/errors/{errorType} - Detailed errors for specific type
+ * - /quality/export/{errorType} - CSV export
+ */
+@Path("/quality/")
+class DataQuality {
+
+ /**
+ * Main page showing all quality metrics
+ */
+ @GET
+ @Produces(Array("application/xhtml+xml"))
+ def get: Elem = {
+ val globalMetrics = DataQualityMonitor.getGlobalMetrics()
+
+ // Group metrics by extractor
+ val metricsByExtractor = globalMetrics.groupBy { case (key, _) =>
+ key.split(":").headOption.getOrElse("Unknown")
+ }
+
+
+ {ServerHeader.getHeader("Data Quality Monitoring", true)}
+
+
+
Data Quality Monitoring
+
Real-time extraction quality metrics and error tracking
+
+
+
+
Summary
+
+
+
Total Error Types: {globalMetrics.size}
+
Total Errors: {globalMetrics.values.sum}
+
Extractors Monitored: {metricsByExtractor.size}
+
+
+
+ {
+ if (metricsByExtractor.isEmpty) {
+
+ No errors logged yet. The monitor is running but hasn't recorded any errors.
+ Extract some pages to see metrics appear here.
+
+ } else {
+ for ((extractor, metrics) <- metricsByExtractor.toSeq.sortBy(_._1)) yield {
+ val totalErrors = metrics.values.sum
+
+
+
{extractor}
+
+
+
Total Errors: {totalErrors}
+
+
+
+ | Error Type |
+ Count |
+ Actions |
+
+
+
+ {
+ for ((errorType, count) <- metrics.toSeq.sortBy(-_._2)) yield {
+
+ | {errorType.split(":").lastOption.getOrElse(errorType)} |
+ {count} |
+
+ View Details
+ {" "}
+ Export CSV
+ |
+
+ }
+ }
+
+
+
+
+ }
+ }
+ }
+
+
+
+
API Endpoints
+
+
+
+ GET /quality/ - This page
+ GET /quality/metrics - JSON format metrics
+ GET /quality/errors?type=ErrorType&limit=10 - Error details
+ GET /quality/export?type=ErrorType&limit=1000 - CSV export
+
+
+
+
+
+
+ }
+
+ /**
+ * Get metrics in JSON format
+ */
+ @GET
+ @Path("metrics")
+ @Produces(Array("application/json"))
+ def getMetricsJson: String = {
+ val metrics = DataQualityMonitor.getGlobalMetrics()
+
+ val json = new StringBuilder()
+ json.append("{\n")
+ json.append(s""" "totalErrors": ${metrics.values.sum},\n""")
+ json.append(s""" "errorTypes": ${metrics.size},\n""")
+ json.append(""" "metrics": {""").append("\n")
+
+ val entries = metrics.toSeq
+ entries.zipWithIndex.foreach { case ((errorType, count), idx) =>
+ json.append(s""" "$errorType": $count""")
+ if (idx < entries.size - 1) json.append(",")
+ json.append("\n")
+ }
+
+ json.append(" }\n")
+ json.append("}")
+ json.toString()
+ }
+
+ /**
+ * Get detailed errors for a specific error type
+ */
+ @GET
+ @Path("errors")
+ @Produces(Array("application/xhtml+xml"))
+ def getErrors(
+ @QueryParam("type") errorType: String,
+ @QueryParam("limit") limitParam: String
+ ): Elem = {
+ val limit = Option(limitParam).map(_.toInt).getOrElse(50)
+ val errors = DataQualityMonitor.getErrorDetails(errorType, limit)
+
+
+ {ServerHeader.getHeader(s"Error Details: $errorType", true)}
+
+
+
Error Details
+
{errorType}
+
+
+ ← Back to Overview
+ {" "}
+ Export to CSV
+
+
+ {
+ if (errors.isEmpty) {
+
+ No errors found for type: {errorType}
+
+ } else {
+
+
Showing {errors.size} most recent errors
+
+
+
+ | Page Title |
+ Error Message |
+ Exception |
+ Timestamp |
+
+
+
+ {
+ for (error <- errors) yield {
+ val timestamp = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss")
+ .format(new java.util.Date(error.timestamp))
+
+ {error.pageTitle} |
+ {error.message} |
+ {error.exceptionType.getOrElse("-")} |
+ {timestamp} |
+
+ }
+ }
+
+
+
+ }
+ }
+
+
+
+ }
+
+ /**
+ * Export errors to CSV format
+ */
+ @GET
+ @Path("export")
+ @Produces(Array("text/csv"))
+ def exportCsv(
+ @QueryParam("type") errorType: String,
+ @QueryParam("limit") limitParam: String
+ ): String = {
+ val limit = Option(limitParam).map(_.toInt).getOrElse(1000)
+ DataQualityMonitor.exportToCsv(errorType, limit)
+ }
+}
diff --git a/server/src/main/scala/org/dbpedia/extraction/server/resources/Root.scala b/server/src/main/scala/org/dbpedia/extraction/server/resources/Root.scala
index 649c5d7435..6dab185d81 100644
--- a/server/src/main/scala/org/dbpedia/extraction/server/resources/Root.scala
+++ b/server/src/main/scala/org/dbpedia/extraction/server/resources/Root.scala
@@ -25,6 +25,7 @@ class Root
Statistics -
Mappings -
Extractors -
+ Data Quality -
Missing labels
diff --git a/server/src/test/scala/org/dbpedia/extraction/server/resources/DataQualityTest.scala b/server/src/test/scala/org/dbpedia/extraction/server/resources/DataQualityTest.scala
new file mode 100644
index 0000000000..4a72dbabc2
--- /dev/null
+++ b/server/src/test/scala/org/dbpedia/extraction/server/resources/DataQualityTest.scala
@@ -0,0 +1,161 @@
+package org.dbpedia.extraction.server.resources
+
+import org.dbpedia.extraction.util.DataQualityMonitor
+import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers}
+
+class DataQualityTest extends FlatSpec with Matchers with BeforeAndAfter {
+
+ val dataQuality = new DataQuality()
+
+ before {
+ DataQualityMonitor.reset()
+ }
+
+ "DataQuality resource" should "show empty state" in {
+ val html = dataQuality.get
+
+ (html \\ "html").nonEmpty should be(true)
+ html.toString should include("No errors logged yet")
+ }
+
+ it should "display error summary" in {
+ val m1 = DataQualityMonitor.forExtractor("HomepageExtractor")
+ val m2 = DataQualityMonitor.forExtractor("AbstractExtractor")
+
+ m1.logInvalidData("Albert_Einstein", "Invalid IRI: malformed URL")
+ m1.logInvalidData("Nikola_Tesla", "Invalid IRI: malformed URL")
+ m2.logInvalidData("Isaac_Newton", "Missing property")
+
+ val html = dataQuality.get.toString
+ html should include("Total Error Types:")
+ html should include("HomepageExtractor")
+ html should include("AbstractExtractor")
+ }
+
+ it should "group by extractor" in {
+ DataQualityMonitor.forExtractor("ExtractorA").logInvalidData("Page1", "Error 1")
+ DataQualityMonitor.forExtractor("ExtractorB").logInvalidData("Page2", "Error 2")
+
+ val html = dataQuality.get.toString
+ html should include("ExtractorA")
+ html should include("ExtractorB")
+ }
+
+ "Metrics endpoint" should "return JSON" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ monitor.logInvalidData("Page1", "Invalid data")
+
+ val json = dataQuality.getMetricsJson
+ json should include("totalErrors")
+ json should include("TestExtractor")
+ json.trim should startWith("{")
+ }
+
+ it should "count errors correctly" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ (1 to 5).foreach(i => monitor.logInvalidData(s"Page$i", "Test error"))
+
+ dataQuality.getMetricsJson should include("\"totalErrors\": 5")
+ }
+
+ "Error details" should "show error table" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ monitor.logInvalidData("Albert_Einstein", "Invalid IRI syntax")
+
+ val html = dataQuality.getErrors("TestExtractor:InvalidData", "10").toString
+ html should include(" monitor.logInvalidData(s"Page$i", "Invalid data error"))
+
+ val html = dataQuality.getErrors("TestExtractor:InvalidData", "10").toString
+ html should include("Showing")
+ }
+
+ "CSV export" should "export errors" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ monitor.logInvalidData("Einstein", "Invalid IRI")
+
+ val csv = dataQuality.exportCsv("TestExtractor:InvalidData", "100")
+ csv should include("Extractor,PageTitle,ErrorMessage")
+ csv should include("Einstein")
+ }
+
+ it should "escape commas" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ monitor.logInvalidData("Page1", "Invalid data with, comma, in message")
+
+ val csv = dataQuality.exportCsv("TestExtractor:InvalidData", "100")
+ csv should include("Invalid data with; comma; in message")
+ }
+
+ it should "respect export limit" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+ (1 to 200).foreach(i => monitor.logInvalidData(s"Page$i", "Test error"))
+
+ val csv = dataQuality.exportCsv("TestExtractor:InvalidData", "50")
+ csv.split("\n").length should be <= 51
+ }
+
+ "Integration" should "work with multiple extractors" in {
+ val homepage = DataQualityMonitor.forExtractor("HomepageExtractor")
+ val abstracts = DataQualityMonitor.forExtractor("AbstractExtractor")
+ val infobox = DataQualityMonitor.forExtractor("InfoboxExtractor")
+
+ homepage.logInvalidData("Einstein", "Invalid homepage URL")
+ homepage.logInvalidData("Tesla", "Invalid homepage IRI")
+ abstracts.logInvalidData("Newton", "Missing abstract")
+ infobox.logInvalidData("Curie", "Invalid property value")
+ infobox.logInvalidData("Darwin", "Invalid property value")
+
+ val page = dataQuality.get.toString
+ page should include("HomepageExtractor")
+ page should include("AbstractExtractor")
+
+ dataQuality.getMetricsJson should include("\"totalErrors\": 5")
+
+ val csv = dataQuality.exportCsv("HomepageExtractor:InvalidData", "100")
+ csv should include("Einstein")
+ csv should include("Tesla")
+ }
+
+ it should "handle concurrent access" in {
+ val monitor = DataQualityMonitor.forExtractor("ConcurrentTest")
+
+ val threads = (1 to 5).map { i =>
+ new Thread {
+ override def run(): Unit = {
+ for (j <- 1 to 20) {
+ monitor.logInvalidData(s"Page_${i}_$j", s"Error from thread $i")
+ if (j % 5 == 0) dataQuality.getMetricsJson
+ }
+ }
+ }
+ }
+
+ threads.foreach(_.start())
+ threads.foreach(_.join())
+
+ dataQuality.getMetricsJson should include("\"totalErrors\": 100")
+ }
+
+ it should "accumulate errors" in {
+ val monitor = DataQualityMonitor.forExtractor("TestExtractor")
+
+ monitor.logInvalidData("Page1", "Error 1")
+ dataQuality.getMetricsJson should include("\"totalErrors\": 1")
+
+ monitor.logInvalidData("Page2", "Error 2")
+ monitor.logInvalidData("Page3", "Error 3")
+ dataQuality.getMetricsJson should include("\"totalErrors\": 3")
+ }
+}