diff --git a/.DS_Store b/.DS_Store new file mode 100644 index 0000000000..50cdeb7d03 Binary files /dev/null and b/.DS_Store differ diff --git a/.gitignore b/.gitignore index 0733ddae0a..beedddcb6f 100644 --- a/.gitignore +++ b/.gitignore @@ -6,4 +6,4 @@ target/ *.lck *.tmp java_pid* -dump/test-basedir +dump/test-basedir \ No newline at end of file diff --git a/core/.project b/core/.project index a993a4c099..595e9e19b4 100644 --- a/core/.project +++ b/core/.project @@ -1,13 +1,35 @@ - core - - - org.scala-ide.sdt.core.scalabuilder - - - - org.scala-ide.sdt.core.scalanature - org.eclipse.jdt.core.javanature - - \ No newline at end of file + core + + + + + + org.scala-ide.sdt.core.scalabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.m2e.core.maven2Nature + org.scala-ide.sdt.core.scalanature + org.eclipse.jdt.core.javanature + + + + 1765562241624 + + 30 + + org.eclipse.core.resources.regexFilterMatcher + node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__ + + + + diff --git a/core/src/main/scala/org/dbpedia/extraction/mappings/HomepageExtractor.scala b/core/src/main/scala/org/dbpedia/extraction/mappings/HomepageExtractor.scala index 2d714ce43e..87e80f3a6d 100644 --- a/core/src/main/scala/org/dbpedia/extraction/mappings/HomepageExtractor.scala +++ b/core/src/main/scala/org/dbpedia/extraction/mappings/HomepageExtractor.scala @@ -6,7 +6,7 @@ import org.dbpedia.extraction.transform.Quad import org.dbpedia.extraction.wikiparser._ import org.dbpedia.extraction.config.mappings.HomepageExtractorConfig import org.dbpedia.extraction.ontology.Ontology -import org.dbpedia.extraction.util.Language +import org.dbpedia.extraction.util.{Language, DataQualityMonitor} import org.dbpedia.iri.{IRISyntaxException, UriUtils} import scala.language.reflectiveCalls @@ -26,6 +26,9 @@ extends PageNodeExtractor { private val language = context.language.wikiCode + // Extraction quality monitor for logging and metrics + private val monitor = DataQualityMonitor.forExtractor("HomepageExtractor") + private val propertyNames = HomepageExtractorConfig.propertyNames(language) private val official = HomepageExtractorConfig.official(language) @@ -48,7 +51,10 @@ extends PageNodeExtractor override def extract(page: PageNode, subjectUri: String): Seq[Quad] = { - if(page.title.namespace != Namespace.Main) return Seq.empty + if(page.title.namespace != Namespace.Main) { + monitor.logSkipped(page.title.encoded, s"Not in main namespace: ${page.title.namespace}") + return Seq.empty + } val list = collectProperties(page).filter(p => propertyNames.contains(p.key.toLowerCase)).flatMap { NodeUtil.splitPropertyNode(_, splitPropertyNodeLinkStrict, true) @@ -118,12 +124,34 @@ extends PageNodeExtractor { UriUtils.createURI(url) match{ case Success(u) => UriUtils.cleanLink(u) match{ - case Some(c) => Seq(new Quad(context.language, DBpediaDatasets.Homepages, subjectUri, homepageProperty, c , node.sourceIri)) - case None => Seq() + case Some(c) => + monitor.logSuccess(subjectUri, 1) + Seq(new Quad(context.language, DBpediaDatasets.Homepages, subjectUri, homepageProperty, c , node.sourceIri)) + case None => + monitor.logInvalidData( + subjectUri, + "URL could not be cleaned", + data = Some(url) + ) + Seq() } case Failure(f) => f match{ - case _ : IRISyntaxException => Seq() // TODO: log - case _ => Seq() + case ex: IRISyntaxException => + monitor.logInvalidData( + subjectUri, + "Malformed IRI syntax", + exception = Some(ex), + data = Some(url) + ) + Seq() + case ex => + monitor.logInvalidData( + subjectUri, + "Unexpected error creating URI", + exception = Some(ex), + data = Some(url) + ) + Seq() } } } diff --git a/core/src/main/scala/org/dbpedia/extraction/util/DataQualityMonitor.scala b/core/src/main/scala/org/dbpedia/extraction/util/DataQualityMonitor.scala new file mode 100644 index 0000000000..aa04afb4cd --- /dev/null +++ b/core/src/main/scala/org/dbpedia/extraction/util/DataQualityMonitor.scala @@ -0,0 +1,131 @@ +package org.dbpedia.extraction.util + +import java.util.logging.{Level, Logger} +import java.util.concurrent.atomic.AtomicLong +import scala.collection.concurrent.TrieMap + +/** + * Monitors data quality issues during extraction. + * Tracks errors per extractor and provides export capabilities. + */ +object DataQualityMonitor { + + private val logger = Logger.getLogger(classOf[DataQualityMonitor].getName) + private val errorCounts = new TrieMap[String, AtomicLong]() + private val errorDetails = new TrieMap[String, collection.mutable.ListBuffer[ExtractionError]]() + + def forExtractor(extractorName: String): DataQualityMonitor = { + new DataQualityMonitor(extractorName) + } + + def getGlobalMetrics(): Map[String, Long] = { + errorCounts.map { case (key, counter) => (key, counter.get()) }.toMap + } + + def getErrorDetails(errorType: String, limit: Int = 100): List[ExtractionError] = { + errorDetails.get(errorType) match { + case Some(errors) => errors.take(limit).toList + case None => List.empty + } + } + + def exportToCsv(errorType: String, limit: Int = 1000): String = { + val errors = getErrorDetails(errorType, limit) + val header = "Extractor,PageTitle,ErrorMessage,Timestamp\n" + val rows = errors.map(e => + s"${e.extractorName},${e.pageTitle},${e.message.replaceAll(",", ";")},${e.timestamp}" + ).mkString("\n") + header + rows + } + + def reset(): Unit = { + errorCounts.clear() + errorDetails.clear() + } +} + +class DataQualityMonitor(val extractorName: String) { + + private val logger = Logger.getLogger(s"org.dbpedia.extraction.monitor.$extractorName") + + def logInvalidData( + pageTitle: String, + reason: String, + exception: Option[Throwable] = None, + data: Option[String] = None + ): Unit = { + val message = buildMessage(pageTitle, reason, data) + exception match { + case Some(ex) => logger.log(Level.WARNING, message, ex) + case None => logger.warning(message) + } + recordError(pageTitle, reason, exception) + } + + def logSkipped(pageTitle: String, reason: String): Unit = { + logger.fine(s"[$extractorName] Skipped '$pageTitle': $reason") + } + + def logSuccess(pageTitle: String, triplesCount: Int): Unit = { + logger.fine(s"[$extractorName] Extracted $triplesCount triples from '$pageTitle'") + } + + def getMetrics(): Map[String, Long] = { + DataQualityMonitor.errorCounts + .filter { case (key, _) => key.startsWith(s"$extractorName:") } + .map { case (key, counter) => (key, counter.get()) } + .toMap + } + + def getTotalErrors(): Long = getMetrics().values.sum + + private def buildMessage(pageTitle: String, reason: String, data: Option[String]): String = { + val dataStr = data.map(d => s" | Data: ${truncate(d, 200)}").getOrElse("") + s"[$extractorName] Invalid data in '$pageTitle': $reason$dataStr" + } + + private def recordError(pageTitle: String, reason: String, exception: Option[Throwable]): Unit = { + val errorType = s"$extractorName:${categorizeError(reason, exception)}" + + DataQualityMonitor.errorCounts + .getOrElseUpdate(errorType, new AtomicLong(0)) + .incrementAndGet() + + val errorDetail = ExtractionError( + extractorName = extractorName, + pageTitle = pageTitle, + message = reason, + exceptionType = exception.map(_.getClass.getSimpleName), + timestamp = System.currentTimeMillis() + ) + + DataQualityMonitor.errorDetails.synchronized { + val buffer = DataQualityMonitor.errorDetails + .getOrElseUpdate(errorType, collection.mutable.ListBuffer.empty) + if (buffer.size < 10000) buffer += errorDetail + } + } + + private def categorizeError(reason: String, exception: Option[Throwable]): String = { + exception match { + case Some(ex) => ex.getClass.getSimpleName + case None if reason.toLowerCase.contains("invalid") => "InvalidData" + case None if reason.toLowerCase.contains("malformed") => "MalformedData" + case None if reason.toLowerCase.contains("missing") => "MissingData" + case None => "Other" + } + } + + private def truncate(str: String, maxLength: Int): String = { + if (str.length <= maxLength) str + else str.substring(0, maxLength) + "..." + } +} + +case class ExtractionError( + extractorName: String, + pageTitle: String, + message: String, + exceptionType: Option[String], + timestamp: Long +) diff --git a/core/src/test/scala/org/dbpedia/extraction/util/DataQualityMonitorTest.scala b/core/src/test/scala/org/dbpedia/extraction/util/DataQualityMonitorTest.scala new file mode 100644 index 0000000000..ee34953def --- /dev/null +++ b/core/src/test/scala/org/dbpedia/extraction/util/DataQualityMonitorTest.scala @@ -0,0 +1,145 @@ +package org.dbpedia.extraction.util + +import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} + +class DataQualityMonitorTest extends FlatSpec with Matchers with BeforeAndAfter { + + before { + DataQualityMonitor.reset() + } + + "DataQualityMonitor" should "create monitor" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + monitor should not be null + monitor.extractorName should be("TestExtractor") + } + + it should "record errors" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + monitor.logInvalidData( + "Albert_Einstein", + "Invalid IRI syntax", + Some(new IllegalArgumentException("Test exception")), + Some("http://malformed url") + ) + + monitor.getMetrics() should not be empty + monitor.getTotalErrors() should be(1) + } + + it should "categorize errors" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + + monitor.logInvalidData("Page1", "Invalid data", exception = Some(new IllegalArgumentException())) + monitor.logInvalidData("Page2", "Malformed URL", exception = None) + monitor.logInvalidData("Page3", "Missing property", exception = None) + + monitor.getMetrics().size should be >= 1 + monitor.getTotalErrors() should be(3) + } + + it should "track extractors independently" in { + val m1 = DataQualityMonitor.forExtractor("ExtractorA") + val m2 = DataQualityMonitor.forExtractor("ExtractorB") + + m1.logInvalidData("Page1", "Error in A") + m1.logInvalidData("Page2", "Another error in A") + m2.logInvalidData("Page3", "Error in B") + + m1.getTotalErrors() should be(2) + m2.getTotalErrors() should be(1) + } + + it should "provide global metrics" in { + val m1 = DataQualityMonitor.forExtractor("ExtractorA") + val m2 = DataQualityMonitor.forExtractor("ExtractorB") + + m1.logInvalidData("Page1", "Error 1") + m2.logInvalidData("Page2", "Error 2") + m2.logInvalidData("Page3", "Error 3") + + DataQualityMonitor.getGlobalMetrics().values.sum should be(3) + } + + it should "retrieve error details" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + + monitor.logInvalidData("Einstein", "Invalid IRI", data = Some("malformed://url")) + monitor.logInvalidData("Tesla", "Invalid IRI", data = Some("another://bad")) + + val details = DataQualityMonitor.getErrorDetails("TestExtractor:InvalidData", limit = 10) + details.size should be(2) + details.head.extractorName should be("TestExtractor") + } + + it should "export to CSV" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + + monitor.logInvalidData("Einstein", "Invalid IRI") + monitor.logInvalidData("Tesla", "Malformed URL") + + val csv = DataQualityMonitor.exportToCsv("TestExtractor:InvalidData", limit = 100) + csv should include("Extractor,PageTitle,ErrorMessage") + csv should include("Einstein") + } + + it should "limit stored details" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + (1 to 11000).foreach(i => monitor.logInvalidData(s"Page$i", "Error")) + + monitor.getTotalErrors() should be(11000) + val details = DataQualityMonitor.getErrorDetails("TestExtractor:InvalidData", limit = 20000) + details.size should be <= 10000 + } + + it should "log skipped extractions" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + monitor.logSkipped("TestPage", "Not in main namespace") + monitor.getTotalErrors() should be(0) + } + + it should "log successful extractions" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + monitor.logSuccess("Einstein", 5) + monitor.getTotalErrors() should be(0) + } + + it should "handle concurrent logging" in { + val monitor = DataQualityMonitor.forExtractor("ConcurrentExtractor") + + val threads = (1 to 10).map { i => + new Thread { + override def run(): Unit = { + for (j <- 1 to 100) { + monitor.logInvalidData(s"Page_${i}_$j", s"Error from thread $i") + } + } + } + } + + threads.foreach(_.start()) + threads.foreach(_.join()) + monitor.getTotalErrors() should be(1000) + } + + "Error details" should "include required fields" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + + monitor.logInvalidData( + "TestPage", + "Test error message", + Some(new IllegalArgumentException("Test")), + Some("test data") + ) + + val details = DataQualityMonitor.getErrorDetails("TestExtractor:IllegalArgumentException", limit = 1) + details should have size 1 + + val error = details.head + error.extractorName should be("TestExtractor") + error.pageTitle should be("TestPage") + error.message should be("Test error message") + error.exceptionType should be(Some("IllegalArgumentException")) + error.timestamp should be > 0L + } +} diff --git a/documentation/data-quality-monitoring.md b/documentation/data-quality-monitoring.md new file mode 100644 index 0000000000..aea6e02419 --- /dev/null +++ b/documentation/data-quality-monitoring.md @@ -0,0 +1,129 @@ +# Data Quality Monitoring + +A utility for tracking extraction errors and data quality issues in DBpedia extractors. + +## Why? + +Many extractors silently drop invalid data without logging. This makes debugging hard and we have no visibility into what's failing. This monitor fixes that. + +## Basic Usage + +```scala +import org.dbpedia.extraction.util.DataQualityMonitor + +// Create a monitor for your extractor +val monitor = DataQualityMonitor.forExtractor("MyExtractor") + +// Log an error +monitor.logInvalidData( + pageTitle = "Some_Page", + reason = "Malformed URL", + exception = Some(ex), + data = Some(badUrl) +) + +// Log success (optional) +monitor.logSuccess(pageTitle, triplesCount) + +// Log skipped page (optional) +monitor.logSkipped(pageTitle, "Not in main namespace") +``` + +## Getting Metrics + +```scala +// Metrics for one extractor +val metrics = monitor.getMetrics() +// => Map("MyExtractor:IRISyntaxException" -> 42, "MyExtractor:InvalidData" -> 15) + +// Total errors +val total = monitor.getTotalErrors() + +// Global metrics (all extractors) +val all = DataQualityMonitor.getGlobalMetrics() +``` + +## Error Details + +```scala +// Get detailed error info +val errors = DataQualityMonitor.getErrorDetails("MyExtractor:IRISyntaxException", limit = 100) + +errors.foreach { e => + println(s"${e.pageTitle}: ${e.message}") +} + +// Export to CSV +val csv = DataQualityMonitor.exportToCsv("MyExtractor:IRISyntaxException", limit = 1000) +``` + +## Web Dashboard + +When running the server, access the dashboard at: + +``` +http://localhost:9999/server/quality/ +``` + +Endpoints: +- `/quality/` - Dashboard with error counts +- `/quality/metrics` - JSON metrics +- `/quality/errors?type=X&limit=10` - Error details +- `/quality/export?type=X` - CSV download + +## Example: HomepageExtractor Integration + +```scala +class HomepageExtractor(...) extends PageNodeExtractor { + + private val monitor = DataQualityMonitor.forExtractor("HomepageExtractor") + + override def extract(page: PageNode, subjectUri: String): Seq[Quad] = { + if (page.title.namespace != Namespace.Main) { + monitor.logSkipped(page.title.encoded, "Wrong namespace") + return Seq.empty + } + // ... extraction logic + } + + private def generateStatement(...): Seq[Quad] = { + UriUtils.createURI(url) match { + case Success(u) => + monitor.logSuccess(subjectUri, 1) + Seq(new Quad(...)) + + case Failure(ex: IRISyntaxException) => + monitor.logInvalidData(subjectUri, "Bad IRI", Some(ex), Some(url)) + Seq() + } + } +} +``` + +## Error Categorization + +Errors are auto-categorized based on: +- Exception type (if provided): `IRISyntaxException`, `RuntimeException`, etc. +- Message keywords: "invalid" → `InvalidData`, "malformed" → `MalformedData`, "missing" → `MissingData` +- Default: `Other` + +## Notes + +- Thread-safe (uses `TrieMap` and `AtomicLong`) +- Memory bounded (max 10K errors per type) +- Minimal overhead +- Call `DataQualityMonitor.reset()` to clear metrics (useful for testing) + +## Adding to Other Extractors + +Find code like this: +```scala +case _ : SomeException => Seq() // TODO: log +``` + +Replace with: +```scala +case ex: SomeException => + monitor.logInvalidData(page, "reason", Some(ex), Some(data)) + Seq() +``` \ No newline at end of file diff --git a/dump/.project b/dump/.project index cef50fe4d3..48578e6b99 100644 --- a/dump/.project +++ b/dump/.project @@ -1,16 +1,36 @@ - dump - - core - - - - org.scala-ide.sdt.core.scalabuilder - - - - org.scala-ide.sdt.core.scalanature - org.eclipse.jdt.core.javanature - - \ No newline at end of file + dump + + + core + + + + org.scala-ide.sdt.core.scalabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.m2e.core.maven2Nature + org.scala-ide.sdt.core.scalanature + org.eclipse.jdt.core.javanature + + + + 1765562241631 + + 30 + + org.eclipse.core.resources.regexFilterMatcher + node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__ + + + + diff --git a/scripts/.project b/scripts/.project index 3698e53539..edd63b409a 100644 --- a/scripts/.project +++ b/scripts/.project @@ -1,16 +1,36 @@ - scripts - - core - - - - org.scala-ide.sdt.core.scalabuilder - - - - org.scala-ide.sdt.core.scalanature - org.eclipse.jdt.core.javanature - - \ No newline at end of file + scripts + + + core + + + + org.scala-ide.sdt.core.scalabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.m2e.core.maven2Nature + org.scala-ide.sdt.core.scalanature + org.eclipse.jdt.core.javanature + + + + 1765562241638 + + 30 + + org.eclipse.core.resources.regexFilterMatcher + node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__ + + + + diff --git a/server/.project b/server/.project index f845f76f39..de69b57a70 100644 --- a/server/.project +++ b/server/.project @@ -1,16 +1,36 @@ - server - - core - - - - org.scala-ide.sdt.core.scalabuilder - - - - org.scala-ide.sdt.core.scalanature - org.eclipse.jdt.core.javanature - - \ No newline at end of file + server + + + core + + + + org.scala-ide.sdt.core.scalabuilder + + + + + org.eclipse.m2e.core.maven2Builder + + + + + + org.eclipse.m2e.core.maven2Nature + org.scala-ide.sdt.core.scalanature + org.eclipse.jdt.core.javanature + + + + 1765562241656 + + 30 + + org.eclipse.core.resources.regexFilterMatcher + node_modules|\.git|__CREATED_BY_JAVA_LANGUAGE_SERVER__ + + + + diff --git a/server/src/main/scala/org/dbpedia/extraction/server/resources/DataQuality.scala b/server/src/main/scala/org/dbpedia/extraction/server/resources/DataQuality.scala new file mode 100644 index 0000000000..fc467778e7 --- /dev/null +++ b/server/src/main/scala/org/dbpedia/extraction/server/resources/DataQuality.scala @@ -0,0 +1,219 @@ +package org.dbpedia.extraction.server.resources + +import javax.ws.rs.{GET, Path, PathParam, Produces, QueryParam} +import org.dbpedia.extraction.util.DataQualityMonitor +import scala.xml.Elem + +/** + * REST resource for viewing Data Quality Monitor metrics + * + * Endpoints: + * - /quality/ - Overview of all metrics + * - /quality/metrics - JSON format metrics + * - /quality/errors/{errorType} - Detailed errors for specific type + * - /quality/export/{errorType} - CSV export + */ +@Path("/quality/") +class DataQuality { + + /** + * Main page showing all quality metrics + */ + @GET + @Produces(Array("application/xhtml+xml")) + def get: Elem = { + val globalMetrics = DataQualityMonitor.getGlobalMetrics() + + // Group metrics by extractor + val metricsByExtractor = globalMetrics.groupBy { case (key, _) => + key.split(":").headOption.getOrElse("Unknown") + } + + + {ServerHeader.getHeader("Data Quality Monitoring", true)} + +
+

Data Quality Monitoring

+

Real-time extraction quality metrics and error tracking

+ +
+
+

Summary

+
+
+

Total Error Types: {globalMetrics.size}

+

Total Errors: {globalMetrics.values.sum}

+

Extractors Monitored: {metricsByExtractor.size}

+
+
+ + { + if (metricsByExtractor.isEmpty) { +
+ No errors logged yet. The monitor is running but hasn't recorded any errors. + Extract some pages to see metrics appear here. +
+ } else { + for ((extractor, metrics) <- metricsByExtractor.toSeq.sortBy(_._1)) yield { + val totalErrors = metrics.values.sum +
+
+

{extractor}

+
+
+

Total Errors: {totalErrors}

+ + + + + + + + + + { + for ((errorType, count) <- metrics.toSeq.sortBy(-_._2)) yield { + + + + + + } + } + +
Error TypeCountActions
{errorType.split(":").lastOption.getOrElse(errorType)}{count} + View Details + {" "} + Export CSV +
+
+
+ } + } + } + +
+
+

API Endpoints

+
+
+
    +
  • GET /quality/ - This page
  • +
  • GET /quality/metrics - JSON format metrics
  • +
  • GET /quality/errors?type=ErrorType&limit=10 - Error details
  • +
  • GET /quality/export?type=ErrorType&limit=1000 - CSV export
  • +
+
+
+
+ + + } + + /** + * Get metrics in JSON format + */ + @GET + @Path("metrics") + @Produces(Array("application/json")) + def getMetricsJson: String = { + val metrics = DataQualityMonitor.getGlobalMetrics() + + val json = new StringBuilder() + json.append("{\n") + json.append(s""" "totalErrors": ${metrics.values.sum},\n""") + json.append(s""" "errorTypes": ${metrics.size},\n""") + json.append(""" "metrics": {""").append("\n") + + val entries = metrics.toSeq + entries.zipWithIndex.foreach { case ((errorType, count), idx) => + json.append(s""" "$errorType": $count""") + if (idx < entries.size - 1) json.append(",") + json.append("\n") + } + + json.append(" }\n") + json.append("}") + json.toString() + } + + /** + * Get detailed errors for a specific error type + */ + @GET + @Path("errors") + @Produces(Array("application/xhtml+xml")) + def getErrors( + @QueryParam("type") errorType: String, + @QueryParam("limit") limitParam: String + ): Elem = { + val limit = Option(limitParam).map(_.toInt).getOrElse(50) + val errors = DataQualityMonitor.getErrorDetails(errorType, limit) + + + {ServerHeader.getHeader(s"Error Details: $errorType", true)} + +
+

Error Details

+

{errorType}

+ +

+ ← Back to Overview + {" "} + Export to CSV +

+ + { + if (errors.isEmpty) { +
+ No errors found for type: {errorType} +
+ } else { +
+

Showing {errors.size} most recent errors

+ + + + + + + + + + + { + for (error <- errors) yield { + val timestamp = new java.text.SimpleDateFormat("yyyy-MM-dd HH:mm:ss") + .format(new java.util.Date(error.timestamp)) + + + + + + + } + } + +
Page TitleError MessageExceptionTimestamp
{error.pageTitle}{error.message}{error.exceptionType.getOrElse("-")}{timestamp}
+
+ } + } +
+ + + } + + /** + * Export errors to CSV format + */ + @GET + @Path("export") + @Produces(Array("text/csv")) + def exportCsv( + @QueryParam("type") errorType: String, + @QueryParam("limit") limitParam: String + ): String = { + val limit = Option(limitParam).map(_.toInt).getOrElse(1000) + DataQualityMonitor.exportToCsv(errorType, limit) + } +} diff --git a/server/src/main/scala/org/dbpedia/extraction/server/resources/Root.scala b/server/src/main/scala/org/dbpedia/extraction/server/resources/Root.scala index 649c5d7435..6dab185d81 100644 --- a/server/src/main/scala/org/dbpedia/extraction/server/resources/Root.scala +++ b/server/src/main/scala/org/dbpedia/extraction/server/resources/Root.scala @@ -25,6 +25,7 @@ class Root Statistics - Mappings - Extractors - + Data Quality - Missing labels

diff --git a/server/src/test/scala/org/dbpedia/extraction/server/resources/DataQualityTest.scala b/server/src/test/scala/org/dbpedia/extraction/server/resources/DataQualityTest.scala new file mode 100644 index 0000000000..4a72dbabc2 --- /dev/null +++ b/server/src/test/scala/org/dbpedia/extraction/server/resources/DataQualityTest.scala @@ -0,0 +1,161 @@ +package org.dbpedia.extraction.server.resources + +import org.dbpedia.extraction.util.DataQualityMonitor +import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} + +class DataQualityTest extends FlatSpec with Matchers with BeforeAndAfter { + + val dataQuality = new DataQuality() + + before { + DataQualityMonitor.reset() + } + + "DataQuality resource" should "show empty state" in { + val html = dataQuality.get + + (html \\ "html").nonEmpty should be(true) + html.toString should include("No errors logged yet") + } + + it should "display error summary" in { + val m1 = DataQualityMonitor.forExtractor("HomepageExtractor") + val m2 = DataQualityMonitor.forExtractor("AbstractExtractor") + + m1.logInvalidData("Albert_Einstein", "Invalid IRI: malformed URL") + m1.logInvalidData("Nikola_Tesla", "Invalid IRI: malformed URL") + m2.logInvalidData("Isaac_Newton", "Missing property") + + val html = dataQuality.get.toString + html should include("Total Error Types:") + html should include("HomepageExtractor") + html should include("AbstractExtractor") + } + + it should "group by extractor" in { + DataQualityMonitor.forExtractor("ExtractorA").logInvalidData("Page1", "Error 1") + DataQualityMonitor.forExtractor("ExtractorB").logInvalidData("Page2", "Error 2") + + val html = dataQuality.get.toString + html should include("ExtractorA") + html should include("ExtractorB") + } + + "Metrics endpoint" should "return JSON" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + monitor.logInvalidData("Page1", "Invalid data") + + val json = dataQuality.getMetricsJson + json should include("totalErrors") + json should include("TestExtractor") + json.trim should startWith("{") + } + + it should "count errors correctly" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + (1 to 5).foreach(i => monitor.logInvalidData(s"Page$i", "Test error")) + + dataQuality.getMetricsJson should include("\"totalErrors\": 5") + } + + "Error details" should "show error table" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + monitor.logInvalidData("Albert_Einstein", "Invalid IRI syntax") + + val html = dataQuality.getErrors("TestExtractor:InvalidData", "10").toString + html should include(" monitor.logInvalidData(s"Page$i", "Invalid data error")) + + val html = dataQuality.getErrors("TestExtractor:InvalidData", "10").toString + html should include("Showing") + } + + "CSV export" should "export errors" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + monitor.logInvalidData("Einstein", "Invalid IRI") + + val csv = dataQuality.exportCsv("TestExtractor:InvalidData", "100") + csv should include("Extractor,PageTitle,ErrorMessage") + csv should include("Einstein") + } + + it should "escape commas" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + monitor.logInvalidData("Page1", "Invalid data with, comma, in message") + + val csv = dataQuality.exportCsv("TestExtractor:InvalidData", "100") + csv should include("Invalid data with; comma; in message") + } + + it should "respect export limit" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + (1 to 200).foreach(i => monitor.logInvalidData(s"Page$i", "Test error")) + + val csv = dataQuality.exportCsv("TestExtractor:InvalidData", "50") + csv.split("\n").length should be <= 51 + } + + "Integration" should "work with multiple extractors" in { + val homepage = DataQualityMonitor.forExtractor("HomepageExtractor") + val abstracts = DataQualityMonitor.forExtractor("AbstractExtractor") + val infobox = DataQualityMonitor.forExtractor("InfoboxExtractor") + + homepage.logInvalidData("Einstein", "Invalid homepage URL") + homepage.logInvalidData("Tesla", "Invalid homepage IRI") + abstracts.logInvalidData("Newton", "Missing abstract") + infobox.logInvalidData("Curie", "Invalid property value") + infobox.logInvalidData("Darwin", "Invalid property value") + + val page = dataQuality.get.toString + page should include("HomepageExtractor") + page should include("AbstractExtractor") + + dataQuality.getMetricsJson should include("\"totalErrors\": 5") + + val csv = dataQuality.exportCsv("HomepageExtractor:InvalidData", "100") + csv should include("Einstein") + csv should include("Tesla") + } + + it should "handle concurrent access" in { + val monitor = DataQualityMonitor.forExtractor("ConcurrentTest") + + val threads = (1 to 5).map { i => + new Thread { + override def run(): Unit = { + for (j <- 1 to 20) { + monitor.logInvalidData(s"Page_${i}_$j", s"Error from thread $i") + if (j % 5 == 0) dataQuality.getMetricsJson + } + } + } + } + + threads.foreach(_.start()) + threads.foreach(_.join()) + + dataQuality.getMetricsJson should include("\"totalErrors\": 100") + } + + it should "accumulate errors" in { + val monitor = DataQualityMonitor.forExtractor("TestExtractor") + + monitor.logInvalidData("Page1", "Error 1") + dataQuality.getMetricsJson should include("\"totalErrors\": 1") + + monitor.logInvalidData("Page2", "Error 2") + monitor.logInvalidData("Page3", "Error 3") + dataQuality.getMetricsJson should include("\"totalErrors\": 3") + } +}