Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1794,6 +1794,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>

<property>
<name>parser.delete.failed</name>
<value>false</value>
<description>Whether to delete a page from the index when parsing the page fails.
By default this property is disabled, because enabling it would delete an existing page from the
index even when a previous fetch produced content that was successfully parsed and indexed.
</description>
</property>

<property>
<name>parser.store.text</name>
<value>true</value>
Expand Down Expand Up @@ -2426,7 +2435,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
<property>
<name>link.delete.gone</name>
<value>false</value>
<description>Whether to delete gone pages from the web graph.</description>
<description>Whether to delete gone pages from the web graph. Gone pages include redirects and duplicates.</description>
</property>

<property>
Expand Down
6 changes: 6 additions & 0 deletions src/java/org/apache/nutch/crawl/CrawlDatum.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
public static final byte STATUS_DB_DUPLICATE = 0x07;
/** Page was marked as orphan, e.g. has no inlinks anymore */
public static final byte STATUS_DB_ORPHAN = 0x08;
/** Page parsing failed */
public static final byte STATUS_DB_PARSE_FAILED = 0x09;

/** Maximum value of DB-related status. */
public static final byte STATUS_DB_MAX = 0x1f;
Expand Down Expand Up @@ -103,6 +105,8 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
public static final byte STATUS_LINKED = 0x43;
/** Page got metadata from a parser */
public static final byte STATUS_PARSE_META = 0x44;
/** Page parse failed */
public static final byte STATUS_PARSE_FAILED = 0x45;

public static final HashMap<Byte, String> statNames = new HashMap<>();
static {
Expand All @@ -114,6 +118,7 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
statNames.put(STATUS_DB_ORPHAN, "db_orphan");
statNames.put(STATUS_DB_PARSE_FAILED, "db_parse_failed");
statNames.put(STATUS_SIGNATURE, "signature");
statNames.put(STATUS_INJECTED, "injected");
statNames.put(STATUS_LINKED, "linked");
Expand All @@ -124,6 +129,7 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
statNames.put(STATUS_FETCH_GONE, "fetch_gone");
statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
statNames.put(STATUS_PARSE_META, "parse_metadata");
statNames.put(STATUS_PARSE_FAILED, "parse_failed");

oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
Expand Down
16 changes: 13 additions & 3 deletions src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
continue;
}

if (CrawlDatum.hasFetchStatus(datum)) {
// temporarily handle parse_failed as fetched (it was fetched!)
if (CrawlDatum.hasFetchStatus(datum)
|| datum.getStatus() == CrawlDatum.STATUS_PARSE_FAILED) {
if (!fetchSet) {
if (multiple) {
fetch.set(datum);
Expand All @@ -130,7 +132,7 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
}
continue;
}

switch (datum.getStatus()) { // collect other info
case CrawlDatum.STATUS_LINKED:
CrawlDatum link;
Expand Down Expand Up @@ -233,7 +235,7 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
}
break;

case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
case CrawlDatum.STATUS_FETCH_SUCCESS: // successful fetch
case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
case CrawlDatum.STATUS_FETCH_REDIR_PERM:
case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
Expand Down Expand Up @@ -320,6 +322,14 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
}
break;

case CrawlDatum.STATUS_PARSE_FAILED: // successful fetch, but parse failed
if (oldSet)
result.setSignature(old.getSignature()); // use old signature
result.setStatus(CrawlDatum.STATUS_DB_PARSE_FAILED);
result = schedule.setPageGoneSchedule(key, result, prevFetchTime,
prevModifiedTime, fetch.getFetchTime());
break;

case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
if (oldSet)
result.setSignature(old.getSignature()); // use old signature
Expand Down
5 changes: 5 additions & 0 deletions src/java/org/apache/nutch/fetcher/FetcherThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ public class FetcherThread extends Thread {
URLNormalizers normalizersForOutlinks;

private boolean skipTruncated;
private boolean deleteFailedParse;

private boolean halted = false;

Expand Down Expand Up @@ -186,6 +187,7 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
this.scfilters = new ScoringFilters(conf);
this.parseUtil = new ParseUtil(conf);
this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
this.deleteFailedParse = conf.getBoolean(ParseSegment.DELETE_FAILED_PARSE, false);
this.signatureWithoutParsing = conf.getBoolean("fetcher.signature", false);
this.protocolFactory = new ProtocolFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
Expand Down Expand Up @@ -476,6 +478,9 @@ public void run() {
fit = queueRedirect(redirUrl, fit);
}
}
if (pstatus != null && pstatus.isFailed() && deleteFailedParse) {
output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_PARSE_FAILED);
}
break;

case ProtocolStatus.MOVED: // redirect
Expand Down
12 changes: 12 additions & 0 deletions src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ public static class IndexerReducer extends
private Counter deletedGoneCounter;
private Counter deletedRedirectsCounter;
private Counter deletedDuplicatesCounter;
private Counter deletedFailedParseCounter;
private Counter skippedNotModifiedCounter;
private Counter deletedByIndexingFilterCounter;
private Counter skippedByIndexingFilterCounter;
Expand Down Expand Up @@ -279,6 +280,8 @@ private void initCounters(Reducer<Text, NutchWritable, Text, NutchIndexAction>.C
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL);
deletedDuplicatesCounter = context.getCounter(
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL);
deletedFailedParseCounter = context.getCounter(
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_FAILED_PARSE_TOTAL);
skippedNotModifiedCounter = context.getCounter(
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL);
deletedByIndexingFilterCounter = context.getCounter(
Expand Down Expand Up @@ -354,6 +357,15 @@ public void reduce(Text key, Iterable<NutchWritable> values,
}
}

// Whether to delete pages where parsing failed
if (delete && fetchDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_PARSE_FAILED
|| dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_PARSE_FAILED) {
deletedFailedParseCounter.increment(1);
context.write(key, DELETE_ACTION);
return;
}
}
// Whether to delete GONE or REDIRECTS
if (delete && fetchDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
Expand Down
3 changes: 3 additions & 0 deletions src/java/org/apache/nutch/metrics/NutchMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ private NutchMetrics() {
/** Documents deleted as duplicates. */
public static final String INDEXER_DELETED_DUPLICATES_TOTAL = "deleted_duplicates_total";

/** Documents deleted because parsing failed. */
public static final String INDEXER_DELETED_FAILED_PARSE_TOTAL = "deleted_failed_parse_total";

/** Documents deleted by indexing filter. */
public static final String INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL = "deleted_by_indexing_filter_total";

Expand Down
1 change: 1 addition & 0 deletions src/java/org/apache/nutch/parse/ParseSegment.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ public class ParseSegment extends NutchTool implements Tool {
.getLogger(MethodHandles.lookup().lookupClass());

public static final String SKIP_TRUNCATED = "parser.skip.truncated";
public static final String DELETE_FAILED_PARSE = "parser.delete.failed";

public ParseSegment() {
this(null);
Expand Down
4 changes: 4 additions & 0 deletions src/java/org/apache/nutch/parse/ParseStatus.java
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,10 @@ public void write(DataOutput out) throws IOException {
/**
 * Indicates whether this parse completed successfully.
 *
 * @return {@code true} if the major status code equals {@link #SUCCESS},
 *         {@code false} otherwise
 */
public boolean isSuccess() {
  final boolean succeeded = (majorCode == SUCCESS);
  return succeeded;
}

/**
 * Indicates whether this parse failed.
 *
 * @return {@code true} if the major status code equals {@link #FAILED},
 *         {@code false} otherwise
 */
public boolean isFailed() {
  final boolean failed = (majorCode == FAILED);
  return failed;
}

/**
* @return a String representation of the first argument,
Expand Down
Loading
Loading