Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
11 changes: 10 additions & 1 deletion conf/nutch-default.xml
Original file line number Diff line number Diff line change
Expand Up @@ -1794,6 +1794,15 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
</description>
</property>

<property>
<name>parser.delete.failed</name>
<value>false</value>
<description>Whether to delete a page from the index when parsing the page fails.
By default this property is disabled, because enabling it would delete an existing page from the
index even when a previous fetch produced content that was successfully parsed and indexed.
</description>
</property>

<property>
<name>parser.store.text</name>
<value>true</value>
Expand Down Expand Up @@ -2426,7 +2435,7 @@ CAUTION: Set the parser.timeout to -1 or a bigger value than 30, when using this
<property>
<name>link.delete.gone</name>
<value>false</value>
<description>Whether to delete gone pages from the web graph.</description>
<description>Whether to delete gone pages from the web graph. Gone pages include redirects and duplicates.</description>
</property>

<property>
Expand Down
6 changes: 6 additions & 0 deletions src/java/org/apache/nutch/crawl/CrawlDatum.java
Original file line number Diff line number Diff line change
Expand Up @@ -75,6 +75,8 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
public static final byte STATUS_DB_DUPLICATE = 0x07;
/** Page was marked as orphan, e.g. has no inlinks anymore */
public static final byte STATUS_DB_ORPHAN = 0x08;
/** Page parsing failed */
public static final byte STATUS_DB_PARSE_FAILED = 0x09;

/** Maximum value of DB-related status. */
public static final byte STATUS_DB_MAX = 0x1f;
Expand Down Expand Up @@ -103,6 +105,8 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
public static final byte STATUS_LINKED = 0x43;
/** Page got metadata from a parser */
public static final byte STATUS_PARSE_META = 0x44;
/** Page parse failed */
public static final byte STATUS_PARSE_FAILED = 0x45;

public static final HashMap<Byte, String> statNames = new HashMap<>();
static {
Expand All @@ -114,6 +118,7 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
statNames.put(STATUS_DB_NOTMODIFIED, "db_notmodified");
statNames.put(STATUS_DB_DUPLICATE, "db_duplicate");
statNames.put(STATUS_DB_ORPHAN, "db_orphan");
statNames.put(STATUS_DB_PARSE_FAILED, "db_parse_failed");
statNames.put(STATUS_SIGNATURE, "signature");
statNames.put(STATUS_INJECTED, "injected");
statNames.put(STATUS_LINKED, "linked");
Expand All @@ -124,6 +129,7 @@ public class CrawlDatum implements WritableComparable<CrawlDatum>, Cloneable {
statNames.put(STATUS_FETCH_GONE, "fetch_gone");
statNames.put(STATUS_FETCH_NOTMODIFIED, "fetch_notmodified");
statNames.put(STATUS_PARSE_META, "parse_metadata");
statNames.put(STATUS_PARSE_FAILED, "parse_failed");

oldToNew.put(OLD_STATUS_DB_UNFETCHED, STATUS_DB_UNFETCHED);
oldToNew.put(OLD_STATUS_DB_FETCHED, STATUS_DB_FETCHED);
Expand Down
16 changes: 13 additions & 3 deletions src/java/org/apache/nutch/crawl/CrawlDbReducer.java
Original file line number Diff line number Diff line change
Expand Up @@ -115,7 +115,9 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
continue;
}

if (CrawlDatum.hasFetchStatus(datum)) {
// temporarily handle parse_failed as fetched (it was fetched!)
if (CrawlDatum.hasFetchStatus(datum)
|| datum.getStatus() == CrawlDatum.STATUS_PARSE_FAILED) {
if (!fetchSet) {
if (multiple) {
fetch.set(datum);
Expand All @@ -130,7 +132,7 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
}
continue;
}

switch (datum.getStatus()) { // collect other info
case CrawlDatum.STATUS_LINKED:
CrawlDatum link;
Expand Down Expand Up @@ -233,7 +235,7 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
}
break;

case CrawlDatum.STATUS_FETCH_SUCCESS: // succesful fetch
case CrawlDatum.STATUS_FETCH_SUCCESS: // successful fetch
case CrawlDatum.STATUS_FETCH_REDIR_TEMP: // successful fetch, redirected
case CrawlDatum.STATUS_FETCH_REDIR_PERM:
case CrawlDatum.STATUS_FETCH_NOTMODIFIED: // successful fetch, notmodified
Expand Down Expand Up @@ -320,6 +322,14 @@ public void reduce(Text key, Iterable<CrawlDatum> values,
}
break;

case CrawlDatum.STATUS_PARSE_FAILED: // successful fetch, but parse failed
if (oldSet)
result.setSignature(old.getSignature()); // use old signature
result.setStatus(CrawlDatum.STATUS_DB_PARSE_FAILED);
result = schedule.setPageGoneSchedule(key, result, prevFetchTime,
prevModifiedTime, fetch.getFetchTime());
break;

case CrawlDatum.STATUS_FETCH_GONE: // permanent failure
if (oldSet)
result.setSignature(old.getSignature()); // use old signature
Expand Down
5 changes: 5 additions & 0 deletions src/java/org/apache/nutch/fetcher/FetcherThread.java
Original file line number Diff line number Diff line change
Expand Up @@ -113,6 +113,7 @@ public class FetcherThread extends Thread {
URLNormalizers normalizersForOutlinks;

private boolean skipTruncated;
private boolean deleteFailedParse;

private boolean halted = false;

Expand Down Expand Up @@ -186,6 +187,7 @@ public FetcherThread(Configuration conf, AtomicInteger activeThreads, FetchItemQ
this.scfilters = new ScoringFilters(conf);
this.parseUtil = new ParseUtil(conf);
this.skipTruncated = conf.getBoolean(ParseSegment.SKIP_TRUNCATED, true);
this.deleteFailedParse = conf.getBoolean(ParseSegment.DELETE_FAILED_PARSE, false);
this.signatureWithoutParsing = conf.getBoolean("fetcher.signature", false);
this.protocolFactory = new ProtocolFactory(conf);
this.normalizers = new URLNormalizers(conf, URLNormalizers.SCOPE_FETCHER);
Expand Down Expand Up @@ -476,6 +478,9 @@ public void run() {
fit = queueRedirect(redirUrl, fit);
}
}
if (pstatus != null && pstatus.isFailed() && deleteFailedParse) {
output(fit.url, fit.datum, null, status, CrawlDatum.STATUS_PARSE_FAILED);
}
break;

case ProtocolStatus.MOVED: // redirect
Expand Down
12 changes: 12 additions & 0 deletions src/java/org/apache/nutch/indexer/IndexerMapReduce.java
Original file line number Diff line number Diff line change
Expand Up @@ -226,6 +226,7 @@ public static class IndexerReducer extends
private Counter deletedGoneCounter;
private Counter deletedRedirectsCounter;
private Counter deletedDuplicatesCounter;
private Counter deletedFailedParseCounter;
private Counter skippedNotModifiedCounter;
private Counter deletedByIndexingFilterCounter;
private Counter skippedByIndexingFilterCounter;
Expand Down Expand Up @@ -279,6 +280,8 @@ private void initCounters(Reducer<Text, NutchWritable, Text, NutchIndexAction>.C
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_REDIRECTS_TOTAL);
deletedDuplicatesCounter = context.getCounter(
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_DUPLICATES_TOTAL);
deletedFailedParseCounter = context.getCounter(
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_DELETED_FAILED_PARSE_TOTAL);
skippedNotModifiedCounter = context.getCounter(
NutchMetrics.GROUP_INDEXER, NutchMetrics.INDEXER_SKIPPED_NOT_MODIFIED_TOTAL);
deletedByIndexingFilterCounter = context.getCounter(
Expand Down Expand Up @@ -354,6 +357,15 @@ public void reduce(Text key, Iterable<NutchWritable> values,
}
}

// Whether to delete pages where parsing failed
if (delete && fetchDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_PARSE_FAILED
|| dbDatum != null && dbDatum.getStatus() == CrawlDatum.STATUS_DB_PARSE_FAILED) {
deletedFailedParseCounter.increment(1);
context.write(key, DELETE_ACTION);
return;
}
}
// Whether to delete GONE or REDIRECTS
if (delete && fetchDatum != null) {
if (fetchDatum.getStatus() == CrawlDatum.STATUS_FETCH_GONE
Expand Down
3 changes: 3 additions & 0 deletions src/java/org/apache/nutch/metrics/NutchMetrics.java
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,9 @@ private NutchMetrics() {
/** Documents deleted as duplicates. */
public static final String INDEXER_DELETED_DUPLICATES_TOTAL = "deleted_duplicates_total";

/** Documents deleted because parsing failed. */
public static final String INDEXER_DELETED_FAILED_PARSE_TOTAL = "deleted_failed_parse_total";

/** Documents deleted by indexing filter. */
public static final String INDEXER_DELETED_BY_INDEXING_FILTER_TOTAL = "deleted_by_indexing_filter_total";

Expand Down
1 change: 1 addition & 0 deletions src/java/org/apache/nutch/parse/ParseSegment.java
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,7 @@ public class ParseSegment extends NutchTool implements Tool {
.getLogger(MethodHandles.lookup().lookupClass());

public static final String SKIP_TRUNCATED = "parser.skip.truncated";
public static final String DELETE_FAILED_PARSE = "parser.delete.failed";

public ParseSegment() {
this(null);
Expand Down
4 changes: 4 additions & 0 deletions src/java/org/apache/nutch/parse/ParseStatus.java
Original file line number Diff line number Diff line change
Expand Up @@ -188,6 +188,10 @@ public void write(DataOutput out) throws IOException {
/**
 * Indicates whether this parse completed successfully.
 *
 * @return {@code true} if the major status code equals {@link #SUCCESS},
 *         {@code false} otherwise
 */
public boolean isSuccess() {
  final boolean succeeded = (majorCode == SUCCESS);
  return succeeded;
}

/**
 * Indicates whether this parse failed.
 *
 * @return {@code true} if the major status code equals {@link #FAILED},
 *         {@code false} otherwise
 */
public boolean isFailed() {
  final boolean failed = (majorCode == FAILED);
  return failed;
}

/**
* @return a String representation of the first argument,
Expand Down
Loading
Loading