Skip to content
Open
Show file tree
Hide file tree
Changes from 2 commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
2dc07d9
Address deprecation in Nutch codebase
lewismc Sep 24, 2025
4d0431d
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Oct 20, 2025
d97f408
Merge branch 'master' into NUTCH-3130
lewismc Feb 22, 2026
8997e64
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 22, 2026
01239fc
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 22, 2026
5d5920a
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
3a49221
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
1ba758b
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
608f76f
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
e4a6faf
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
cc5d047
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
5656cc4
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
7043fa6
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
6f013c7
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
c0ebcca
Merge branch 'master' into NUTCH-3130
lewismc Feb 23, 2026
a1854bb
NUTCH-3130 Address deprecated API usage across Nutch codebase and build
lewismc Feb 23, 2026
0d37158
Merge branch 'master' into NUTCH-3130
lewismc Feb 24, 2026
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
30 changes: 25 additions & 5 deletions .github/workflows/master-build.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.

name: master pull request ci
name: master branch ci
on:
push:
branches: [master]
Expand Down Expand Up @@ -89,16 +89,36 @@ jobs:
- '.github/workflows/*'
# run if the build configuration or both 'core' and 'plugins' files were changed
- name: test all
id: build_all
if: ${{ steps.filter.outputs.buildconf == 'true' || ( steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'true' ) }}
run: ant clean test -buildfile build.xml
run: ant clean test -buildfile build.xml | tee build.log
# run only if 'core' files were changed
- name: test core
id: build_core
if: ${{ steps.filter.outputs.core == 'true' && steps.filter.outputs.plugins == 'false' && steps.filter.outputs.buildconf == 'false' }}
run: ant clean test-core -buildfile build.xml
run: ant clean test-core -buildfile build.xml | tee build.log
# run only if 'plugins' files were changed
- name: test plugins
id: build_plugins
if: ${{ steps.filter.outputs.plugins == 'true' && steps.filter.outputs.core == 'false' && steps.filter.outputs.buildconf == 'false' }}
run: ant clean test-plugins -buildfile build.xml
run: ant clean test-plugins -buildfile build.xml | tee build.log
# check for deprecation warnings in build output
- name: Check for deprecation warnings
if: always()
run: |
if [ -f build.log ]; then
if grep -iEq "warning: \[deprecation\]" build.log ; then
echo "============================================================="
echo "= ❌ Java deprecation warnings detected! Failing the build. ="
echo "============================================================="
grep -iE "warning: \[deprecation\]" -A 2 build.log
exit 1
else
echo "✅ No Java deprecation warnings found."
fi
else
echo "⚠️ build.log not found, skipping deprecation check."
fi
- name: Upload Test Report
uses: actions/upload-artifact@v4
if: always()
Expand All @@ -108,4 +128,4 @@ jobs:
./build/test/TEST-*.xml
./build/**/test/TEST-*.xml
retention-days: 1
overwrite: true
overwrite: true
25 changes: 9 additions & 16 deletions build.xml
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@

<property environment="env"/>

<property name="spotbugs.version" value="4.2.0" />
<property name="spotbugs.version" value="4.9.6" />
<property name="spotbugs.home" value="${ivy.dir}/spotbugs-${spotbugs.version}" />
<property name="spotbugs.jar" value="${spotbugs.home}/lib/spotbugs-ant.jar" />

Expand Down Expand Up @@ -79,7 +79,12 @@
</path>

<presetdef name="javac">
<javac includeantruntime="false" />
<javac includeantruntime="false"
encoding="${build.encoding}"
debug="${javac.debug}"
optimize="${javac.optimize}"
release="${javac.version}"
deprecation="${javac.deprecation}"/>
</presetdef>

<target name="dependencytree" depends="resolve-default" description="Show dependency tree">
Expand Down Expand Up @@ -120,15 +125,9 @@

<target name="compile-core" depends="init, resolve-default" description="--> compile core Java files only">
<javac
encoding="${build.encoding}"
srcdir="${src.dir}"
includes="org/apache/nutch/**/*.java"
destdir="${build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
target="${javac.version}"
source="${javac.version}"
deprecation="${javac.deprecation}">
destdir="${build.classes}">
<compilerarg value="-Xlint:-path"/>
<classpath refid="classpath"/>
</javac>
Expand Down Expand Up @@ -450,15 +449,9 @@
<!-- ================================================================== -->
<target name="compile-core-test" depends="init, compile-core, resolve-test" description="--> compile test code">
<javac
encoding="${build.encoding}"
srcdir="${test.src.dir}"
includes="org/apache/nutch/**/*.java"
destdir="${test.build.classes}"
debug="${javac.debug}"
optimize="${javac.optimize}"
target="${javac.version}"
source="${javac.version}"
deprecation="${javac.deprecation}">
destdir="${test.build.classes}">
<compilerarg value="-Xlint:-path"/>
<classpath refid="test.classpath"/>
</javac>
Expand Down
78 changes: 27 additions & 51 deletions src/java/org/apache/nutch/crawl/CrawlDbReader.java
Original file line number Diff line number Diff line change
Expand Up @@ -16,47 +16,27 @@
*/
package org.apache.nutch.crawl;

import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Date;
import java.util.HashMap;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Random;
import java.util.TreeMap;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.json.JsonWriteFeature;
import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.TDigest;
import org.apache.commons.jexl3.JexlScript;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FSDataOutputStream;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.BytesWritable;
import org.apache.hadoop.io.FloatWritable;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.MapFile;
import org.apache.hadoop.io.SequenceFile;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.io.Writable;
import org.apache.hadoop.io.*;
import org.apache.hadoop.io.compress.CompressionCodec;
import org.apache.hadoop.io.compress.GzipCodec;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.RecordWriter;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.TaskAttemptContext;
import org.apache.hadoop.mapreduce.*;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.input.SequenceFileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
Expand All @@ -67,26 +47,22 @@
import org.apache.hadoop.util.ReflectionUtils;
import org.apache.hadoop.util.StringUtils;
import org.apache.hadoop.util.ToolRunner;
import org.apache.nutch.util.AbstractChecker;
import org.apache.nutch.util.JexlUtil;
import org.apache.nutch.util.NutchConfiguration;
import org.apache.nutch.util.NutchJob;
import org.apache.nutch.util.SegmentReaderUtil;
import org.apache.nutch.util.StringUtil;
import org.apache.nutch.util.TimingUtil;
import org.apache.nutch.util.*;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import com.fasterxml.jackson.core.JsonGenerationException;
import com.fasterxml.jackson.core.JsonGenerator;
import com.fasterxml.jackson.core.util.MinimalPrettyPrinter;
import com.fasterxml.jackson.databind.JsonSerializer;
import com.fasterxml.jackson.databind.ObjectMapper;
import com.fasterxml.jackson.databind.ObjectWriter;
import com.fasterxml.jackson.databind.SerializerProvider;
import com.fasterxml.jackson.databind.module.SimpleModule;
import com.tdunning.math.stats.MergingDigest;
import com.tdunning.math.stats.TDigest;
import java.io.Closeable;
import java.io.DataOutputStream;
import java.io.File;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.ByteBuffer;
import java.util.*;
import java.util.Map.Entry;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Read utility for the CrawlDB.
Expand Down Expand Up @@ -263,7 +239,7 @@ protected static class LineRecordWriter
public LineRecordWriter(DataOutputStream out) {
this.out = out;
jsonMapper.getFactory()
.configure(JsonGenerator.Feature.ESCAPE_NON_ASCII, true);
.configure(JsonWriteFeature.ESCAPE_NON_ASCII.mappedFeature(), true);
SimpleModule module = new SimpleModule();
module.addSerializer(Writable.class, new WritableSerializer());
jsonMapper.registerModule(module);
Expand Down
9 changes: 0 additions & 9 deletions src/java/org/apache/nutch/indexer/IndexWriter.java
Original file line number Diff line number Diff line change
Expand Up @@ -30,15 +30,6 @@ public interface IndexWriter extends Pluggable, Configurable {
*/
final static String X_POINT_ID = IndexWriter.class.getName();

/**
* @param conf Nutch configuration
* @param name target name of the {@link IndexWriter} to be opened
* @throws IOException Some exception thrown by some writer.
* @deprecated use {@link #open(IndexWriterParams)} instead.
*/
@Deprecated
Comment thread
lewismc marked this conversation as resolved.
public void open(Configuration conf, String name) throws IOException;

/**
* Initializes the internal variables from a given index writer configuration.
*
Expand Down
2 changes: 1 addition & 1 deletion src/java/org/apache/nutch/indexer/IndexWriters.java
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ private Collection<String> getIndexWriters(NutchDocument doc) {
public void open(Configuration conf, String name) throws IOException {
for (Map.Entry<String, IndexWriterWrapper> entry : this.indexWriters
.entrySet()) {
entry.getValue().getIndexWriter().open(conf, name);
entry.getValue().getIndexWriter().open(new IndexWriterParams(new HashMap<>()));
Comment thread
lewismc marked this conversation as resolved.
entry.getValue().getIndexWriter()
.open(entry.getValue().getIndexWriterConfig().getParams());
}
Expand Down
4 changes: 2 additions & 2 deletions src/java/org/apache/nutch/metadata/SpellCheckedMetadata.java
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ private static String normalize(final String str) {
* <li>CoNtEntType gives Content-Type</li>
* <li>ConTnTtYpe gives Content-Type</li>
* </ul>
* If no matching with a well-known metadata name is found, then the original
* If no well-known metadata name match is found, then the original
* name is returned.
*
* @param name
Expand All @@ -115,7 +115,7 @@ public static String getNormalizedName(final String name) {
if ((value == null) && (normalized != null)) {
int threshold = Math.min(3, searched.length() / TRESHOLD_DIVIDER);
for (int i = 0; i < normalized.length && value == null; i++) {
if (StringUtils.getLevenshteinDistance(searched, normalized[i]) < threshold) {
if (StringUtils.compareIgnoreCase(searched, normalized[i]) < threshold) { //.getLevenshteinDistance(searched, normalized[i]) < threshold) {
Comment thread
lewismc marked this conversation as resolved.
Outdated
value = NAMES_IDX.get(normalized[i]);
}
}
Expand Down
5 changes: 0 additions & 5 deletions src/java/org/apache/nutch/plugin/Plugin.java
Original file line number Diff line number Diff line change
Expand Up @@ -88,9 +88,4 @@ public PluginDescriptor getDescriptor() {
private void setDescriptor(PluginDescriptor descriptor) {
fDescriptor = descriptor;
}

@Override
protected void finalize() throws Throwable {
shutDown();
Copy link
Copy Markdown
Member Author

@lewismc lewismc Oct 20, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure we can simply remove the call to shutDown(). I need to investigate the options further and confirm.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same for me.

Copy link
Copy Markdown
Member Author

@lewismc lewismc Feb 23, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@sebastian-nagel the smoke test passes: https://ci-builds.apache.org/job/Nutch/job/Nutch-Smoke-Test-Single-Node-Hadoop-Cluster/49/
I'm honestly not sure how to test this further... the same goes for similar removals in this PR.

}
}
27 changes: 13 additions & 14 deletions src/java/org/apache/nutch/plugin/PluginRepository.java
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

import java.lang.invoke.MethodHandles;
import java.lang.reflect.Array;
import java.lang.ref.Cleaner;
import java.lang.reflect.Constructor;
import java.lang.reflect.InvocationTargetException;
import java.lang.reflect.Method;
Expand Down Expand Up @@ -70,6 +71,8 @@ public class PluginRepository implements URLStreamHandlerFactory {

protected static final Logger LOG = LoggerFactory.getLogger(MethodHandles.lookup().lookupClass());

private static final Cleaner CLEANER = Cleaner.create();

/**
* @param conf a populated {@link Configuration}
* @throws RuntimeException if a fatal runtime error is encountered
Expand Down Expand Up @@ -98,13 +101,22 @@ public PluginRepository(Configuration conf) throws RuntimeException {
try {
installExtensions(this.fRegisteredPlugins);
} catch (PluginRuntimeException e) {
LOG.error("Could not install extensions.", e.toString());
LOG.error("Could not install extensions. {}", e.toString());
Comment thread
lewismc marked this conversation as resolved.
Outdated
throw new RuntimeException(e.getMessage());
}

registerURLStreamHandlerFactory();

displayStatus();

// Register cleanup action with Cleaner
CLEANER.register(this, () -> {
try {
shutDownActivatedPlugins();
} catch (PluginRuntimeException e) {
LOG.error("Error during cleanup of activated plugins", e);
}
});
}

/**
Expand Down Expand Up @@ -313,19 +325,6 @@ public Plugin getPluginInstance(PluginDescriptor pDescriptor)
}
}

/**
* Attempts to shut down all activated plugins.
* @deprecated
* @see <a href="https://openjdk.java.net/jeps/421">JEP 421: Deprecate Finalization for Removal</a>
* @see java.lang.Object#finalize()
* @deprecated
*/
@Override
@Deprecated
public void finalize() throws Throwable {
shutDownActivatedPlugins();
}

/**
* Shuts down all plugins
*
Expand Down
21 changes: 11 additions & 10 deletions src/java/org/apache/nutch/scoring/webgraph/LinkDumper.java
Original file line number Diff line number Diff line change
Expand Up @@ -27,10 +27,9 @@

import org.apache.commons.cli.CommandLine;
import org.apache.commons.cli.CommandLineParser;
import org.apache.commons.cli.GnuParser;
import org.apache.commons.cli.DefaultParser;
import org.apache.commons.cli.HelpFormatter;
import org.apache.commons.cli.Option;
import org.apache.commons.cli.OptionBuilder;
import org.apache.commons.cli.Options;
import org.apache.commons.lang3.time.StopWatch;
import org.slf4j.Logger;
Expand Down Expand Up @@ -417,18 +416,20 @@ public static void main(String[] args) throws Exception {
public int run(String[] args) throws Exception {

Options options = new Options();
OptionBuilder.withArgName("help");
OptionBuilder.withDescription("show this help message");
Option helpOpts = OptionBuilder.create("help");
Option helpOpts = Option.builder("help")
.argName("help")
.desc("show this help message")
.build();
options.addOption(helpOpts);

OptionBuilder.withArgName("webgraphdb");
OptionBuilder.hasArg();
OptionBuilder.withDescription("the web graph database to use");
Option webGraphDbOpts = OptionBuilder.create("webgraphdb");
Option webGraphDbOpts = Option.builder("webgraphdb")
.argName("webgraphdb")
.hasArg()
.desc("the web graph database to use")
.build();
options.addOption(webGraphDbOpts);

CommandLineParser parser = new GnuParser();
CommandLineParser parser = new DefaultParser();
try {

CommandLine line = parser.parse(options, args);
Expand Down
Loading