diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml index 5e9bb20ca8..7bf7728437 100644 --- a/.github/workflows/junit-report.yml +++ b/.github/workflows/junit-report.yml @@ -24,21 +24,44 @@ concurrency: cancel-in-progress: true permissions: + actions: read checks: write contents: read issues: write pull-requests: write jobs: checks: + if: github.event.workflow_run.conclusion == 'success' runs-on: ubuntu-latest steps: - - name: Download Test Report (Ubuntu) + - name: Set up JDK 17 + uses: actions/setup-java@v5 + with: + java-version: '17' + distribution: 'temurin' + - name: Download Test Report (Ubuntu JDK 17) uses: dawidd6/action-download-artifact@v11 with: - name: junit-test-results-ubuntu-latest + name: junit-test-results-ubuntu-latest-jdk17 workflow: master-build.yml run_id: ${{ github.event.workflow_run.id }} - continue-on-error: true + path: ./junit-ubuntu-jdk17 + - name: Verify JUnit XML layout + run: | + set -euo pipefail + shopt -s globstar nullglob + root="./junit-ubuntu-jdk17" + if [ ! -d "$root" ]; then + echo "::error::Download path $root is missing." + exit 1 + fi + files=("$root"/build/**/TEST-*.xml) + if [ ${#files[@]} -eq 0 ] || [ ! -e "${files[0]}" ]; then + echo "::error::No TEST-*.xml under $root/build/ (artifact missing, wrong layout, or download failed)." + find "$root" -maxdepth 5 -type d -print 2>/dev/null | head -80 || true + exit 1 + fi + echo "Found ${#files[@]} JUnit report file(s) under $root/build/." - name: Resolve PR number id: pr run: | @@ -59,12 +82,8 @@ jobs: - name: Publish Test Report uses: mikepenz/action-junit-report@v6 with: - report_paths: |- - ./test/TEST-*.xml - ./*/**/test/TEST-*.xml - check_name: |- - JUnit Test Report - JUnit Test Report Plugins + report_paths: ./junit-ubuntu-jdk17/build/**/TEST-*.xml + check_name: JUnit Test Report commit: ${{ github.event.workflow_run.head_sha }} fail_on_failure: false fail_on_parse_error: true diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index a4419b8937..95cfcd1c25 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -26,19 +26,15 @@ concurrency: cancel-in-progress: true # Java Version Strategy: -# - BUILD: Requires Java 17+ (JUnit 6 dependency) -# - RUNTIME: Supports Java 11+ (javac.version=11 produces Java 11 bytecode) -# -# The 'build' job verifies bytecode compilation for both Java 11 and 17 targets. -# The 'runtime-java11' job verifies the built artifacts actually run on Java 11. -# The 'tests' job runs on JDK 17 (required by JUnit 6) with the default -# javac.version=11 bytecode target for backward compatibility. +# - Requires Java 17+ to build, test, and run (Hadoop 3.5+ client + JUnit 6). +# - Default bytecode: javac.version=17 (see default.properties). +# - CI exercises Eclipse Temurin JDK 17 and JDK 21 on Ubuntu (and tests on macOS). 
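> Note: the strategy comment above fixes the bytecode target at javac.version=17, and the "Verify bytecode version" step further down asserts class-file major version 61. For local debugging, the same check can be done from Java by reading the class-file header. A minimal sketch; the `ClassFileVersion` helper is hypothetical and not part of this patch:

```java
// Hypothetical helper (not part of this patch): prints the class-file major
// version that the "Verify bytecode version" CI step asserts to be 61.
// Class-file layout: u4 magic 0xCAFEBABE, u2 minor_version, u2 major_version.
import java.io.DataInputStream;
import java.io.FileInputStream;
import java.io.IOException;

public class ClassFileVersion {
  public static void main(String[] args) throws IOException {
    try (DataInputStream in = new DataInputStream(new FileInputStream(args[0]))) {
      if (in.readInt() != 0xCAFEBABE) {
        throw new IOException("Not a class file: " + args[0]);
      }
      int minor = in.readUnsignedShort();
      int major = in.readUnsignedShort(); // 61 corresponds to Java 17
      System.out.println(args[0] + ": major=" + major + ", minor=" + minor);
    }
  }
}
```

> Example: `java ClassFileVersion build/classes/org/apache/nutch/crawl/Injector.class` should report `major=61` after a javac.version=17 build.
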
jobs: javadoc: strategy: matrix: - java: ['17'] + java: ['17', '21'] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -61,7 +57,7 @@ jobs: rat: strategy: matrix: - java: ['17'] + java: ['17', '21'] os: [ubuntu-latest] runs-on: ${{ matrix.os }} steps: @@ -112,22 +108,21 @@ jobs: if: steps.filter.outputs.openapi == 'true' run: ./node_modules/.bin/lint-openapi openapi.yaml - # Build verification with Java bytecode target matrix - # Verifies bytecode compatibility for both Java 11 and Java 17 targets + # Build verification on JDK 17 and 21 (bytecode target Java 17) build: strategy: fail-fast: false matrix: - javac-version: ['11', '17'] + java: ['17', '21'] os: [ubuntu-latest] runs-on: ${{ matrix.os }} - name: build (javac.version=${{ matrix.javac-version }}) + name: build (jdk ${{ matrix.java }}, javac.version=17) steps: - uses: actions/checkout@v5 - - name: Set up JDK 17 + - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v5 with: - java-version: '17' + java-version: ${{ matrix.java }} distribution: 'temurin' - name: Cache Ivy dependencies uses: actions/cache@v4 @@ -136,14 +131,13 @@ jobs: key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} restore-keys: | ${{ runner.os }}-ivy- - - name: Build with javac.version=${{ matrix.javac-version }} - run: ant clean runtime -Djavac.version=${{ matrix.javac-version }} -buildfile build.xml + - name: Build with javac.version=17 + run: ant clean runtime -Djavac.version=17 -buildfile build.xml - name: Verify bytecode version run: | - # Extract and verify the bytecode version of compiled classes - # Java 11 = major version 55, Java 17 = major version 61 - EXPECTED_VERSION=${{ matrix.javac-version == '11' && '55' || '61' }} - echo "Expected major version: $EXPECTED_VERSION (Java ${{ matrix.javac-version }})" + # Java 17 = major version 61 + EXPECTED_VERSION=61 + echo "Expected major version: $EXPECTED_VERSION (Java 17 bytecode)" # Find a real class file (exclude package-info.class which may have different version) cd build/classes @@ -162,17 +156,21 @@ jobs: exit 1 fi - # Verify runtime compatibility on Java 11 - # This ensures the built artifacts can actually run on Java 11 - runtime-java11: + # Smoke-test runtime on the same JDK used to build (17 and 21) + runtime-smoke: needs: build + strategy: + fail-fast: false + matrix: + java: ['17', '21'] runs-on: ubuntu-latest + name: runtime-smoke (jdk ${{ matrix.java }}) steps: - uses: actions/checkout@v5 - - name: Set up JDK 17 for building + - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v5 with: - java-version: '17' + java-version: ${{ matrix.java }} distribution: 'temurin' - name: Cache Ivy dependencies uses: actions/cache@v4 @@ -181,38 +179,31 @@ jobs: key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} restore-keys: | ${{ runner.os }}-ivy- - - name: Build with Java 11 target - run: ant clean runtime -Djavac.version=11 -buildfile build.xml - - name: Set up JDK 11 for runtime verification - uses: actions/setup-java@v5 - with: - java-version: '11' - distribution: 'temurin' - - name: Verify runtime on Java 11 + - name: Build with javac.version=17 + run: ant clean runtime -Djavac.version=17 -buildfile build.xml + - name: Verify runtime on JDK ${{ matrix.java }} run: | - echo "Verifying Nutch can run on Java 11..." + echo "Verifying Nutch on JDK ${{ matrix.java }}..." 
java -version cd runtime/local - # Actually load Java classes by running showproperties - # This invokes org.apache.nutch.tools.ShowProperties and verifies the JAR loads bin/nutch showproperties | head -20 - echo "Java 11 runtime verification complete" + echo "Runtime smoke test complete" - # Tests run on JDK 17 (required by JUnit 6) with default javac.version=11 - # Java 11 runtime compatibility is verified by the runtime-java11 job tests: strategy: fail-fast: false matrix: + java: ['17', '21'] os: [ubuntu-latest, macos-latest] runs-on: ${{ matrix.os }} + name: tests (jdk ${{ matrix.java }}, ${{ matrix.os }}) timeout-minutes: 45 steps: - uses: actions/checkout@v5 - - name: Set up JDK 17 + - name: Set up JDK ${{ matrix.java }} uses: actions/setup-java@v5 with: - java-version: '17' + java-version: ${{ matrix.java }} distribution: 'temurin' - name: Cache Ivy dependencies uses: actions/cache@v4 @@ -266,9 +257,9 @@ jobs: fi - name: Upload Test Report uses: actions/upload-artifact@v4 - if: always() && matrix.os == 'ubuntu-latest' && steps.check_tests.outputs.has_results == 'true' + if: always() && matrix.os == 'ubuntu-latest' && matrix.java == '17' && steps.check_tests.outputs.has_results == 'true' with: - name: junit-test-results-${{ matrix.os }} + name: junit-test-results-${{ matrix.os }}-jdk${{ matrix.java }} path: | ./build/test/TEST-*.xml ./build/**/test/TEST-*.xml @@ -277,7 +268,7 @@ jobs: uses: actions/upload-artifact@v4 if: always() && matrix.os == 'ubuntu-latest' with: - name: coverage-data + name: coverage-data-ubuntu-jdk${{ matrix.java }} path: ./build/coverage/*.exec retention-days: 1 if-no-files-found: ignore diff --git a/.github/workflows/sonarcloud.yml b/.github/workflows/sonarcloud.yml index 894a78fe49..efcdb6a4cf 100644 --- a/.github/workflows/sonarcloud.yml +++ b/.github/workflows/sonarcloud.yml @@ -52,26 +52,27 @@ jobs: ${{ runner.os }}-ivy- - name: Compile (no tests) run: ant compile compile-plugins resolve-test -buildfile build.xml - - name: Download coverage data + # Coverage and JUnit XML come only from the master-build Ubuntu JDK 17 matrix job. 
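> Note: the comment above restricts coverage and JUnit XML to the Ubuntu JDK 17 job, and the "Flatten test reports" step below copies every TEST-*.xml into one directory with `find`. A sketch of the equivalent collection in Java, useful for reproducing the layout locally; the class name is hypothetical and the two paths are assumptions mirroring the workflow:

```java
// Hypothetical helper mirroring the "Flatten test reports" workflow step:
// copy every TEST-*.xml found under the downloaded artifact directory into a
// single flat directory that the Sonar scanner can read.
import java.io.IOException;
import java.io.UncheckedIOException;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import java.nio.file.StandardCopyOption;
import java.util.stream.Stream;

public class FlattenJUnitReports {
  public static void main(String[] args) throws IOException {
    Path root = Paths.get("build/test-jdk17");
    Path dest = Files.createDirectories(Paths.get("build/test-reports"));
    if (!Files.isDirectory(root)) {
      // Mirrors continue-on-error: missing reports are not fatal.
      System.err.println("No reports downloaded under " + root);
      return;
    }
    try (Stream<Path> files = Files.walk(root)) {
      files.filter(Files::isRegularFile)
          .filter(p -> p.getFileName().toString().startsWith("TEST-")
              && p.getFileName().toString().endsWith(".xml"))
          .forEach(p -> {
            try {
              Files.copy(p, dest.resolve(p.getFileName()),
                  StandardCopyOption.REPLACE_EXISTING);
            } catch (IOException e) {
              throw new UncheckedIOException(e);
            }
          });
    }
  }
}
```
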
+ - name: Download coverage data (Ubuntu JDK 17) uses: dawidd6/action-download-artifact@v11 with: - name: coverage-data + name: coverage-data-ubuntu-jdk17 workflow: master-build.yml run_id: ${{ github.event.workflow_run.id }} path: ./build/coverage/ continue-on-error: true - - name: Download test reports + - name: Download test reports (Ubuntu JDK 17) uses: dawidd6/action-download-artifact@v11 with: - name: junit-test-results-ubuntu-latest + name: junit-test-results-ubuntu-latest-jdk17 workflow: master-build.yml run_id: ${{ github.event.workflow_run.id }} - path: ./build/test/ + path: ./build/test-jdk17/ continue-on-error: true - - name: Flatten test reports + - name: Flatten test reports (JDK 17 only) run: | mkdir -p ./build/test-reports - find ./build/test -name 'TEST-*.xml' -exec cp {} ./build/test-reports/ \; + find ./build/test-jdk17 -name 'TEST-*.xml' -exec cp {} ./build/test-reports/ \; 2>/dev/null || true continue-on-error: true - name: Generate JaCoCo XML report run: ant jacoco-report -buildfile build.xml @@ -110,7 +111,7 @@ jobs: GH_TOKEN: ${{ github.token }} - name: SonarCloud Scan (PR) if: steps.pr.outputs.is_pr == 'true' - uses: SonarSource/sonarqube-scan-action@v6 + uses: SonarSource/sonarqube-scan-action@299e4b793aaa83bf2aba7c9c14bedbb485688ec4 with: args: > -Dsonar.pullrequest.key=${{ steps.pr.outputs.number }} @@ -121,7 +122,7 @@ jobs: SONAR_HOST_URL: https://sonarcloud.io - name: SonarCloud Scan (branch) if: steps.pr.outputs.is_pr == 'false' && steps.pr.outputs.skip != 'true' - uses: SonarSource/sonarqube-scan-action@v6 + uses: SonarSource/sonarqube-scan-action@299e4b793aaa83bf2aba7c9c14bedbb485688ec4 with: args: > -Dsonar.branch.name=${{ github.event.workflow_run.head_branch }} diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 8db586a959..58ba23c583 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -13,8 +13,9 @@ # See the License for the specific language governing permissions and # limitations under the License. # -# Apache Yetus test-patch: pre-commit patch testing (Ant, JDK 17). -# Runs alongside master-build.yml; all CI is unified on Java 17. +# Apache Yetus test-patch: pre-commit patch testing (apache/yetus-test-patch-action). +# The action runs inside ghcr.io/apache/yetus:0.15.1 — javahome must be a JDK path +# inside that image (OpenJDK 11 on amd64), not actions/setup-java on the runner. # See https://yetus.apache.org/documentation/0.15.1/precommit/ --- name: Apache Yetus @@ -43,7 +44,6 @@ jobs: timeout-minutes: 45 env: PATCH_DIR: ${{ github.workspace }}/out - JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 steps: - uses: actions/checkout@v5 with: @@ -61,7 +61,7 @@ jobs: basedir: . buildtool: nobuild githubtoken: ${{ secrets.GITHUB_TOKEN }} - javahome: '/usr/lib/jvm/java-17-openjdk-amd64' + javahome: /usr/lib/jvm/java-11-openjdk-amd64 patchdir: /github/workspace/out plugins: all,-jira,-gitlab,-unit,-compile project: nutch diff --git a/.yetus/blanks-tabs.txt b/.yetus/blanks-tabs.txt new file mode 100644 index 0000000000..07e4fb8c9c --- /dev/null +++ b/.yetus/blanks-tabs.txt @@ -0,0 +1,3 @@ +# Ignore tabs in Yetus-generated patch dir (not source files). +# See --blanks-tabs-ignore-file in the blanks plugin. +^out/ diff --git a/.yetus/personality.sh b/.yetus/personality.sh index d9da6ab782..cdf7756619 100644 --- a/.yetus/personality.sh +++ b/.yetus/personality.sh @@ -13,10 +13,13 @@ # See the License for the specific language governing permissions and # limitations under the License. 
-# Ensure JAVA_HOME is set for pre-patch and other phases when running in -# the Yetus Docker container (avoids "JAVA_HOME is not defined" in pre-patch). -if [ -z "${JAVA_HOME}" ] && [ -d "/usr/lib/jvm/java-17-openjdk-amd64" ]; then - export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64" +# Ensure JAVA_HOME is set for pre-patch and other phases. The Yetus GitHub Action +# runs inside ghcr.io/apache/yetus (see yetus.yml javahome). This fallback matches +# OpenJDK layouts on Debian/Ubuntu when JAVA_HOME is not already set. +if [ -z "${JAVA_HOME}" ] && [ -d "/usr/lib/jvm/java-11-openjdk-amd64" ]; then + export JAVA_HOME="/usr/lib/jvm/java-11-openjdk-amd64" +elif [ -z "${JAVA_HOME}" ] && [ -d "/usr/lib/jvm/java-11-openjdk-arm64" ]; then + export JAVA_HOME="/usr/lib/jvm/java-11-openjdk-arm64" fi # Pass JAVA_HOME into the re-exec Docker container so pre-patch and other diff --git a/README.md b/README.md index 13699fb98b..9ece018256 100644 --- a/README.md +++ b/README.md @@ -105,9 +105,9 @@ radio button and select "Eclipse". Click "Create". On the next screen the "Eclipse projects directory" should be already set to the nutch folder. Leave the "Create module files near .classpath files" radio button selected. -Click "Next" on the next screens. On the project SDK screen select Java 11 and -click "Create". **N.B.** On Mac with homebrew openjdk, use the directory under -_libexec_: `/libexec/openjdk.jdk/Contents/Home`. +Click "Next" on the next screens. On the project SDK screen select Java 17 +(or newer) and click "Create". **N.B.** On Mac with homebrew openjdk, use the +directory under _libexec_: `/libexec/openjdk.jdk/Contents/Home`. Once the project is imported, you will see a popup saying "Ant build scripts found", "Frameworks detected - IvyIDEA Framework detected". Click "Import". If diff --git a/build.xml b/build.xml index ea73583706..b6fa266a4a 100644 --- a/build.xml +++ b/build.xml @@ -56,17 +56,12 @@ diff --git a/default.properties b/default.properties index e0fde46d84..0454e0a8a5 100644 --- a/default.properties +++ b/default.properties @@ -46,11 +46,14 @@ test.failfast = false # JaCoCo code coverage jacoco.version=0.8.12 +# Apache Hadoop client libraries (Ivy rev for hadoop-common, hadoop-hdfs, mapreduce artifacts) +hadoop.version=3.5.0 + # Proxy Host and Port to use for building JavaDoc javadoc.proxy.host=-J-DproxyHost= javadoc.proxy.port=-J-DproxyPort= javadoc.link.java=https://docs.oracle.com/en/java/javase/17/docs/api/ -javadoc.link.hadoop=https://hadoop.apache.org/docs/r3.4.2/api/ +javadoc.link.hadoop=https://hadoop.apache.org/docs/r${hadoop.version}/api/ javadoc.packages=org.apache.nutch.* dist.dir=./dist @@ -61,12 +64,9 @@ javac.debug=on javac.optimize=on javac.deprecation=on -# Java bytecode target version for compiled classes. -# Set to 11 for backward-compatible runtime (works on Java 11+). -# Note: Building and running tests requires Java 17+ (JUnit 6 requirement), -# but the compiled artifacts will run on Java 11+. -# Override with: ant -Djavac.version=17 to target Java 17 bytecode. -javac.version=11 +# Java bytecode target version for compiled classes (Java 17; required with Hadoop 3.5+ client JARs). +# Build, test, and runtime require a Java 17+ JVM. +javac.version=17 runtime.dir=./runtime runtime.deploy=${runtime.dir}/deploy diff --git a/ivy/ivy.xml b/ivy/ivy.xml index 7b32242cc9..16d44674fa 100644 --- a/ivy/ivy.xml +++ b/ivy/ivy.xml @@ -16,99 +16,99 @@ limitations under the License. --> - - - - - Nutch is an open source web-search - software. 
It builds on Hadoop, Tika and Solr, adding web-specifics, - such as a crawler, a link-graph database etc. - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + + + + Nutch is an open source web-search + software. It builds on Hadoop, Tika and Solr, adding web-specifics, + such as a crawler, a link-graph database etc. + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + @@ -126,28 +126,28 @@ - - - - - - - - - - - - - - - - - - - - - - - + + + + + + + + + + + + + + + + + + + + + + + diff --git a/ivy/mvn.template b/ivy/mvn.template index 43ecfbd6af..d117c233b5 100644 --- a/ivy/mvn.template +++ b/ivy/mvn.template @@ -125,8 +125,8 @@ maven-compiler-plugin 3.13.0 - 11 - 11 + 17 + 17 diff --git a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java index 12dca9a945..30da0f39fa 100644 --- a/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java +++ b/src/java/org/apache/nutch/fetcher/FetcherOutputFormat.java @@ -31,7 +31,6 @@ import org.apache.hadoop.util.Progressable; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; -import org.apache.hadoop.mapred.InvalidJobConfException; import org.apache.hadoop.mapreduce.RecordWriter; import org.apache.hadoop.mapreduce.lib.output.SequenceFileOutputFormat; import org.apache.hadoop.mapreduce.TaskAttemptContext; @@ -48,7 +47,7 @@ public void checkOutputSpecs(JobContext job) throws IOException { Configuration conf = job.getConfiguration(); Path out = FileOutputFormat.getOutputPath(job); if ((out == null) && (job.getNumReduceTasks() != 0)) { - throw new InvalidJobConfException("Output directory not set in conf."); + throw new IOException("Output directory not set in conf."); } FileSystem fs = out.getFileSystem(conf); if (fs.exists(new Path(out, CrawlDatum.FETCH_DIR_NAME))) { diff --git a/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java b/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java index e14b9dd2b1..02ad4f147b 100644 --- a/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java +++ b/src/java/org/apache/nutch/segment/ContentAsTextInputFormat.java @@ -20,8 +20,8 @@ import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.mapreduce.Mapper.Context; diff --git a/src/java/org/apache/nutch/segment/SegmentReader.java b/src/java/org/apache/nutch/segment/SegmentReader.java index dc18d4adb9..3b386d24eb 100644 --- a/src/java/org/apache/nutch/segment/SegmentReader.java +++ b/src/java/org/apache/nutch/segment/SegmentReader.java @@ -30,6 +30,7 @@ import java.text.SimpleDateFormat; import java.util.ArrayList; import java.util.Arrays; +import java.util.Comparator; import java.util.Date; import java.util.HashMap; import java.util.Iterator; @@ -45,6 +46,7 @@ import org.apache.hadoop.fs.FileSystem; import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.MapFile; +import org.apache.hadoop.io.IOUtils; import org.apache.hadoop.io.SequenceFile; import org.apache.hadoop.io.Text; import org.apache.hadoop.io.Writable; @@ 
-489,25 +491,49 @@ private List getMapRecords(Path dir, Text key) throws Exception { } private List getSeqRecords(Path dir, Text key) throws Exception { - SequenceFile.Reader[] readers = org.apache.hadoop.mapred.SequenceFileOutputFormat - .getReaders(getConf(), dir); - ArrayList res = new ArrayList<>(); - Class keyClass = readers[0].getKeyClass(); - Class valueClass = readers[0].getValueClass(); - if (!keyClass.getName().equals("org.apache.hadoop.io.Text")) - throw new IOException("Incompatible key (" + keyClass.getName() + ")"); - WritableComparable aKey = (WritableComparable) keyClass.getConstructor().newInstance(); - Writable value = (Writable) valueClass.getConstructor().newInstance(); - for (int i = 0; i < readers.length; i++) { - while (readers[i].next(aKey, value)) { - if (aKey.equals(key)) { - res.add(value); - value = (Writable) valueClass.getConstructor().newInstance(); + Configuration conf = getConf(); + FileSystem fs = dir.getFileSystem(conf); + FileStatus[] listed = fs.listStatus(dir); + ArrayList parts = new ArrayList<>(); + for (FileStatus st : listed) { + if (!st.isFile()) { + continue; + } + String name = st.getPath().getName(); + if (!name.startsWith("_") && !name.startsWith(".")) { + parts.add(st); + } + } + FileStatus[] statuses = parts.toArray(new FileStatus[0]); + if (statuses.length == 0) { + throw new IOException("No sequence file parts under " + dir); + } + Arrays.sort(statuses, Comparator.comparing(f -> f.getPath().getName())); + SequenceFile.Reader[] readers = new SequenceFile.Reader[statuses.length]; + try { + for (int i = 0; i < statuses.length; i++) { + readers[i] = new SequenceFile.Reader(conf, + SequenceFile.Reader.file(statuses[i].getPath())); + } + ArrayList res = new ArrayList<>(); + Class keyClass = readers[0].getKeyClass(); + Class valueClass = readers[0].getValueClass(); + if (!keyClass.getName().equals("org.apache.hadoop.io.Text")) + throw new IOException("Incompatible key (" + keyClass.getName() + ")"); + WritableComparable aKey = (WritableComparable) keyClass.getConstructor().newInstance(); + Writable value = (Writable) valueClass.getConstructor().newInstance(); + for (int i = 0; i < readers.length; i++) { + while (readers[i].next(aKey, value)) { + if (aKey.equals(key)) { + res.add(value); + value = (Writable) valueClass.getConstructor().newInstance(); + } } } - readers[i].close(); + return res; + } finally { + IOUtils.cleanupWithLogger(LOG, readers); } - return res; } /** diff --git a/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java b/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java index abe985a851..fd86d2f72d 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java +++ b/src/java/org/apache/nutch/tools/arc/ArcInputFormat.java @@ -22,8 +22,8 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.conf.Configuration; import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.Job; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.RecordReader; diff --git a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java index cba4694ba8..e2e25ca5ed 100644 --- a/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java +++ b/src/java/org/apache/nutch/tools/arc/ArcRecordReader.java @@ -29,8 +29,8 @@ import org.apache.hadoop.fs.Path; import 
org.apache.hadoop.io.BytesWritable; import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.FileSplit; import org.apache.hadoop.mapreduce.InputSplit; +import org.apache.hadoop.mapreduce.lib.input.FileSplit; import org.apache.hadoop.mapreduce.TaskAttemptContext; import org.apache.hadoop.mapreduce.RecordReader; import org.apache.hadoop.util.ReflectionUtils; diff --git a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java index d11634bbd1..25f507ae92 100644 --- a/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java +++ b/src/test/org/apache/nutch/crawl/CrawlDBTestUtil.java @@ -43,8 +43,8 @@ import org.apache.hadoop.io.Text; import org.apache.hadoop.conf.Configuration.IntegerRanges; import org.apache.hadoop.io.RawComparator; -import org.apache.hadoop.mapred.Counters; -import org.apache.hadoop.mapred.Counters.Counter; +import org.apache.hadoop.mapreduce.Counter; +import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.JobID; import org.apache.hadoop.mapreduce.Mapper; @@ -141,12 +141,12 @@ public void progress() { @Override public Counter getCounter(Enum arg0) { - return dummyCounters.getGroup("dummy").getCounterForName("dummy"); + return dummyCounters.findCounter(arg0); } @Override public Counter getCounter(String arg0, String arg1) { - return dummyCounters.getGroup("dummy").getCounterForName("dummy"); + return dummyCounters.findCounter(arg0, arg1); } @Override diff --git a/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java b/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java index 1beab362be..aca818c219 100644 --- a/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java +++ b/src/test/org/apache/nutch/crawl/CrawlDbUpdateUtil.java @@ -30,8 +30,8 @@ import org.apache.hadoop.fs.Path; import org.apache.hadoop.io.RawComparator; import org.apache.hadoop.io.Text; -import org.apache.hadoop.mapred.Counters; -import org.apache.hadoop.mapred.Counters.Counter; +import org.apache.hadoop.mapreduce.Counter; +import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.InputFormat; import org.apache.hadoop.mapreduce.JobID; import org.apache.hadoop.mapreduce.Mapper; @@ -105,12 +105,12 @@ public void progress() { @Override public Counter getCounter(Enum arg0) { - return dummyCounters.getGroup("dummy").getCounterForName("dummy"); + return dummyCounters.findCounter(arg0); } @Override public Counter getCounter(String arg0, String arg1) { - return dummyCounters.getGroup("dummy").getCounterForName("dummy"); + return dummyCounters.findCounter(arg0, arg1); } @Override diff --git a/src/test/org/apache/nutch/util/ReducerContextWrapper.java b/src/test/org/apache/nutch/util/ReducerContextWrapper.java index 196116c4cb..ec683e466c 100644 --- a/src/test/org/apache/nutch/util/ReducerContextWrapper.java +++ b/src/test/org/apache/nutch/util/ReducerContextWrapper.java @@ -17,391 +17,88 @@ package org.apache.nutch.util; import java.io.IOException; -import java.net.URI; -import java.util.HashMap; import java.util.Map; +import java.util.Objects; import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.conf.Configuration.IntegerRanges; -import org.apache.hadoop.fs.Path; -import org.apache.hadoop.io.RawComparator; -import org.apache.hadoop.mapred.Counters; -import org.apache.hadoop.mapreduce.Counter; -import org.apache.hadoop.mapreduce.InputFormat; -import org.apache.hadoop.mapreduce.JobID; -import org.apache.hadoop.mapreduce.Mapper; -import 
org.apache.hadoop.mapreduce.OutputCommitter; -import org.apache.hadoop.mapreduce.OutputFormat; -import org.apache.hadoop.mapreduce.Partitioner; +import org.apache.hadoop.mapreduce.Counters; import org.apache.hadoop.mapreduce.Reducer; -import org.apache.hadoop.mapreduce.TaskAttemptID; -import org.apache.hadoop.security.Credentials; +import org.mockito.ArgumentMatchers; +import org.mockito.Mockito; +import org.mockito.invocation.InvocationOnMock; +import org.mockito.stubbing.Answer; /** - * This class wraps an implementation of {@link Reducer.Context}, to be used in unit tests, - * for example: TestIndexerMapReduce, TestCrawlDbStates.testCrawlDbStatTransitionInject. - * + * Supplies a {@link Reducer.Context} for unit tests (e.g. {@code IndexerMapReduce}, + * {@code Injector}) without subclassing Hadoop's abstract {@code Reducer.Context}. + *

+ * Hadoop marks several {@code JobContext} methods as {@link Deprecated}; an
+ * anonymous subclass that {@code @Override}s them triggers javac deprecation
+ * warnings. A Mockito mock implements only the behavior stubbed here, so test
+ * code does not reference those deprecated APIs directly.
+ *
  * @param <KEYIN> Type of input keys
  * @param <VALUEIN> Type of input values
  * @param <KEYOUT> Type of output keys
  * @param <VALUEOUT> Type of output values
  */
 public class ReducerContextWrapper<KEYIN, VALUEIN, KEYOUT, VALUEOUT> {
-
-  private Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> reducer;
-  private Configuration config;
-  private Counters counters;
-  private Map<KEYIN, VALUEIN> valuesIn;
-  private Map<KEYOUT, VALUEOUT> valuesOut;
-
-  private int valuesIndex;
-  private KEYIN currentKey;
-  private VALUEIN currentValue;
-  private Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context;
-
-  private String status;
+  private final Configuration config;
+  private final Counters counters = new Counters();
+  private final Map<KEYOUT, VALUEOUT> valuesOut;

-  public ReducerContextWrapper() {
-    counters = new Counters();
-    valuesIn = new HashMap<>();
-    valuesIndex = 0;
-  }
+  private Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context context;

   /**
-   * Constructs a ReducerContextWrapper
-   *
-   * @param reducer The reducer on which to implement the wrapped Reducer.Context
-   * @param config The configuration to inject in the wrapped Reducer.Context
-   * @param valuesOut The output values to fill (to fake the Hadoop process)
+   * @param reducer reducer under test (kept in the signature for call-site clarity; not used by the mock)
+   * @param config configuration exposed by {@link Reducer.Context#getConfiguration()}
+   * @param valuesOut map receiving {@link Reducer.Context#write(Object, Object)} calls
    */
-  public ReducerContextWrapper(Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> reducer, Configuration config, Map<KEYOUT, VALUEOUT> valuesOut) {
-    this();
-    this.config = config;
-    this.reducer = reducer;
-    this.valuesOut = valuesOut;
+  public ReducerContextWrapper(Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT> reducer,
+      Configuration config, Map<KEYOUT, VALUEOUT> valuesOut) {
+    Objects.requireNonNull(reducer, "reducer");
+    this.config = Objects.requireNonNull(config, "config");
+    this.valuesOut = Objects.requireNonNull(valuesOut, "valuesOut");
     initContext();
   }

   /**
-   * Return the wrapped Reducer.Context to be used in calls to Reducer.setup and Reducer.reduce, in unit test
-   * @return
+   * @return context suitable for {@link Reducer#setup(Reducer.Context)} and
+   *         {@link Reducer#reduce(Object, Iterable, Reducer.Context)}
    */
   public Reducer<KEYIN, VALUEIN, KEYOUT, VALUEOUT>.Context getContext() {
     return context;
   }
-
-  private void initContext() {
-    // most methods are not used in Nutch unit tests.
- context = reducer.new Context() { - - @Override - public KEYIN getCurrentKey() throws IOException, InterruptedException { - return currentKey; - } - - @Override - public VALUEIN getCurrentValue() throws IOException, InterruptedException { - return currentValue; - } - - @Override - public boolean nextKeyValue() throws IOException, InterruptedException { - return valuesIndex < valuesIn.size(); - } - - @SuppressWarnings("unchecked") - @Override - public void write(Object arg0, Object arg1) - throws IOException, InterruptedException { - valuesOut.put((KEYOUT) arg0, (VALUEOUT) arg1); - currentKey = (KEYIN) arg0; - currentValue = (VALUEIN) arg1; - valuesIndex++; - } - - @Override - public Counter getCounter(Enum arg0) { - return counters.findCounter(arg0); - } - - @Override - public Counter getCounter(String arg0, String arg1) { - return counters.findCounter(arg0, arg1); - } - - @Override - public float getProgress() { - return valuesIndex; - } - - @Override - public String getStatus() { - return status; - } - - @Override - public void setStatus(String arg0) { - status = arg0; - } - - @Override - public Configuration getConfiguration() { - return config; - } - - @Override - public Iterable getValues() - throws IOException, InterruptedException { - return valuesIn.values(); - } - - @Override - public boolean nextKey() throws IOException, InterruptedException { - return valuesIndex < valuesIn.size(); - } - - @Override - public OutputCommitter getOutputCommitter() { - // Auto-generated - return null; - } - - @Override - public TaskAttemptID getTaskAttemptID() { - // Auto-generated - return null; - } - - @Override - public Path[] getArchiveClassPaths() { - // Auto-generated - return null; - } - - @Override - public String[] getArchiveTimestamps() { - // Auto-generated - return null; - } - - @Override - public URI[] getCacheArchives() throws IOException { - // Auto-generated - return null; - } - - @Override - public URI[] getCacheFiles() throws IOException { - // Auto-generated - return null; - } - - @Override - public Class> getCombinerClass() - throws ClassNotFoundException { - // Auto-generated - return null; - } - - @Override - public RawComparator getCombinerKeyGroupingComparator() { - // Auto-generated - return null; - } - - @Override - public Credentials getCredentials() { - // Auto-generated - return null; - } - - @Override - public Path[] getFileClassPaths() { - // Auto-generated - return null; - } - - @Override - public String[] getFileTimestamps() { - // Auto-generated - return null; - } - - @Override - public RawComparator getGroupingComparator() { - // Auto-generated - return null; - } - @Override - public Class> getInputFormatClass() - throws ClassNotFoundException { - // Auto-generated - return null; - } - - @Override - public String getJar() { - // Auto-generated - return null; - } - - @Override - public JobID getJobID() { - // Auto-generated - return null; - } - - @Override - public String getJobName() { - // Auto-generated - return null; - } - - @Override - public boolean getJobSetupCleanupNeeded() { - // Auto-generated - return false; - } - - @Override - public Path[] getLocalCacheArchives() throws IOException { - // Auto-generated - return null; - } - - @Override - public Path[] getLocalCacheFiles() throws IOException { - // Auto-generated - return null; - } - - @Override - public Class getMapOutputKeyClass() { - // Auto-generated - return null; - } - - @Override - public Class getMapOutputValueClass() { - // Auto-generated - return null; - } - - @Override - public Class> 
getMapperClass()
-          throws ClassNotFoundException {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public int getMaxMapAttempts() {
-        // Auto-generated
-        return 0;
-      }
-
-      @Override
-      public int getMaxReduceAttempts() {
-        // Auto-generated
-        return 0;
-      }
-
-      @Override
-      public int getNumReduceTasks() {
-        // Auto-generated
-        return 0;
-      }
-
-      @Override
-      public Class<? extends OutputFormat<?, ?>> getOutputFormatClass()
-          throws ClassNotFoundException {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public Class<?> getOutputKeyClass() {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public Class<?> getOutputValueClass() {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public Class<? extends Partitioner<?, ?>> getPartitionerClass()
-          throws ClassNotFoundException {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public boolean getProfileEnabled() {
-        // Auto-generated
-        return false;
-      }
-
-      @Override
-      public String getProfileParams() {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public IntegerRanges getProfileTaskRange(boolean arg0) {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public Class<? extends Reducer<?, ?, ?, ?>> getReducerClass()
-          throws ClassNotFoundException {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public RawComparator<?> getSortComparator() {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public boolean getSymlink() {
-        // Auto-generated
-        return false;
-      }
-
-      @Override
-      public boolean getTaskCleanupNeeded() {
-        // Auto-generated
-        return false;
-      }
-
-      @Override
-      public String getUser() {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public Path getWorkingDirectory() throws IOException {
-        // Auto-generated
-        return null;
-      }
-
-      @Override
-      public void progress() {
-        // Auto-generated
-      }
-    };
-
+  @SuppressWarnings("unchecked")
+  private void initContext() {
+    context = Mockito.mock(Reducer.Context.class,
+        Mockito.withSettings().defaultAnswer(Mockito.RETURNS_DEFAULTS));
+
+    Mockito.lenient().when(context.getConfiguration()).thenReturn(config);
+
+    Mockito.lenient().when(context.getCounter(ArgumentMatchers.<Enum<?>>any()))
+        .thenAnswer(inv -> counters.findCounter(inv.getArgument(0, Enum.class)));
+
+    Mockito.lenient().when(context.getCounter(Mockito.anyString(), Mockito.anyString()))
+        .thenAnswer(inv -> counters.findCounter(
+            inv.getArgument(0, String.class), inv.getArgument(1, String.class)));
+
+    try {
+      Mockito.doAnswer(new Answer<Void>() {
+        @Override
+        @SuppressWarnings("unchecked")
+        public Void answer(InvocationOnMock inv) {
+          KEYOUT k = inv.getArgument(0);
+          VALUEOUT v = inv.getArgument(1);
+          valuesOut.put(k, v);
+          return null;
+        }
+      }).when(context).write(Mockito.any(), Mockito.any());
+    } catch (IOException | InterruptedException e) {
+      throw new IllegalStateException(e);
+    }
   }
-
 }
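> Note: for context, a usage sketch of the rewritten wrapper. `CrawlDbReducer` and `NutchConfiguration` are existing Nutch classes, but the fragment itself is illustrative, not part of this patch; since `Reducer.setup()` is protected, a real test must live in the reducer's package, as the existing CrawlDb tests do.

```java
// Illustrative fragment: drives a Reducer through the mocked context.
// write() lands in the map handed to the wrapper, and getCounter() is
// backed by a real Counters instance, so both can be asserted on.
Configuration conf = NutchConfiguration.create();
Map<Text, CrawlDatum> output = new HashMap<>();
CrawlDbReducer reducer = new CrawlDbReducer();
ReducerContextWrapper<Text, CrawlDatum, Text, CrawlDatum> wrapper =
    new ReducerContextWrapper<>(reducer, conf, output);
Reducer<Text, CrawlDatum, Text, CrawlDatum>.Context context = wrapper.getContext();
reducer.setup(context);  // reads conf via context.getConfiguration()
context.write(new Text("http://example.com/"), new CrawlDatum());
assert output.size() == 1;  // captured by the stubbed write()
```
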