From 17ae3018ba2f8d7148fba64386e59f184fff69f4 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 20:07:37 -0700 Subject: [PATCH 01/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 97 +++++++++++++++++++++++++++++++++++++ README.md | 7 +++ 2 files changed, 104 insertions(+) create mode 100644 .github/workflows/yetus.yml diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml new file mode 100644 index 0000000000..36cb846335 --- /dev/null +++ b/.github/workflows/yetus.yml @@ -0,0 +1,97 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +# Apache Yetus test-patch: pre-commit patch testing (Ant, JDK 17). +# Runs alongside master-build.yml; all CI is unified on Java 17. +# See https://yetus.apache.org/documentation/0.15.1/precommit/ + +name: Apache Yetus +on: + push: + branches: [master] + pull_request: + types: [opened, synchronize, reopened] + branches: [master] + +jobs: + yetus: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v5 + with: + path: src + fetch-depth: 0 + - name: Apache Yetus test-patch + uses: apache/yetus-test-patch-action@main + with: + basedir: ${{ github.workspace }}/src + branch-default: master + build-tool: ant + githubtoken: ${{ secrets.GITHUB_TOKEN }} + # Use Java 17 path if available in Yetus image; default may be Java 11 + javahome: '/usr/lib/jvm/java-17-openjdk-amd64' + patch-dir: ${{ github.workspace }}/build/out + plugins: all + project: nutch + run-tests: false + - name: Artifact output + if: always() + uses: actions/upload-artifact@v4 + with: + name: apacheyetuspatchdir + path: ${{ github.workspace }}/build/out + - name: Install pandoc + if: github.event_name == 'pull_request' + run: sudo apt-get update && sudo apt-get install -y pandoc + - name: Convert HTML report to Markdown + if: github.event_name == 'pull_request' + run: | + OUT="${{ github.workspace }}/build/out" + echo "## Apache Yetus test-patch report" > yetus-report.md + echo "" >> yetus-report.md + if [ -f "$OUT/report.html" ]; then + pandoc "$OUT/report.html" -f html -t gfm >> yetus-report.md 2>/dev/null || { + echo "Pandoc conversion failed; using brief report." >> yetus-report.md + echo '```' >> yetus-report.md + cat "$OUT/brief.txt" >> yetus-report.md 2>/dev/null || true + echo '```' >> yetus-report.md + } + elif [ -f "$OUT/brief.txt" ]; then + echo '```' >> yetus-report.md + cat "$OUT/brief.txt" >> yetus-report.md + echo '```' >> yetus-report.md + else + echo "No Yetus report or brief found." >> yetus-report.md + fi + - name: Truncate if over comment limit + if: github.event_name == 'pull_request' + run: | + MAX=60000 + if [ $(wc -c < yetus-report.md) -gt $MAX ]; then + head -c $MAX yetus-report.md > yetus-report-trimmed.md + echo "" >> yetus-report-trimmed.md + echo "" >> yetus-report-trimmed.md + echo "_Report truncated (GitHub comment limit). Full HTML in apacheyetuspatchdir artifact as report.html._" >> yetus-report-trimmed.md + mv yetus-report-trimmed.md yetus-report.md + fi + - name: Comment PR with Yetus report + if: github.event_name == 'pull_request' + uses: peter-evans/create-or-update-comment@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + repository: ${{ github.repository }} + issue-number: ${{ github.event.pull_request.number }} + body-path: yetus-report.md diff --git a/README.md b/README.md index fa68816042..2be9bf37e8 100644 --- a/README.md +++ b/README.md @@ -38,6 +38,13 @@ To contribute a patch, follow these instructions (note that installing 11. `git push -u NUTCH-xxxx` 12. `hub pull-request` (if hub is not installed, please follow the instructions how to [create a pull-request from a fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork)) +Pre-commit / Apache Yetus +------------------------- +Pull requests run [Apache Yetus](https://yetus.apache.org/) test-patch for automated checks (compile, tests, style, reporting). See the [Basic Precommit](https://yetus.apache.org/documentation/0.15.1/precommit/) docs and [Usage Introduction](https://yetus.apache.org/documentation/0.15.1/precommit/usage-intro/). CI uses the Ant build tool and Java 17, in line with the main master-build workflow. To run test-patch locally (e.g. before opening a PR): + + test-patch --basedir=/path/to/clean/repo --build-tool=ant --plugins=ant,javac,javadoc,xml [patchfile] + +Use `--run-tests` to include unit tests. Exclude patterns can be added in `.yetus/excludes.txt` (regex, one per line). IDE setup ========= From 4aab926a0013c8deed2a0e0b1df8148cb5710c98 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 20:13:00 -0700 Subject: [PATCH 02/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 36cb846335..a8e065c523 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -19,8 +19,6 @@ name: Apache Yetus on: - push: - branches: [master] pull_request: types: [opened, synchronize, reopened] branches: [master] From 94e7fb4bf33b36591a5af46ea242e110fab8c01a Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 21:30:48 -0700 Subject: [PATCH 03/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/master-build.yml | 54 ------------------------------ .github/workflows/yetus.yml | 38 +++++++++++++++++---- build.xml | 4 +-- 3 files changed, 34 insertions(+), 62 deletions(-) diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index 76d98db4eb..3ae6faae9b 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -31,60 +31,6 @@ on: # javac.version=11 bytecode target for backward compatibility. jobs: - javadoc: - strategy: - matrix: - java: ['17'] - os: [ubuntu-latest] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v5 - - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@v5 - with: - java-version: ${{ matrix.java }} - distribution: 'temurin' - - name: Cache Ivy dependencies - uses: actions/cache@v4 - with: - path: ~/.ivy2/cache - key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} - restore-keys: | - ${{ runner.os }}-ivy- - - name: Javadoc - run: ant clean javadoc -buildfile build.xml - - rat: - strategy: - matrix: - java: ['17'] - os: [ubuntu-latest] - runs-on: ${{ matrix.os }} - steps: - - uses: actions/checkout@v5 - - name: Set up JDK ${{ matrix.java }} - uses: actions/setup-java@v5 - with: - java-version: ${{ matrix.java }} - distribution: 'temurin' - - name: Cache Ivy dependencies - uses: actions/cache@v4 - with: - path: ~/.ivy2/cache - key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} - restore-keys: | - ${{ runner.os }}-ivy- - - name: Run Apache Rat - run: ant clean run-rat -buildfile build.xml - - name: Cache unknown licenses - run: echo "UNKNOWN_LICENSES=$(sed -n 18p /home/runner/work/nutch/nutch/build/apache-rat-report.txt)" >> $GITHUB_ENV - - name: Versions - run: | - echo $UNKNOWN_LICENSES - - name: Fail if any unknown licenses - if: ${{ env.UNKNOWN_LICENSES != '0 Unknown Licenses' }} - run: exit 1 - # Build verification with Java bytecode target matrix # Verifies bytecode compatibility for both Java 11 and Java 17 targets build: diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index a8e065c523..28fff4d5ec 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -19,21 +19,37 @@ name: Apache Yetus on: + push: + branches: [master] pull_request: types: [opened, synchronize, reopened] branches: [master] +concurrency: + group: yetus-${{ github.ref }} + cancel-in-progress: true + jobs: yetus: runs-on: ubuntu-latest + timeout-minutes: 45 + env: + PATCH_DIR: ${{ github.workspace }}/out steps: - name: Checkout uses: actions/checkout@v5 with: path: src fetch-depth: 0 + - name: Cache Ivy dependencies + uses: actions/cache@v4 + with: + path: ~/.ivy2/cache + key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} + restore-keys: | + ${{ runner.os }}-ivy- - name: Apache Yetus test-patch - uses: apache/yetus-test-patch-action@main + uses: apache/yetus-test-patch-action@0.15.1 with: basedir: ${{ github.workspace }}/src branch-default: master @@ -41,7 +57,7 @@ jobs: githubtoken: ${{ secrets.GITHUB_TOKEN }} # Use Java 17 path if available in Yetus image; default may be Java 11 javahome: '/usr/lib/jvm/java-17-openjdk-amd64' - patch-dir: ${{ github.workspace }}/build/out + patch-dir: ${{ env.PATCH_DIR }} plugins: all project: nutch run-tests: false @@ -50,14 +66,14 @@ jobs: uses: actions/upload-artifact@v4 with: name: apacheyetuspatchdir - path: ${{ github.workspace }}/build/out + path: ${{ env.PATCH_DIR }} - name: Install pandoc if: github.event_name == 'pull_request' run: sudo apt-get update && sudo apt-get install -y pandoc - name: Convert HTML report to Markdown if: github.event_name == 'pull_request' run: | - OUT="${{ github.workspace }}/build/out" + OUT="${{ env.PATCH_DIR }}" echo "## Apache Yetus test-patch report" > yetus-report.md echo "" >> yetus-report.md if [ -f "$OUT/report.html" ]; then @@ -85,11 +101,21 @@ jobs: echo "_Report truncated (GitHub comment limit). Full HTML in apacheyetuspatchdir artifact as report.html._" >> yetus-report-trimmed.md mv yetus-report-trimmed.md yetus-report.md fi + - name: Find existing Yetus comment + if: github.event_name == 'pull_request' + id: find + uses: peter-evans/find-comment@v4 + with: + token: ${{ secrets.GITHUB_TOKEN }} + repository: ${{ github.repository }} + issue-number: ${{ github.event.pull_request.number }} + body-includes: "## Apache Yetus test-patch report" - name: Comment PR with Yetus report if: github.event_name == 'pull_request' - uses: peter-evans/create-or-update-comment@v4 + uses: peter-evans/create-or-update-comment@v5 with: token: ${{ secrets.GITHUB_TOKEN }} repository: ${{ github.repository }} issue-number: ${{ github.event.pull_request.number }} - body-path: yetus-report.md + comment-id: ${{ steps.find.outputs.comment-id }} + body-path: yetus-report.md \ No newline at end of file diff --git a/build.xml b/build.xml index 2dffdb7699..a83b3dc6c2 100644 --- a/build.xml +++ b/build.xml @@ -1028,8 +1028,8 @@ - + From 0d6f36935b6f451d91b681c8686195ef0ad95134 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 21:33:21 -0700 Subject: [PATCH 04/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 28fff4d5ec..16015a0cca 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -101,18 +101,9 @@ jobs: echo "_Report truncated (GitHub comment limit). Full HTML in apacheyetuspatchdir artifact as report.html._" >> yetus-report-trimmed.md mv yetus-report-trimmed.md yetus-report.md fi - - name: Find existing Yetus comment - if: github.event_name == 'pull_request' - id: find - uses: peter-evans/find-comment@v4 - with: - token: ${{ secrets.GITHUB_TOKEN }} - repository: ${{ github.repository }} - issue-number: ${{ github.event.pull_request.number }} - body-includes: "## Apache Yetus test-patch report" - name: Comment PR with Yetus report if: github.event_name == 'pull_request' - uses: peter-evans/create-or-update-comment@v5 + uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 with: token: ${{ secrets.GITHUB_TOKEN }} repository: ${{ github.repository }} From 3b5548ddfc5218378f1515af3afa7a6a0ad10930 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 21:42:18 -0700 Subject: [PATCH 05/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 14 ++++---------- 1 file changed, 4 insertions(+), 10 deletions(-) diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 16015a0cca..73ce4b8f4e 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -36,11 +36,7 @@ jobs: env: PATCH_DIR: ${{ github.workspace }}/out steps: - - name: Checkout - uses: actions/checkout@v5 - with: - path: src - fetch-depth: 0 + - uses: actions/checkout@v5 - name: Cache Ivy dependencies uses: actions/cache@v4 with: @@ -51,16 +47,14 @@ jobs: - name: Apache Yetus test-patch uses: apache/yetus-test-patch-action@0.15.1 with: - basedir: ${{ github.workspace }}/src - branch-default: master - build-tool: ant + basedir: . + buildtool: ant githubtoken: ${{ secrets.GITHUB_TOKEN }} # Use Java 17 path if available in Yetus image; default may be Java 11 javahome: '/usr/lib/jvm/java-17-openjdk-amd64' - patch-dir: ${{ env.PATCH_DIR }} + patchdir: ${{ env.PATCH_DIR }} plugins: all project: nutch - run-tests: false - name: Artifact output if: always() uses: actions/upload-artifact@v4 From fa36c0ccf9b10868c64e0d0b7c8f4791e3cf8fc1 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 21:44:01 -0700 Subject: [PATCH 06/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/junit-report.yml | 5 +++++ .github/workflows/master-build.yml | 4 ++++ .github/workflows/sonarcloud.yml | 5 +++++ 3 files changed, 14 insertions(+) diff --git a/.github/workflows/junit-report.yml b/.github/workflows/junit-report.yml index a1f5ff6497..5e9bb20ca8 100644 --- a/.github/workflows/junit-report.yml +++ b/.github/workflows/junit-report.yml @@ -18,6 +18,11 @@ on: workflow_run: workflows: [master pull request ci] types: [completed] + +concurrency: + group: junit-report-${{ github.event.workflow_run.pull_requests[0].number || github.event.workflow_run.head_sha }} + cancel-in-progress: true + permissions: checks: write contents: read diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index 3ae6faae9b..1e06f6737d 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -21,6 +21,10 @@ on: types: [opened, synchronize, reopened] branches: [master] +concurrency: + group: master-build-${{ github.ref }} + cancel-in-progress: true + # Java Version Strategy: # - BUILD: Requires Java 17+ (JUnit 6 dependency) # - RUNTIME: Supports Java 11+ (javac.version=11 produces Java 11 bytecode) diff --git a/.github/workflows/sonarcloud.yml b/.github/workflows/sonarcloud.yml index 92c609483b..894a78fe49 100644 --- a/.github/workflows/sonarcloud.yml +++ b/.github/workflows/sonarcloud.yml @@ -18,6 +18,11 @@ on: workflow_run: workflows: [master pull request ci] types: [completed] + +concurrency: + group: sonarcloud-${{ github.event.workflow_run.pull_requests[0].number || github.event.workflow_run.head_sha }} + cancel-in-progress: true + jobs: analysis: if: github.event.workflow_run.conclusion == 'success' From ebf542157b051d24e71d8f2e46d10635fbb4136b Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 21:46:52 -0700 Subject: [PATCH 07/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 73ce4b8f4e..16864ca39d 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -37,6 +37,8 @@ jobs: PATCH_DIR: ${{ github.workspace }}/out steps: - uses: actions/checkout@v5 + with: + fetch-depth: 0 - name: Cache Ivy dependencies uses: actions/cache@v4 with: From fcb8c01b2cd1e5ebd03295205cda093dfa7726ee Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 21:49:54 -0700 Subject: [PATCH 08/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 16864ca39d..7e0739fb09 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -33,8 +33,13 @@ jobs: yetus: runs-on: ubuntu-latest timeout-minutes: 45 + permissions: + contents: read + statuses: write + pull-requests: write env: PATCH_DIR: ${{ github.workspace }}/out + JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 steps: - uses: actions/checkout@v5 with: From 0006e83bbac5bbe28802fdb28d7e0422c02db6a9 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 21:58:54 -0700 Subject: [PATCH 09/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 27 ++++++++++++++++++++++----- .yetus/personality.sh | 20 ++++++++++++++++++++ 2 files changed, 42 insertions(+), 5 deletions(-) create mode 100644 .yetus/personality.sh diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 7e0739fb09..a680b829c3 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -29,14 +29,15 @@ concurrency: group: yetus-${{ github.ref }} cancel-in-progress: true +permissions: + contents: read + statuses: write + pull-requests: write + jobs: yetus: runs-on: ubuntu-latest timeout-minutes: 45 - permissions: - contents: read - statuses: write - pull-requests: write env: PATCH_DIR: ${{ github.workspace }}/out JAVA_HOME: /usr/lib/jvm/java-17-openjdk-amd64 @@ -51,6 +52,17 @@ jobs: key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} restore-keys: | ${{ runner.os }}-ivy- + - name: Cache Yetus Docker image + id: yetus-image-cache + uses: actions/cache@v4 + with: + path: /tmp/yetus-docker-cache + key: ${{ runner.os }}-yetus-image-0.15.1 + - name: Load Yetus Docker image + run: | + if [ -f /tmp/yetus-docker-cache/yetus-image.tar ]; then + docker load -i /tmp/yetus-docker-cache/yetus-image.tar + fi - name: Apache Yetus test-patch uses: apache/yetus-test-patch-action@0.15.1 with: @@ -60,7 +72,7 @@ jobs: # Use Java 17 path if available in Yetus image; default may be Java 11 javahome: '/usr/lib/jvm/java-17-openjdk-amd64' patchdir: ${{ env.PATCH_DIR }} - plugins: all + plugins: all,-jira project: nutch - name: Artifact output if: always() @@ -68,6 +80,11 @@ jobs: with: name: apacheyetuspatchdir path: ${{ env.PATCH_DIR }} + - name: Save Yetus Docker image to cache + if: always() && steps.yetus-image-cache.outputs.cache-hit != 'true' + run: | + mkdir -p /tmp/yetus-docker-cache + docker save ghcr.io/apache/yetus:0.15.1 -o /tmp/yetus-docker-cache/yetus-image.tar - name: Install pandoc if: github.event_name == 'pull_request' run: sudo apt-get update && sudo apt-get install -y pandoc diff --git a/.yetus/personality.sh b/.yetus/personality.sh new file mode 100644 index 0000000000..477404e93c --- /dev/null +++ b/.yetus/personality.sh @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Ensure JAVA_HOME is set for pre-patch and other phases when running in +# the Yetus Docker container (avoids "JAVA_HOME is not defined" in pre-patch). +if [ -z "${JAVA_HOME}" ] && [ -d "/usr/lib/jvm/java-17-openjdk-amd64" ]; then + export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64" +fi From 2c7e07c8d5d830b12840480b94f93a2ccda18c9b Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 22:04:59 -0700 Subject: [PATCH 10/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 2 +- .yetus/personality.sh | 14 ++++++++++++++ 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index a680b829c3..9e0775435f 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -72,7 +72,7 @@ jobs: # Use Java 17 path if available in Yetus image; default may be Java 11 javahome: '/usr/lib/jvm/java-17-openjdk-amd64' patchdir: ${{ env.PATCH_DIR }} - plugins: all,-jira + plugins: all,-jira,-gitlab project: nutch - name: Artifact output if: always() diff --git a/.yetus/personality.sh b/.yetus/personality.sh index 477404e93c..e64acb85b1 100644 --- a/.yetus/personality.sh +++ b/.yetus/personality.sh @@ -18,3 +18,17 @@ if [ -z "${JAVA_HOME}" ] && [ -d "/usr/lib/jvm/java-17-openjdk-amd64" ]; then export JAVA_HOME="/usr/lib/jvm/java-17-openjdk-amd64" fi + +# Pass JAVA_HOME into the re-exec Docker container so pre-patch and other +# phases see it (YETUS-913; otherwise the inner container may not get it). +function docker_do_env_adds +{ + declare k + DOCKER_EXTRAARGS+=("--env=JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64") + for k in "${DOCKER_EXTRAENVS[@]}"; do + [[ -z "${k}" ]] && continue + if [[ "JAVA_HOME" != "${k}" ]]; then + DOCKER_EXTRAARGS+=("--env=${k}=${!k}") + fi + done +} From 523318c28e372b157ff6c7824af117ad94d94d59 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 22:19:49 -0700 Subject: [PATCH 11/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 22 ++-------------------- 1 file changed, 2 insertions(+), 20 deletions(-) diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 9e0775435f..700b1b8678 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -52,27 +52,15 @@ jobs: key: ${{ runner.os }}-ivy-${{ hashFiles('ivy/ivy.xml', 'src/plugin/**/ivy.xml') }} restore-keys: | ${{ runner.os }}-ivy- - - name: Cache Yetus Docker image - id: yetus-image-cache - uses: actions/cache@v4 - with: - path: /tmp/yetus-docker-cache - key: ${{ runner.os }}-yetus-image-0.15.1 - - name: Load Yetus Docker image - run: | - if [ -f /tmp/yetus-docker-cache/yetus-image.tar ]; then - docker load -i /tmp/yetus-docker-cache/yetus-image.tar - fi - name: Apache Yetus test-patch uses: apache/yetus-test-patch-action@0.15.1 with: basedir: . - buildtool: ant + buildtool: nobuild githubtoken: ${{ secrets.GITHUB_TOKEN }} - # Use Java 17 path if available in Yetus image; default may be Java 11 javahome: '/usr/lib/jvm/java-17-openjdk-amd64' patchdir: ${{ env.PATCH_DIR }} - plugins: all,-jira,-gitlab + plugins: all,-jira,-gitlab,-unit,-compile project: nutch - name: Artifact output if: always() @@ -80,11 +68,6 @@ jobs: with: name: apacheyetuspatchdir path: ${{ env.PATCH_DIR }} - - name: Save Yetus Docker image to cache - if: always() && steps.yetus-image-cache.outputs.cache-hit != 'true' - run: | - mkdir -p /tmp/yetus-docker-cache - docker save ghcr.io/apache/yetus:0.15.1 -o /tmp/yetus-docker-cache/yetus-image.tar - name: Install pandoc if: github.event_name == 'pull_request' run: sudo apt-get update && sudo apt-get install -y pandoc @@ -126,5 +109,4 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} repository: ${{ github.repository }} issue-number: ${{ github.event.pull_request.number }} - comment-id: ${{ steps.find.outputs.comment-id }} body-path: yetus-report.md \ No newline at end of file From e872cd8e087a32900cbbfea8ee1e0747f9a36aab Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 22:27:26 -0700 Subject: [PATCH 12/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 2 +- .yamllint.yml | 6 ++++++ .yetus/personality.sh | 2 ++ README.md | 13 ++++++++++--- 4 files changed, 19 insertions(+), 4 deletions(-) create mode 100644 .yamllint.yml diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 700b1b8678..00e3bd3638 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -59,7 +59,7 @@ jobs: buildtool: nobuild githubtoken: ${{ secrets.GITHUB_TOKEN }} javahome: '/usr/lib/jvm/java-17-openjdk-amd64' - patchdir: ${{ env.PATCH_DIR }} + patchdir: /github/workspace/out plugins: all,-jira,-gitlab,-unit,-compile project: nutch - name: Artifact output diff --git a/.yamllint.yml b/.yamllint.yml new file mode 100644 index 0000000000..2f2860ecbb --- /dev/null +++ b/.yamllint.yml @@ -0,0 +1,6 @@ +# Relax line-length for workflow and config YAML (Yetus yamllint plugin) +extends: default + +rules: + line-length: + max: 200 diff --git a/.yetus/personality.sh b/.yetus/personality.sh index e64acb85b1..4d0d1428b3 100644 --- a/.yetus/personality.sh +++ b/.yetus/personality.sh @@ -21,6 +21,8 @@ fi # Pass JAVA_HOME into the re-exec Docker container so pre-patch and other # phases see it (YETUS-913; otherwise the inner container may not get it). +# @audience private +# @stability stable function docker_do_env_adds { declare k diff --git a/README.md b/README.md index 2be9bf37e8..4c3a71447d 100644 --- a/README.md +++ b/README.md @@ -40,11 +40,18 @@ To contribute a patch, follow these instructions (note that installing Pre-commit / Apache Yetus ------------------------- -Pull requests run [Apache Yetus](https://yetus.apache.org/) test-patch for automated checks (compile, tests, style, reporting). See the [Basic Precommit](https://yetus.apache.org/documentation/0.15.1/precommit/) docs and [Usage Introduction](https://yetus.apache.org/documentation/0.15.1/precommit/usage-intro/). CI uses the Ant build tool and Java 17, in line with the main master-build workflow. To run test-patch locally (e.g. before opening a PR): - test-patch --basedir=/path/to/clean/repo --build-tool=ant --plugins=ant,javac,javadoc,xml [patchfile] +Pull requests run [Apache Yetus](https://yetus.apache.org/) test-patch for +automated checks (style, reporting). See +[Basic Precommit](https://yetus.apache.org/documentation/0.15.1/precommit/) +and +[Usage Introduction](https://yetus.apache.org/documentation/0.15.1/precommit/usage-intro/). +CI uses Java 17. To run test-patch locally (e.g. before opening a PR): -Use `--run-tests` to include unit tests. Exclude patterns can be added in `.yetus/excludes.txt` (regex, one per line). + test-patch --basedir=/path/to/clean/repo --build-tool=nobuild \ + --plugins=all,-jira,-gitlab,-unit,-compile [patchfile] + +Exclude patterns can be added in `.yetus/excludes.txt` (regex, one per line). IDE setup ========= From 54c6742c7345d3b5e70950d92fa6f054ea12da4c Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 22:33:45 -0700 Subject: [PATCH 13/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/master-build.yml | 2 +- .markdownlint.yaml | 2 ++ .yetus/personality.sh | 4 ++-- build.xml | 6 +++--- 4 files changed, 8 insertions(+), 6 deletions(-) create mode 100644 .markdownlint.yaml diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index 1e06f6737d..8d40357049 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -67,7 +67,7 @@ jobs: # Java 11 = major version 55, Java 17 = major version 61 EXPECTED_VERSION=${{ matrix.javac-version == '11' && '55' || '61' }} echo "Expected major version: $EXPECTED_VERSION (Java ${{ matrix.javac-version }})" - + # Find a real class file (exclude package-info.class which may have different version) cd build/classes CLASS_FILE=$(find . -name "*.class" ! -name "package-info.class" | head -1) diff --git a/.markdownlint.yaml b/.markdownlint.yaml new file mode 100644 index 0000000000..35dac9468b --- /dev/null +++ b/.markdownlint.yaml @@ -0,0 +1,2 @@ +# Allow fenced code blocks (```) in addition to indented (Yetus markdownlint plugin) +MD046: fenced diff --git a/.yetus/personality.sh b/.yetus/personality.sh index 4d0d1428b3..5bf04aaff0 100644 --- a/.yetus/personality.sh +++ b/.yetus/personality.sh @@ -21,8 +21,8 @@ fi # Pass JAVA_HOME into the re-exec Docker container so pre-patch and other # phases see it (YETUS-913; otherwise the inner container may not get it). -# @audience private -# @stability stable +## @audience private +## @stability stable function docker_do_env_adds { declare k diff --git a/build.xml b/build.xml index a83b3dc6c2..b2d8a59cf4 100644 --- a/build.xml +++ b/build.xml @@ -57,15 +57,15 @@ From 3f7200f37a696792577035bb42233d4241ce3202 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 22:42:43 -0700 Subject: [PATCH 14/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 4 ++-- .markdownlint.yaml | 1 + .yamllint.yml | 6 +++++- README.md | 4 +--- 4 files changed, 9 insertions(+), 6 deletions(-) diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 00e3bd3638..3709c4643f 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -16,7 +16,7 @@ # Apache Yetus test-patch: pre-commit patch testing (Ant, JDK 17). # Runs alongside master-build.yml; all CI is unified on Java 17. # See https://yetus.apache.org/documentation/0.15.1/precommit/ - +--- name: Apache Yetus on: push: @@ -109,4 +109,4 @@ jobs: token: ${{ secrets.GITHUB_TOKEN }} repository: ${{ github.repository }} issue-number: ${{ github.event.pull_request.number }} - body-path: yetus-report.md \ No newline at end of file + body-path: yetus-report.md diff --git a/.markdownlint.yaml b/.markdownlint.yaml index 35dac9468b..fa793b9cb8 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -1,2 +1,3 @@ # Allow fenced code blocks (```) in addition to indented (Yetus markdownlint plugin) +--- MD046: fenced diff --git a/.yamllint.yml b/.yamllint.yml index 2f2860ecbb..0d5ef33ea8 100644 --- a/.yamllint.yml +++ b/.yamllint.yml @@ -1,6 +1,10 @@ -# Relax line-length for workflow and config YAML (Yetus yamllint plugin) +# Relax rules for workflow and config YAML (Yetus yamllint plugin) +--- extends: default rules: line-length: max: 200 + document-start: disable + truthy: + allowed-values: ['true', 'false', 'yes', 'no'] diff --git a/README.md b/README.md index 4c3a71447d..dced04b751 100644 --- a/README.md +++ b/README.md @@ -60,9 +60,7 @@ IDE setup Generate Eclipse project files -``` -ant eclipse -``` + ant eclipse and follow the instructions in [Importing existing projects](https://help.eclipse.org/2019-06/topic/org.eclipse.platform.doc.user/tasks/tasks-importproject.htm). From 72416c51e81b746634ce0e022a6e878c2ac33293 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 22:51:25 -0700 Subject: [PATCH 15/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .markdownlint.yaml | 12 +++++++++++- .yamllint.yml | 2 +- .yetus/personality.sh | 3 ++- 3 files changed, 14 insertions(+), 3 deletions(-) diff --git a/.markdownlint.yaml b/.markdownlint.yaml index fa793b9cb8..a35fb44e39 100644 --- a/.markdownlint.yaml +++ b/.markdownlint.yaml @@ -1,3 +1,13 @@ -# Allow fenced code blocks (```) in addition to indented (Yetus markdownlint plugin) +# Relaxations for Yetus markdownlint (README and docs) --- +MD001: false +MD003: false +MD012: false +MD013: + line_length: 200 +MD022: false +MD025: false +MD033: false +MD034: false +MD045: false MD046: fenced diff --git a/.yamllint.yml b/.yamllint.yml index 0d5ef33ea8..0e2b09745d 100644 --- a/.yamllint.yml +++ b/.yamllint.yml @@ -7,4 +7,4 @@ rules: max: 200 document-start: disable truthy: - allowed-values: ['true', 'false', 'yes', 'no'] + allowed-values: ['true', 'false', 'yes', 'no', 'on', 'off'] diff --git a/.yetus/personality.sh b/.yetus/personality.sh index 5bf04aaff0..d9da6ab782 100644 --- a/.yetus/personality.sh +++ b/.yetus/personality.sh @@ -26,7 +26,8 @@ fi function docker_do_env_adds { declare k - DOCKER_EXTRAARGS+=("--env=JAVA_HOME=/usr/lib/jvm/java-17-openjdk-amd64") + # Use JAVA_HOME so detsecrets does not flag the literal path as high-entropy + DOCKER_EXTRAARGS+=("--env=JAVA_HOME=${JAVA_HOME}") for k in "${DOCKER_EXTRAENVS[@]}"; do [[ -z "${k}" ]] && continue if [[ "JAVA_HOME" != "${k}" ]]; then From 9c82edd988ff402195348004e99492a361968a75 Mon Sep 17 00:00:00 2001 From: lewismc Date: Fri, 13 Mar 2026 23:02:53 -0700 Subject: [PATCH 16/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .yetus/blanks-eol.txt | 3 + README.md | 130 +++++++++++++++++++++++++++--------------- 2 files changed, 88 insertions(+), 45 deletions(-) create mode 100644 .yetus/blanks-eol.txt diff --git a/.yetus/blanks-eol.txt b/.yetus/blanks-eol.txt new file mode 100644 index 0000000000..2362619874 --- /dev/null +++ b/.yetus/blanks-eol.txt @@ -0,0 +1,3 @@ +# Ignore trailing blanks in Yetus-generated patch/diff and logs (not source files). +# See --blanks-eol-ignore-file in the blanks plugin. +^out/ diff --git a/README.md b/README.md index dced04b751..13699fb98b 100644 --- a/README.md +++ b/README.md @@ -1,42 +1,54 @@ Apache Nutch README =================== -[![master pull request ci](https://github.com/apache/nutch/actions/workflows/master-build.yml/badge.svg)](https://github.com/apache/nutch/actions/workflows/master-build.yml) -[![Quality Gate Status](https://sonarcloud.io/api/project_badges/measure?project=apache_nutch&metric=alert_status)](https://sonarcloud.io/summary/new_code?id=apache_nutch) +[![master pull request ci][ci-badge]][ci-link] +[![Quality Gate Status][sonar-badge]][sonar-link] - +[ci-badge]: https://github.com/apache/nutch/actions/workflows/master-build.yml/badge.svg +[ci-link]: https://github.com/apache/nutch/actions/workflows/master-build.yml +[sonar-badge]: https://sonarcloud.io/api/project_badges/measure?project=apache_nutch&metric=alert_status +[sonar-link]: https://sonarcloud.io/summary/new_code?id=apache_nutch + +![Nutch logo][logo] + +[logo]: https://nutch.apache.org/assets/img/nutch_logo_tm.png For the latest information about Nutch, please visit our website at: - https://nutch.apache.org/ + and our wiki, at: - https://cwiki.apache.org/confluence/display/NUTCH/Home + To get started using Nutch read Tutorial: - https://cwiki.apache.org/confluence/display/NUTCH/NutchTutorial + Contributing -============ +------------ + To contribute a patch, follow these instructions (note that installing [Hub](https://hub.github.com/) is not strictly required, but is recommended). 0. Download and install hub.github.com -1. File JIRA issue for your fix at https://issues.apache.org/jira/projects/NUTCH/issues +1. File JIRA issue for your fix at + - you will get issue id NUTCH-xxxx where xxxx is the issue ID. 2. `git clone https://github.com/apache/nutch.git` 3. `cd nutch` 4. `git checkout -b NUTCH-xxxx` 5. edit files (please try and include a test case if possible) 6. `git status` (make sure it shows what files you expected to edit) -7. Make sure that your code complies with the [Nutch codeformatting template](https://raw.githubusercontent.com/apache/nutch/master/eclipse-codeformat.xml), which is basially two space indents +7. Make sure that your code complies with the [Nutch codeformatting + template][eclipse-format], which is basically two space indents 8. `git add ` 9. `git commit -m "fix for NUTCH-xxx contributed by "` -10. `hub fork` (if hub is not installed, you can fork the project using the "fork" button on the [Nutch Github project page](https://github.com/apache/nutch)) +10. `hub fork` (if hub is not installed, fork using the "fork" button on the + [Nutch Github project page](https://github.com/apache/nutch)) 11. `git push -u NUTCH-xxxx` -12. `hub pull-request` (if hub is not installed, please follow the instructions how to [create a pull-request from a fork](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork)) +12. `hub pull-request` (if hub is not installed, please follow the + instructions to [create a pull-request from a fork][pr-from-fork]) Pre-commit / Apache Yetus ------------------------- @@ -44,55 +56,69 @@ Pre-commit / Apache Yetus Pull requests run [Apache Yetus](https://yetus.apache.org/) test-patch for automated checks (style, reporting). See [Basic Precommit](https://yetus.apache.org/documentation/0.15.1/precommit/) -and -[Usage Introduction](https://yetus.apache.org/documentation/0.15.1/precommit/usage-intro/). -CI uses Java 17. To run test-patch locally (e.g. before opening a PR): +and [Usage Introduction][yetus-usage]. CI uses Java 17. To run test-patch +locally (e.g. before opening a PR): - test-patch --basedir=/path/to/clean/repo --build-tool=nobuild \ - --plugins=all,-jira,-gitlab,-unit,-compile [patchfile] +```bash +test-patch --basedir=/path/to/clean/repo --build-tool=nobuild \ + --plugins=all,-jira,-gitlab,-unit,-compile [patchfile] +``` Exclude patterns can be added in `.yetus/excludes.txt` (regex, one per line). IDE setup -========= +--------- ### Eclipse Generate Eclipse project files - ant eclipse +```bash +ant eclipse +``` -and follow the instructions in [Importing existing projects](https://help.eclipse.org/2019-06/topic/org.eclipse.platform.doc.user/tasks/tasks-importproject.htm). +and follow the instructions in [Importing existing projects][eclipse-import]. -You must [configure the nutch-site.xml](https://cwiki.apache.org/confluence/display/NUTCH/RunNutchInEclipse) before running. Make sure, you've added ```http.agent.name``` and ```plugin.folders``` properties. The plugin.folders normally points to ```/build/plugins```. +You must [configure the nutch-site.xml][runnutch] before running. Make sure you +have added `http.agent.name` and `plugin.folders` properties. The +plugin.folders normally points to `/build/plugins`. -Now create a Java Application Configuration, choose org.apache.nutch.crawl.Injector, add two paths as arguments. First one is the crawldb directory, second one is the URL directory where, the injector can read urls. Now run your configuration. - -If we still see the ```No plugins found on paths of property plugin.folders="plugins"```, update the plugin.folders in the nutch-default.xml, this is a quick fix, but should not be used. +Now create a Java Application Configuration, choose +org.apache.nutch.crawl.Injector, add two paths as arguments: first the crawldb +directory, second the URL directory where the injector can read urls. Then run +your configuration. +If we still see "No plugins found on paths of property plugin.folders=plugins", +update the plugin.folders in the nutch-default.xml; this is a quick fix, but +should not be used. ### Intellij IDEA -First install the [IvyIDEA Plugin](https://plugins.jetbrains.com/plugin/3612-ivyidea). then run ```ant eclipse```. This will create the necessary -.classpath and .project files so that Intellij can import the project in the next step. - -In Intellij IDEA, select File > New > Project from Existing Sources. Select the nutch home directory and click "Open". +First install the [IvyIDEA Plugin][ivyidea]. Then run `ant eclipse`. This +creates the .classpath and .project files so Intellij can import the project. -On the "Import Project" screen select the "Import project from external model" radio button and select "Eclipse". -Click "Create". On the next screen the "Eclipse projects directory" should be already set to the nutch folder. -Leave the "Create module files near .classpath files" radio button selected. -Click "Next" on the next screens. On the project SDK screen select Java 11 and click "Create". -**N.B.** For anyone on a Mac with a homebrew-installed openjdk, you need to use the directory under _libexec_: `/libexec/openjdk.jdk/Contents/Home`. +In Intellij IDEA, select File > New > Project from Existing Sources. Select the +nutch home directory and click "Open". -Once the project is imported, you will see a popup saying "Ant build scripts found", "Frameworks detected - IvyIDEA Framework detected". Click "Import". -If you don't get the pop-up, I'd suggest going through the steps again as this happens from time to time. There is another -Ant popup that asks you to configure the project. Do NOT click "Configure". +On the "Import Project" screen select the "Import project from external model" +radio button and select "Eclipse". Click "Create". On the next screen the +"Eclipse projects directory" should be already set to the nutch folder. Leave +the "Create module files near .classpath files" radio button selected. -To import the code-style, Go to Intellij IDEA > Preferences > Editor > Code Style > Java. +Click "Next" on the next screens. On the project SDK screen select Java 11 and +click "Create". **N.B.** On Mac with homebrew openjdk, use the directory under +_libexec_: `/libexec/openjdk.jdk/Contents/Home`. -For the Scheme dropdown select "Project". Click the gear icon and select "Import Scheme" > "Eclipse XML file". +Once the project is imported, you will see a popup saying "Ant build scripts +found", "Frameworks detected - IvyIDEA Framework detected". Click "Import". If +you don't get the pop-up, go through the steps again as this happens from time +to time. There is another Ant popup that asks you to configure the project. Do +NOT click "Configure". -Select the eclipse-format.xml file and click "Open". On next screen check the "Current Scheme" checkbox and hit OK. +To import the code-style: Intellij IDEA > Preferences > Editor > Code Style > +Java. For the Scheme dropdown select "Project". Click the gear icon and select +"Import Scheme" > "Eclipse XML file". Select the eclipse-format.xml file and +click "Open". On the next screen check the "Current Scheme" checkbox and hit OK. ### Running in Intellij IDEA @@ -100,10 +126,24 @@ Running in Intellij - Open Run/Debug Configurations - Select "+" to create a new configuration and select "Application" -- For "Main Class" enter a class with a main function (e.g. org.apache.nutch.indexer.IndexingJob). -- For "Program Arguments" add the arguments needed for the class. You can get these by running the crawl executable for your job. Use full-qualified paths. (e.g. /Users/kamil/workspace/external/nutch/crawl/crawldb /Users/kamil/workspace/external/nutch/crawl/segments/20221222160141 -deleteGone) -- For "Working Directory" enter "/Users/kamil/workspace/external/nutch/runtime/local". -- Select "Modify options" > "Modify Classpath" and add the config directory belonging to the "Working Directory" from the previous step (e.g. /Users/kamil/workspace/external/nutch/runtime/local/conf). This will allow the resource loader to load that configuration. -- Select "Modify options" > "Add VM Options". Add the VM options needed. You can get these by running the crawl executable for your job (e.g. -Xmx4096m -Dhadoop.log.dir=/Users/kamil/workspace/external/nutch/runtime/local/logs -Dhadoop.log.file=hadoop.log -Dmapreduce.job.reduces=2 -Dmapreduce.reduce.speculative=false -Dmapreduce.map.speculative=false -Dmapreduce.map.output.compress=true) - -**Note**: You will need to manually trigger a build through ANT to get latest updated changes when running. This is because the ant build system is separate from the Intellij one. +- For "Main Class" enter a class with a main function + (e.g. org.apache.nutch.indexer.IndexingJob) +- For "Program Arguments" add the arguments needed for the class. You can get + these by running the crawl executable for your job. Use full-qualified paths + (e.g. crawldb and segments paths plus -deleteGone) +- For "Working Directory" enter your nutch runtime/local path +- Select "Modify options" > "Modify Classpath" and add the config directory for + that Working Directory (e.g. runtime/local/conf) +- Select "Modify options" > "Add VM Options" and add the VM options from + running the crawl executable (e.g. -Xmx4096m -Dhadoop.log.dir=... etc.) + +**Note**: You will need to manually trigger a build through ANT to get latest +updated changes when running, because the ant build system is separate from +the Intellij one. + +[eclipse-import]: https://help.eclipse.org/2019-06/topic/org.eclipse.platform.doc.user/tasks/tasks-importproject.htm +[eclipse-format]: https://raw.githubusercontent.com/apache/nutch/master/eclipse-codeformat.xml +[pr-from-fork]: https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/proposing-changes-to-your-work-with-pull-requests/creating-a-pull-request-from-a-fork +[runnutch]: https://cwiki.apache.org/confluence/display/NUTCH/RunNutchInEclipse +[ivyidea]: https://plugins.jetbrains.com/plugin/3612-ivyidea +[yetus-usage]: https://yetus.apache.org/documentation/0.15.1/precommit/usage-intro/ From 5929118cc27b66156e25ffb774d74a0d63a04d23 Mon Sep 17 00:00:00 2001 From: lewismc Date: Sat, 14 Mar 2026 14:19:17 -0700 Subject: [PATCH 17/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/yetus.yml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/yetus.yml b/.github/workflows/yetus.yml index 3709c4643f..8db586a959 100644 --- a/.github/workflows/yetus.yml +++ b/.github/workflows/yetus.yml @@ -29,10 +29,13 @@ concurrency: group: yetus-${{ github.ref }} cancel-in-progress: true +# GITHUB_TOKEN cannot comment on PRs from forks (403). Use a PAT secret +# (e.g. YETUS_COMMENT_TOKEN) if you need comments on fork PRs. permissions: contents: read statuses: write pull-requests: write + issues: write jobs: yetus: @@ -105,8 +108,9 @@ jobs: - name: Comment PR with Yetus report if: github.event_name == 'pull_request' uses: peter-evans/create-or-update-comment@e8674b075228eee787fea43ef493e45ece1004c9 + continue-on-error: true with: - token: ${{ secrets.GITHUB_TOKEN }} + token: ${{ secrets.YETUS_COMMENT_TOKEN || secrets.GITHUB_TOKEN }} repository: ${{ github.repository }} issue-number: ${{ github.event.pull_request.number }} body-path: yetus-report.md From 323293af02d5ed59bd9856803919d75085ba7112 Mon Sep 17 00:00:00 2001 From: lewismc Date: Tue, 21 Apr 2026 15:06:18 -0700 Subject: [PATCH 18/18] NUTCH-3163 Integrate Apache Yetus' pre-commit patch testing into Nutch GitHub Continuous Integration --- .github/workflows/master-build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/master-build.yml b/.github/workflows/master-build.yml index 9370ed6630..a4419b8937 100644 --- a/.github/workflows/master-build.yml +++ b/.github/workflows/master-build.yml @@ -79,7 +79,7 @@ jobs: restore-keys: | ${{ runner.os }}-ivy- - name: Run Apache Rat - run: ant clean run-rat -buildfile build.xml + run: ant clean releaseaudit -buildfile build.xml - name: Cache unknown licenses run: echo "UNKNOWN_LICENSES=$(sed -n 18p /home/runner/work/nutch/nutch/build/apache-rat-report.txt)" >> $GITHUB_ENV - name: Versions