diff --git a/CHANGELOG.md b/CHANGELOG.md index c08d57b1c..fc7b10b64 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ ## [Unreleased] +- Added a workaround for `nltk.txt` package downloader errors caused by an upstream regression in NLTK v3.9.3. ([#2041](https://github.com/heroku/heroku-buildpack-python/pull/2041)) - Changed the S3 URL used to download Python to use AWS' dual-stack (IPv6 compatible) endpoint. ([#2035](https://github.com/heroku/heroku-buildpack-python/pull/2035)) ## [v335] - 2026-02-10 diff --git a/bin/steps/nltk b/bin/steps/nltk index a1317ebf7..0e72cdd23 100755 --- a/bin/steps/nltk +++ b/bin/steps/nltk @@ -25,10 +25,11 @@ if is_module_available 'nltk'; then readarray -t nltk_packages <"${nltk_packages_definition}" output::step "Downloading NLTK packages: ${nltk_packages[*]}" - nltk_data_dir="/app/.heroku/python/nltk_data" - + # Note: We have to use the raw build directory path here and not the symlinked `/app` path, + # otherwise it will cause a false positive in NLTK v3.9.3's new Zip Slip security check, + # which doesn't handle symlinked paths correctly: https://github.com/nltk/nltk/issues/3509 # TODO: Does this even need user-provided env vars, or can we remove the sub_env usage here? - if ! sub_env python -m nltk.downloader -d "${nltk_data_dir}" "${nltk_packages[@]}" |& output::indent; then + if ! sub_env python -m nltk.downloader -d "${BUILD_DIR}/.heroku/python/nltk_data" "${nltk_packages[@]}" |& output::indent; then output::error <<-EOF Error: Unable to download NLTK data. @@ -41,7 +42,9 @@ if is_module_available 'nltk'; then exit 1 fi - set_env NLTK_DATA "${nltk_data_dir}" + # Since this will be used at runtime, we must use the symlinked `/app` path and not + # the raw build directory path. + set_env NLTK_DATA "/app/.heroku/python/nltk_data" else build_data::set_string "nltk_downloader" "skipped-no-nltk-file" echo " 'nltk.txt' not found, not downloading any corpora" diff --git a/spec/hatchet/nltk_spec.rb b/spec/hatchet/nltk_spec.rb index 56ed95ce4..1c5a5a59d 100644 --- a/spec/hatchet/nltk_spec.rb +++ b/spec/hatchet/nltk_spec.rb @@ -13,10 +13,10 @@ remote: -----> Downloading NLTK packages: city_database stopwords remote: .*: RuntimeWarning: 'nltk.downloader' found in sys.modules after import of package 'nltk', but prior to execution of 'nltk.downloader'; this may result in unpredictable behaviour remote: \\[nltk_data\\] Downloading package city_database to - remote: \\[nltk_data\\] /app/.heroku/python/nltk_data... + remote: \\[nltk_data\\] /tmp/build_.+/.heroku/python/nltk_data... remote: \\[nltk_data\\] Unzipping corpora/city_database.zip. remote: \\[nltk_data\\] Downloading package stopwords to - remote: \\[nltk_data\\] /app/.heroku/python/nltk_data... + remote: \\[nltk_data\\] /tmp/build_.+/.heroku/python/nltk_data... remote: \\[nltk_data\\] Unzipping corpora/stopwords.zip. REGEX