# Modernize Dagster config and launch scripts (#5071)
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Base branch: `main`
Changes from 122 commits.
---

New file: an example `.env` template (23 lines added):

```shell
# Copy this file to .env and customize values for your local environment.
# The .env file is ignored by git.

# Required workspace paths
PUDL_INPUT=/absolute/path/to/pudl-input
PUDL_OUTPUT=/absolute/path/to/pudl-output
DAGSTER_HOME=/absolute/path/to/dagster-home

# Logging controls (read by pudl.logging_helpers.configure_root_logger)
PUDL_LOGLEVEL=INFO
PUDL_COLOR_LOGS=true

# Optional: write logs to a file in addition to console output.
# Leave unset for console-only logging.
# PUDL_LOGFILE=/absolute/path/to/pudl/logs/pudl.log

# Optional: don't try to use intersphinx to link to external documentation
# during the docs build -- it can be flaky and isn't required for most docs edits.
# PUDL_DOCS_DISABLE_INTERSPHINX=1

# Optional: don't remove generated rst files after the docs build. Can be helpful
# when debugging formatting errors.
# PUDL_DOCS_KEEP_GENERATED_FILES=1
```
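At runtime these variables are read from the environment. The sketch below is a hypothetical helper, written only to illustrate the required-versus-optional split described in the template's comments; the real logic lives in the `pudl` package (e.g. `pudl.logging_helpers.configure_root_logger` for the logging controls):

```python
from pathlib import Path

def collect_pudl_settings(env: dict[str, str]) -> dict:
    """Gather PUDL workspace settings from environment variables.

    Hypothetical illustration only -- not the actual PUDL API. Required
    paths raise KeyError if missing; logging controls fall back to the
    defaults shown in the .env template.
    """
    return {
        # Required workspace paths: fail loudly if unset.
        "pudl_input": Path(env["PUDL_INPUT"]),
        "pudl_output": Path(env["PUDL_OUTPUT"]),
        "dagster_home": Path(env["DAGSTER_HOME"]),
        # Logging controls, with defaults matching the template.
        "loglevel": env.get("PUDL_LOGLEVEL", "INFO"),
        "color_logs": env.get("PUDL_COLOR_LOGS", "true").lower() == "true",
        # Optional: None means console-only logging.
        "logfile": env.get("PUDL_LOGFILE"),
    }

settings = collect_pudl_settings({
    "PUDL_INPUT": "/tmp/pudl-input",
    "PUDL_OUTPUT": "/tmp/pudl-output",
    "DAGSTER_HOME": "/tmp/dagster-home",
})
```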
---

New file (1 line added):

```
skills/**
```
---

```diff
@@ -139,7 +139,7 @@ jobs:
             --container-env OMP_NUM_THREADS=4 \
             --container-env PUDL_BOT_PAT=${{ secrets.PUDL_BOT_PAT }} \
             --container-env PUDL_GCS_OUTPUT=${{ env.PUDL_GCS_OUTPUT }} \
-            --container-env PUDL_SETTINGS_YML="/home/ubuntu/pudl/src/pudl/package_data/settings/etl_full.yml" \
+            --container-env DG_NIGHTLY_CONFIG="src/pudl/package_data/settings/dg_nightly.yml" \
             --container-env SLACK_TOKEN=${{ secrets.PUDL_DEPLOY_SLACK_TOKEN }} \
             --container-env ZENODO_SANDBOX_TOKEN_PUBLISH=${{ secrets.ZENODO_SANDBOX_TOKEN_PUBLISH }} \
             --container-env ZENODO_TARGET_ENV=${{ (startsWith(github.ref_name, 'v20') && 'production') || 'sandbox' }} \
```

**Member (Author):** Just switching to the new dagster config file, which defines some additional dagster settings, and references the same PUDL ETL settings file we were using before internally.
---

```diff
@@ -7,17 +7,23 @@ on:
     types: [created]

 env:
-  username: ${{ github.event.issue.user.login }}
-  url: ${{ github.event.issue.html_url }}
+  username: ""
+  url: ""
   org: catalyst-cooperative

 jobs:
   com-dev-notify:
     name: Notify Catalyst of community activity
     runs-on: ubuntu-latest
     steps:
+      - name: Get username if an issue was opened
+        if: ${{ github.event_name == 'issues' }}
+        run: |
+          echo "username=${{ github.event.issue.user.login }}" >> "${GITHUB_ENV}"
+          echo "url=${{ github.event.issue.html_url }}" >> "${GITHUB_ENV}"
+
       - name: Get username if a discussion was created
-        if: ${{ (github.event_name == 'discussion') }}
+        if: ${{ github.event_name == 'discussion' }}
         run: |
          echo "username=${{ github.event.discussion.user.login }}" >> "${GITHUB_ENV}"
          echo "url=${{ github.event.discussion.html_url }}" >> "${GITHUB_ENV}"
```

**Member (Author):** Moved these down into the jobs because of a linting error, which highlighted that at this point in the workflow, we don't yet know if we're looking at an issue or a discussion, and so there's no single correct value to look at. Down below we know which kind of event we're in so we can use the right value. Empty strings prevent "potentially undefined" warnings.
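The `>> "${GITHUB_ENV}"` lines work because the runner appends `key=value` pairs to a file and re-reads it before the next step, with later assignments overriding earlier ones. A rough stand-in for that mechanism (helper names and sample values are hypothetical, for illustration only):

```python
import tempfile
from pathlib import Path

def append_github_env(env_file: Path, **pairs: str) -> None:
    """Append key=value lines, like `echo "k=v" >> "$GITHUB_ENV"`."""
    with env_file.open("a") as f:
        for key, value in pairs.items():
            f.write(f"{key}={value}\n")

def read_github_env(env_file: Path) -> dict[str, str]:
    """Parse the file into a dict; later lines override earlier ones."""
    env: dict[str, str] = {}
    for line in env_file.read_text().splitlines():
        key, _, value = line.partition("=")
        env[key] = value
    return env

with tempfile.TemporaryDirectory() as tmp:
    env_file = Path(tmp) / "github_env"
    # Top-level defaults, like `username: ""` / `url: ""` in the workflow.
    env_file.write_text("username=\nurl=\n")
    # A later step fills in the event-specific values.
    append_github_env(env_file, username="someuser", url="https://example.com/1")
    env = read_github_env(env_file)
```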
```diff
@@ -36,13 +42,11 @@ jobs:
         uses: slackapi/slack-github-action@v3
         with:
-          # Slack channel id, channel name, or user id to post message.
-          # See also: https://api.slack.com/methods/chat.postMessage#channels
-          # You can pass in multiple channels to post to by providing a comma-delimited list of channel IDs.
-          channel-id: "community-dev"
           # For posting a markdown message
           method: chat.postMessage
+          token: ${{ secrets.COMMUNITY_DEV_SLACK_BOT_TOKEN }}
           payload: |
             {
+              "channel": "community-dev",
               "blocks": [
                 {
                   "type": "section",
```

**Member (Author):** Deprecated syntax was giving an error/warning.

```diff
@@ -53,5 +57,3 @@ jobs:
             }
           ]
         }
-      env:
-        SLACK_BOT_TOKEN: ${{ secrets.COMMUNITY_DEV_SLACK_BOT_TOKEN }}
```
---

```diff
@@ -20,8 +20,8 @@ jobs:
     permissions:
       pull-requests: read
     outputs:
-      # 2025-07-17: because merge_group is an Object and run_code_checks is not a conditional, need to explicitly check for null-ness instead of relying on truthiness.
-      run_code_checks: ${{ github.event_name =='workflow_dispatch' || (steps.filter.outputs.code == 'true' && (github.event.merge_group != null)) }}
+      # Run code checks for manual dispatches and merge queue runs with code changes.
+      run_code_checks: ${{ github.event_name == 'workflow_dispatch' || (github.event_name == 'merge_group' && steps.filter.outputs.code == 'true') }}
```

**Member (Author)** (on lines -23 to +24): Found a more appropriate element of the github environment to check on here.
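The revised expression's logic, restated as a plain function so the truth table is explicit (a sketch; the function name is illustrative, not part of the workflow):

```python
def run_code_checks(event_name: str, code_changed: bool) -> bool:
    """Mirror of the workflow expression:
    event_name == 'workflow_dispatch'
    or (event_name == 'merge_group' and the path filter matched code).
    """
    return event_name == "workflow_dispatch" or (
        event_name == "merge_group" and code_changed
    )

# Manual dispatches always run checks; merge queue runs do so only when
# the path filter found code changes; other events never set this output.
```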
```diff
     steps:
       - uses: actions/checkout@v6
         with:
@@ -47,6 +47,7 @@ jobs:
           - '!.github/workflows/bot-auto-merge.yml'
           - '!.github/workflows/build-deploy-docs.yml'
           - '!.github/workflows/build-deploy-pudl.yml'
+          - '!.github/workflows/deploy-pudl.yml'
           - '!.github/workflows/com-dev-notify.yml'
           - '!.github/workflows/docker-build-test.yml'
           - '!.github/workflows/q-update-issue-scheduler.yml'
@@ -77,7 +78,7 @@ jobs:
         run: |
           echo "event name (${{ github.event_name }}) is workflow dispatch: ${{ github.event_name == 'workflow_dispatch' }}"
           echo "found code changes: ${{ steps.filter.outputs.code }}"
-          echo "merge_group ${{ github.event.merge_group }} is not null: ${{ github.event.merge_group != null }}"
+          echo "event name (${{ github.event_name }}) is merge_group: ${{ github.event_name == 'merge_group' }}"

   ci-docs:
     permissions:
```
---

```diff
@@ -72,6 +72,7 @@ repos:
           |.*\.bib
           |.*\.csv
           |.*\.html
+          |.*\.json
           |src/pudl/package_data/ferc1/.*_categories\.yaml
           )$ | migrations/ | devtools/ | test/ | notebooks/ | src/pudl/metadata/codes.py | src/pudl/transform/params/ferc1.py
         args: [] # Make this read, not write
@@ -131,6 +132,15 @@ repos:
         always_run: false
         entry: pixi run jupyter nbconvert --clear-output

+      - id: pixi-lock-update
+        name: pixi-lock-update
+        stages: [pre-commit]
+        language: system
+        verbose: false
+        pass_filenames: false
+        always_run: true
+        entry: bash -c 'pixi install --quiet && git add pixi.lock'
+
       - id: unit-tests
         name: unit-tests
         stages: [pre-commit]
@@ -150,5 +160,5 @@ ci:
   autoupdate_branch: main
   autoupdate_commit_msg: "[pre-commit.ci] pre-commit autoupdate"
   autoupdate_schedule: weekly
-  skip: [unit-tests, nb-output-clear, shellcheck]
+  skip: [pixi-lock-update, unit-tests, nb-output-clear, shellcheck]
   submodules: false
```

**Member (Author)** (on lines +135 to +142): This handles the annoying little round-trip that happens when you run the tests and as a result pixi updates the hash and version in `pixi.lock`.
**Member (Author):** My goal with the updates to `AGENTS.md`: beyond ~500 lines the general consensus seems to be that you want to start using "progressive disclosure" to be more selective about what gets loaded into context. The initial pattern for that in Nov/Dec of last year was to have additional domain-specific files. The most helpful guide I came across for working on this file was Claude's Skill Authoring Best Practices. The AGENTS.md website was unreasonably vague.
---

The `AGENTS.md` rewrite (`@@ -1,112 +1,69 @@`):

# LLM coding agent instructions for the Public Utility Data Liberation (PUDL) Project
**Removed:**

## PUDL Project Overview

- The PUDL Project implements a data processing pipeline that ingests raw energy system data from public agencies like the US Energy Information Administration (EIA) and the Federal Energy Regulatory Commission (FERC) and transforms it into clean, well-organized tables for use in analysis and modeling.
- PUDL uses the Dagster data orchestration framework to manage dependencies between different assets, and to enable parallel execution of different portions of the data processing pipeline.
- The raw input data for the PUDL data processing pipeline can be found in the directory indicated by the `$PUDL_INPUT` environment variable. The raw inputs are downloaded as needed by the data pipeline, but can be pre-downloaded in bulk using the `pudl_datastore` command line interface.
- The PUDL data processing pipeline primarily generates Apache Parquet files as its outputs. These outputs can be found in `$PUDL_OUTPUT/parquet/`, where `$PUDL_OUTPUT` is an environment variable which should be set by the user.

## Development environment tips

- PUDL uses pixi to manage its Python environment and dependencies. All dependencies and configuration are defined in `pyproject.toml`.
- The default pixi environment includes all development tools.
- To run commands in the pixi environment, prefix them with `pixi run` (e.g., `pixi run pytest`).
- Pixi environments and tasks are defined in `pyproject.toml` under `[tool.pixi]` sections.
- PUDL uses ruff to lint and automatically format Python code. Before staging files for a commit, always run `pixi run pre-commit run ruff-check --all-files` and `pixi run pre-commit run ruff-format --all-files`.
- A number of pre-commit hooks are defined in `.pre-commit-config.yaml`.
- We try to use appropriate type annotations in function, class, and method definitions, but they are not yet checked or enforced. They are primarily to improve readability for humans, LLMs, and IDEs.
**Added:**

## PUDL project overview

- PUDL ingests raw public energy data (EIA, FERC, EPA, and others) and transforms it into clean, analysis-ready tables.
- The pipeline is orchestrated using Dagster assets and jobs.
- Raw inputs are managed through the datastore and are rooted at `$PUDL_INPUT`.
- Primary outputs are Parquet files rooted at `$PUDL_OUTPUT/parquet/`.

## Python environment and tooling

- PUDL uses `pixi` for dependency and task management.
- Use `pixi run <command>` to ensure commands run in the project environment.
- Never try to create or manage Python environments manually; always use `pixi` to ensure consistency.
- Project tasks and environments are defined in `pyproject.toml` under `[tool.pixi]`.
- Git pre-commit hooks are defined in `.pre-commit-config.yaml`.

## Available skills

There are a number of skills defined in `skills-lock.json` that should be available to you. If they're not available, use `pixi run install-skills` to install them.
**Removed:**

## Testing instructions

- PUDL uses pytest to manage its unit and integration tests.
- Tests should avoid using unittest and monkeypatch, and use pytest-mock instead.
- Rather than enumerating various test cases within a single test function, tests should use the `pytest.parametrize` decorator to enumerate test cases, specifying the expected success, failure, or exception for each case.
- Tests must be run inside the pixi environment.
- Disable test coverage collection with `--no-cov` when running individual tests; otherwise coverage checks fail with spurious warnings, since a few tests cover only a small portion of the codebase. For example, the unit tests can be run with `pixi run pytest --no-cov test/unit`.
- We use dbt only for data validation, and NOT for data transformations. The PUDL data tests are under the `dbt/` directory.
- dbt commands must typically be run from within the dbt directory, e.g.: `cd dbt && pixi run dbt build`.
- The PUDL integration tests process a substantial amount of data and take up to an hour to run, so they should not generally be run interactively during development.
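The parametrization pattern described above can be sketched as follows; `parse_year` and the cases are hypothetical, invented only to show how success and exception cases can share one parametrized test body (the actual decorator is `pytest.mark.parametrize`):

```python
import contextlib

import pytest

def parse_year(value: str) -> int:
    """Toy function under test -- hypothetical, for illustration only."""
    year = int(value)
    if not 1900 <= year <= 2100:
        raise ValueError(f"Year out of range: {year}")
    return year

@pytest.mark.parametrize(
    "value,expectation,expected",
    [
        # Success cases pass through the nullcontext unchanged...
        ("2020", contextlib.nullcontext(), 2020),
        # ...while failure cases assert that the exception is raised.
        ("1850", pytest.raises(ValueError), None),
        ("not-a-year", pytest.raises(ValueError), None),
    ],
)
def test_parse_year(value, expectation, expected):
    with expectation:
        assert parse_year(value) == expected
```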
## Code Style Guidelines

- Follow pandas naming conventions: use `df` for DataFrames and descriptive column names.
- Prefer longer, readable, descriptive variable names over short, cryptic ones.
- Use explicit type hints for function parameters and returns where helpful.
- Prefer method chaining for pandas operations when it improves readability.
- Use `pathlib.Path` for file system operations instead of string concatenation.
- Follow snake_case for functions/variables, PascalCase for classes.
- Use f-strings for string formatting, including in logging statements.
- Write docstrings for all public functions/classes using Google-style Python docstrings.
- Limit lines to 88 characters for better readability. Do not artificially restrict line length to 80 characters.
- Do not use `print()` statements; use Python's logging system instead.

## PUDL-Specific Patterns

- Asset dependencies in Dagster should be explicit and well-documented.
- In general, data validation should happen in dbt, not in Dagster asset checks.
- Sanity checks that validate assumptions about the data should be done as it is being transformed, with assertions failing loudly if expectations are not met.
- Use PUDL's existing utility functions in `pudl.helpers` when available.
- Raw data access should use the datastore pattern, not direct file I/O.
- Use nullable pandas dtypes (e.g. `pd.Int64Dtype()` or `pd.StringDtype()`) when possible, to avoid generic `object` dtypes and mixed NULL values.
- Parquet outputs should use snappy compression and pyarrow dtypes.
- Metadata describing the tables, columns, and data sources can be found in the `pudl.metadata` subpackage. "Resources" are tables and "Fields" are columns.
- Metadata classes defined in the `pudl.metadata.classes` module using Pydantic generally mirror the frictionless datapackage standard.
- Our documentation is built using Sphinx. The source files are in the `docs/` directory, in reStructuredText format.
- Whenever we make significant changes to the codebase, they should be noted in the PUDL release notes found at `docs/release_notes.rst`.
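A minimal illustration of the nullable-dtype guidance above (sample values invented for illustration):

```python
import pandas as pd

# With a missing value, a plain integer column silently becomes float64
# (or object), losing its integer dtype.
raw = pd.Series([1, 2, None])

# Nullable extension dtypes keep integers integral and use pd.NA for
# missing values instead of NaN or None.
plant_ids = raw.astype(pd.Int64Dtype())
utility_names = pd.Series(["Gadsby", None, "Huntington"], dtype=pd.StringDtype())
```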
## Performance Considerations

- Use vectorized pandas operations instead of row-wise `apply` or loops.
- Consider using just-in-time compilation with numba for performance-critical code.
- Do not use inplace operations on pandas DataFrames.
- Avoid chained indexing in pandas to prevent `SettingWithCopyWarning`.
- Use efficient pandas merging and joining techniques, ensuring indexes are set appropriately.
- Avoid creating unnecessary intermediate DataFrames.
- Use categorical dtypes for columns with a limited set of values to save memory.
- Profile and optimize any code that processes large datasets.
- PUDL relies primarily on pandas for data processing, but where performance or memory limitations matter, we may also use DuckDB or polars dataframes. For large datasets (>1 GB) or memory-intensive aggregations, consider polars before pandas.
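The first bullet in practice: a row-wise `apply` and its vectorized equivalent produce identical results, but the vectorized form runs as a single columnar operation (toy data, for illustration):

```python
import pandas as pd

# Toy generation data (values invented for illustration).
df = pd.DataFrame({"mwh": [100.0, 250.0, 0.0], "hours": [10.0, 25.0, 5.0]})

# Row-wise apply runs Python code once per row -- slow on large frames.
capacity_slow = df.apply(lambda row: row["mwh"] / row["hours"], axis=1)

# The vectorized equivalent is a single columnar operation.
capacity_fast = df["mwh"] / df["hours"]
```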
**Added:**

## Sandbox safe execution

- Prefer already-installed binaries before invoking commands that may trigger package resolution or updates.
- Prefer direct binaries (for example `dg`, `rg`, `ruff`) when they are already available in the active environment.
- When using `pixi run`, prefer frozen/locked execution modes that avoid dependency updates.
- For sandboxed terminal runs, keep cache and temporary directories writable and local to the workspace when possible, e.g. `TMPDIR`, `PIXI_HOME`.

## Always-on coding expectations

- Prefer explicit, readable code and descriptive names over terse names.
- Follow existing naming conventions and data model conventions.
- Reuse existing project helpers and established patterns before introducing new ones.
- Prefer dbt for data validation by default; use Dagster/Python validation only when there is a clear project-specific reason.

## Where to find additional detailed instructions

- If the `pudl` agent skill is enabled, it should be used to read PUDL database schemas and descriptions of tables and columns.
- If the `pudl-dev` agent skill is enabled, it should be used to inform software development tasks in the PUDL project.
- If the `dagster-expert` agent skill is enabled, it should be used when adding or modifying code related to orchestration of the data processing pipeline, and any of the concepts and classes defined by Dagster. This includes assets, resources, jobs, IO managers, sensors, etc.

## Developer documentation references

- General contributor docs: `docs/dev/`
- Python testing: `docs/dev/testing.rst`
- Data validation quickstart and reference using dbt: `docs/dev/data_validation_quickstart.rst`, `docs/dev/data_validation_reference.rst`
- Dagster development: `docs/dev/dev_dagster.rst`, `docs/dev/run_the_etl.rst`
- Editing PUDL Metadata: `docs/dev/metadata.rst`
- PUDL naming conventions: `docs/dev/naming_conventions.rst`

## Release notes

- Significant user-visible or developer-visible changes should be summarized in `docs/release_notes.rst`.
---

```diff
@@ -31,8 +31,8 @@ function run_ferceqr_etl() {
   authenticate_gcp &&
   dagster dev &

-  # Kick off the ferceqr_etl job asynchronously
-  dagster job backfill --noprompt -j ferceqr_etl --location pudl.etl
+  # Kick off the ferceqr job asynchronously
+  dagster job backfill --noprompt -j ferceqr --location pudl.etl

   # Wait for a file called 'SUCCESS' or 'FAILURE' to be created in PUDL_OUTPUT indicating completion
   # Timeout after 6 hours if file still doesn't exist
   inotifywait -e create -t 21600 --include 'SUCCESS|FAILURE' "$PUDL_OUTPUT"
```

**Member (Author)** (on lines -34 to +35): Update job name.
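The `inotifywait` call blocks until a matching file appears (or the timeout expires). A portable, if less efficient, polling stand-in for that wait logic (the function and names are hypothetical, written only to illustrate the sentinel-file mechanism):

```python
import tempfile
import time
from pathlib import Path

def wait_for_sentinel(output_dir: Path, timeout_s: float, poll_s: float = 0.01):
    """Poll for a SUCCESS or FAILURE sentinel file in output_dir.

    Returns the sentinel's name, or None if the timeout expires first --
    roughly what `inotifywait -e create --include 'SUCCESS|FAILURE'` does,
    minus the efficiency of kernel-level notification.
    """
    deadline = time.monotonic() + timeout_s
    while time.monotonic() < deadline:
        for name in ("SUCCESS", "FAILURE"):
            if (output_dir / name).exists():
                return name
        time.sleep(poll_s)
    return None

with tempfile.TemporaryDirectory() as tmp:
    out = Path(tmp)
    (out / "SUCCESS").touch()  # simulate the job finishing successfully
    result = wait_for_sentinel(out, timeout_s=1.0)
```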
**Member (Author):** I'm predefining environment variables in several workflows to get rid of "potentially invalid reference" linting errors that were cluttering things up. Should have no actual impact.