Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
63 changes: 61 additions & 2 deletions pixi.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ license = "MIT"
# and tool.pixi.package.run-dependencies below. This means that the package cannot be
# installed with pip or uv, but it is set up to be built with pixi into a conda package
# which can be installed with conda, mamba, micromamba, pixi, etc.
dependencies = []
dependencies = ["usaddress>=0.5.16,<0.6"]

[project.urls]
"Homepage" = "https://github.com/catalyst-cooperative/pudl"
Expand Down
7 changes: 7 additions & 0 deletions src/pudl/extract/ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,6 +205,13 @@
"dbf": "f1_othr_reg_liab",
"xbrl": "other_regulatory_liabilities_account_254_278",
},
"core_ferc1__yearly_identification_certification": {
"dbf": "f1_ident_attsttn",
"xbrl": [
"identification_001",
"corporate_officer_certification_001",
],
Comment on lines +210 to +213
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

ow double xbrl tables! neat

},
}
"""A mapping of PUDL DB table names to their XBRL and DBF source table names."""

Expand Down
43 changes: 43 additions & 0 deletions src/pudl/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,6 +29,7 @@
import polars as pl
import requests
import sqlalchemy as sa
import usaddress
from dagster import AssetKey, AssetsDefinition, AssetSelection, AssetSpec
from pandas._libs.missing import NAType
from pydantic import BaseModel, Field
Expand Down Expand Up @@ -2573,3 +2574,45 @@ def normalize_year_fragments(
f"Year out of expected range ({min_valid_year}-{max_valid_year}) in values: {bad}"
)
return year


def parse_address(addr: str):
    """Parse a U.S. address into (street_address, city, state, zip_code) components.

    Uses :func:`usaddress.tag` to label the pieces of a free-form address string,
    then reassembles the street-level pieces into a single street address column.

    Args:
        addr: A raw address string. NA values (``None``/``NaN``) are passed
            through unparsed.

    Returns:
        A 4-tuple ``(street_address, city, state, zip_code)``. Any component
        that could not be identified is ``None``. If the input is NA or cannot
        be parsed (:class:`usaddress.RepeatedLabelError`), the original value
        is returned in the first position with ``None`` for the rest.
    """
    try:
        if pd.isna(addr):
            return (addr, None, None, None)
        tagged, _addr_type = usaddress.tag(addr)

        # Strip whitespace from each tagged component; treat empties as missing.
        parsed = {key: (val.strip() if val else None) for key, val in tagged.items()}

        # Occupancy (e.g. "Suite 100") is only meaningful when the occupancy
        # type exists; join only the present pieces so we never render "None".
        occupancy = None
        if parsed.get("OccupancyType"):
            occupancy = " ".join(
                part
                for part in (
                    parsed.get("OccupancyType"),
                    parsed.get("OccupancyIdentifier"),
                )
                if part
            )

        # Concatenate the street-level parts into one column. The occupancy
        # string already contains OccupancyType, so it is not added separately.
        street_parts = [
            parsed.get("AddressNumber"),
            parsed.get("StreetNamePreDirectional"),
            parsed.get("StreetName"),
            parsed.get("StreetNamePostType"),
            parsed.get("StreetNamePostDirectional"),
            occupancy,
        ]
        # Filter out missing/empty parts so the join has no doubled spaces.
        street_address = " ".join(part for part in street_parts if part)

        return (
            street_address or None,
            parsed.get("PlaceName"),
            parsed.get("StateName"),
            parsed.get("ZipCode"),
        )
    except usaddress.RepeatedLabelError:
        logger.warning(f"Could not parse {addr}")
        return (addr, None, None, None)
139 changes: 137 additions & 2 deletions src/pudl/transform/ferc1.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,8 +27,14 @@

import pudl
from pudl.extract.ferc1 import TABLE_NAME_MAP_FERC1
from pudl.helpers import assert_cols_areclose, convert_cols_dtypes
from pudl.helpers import (
assert_cols_areclose,
convert_cols_dtypes,
parse_address,
standardize_phone_column,
)
from pudl.metadata import PUDL_PACKAGE
from pudl.metadata.dfs import POLITICAL_SUBDIVISIONS
from pudl.metadata.fields import apply_pudl_dtypes
from pudl.settings import Ferc1Settings
from pudl.transform.classes import (
Expand Down Expand Up @@ -158,6 +164,7 @@ class TableIdFerc1(enum.Enum):
OTHER_REGULATORY_LIABILITIES = (
"core_ferc1__yearly_other_regulatory_liabilities_sched278"
)
IDENTIFICATION_CERTIFICATION = "core_ferc1__yearly_identification_certification"


################################################################################
Expand Down Expand Up @@ -3071,6 +3078,133 @@ def reconcile_table_calculations(
return df


class IdentificationCertificationTableTransformer(Ferc1AbstractTableTransformer):
"""Transformer class for the :ref:`core_ferc1__yearly_identification_certification` table."""

table_id: TableIdFerc1 = TableIdFerc1.IDENTIFICATION_CERTIFICATION

def source_table_primary_key(self, source_ferc1: SourceFerc1) -> list[str]:
    """Look up the pre-renaming source table primary key columns.

    The identification table does not have spplmnt_num or row_number,
    which are part of the DBF primary key for every other DBF table.
    """
    if source_ferc1 == SourceFerc1.DBF:
        # Deliberately hard-coded ordering for the DBF primary key.
        return [
            "report_year",
            "report_prd",
            "respondent_id",
            "submission_type",
        ]
    assert source_ferc1 == SourceFerc1.XBRL  # nosec: B101
    # XBRL keys are report_year + entity_id plus any *_axis dimension columns.
    # Sorting the axis columns avoids depending on rename_columns ordering.
    axis_cols = sorted(
        col
        for col in self.params.rename_columns_ferc1.xbrl.columns
        if col.endswith("_axis")
    )
    return ["report_year", "entity_id", *axis_cols]

@cache_df(key="dbf")
def drop_unused_original_columns_dbf(self, df: pd.DataFrame) -> pd.DataFrame:
    """No-op override: keep the DBF columns the base class would drop.

    NOTE(review): presumably this table needs columns (e.g. the submission
    metadata) that the base-class implementation removes — confirm intent
    and document which columns must be retained.
    """
    return df
Comment on lines +3109 to +3112
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

you need this? or are you specifically wanting to not drop these? if yes then just add comment plzzz


def transform_main(self, df):
    """Standard ``transform_main`` plus identification-specific cleanup.

    On top of the generic transforms this:

    * standardizes the contact phone number column,
    * verifies ``is_migrated_data`` is entirely null and drops it,
    * title-cases contact/attestation name and title columns,
    * parses date columns (unparseable values become ``NaT``),
    * nulls out placeholder values in ``prior_utility_name_ferc1``,
    * splits the office and contact addresses into street/city/state/zip
      components, and
    * standardizes state columns to subdivision codes.
    """
    df = (
        super()
        .transform_main(df)
        .pipe(standardize_phone_column, columns=["contact_phone"])
    )

    # is_migrated_data has never contained data. Raise explicitly rather
    # than ``assert`` so the check also runs under ``python -O`` and we
    # never silently drop a column that has started carrying real values.
    if not df.is_migrated_data.isna().all():
        raise AssertionError(
            f"{self.table_id.value}: expected is_migrated_data to be all null "
            "before dropping it."
        )
    df = df.drop(columns="is_migrated_data")

    title_cols = [
        "contact_name",
        "contact_title",
        "attestation_name",
        "attestation_title",
    ]
    for col in title_cols:
        df[col] = df[col].str.title()

    date_cols = ["attestation_date", "filing_date", "name_change_date"]
    for col in date_cols:
        # Coerce unparseable values to NaT instead of raising.
        df[col] = pd.to_datetime(df[col], errors="coerce")

    # Placeholder strings that really mean "no prior utility name".
    to_null = [
        "",
        "not applicable",
        "na",
        "n/a",
        "none",
        "no change",
        "x",
        "xxx",
        "z",
        "zzz",
    ]
    # Build a single case-insensitive full-match regex pattern.
    pattern = r"(?i)^(" + "|".join(map(re.escape, to_null)) + r")$"
    df["prior_utility_name_ferc1"] = df["prior_utility_name_ferc1"].replace(
        pattern, pd.NA, regex=True
    )

    # Split each raw address column into street/city/state/zip components.
    # The street component overwrites the original raw address column.
    address_outputs = {
        "office_street_address": [
            "office_street_address",
            "office_city",
            "office_state",
            "office_zip_code",
        ],
        "contact_address": [
            "contact_address",
            "contact_city",
            "contact_state",
            "contact_zip_code",
        ],
    }
    for source_col, out_cols in address_outputs.items():
        df[out_cols] = pd.DataFrame(
            df[source_col].apply(parse_address).tolist(),
            index=df.index,
        )

    # Map full state names (upper-cased) to standard subdivision codes;
    # values that are already valid codes are left as-is.
    state_map = dict(
        zip(
            POLITICAL_SUBDIVISIONS.subdivision_name.str.upper(),
            POLITICAL_SUBDIVISIONS.subdivision_code,
            strict=True,
        )
    )
    for col in ["office_state", "contact_state"]:
        df[col] = df[col].str.upper()
        df[col] = np.where(
            df[col].isin(state_map.values()), df[col], df[col].map(state_map)
        )

    return df

# Transforms to add
# report_filing_type --> enum O/R

@cache_df(key="end")
def transform_end(self, df: pd.DataFrame) -> pd.DataFrame:
    """Standardized final cleanup after the transformations are done.

    NOTE(review): calculation reconciliation, schema enforcement, and the
    empty-dataframe / all-null-column checks that other tables perform here
    are temporarily disabled for this table pending its metadata definition —
    restore them once the table schema exists.
    """
    return df
Comment on lines +3190 to +3205
Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

presumably you'll delete this when you add metadata?



class SteamPlantsFuelTableTransformer(Ferc1AbstractTableTransformer):
"""A table transformer specific to the :ref:`core_ferc1__yearly_steam_plants_fuel_sched402` table.

Expand Down Expand Up @@ -6114,6 +6248,7 @@ class OtherRegulatoryLiabilitiesTableTransformer(Ferc1AbstractTableTransformer):
"core_ferc1__yearly_cash_flows_sched120": CashFlowsTableTransformer,
"core_ferc1__yearly_sales_by_rate_schedules_sched304": SalesByRateSchedulesTableTransformer,
"core_ferc1__yearly_other_regulatory_liabilities_sched278": OtherRegulatoryLiabilitiesTableTransformer,
"core_ferc1__yearly_identification_certification": IdentificationCertificationTableTransformer,
}


Expand Down Expand Up @@ -6159,7 +6294,7 @@ def ferc1_transform_asset_factory(

table_id = TableIdFerc1(table_name)

@asset(name=table_name, ins=ins, io_manager_key=io_manager_key)
@asset(name=table_name, ins=ins) # io_manager_key=io_manager_key)
def ferc1_transform_asset(**kwargs: dict[str, pd.DataFrame]) -> pd.DataFrame:
"""Transform a FERC Form 1 table.

Expand Down
Loading
Loading