-
-
Notifications
You must be signed in to change notification settings - Fork 133
WIP: Add FERC Form 1 identification table to PUDL #5008
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
5ed40cc
f9a3f83
c008d44
3abf821
25a6566
cc2c965
7fe7878
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,8 +27,14 @@ | |
|
|
||
| import pudl | ||
| from pudl.extract.ferc1 import TABLE_NAME_MAP_FERC1 | ||
| from pudl.helpers import assert_cols_areclose, convert_cols_dtypes | ||
| from pudl.helpers import ( | ||
| assert_cols_areclose, | ||
| convert_cols_dtypes, | ||
| parse_address, | ||
| standardize_phone_column, | ||
| ) | ||
| from pudl.metadata import PUDL_PACKAGE | ||
| from pudl.metadata.dfs import POLITICAL_SUBDIVISIONS | ||
| from pudl.metadata.fields import apply_pudl_dtypes | ||
| from pudl.settings import Ferc1Settings | ||
| from pudl.transform.classes import ( | ||
|
|
@@ -158,6 +164,7 @@ class TableIdFerc1(enum.Enum): | |
| OTHER_REGULATORY_LIABILITIES = ( | ||
| "core_ferc1__yearly_other_regulatory_liabilities_sched278" | ||
| ) | ||
| IDENTIFICATION_CERTIFICATION = "core_ferc1__yearly_identification_certification" | ||
|
|
||
|
|
||
| ################################################################################ | ||
|
|
@@ -3071,6 +3078,133 @@ def reconcile_table_calculations( | |
| return df | ||
|
|
||
|
|
||
class IdentificationCertificationTableTransformer(Ferc1AbstractTableTransformer):
    """Transformer class for the :ref:`core_ferc1__yearly_identification_certification` table."""

    table_id: TableIdFerc1 = TableIdFerc1.IDENTIFICATION_CERTIFICATION

    def source_table_primary_key(self, source_ferc1: SourceFerc1) -> list[str]:
        """Look up the pre-renaming source table primary key columns.

        The identification table does not have spplmnt_num or row_number,
        which are part of the DBF primary key for every other DBF table.

        Args:
            source_ferc1: Which FERC Form 1 source (DBF or XBRL) the key is for.

        Returns:
            The primary key column names as they appear before renaming. For
            DBF this is a fixed, hard-coded ordering. For XBRL it is
            ``report_year`` and ``entity_id`` followed by any ``*_axis``
            columns found in the XBRL rename parameters, sorted by name.
        """
        if source_ferc1 == SourceFerc1.DBF:
            pk_cols = [
                "report_year",
                "report_prd",
                "respondent_id",
                "submission_type",
            ]
        else:
            assert source_ferc1 == SourceFerc1.XBRL  # nosec: B101
            cols = self.params.rename_columns_ferc1.xbrl.columns
            pk_cols = ["report_year", "entity_id"]
            # Sort to avoid dependence on the ordering of rename_columns.
            # Doing the sorting here because we have a particular ordering
            # hard coded for the DBF primary keys.
            pk_cols += sorted(col for col in cols if col.endswith("_axis"))
        return pk_cols

    @cache_df(key="dbf")
    def drop_unused_original_columns_dbf(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the DBF dataframe unchanged; no columns are dropped here.

        NOTE(review): presumably this overrides a parent implementation that
        drops DBF-specific columns which this table either lacks or needs to
        keep -- confirm, and record the actual reason here.
        """
        return df

    def transform_main(self, df: pd.DataFrame) -> pd.DataFrame:
        """Run the standard main transform, then clean identification fields.

        After the parent class's ``transform_main`` this method:

        * standardizes ``contact_phone`` with ``standardize_phone_column``;
        * verifies that ``is_migrated_data`` is entirely null and drops it;
        * title-cases the contact and attestation name / title columns;
        * converts the date columns to datetimes, coercing unparseable
          values to ``NaT``;
        * nulls out placeholder strings in ``prior_utility_name_ferc1``;
        * splits the office and contact addresses into street address, city,
          state and zip code columns using ``parse_address``;
        * maps upper-cased subdivision names in the state columns to their
          subdivision codes, leaving values that are already codes as-is.

        Args:
            df: The dataframe to transform.

        Returns:
            The cleaned dataframe, without the ``is_migrated_data`` column.

        Raises:
            AssertionError: If ``is_migrated_data`` contains any non-null value.
        """
        df = (
            super()
            .transform_main(df)
            .pipe(standardize_phone_column, columns=["contact_phone"])
        )

        # Only drop is_migrated_data once we know it carries no information.
        # Raise explicitly (rather than ``assert``) so the check is not
        # stripped when Python runs with optimizations enabled.
        if not df.is_migrated_data.isna().all():
            raise AssertionError(
                f"{self.table_id.value}: expected is_migrated_data to be "
                "entirely null before dropping it."
            )
        df = df.drop(columns="is_migrated_data")

        title_cols = [
            "contact_name",
            "contact_title",
            "attestation_name",
            "attestation_title",
        ]
        for col in title_cols:
            df[col] = df[col].str.title()

        date_cols = ["attestation_date", "filing_date", "name_change_date"]
        for col in date_cols:
            df[col] = pd.to_datetime(df[col], errors="coerce")

        # Placeholder strings that stand in for "no prior utility name".
        to_null = [
            "",
            "not applicable",
            "na",
            "n/a",
            "none",
            "no change",
            "x",
            "xxx",
            "z",
            "zzz",
        ]
        # Build a single regex pattern that is case insensitive and only
        # matches when the placeholder is the entire value.
        pattern = r"(?i)^(" + "|".join(map(re.escape, to_null)) + r")$"
        df["prior_utility_name_ferc1"] = df["prior_utility_name_ferc1"].replace(
            pattern, pd.NA, regex=True
        )

        # Each raw address column is parsed into four components; the first
        # component overwrites the raw column itself.
        # NOTE(review): per the PR discussion, additional address cleaning was
        # suggested but deferred, since these columns are expected to be
        # normalized again in the record-matching step -- confirm.
        address_cols = {
            "office_street_address": [
                "office_street_address",
                "office_city",
                "office_state",
                "office_zip_code",
            ],
            "contact_address": [
                "contact_address",
                "contact_city",
                "contact_state",
                "contact_zip_code",
            ],
        }
        for raw_col, parsed_cols in address_cols.items():
            df[parsed_cols] = pd.DataFrame(
                df[raw_col].apply(parse_address).tolist(),
                index=df.index,
            )

        # Standardize state columns
        state_map = dict(
            zip(
                POLITICAL_SUBDIVISIONS.subdivision_name.str.upper(),
                POLITICAL_SUBDIVISIONS.subdivision_code,
                strict=True,
            )
        )
        # Hoisted out of the loop: the set of valid codes does not change.
        state_codes = set(state_map.values())

        for col in ["office_state", "contact_state"]:
            df[col] = df[col].str.upper()
            # Keep values that are already subdivision codes; otherwise map
            # a full subdivision name to its code (unmapped values become null).
            df[col] = np.where(
                df[col].isin(state_codes), df[col], df[col].map(state_map)
            )

        return df

    # TODO: Transforms to add
    # report_filing_type --> enum O/R

    @cache_df(key="end")
    def transform_end(self, df: pd.DataFrame) -> pd.DataFrame:
        """Return the dataframe unchanged (final cleanup is disabled for now).

        The usual end-of-transform steps (reconciling calculations, enforcing
        the schema, and checking for empty dataframes or entirely null
        columns) are commented out below, so this override is currently a
        pass-through.

        NOTE(review): per the PR discussion, this override is presumably meant
        to be removed once metadata for the table has been added -- confirm.
        """
        # TODO: restore the standard checks below (or delete this override).
        # df = self.reconcile_table_calculations(df).pipe(self.enforce_schema)
        # if df.empty:
        #     raise ValueError(f"{self.table_id.value}: Final dataframe is empty!!!")
        # for col in df:
        #     if df[col].isna().all():
        #         raise ValueError(
        #             f"{self.table_id.value}: Column {col} is entirely NULL!"
        #         )
        return df
|
Comment on lines
+3190
to
+3205
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. presumably you'll delete this when you add metadata? |
||
|
|
||
|
|
||
| class SteamPlantsFuelTableTransformer(Ferc1AbstractTableTransformer): | ||
| """A table transformer specific to the :ref:`core_ferc1__yearly_steam_plants_fuel_sched402` table. | ||
|
|
||
|
|
@@ -6114,6 +6248,7 @@ class OtherRegulatoryLiabilitiesTableTransformer(Ferc1AbstractTableTransformer): | |
| "core_ferc1__yearly_cash_flows_sched120": CashFlowsTableTransformer, | ||
| "core_ferc1__yearly_sales_by_rate_schedules_sched304": SalesByRateSchedulesTableTransformer, | ||
| "core_ferc1__yearly_other_regulatory_liabilities_sched278": OtherRegulatoryLiabilitiesTableTransformer, | ||
| "core_ferc1__yearly_identification_certification": IdentificationCertificationTableTransformer, | ||
| } | ||
|
|
||
|
|
||
|
|
@@ -6159,7 +6294,7 @@ def ferc1_transform_asset_factory( | |
|
|
||
| table_id = TableIdFerc1(table_name) | ||
|
|
||
| @asset(name=table_name, ins=ins, io_manager_key=io_manager_key) | ||
| @asset(name=table_name, ins=ins) # io_manager_key=io_manager_key) | ||
| def ferc1_transform_asset(**kwargs: dict[str, pd.DataFrame]) -> pd.DataFrame: | ||
| """Transform a FERC Form 1 table. | ||
|
|
||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Oh, double XBRL tables! Neat.