From baf2a2447c049ba2d6872ac114d26708335b796a Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 7 Apr 2026 09:48:50 -0400 Subject: [PATCH 01/28] Align SPM threshold recalculation --- .../calibration/calibration_utils.py | 44 ++- .../calibration/publish_local_area.py | 263 ++++++++---------- .../test_spm_thresholds.py | 51 ++++ policyengine_us_data/utils/spm.py | 144 +++++++++- 4 files changed, 344 insertions(+), 158 deletions(-) create mode 100644 policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py diff --git a/policyengine_us_data/calibration/calibration_utils.py b/policyengine_us_data/calibration/calibration_utils.py index 8af1bab7a..a4c489bab 100644 --- a/policyengine_us_data/calibration/calibration_utils.py +++ b/policyengine_us_data/calibration/calibration_utils.py @@ -7,10 +7,12 @@ import numpy as np import pandas as pd -from spm_calculator import SPMCalculator, spm_equivalence_scale -from spm_calculator.geoadj import calculate_geoadj_from_rent - -from policyengine_us_data.utils.spm import TENURE_CODE_MAP +from policyengine_us_data.utils.spm import ( + TENURE_CODE_MAP, + calculate_geoadj_from_rent, + get_spm_reference_thresholds, + spm_equivalence_scale, +) from policyengine_us.variables.household.demographic.geographic.state_name import ( StateName, ) @@ -491,6 +493,7 @@ def get_cd_index_mapping(db_uri: str = None): tuple: (cd_to_index dict, index_to_cd dict, cds_ordered list) """ from sqlalchemy import create_engine, text + from pathlib import Path from policyengine_us_data.storage import STORAGE_FOLDER if db_uri is None: @@ -531,7 +534,7 @@ def load_geo_labels(path) -> List[str]: def load_cd_geoadj_values( cds_to_calibrate: List[str], -) -> Dict[str, float]: +) -> Dict[str, Dict[str, float]]: """ Load geographic adjustment factors from rent data CSV. Uses median 2BR rent by CD vs national median to compute geoadj. @@ -550,11 +553,18 @@ def load_cd_geoadj_values( # Build lookup from rent data rent_lookup = {} for _, row in rent_df.iterrows(): - geoadj = calculate_geoadj_from_rent( - local_rent=row["median_2br_rent"], - national_rent=row["national_median_2br_rent"], - ) - rent_lookup[row["cd_geoid"]] = geoadj + rent_lookup[row["cd_geoid"]] = { + tenure: calculate_geoadj_from_rent( + local_rent=row["median_2br_rent"], + national_rent=row["national_median_2br_rent"], + tenure=tenure, + ) + for tenure in ( + "owner_with_mortgage", + "owner_without_mortgage", + "renter", + ) + } geoadj_dict = {} for cd in cds_to_calibrate: @@ -562,7 +572,11 @@ def load_cd_geoadj_values( geoadj_dict[cd] = rent_lookup[cd] else: print(f"Warning: No rent data for CD {cd}, using geoadj=1.0") - geoadj_dict[cd] = 1.0 + geoadj_dict[cd] = { + "owner_with_mortgage": 1.0, + "owner_without_mortgage": 1.0, + "renter": 1.0, + } return geoadj_dict @@ -593,6 +607,11 @@ def calculate_spm_thresholds_vectorized( Returns: Float32 array of SPM thresholds, one per SPM unit. """ + person_ages = np.asarray(person_ages) + person_spm_unit_ids = np.asarray(person_spm_unit_ids) + spm_unit_tenure_types = np.asarray(spm_unit_tenure_types) + spm_unit_geoadj = np.asarray(spm_unit_geoadj, dtype=np.float64) + n_units = len(spm_unit_tenure_types) # Count adults and children per SPM unit @@ -614,8 +633,7 @@ def calculate_spm_thresholds_vectorized( tenure_codes[mask] = code # Look up base thresholds - calc = SPMCalculator(year=year) - base_thresholds = calc.get_base_thresholds() + base_thresholds = get_spm_reference_thresholds(year) thresholds = np.zeros(n_units, dtype=np.float32) for i in range(n_units): diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py index b3e6085a9..4a8bb8a16 100644 --- a/policyengine_us_data/calibration/publish_local_area.py +++ b/policyengine_us_data/calibration/publish_local_area.py @@ -8,11 +8,6 @@ python publish_local_area.py [--skip-download] [--states-only] [--upload] """ -import hashlib -import json -import shutil - - import numpy as np from pathlib import Path from typing import List @@ -30,6 +25,7 @@ ) from policyengine_us_data.calibration.block_assignment import ( derive_geography_from_blocks, + get_county_filter_probability, ) from policyengine_us_data.calibration.clone_and_assign import ( GeographyAssignment, @@ -45,76 +41,28 @@ CHECKPOINT_FILE_CITIES = Path("completed_cities.txt") WORK_DIR = Path("local_area_build") -NYC_COUNTY_FIPS = {"36005", "36047", "36061", "36081", "36085"} - - -META_FILE = WORK_DIR / "checkpoint_meta.json" - - -def compute_input_fingerprint( - weights_path: Path, dataset_path: Path, n_clones: int, seed: int -) -> str: - h = hashlib.sha256() - for p in [weights_path, dataset_path]: - with open(p, "rb") as f: - while chunk := f.read(8192): - h.update(chunk) - h.update(f"{n_clones}:{seed}".encode()) - return h.hexdigest()[:16] - - -def validate_or_clear_checkpoints(fingerprint: str): - if META_FILE.exists(): - stored = json.loads(META_FILE.read_text()) - if stored.get("fingerprint") == fingerprint: - print(f"Inputs unchanged ({fingerprint}), resuming...") - return - print( - f"Inputs changed " - f"({stored.get('fingerprint')} -> {fingerprint}), " - f"clearing..." - ) - else: - print(f"No checkpoint metadata, starting fresh ({fingerprint})") - h5_count = sum( - 1 - for subdir in ["states", "districts", "cities"] - if (WORK_DIR / subdir).exists() - for _ in (WORK_DIR / subdir).rglob("*.h5") - ) - if h5_count > 0: - print( - f"WARNING: {h5_count} H5 files exist. " - f"Clearing only checkpoint files, preserving H5s." - ) - for cp in [ - CHECKPOINT_FILE, - CHECKPOINT_FILE_DISTRICTS, - CHECKPOINT_FILE_CITIES, - ]: - if cp.exists(): - cp.unlink() - else: - for cp in [ - CHECKPOINT_FILE, - CHECKPOINT_FILE_DISTRICTS, - CHECKPOINT_FILE_CITIES, - ]: - if cp.exists(): - cp.unlink() - for subdir in ["states", "districts", "cities"]: - d = WORK_DIR / subdir - if d.exists(): - shutil.rmtree(d) - META_FILE.parent.mkdir(parents=True, exist_ok=True) - META_FILE.write_text(json.dumps({"fingerprint": fingerprint})) - - -SUB_ENTITIES = [ - "tax_unit", - "spm_unit", - "family", - "marital_unit", +NYC_COUNTIES = { + "QUEENS_COUNTY_NY", + "BRONX_COUNTY_NY", + "RICHMOND_COUNTY_NY", + "NEW_YORK_COUNTY_NY", + "KINGS_COUNTY_NY", +} + +NYC_CDS = [ + "3603", + "3605", + "3606", + "3607", + "3608", + "3609", + "3610", + "3611", + "3612", + "3613", + "3614", + "3615", + "3616", ] @@ -163,7 +111,7 @@ def build_h5( dataset_path: Path, output_path: Path, cd_subset: List[str] = None, - county_fips_filter: set = None, + county_filter: set = None, takeup_filter: List[str] = None, ) -> Path: """Build an H5 file by cloning records for each nonzero weight. @@ -174,8 +122,8 @@ def build_h5( dataset_path: Path to base dataset H5 file. output_path: Where to write the output H5 file. cd_subset: If provided, only include clones for these CDs. - county_fips_filter: If provided, zero out weights for clones - whose county FIPS is not in this set. + county_filter: If provided, scale weights by P(target|CD) + for city datasets. takeup_filter: List of takeup vars to apply. Returns: @@ -216,11 +164,17 @@ def build_h5( cd_mask = np.vectorize(lambda cd: cd in cd_subset_set)(clone_cds_matrix) W[~cd_mask] = 0 - # County FIPS filtering: zero out clones not in target counties - if county_fips_filter is not None: - fips_array = np.asarray(geography.county_fips).reshape(n_clones_total, n_hh) - fips_mask = np.isin(fips_array, list(county_fips_filter)) - W[~fips_mask] = 0 + # County filtering: scale weights by P(target_counties | CD) + if county_filter is not None: + unique_cds = np.unique(clone_cds_matrix) + cd_prob = { + cd: get_county_filter_probability(cd, county_filter) for cd in unique_cds + } + p_matrix = np.vectorize( + cd_prob.__getitem__, + otypes=[float], + )(clone_cds_matrix) + W *= p_matrix label = ( f"CD subset {cd_subset}" @@ -237,7 +191,7 @@ def build_h5( if n_clones == 0: raise ValueError( f"No active clones after filtering. " - f"cd_subset={cd_subset}, county_fips_filter={county_fips_filter}" + f"cd_subset={cd_subset}, county_filter={county_filter}" ) clone_weights = W[active_geo, active_hh] active_blocks = blocks.reshape(n_clones_total, n_hh)[active_geo, active_hh] @@ -258,6 +212,12 @@ def build_h5( for p_idx, p_hh_id in enumerate(person_hh_ids): hh_to_persons[hh_id_to_idx[int(p_hh_id)]].append(p_idx) + SUB_ENTITIES = [ + "tax_unit", + "spm_unit", + "family", + "marital_unit", + ] hh_to_entity = {} entity_id_arrays = {} person_entity_id_arrays = {} @@ -351,6 +311,22 @@ def build_h5( unique_geo = derive_geography_from_blocks(unique_blocks) clone_geo = {k: v[block_inv] for k, v in unique_geo.items()} + # === Calculate weights for all entity levels === + person_weights = np.repeat(clone_weights, persons_per_clone) + per_person_wt = clone_weights / np.maximum(persons_per_clone, 1) + + entity_weights = {} + for ek in SUB_ENTITIES: + n_ents = len(entity_clone_idx[ek]) + ent_person_counts = np.zeros(n_ents, dtype=np.int32) + np.add.at( + ent_person_counts, + new_person_entity_ids[ek], + 1, + ) + clone_ids_e = np.repeat(np.arange(n_clones), entities_per_clone[ek]) + entity_weights[ek] = per_person_wt[clone_ids_e] * ent_person_counts + # === Determine variables to save === vars_to_save = set(sim.input_variables) vars_to_save.add("county") @@ -400,6 +376,7 @@ def build_h5( for period in periods: values = holder.get_array(period) + # Convert Arrow-backed arrays to numpy before indexing if hasattr(values, "_pa_array") or hasattr(values, "_ndarray"): values = np.asarray(values) @@ -436,12 +413,16 @@ def build_h5( } # === Override weights === - # Only write household_weight; sub-entity weights (tax_unit_weight, - # spm_unit_weight, person_weight, etc.) are formula variables in - # policyengine-us that derive from household_weight at runtime. data["household_weight"] = { time_period: clone_weights.astype(np.float32), } + data["person_weight"] = { + time_period: person_weights.astype(np.float32), + } + for ek in SUB_ENTITIES: + data[f"{ek}_weight"] = { + time_period: entity_weights[ek].astype(np.float32), + } # === Override geography === data["state_fips"] = { @@ -470,13 +451,6 @@ def build_h5( time_period: clone_geo[gv].astype("S"), } - # === Set zip_code for LA County clones (ACA rating area fix) === - la_mask = clone_geo["county_fips"].astype(str) == "06037" - if la_mask.any(): - zip_codes = np.full(len(la_mask), "UNKNOWN") - zip_codes[la_mask] = "90001" - data["zip_code"] = {time_period: zip_codes.astype("S")} - # === Gap 4: Congressional district GEOID === clone_cd_geoids = np.array([int(cd) for cd in active_clone_cds], dtype=np.int32) data["congressional_district_geoid"] = { @@ -487,19 +461,11 @@ def build_h5( print("Recalculating SPM thresholds...") unique_cds_list = sorted(set(active_clone_cds)) cd_geoadj_values = load_cd_geoadj_values(unique_cds_list) - # Build per-SPM-unit geoadj from clone's CD - spm_clone_ids = np.repeat( - np.arange(n_clones, dtype=np.int64), - entities_per_clone["spm_unit"], - ) - spm_unit_geoadj = np.array( - [cd_geoadj_values[active_clone_cds[c]] for c in spm_clone_ids], - dtype=np.float64, - ) - # Get cloned person ages and SPM tenure types + # Get cloned person ages and SPM unit IDs person_ages = sim.calculate("age", map_to="person").values[person_clone_idx] + # Get cloned tenure types spm_tenure_holder = sim.get_holder("spm_unit_tenure_type") spm_tenure_periods = spm_tenure_holder.get_known_periods() if spm_tenure_periods: @@ -516,6 +482,25 @@ def build_h5( dtype="S30", ) + # Build per-SPM-unit geoadj from the clone's CD and tenure. + spm_clone_ids = np.repeat( + np.arange(n_clones, dtype=np.int64), + entities_per_clone["spm_unit"], + ) + spm_unit_geoadj = np.array( + [ + cd_geoadj_values[active_clone_cds[clone_id]][ + ( + spm_tenure_cloned[spm_unit_index] + .decode() + .lower() + ) + ] + for spm_unit_index, clone_id in enumerate(spm_clone_ids) + ], + dtype=np.float64, + ) + new_spm_thresholds = calculate_spm_thresholds_vectorized( person_ages=person_ages, person_spm_unit_ids=new_person_entity_ids["spm_unit"], @@ -556,7 +541,6 @@ def build_h5( hh_blocks=active_blocks, hh_state_fips=hh_state_fips, hh_ids=original_hh_ids, - hh_clone_indices=active_geo.astype(np.int64), entity_hh_indices=entity_hh_indices, entity_counts=entity_counts, time_period=time_period, @@ -754,6 +738,8 @@ def build_cities( """Build city H5 files with checkpointing, optionally uploading.""" w = np.load(weights_path) + all_cds = sorted(set(geography.cd_geoid.astype(str))) + cities_dir = output_dir / "cities" cities_dir.mkdir(parents=True, exist_ok=True) @@ -763,29 +749,34 @@ def build_cities( if "NYC" in completed_cities: print("Skipping NYC (already completed)") else: - output_path = cities_dir / "NYC.h5" - - try: - build_h5( - weights=w, - geography=geography, - dataset_path=dataset_path, - output_path=output_path, - county_fips_filter=NYC_COUNTY_FIPS, - takeup_filter=takeup_filter, - ) - - if upload: - print("Uploading NYC.h5 to GCP...") - upload_local_area_file(str(output_path), "cities", skip_hf=True) - hf_queue.append((str(output_path), "cities")) - - record_completed_city("NYC") - print("Completed NYC") - - except Exception as e: - print(f"ERROR building NYC: {e}") - raise + cd_subset = [cd for cd in all_cds if cd in NYC_CDS] + if not cd_subset: + print("No NYC-related CDs found, skipping") + else: + output_path = cities_dir / "NYC.h5" + + try: + build_h5( + weights=w, + geography=geography, + dataset_path=dataset_path, + output_path=output_path, + cd_subset=cd_subset, + county_filter=NYC_COUNTIES, + takeup_filter=takeup_filter, + ) + + if upload: + print("Uploading NYC.h5 to GCP...") + upload_local_area_file(str(output_path), "cities", skip_hf=True) + hf_queue.append((str(output_path), "cities")) + + record_completed_city("NYC") + print("Completed NYC") + + except Exception as e: + print(f"ERROR building NYC: {e}") + raise if upload and hf_queue: print(f"\nUploading batch of {len(hf_queue)} city files to HuggingFace...") @@ -880,19 +871,9 @@ def main(): print(f"Using dataset: {inputs['dataset']}") - print("Computing input fingerprint...") - fingerprint = compute_input_fingerprint( - inputs["weights"], - inputs["dataset"], - args.n_clones, - args.seed, - ) - validate_or_clear_checkpoints(fingerprint) - - print("Loading base simulation to get household count...") - _sim = Microsimulation(dataset=str(inputs["dataset"])) - n_hh = len(_sim.calculate("household_id", map_to="household").values) - del _sim + sim = Microsimulation(dataset=str(inputs["dataset"])) + n_hh = sim.calculate("household_id", map_to="household").shape[0] + del sim print(f"\nBase dataset has {n_hh:,} households") geo_cache = WORK_DIR / f"geography_{n_hh}x{args.n_clones}_s{args.seed}.npz" diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py new file mode 100644 index 000000000..8cd2a06b0 --- /dev/null +++ b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py @@ -0,0 +1,51 @@ +from __future__ import annotations + +import numpy as np +import pandas as pd +import pytest + +from policyengine_us import CountryTaxBenefitSystem +from policyengine_us_data.calibration.calibration_utils import ( + calculate_spm_thresholds_vectorized, + load_cd_geoadj_values, +) + +SYSTEM = CountryTaxBenefitSystem() +CPI_U = SYSTEM.parameters.gov.bls.cpi.cpi_u + + +def test_load_cd_geoadj_values_returns_tenure_specific_lookup(monkeypatch): + rent_df = pd.DataFrame( + { + "cd_id": ["0101"], + "median_2br_rent": [1_500.0], + "national_median_2br_rent": [1_000.0], + } + ) + monkeypatch.setattr( + "policyengine_us_data.calibration.calibration_utils.pd.read_csv", + lambda *args, **kwargs: rent_df, + ) + + geoadj_lookup = load_cd_geoadj_values(["101"]) + + assert geoadj_lookup["101"]["renter"] == pytest.approx(1.2215) + assert geoadj_lookup["101"]["owner_with_mortgage"] == pytest.approx(1.217) + assert geoadj_lookup["101"]["owner_without_mortgage"] == pytest.approx( + 1.1615 + ) + + +def test_calculate_spm_thresholds_vectorized_matches_policyengine_us_future_path(): + thresholds = calculate_spm_thresholds_vectorized( + person_ages=np.array([40, 35, 10, 8]), + person_spm_unit_ids=np.array([0, 0, 0, 0]), + spm_unit_tenure_types=np.array([b"RENTER"]), + spm_unit_geoadj=np.array([1.1]), + year=2027, + ) + + cpi_ratio = float(CPI_U("2027-02-01") / CPI_U("2024-02-01")) + expected = 39_430.0 * cpi_ratio * 1.1 + + assert thresholds[0] == pytest.approx(expected) diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index ad3c9e9fb..3aba81693 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -1,7 +1,69 @@ -"""SPM threshold calculation utilities using the spm-calculator package.""" +"""SPM threshold calculation utilities aligned with PolicyEngine US.""" +from functools import lru_cache import numpy as np -from spm_calculator import SPMCalculator, spm_equivalence_scale +from policyengine_us import CountryTaxBenefitSystem + +PUBLISHED_SPM_REFERENCE_THRESHOLDS = { + 2015: { + "renter": 25_155.0, + "owner_with_mortgage": 24_859.0, + "owner_without_mortgage": 20_639.0, + }, + 2016: { + "renter": 25_558.0, + "owner_with_mortgage": 25_248.0, + "owner_without_mortgage": 20_943.0, + }, + 2017: { + "renter": 26_213.0, + "owner_with_mortgage": 25_897.0, + "owner_without_mortgage": 21_527.0, + }, + 2018: { + "renter": 26_905.0, + "owner_with_mortgage": 26_565.0, + "owner_without_mortgage": 22_095.0, + }, + 2019: { + "renter": 27_515.0, + "owner_with_mortgage": 27_172.0, + "owner_without_mortgage": 22_600.0, + }, + 2020: { + "renter": 28_881.0, + "owner_with_mortgage": 28_533.0, + "owner_without_mortgage": 23_948.0, + }, + 2021: { + "renter": 31_453.0, + "owner_with_mortgage": 31_089.0, + "owner_without_mortgage": 26_022.0, + }, + 2022: { + "renter": 33_402.0, + "owner_with_mortgage": 32_949.0, + "owner_without_mortgage": 27_679.0, + }, + 2023: { + "renter": 36_606.0, + "owner_with_mortgage": 36_192.0, + "owner_without_mortgage": 30_347.0, + }, + 2024: { + "renter": 39_430.0, + "owner_with_mortgage": 39_068.0, + "owner_without_mortgage": 32_586.0, + }, +} + +LATEST_PUBLISHED_SPM_THRESHOLD_YEAR = max(PUBLISHED_SPM_REFERENCE_THRESHOLDS) +REFERENCE_RAW_SCALE = 3**0.7 +TENURE_HOUSING_SHARES = { + "owner_with_mortgage": 0.434, + "owner_without_mortgage": 0.323, + "renter": 0.443, +} TENURE_CODE_MAP = { 1: "owner_with_mortgage", @@ -10,6 +72,81 @@ } +@lru_cache(maxsize=1) +def _get_cpi_u_parameter(): + system = CountryTaxBenefitSystem() + return system.parameters.gov.bls.cpi.cpi_u + + +def spm_equivalence_scale(num_adults, num_children): + adults, children = np.broadcast_arrays( + np.asarray(num_adults, dtype=float), + np.asarray(num_children, dtype=float), + ) + + raw = np.zeros_like(adults, dtype=float) + has_people = (adults + children) > 0 + with_children = has_people & (children > 0) + + single_adult_with_children = with_children & (adults <= 1) + raw[single_adult_with_children] = ( + 1.0 + + 0.8 + + 0.5 * np.maximum(children[single_adult_with_children] - 1, 0) + ) ** 0.7 + + multi_adult_with_children = with_children & ~single_adult_with_children + raw[multi_adult_with_children] = ( + adults[multi_adult_with_children] + + 0.5 * children[multi_adult_with_children] + ) ** 0.7 + + no_children = has_people & ~with_children + one_adult = no_children & (adults <= 1) + two_adults = no_children & (adults == 2) + larger_adult_units = no_children & (adults > 2) + + raw[one_adult] = 1.0 + raw[two_adults] = 1.41 + raw[larger_adult_units] = adults[larger_adult_units] ** 0.7 + + return raw / REFERENCE_RAW_SCALE + + +def get_spm_reference_thresholds(year: int) -> dict[str, float]: + if year in PUBLISHED_SPM_REFERENCE_THRESHOLDS: + return PUBLISHED_SPM_REFERENCE_THRESHOLDS[year].copy() + + if year < min(PUBLISHED_SPM_REFERENCE_THRESHOLDS): + raise ValueError( + f"No published SPM reference thresholds for {year}. " + f"Earliest available year is {min(PUBLISHED_SPM_REFERENCE_THRESHOLDS)}." + ) + + cpi_u = _get_cpi_u_parameter() + factor = float( + cpi_u(f"{year}-02-01") + / cpi_u(f"{LATEST_PUBLISHED_SPM_THRESHOLD_YEAR}-02-01") + ) + latest_thresholds = PUBLISHED_SPM_REFERENCE_THRESHOLDS[ + LATEST_PUBLISHED_SPM_THRESHOLD_YEAR + ] + return { + tenure: value * factor + for tenure, value in latest_thresholds.items() + } + + +def calculate_geoadj_from_rent( + local_rent, + national_rent: float, + tenure: str = "renter", +): + share = TENURE_HOUSING_SHARES[tenure] + rent_ratio = np.asarray(local_rent, dtype=float) / float(national_rent) + return rent_ratio * share + (1.0 - share) + + def calculate_spm_thresholds_with_geoadj( num_adults: np.ndarray, num_children: np.ndarray, @@ -35,8 +172,7 @@ def calculate_spm_thresholds_with_geoadj( Returns: Array of SPM threshold values. """ - calc = SPMCalculator(year=year) - base_thresholds = calc.get_base_thresholds() + base_thresholds = get_spm_reference_thresholds(year) n = len(num_adults) thresholds = np.zeros(n) From 418d73c3c70e46e5c4d89f864cc1a241eeb1783b Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Tue, 7 Apr 2026 15:59:44 -0400 Subject: [PATCH 02/28] Split housing benchmarks by Census and HUD concepts --- .../db/etl_national_targets.py | 11 +- .../pull_hardcoded_targets.py | 82 +++++---- policyengine_us_data/utils/census_spm.py | 65 +++++++ policyengine_us_data/utils/hud_housing.py | 49 ++++++ policyengine_us_data/utils/loss.py | 7 +- tests/unit/test_census_spm_housing_targets.py | 161 ++++++++++++++++++ validation/benefit_validation.py | 95 ++++++++--- 7 files changed, 403 insertions(+), 67 deletions(-) create mode 100644 policyengine_us_data/utils/census_spm.py create mode 100644 policyengine_us_data/utils/hud_housing.py create mode 100644 tests/unit/test_census_spm_housing_targets.py diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 773411dc5..d1c459656 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -12,6 +12,9 @@ from policyengine_us_data.storage.calibration_targets.soi_metadata import ( RETIREMENT_CONTRIBUTION_TARGETS, ) +from policyengine_us_data.utils.census_spm import ( + build_census_spm_capped_housing_subsidy_target, +) from policyengine_us_data.utils.db import ( DEFAULT_DATASET, etl_argparser, @@ -187,13 +190,7 @@ def extract_national_targets(dataset: str = DEFAULT_DATASET): "notes": "Work and childcare expenses for SPM", "year": HARDCODED_YEAR, }, - { - "variable": "spm_unit_capped_housing_subsidy", - "value": 35e9, - "source": "HUD/Census", - "notes": "Housing subsidies", - "year": HARDCODED_YEAR, - }, + build_census_spm_capped_housing_subsidy_target(HARDCODED_YEAR), { "variable": "tanf", "value": 9e9, diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py index 3199a56a2..c1478d957 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py @@ -1,52 +1,66 @@ import pandas as pd import numpy as np from policyengine_us_data.storage import CALIBRATION_FOLDER +from policyengine_us_data.utils.census_spm import ( + get_census_spm_capped_housing_subsidy_total, +) """ -Hardcoded targets for the year 2024 from CPS-derived statistics and other sources. Include medical expenses, sum of SPM thresholds, and child support expenses. +Hardcoded targets for the year 2024 from CPS-derived statistics and other +sources. Housing uses the Census SPM capped subsidy concept, not HUD spending. """ -HARD_CODED_TOTALS = { - "health_insurance_premiums_without_medicare_part_b": 385e9, - "other_medical_expenses": 278e9, - "medicare_part_b_premiums": 112e9, - "over_the_counter_health_expenses": 72e9, - "spm_unit_spm_threshold": 3_945e9, - "child_support_expense": 33e9, - "child_support_received": 33e9, - "spm_unit_capped_work_childcare_expenses": 348e9, - "spm_unit_capped_housing_subsidy": 35e9, - "tanf": 9e9, - # Alimony could be targeted via SOI - "alimony_income": 13e9, - "alimony_expense": 13e9, - # Rough estimate, not CPS derived - "real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections - "rent": 735e9, # ACS total uprated by CPI - # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics - # shows $38,316,190,000 in Box 7: Social security tips (2018) - # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC - # Assume 40% through 2024 - "tip_income": 38e9 * 1.4, -} +def get_hard_coded_totals( + year: int = 2024, + storage_folder=None, +) -> dict[str, float]: + return { + "health_insurance_premiums_without_medicare_part_b": 385e9, + "other_medical_expenses": 278e9, + "medicare_part_b_premiums": 112e9, + "over_the_counter_health_expenses": 72e9, + "spm_unit_spm_threshold": 3_945e9, + "child_support_expense": 33e9, + "child_support_received": 33e9, + "spm_unit_capped_work_childcare_expenses": 348e9, + "spm_unit_capped_housing_subsidy": get_census_spm_capped_housing_subsidy_total( + year, storage_folder=storage_folder + ), + "tanf": 9e9, + # Alimony could be targeted via SOI + "alimony_income": 13e9, + "alimony_expense": 13e9, + # Rough estimate, not CPS derived + "real_estate_taxes": 500e9, # Rough estimate between 350bn and 600bn total property tax collections + "rent": 735e9, # ACS total uprated by CPI + # Table 5A from https://www.irs.gov/statistics/soi-tax-stats-individual-information-return-form-w2-statistics + # shows $38,316,190,000 in Box 7: Social security tips (2018) + # Wages and salaries grew 32% from 2018 to 2023: https://fred.stlouisfed.org/graph/?g=1J0CC + # Assume 40% through 2024 + "tip_income": 38e9 * 1.4, + } -def pull_hardcoded_targets(): +def pull_hardcoded_targets(year: int = 2024, storage_folder=None): """ Returns a DataFrame with hardcoded targets for various CPS-derived statistics and other sources. """ + hard_coded_totals = get_hard_coded_totals( + year=year, + storage_folder=storage_folder, + ) data = { - "DATA_SOURCE": ["hardcoded"] * len(HARD_CODED_TOTALS), - "GEO_ID": ["0000000US"] * len(HARD_CODED_TOTALS), - "GEO_NAME": ["national"] * len(HARD_CODED_TOTALS), - "VARIABLE": list(HARD_CODED_TOTALS.keys()), - "VALUE": list(HARD_CODED_TOTALS.values()), + "DATA_SOURCE": ["hardcoded"] * len(hard_coded_totals), + "GEO_ID": ["0000000US"] * len(hard_coded_totals), + "GEO_NAME": ["national"] * len(hard_coded_totals), + "VARIABLE": list(hard_coded_totals.keys()), + "VALUE": list(hard_coded_totals.values()), "IS_COUNT": [0.0] - * len(HARD_CODED_TOTALS), # All values are monetary amounts, not counts + * len(hard_coded_totals), # All values are monetary amounts, not counts "BREAKDOWN_VARIABLE": [np.nan] - * len(HARD_CODED_TOTALS), # No breakdown variable for hardcoded targets - "LOWER_BOUND": [np.nan] * len(HARD_CODED_TOTALS), - "UPPER_BOUND": [np.nan] * len(HARD_CODED_TOTALS), + * len(hard_coded_totals), # No breakdown variable for hardcoded targets + "LOWER_BOUND": [np.nan] * len(hard_coded_totals), + "UPPER_BOUND": [np.nan] * len(hard_coded_totals), } df = pd.DataFrame(data) diff --git a/policyengine_us_data/utils/census_spm.py b/policyengine_us_data/utils/census_spm.py new file mode 100644 index 000000000..e7a9c32b4 --- /dev/null +++ b/policyengine_us_data/utils/census_spm.py @@ -0,0 +1,65 @@ +from pathlib import Path + +import pandas as pd + +from policyengine_us_data.storage import STORAGE_FOLDER + +# These totals are derived from raw public-use Census CPS ASEC files by +# summing SPM_CAPHOUSESUB * SPM_WEIGHT / 100 at the SPM-unit level. +# They represent the Census SPM capped housing subsidy concept, not HUD +# spending or outlays. +CENSUS_SPM_CAPPED_HOUSING_SUBSIDY_TOTALS = { + 2022: 29_549_204_420.92, + 2023: 31_844_144_470.85, + 2024: 33_649_114_150.37, +} + + +def get_census_spm_capped_housing_subsidy_total( + year: int, + storage_folder: Path | str | None = None, +) -> float: + """ + Return the Census CPS ASEC total for SPM_CAPHOUSESUB. + + If a storage folder is provided, recompute directly from the local + raw Census CPS HDF file. Otherwise, use the checked-in year-specific + totals above so callers do not depend on local data files at import + time. + """ + + if storage_folder is None: + if year not in CENSUS_SPM_CAPPED_HOUSING_SUBSIDY_TOTALS: + raise ValueError( + f"No published Census SPM capped housing subsidy total for {year}." + ) + return CENSUS_SPM_CAPPED_HOUSING_SUBSIDY_TOTALS[year] + + storage_path = Path(storage_folder) / f"census_cps_{year}.h5" + if not storage_path.exists(): + raise FileNotFoundError( + f"Missing raw Census CPS file for {year}: {storage_path}" + ) + + with pd.HDFStore(storage_path, mode="r") as store: + spm_unit = store["spm_unit"][["SPM_CAPHOUSESUB", "SPM_WEIGHT"]] + + return float((spm_unit.SPM_CAPHOUSESUB * spm_unit.SPM_WEIGHT).sum() / 100) + + +def build_census_spm_capped_housing_subsidy_target( + year: int, + storage_folder: Path | str | None = None, +) -> dict: + return { + "variable": "spm_unit_capped_housing_subsidy", + "value": get_census_spm_capped_housing_subsidy_total( + year, storage_folder=storage_folder + ), + "source": "Census CPS ASEC public-use SPM_CAPHOUSESUB", + "notes": ( + "Capped SPM housing subsidy total from raw Census CPS ASEC " + "SPM_CAPHOUSESUB; not HUD spending or outlays." + ), + "year": year, + } diff --git a/policyengine_us_data/utils/hud_housing.py b/policyengine_us_data/utils/hud_housing.py new file mode 100644 index 000000000..1b7620e5d --- /dev/null +++ b/policyengine_us_data/utils/hud_housing.py @@ -0,0 +1,49 @@ +HUD_USER_HOUSING_ASSISTANCE_BENCHMARKS = { + 2022: { + "reported_households": 4_537_614, + "average_monthly_spending_per_unit": 899, + }, + 2023: { + "reported_households": 4_569_973, + "average_monthly_spending_per_unit": 989, + }, + 2024: { + "reported_households": 4_584_691, + "average_monthly_spending_per_unit": 1_067, + }, + 2025: { + "reported_households": 4_519_561, + "average_monthly_spending_per_unit": 1_135, + }, +} + + +def get_hud_user_housing_benchmark(year: int) -> dict: + if year not in HUD_USER_HOUSING_ASSISTANCE_BENCHMARKS: + raise ValueError(f"No HUD USER housing benchmark for {year}.") + + benchmark = dict(HUD_USER_HOUSING_ASSISTANCE_BENCHMARKS[year]) + benchmark["annual_spending_total"] = ( + benchmark["reported_households"] + * benchmark["average_monthly_spending_per_unit"] + * 12 + ) + return benchmark + + +def build_hud_user_housing_assistance_benchmark(year: int) -> dict: + benchmark = get_hud_user_housing_benchmark(year) + return { + "variable": "housing_assistance", + "annual_spending_total": benchmark["annual_spending_total"], + "reported_households": benchmark["reported_households"], + "average_monthly_spending_per_unit": benchmark[ + "average_monthly_spending_per_unit" + ], + "source": "HUD USER Picture of Subsidized Households", + "notes": ( + "Annual federal spending and occupied assisted households from " + "HUD USER; not Census SPM capped subsidy." + ), + "year": year, + } diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index e141cdf43..6d8b35f46 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -12,6 +12,9 @@ from policyengine_us_data.storage.calibration_targets.soi_metadata import ( RETIREMENT_CONTRIBUTION_TARGETS, ) +from policyengine_us_data.utils.census_spm import ( + get_census_spm_capped_housing_subsidy_total, +) from policyengine_core.reforms import Reform from policyengine_us_data.utils.soi import pe_to_soi, get_soi @@ -31,7 +34,9 @@ "child_support_expense": 33e9, "child_support_received": 33e9, "spm_unit_capped_work_childcare_expenses": 348e9, - "spm_unit_capped_housing_subsidy": 35e9, + "spm_unit_capped_housing_subsidy": get_census_spm_capped_housing_subsidy_total( + 2024 + ), "tanf": 9e9, # Alimony could be targeted via SOI "alimony_income": 13e9, diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py new file mode 100644 index 000000000..6693ffe9a --- /dev/null +++ b/tests/unit/test_census_spm_housing_targets.py @@ -0,0 +1,161 @@ +import sys +import types + +import pandas as pd +import validation.benefit_validation as benefit_validation + +from policyengine_us_data.db import etl_national_targets +from policyengine_us_data.storage.calibration_targets.pull_hardcoded_targets import ( + pull_hardcoded_targets, +) +from policyengine_us_data.utils.census_spm import ( + build_census_spm_capped_housing_subsidy_target, + get_census_spm_capped_housing_subsidy_total, +) +from policyengine_us_data.utils.hud_housing import ( + build_hud_user_housing_assistance_benchmark, + get_hud_user_housing_benchmark, +) + + +def _write_census_cps(storage_dir, year=2024): + path = storage_dir / f"census_cps_{year}.h5" + with pd.HDFStore(path, mode="w") as store: + store["spm_unit"] = pd.DataFrame( + { + "SPM_CAPHOUSESUB": [1_000.0, 0.0, 2_000.0], + "SPM_WEIGHT": [150, 250, 100], + } + ) + return path + + +def test_get_census_spm_capped_housing_subsidy_total_uses_raw_cps_spm_values( + tmp_path, +): + _write_census_cps(tmp_path) + + total = get_census_spm_capped_housing_subsidy_total( + 2024, storage_folder=tmp_path + ) + + expected = (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100 + assert total == expected + + +def test_pull_hardcoded_targets_uses_census_spm_housing_total(tmp_path): + _write_census_cps(tmp_path) + + targets = pull_hardcoded_targets(year=2024, storage_folder=tmp_path) + housing = targets.loc[ + targets.VARIABLE == "spm_unit_capped_housing_subsidy", "VALUE" + ].iat[0] + + assert housing == (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100 + + +def test_extract_national_targets_uses_census_spm_housing_target(monkeypatch): + class DummyMicrosimulation: + def __init__(self, dataset): + self.default_calculation_period = 2024 + + monkeypatch.setitem( + sys.modules, + "policyengine_us", + types.SimpleNamespace(Microsimulation=DummyMicrosimulation), + ) + monkeypatch.setattr( + etl_national_targets, + "build_census_spm_capped_housing_subsidy_target", + lambda year, storage_folder=None: { + "variable": "spm_unit_capped_housing_subsidy", + "value": 123.0, + "source": "Census CPS ASEC public-use SPM_CAPHOUSESUB", + "notes": "Capped SPM housing subsidy total from raw Census CPS ASEC.", + "year": year, + }, + ) + + targets = etl_national_targets.extract_national_targets("dummy") + housing = next( + target + for target in targets["direct_sum_targets"] + if target["variable"] == "spm_unit_capped_housing_subsidy" + ) + + assert housing["value"] == 123.0 + assert housing["source"] == "Census CPS ASEC public-use SPM_CAPHOUSESUB" + assert "Capped SPM housing subsidy" in housing["notes"] + + +def test_build_census_spm_capped_housing_subsidy_target_labels_concept(tmp_path): + _write_census_cps(tmp_path) + + target = build_census_spm_capped_housing_subsidy_target( + 2024, storage_folder=tmp_path + ) + + assert target["variable"] == "spm_unit_capped_housing_subsidy" + assert target["value"] == (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100 + assert target["source"] == "Census CPS ASEC public-use SPM_CAPHOUSESUB" + assert "not HUD spending" in target["notes"] + + +def test_get_hud_user_housing_benchmark_uses_annual_spending_concept(): + benchmark = get_hud_user_housing_benchmark(2022) + + assert benchmark["reported_households"] == 4_537_614 + assert benchmark["average_monthly_spending_per_unit"] == 899 + assert benchmark["annual_spending_total"] == 48_951_779_832 + + +def test_build_hud_user_housing_assistance_benchmark_labels_admin_concept(): + benchmark = build_hud_user_housing_assistance_benchmark(2022) + + assert benchmark["variable"] == "housing_assistance" + assert benchmark["source"] == "HUD USER Picture of Subsidized Households" + assert benchmark["reported_households"] == 4_537_614 + assert benchmark["annual_spending_total"] == 48_951_779_832 + assert "not Census SPM capped subsidy" in benchmark["notes"] + + +def test_get_program_benchmarks_keeps_housing_concepts_separate(monkeypatch): + monkeypatch.setattr( + benefit_validation, + "build_census_spm_capped_housing_subsidy_target", + lambda year: { + "variable": "spm_unit_capped_housing_subsidy", + "value": 111e9, + "source": "Census benchmark", + "notes": "Census concept", + "year": year, + }, + ) + monkeypatch.setattr( + benefit_validation, + "build_hud_user_housing_assistance_benchmark", + lambda year: { + "variable": "housing_assistance", + "annual_spending_total": 222e9, + "reported_households": 3_000_000, + "average_monthly_spending_per_unit": 999, + "source": "HUD USER benchmark", + "notes": "HUD concept", + "year": year, + }, + ) + + benchmarks = benefit_validation.get_program_benchmarks(2022) + + assert benchmarks["housing_spm_capped"]["variable"] == "spm_unit_capped_housing_subsidy" + assert benchmarks["housing_spm_capped"]["benchmark_total"] == 111 + assert benchmarks["housing_spm_capped"]["benchmark_source"] == "Census benchmark" + + assert benchmarks["housing_assistance_hud_user"]["variable"] == "housing_assistance" + assert benchmarks["housing_assistance_hud_user"]["benchmark_total"] == 222 + assert benchmarks["housing_assistance_hud_user"]["benchmark_source"] == "HUD USER benchmark" + assert ( + benchmarks["housing_assistance_hud_user"]["benchmark_participants_millions"] + == 3 + ) + assert benchmarks["housing_assistance_hud_user"]["map_to"] == "household" diff --git a/validation/benefit_validation.py b/validation/benefit_validation.py index cf4689720..723fe6a08 100644 --- a/validation/benefit_validation.py +++ b/validation/benefit_validation.py @@ -9,41 +9,75 @@ import numpy as np from policyengine_us import Microsimulation from policyengine_us_data.datasets.cps.enhanced_cps import EnhancedCPS - - -def analyze_benefit_underreporting(): - """Analyze impact of CPS benefit underreporting on results.""" - - sim = Microsimulation(dataset=EnhancedCPS, dataset_year=2022) - - programs = { +from policyengine_us_data.utils.census_spm import ( + build_census_spm_capped_housing_subsidy_target, +) +from policyengine_us_data.utils.hud_housing import ( + build_hud_user_housing_assistance_benchmark, +) + + +def get_program_benchmarks(year: int): + housing_spm_target = build_census_spm_capped_housing_subsidy_target(year) + housing_hud_target = build_hud_user_housing_assistance_benchmark(year) + return { "snap": { "variable": "snap", - "admin_total": 114.1, # billions, from USDA + "benchmark_total": 114.1, # billions, from USDA + "benchmark_source": "USDA", "participation_rate": 0.82, # from USDA + "weight_variable": "spm_unit_weight", }, "ssi": { "variable": "ssi", - "admin_total": 56.7, # from SSA + "benchmark_total": 56.7, # from SSA + "benchmark_source": "SSA", "participation_rate": 0.93, + "weight_variable": "spm_unit_weight", }, "tanf": { "variable": "tanf", - "admin_total": 7.1, # from HHS + "benchmark_total": 7.1, # from HHS + "benchmark_source": "HHS", "participation_rate": 0.23, + "weight_variable": "spm_unit_weight", }, - "housing": { - "variable": "housing_benefit", - "admin_total": 50.3, # from HUD - "participation_rate": 0.76, + "housing_spm_capped": { + "variable": "spm_unit_capped_housing_subsidy", + "benchmark_total": housing_spm_target["value"] / 1e9, + "benchmark_source": housing_spm_target["source"], + "participation_rate": np.nan, + "weight_variable": "spm_unit_weight", + }, + "housing_assistance_hud_user": { + "variable": "housing_assistance", + "benchmark_total": housing_hud_target["annual_spending_total"] / 1e9, + "benchmark_source": housing_hud_target["source"], + "benchmark_participants_millions": ( + housing_hud_target["reported_households"] / 1e6 + ), + "participation_rate": np.nan, + "weight_variable": "household_weight", + "map_to": "household", }, } + +def analyze_benefit_underreporting(): + """Analyze impact of CPS benefit underreporting on results.""" + + year = 2022 + sim = Microsimulation(dataset=EnhancedCPS, dataset_year=year) + programs = get_program_benchmarks(year) + results = [] for program, info in programs.items(): # Calculate totals - benefit = sim.calculate(info["variable"], 2022) - weight = sim.calculate("household_weight", 2022) + benefit_kwargs = {} + if info.get("map_to"): + benefit_kwargs["map_to"] = info["map_to"] + benefit = sim.calculate(info["variable"], year, **benefit_kwargs) + weight = sim.calculate(info["weight_variable"], year) # Total benefits total = (benefit * weight).sum() / 1e9 # billions @@ -53,15 +87,26 @@ def analyze_benefit_underreporting(): weighted_participants = ((benefit > 0) * weight).sum() / 1e6 # millions # Underreporting factor - underreporting = info["admin_total"] / total if total > 0 else np.inf + benchmark_ratio = ( + info["benchmark_total"] / total if total > 0 else np.inf + ) + benchmark_participants = info.get("benchmark_participants_millions") + participant_ratio = ( + weighted_participants / benchmark_participants + if benchmark_participants not in (None, 0) + else np.nan + ) results.append( { "program": program, "enhanced_cps_total": total, - "admin_total": info["admin_total"], - "underreporting_factor": underreporting, + "benchmark_total": info["benchmark_total"], + "benchmark_source": info["benchmark_source"], + "benchmark_ratio": benchmark_ratio, "participants_millions": weighted_participants, + "benchmark_participants_millions": benchmark_participants, + "participant_ratio": participant_ratio, "mean_benefit": ( benefit[benefit > 0].mean() if (benefit > 0).any() else 0 ), @@ -77,11 +122,11 @@ def validate_program_interactions(): sim = Microsimulation(dataset=EnhancedCPS, dataset_year=2022) # Get program benefits - snap = sim.calculate("snap", 2022) > 0 - medicaid = sim.calculate("medicaid", 2022) > 0 - ssi = sim.calculate("ssi", 2022) > 0 - tanf = sim.calculate("tanf", 2022) > 0 - housing = sim.calculate("housing_benefit", 2022) > 0 + snap = sim.calculate("snap", 2022, map_to="household") > 0 + medicaid = sim.calculate("medicaid", 2022, map_to="household") > 0 + ssi = sim.calculate("ssi", 2022, map_to="household") > 0 + tanf = sim.calculate("tanf", 2022, map_to="household") > 0 + housing = sim.calculate("housing_assistance", 2022, map_to="household") > 0 weight = sim.calculate("household_weight", 2022) From c71e53aaab58a06950faa20f02856e93969df382 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 09:28:37 -0400 Subject: [PATCH 03/28] Stabilize clone-half SPM thresholds and weight priors --- .../datasets/cps/enhanced_cps.py | 48 ++++++++-- .../datasets/cps/extended_cps.py | 89 ++++++++++++++++++- tests/unit/test_extended_cps.py | 46 ++++++++++ 3 files changed, 176 insertions(+), 7 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 275d71e2b..e0ce2c208 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -28,6 +28,42 @@ torch = None +def initialize_weight_priors( + original_weights: np.ndarray, + seed: int = 1456, + epsilon: float = 1e-6, + positive_jitter_scale: float = 0.01, +) -> np.ndarray: + """Build deterministic positive priors for sparse reweighting. + + Original CPS households should keep priors close to their survey + weights. Clone-half households start with zero weight on purpose, so + they should receive only a tiny positive epsilon to keep the log + optimization well-defined without giving them a meaningful head start. + """ + + weights = np.asarray(original_weights, dtype=np.float64) + if np.any(weights < 0): + raise ValueError("original_weights must be non-negative") + + rng = np.random.default_rng(seed) + priors = np.empty_like(weights, dtype=np.float64) + + positive_mask = weights > 0 + if positive_mask.any(): + jitter = np.maximum( + rng.normal(loc=1.0, scale=positive_jitter_scale, size=positive_mask.sum()), + 0.5, + ) + priors[positive_mask] = np.maximum(weights[positive_mask] * jitter, epsilon) + + zero_mask = ~positive_mask + if zero_mask.any(): + priors[zero_mask] = epsilon * rng.uniform(1.0, 2.0, size=zero_mask.sum()) + + return priors + + def _get_period_array(period_values: dict, period: int) -> np.ndarray: """Get a period array from a TIME_PERIOD_ARRAYS variable dict.""" value = period_values.get(period) @@ -191,9 +227,9 @@ def generate(self): data = sim.dataset.load_dataset() base_year = int(sim.default_calculation_period) data["household_weight"] = {} - original_weights = sim.calculate("household_weight") - original_weights = original_weights.values + np.random.normal( - 1, 0.1, len(original_weights) + original_weights = initialize_weight_priors( + sim.calculate("household_weight").values, + seed=1456, ) bad_targets = [ @@ -328,9 +364,9 @@ def generate(self): sim = Microsimulation(dataset=self.input_dataset) data = sim.dataset.load_dataset() - original_weights = sim.calculate("household_weight") - original_weights = original_weights.values + np.random.normal( - 1, 0.1, len(original_weights) + original_weights = initialize_weight_priors( + sim.calculate("household_weight").values, + seed=1456, ) for year in [2024]: loss_matrix, targets_array = build_loss_matrix(self.input_dataset, year) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 8b9041c0c..b110b536c 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -18,6 +18,10 @@ convert_mortgage_interest_to_structural_inputs, impute_tax_unit_mortgage_balance_hints, ) +from policyengine_us_data.utils.spm import ( + get_spm_reference_thresholds, + spm_equivalence_scale, +) from policyengine_us_data.utils.policyengine import has_policyengine_us_variables from policyengine_us_data.utils.retirement_limits import ( get_retirement_limits, @@ -77,7 +81,6 @@ def _supports_structural_mortgage_inputs() -> bool: "spm_unit_federal_tax_reported", "spm_unit_state_tax_reported", "spm_unit_capped_work_childcare_expenses", - "spm_unit_spm_threshold", "spm_unit_net_income_reported", "spm_unit_pre_subsidy_childcare_expenses", # Medical expenses @@ -325,6 +328,15 @@ def reconcile_ss_subcomponents(predictions, total_ss): "social_security_survivors", } +_SPM_TENURE_TO_REFERENCE_KEY = { + b"OWNER_WITH_MORTGAGE": "owner_with_mortgage", + b"OWNER_WITHOUT_MORTGAGE": "owner_without_mortgage", + b"RENTER": "renter", + "OWNER_WITH_MORTGAGE": "owner_with_mortgage", + "OWNER_WITHOUT_MORTGAGE": "owner_without_mortgage", + "RENTER": "renter", +} + def _apply_post_processing(predictions, X_test, time_period, data): """Apply retirement constraints and SS reconciliation.""" @@ -369,6 +381,73 @@ def _apply_post_processing(predictions, X_test, time_period, data): return predictions +def _index_into_spm_units( + person_spm_unit_ids: np.ndarray, + spm_unit_ids: np.ndarray, +) -> np.ndarray: + if np.all(spm_unit_ids[:-1] <= spm_unit_ids[1:]): + indices = np.searchsorted(spm_unit_ids, person_spm_unit_ids) + valid = (indices >= 0) & (indices < len(spm_unit_ids)) + if valid.all() and np.array_equal(spm_unit_ids[indices], person_spm_unit_ids): + return indices + + indexer = pd.Index(spm_unit_ids).get_indexer(person_spm_unit_ids) + if (indexer < 0).any(): + raise ValueError("person_spm_unit_id contains values missing from spm_unit_id") + return indexer + + +def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray: + """Rebuild cloned SPM thresholds from donor-half geography. + + The clone half should keep the donor household's geography but use the + current threshold formula rather than a learned QRF output. We infer the + donor-half geographic adjustment from the stored first-half thresholds and + apply that same geography to the corresponding clone-half SPM units. + """ + + person_ages = np.asarray(data["age"][time_period]) + person_spm_unit_ids = np.asarray(data["person_spm_unit_id"][time_period]) + spm_unit_ids = np.asarray(data["spm_unit_id"][time_period]) + tenure_types = np.asarray(data["spm_unit_tenure_type"][time_period]) + stored_thresholds = np.asarray(data["spm_unit_spm_threshold"][time_period], dtype=float) + + n_units = len(spm_unit_ids) + if n_units % 2 != 0: + raise ValueError("Expected cloned SPM units to have an even-length spm_unit_id array") + + unit_indices = _index_into_spm_units(person_spm_unit_ids, spm_unit_ids) + is_adult = person_ages >= 18 + adult_counts = np.zeros(n_units, dtype=np.int32) + child_counts = np.zeros(n_units, dtype=np.int32) + np.add.at(adult_counts, unit_indices, is_adult.astype(np.int32)) + np.add.at(child_counts, unit_indices, (~is_adult).astype(np.int32)) + + thresholds_by_tenure = get_spm_reference_thresholds(time_period) + reference_base = np.array( + [ + thresholds_by_tenure[ + _SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter") + ] + for tenure in tenure_types + ], + dtype=float, + ) + equivalence_scale = spm_equivalence_scale(adult_counts, child_counts) + + n_half = n_units // 2 + donor_denominator = reference_base[:n_half] * equivalence_scale[:n_half] + donor_geoadj = np.divide( + stored_thresholds[:n_half], + donor_denominator, + out=np.ones(n_half, dtype=float), + where=donor_denominator > 0, + ) + geoadj = np.concatenate([donor_geoadj, donor_geoadj]) + rebuilt = reference_base * equivalence_scale * geoadj + return rebuilt.astype(np.float32) + + def _splice_cps_only_predictions( data: dict, predictions: pd.DataFrame, @@ -489,6 +568,14 @@ def generate(self): time_period=self.time_period, dataset_path=str(self.cps.file_path), ) + if "spm_unit_spm_threshold" in new_data: + logger.info("Rebuilding cloned SPM thresholds from donor-half geography") + new_data["spm_unit_spm_threshold"] = { + self.time_period: rebuild_cloned_spm_thresholds( + new_data, + self.time_period, + ) + } new_data = self._rename_imputed_to_inputs(new_data) if _supports_structural_mortgage_inputs(): diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index a44977bdc..75bb66101 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -11,6 +11,7 @@ import pandas as pd import pytest +from policyengine_us_data.datasets.cps.enhanced_cps import initialize_weight_priors from policyengine_us_data.calibration.puf_impute import ( IMPUTED_VARIABLES, OVERRIDDEN_IMPUTED_VARIABLES, @@ -19,6 +20,7 @@ CPS_ONLY_IMPUTED_VARIABLES, CPS_STAGE2_INCOME_PREDICTORS, apply_retirement_constraints, + rebuild_cloned_spm_thresholds, reconcile_ss_subcomponents, ) from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES @@ -110,6 +112,7 @@ def test_pension_income_not_in_cps_only(self): should_not_be_here = { "taxable_private_pension_income", "tax_exempt_private_pension_income", + "spm_unit_spm_threshold", } present = should_not_be_here & set(CPS_ONLY_IMPUTED_VARIABLES) assert present == set(), ( @@ -117,6 +120,49 @@ def test_pension_income_not_in_cps_only(self): ) +class TestDeterministicSpmThresholdRebuild: + def test_rebuild_cloned_spm_thresholds_uses_donor_geoadj(self): + data = { + "age": {2024: np.array([40, 12, 70, 40, 12, 70], dtype=np.int32)}, + "person_spm_unit_id": {2024: np.array([10, 10, 20, 30, 30, 40], dtype=np.int32)}, + "spm_unit_id": {2024: np.array([10, 20, 30, 40], dtype=np.int32)}, + "spm_unit_tenure_type": { + 2024: np.array( + [b"RENTER", b"OWNER_WITHOUT_MORTGAGE", b"RENTER", b"OWNER_WITHOUT_MORTGAGE"] + ) + }, + "spm_unit_spm_threshold": { + 2024: np.array([20_000.0, 15_000.0, 999_999.0, 999_999.0], dtype=np.float64) + }, + } + + rebuilt = rebuild_cloned_spm_thresholds(data, 2024) + + np.testing.assert_allclose(rebuilt[:2], np.array([20_000.0, 15_000.0])) + np.testing.assert_allclose(rebuilt[2:], np.array([20_000.0, 15_000.0])) + + +class TestWeightPriorInitialization: + def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(self): + weights = np.array([1_500.0, 0.0, 625.0, 0.0], dtype=np.float64) + + priors = initialize_weight_priors(weights, seed=123) + + assert np.all(priors > 0) + assert priors[1] < 1e-4 + assert priors[3] < 1e-4 + assert priors[0] == pytest.approx(1_500.0, rel=0.05) + assert priors[2] == pytest.approx(625.0, rel=0.05) + + def test_initialize_weight_priors_is_reproducible(self): + weights = np.array([400.0, 0.0, 100.0], dtype=np.float64) + + priors_a = initialize_weight_priors(weights, seed=77) + priors_b = initialize_weight_priors(weights, seed=77) + + np.testing.assert_allclose(priors_a, priors_b) + + class TestRetirementConstraints: """Post-processing retirement constraints enforce IRS caps.""" From 84e425991c01166ff1406401ce89b21974c1637f Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 09:52:55 -0400 Subject: [PATCH 04/28] Format clone-half threshold fixes --- policyengine_us_data/datasets/cps/extended_cps.py | 12 +++++++----- tests/unit/test_extended_cps.py | 15 ++++++++++++--- 2 files changed, 19 insertions(+), 8 deletions(-) diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index b110b536c..22a443804 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -410,11 +410,15 @@ def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray: person_spm_unit_ids = np.asarray(data["person_spm_unit_id"][time_period]) spm_unit_ids = np.asarray(data["spm_unit_id"][time_period]) tenure_types = np.asarray(data["spm_unit_tenure_type"][time_period]) - stored_thresholds = np.asarray(data["spm_unit_spm_threshold"][time_period], dtype=float) + stored_thresholds = np.asarray( + data["spm_unit_spm_threshold"][time_period], dtype=float + ) n_units = len(spm_unit_ids) if n_units % 2 != 0: - raise ValueError("Expected cloned SPM units to have an even-length spm_unit_id array") + raise ValueError( + "Expected cloned SPM units to have an even-length spm_unit_id array" + ) unit_indices = _index_into_spm_units(person_spm_unit_ids, spm_unit_ids) is_adult = person_ages >= 18 @@ -426,9 +430,7 @@ def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray: thresholds_by_tenure = get_spm_reference_thresholds(time_period) reference_base = np.array( [ - thresholds_by_tenure[ - _SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter") - ] + thresholds_by_tenure[_SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter")] for tenure in tenure_types ], dtype=float, diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index 75bb66101..c278d0ea1 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -124,15 +124,24 @@ class TestDeterministicSpmThresholdRebuild: def test_rebuild_cloned_spm_thresholds_uses_donor_geoadj(self): data = { "age": {2024: np.array([40, 12, 70, 40, 12, 70], dtype=np.int32)}, - "person_spm_unit_id": {2024: np.array([10, 10, 20, 30, 30, 40], dtype=np.int32)}, + "person_spm_unit_id": { + 2024: np.array([10, 10, 20, 30, 30, 40], dtype=np.int32) + }, "spm_unit_id": {2024: np.array([10, 20, 30, 40], dtype=np.int32)}, "spm_unit_tenure_type": { 2024: np.array( - [b"RENTER", b"OWNER_WITHOUT_MORTGAGE", b"RENTER", b"OWNER_WITHOUT_MORTGAGE"] + [ + b"RENTER", + b"OWNER_WITHOUT_MORTGAGE", + b"RENTER", + b"OWNER_WITHOUT_MORTGAGE", + ] ) }, "spm_unit_spm_threshold": { - 2024: np.array([20_000.0, 15_000.0, 999_999.0, 999_999.0], dtype=np.float64) + 2024: np.array( + [20_000.0, 15_000.0, 999_999.0, 999_999.0], dtype=np.float64 + ) }, } From 289bbc7137ac68534ec61250d1323bbb1e91f446 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 09:54:20 -0400 Subject: [PATCH 05/28] Format SPM benchmark and calibration files --- .../calibration/publish_local_area.py | 6 +----- .../calibration_targets/pull_hardcoded_targets.py | 1 + .../test_spm_thresholds.py | 4 +--- policyengine_us_data/utils/spm.py | 15 ++++----------- tests/unit/test_census_spm_housing_targets.py | 14 +++++++++----- validation/benefit_validation.py | 4 +--- 6 files changed, 17 insertions(+), 27 deletions(-) diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py index 4a8bb8a16..bc9f409f0 100644 --- a/policyengine_us_data/calibration/publish_local_area.py +++ b/policyengine_us_data/calibration/publish_local_area.py @@ -490,11 +490,7 @@ def build_h5( spm_unit_geoadj = np.array( [ cd_geoadj_values[active_clone_cds[clone_id]][ - ( - spm_tenure_cloned[spm_unit_index] - .decode() - .lower() - ) + (spm_tenure_cloned[spm_unit_index].decode().lower()) ] for spm_unit_index, clone_id in enumerate(spm_clone_ids) ], diff --git a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py index c1478d957..0503c7b59 100644 --- a/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py +++ b/policyengine_us_data/storage/calibration_targets/pull_hardcoded_targets.py @@ -10,6 +10,7 @@ sources. Housing uses the Census SPM capped subsidy concept, not HUD spending. """ + def get_hard_coded_totals( year: int = 2024, storage_folder=None, diff --git a/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py index 8cd2a06b0..3e5b8bd34 100644 --- a/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py +++ b/policyengine_us_data/tests/test_local_area_calibration/test_spm_thresholds.py @@ -31,9 +31,7 @@ def test_load_cd_geoadj_values_returns_tenure_specific_lookup(monkeypatch): assert geoadj_lookup["101"]["renter"] == pytest.approx(1.2215) assert geoadj_lookup["101"]["owner_with_mortgage"] == pytest.approx(1.217) - assert geoadj_lookup["101"]["owner_without_mortgage"] == pytest.approx( - 1.1615 - ) + assert geoadj_lookup["101"]["owner_without_mortgage"] == pytest.approx(1.1615) def test_calculate_spm_thresholds_vectorized_matches_policyengine_us_future_path(): diff --git a/policyengine_us_data/utils/spm.py b/policyengine_us_data/utils/spm.py index 3aba81693..a2220339b 100644 --- a/policyengine_us_data/utils/spm.py +++ b/policyengine_us_data/utils/spm.py @@ -90,15 +90,12 @@ def spm_equivalence_scale(num_adults, num_children): single_adult_with_children = with_children & (adults <= 1) raw[single_adult_with_children] = ( - 1.0 - + 0.8 - + 0.5 * np.maximum(children[single_adult_with_children] - 1, 0) + 1.0 + 0.8 + 0.5 * np.maximum(children[single_adult_with_children] - 1, 0) ) ** 0.7 multi_adult_with_children = with_children & ~single_adult_with_children raw[multi_adult_with_children] = ( - adults[multi_adult_with_children] - + 0.5 * children[multi_adult_with_children] + adults[multi_adult_with_children] + 0.5 * children[multi_adult_with_children] ) ** 0.7 no_children = has_people & ~with_children @@ -125,16 +122,12 @@ def get_spm_reference_thresholds(year: int) -> dict[str, float]: cpi_u = _get_cpi_u_parameter() factor = float( - cpi_u(f"{year}-02-01") - / cpi_u(f"{LATEST_PUBLISHED_SPM_THRESHOLD_YEAR}-02-01") + cpi_u(f"{year}-02-01") / cpi_u(f"{LATEST_PUBLISHED_SPM_THRESHOLD_YEAR}-02-01") ) latest_thresholds = PUBLISHED_SPM_REFERENCE_THRESHOLDS[ LATEST_PUBLISHED_SPM_THRESHOLD_YEAR ] - return { - tenure: value * factor - for tenure, value in latest_thresholds.items() - } + return {tenure: value * factor for tenure, value in latest_thresholds.items()} def calculate_geoadj_from_rent( diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py index 6693ffe9a..be0098d29 100644 --- a/tests/unit/test_census_spm_housing_targets.py +++ b/tests/unit/test_census_spm_housing_targets.py @@ -35,9 +35,7 @@ def test_get_census_spm_capped_housing_subsidy_total_uses_raw_cps_spm_values( ): _write_census_cps(tmp_path) - total = get_census_spm_capped_housing_subsidy_total( - 2024, storage_folder=tmp_path - ) + total = get_census_spm_capped_housing_subsidy_total(2024, storage_folder=tmp_path) expected = (1_000 * 150 + 0 * 250 + 2_000 * 100) / 100 assert total == expected @@ -147,13 +145,19 @@ def test_get_program_benchmarks_keeps_housing_concepts_separate(monkeypatch): benchmarks = benefit_validation.get_program_benchmarks(2022) - assert benchmarks["housing_spm_capped"]["variable"] == "spm_unit_capped_housing_subsidy" + assert ( + benchmarks["housing_spm_capped"]["variable"] + == "spm_unit_capped_housing_subsidy" + ) assert benchmarks["housing_spm_capped"]["benchmark_total"] == 111 assert benchmarks["housing_spm_capped"]["benchmark_source"] == "Census benchmark" assert benchmarks["housing_assistance_hud_user"]["variable"] == "housing_assistance" assert benchmarks["housing_assistance_hud_user"]["benchmark_total"] == 222 - assert benchmarks["housing_assistance_hud_user"]["benchmark_source"] == "HUD USER benchmark" + assert ( + benchmarks["housing_assistance_hud_user"]["benchmark_source"] + == "HUD USER benchmark" + ) assert ( benchmarks["housing_assistance_hud_user"]["benchmark_participants_millions"] == 3 diff --git a/validation/benefit_validation.py b/validation/benefit_validation.py index 723fe6a08..14360782d 100644 --- a/validation/benefit_validation.py +++ b/validation/benefit_validation.py @@ -87,9 +87,7 @@ def analyze_benefit_underreporting(): weighted_participants = ((benefit > 0) * weight).sum() / 1e6 # millions # Underreporting factor - benchmark_ratio = ( - info["benchmark_total"] / total if total > 0 else np.inf - ) + benchmark_ratio = info["benchmark_total"] / total if total > 0 else np.inf benchmark_participants = info.get("benchmark_participants_millions") participant_ratio = ( weighted_participants / benchmark_participants From 9b7222874afde3f6e1f088442a5679f622e49e21 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 10:44:13 -0400 Subject: [PATCH 06/28] Add changelog fragment for clone-half SPM fixes --- changelog.d/fixed/702.md | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/fixed/702.md diff --git a/changelog.d/fixed/702.md b/changelog.d/fixed/702.md new file mode 100644 index 000000000..6e2bafc59 --- /dev/null +++ b/changelog.d/fixed/702.md @@ -0,0 +1 @@ +Aligned clone-half enhanced CPS generation with the modeled SPM pipeline by rebuilding cloned SPM thresholds deterministically and keeping zero-weight synthetic households near zero in sparse reweighting priors. From 1151faacf82f338150c69baae3120ac9dcc8adcc Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 10:45:54 -0400 Subject: [PATCH 07/28] Retrigger PR checks From 9c01bc787abb8d55bc1591241b9107fda612a81e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 10:46:50 -0400 Subject: [PATCH 08/28] Retrigger PR checks after cancel From 0caed7938e9ee4ff18a5aa16dfcc208fd0038600 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 10:57:01 -0400 Subject: [PATCH 09/28] Fix Towncrier fragment layout --- changelog.d/{fixed/702.md => 702.fixed} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename changelog.d/{fixed/702.md => 702.fixed} (100%) diff --git a/changelog.d/fixed/702.md b/changelog.d/702.fixed similarity index 100% rename from changelog.d/fixed/702.md rename to changelog.d/702.fixed From 67d2af7ed114a4c1495fb49dd07649ff57548bc8 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 11:18:12 -0400 Subject: [PATCH 10/28] Add clone origin diagnostics for ECPS --- .../calibration/puf_impute.py | 22 +++ .../datasets/cps/enhanced_cps.py | 186 ++++++++++++++++++ .../utils/dataset_validation.py | 26 +++ .../test_calibration_puf_impute.py | 27 +++ tests/unit/test_dataset_validation.py | 56 ++++++ .../test_enhanced_cps_clone_diagnostics.py | 37 ++++ 6 files changed, 354 insertions(+) create mode 100644 tests/unit/test_enhanced_cps_clone_diagnostics.py diff --git a/policyengine_us_data/calibration/puf_impute.py b/policyengine_us_data/calibration/puf_impute.py index b87f846f8..a2159fa6e 100644 --- a/policyengine_us_data/calibration/puf_impute.py +++ b/policyengine_us_data/calibration/puf_impute.py @@ -25,6 +25,14 @@ logger = logging.getLogger(__name__) +CLONE_ORIGIN_FLAGS = { + "person": "person_is_puf_clone", + "tax_unit": "tax_unit_is_puf_clone", + "spm_unit": "spm_unit_is_puf_clone", + "family": "family_is_puf_clone", + "household": "household_is_puf_clone", +} + PUF_SUBSAMPLE_TARGET = 20_000 PUF_TOP_PERCENTILE = 99.5 @@ -531,6 +539,20 @@ def _map_to_entity(pred_values, variable_name): time_period: np.concatenate([state_fips, state_fips]).astype(np.int32) } + for entity_key, flag_name in CLONE_ORIGIN_FLAGS.items(): + id_variable = f"{entity_key}_id" + if id_variable not in data: + continue + n_entities = len(data[id_variable][time_period]) + new_data[flag_name] = { + time_period: np.concatenate( + [ + np.zeros(n_entities, dtype=np.int8), + np.ones(n_entities, dtype=np.int8), + ] + ) + } + if y_full: for var in IMPUTED_VARIABLES: if var not in data: diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 275d71e2b..f8cf8fe95 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -1,3 +1,7 @@ +import json +from pathlib import Path + +import h5py from policyengine_core.data import Dataset import pandas as pd from policyengine_us_data.utils import ( @@ -28,6 +32,173 @@ torch = None +def _to_numpy(value) -> np.ndarray: + return np.asarray(getattr(value, "values", value)) + + +def _weighted_share(mask, weights) -> float: + weights = np.asarray(weights, dtype=np.float64) + total_weight = float(weights.sum()) + if total_weight <= 0: + return 0.0 + mask = np.asarray(mask, dtype=bool) + return 100 * float(weights[mask].sum()) / total_weight + + +def compute_clone_diagnostics_summary( + *, + household_is_puf_clone, + household_weight, + person_is_puf_clone, + person_weight, + person_in_poverty, + person_reported_in_poverty, + spm_unit_is_puf_clone, + spm_unit_weight, + spm_unit_capped_work_childcare_expenses, + spm_unit_pre_subsidy_childcare_expenses, + spm_unit_taxes, + spm_unit_market_income, +) -> dict[str, float]: + household_is_puf_clone = np.asarray(household_is_puf_clone, dtype=bool) + household_weight = np.asarray(household_weight, dtype=np.float64) + person_is_puf_clone = np.asarray(person_is_puf_clone, dtype=bool) + person_weight = np.asarray(person_weight, dtype=np.float64) + person_in_poverty = np.asarray(person_in_poverty, dtype=bool) + person_reported_in_poverty = np.asarray(person_reported_in_poverty, dtype=bool) + spm_unit_is_puf_clone = np.asarray(spm_unit_is_puf_clone, dtype=bool) + spm_unit_weight = np.asarray(spm_unit_weight, dtype=np.float64) + capped_childcare = np.asarray( + spm_unit_capped_work_childcare_expenses, dtype=np.float64 + ) + pre_subsidy_childcare = np.asarray( + spm_unit_pre_subsidy_childcare_expenses, dtype=np.float64 + ) + spm_unit_taxes = np.asarray(spm_unit_taxes, dtype=np.float64) + spm_unit_market_income = np.asarray(spm_unit_market_income, dtype=np.float64) + + poor_modeled_only = person_in_poverty & ~person_reported_in_poverty + clone_spm_weight = spm_unit_weight[spm_unit_is_puf_clone].sum() + + return { + "clone_household_weight_share_pct": _weighted_share( + household_is_puf_clone, household_weight + ), + "clone_person_weight_share_pct": _weighted_share( + person_is_puf_clone, person_weight + ), + "clone_poor_modeled_only_person_weight_share_pct": _weighted_share( + person_is_puf_clone & poor_modeled_only, + person_weight, + ), + "poor_modeled_only_within_clone_person_weight_share_pct": ( + 0.0 + if person_weight[person_is_puf_clone].sum() <= 0 + else _weighted_share( + poor_modeled_only[person_is_puf_clone], + person_weight[person_is_puf_clone], + ) + ), + "clone_childcare_exceeds_pre_subsidy_share_pct": ( + 0.0 + if clone_spm_weight <= 0 + else _weighted_share( + capped_childcare[spm_unit_is_puf_clone] + > pre_subsidy_childcare[spm_unit_is_puf_clone] + 1, + spm_unit_weight[spm_unit_is_puf_clone], + ) + ), + "clone_childcare_above_5000_share_pct": ( + 0.0 + if clone_spm_weight <= 0 + else _weighted_share( + capped_childcare[spm_unit_is_puf_clone] > 5_000, + spm_unit_weight[spm_unit_is_puf_clone], + ) + ), + "clone_taxes_exceed_market_income_share_pct": ( + 0.0 + if clone_spm_weight <= 0 + else _weighted_share( + spm_unit_taxes[spm_unit_is_puf_clone] + > spm_unit_market_income[spm_unit_is_puf_clone] + 1, + spm_unit_weight[spm_unit_is_puf_clone], + ) + ), + } + + +def _load_saved_period_array( + file_path: str | Path, + variable_name: str, + period: int, +) -> np.ndarray: + with h5py.File(file_path, "r") as h5_file: + obj = h5_file[variable_name] + if isinstance(obj, h5py.Dataset): + return np.asarray(obj[...]) + period_key = str(period) + if period_key in obj: + return np.asarray(obj[period_key][...]) + if period in obj: + return np.asarray(obj[period][...]) + raise KeyError(f"{variable_name} missing period {period}") + + +def clone_diagnostics_path(file_path: str | Path) -> Path: + return Path(file_path).with_suffix(".clone_diagnostics.json") + + +def write_clone_diagnostics_report(file_path: str | Path, diagnostics: dict) -> Path: + output_path = clone_diagnostics_path(file_path) + output_path.write_text(json.dumps(diagnostics, indent=2, sort_keys=True) + "\n") + return output_path + + +def build_clone_diagnostics_for_saved_dataset( + dataset_cls: Type[Dataset], period: int +) -> dict[str, float]: + from policyengine_us import Microsimulation + + sim = Microsimulation(dataset=dataset_cls) + dataset_path = Path(dataset_cls.file_path) + + person_reported_in_poverty = _to_numpy( + sim.calculate("spm_unit_net_income_reported", period=period, map_to="person") + ) < _to_numpy( + sim.calculate("spm_unit_spm_threshold", period=period, map_to="person") + ) + + return compute_clone_diagnostics_summary( + household_is_puf_clone=_load_saved_period_array( + dataset_path, "household_is_puf_clone", period + ), + household_weight=_to_numpy(sim.calculate("household_weight", period=period)), + person_is_puf_clone=_load_saved_period_array( + dataset_path, "person_is_puf_clone", period + ), + person_weight=_to_numpy(sim.calculate("person_weight", period=period)), + person_in_poverty=_to_numpy( + sim.calculate("person_in_poverty", period=period) + ), + person_reported_in_poverty=person_reported_in_poverty, + spm_unit_is_puf_clone=_load_saved_period_array( + dataset_path, "spm_unit_is_puf_clone", period + ), + spm_unit_weight=_to_numpy(sim.calculate("spm_unit_weight", period=period)), + spm_unit_capped_work_childcare_expenses=_to_numpy( + sim.calculate("spm_unit_capped_work_childcare_expenses", period=period) + ), + spm_unit_pre_subsidy_childcare_expenses=_to_numpy( + sim.calculate("spm_unit_pre_subsidy_childcare_expenses", period=period) + ), + spm_unit_taxes=_to_numpy(sim.calculate("spm_unit_taxes", period=period)), + spm_unit_market_income=_to_numpy( + sim.calculate("spm_unit_market_income", period=period) + ), + ) + + def _get_period_array(period_values: dict, period: int) -> np.ndarray: """Get a period array from a TIME_PERIOD_ARRAYS variable dict.""" value = period_values.get(period) @@ -313,6 +484,21 @@ def generate(self): logging.info("Post-generation weight validation passed") self.save_dataset(data) + try: + diagnostics = build_clone_diagnostics_for_saved_dataset( + type(self), + base_year, + ) + diagnostics["period"] = base_year + output_path = write_clone_diagnostics_report(self.file_path, diagnostics) + logging.info("Saved clone diagnostics to %s", output_path) + logging.info("Clone diagnostics summary: %s", diagnostics) + except Exception: + logging.warning( + "Unable to compute clone diagnostics for %s", + self.file_path, + exc_info=True, + ) class ReweightedCPS_2024(Dataset): diff --git a/policyengine_us_data/utils/dataset_validation.py b/policyengine_us_data/utils/dataset_validation.py index 932003860..58960b472 100644 --- a/policyengine_us_data/utils/dataset_validation.py +++ b/policyengine_us_data/utils/dataset_validation.py @@ -22,6 +22,14 @@ "household": "household_id", } +AUXILIARY_ENTITY_PREFIXES = { + "person_": "person", + "tax_unit_": "tax_unit", + "family_": "family", + "spm_unit_": "spm_unit", + "household_": "household", +} + class DatasetContractError(Exception): """Raised when a built dataset does not match the active country package.""" @@ -127,6 +135,24 @@ def _infer_auxiliary_entity( entity_counts: dict[str, int], file_name: str, ) -> str: + for prefix, entity_key in AUXILIARY_ENTITY_PREFIXES.items(): + if not variable_name.startswith(prefix): + continue + expected_length = entity_counts.get(entity_key) + if expected_length is None: + raise DatasetContractError( + f"{file_name} contains auxiliary variable {variable_name} with a " + f"{entity_key}-scoped prefix, but {ENTITY_ID_VARIABLES[entity_key]} " + "is missing from the dataset." + ) + if actual_length != expected_length: + raise DatasetContractError( + f"{file_name} contains auxiliary variable {variable_name} with " + f"{prefix!r} prefix and expected {entity_key} length " + f"{expected_length}, found {actual_length}." + ) + return entity_key + candidate_entities = [ entity_key for entity_key, entity_count in entity_counts.items() diff --git a/tests/unit/calibration/test_calibration_puf_impute.py b/tests/unit/calibration/test_calibration_puf_impute.py index d803486ee..13a2f5176 100644 --- a/tests/unit/calibration/test_calibration_puf_impute.py +++ b/tests/unit/calibration/test_calibration_puf_impute.py @@ -34,9 +34,11 @@ def _make_mock_data(n_persons=20, n_households=5, time_period=2024): "household_id": {time_period: np.arange(1, n_households + 1)}, "tax_unit_id": {time_period: np.arange(1, n_households + 1)}, "spm_unit_id": {time_period: np.arange(1, n_households + 1)}, + "family_id": {time_period: np.arange(1, n_households + 1)}, "person_household_id": {time_period: household_ids_person}, "person_tax_unit_id": {time_period: tax_unit_ids_person}, "person_spm_unit_id": {time_period: spm_unit_ids_person}, + "person_family_id": {time_period: household_ids_person}, "age": {time_period: ages.astype(np.float32)}, "is_male": {time_period: is_male.astype(np.float32)}, "household_weight": {time_period: np.ones(n_households) * 1000}, @@ -135,6 +137,31 @@ def test_overridden_subset_of_imputed(self): for var in OVERRIDDEN_IMPUTED_VARIABLES: assert var in IMPUTED_VARIABLES + def test_clone_origin_flags_are_added(self): + data = _make_mock_data(n_persons=20, n_households=5) + state_fips = np.array([1, 2, 36, 6, 48]) + + result = puf_clone_dataset( + data=data, + state_fips=state_fips, + time_period=2024, + skip_qrf=True, + ) + + expected_lengths = { + "person_is_puf_clone": 20, + "tax_unit_is_puf_clone": 5, + "spm_unit_is_puf_clone": 5, + "family_is_puf_clone": 5, + "household_is_puf_clone": 5, + } + + for variable_name, half_length in expected_lengths.items(): + values = result[variable_name][2024] + assert values.dtype == np.int8 + np.testing.assert_array_equal(values[:half_length], 0) + np.testing.assert_array_equal(values[half_length:], 1) + class TestStratifiedSubsample: def test_noop_when_small(self): diff --git a/tests/unit/test_dataset_validation.py b/tests/unit/test_dataset_validation.py index c1ee1ea84..af9941577 100644 --- a/tests/unit/test_dataset_validation.py +++ b/tests/unit/test_dataset_validation.py @@ -248,3 +248,59 @@ def test_validate_dataset_contract_rejects_entity_length_mismatch( microsimulation_cls=_FakeMicrosimulation, dataset_loader=lambda path: path, ) + + +def test_validate_dataset_contract_accepts_prefixed_auxiliary_entity( + tmp_path, monkeypatch +): + file_path = tmp_path / "prefixed_aux.h5" + _write_test_h5( + file_path, + { + "person_id": np.array([101], dtype=np.int32), + "household_id": np.array([201], dtype=np.int32), + "household_is_puf_clone": np.array([1], dtype=np.int8), + }, + ) + monkeypatch.setattr( + "policyengine_us_data.utils.dataset_validation.assert_locked_policyengine_us_version", + lambda: PolicyEngineUSBuildInfo(version="1.587.0"), + ) + + summary = validate_dataset_contract( + file_path, + tax_benefit_system=_fake_tax_benefit_system(), + microsimulation_cls=_FakeMicrosimulation, + dataset_loader=lambda path: path, + ) + + assert summary.variable_count == 3 + + +def test_validate_dataset_contract_rejects_prefixed_auxiliary_length_mismatch( + tmp_path, monkeypatch +): + file_path = tmp_path / "prefixed_aux_bad.h5" + _write_test_h5( + file_path, + { + "person_id": np.array([101], dtype=np.int32), + "household_id": np.array([201], dtype=np.int32), + "household_is_puf_clone": np.array([1, 0], dtype=np.int8), + }, + ) + monkeypatch.setattr( + "policyengine_us_data.utils.dataset_validation.assert_locked_policyengine_us_version", + lambda: PolicyEngineUSBuildInfo(version="1.587.0"), + ) + + with pytest.raises( + DatasetContractError, + match="expected household length 1, found 2", + ): + validate_dataset_contract( + file_path, + tax_benefit_system=_fake_tax_benefit_system(), + microsimulation_cls=_FakeMicrosimulation, + dataset_loader=lambda path: path, + ) diff --git a/tests/unit/test_enhanced_cps_clone_diagnostics.py b/tests/unit/test_enhanced_cps_clone_diagnostics.py new file mode 100644 index 000000000..89e2047ef --- /dev/null +++ b/tests/unit/test_enhanced_cps_clone_diagnostics.py @@ -0,0 +1,37 @@ +import pytest + +from policyengine_us_data.datasets.cps.enhanced_cps import ( + compute_clone_diagnostics_summary, +) + + +def test_compute_clone_diagnostics_summary(): + diagnostics = compute_clone_diagnostics_summary( + household_is_puf_clone=[False, True], + household_weight=[9.0, 1.0], + person_is_puf_clone=[False, True, True], + person_weight=[4.0, 3.0, 3.0], + person_in_poverty=[False, True, True], + person_reported_in_poverty=[False, False, True], + spm_unit_is_puf_clone=[False, True, True], + spm_unit_weight=[2.0, 3.0, 5.0], + spm_unit_capped_work_childcare_expenses=[0.0, 6000.0, 7000.0], + spm_unit_pre_subsidy_childcare_expenses=[0.0, 5000.0, 8000.0], + spm_unit_taxes=[100.0, 9000.0, 200.0], + spm_unit_market_income=[1000.0, 8000.0, 1000.0], + ) + + assert diagnostics["clone_household_weight_share_pct"] == pytest.approx(10.0) + assert diagnostics["clone_poor_modeled_only_person_weight_share_pct"] == pytest.approx( + 30.0 + ) + assert diagnostics["poor_modeled_only_within_clone_person_weight_share_pct"] == pytest.approx( + 50.0 + ) + assert diagnostics["clone_childcare_exceeds_pre_subsidy_share_pct"] == pytest.approx( + 37.5 + ) + assert diagnostics["clone_childcare_above_5000_share_pct"] == pytest.approx(100.0) + assert diagnostics["clone_taxes_exceed_market_income_share_pct"] == pytest.approx( + 37.5 + ) From b06a2bd5955b7623d7d5c3178fe22dfeffb2a646 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 11:18:51 -0400 Subject: [PATCH 11/28] Add changelog fragment for clone diagnostics --- changelog.d/703.fixed | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/703.fixed diff --git a/changelog.d/703.fixed b/changelog.d/703.fixed new file mode 100644 index 000000000..b3dd0c7d0 --- /dev/null +++ b/changelog.d/703.fixed @@ -0,0 +1 @@ +Added explicit clone-origin flags to extended/enhanced CPS datasets and saved ECPS clone diagnostics for clone weight share, modeled-only-poor share, and extreme childcare/tax checks. From f908809ae833afdb0d79c75a070d42ef4c191751 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 12:17:40 -0400 Subject: [PATCH 12/28] Format clone diagnostics files --- .../datasets/cps/enhanced_cps.py | 4 +--- .../test_enhanced_cps_clone_diagnostics.py | 18 +++++++++--------- 2 files changed, 10 insertions(+), 12 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index f8cf8fe95..c0c1a42b3 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -178,9 +178,7 @@ def build_clone_diagnostics_for_saved_dataset( dataset_path, "person_is_puf_clone", period ), person_weight=_to_numpy(sim.calculate("person_weight", period=period)), - person_in_poverty=_to_numpy( - sim.calculate("person_in_poverty", period=period) - ), + person_in_poverty=_to_numpy(sim.calculate("person_in_poverty", period=period)), person_reported_in_poverty=person_reported_in_poverty, spm_unit_is_puf_clone=_load_saved_period_array( dataset_path, "spm_unit_is_puf_clone", period diff --git a/tests/unit/test_enhanced_cps_clone_diagnostics.py b/tests/unit/test_enhanced_cps_clone_diagnostics.py index 89e2047ef..fe8704173 100644 --- a/tests/unit/test_enhanced_cps_clone_diagnostics.py +++ b/tests/unit/test_enhanced_cps_clone_diagnostics.py @@ -22,15 +22,15 @@ def test_compute_clone_diagnostics_summary(): ) assert diagnostics["clone_household_weight_share_pct"] == pytest.approx(10.0) - assert diagnostics["clone_poor_modeled_only_person_weight_share_pct"] == pytest.approx( - 30.0 - ) - assert diagnostics["poor_modeled_only_within_clone_person_weight_share_pct"] == pytest.approx( - 50.0 - ) - assert diagnostics["clone_childcare_exceeds_pre_subsidy_share_pct"] == pytest.approx( - 37.5 - ) + assert diagnostics[ + "clone_poor_modeled_only_person_weight_share_pct" + ] == pytest.approx(30.0) + assert diagnostics[ + "poor_modeled_only_within_clone_person_weight_share_pct" + ] == pytest.approx(50.0) + assert diagnostics[ + "clone_childcare_exceeds_pre_subsidy_share_pct" + ] == pytest.approx(37.5) assert diagnostics["clone_childcare_above_5000_share_pct"] == pytest.approx(100.0) assert diagnostics["clone_taxes_exceed_market_income_share_pct"] == pytest.approx( 37.5 From 98a4e4701a2b8c3267e2e20538ad1303d999db62 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 13:40:18 -0400 Subject: [PATCH 13/28] Tighten clone prior and tenure validation --- .../datasets/cps/enhanced_cps.py | 13 +++------ .../datasets/cps/extended_cps.py | 12 ++++++++- tests/unit/test_extended_cps.py | 27 +++++++++++++++++-- 3 files changed, 40 insertions(+), 12 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index e0ce2c208..82bf9cc06 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -32,13 +32,12 @@ def initialize_weight_priors( original_weights: np.ndarray, seed: int = 1456, epsilon: float = 1e-6, - positive_jitter_scale: float = 0.01, ) -> np.ndarray: """Build deterministic positive priors for sparse reweighting. - Original CPS households should keep priors close to their survey - weights. Clone-half households start with zero weight on purpose, so - they should receive only a tiny positive epsilon to keep the log + Original CPS households should keep their survey weights unchanged. + Clone-half households start with zero weight on purpose, so they + should receive only a tiny positive epsilon to keep the log optimization well-defined without giving them a meaningful head start. """ @@ -51,11 +50,7 @@ def initialize_weight_priors( positive_mask = weights > 0 if positive_mask.any(): - jitter = np.maximum( - rng.normal(loc=1.0, scale=positive_jitter_scale, size=positive_mask.sum()), - 0.5, - ) - priors[positive_mask] = np.maximum(weights[positive_mask] * jitter, epsilon) + priors[positive_mask] = weights[positive_mask] zero_mask = ~positive_mask if zero_mask.any(): diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 22a443804..b7867f795 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -338,6 +338,16 @@ def reconcile_ss_subcomponents(predictions, total_ss): } +def _reference_threshold_key_for_tenure(tenure_type) -> str: + reference_key = _SPM_TENURE_TO_REFERENCE_KEY.get(tenure_type) + if reference_key is None: + raise ValueError( + "Unsupported spm_unit_tenure_type for cloned threshold rebuild: " + f"{tenure_type!r}" + ) + return reference_key + + def _apply_post_processing(predictions, X_test, time_period, data): """Apply retirement constraints and SS reconciliation.""" ret_cols = [c for c in predictions.columns if c in _RETIREMENT_VARS] @@ -430,7 +440,7 @@ def rebuild_cloned_spm_thresholds(data: dict, time_period: int) -> np.ndarray: thresholds_by_tenure = get_spm_reference_thresholds(time_period) reference_base = np.array( [ - thresholds_by_tenure[_SPM_TENURE_TO_REFERENCE_KEY.get(tenure, "renter")] + thresholds_by_tenure[_reference_threshold_key_for_tenure(tenure)] for tenure in tenure_types ], dtype=float, diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index c278d0ea1..1c86a3bee 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -150,6 +150,22 @@ def test_rebuild_cloned_spm_thresholds_uses_donor_geoadj(self): np.testing.assert_allclose(rebuilt[:2], np.array([20_000.0, 15_000.0])) np.testing.assert_allclose(rebuilt[2:], np.array([20_000.0, 15_000.0])) + def test_rebuild_cloned_spm_thresholds_rejects_unknown_tenure(self): + data = { + "age": {2024: np.array([40, 12, 40, 12], dtype=np.int32)}, + "person_spm_unit_id": {2024: np.array([10, 10, 20, 20], dtype=np.int32)}, + "spm_unit_id": {2024: np.array([10, 20], dtype=np.int32)}, + "spm_unit_tenure_type": { + 2024: np.array([b"RENTER", b"UNRECOGNIZED"], dtype="|S12") + }, + "spm_unit_spm_threshold": { + 2024: np.array([20_000.0, 20_000.0], dtype=np.float64) + }, + } + + with pytest.raises(ValueError, match="Unsupported spm_unit_tenure_type"): + rebuild_cloned_spm_thresholds(data, 2024) + class TestWeightPriorInitialization: def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(self): @@ -160,8 +176,15 @@ def test_initialize_weight_priors_keeps_zero_weight_records_near_zero(self): assert np.all(priors > 0) assert priors[1] < 1e-4 assert priors[3] < 1e-4 - assert priors[0] == pytest.approx(1_500.0, rel=0.05) - assert priors[2] == pytest.approx(625.0, rel=0.05) + assert priors[0] == pytest.approx(1_500.0) + assert priors[2] == pytest.approx(625.0) + + def test_initialize_weight_priors_preserves_positive_weights_exactly(self): + weights = np.array([1_500.0, 625.0, 42.0], dtype=np.float64) + + priors = initialize_weight_priors(weights, seed=123) + + np.testing.assert_array_equal(priors, weights) def test_initialize_weight_priors_is_reproducible(self): weights = np.array([400.0, 0.0, 100.0], dtype=np.float64) From a8553841d1b0c0ceecfef85dad85bcb0a2123604 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 13:40:39 -0400 Subject: [PATCH 14/28] Harden clone diagnostics reporting --- .../datasets/cps/enhanced_cps.py | 54 ++++++++++++++++--- .../test_enhanced_cps_clone_diagnostics.py | 48 +++++++++++++++++ 2 files changed, 96 insertions(+), 6 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index c0c1a42b3..863865261 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -149,12 +149,43 @@ def clone_diagnostics_path(file_path: str | Path) -> Path: return Path(file_path).with_suffix(".clone_diagnostics.json") +def build_clone_diagnostics_payload( + period_to_diagnostics: dict[int, dict[str, float]], +) -> dict: + if not period_to_diagnostics: + raise ValueError("Expected at least one period of clone diagnostics") + + ordered_periods = sorted(period_to_diagnostics) + if len(ordered_periods) == 1: + period = ordered_periods[0] + diagnostics = dict(period_to_diagnostics[period]) + diagnostics["period"] = int(period) + return diagnostics + + return { + "periods": { + str(period): period_to_diagnostics[period] for period in ordered_periods + } + } + + def write_clone_diagnostics_report(file_path: str | Path, diagnostics: dict) -> Path: output_path = clone_diagnostics_path(file_path) output_path.write_text(json.dumps(diagnostics, indent=2, sort_keys=True) + "\n") return output_path +def refresh_clone_diagnostics_report( + file_path: str | Path, + diagnostics_builder, +) -> Path: + output_path = clone_diagnostics_path(file_path) + if output_path.exists(): + output_path.unlink() + diagnostics = diagnostics_builder() + return write_clone_diagnostics_report(file_path, diagnostics) + + def build_clone_diagnostics_for_saved_dataset( dataset_cls: Type[Dataset], period: int ) -> dict[str, float]: @@ -483,14 +514,25 @@ def generate(self): self.save_dataset(data) try: - diagnostics = build_clone_diagnostics_for_saved_dataset( - type(self), - base_year, + periods = list(range(self.start_year, self.end_year + 1)) + diagnostics_payload = build_clone_diagnostics_payload( + { + period: build_clone_diagnostics_for_saved_dataset( + type(self), + period, + ) + for period in periods + } + ) + output_path = refresh_clone_diagnostics_report( + self.file_path, + lambda: diagnostics_payload, ) - diagnostics["period"] = base_year - output_path = write_clone_diagnostics_report(self.file_path, diagnostics) logging.info("Saved clone diagnostics to %s", output_path) - logging.info("Clone diagnostics summary: %s", diagnostics) + logging.info( + "Clone diagnostics summary: %s", + diagnostics_payload, + ) except Exception: logging.warning( "Unable to compute clone diagnostics for %s", diff --git a/tests/unit/test_enhanced_cps_clone_diagnostics.py b/tests/unit/test_enhanced_cps_clone_diagnostics.py index fe8704173..728f1fc13 100644 --- a/tests/unit/test_enhanced_cps_clone_diagnostics.py +++ b/tests/unit/test_enhanced_cps_clone_diagnostics.py @@ -1,7 +1,12 @@ +from pathlib import Path + import pytest from policyengine_us_data.datasets.cps.enhanced_cps import ( + build_clone_diagnostics_payload, compute_clone_diagnostics_summary, + clone_diagnostics_path, + refresh_clone_diagnostics_report, ) @@ -35,3 +40,46 @@ def test_compute_clone_diagnostics_summary(): assert diagnostics["clone_taxes_exceed_market_income_share_pct"] == pytest.approx( 37.5 ) + + +def test_build_clone_diagnostics_payload_single_period(): + payload = build_clone_diagnostics_payload( + {2024: {"clone_person_weight_share_pct": 12.5}} + ) + + assert payload == { + "period": 2024, + "clone_person_weight_share_pct": 12.5, + } + + +def test_build_clone_diagnostics_payload_multiple_periods(): + payload = build_clone_diagnostics_payload( + { + 2026: {"clone_person_weight_share_pct": 20.0}, + 2024: {"clone_person_weight_share_pct": 10.0}, + } + ) + + assert payload == { + "periods": { + "2024": {"clone_person_weight_share_pct": 10.0}, + "2026": {"clone_person_weight_share_pct": 20.0}, + } + } + + +def test_refresh_clone_diagnostics_report_removes_stale_sidecar_on_failure(tmp_path): + file_path = tmp_path / "enhanced_cps_2024.h5" + file_path.write_text("placeholder") + stale_path = clone_diagnostics_path(file_path) + stale_path.write_text("stale") + + def _raise(): + raise RuntimeError("boom") + + with pytest.raises(RuntimeError, match="boom"): + refresh_clone_diagnostics_report(file_path, _raise) + + assert stale_path == Path(file_path).with_suffix(".clone_diagnostics.json") + assert not stale_path.exists() From aed202ae5dfccbcc05130b31999877057cde9c66 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 21:13:05 -0400 Subject: [PATCH 15/28] Fix local-area calibration imports and takeup args --- policyengine_us_data/calibration/publish_local_area.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/policyengine_us_data/calibration/publish_local_area.py b/policyengine_us_data/calibration/publish_local_area.py index b08c59232..0429c933f 100644 --- a/policyengine_us_data/calibration/publish_local_area.py +++ b/policyengine_us_data/calibration/publish_local_area.py @@ -25,6 +25,8 @@ ) from policyengine_us_data.calibration.block_assignment import ( derive_geography_from_blocks, +) +from policyengine_us_data.calibration.county_assignment import ( get_county_filter_probability, ) from policyengine_us_data.calibration.clone_and_assign import ( @@ -544,6 +546,7 @@ def build_h5( hh_blocks=active_blocks, hh_state_fips=hh_state_fips, hh_ids=original_hh_ids, + hh_clone_indices=active_geo.astype(np.int64), entity_hh_indices=entity_hh_indices, entity_counts=entity_counts, time_period=time_period, From e2ae174debe40c7709480ed2180aeb70f1aadbf2 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 21:46:56 -0400 Subject: [PATCH 16/28] Harden housing target test stub --- tests/unit/test_census_spm_housing_targets.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py index be0098d29..6befed4ed 100644 --- a/tests/unit/test_census_spm_housing_targets.py +++ b/tests/unit/test_census_spm_housing_targets.py @@ -1,3 +1,4 @@ +import importlib import sys import types @@ -57,10 +58,18 @@ class DummyMicrosimulation: def __init__(self, dataset): self.default_calculation_period = 2024 + policyengine_us = importlib.import_module("policyengine_us") monkeypatch.setitem( sys.modules, "policyengine_us", - types.SimpleNamespace(Microsimulation=DummyMicrosimulation), + types.SimpleNamespace( + **{ + name: getattr(policyengine_us, name) + for name in dir(policyengine_us) + if not name.startswith("__") and name != "Microsimulation" + }, + Microsimulation=DummyMicrosimulation, + ), ) monkeypatch.setattr( etl_national_targets, From dacb1f6d9d3be8c0c442c7143e649a51ec11cd54 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 21:49:08 -0400 Subject: [PATCH 17/28] Harden clone diagnostics refresh --- .../datasets/cps/enhanced_cps.py | 40 +++++++++----- .../test_enhanced_cps_clone_diagnostics.py | 53 +++++++++++++++++++ 2 files changed, 80 insertions(+), 13 deletions(-) diff --git a/policyengine_us_data/datasets/cps/enhanced_cps.py b/policyengine_us_data/datasets/cps/enhanced_cps.py index 863865261..656fce122 100644 --- a/policyengine_us_data/datasets/cps/enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/enhanced_cps.py @@ -186,6 +186,29 @@ def refresh_clone_diagnostics_report( return write_clone_diagnostics_report(file_path, diagnostics) +def save_clone_diagnostics_report( + dataset_cls: Type[Dataset], + *, + start_year: int, + end_year: int, +) -> tuple[Path, dict]: + periods = list(range(start_year, end_year + 1)) + output_path = refresh_clone_diagnostics_report( + dataset_cls.file_path, + lambda: build_clone_diagnostics_payload( + { + period: build_clone_diagnostics_for_saved_dataset( + dataset_cls, + period, + ) + for period in periods + } + ), + ) + diagnostics_payload = json.loads(output_path.read_text()) + return output_path, diagnostics_payload + + def build_clone_diagnostics_for_saved_dataset( dataset_cls: Type[Dataset], period: int ) -> dict[str, float]: @@ -514,19 +537,10 @@ def generate(self): self.save_dataset(data) try: - periods = list(range(self.start_year, self.end_year + 1)) - diagnostics_payload = build_clone_diagnostics_payload( - { - period: build_clone_diagnostics_for_saved_dataset( - type(self), - period, - ) - for period in periods - } - ) - output_path = refresh_clone_diagnostics_report( - self.file_path, - lambda: diagnostics_payload, + output_path, diagnostics_payload = save_clone_diagnostics_report( + type(self), + start_year=self.start_year, + end_year=self.end_year, ) logging.info("Saved clone diagnostics to %s", output_path) logging.info( diff --git a/tests/unit/test_enhanced_cps_clone_diagnostics.py b/tests/unit/test_enhanced_cps_clone_diagnostics.py index 728f1fc13..a0261aac3 100644 --- a/tests/unit/test_enhanced_cps_clone_diagnostics.py +++ b/tests/unit/test_enhanced_cps_clone_diagnostics.py @@ -7,6 +7,7 @@ compute_clone_diagnostics_summary, clone_diagnostics_path, refresh_clone_diagnostics_report, + save_clone_diagnostics_report, ) @@ -83,3 +84,55 @@ def _raise(): assert stale_path == Path(file_path).with_suffix(".clone_diagnostics.json") assert not stale_path.exists() + + +def test_save_clone_diagnostics_report_removes_stale_sidecar_on_failure( + tmp_path, monkeypatch +): + class DummyDataset: + file_path = tmp_path / "enhanced_cps_2024.h5" + + DummyDataset.file_path.write_text("placeholder") + stale_path = clone_diagnostics_path(DummyDataset.file_path) + stale_path.write_text("stale") + + monkeypatch.setattr( + "policyengine_us_data.datasets.cps.enhanced_cps.build_clone_diagnostics_for_saved_dataset", + lambda dataset_cls, period: (_ for _ in ()).throw(RuntimeError("boom")), + ) + + with pytest.raises(RuntimeError, match="boom"): + save_clone_diagnostics_report( + DummyDataset, + start_year=2024, + end_year=2024, + ) + + assert not stale_path.exists() + + +def test_save_clone_diagnostics_report_writes_fresh_payload(tmp_path, monkeypatch): + class DummyDataset: + file_path = tmp_path / "enhanced_cps_2024.h5" + + DummyDataset.file_path.write_text("placeholder") + + monkeypatch.setattr( + "policyengine_us_data.datasets.cps.enhanced_cps.build_clone_diagnostics_for_saved_dataset", + lambda dataset_cls, period: {"clone_person_weight_share_pct": float(period)}, + ) + + output_path, payload = save_clone_diagnostics_report( + DummyDataset, + start_year=2024, + end_year=2025, + ) + + assert output_path == clone_diagnostics_path(DummyDataset.file_path) + assert payload == { + "periods": { + "2024": {"clone_person_weight_share_pct": 2024.0}, + "2025": {"clone_person_weight_share_pct": 2025.0}, + } + } + assert output_path.exists() From f5750e3215d2292596e97f0be7e0577f5811f291 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 21:51:26 -0400 Subject: [PATCH 18/28] Trigger PR checks From f9162bd0846e2e8be15c55d4e309efb48426206e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 22:15:41 -0400 Subject: [PATCH 19/28] Use Census childcare capping formula --- policyengine_us_data/datasets/cps/cps.py | 1 + .../datasets/cps/extended_cps.py | 151 +++++++++++------- tests/unit/test_extended_cps.py | 43 ++--- 3 files changed, 116 insertions(+), 79 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 5d5774eea..08301f167 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -494,6 +494,7 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int): cps["weekly_hours_worked"] = person.HRSWK cps["hours_worked_last_week"] = person.A_HRS1 + cps["weeks_worked"] = np.clip(person.WKSWORK, 0, 52) cps["taxable_interest_income"] = person.INT_VAL * (p["taxable_interest_fraction"]) cps["tax_exempt_interest_income"] = person.INT_VAL * ( diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index c840f29af..760ac5df9 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -26,6 +26,12 @@ logger = logging.getLogger(__name__) +# Census SPM technical documentation, "SPM Work Expense Values". +# These are weekly work expense amounts applied to each adult earner. +SPM_WEEKLY_WORK_EXPENSE_BY_YEAR = { + 2024: 41.17, +} + def _supports_structural_mortgage_inputs() -> bool: return has_policyengine_us_variables(*STRUCTURAL_MORTGAGE_VARIABLES) @@ -325,69 +331,101 @@ def reconcile_ss_subcomponents(predictions, total_ss): } +def _get_spm_weekly_work_expense(year: int) -> float: + try: + return SPM_WEEKLY_WORK_EXPENSE_BY_YEAR[year] + except KeyError as exc: + raise ValueError( + f"No Census SPM weekly work expense value configured for {year}" + ) from exc + + +def _calculate_clone_work_expenses( + clone_person_data: pd.DataFrame, + clone_spm_unit_ids: np.ndarray, +) -> np.ndarray: + clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) + if clone_person_data.empty: + return np.zeros(len(clone_spm_unit_ids), dtype=float) + + adult_earners = clone_person_data.loc[ + (clone_person_data["age"] >= 18) & (clone_person_data["earnings"] > 0), + ["spm_unit_id", "weeks_worked"], + ].copy() + if adult_earners.empty: + return np.zeros(len(clone_spm_unit_ids), dtype=float) + + adult_earners["weeks_worked"] = adult_earners["weeks_worked"].clip( + lower=0, upper=52 + ) + return ( + adult_earners.groupby("spm_unit_id")["weeks_worked"] + .sum() + .reindex( + clone_spm_unit_ids, + fill_value=0.0, + ) + .to_numpy(dtype=float) + ) + + +def _calculate_clone_lower_earner_caps( + clone_person_data: pd.DataFrame, + clone_spm_unit_ids: np.ndarray, +) -> np.ndarray: + clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) + if clone_person_data.empty: + return np.zeros(len(clone_spm_unit_ids), dtype=float) + + head_or_spouse = clone_person_data.loc[ + clone_person_data["is_parent_proxy"].astype(bool), + ["spm_unit_id", "earnings"], + ].copy() + if head_or_spouse.empty: + return np.zeros(len(clone_spm_unit_ids), dtype=float) + + head_or_spouse["earnings"] = head_or_spouse["earnings"].clip(lower=0.0) + lower_earner_caps = head_or_spouse.groupby("spm_unit_id")["earnings"].agg( + lambda values: float(values.min()) if len(values) > 1 else float(values.iloc[0]) + ) + return lower_earner_caps.reindex( + clone_spm_unit_ids, + fill_value=0.0, + ).to_numpy(dtype=float) + + def derive_clone_capped_childcare_expenses( - donor_pre_subsidy: np.ndarray, - donor_capped: np.ndarray, clone_pre_subsidy: np.ndarray, clone_person_data: pd.DataFrame, clone_spm_unit_ids: np.ndarray, + time_period: int, ) -> np.ndarray: - """Derive clone-half capped childcare from clone inputs. + """Derive clone-half capped work and childcare expenses from clone inputs. The CPS provides both pre-subsidy childcare and the SPM-specific - capped childcare deduction. For the clone half, we impute only the - pre-subsidy amount, then deterministically rebuild the capped amount - instead of letting a second QRF predict it independently. - - We preserve the donor's observed capping share while also respecting - the clone's own earnings cap. This keeps the clone-half value - consistent with pre-subsidy childcare and avoids impossible outputs - such as capped childcare exceeding pre-subsidy childcare. + capped work-and-childcare deduction. For the clone half, we impute + only the pre-subsidy childcare amount, then deterministically rebuild + the capped value using the Census SPM rule: + work expenses plus childcare, capped at the lower earner's earnings + for the reference person and spouse/partner. """ - donor_pre_subsidy = np.asarray(donor_pre_subsidy, dtype=float) - donor_capped = np.asarray(donor_capped, dtype=float) clone_pre_subsidy = np.asarray(clone_pre_subsidy, dtype=float) - clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) - - donor_cap_share = np.divide( - donor_capped, - donor_pre_subsidy, - out=np.zeros_like(donor_capped, dtype=float), - where=donor_pre_subsidy > 0, + weekly_work_expense = _get_spm_weekly_work_expense(time_period) + annual_work_expenses = ( + _calculate_clone_work_expenses( + clone_person_data=clone_person_data, + clone_spm_unit_ids=clone_spm_unit_ids, + ) + * weekly_work_expense + ) + lower_earner_cap = _calculate_clone_lower_earner_caps( + clone_person_data=clone_person_data, + clone_spm_unit_ids=clone_spm_unit_ids, ) - donor_cap_share = np.clip(donor_cap_share, 0.0, 1.0) - capped_from_share = np.maximum(clone_pre_subsidy, 0.0) * donor_cap_share - - if clone_person_data.empty: - earnings_cap = np.zeros(len(clone_spm_unit_ids), dtype=float) - else: - eligible = clone_person_data["is_parent_proxy"].astype(bool) - parent_rows = clone_person_data.loc[ - eligible, ["spm_unit_id", "age", "earnings"] - ].copy() - if parent_rows.empty: - earnings_cap = np.zeros(len(clone_spm_unit_ids), dtype=float) - else: - parent_rows["earnings"] = parent_rows["earnings"].clip(lower=0.0) - parent_rows["age_rank"] = parent_rows.groupby("spm_unit_id")["age"].rank( - method="first", ascending=False - ) - top_two = parent_rows[parent_rows["age_rank"] <= 2].sort_values( - ["spm_unit_id", "age_rank"] - ) - earnings_cap_by_unit = top_two.groupby("spm_unit_id")["earnings"].agg( - lambda values: ( - float(values.iloc[0]) - if len(values) == 1 - else float(np.minimum(values.iloc[0], values.iloc[1])) - ) - ) - earnings_cap = earnings_cap_by_unit.reindex( - clone_spm_unit_ids, fill_value=0.0 - ).to_numpy(dtype=float) - return np.minimum(capped_from_share, earnings_cap) + combined_expenses = np.maximum(clone_pre_subsidy, 0.0) + annual_work_expenses + return np.minimum(combined_expenses, lower_earner_cap) def _rebuild_clone_capped_childcare_expenses( @@ -421,26 +459,19 @@ def _rebuild_clone_capped_childcare_expenses( data["employment_income"][time_period][n_persons_half:] + data["self_employment_income"][time_period][n_persons_half:] ), + "weeks_worked": data["weeks_worked"][time_period][n_persons_half:], } ) - - donor_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][ - :n_spm_units_half - ] - donor_capped = data["spm_unit_capped_work_childcare_expenses"][time_period][ - :n_spm_units_half - ] clone_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][ n_spm_units_half: ] clone_spm_unit_ids = data["spm_unit_id"][time_period][n_spm_units_half:] return derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, clone_pre_subsidy=clone_pre_subsidy, clone_person_data=clone_person_data, clone_spm_unit_ids=clone_spm_unit_ids, + time_period=time_period, ) diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index e32172db2..a6ea8b654 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -126,57 +126,62 @@ def test_capped_childcare_not_in_cps_only(self): class TestCloneChildcareDerivation: - """Clone-half capped childcare should be derived deterministically.""" + """Clone-half capped work-and-childcare expenses should be deterministic.""" - def test_caps_at_pre_subsidy_and_clone_earnings(self): - donor_pre_subsidy = np.array([10000.0, 4000.0, 6000.0]) - donor_capped = np.array([4000.0, 4000.0, 0.0]) - clone_pre_subsidy = np.array([12000.0, 5000.0, 3000.0]) + def test_caps_combined_work_and_childcare_at_lower_earner(self): + clone_pre_subsidy = np.array([1200.0, 5000.0, 3000.0]) person_data = pd.DataFrame( { "spm_unit_id": [1, 1, 2, 2, 3], "age": [40, 38, 35, 33, 29], "is_parent_proxy": [True, True, True, True, True], "earnings": [9000.0, 3000.0, 1500.0, 0.0, 2000.0], + "weeks_worked": [10.0, 20.0, 30.0, 5.0, 15.0], } ) result = derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, clone_pre_subsidy=clone_pre_subsidy, clone_person_data=person_data, clone_spm_unit_ids=np.array([1, 2, 3]), + time_period=2024, ) - np.testing.assert_allclose(result, np.array([3000.0, 0.0, 0.0])) + np.testing.assert_allclose( + result, + np.array( + [ + 2435.1, # 1200 childcare + (10 + 20) * 41.17 work expenses + 0.0, # Two-parent unit capped by the lower earner's zero earnings + 2000.0, # Single proxy unit capped at the proxy's earnings + ] + ), + rtol=0, + atol=1e-6, + ) - def test_uses_single_parent_earnings_cap_for_single_proxy_units(self): - donor_pre_subsidy = np.array([4000.0]) - donor_capped = np.array([4000.0]) - clone_pre_subsidy = np.array([6000.0]) + def test_includes_work_expenses_even_without_childcare(self): + clone_pre_subsidy = np.array([0.0]) person_data = pd.DataFrame( { "spm_unit_id": [10], "age": [31], "is_parent_proxy": [True], "earnings": [2500.0], + "weeks_worked": [12.0], } ) result = derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, clone_pre_subsidy=clone_pre_subsidy, clone_person_data=person_data, clone_spm_unit_ids=np.array([10]), + time_period=2024, ) - np.testing.assert_allclose(result, np.array([2500.0])) + np.testing.assert_allclose(result, np.array([494.04]), rtol=0, atol=1e-6) def test_falls_back_to_zero_without_parent_proxies(self): - donor_pre_subsidy = np.array([3000.0]) - donor_capped = np.array([2000.0]) clone_pre_subsidy = np.array([3000.0]) person_data = pd.DataFrame( { @@ -184,15 +189,15 @@ def test_falls_back_to_zero_without_parent_proxies(self): "age": [12, 9], "is_parent_proxy": [False, False], "earnings": [0.0, 0.0], + "weeks_worked": [0.0, 0.0], } ) result = derive_clone_capped_childcare_expenses( - donor_pre_subsidy=donor_pre_subsidy, - donor_capped=donor_capped, clone_pre_subsidy=clone_pre_subsidy, clone_person_data=person_data, clone_spm_unit_ids=np.array([20]), + time_period=2024, ) np.testing.assert_allclose(result, np.array([0.0])) From fe77557976f369b505db2ce3d98ae40bed7f8255 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 22:56:15 -0400 Subject: [PATCH 20/28] Add childcare formula changelog fragment --- changelog.d/705.fixed | 1 + 1 file changed, 1 insertion(+) create mode 100644 changelog.d/705.fixed diff --git a/changelog.d/705.fixed b/changelog.d/705.fixed new file mode 100644 index 000000000..4c60c9bff --- /dev/null +++ b/changelog.d/705.fixed @@ -0,0 +1 @@ +Use Census work-and-childcare capping inputs for clone-half SPM childcare expenses instead of donor capping shares. From 1f518c9381f0b489263228031f9baeeab8dd96f2 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Wed, 8 Apr 2026 23:06:09 -0400 Subject: [PATCH 21/28] Upload clone diagnostics and preserve clone flags --- .../datasets/cps/small_enhanced_cps.py | 83 +++++++++++++++-- .../storage/upload_completed_datasets.py | 2 + tests/unit/test_small_enhanced_cps.py | 89 +++++++++++++++++++ tests/unit/test_upload_completed_datasets.py | 77 ++++++++++++++++ 4 files changed, 243 insertions(+), 8 deletions(-) create mode 100644 tests/unit/test_small_enhanced_cps.py diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index a15080321..e825d080b 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -1,14 +1,81 @@ +import h5py +import logging import os -import pandas as pd import numpy as np -import h5py - +import pandas as pd from policyengine_us import Microsimulation -from policyengine_us_data.datasets import EnhancedCPS_2024 -from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_core.enums import Enum from policyengine_core.data.dataset import Dataset -import logging +from policyengine_us_data.calibration.puf_impute import CLONE_ORIGIN_FLAGS +from policyengine_us_data.datasets import EnhancedCPS_2024 +from policyengine_us_data.storage import STORAGE_FOLDER + + +def _load_saved_period_array( + h5_file: h5py.File, variable_name: str, period: int +) -> np.ndarray: + obj = h5_file[variable_name] + if isinstance(obj, h5py.Dataset): + return np.asarray(obj[...]) + + period_key = str(period) + if period_key in obj: + return np.asarray(obj[period_key][...]) + if period in obj: + return np.asarray(obj[period][...]) + + raise KeyError(f"{variable_name} missing period {period}") + + +def _attach_clone_origin_flags( + data: dict[str, dict[int, np.ndarray]], + source_dataset_path, +) -> None: + with h5py.File(source_dataset_path, "r") as source_h5: + for entity_key, flag_name in CLONE_ORIGIN_FLAGS.items(): + id_variable = f"{entity_key}_id" + if id_variable not in data: + raise KeyError( + f"Expected {id_variable} in sampled small ECPS data " + f"before attaching {flag_name}" + ) + + source_ids_by_period = {} + source_flags_by_period = {} + for period_key in data[id_variable]: + period = int(period_key) + source_ids = _load_saved_period_array(source_h5, id_variable, period) + source_flags = _load_saved_period_array(source_h5, flag_name, period) + if len(source_ids) != len(source_flags): + raise ValueError( + f"{flag_name} length {len(source_flags)} does not match " + f"{id_variable} length {len(source_ids)} for period {period}" + ) + source_ids_by_period[period_key] = source_ids + source_flags_by_period[period_key] = source_flags + + data[flag_name] = {} + for period_key, sampled_ids in data[id_variable].items(): + source_ids = source_ids_by_period[period_key] + source_flags = source_flags_by_period[period_key] + flag_lookup = { + int(entity_id): np.int8(flag_value) + for entity_id, flag_value in zip(source_ids, source_flags) + } + sampled_ids = np.asarray(sampled_ids) + missing_ids = [ + int(entity_id) + for entity_id in sampled_ids + if int(entity_id) not in flag_lookup + ] + if missing_ids: + raise KeyError( + f"Missing {flag_name} values for sampled IDs {missing_ids[:5]}" + ) + data[flag_name][period_key] = np.asarray( + [flag_lookup[int(entity_id)] for entity_id in sampled_ids], + dtype=np.int8, + ) def create_small_ecps(): @@ -51,6 +118,8 @@ def create_small_ecps(): if len(data[variable]) == 0: del data[variable] + _attach_clone_origin_flags(data, EnhancedCPS_2024.file_path) + with h5py.File(STORAGE_FOLDER / "small_enhanced_cps_2024.h5", "w") as f: for variable, periods in data.items(): grp = f.create_group(variable) @@ -65,7 +134,6 @@ def create_sparse_ecps(): ecps = EnhancedCPS_2024() h5 = ecps.load() sparse_weights = h5["household_weight"][str(time_period)][:] - hh_ids = h5["household_id"][str(time_period)][:] h5.close() template_sim = Microsimulation( @@ -78,7 +146,6 @@ def create_sparse_ecps(): household_weight_column = f"household_weight__{time_period}" df_household_id_column = f"household_id__{time_period}" - df_person_id_column = f"person_id__{time_period}" # Group by household ID and get the first entry for each group h_df = df.groupby(df_household_id_column).first() diff --git a/policyengine_us_data/storage/upload_completed_datasets.py b/policyengine_us_data/storage/upload_completed_datasets.py index a21a94b3c..e88504ab2 100644 --- a/policyengine_us_data/storage/upload_completed_datasets.py +++ b/policyengine_us_data/storage/upload_completed_datasets.py @@ -5,6 +5,7 @@ from policyengine_us_data.datasets import EnhancedCPS_2024 from policyengine_us_data.datasets.cps.cps import CPS_2024 +from policyengine_us_data.datasets.cps.enhanced_cps import clone_diagnostics_path from policyengine_us_data.storage import STORAGE_FOLDER from policyengine_us_data.utils.data_upload import upload_data_files from policyengine_us_data.utils.dataset_validation import ( @@ -187,6 +188,7 @@ def upload_datasets(require_enhanced_cps: bool = True): ] enhanced_files = [ EnhancedCPS_2024.file_path, + clone_diagnostics_path(EnhancedCPS_2024.file_path), STORAGE_FOLDER / "small_enhanced_cps_2024.h5", ] if require_enhanced_cps: diff --git a/tests/unit/test_small_enhanced_cps.py b/tests/unit/test_small_enhanced_cps.py new file mode 100644 index 000000000..f9a926831 --- /dev/null +++ b/tests/unit/test_small_enhanced_cps.py @@ -0,0 +1,89 @@ +import h5py +import numpy as np +import pytest + +from policyengine_us_data.datasets.cps.small_enhanced_cps import ( + _attach_clone_origin_flags, +) + + +def _write_period_group( + h5_file, variable_name: str, values_by_period: dict[int, np.ndarray] +): + group = h5_file.create_group(variable_name) + for period, values in values_by_period.items(): + group.create_dataset(str(period), data=values) + + +def test_attach_clone_origin_flags_maps_sampled_entity_ids(tmp_path): + source_path = tmp_path / "enhanced_cps_2024.h5" + with h5py.File(source_path, "w") as h5_file: + _write_period_group(h5_file, "person_id", {2024: np.array([11, 12, 13])}) + _write_period_group( + h5_file, "person_is_puf_clone", {2024: np.array([0, 1, 0], dtype=np.int8)} + ) + _write_period_group(h5_file, "tax_unit_id", {2024: np.array([21, 22])}) + _write_period_group( + h5_file, "tax_unit_is_puf_clone", {2024: np.array([1, 0], dtype=np.int8)} + ) + _write_period_group(h5_file, "spm_unit_id", {2024: np.array([31, 32])}) + _write_period_group( + h5_file, "spm_unit_is_puf_clone", {2024: np.array([0, 1], dtype=np.int8)} + ) + _write_period_group(h5_file, "family_id", {2024: np.array([41, 42])}) + _write_period_group( + h5_file, "family_is_puf_clone", {2024: np.array([1, 1], dtype=np.int8)} + ) + _write_period_group(h5_file, "household_id", {2024: np.array([51, 52])}) + _write_period_group( + h5_file, "household_is_puf_clone", {2024: np.array([0, 1], dtype=np.int8)} + ) + + sampled_data = { + "person_id": {2024: np.array([12, 11], dtype=np.int32)}, + "tax_unit_id": {2024: np.array([22, 21], dtype=np.int32)}, + "spm_unit_id": {2024: np.array([32, 31], dtype=np.int32)}, + "family_id": {2024: np.array([41, 42], dtype=np.int32)}, + "household_id": {2024: np.array([52, 51], dtype=np.int32)}, + } + + _attach_clone_origin_flags(sampled_data, source_path) + + assert sampled_data["person_is_puf_clone"][2024].tolist() == [1, 0] + assert sampled_data["tax_unit_is_puf_clone"][2024].tolist() == [0, 1] + assert sampled_data["spm_unit_is_puf_clone"][2024].tolist() == [1, 0] + assert sampled_data["family_is_puf_clone"][2024].tolist() == [1, 1] + assert sampled_data["household_is_puf_clone"][2024].tolist() == [1, 0] + + +def test_attach_clone_origin_flags_rejects_missing_sampled_id(tmp_path): + source_path = tmp_path / "enhanced_cps_2024.h5" + with h5py.File(source_path, "w") as h5_file: + for entity_name, id_base in [ + ("person", 10), + ("tax_unit", 20), + ("spm_unit", 30), + ("family", 40), + ("household", 50), + ]: + _write_period_group( + h5_file, + f"{entity_name}_id", + {2024: np.array([id_base], dtype=np.int32)}, + ) + _write_period_group( + h5_file, + f"{entity_name}_is_puf_clone", + {2024: np.array([0], dtype=np.int8)}, + ) + + sampled_data = { + "person_id": {2024: np.array([999], dtype=np.int32)}, + "tax_unit_id": {2024: np.array([20], dtype=np.int32)}, + "spm_unit_id": {2024: np.array([30], dtype=np.int32)}, + "family_id": {2024: np.array([40], dtype=np.int32)}, + "household_id": {2024: np.array([50], dtype=np.int32)}, + } + + with pytest.raises(KeyError, match="person_is_puf_clone"): + _attach_clone_origin_flags(sampled_data, source_path) diff --git a/tests/unit/test_upload_completed_datasets.py b/tests/unit/test_upload_completed_datasets.py index 7a602c340..34265ec18 100644 --- a/tests/unit/test_upload_completed_datasets.py +++ b/tests/unit/test_upload_completed_datasets.py @@ -7,6 +7,7 @@ import policyengine_us_data.storage.upload_completed_datasets as upload_module from policyengine_us_data.storage.upload_completed_datasets import ( DatasetValidationError, + upload_datasets, validate_dataset, ) import policyengine_us_data.utils.dataset_validation as _dv_mod @@ -153,3 +154,79 @@ def test_validate_dataset_infers_time_period_for_flat_h5(tmp_path, monkeypatch): validate_dataset(file_path) assert _TimePeriodCheckingAggregateMicrosimulation.last_dataset.time_period == 2024 + + +def test_upload_datasets_includes_clone_diagnostics_sidecar(tmp_path, monkeypatch): + storage_folder = tmp_path / "storage" + calibration_folder = storage_folder / "calibration" + calibration_folder.mkdir(parents=True) + + cps_path = storage_folder / "cps_2024.h5" + enhanced_path = storage_folder / "enhanced_cps_2024.h5" + diagnostics_path = enhanced_path.with_suffix(".clone_diagnostics.json") + small_path = storage_folder / "small_enhanced_cps_2024.h5" + policy_db = calibration_folder / "policy_data.db" + + for path in [cps_path, enhanced_path, diagnostics_path, small_path, policy_db]: + path.write_text("placeholder") + + monkeypatch.setattr( + upload_module, + "CPS_2024", + SimpleNamespace(file_path=cps_path), + ) + monkeypatch.setattr( + upload_module, + "EnhancedCPS_2024", + SimpleNamespace(file_path=enhanced_path), + ) + monkeypatch.setattr(upload_module, "STORAGE_FOLDER", storage_folder) + monkeypatch.setattr(upload_module, "validate_dataset", lambda file_path: None) + + uploaded_files = [] + + def _capture_upload(*, files, **kwargs): + uploaded_files.extend(files) + + monkeypatch.setattr(upload_module, "upload_data_files", _capture_upload) + + upload_datasets(require_enhanced_cps=True) + + assert uploaded_files == [ + cps_path, + policy_db, + enhanced_path, + diagnostics_path, + small_path, + ] + + +def test_upload_datasets_requires_clone_diagnostics_sidecar(tmp_path, monkeypatch): + storage_folder = tmp_path / "storage" + calibration_folder = storage_folder / "calibration" + calibration_folder.mkdir(parents=True) + + cps_path = storage_folder / "cps_2024.h5" + enhanced_path = storage_folder / "enhanced_cps_2024.h5" + small_path = storage_folder / "small_enhanced_cps_2024.h5" + policy_db = calibration_folder / "policy_data.db" + + for path in [cps_path, enhanced_path, small_path, policy_db]: + path.write_text("placeholder") + + monkeypatch.setattr( + upload_module, + "CPS_2024", + SimpleNamespace(file_path=cps_path), + ) + monkeypatch.setattr( + upload_module, + "EnhancedCPS_2024", + SimpleNamespace(file_path=enhanced_path), + ) + monkeypatch.setattr(upload_module, "STORAGE_FOLDER", storage_folder) + monkeypatch.setattr(upload_module, "validate_dataset", lambda file_path: None) + monkeypatch.setattr(upload_module, "upload_data_files", lambda **kwargs: None) + + with pytest.raises(FileNotFoundError, match="clone_diagnostics"): + upload_datasets(require_enhanced_cps=True) From dd75f1d97eecb1f193e95a6feb7d366ba99a34cb Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 06:59:25 -0400 Subject: [PATCH 22/28] Populate childcare formula inputs from CPS --- .../datasets/cps/extended_cps.py | 168 ------------------ tests/unit/test_extended_cps.py | 82 +-------- tests/unit/test_weeks_worked.py | 41 +++++ 3 files changed, 42 insertions(+), 249 deletions(-) create mode 100644 tests/unit/test_weeks_worked.py diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index 760ac5df9..310d0a072 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -26,12 +26,6 @@ logger = logging.getLogger(__name__) -# Census SPM technical documentation, "SPM Work Expense Values". -# These are weekly work expense amounts applied to each adult earner. -SPM_WEEKLY_WORK_EXPENSE_BY_YEAR = { - 2024: 41.17, -} - def _supports_structural_mortgage_inputs() -> bool: return has_policyengine_us_variables(*STRUCTURAL_MORTGAGE_VARIABLES) @@ -331,150 +325,6 @@ def reconcile_ss_subcomponents(predictions, total_ss): } -def _get_spm_weekly_work_expense(year: int) -> float: - try: - return SPM_WEEKLY_WORK_EXPENSE_BY_YEAR[year] - except KeyError as exc: - raise ValueError( - f"No Census SPM weekly work expense value configured for {year}" - ) from exc - - -def _calculate_clone_work_expenses( - clone_person_data: pd.DataFrame, - clone_spm_unit_ids: np.ndarray, -) -> np.ndarray: - clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) - if clone_person_data.empty: - return np.zeros(len(clone_spm_unit_ids), dtype=float) - - adult_earners = clone_person_data.loc[ - (clone_person_data["age"] >= 18) & (clone_person_data["earnings"] > 0), - ["spm_unit_id", "weeks_worked"], - ].copy() - if adult_earners.empty: - return np.zeros(len(clone_spm_unit_ids), dtype=float) - - adult_earners["weeks_worked"] = adult_earners["weeks_worked"].clip( - lower=0, upper=52 - ) - return ( - adult_earners.groupby("spm_unit_id")["weeks_worked"] - .sum() - .reindex( - clone_spm_unit_ids, - fill_value=0.0, - ) - .to_numpy(dtype=float) - ) - - -def _calculate_clone_lower_earner_caps( - clone_person_data: pd.DataFrame, - clone_spm_unit_ids: np.ndarray, -) -> np.ndarray: - clone_spm_unit_ids = np.asarray(clone_spm_unit_ids) - if clone_person_data.empty: - return np.zeros(len(clone_spm_unit_ids), dtype=float) - - head_or_spouse = clone_person_data.loc[ - clone_person_data["is_parent_proxy"].astype(bool), - ["spm_unit_id", "earnings"], - ].copy() - if head_or_spouse.empty: - return np.zeros(len(clone_spm_unit_ids), dtype=float) - - head_or_spouse["earnings"] = head_or_spouse["earnings"].clip(lower=0.0) - lower_earner_caps = head_or_spouse.groupby("spm_unit_id")["earnings"].agg( - lambda values: float(values.min()) if len(values) > 1 else float(values.iloc[0]) - ) - return lower_earner_caps.reindex( - clone_spm_unit_ids, - fill_value=0.0, - ).to_numpy(dtype=float) - - -def derive_clone_capped_childcare_expenses( - clone_pre_subsidy: np.ndarray, - clone_person_data: pd.DataFrame, - clone_spm_unit_ids: np.ndarray, - time_period: int, -) -> np.ndarray: - """Derive clone-half capped work and childcare expenses from clone inputs. - - The CPS provides both pre-subsidy childcare and the SPM-specific - capped work-and-childcare deduction. For the clone half, we impute - only the pre-subsidy childcare amount, then deterministically rebuild - the capped value using the Census SPM rule: - work expenses plus childcare, capped at the lower earner's earnings - for the reference person and spouse/partner. - """ - - clone_pre_subsidy = np.asarray(clone_pre_subsidy, dtype=float) - weekly_work_expense = _get_spm_weekly_work_expense(time_period) - annual_work_expenses = ( - _calculate_clone_work_expenses( - clone_person_data=clone_person_data, - clone_spm_unit_ids=clone_spm_unit_ids, - ) - * weekly_work_expense - ) - lower_earner_cap = _calculate_clone_lower_earner_caps( - clone_person_data=clone_person_data, - clone_spm_unit_ids=clone_spm_unit_ids, - ) - - combined_expenses = np.maximum(clone_pre_subsidy, 0.0) + annual_work_expenses - return np.minimum(combined_expenses, lower_earner_cap) - - -def _rebuild_clone_capped_childcare_expenses( - data: dict, - time_period: int, - cps_sim, -) -> np.ndarray: - """Rebuild clone-half capped childcare expenses after stage-2 imputation.""" - - n_persons_half = len(data["person_id"][time_period]) // 2 - n_spm_units_half = len(data["spm_unit_id"][time_period]) // 2 - - person_roles = cps_sim.calculate_dataframe( - ["age", "is_tax_unit_head", "is_tax_unit_spouse"] - ) - if len(person_roles) != n_persons_half: - raise ValueError( - "Unexpected person role frame length while rebuilding clone childcare " - f"expenses: got {len(person_roles)}, expected {n_persons_half}" - ) - - clone_person_data = pd.DataFrame( - { - "spm_unit_id": data["person_spm_unit_id"][time_period][n_persons_half:], - "age": person_roles["age"].values, - "is_parent_proxy": ( - person_roles["is_tax_unit_head"].values - | person_roles["is_tax_unit_spouse"].values - ), - "earnings": ( - data["employment_income"][time_period][n_persons_half:] - + data["self_employment_income"][time_period][n_persons_half:] - ), - "weeks_worked": data["weeks_worked"][time_period][n_persons_half:], - } - ) - clone_pre_subsidy = data["spm_unit_pre_subsidy_childcare_expenses"][time_period][ - n_spm_units_half: - ] - clone_spm_unit_ids = data["spm_unit_id"][time_period][n_spm_units_half:] - - return derive_clone_capped_childcare_expenses( - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=clone_person_data, - clone_spm_unit_ids=clone_spm_unit_ids, - time_period=time_period, - ) - - def _apply_post_processing(predictions, X_test, time_period, data): """Apply retirement constraints and SS reconciliation.""" ret_cols = [c for c in predictions.columns if c in _RETIREMENT_VARS] @@ -579,24 +429,6 @@ def _splice_cps_only_predictions( new_values = np.concatenate([cps_half, pred_values]) data[var] = {time_period: new_values} - if ( - "spm_unit_capped_work_childcare_expenses" in data - and "spm_unit_pre_subsidy_childcare_expenses" in data - ): - n_half = entity_half_lengths.get( - "spm_unit", - len(data["spm_unit_capped_work_childcare_expenses"][time_period]) // 2, - ) - cps_half = data["spm_unit_capped_work_childcare_expenses"][time_period][:n_half] - clone_half = _rebuild_clone_capped_childcare_expenses( - data=data, - time_period=time_period, - cps_sim=cps_sim, - ) - data["spm_unit_capped_work_childcare_expenses"] = { - time_period: np.concatenate([cps_half, clone_half]) - } - del cps_sim return data diff --git a/tests/unit/test_extended_cps.py b/tests/unit/test_extended_cps.py index a6ea8b654..3bbd98a2d 100644 --- a/tests/unit/test_extended_cps.py +++ b/tests/unit/test_extended_cps.py @@ -19,7 +19,6 @@ CPS_ONLY_IMPUTED_VARIABLES, CPS_STAGE2_INCOME_PREDICTORS, apply_retirement_constraints, - derive_clone_capped_childcare_expenses, reconcile_ss_subcomponents, ) from policyengine_us_data.datasets.org import ORG_IMPUTED_VARIABLES @@ -118,91 +117,12 @@ def test_pension_income_not_in_cps_only(self): ) def test_capped_childcare_not_in_cps_only(self): - """Capped childcare should be derived from clone-half inputs, not - independently QRF-imputed.""" + """Capped childcare should not be independently QRF-imputed.""" assert "spm_unit_capped_work_childcare_expenses" not in set( CPS_ONLY_IMPUTED_VARIABLES ) -class TestCloneChildcareDerivation: - """Clone-half capped work-and-childcare expenses should be deterministic.""" - - def test_caps_combined_work_and_childcare_at_lower_earner(self): - clone_pre_subsidy = np.array([1200.0, 5000.0, 3000.0]) - person_data = pd.DataFrame( - { - "spm_unit_id": [1, 1, 2, 2, 3], - "age": [40, 38, 35, 33, 29], - "is_parent_proxy": [True, True, True, True, True], - "earnings": [9000.0, 3000.0, 1500.0, 0.0, 2000.0], - "weeks_worked": [10.0, 20.0, 30.0, 5.0, 15.0], - } - ) - - result = derive_clone_capped_childcare_expenses( - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=person_data, - clone_spm_unit_ids=np.array([1, 2, 3]), - time_period=2024, - ) - - np.testing.assert_allclose( - result, - np.array( - [ - 2435.1, # 1200 childcare + (10 + 20) * 41.17 work expenses - 0.0, # Two-parent unit capped by the lower earner's zero earnings - 2000.0, # Single proxy unit capped at the proxy's earnings - ] - ), - rtol=0, - atol=1e-6, - ) - - def test_includes_work_expenses_even_without_childcare(self): - clone_pre_subsidy = np.array([0.0]) - person_data = pd.DataFrame( - { - "spm_unit_id": [10], - "age": [31], - "is_parent_proxy": [True], - "earnings": [2500.0], - "weeks_worked": [12.0], - } - ) - - result = derive_clone_capped_childcare_expenses( - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=person_data, - clone_spm_unit_ids=np.array([10]), - time_period=2024, - ) - - np.testing.assert_allclose(result, np.array([494.04]), rtol=0, atol=1e-6) - - def test_falls_back_to_zero_without_parent_proxies(self): - clone_pre_subsidy = np.array([3000.0]) - person_data = pd.DataFrame( - { - "spm_unit_id": [20, 20], - "age": [12, 9], - "is_parent_proxy": [False, False], - "earnings": [0.0, 0.0], - "weeks_worked": [0.0, 0.0], - } - ) - - result = derive_clone_capped_childcare_expenses( - clone_pre_subsidy=clone_pre_subsidy, - clone_person_data=person_data, - clone_spm_unit_ids=np.array([20]), - time_period=2024, - ) - - np.testing.assert_allclose(result, np.array([0.0])) - - class TestRetirementConstraints: """Post-processing retirement constraints enforce IRS caps.""" diff --git a/tests/unit/test_weeks_worked.py b/tests/unit/test_weeks_worked.py new file mode 100644 index 000000000..ad5f801af --- /dev/null +++ b/tests/unit/test_weeks_worked.py @@ -0,0 +1,41 @@ +""" +Tests for weeks_worked extraction from CPS ASEC. + +The Census CPS ASEC exposes WKSWORK directly, which we now carry through as +the model input for future-year SPM work-expense calculations. +""" + +import numpy as np +from pathlib import Path + + +class TestWeeksWorked: + """Test suite for weeks_worked variable extraction.""" + + def test_census_cps_includes_wkswork(self): + census_cps_path = Path(__file__).parent.parent.parent / ( + "policyengine_us_data/datasets/cps/census_cps.py" + ) + content = census_cps_path.read_text() + + assert '"WKSWORK"' in content, "WKSWORK should be in PERSON_COLUMNS" + + def test_cps_maps_weeks_worked_from_wkswork(self): + cps_path = Path(__file__).parent.parent.parent / ( + "policyengine_us_data/datasets/cps/cps.py" + ) + content = cps_path.read_text() + + assert 'cps["weeks_worked"]' in content + assert "person.WKSWORK" in content + assert "np.clip(person.WKSWORK, 0, 52)" in content + + def test_weeks_worked_value_range(self): + raw_values = np.array([-4, 0, 1, 26, 52, 60]) + processed = np.clip(raw_values, 0, 52) + + assert processed.min() >= 0, "Minimum should be >= 0" + assert processed.max() <= 52, "Maximum should be <= 52" + assert processed[0] == 0, "Negative values should clip to 0" + assert processed[3] == 26, "Valid weeks should be preserved" + assert processed[5] == 52, "Values above 52 should clip to 52" From 18f8ebd4a44b174180df43fdb13daefe2890cdf2 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 07:40:25 -0400 Subject: [PATCH 23/28] Carry CPS partner input for SPM childcare --- .../datasets/cps/census_cps.py | 1 + policyengine_us_data/datasets/cps/cps.py | 3 ++ tests/unit/test_reference_partner.py | 32 +++++++++++++++++++ 3 files changed, 36 insertions(+) create mode 100644 tests/unit/test_reference_partner.py diff --git a/policyengine_us_data/datasets/cps/census_cps.py b/policyengine_us_data/datasets/cps/census_cps.py index 042fefe56..6faed88fe 100644 --- a/policyengine_us_data/datasets/cps/census_cps.py +++ b/policyengine_us_data/datasets/cps/census_cps.py @@ -233,6 +233,7 @@ class CensusCPS_2018(CensusCPS): "A_FNLWGT", "A_LINENO", "A_SPOUSE", + "PERRP", "A_AGE", "A_SEX", "PEDISEYE", diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 08301f167..6fe4d2f35 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -462,6 +462,9 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["is_surviving_spouse"] = person.A_MARITL == 4 cps["is_separated"] = person.A_MARITL == 6 + cps["is_unmarried_partner_of_household_head"] = person.PERRP.isin( + [43, 44, 46, 47] + ) # High school or college/university enrollment status. cps["is_full_time_college_student"] = person.A_HSCOL == 2 diff --git a/tests/unit/test_reference_partner.py b/tests/unit/test_reference_partner.py new file mode 100644 index 000000000..83ee6bd28 --- /dev/null +++ b/tests/unit/test_reference_partner.py @@ -0,0 +1,32 @@ +""" +Tests for reference-person partner extraction from CPS ASEC. + +The public CPS ASEC relationship-to-reference-person variable PERRP identifies +unmarried partners of the household head/reference person. We carry that +through so the SPM childcare cap can distinguish the reference person's partner +from unrelated adults in the same SPM unit. +""" + +from pathlib import Path + + +class TestReferencePartner: + """Test suite for CPS relationship-to-reference-person extraction.""" + + def test_census_cps_includes_perrp(self): + census_cps_path = Path(__file__).parent.parent.parent / ( + "policyengine_us_data/datasets/cps/census_cps.py" + ) + content = census_cps_path.read_text() + + assert '"PERRP"' in content, "PERRP should be in PERSON_COLUMNS" + + def test_cps_maps_unmarried_partner_from_perrp(self): + cps_path = Path(__file__).parent.parent.parent / ( + "policyengine_us_data/datasets/cps/cps.py" + ) + content = cps_path.read_text() + + assert 'cps["is_unmarried_partner_of_household_head"]' in content + for code in ("43", "44", "46", "47"): + assert code in content From e5a488e0bc35207b9c19f94b853949d4733e2750 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 10:16:50 -0400 Subject: [PATCH 24/28] Format CPS PERRP mapping --- policyengine_us_data/datasets/cps/cps.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index 6fe4d2f35..f0e2c756a 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -462,9 +462,7 @@ def children_per_parent(col: str) -> pd.DataFrame: cps["is_surviving_spouse"] = person.A_MARITL == 4 cps["is_separated"] = person.A_MARITL == 6 - cps["is_unmarried_partner_of_household_head"] = person.PERRP.isin( - [43, 44, 46, 47] - ) + cps["is_unmarried_partner_of_household_head"] = person.PERRP.isin([43, 44, 46, 47]) # High school or college/university enrollment status. cps["is_full_time_college_student"] = person.A_HSCOL == 2 From 1a0678a5fbf9384fcc861fb6bbece4ad6503b387 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 10:17:17 -0400 Subject: [PATCH 25/28] Handle period keys in small ECPS clone flags --- .../datasets/cps/small_enhanced_cps.py | 10 ++++- tests/unit/test_small_enhanced_cps.py | 41 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/policyengine_us_data/datasets/cps/small_enhanced_cps.py b/policyengine_us_data/datasets/cps/small_enhanced_cps.py index e825d080b..6dd6a02b7 100644 --- a/policyengine_us_data/datasets/cps/small_enhanced_cps.py +++ b/policyengine_us_data/datasets/cps/small_enhanced_cps.py @@ -27,6 +27,14 @@ def _load_saved_period_array( raise KeyError(f"{variable_name} missing period {period}") +def _normalize_annual_period_key(period_key) -> int: + if isinstance(period_key, (int, np.integer)): + return int(period_key) + if hasattr(period_key, "year"): + return int(period_key.year) + return int(str(period_key)) + + def _attach_clone_origin_flags( data: dict[str, dict[int, np.ndarray]], source_dataset_path, @@ -43,7 +51,7 @@ def _attach_clone_origin_flags( source_ids_by_period = {} source_flags_by_period = {} for period_key in data[id_variable]: - period = int(period_key) + period = _normalize_annual_period_key(period_key) source_ids = _load_saved_period_array(source_h5, id_variable, period) source_flags = _load_saved_period_array(source_h5, flag_name, period) if len(source_ids) != len(source_flags): diff --git a/tests/unit/test_small_enhanced_cps.py b/tests/unit/test_small_enhanced_cps.py index f9a926831..9a5089cc0 100644 --- a/tests/unit/test_small_enhanced_cps.py +++ b/tests/unit/test_small_enhanced_cps.py @@ -1,5 +1,6 @@ import h5py import numpy as np +import pandas as pd import pytest from policyengine_us_data.datasets.cps.small_enhanced_cps import ( @@ -87,3 +88,43 @@ def test_attach_clone_origin_flags_rejects_missing_sampled_id(tmp_path): with pytest.raises(KeyError, match="person_is_puf_clone"): _attach_clone_origin_flags(sampled_data, source_path) + + +def test_attach_clone_origin_flags_accepts_period_keys(tmp_path): + source_path = tmp_path / "enhanced_cps_2024.h5" + with h5py.File(source_path, "w") as h5_file: + _write_period_group(h5_file, "person_id", {2024: np.array([11, 12])}) + _write_period_group( + h5_file, + "person_is_puf_clone", + {2024: np.array([0, 1], dtype=np.int8)}, + ) + for entity_name, id_base in [ + ("tax_unit", 20), + ("spm_unit", 30), + ("family", 40), + ("household", 50), + ]: + _write_period_group( + h5_file, + f"{entity_name}_id", + {2024: np.array([id_base], dtype=np.int32)}, + ) + _write_period_group( + h5_file, + f"{entity_name}_is_puf_clone", + {2024: np.array([0], dtype=np.int8)}, + ) + + period = pd.Period("2024", freq="Y") + sampled_data = { + "person_id": {period: np.array([12, 11], dtype=np.int32)}, + "tax_unit_id": {period: np.array([20], dtype=np.int32)}, + "spm_unit_id": {period: np.array([30], dtype=np.int32)}, + "family_id": {period: np.array([40], dtype=np.int32)}, + "household_id": {period: np.array([50], dtype=np.int32)}, + } + + _attach_clone_origin_flags(sampled_data, source_path) + + assert sampled_data["person_is_puf_clone"][period].tolist() == [1, 0] From 17e77b3938f07937e77bb28ff7f44277e25e1c6e Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 10:20:39 -0400 Subject: [PATCH 26/28] Use valid year in housing target test --- tests/unit/test_census_spm_housing_targets.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/unit/test_census_spm_housing_targets.py b/tests/unit/test_census_spm_housing_targets.py index 6befed4ed..e595848d1 100644 --- a/tests/unit/test_census_spm_housing_targets.py +++ b/tests/unit/test_census_spm_housing_targets.py @@ -83,7 +83,7 @@ def __init__(self, dataset): }, ) - targets = etl_national_targets.extract_national_targets("dummy") + targets = etl_national_targets.extract_national_targets(2024) housing = next( target for target in targets["direct_sum_targets"] From 09a42f4f3d98d24ec28610d22b8d80d12b1b1801 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 21:08:51 -0400 Subject: [PATCH 27/28] Use modeled Medicare Part B inputs and targets --- changelog.d/712.fixed | 1 + policyengine_us_data/datasets/cps/cps.py | 3 +- .../datasets/cps/extended_cps.py | 1 - policyengine_us_data/datasets/puf/puf.py | 3 +- .../db/etl_national_targets.py | 17 ++++++-- policyengine_us_data/utils/cms_medicare.py | 41 +++++++++++++++++++ policyengine_us_data/utils/loss.py | 7 +++- tests/unit/test_cms_medicare_targets.py | 25 +++++++++++ tests/unit/test_medicare_part_b_inputs.py | 10 +++++ 9 files changed, 101 insertions(+), 7 deletions(-) create mode 100644 changelog.d/712.fixed create mode 100644 policyengine_us_data/utils/cms_medicare.py create mode 100644 tests/unit/test_cms_medicare_targets.py create mode 100644 tests/unit/test_medicare_part_b_inputs.py diff --git a/changelog.d/712.fixed b/changelog.d/712.fixed new file mode 100644 index 000000000..5fea41f40 --- /dev/null +++ b/changelog.d/712.fixed @@ -0,0 +1 @@ +Modeled Medicare Part B premiums from enrollment and premium schedules, netted a cycle-free MSP standard-premium offset, and documented the national Part B calibration target as an approximate beneficiary-paid out-of-pocket benchmark rather than gross CMS premium income. diff --git a/policyengine_us_data/datasets/cps/cps.py b/policyengine_us_data/datasets/cps/cps.py index d4f2d3026..2ffe05c6f 100644 --- a/policyengine_us_data/datasets/cps/cps.py +++ b/policyengine_us_data/datasets/cps/cps.py @@ -720,7 +720,8 @@ def add_personal_income_variables(cps: h5py.File, person: DataFrame, year: int): cps["health_insurance_premiums_without_medicare_part_b"] = person.PHIP_VAL cps["over_the_counter_health_expenses"] = person.POTC_VAL cps["other_medical_expenses"] = person.PMED_VAL - cps["medicare_part_b_premiums"] = person.PEMCPREM + cps["medicare_part_b_premiums_reported"] = person.PEMCPREM + cps["medicare_enrolled"] = person.MCARE == 1 # Get QBI simulation parameters --- yamlfilename = ( diff --git a/policyengine_us_data/datasets/cps/extended_cps.py b/policyengine_us_data/datasets/cps/extended_cps.py index d1ee29012..de583e5d9 100644 --- a/policyengine_us_data/datasets/cps/extended_cps.py +++ b/policyengine_us_data/datasets/cps/extended_cps.py @@ -153,7 +153,6 @@ def _supports_structural_mortgage_inputs() -> bool: "health_insurance_premiums_without_medicare_part_b", "over_the_counter_health_expenses", "other_medical_expenses", - "medicare_part_b_premiums", "child_support_expense", # Hours/employment "weekly_hours_worked", diff --git a/policyengine_us_data/datasets/puf/puf.py b/policyengine_us_data/datasets/puf/puf.py index bde0f33ff..4d2ce72fd 100644 --- a/policyengine_us_data/datasets/puf/puf.py +++ b/policyengine_us_data/datasets/puf/puf.py @@ -802,10 +802,11 @@ class PUF_2024(PUF): url = "release://policyengine/irs-soi-puf/1.8.0/puf_2024.h5" +# Leave Medicare Part B out of the generic PUF medical-expense split: +# the baseline model now derives Part B premiums separately. MEDICAL_EXPENSE_CATEGORY_BREAKDOWNS = { "health_insurance_premiums_without_medicare_part_b": 0.453, "other_medical_expenses": 0.325, - "medicare_part_b_premiums": 0.137, "over_the_counter_health_expenses": 0.085, } diff --git a/policyengine_us_data/db/etl_national_targets.py b/policyengine_us_data/db/etl_national_targets.py index 349bd015a..7989e8fbc 100644 --- a/policyengine_us_data/db/etl_national_targets.py +++ b/policyengine_us_data/db/etl_national_targets.py @@ -15,6 +15,11 @@ from policyengine_us_data.utils.census_spm import ( build_census_spm_capped_housing_subsidy_target, ) +from policyengine_us_data.utils.cms_medicare import ( + get_beneficiary_paid_medicare_part_b_premiums_notes, + get_beneficiary_paid_medicare_part_b_premiums_source, + get_beneficiary_paid_medicare_part_b_premiums_target, +) from policyengine_us_data.utils.db import ( DEFAULT_YEAR, etl_argparser, @@ -155,9 +160,15 @@ def extract_national_targets(year: int = DEFAULT_YEAR): }, { "variable": "medicare_part_b_premiums", - "value": 112e9, - "source": "CMS Medicare data", - "notes": "Medicare Part B premium payments", + "value": get_beneficiary_paid_medicare_part_b_premiums_target( + HARDCODED_YEAR + ), + "source": get_beneficiary_paid_medicare_part_b_premiums_source( + HARDCODED_YEAR + ), + "notes": get_beneficiary_paid_medicare_part_b_premiums_notes( + HARDCODED_YEAR + ), "year": HARDCODED_YEAR, }, { diff --git a/policyengine_us_data/utils/cms_medicare.py b/policyengine_us_data/utils/cms_medicare.py new file mode 100644 index 000000000..b710cbdea --- /dev/null +++ b/policyengine_us_data/utils/cms_medicare.py @@ -0,0 +1,41 @@ +MEDICARE_PART_B_GROSS_PREMIUM_INCOME = { + 2024: 139.837e9, +} + + +MEDICARE_STATE_BUY_IN_MINIMUM_BENEFICIARIES = { + 2024: 10_000_000, +} + + +BENEFICIARY_PAID_MEDICARE_PART_B_PREMIUM_TARGETS = { + 2024: 112e9, +} + + +def get_beneficiary_paid_medicare_part_b_premiums_target(year: int) -> float: + try: + return BENEFICIARY_PAID_MEDICARE_PART_B_PREMIUM_TARGETS[year] + except KeyError as exc: + raise ValueError( + f"No beneficiary-paid Medicare Part B premium target sourced for {year}." + ) from exc + + +def get_beneficiary_paid_medicare_part_b_premiums_source(year: int) -> str: + gross_income = MEDICARE_PART_B_GROSS_PREMIUM_INCOME[year] / 1e9 + minimum_buy_in = MEDICARE_STATE_BUY_IN_MINIMUM_BENEFICIARIES[year] + return ( + "CMS 2025 Medicare Trustees Report Table III.C3 actual 2024 Part B " + f"premium income (${gross_income:.3f}B), plus CMS State Buy-In FAQ " + f"noting states paid Part B premiums for over {minimum_buy_in:,} people" + ) + + +def get_beneficiary_paid_medicare_part_b_premiums_notes(year: int) -> str: + return ( + "Approximate beneficiary-paid Medicare Part B out-of-pocket premiums " + "for SPM/MOOP calibration. This intentionally does not target gross " + "trust-fund premium income because Medicaid and other MSP pathways pay " + "premiums on behalf of some enrollees." + ) diff --git a/policyengine_us_data/utils/loss.py b/policyengine_us_data/utils/loss.py index fee180e3f..72c38e5ce 100644 --- a/policyengine_us_data/utils/loss.py +++ b/policyengine_us_data/utils/loss.py @@ -15,6 +15,9 @@ from policyengine_us_data.utils.census_spm import ( get_census_spm_capped_housing_subsidy_total, ) +from policyengine_us_data.utils.cms_medicare import ( + get_beneficiary_paid_medicare_part_b_premiums_target, +) from policyengine_core.reforms import Reform from policyengine_us_data.utils.soi import pe_to_soi, get_soi @@ -28,7 +31,9 @@ HARD_CODED_TOTALS = { "health_insurance_premiums_without_medicare_part_b": 385e9, "other_medical_expenses": 278e9, - "medicare_part_b_premiums": 112e9, + "medicare_part_b_premiums": get_beneficiary_paid_medicare_part_b_premiums_target( + 2024 + ), "over_the_counter_health_expenses": 72e9, "spm_unit_spm_threshold": 3_945e9, "child_support_expense": 33e9, diff --git a/tests/unit/test_cms_medicare_targets.py b/tests/unit/test_cms_medicare_targets.py new file mode 100644 index 000000000..d780bf8d4 --- /dev/null +++ b/tests/unit/test_cms_medicare_targets.py @@ -0,0 +1,25 @@ +import pytest + +from policyengine_us_data.utils.cms_medicare import ( + get_beneficiary_paid_medicare_part_b_premiums_notes, + get_beneficiary_paid_medicare_part_b_premiums_source, + get_beneficiary_paid_medicare_part_b_premiums_target, +) + + +def test_beneficiary_paid_medicare_part_b_target_2024_is_sourced(): + assert get_beneficiary_paid_medicare_part_b_premiums_target(2024) == pytest.approx( + 112e9 + ) + + +def test_beneficiary_paid_medicare_part_b_source_mentions_primary_sources(): + source = get_beneficiary_paid_medicare_part_b_premiums_source(2024) + assert "2025 Medicare Trustees Report" in source + assert "State Buy-In FAQ" in source + + +def test_beneficiary_paid_medicare_part_b_notes_describe_out_of_pocket_semantics(): + notes = get_beneficiary_paid_medicare_part_b_premiums_notes(2024) + assert "out-of-pocket" in notes + assert "gross trust-fund premium income" in notes diff --git a/tests/unit/test_medicare_part_b_inputs.py b/tests/unit/test_medicare_part_b_inputs.py new file mode 100644 index 000000000..a082720df --- /dev/null +++ b/tests/unit/test_medicare_part_b_inputs.py @@ -0,0 +1,10 @@ +from policyengine_us_data.datasets.cps.extended_cps import CPS_ONLY_IMPUTED_VARIABLES +from policyengine_us_data.datasets.puf.puf import MEDICAL_EXPENSE_CATEGORY_BREAKDOWNS + + +def test_medicare_part_b_not_qrf_imputed_for_clone_half(): + assert "medicare_part_b_premiums" not in set(CPS_ONLY_IMPUTED_VARIABLES) + + +def test_medicare_part_b_not_allocated_from_generic_puf_medical_expenses(): + assert "medicare_part_b_premiums" not in MEDICAL_EXPENSE_CATEGORY_BREAKDOWNS From 8f8eb8e3f9357341fa9b603070baef19954f3e71 Mon Sep 17 00:00:00 2001 From: Max Ghenis Date: Thu, 9 Apr 2026 21:10:20 -0400 Subject: [PATCH 28/28] Fix Medicare Part B changelog fragment --- changelog.d/{712.fixed => 713.fixed} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename changelog.d/{712.fixed => 713.fixed} (100%) diff --git a/changelog.d/712.fixed b/changelog.d/713.fixed similarity index 100% rename from changelog.d/712.fixed rename to changelog.d/713.fixed