ethereum-optimism · dioptx · Jul 22, 2025 · Jul 22, 2025 · Jul 22, 2025 · Jul 22, 2025
@@ -0,0 +1,46 @@
+import os
+import polars as pl
+from op_analytics.datapipeline.chains.aggregator import build_all_chains_metadata
+
+# WARNING:
+# This script accesses production data sources (BigQuery, Goldsky, DefiLlama, L2Beat, Dune).
+# It requires valid credentials and may incur costs.
+# To run this test, you must set the OPLABS_ENV=PROD environment variable.
+# Example: OPLABS_ENV=PROD python scripts/test_real_loaders.py
+
+
+def test_real_aggregator():
+    """
+    Tests the chain metadata aggregator with real, live data sources.
+
+    The run is skipped (the function returns early) unless OPLABS_ENV is "PROD".
+
+    build_all_chains_metadata() declares no default arguments, so all five
+    pipeline settings are passed explicitly. Each can be overridden with a
+    CHAIN_METADATA_* environment variable.
+
+    NOTE(review): the fallback values below are copied from the aggregator's
+    callers and example invocation; confirm them before a production run.
+
+    Raises:
+        AssertionError: If the aggregated output fails the basic validation.
+        Exception: Any pipeline error is printed and then re-raised.
+    """
+    print("=== Testing Chain Metadata Aggregator with real data ===")
+
+    # Check if the environment variable is set
+    if os.getenv("OPLABS_ENV") != "PROD":
+        print("Skipping test: OPLABS_ENV is not set to PROD.")
+        print("Please set OPLABS_ENV=PROD to run this test.")
+        return
+
+    bq_dataset_id = os.getenv("CHAIN_METADATA_BQ_DATASET_ID", "chains_metadata")
+    pipeline_kwargs = {
+        # The aggregator splits this value on "." into (dataset, table), so keep it dotted.
+        "output_bq_table": os.getenv(
+            "CHAIN_METADATA_OUTPUT_BQ_TABLE", f"{bq_dataset_id}.aggregated_chains_metadata"
+        ),
+        "manual_mappings_filepath": os.getenv(
+            "CHAIN_METADATA_MANUAL_MAPPINGS",
+            "src/op_analytics/datapipeline/chains/resources/manual_chain_mappings.csv",
+        ),
+        "bq_project_id": os.getenv("CHAIN_METADATA_BQ_PROJECT_ID", "op-analytics-dev"),
+        "bq_dataset_id": bq_dataset_id,
+        "csv_path": os.getenv(
+            "CHAIN_METADATA_CSV_PATH",
+            "src/op_analytics/datapipeline/chains/resources/chain_metadata.csv",
+        ),
+    }
+
+    try:
+        # Run the full aggregation pipeline
+        aggregated_df = build_all_chains_metadata(**pipeline_kwargs)
+
+        # Basic validation
+        assert aggregated_df is not None, "Aggregated DataFrame should not be None"
+        assert isinstance(aggregated_df, pl.DataFrame), "Output should be a Polars DataFrame"
+        assert not aggregated_df.is_empty(), "Aggregated DataFrame should not be empty"
+        assert "chain_key" in aggregated_df.columns, "chain_key should be in the columns"
+
+        print("Chain metadata aggregator test passed.")
+        print("Aggregated DataFrame head:")
+        print(aggregated_df.head(5))
+        print(f"Total chains aggregated: {len(aggregated_df)}")
+
+    except Exception as e:
+        print(f"Chain Metadata Aggregator Test Error: {e}")
+        # Re-raise the exception to make it clear that the test failed
+        raise
+
+
+# Run the live-data check when this file is executed directly as a script.
+if __name__ == "__main__":
+    test_real_aggregator()
@@ -105,6 +105,7 @@ def build_metadata_command(
         manual_mappings_filepath=manual_mappings_file,
         bq_project_id=bq_project_id,
         bq_dataset_id=bq_dataset_id,
+        csv_path="src/op_analytics/datapipeline/chains/resources/chain_metadata.csv",
     )
 
 

@@ -33,6 +33,7 @@ def all_chains_metadata_asset(context: AssetExecutionContext, config: ChainMetad
         manual_mappings_filepath=config.manual_mappings_filepath,
         bq_project_id=config.bq_project_id,
         bq_dataset_id=config.bq_dataset_id,
+        csv_path="src/op_analytics/datapipeline/chains/resources/chain_metadata.csv",
     )
 
     log.info("Chain metadata aggregation asset completed successfully")
@@ -1,12 +1,15 @@
 """
-ChainMetadataAggregator module for op_analytics.datapipeline.chains.
+ChainMetadataAggregator for op_analytics.datapipeline.chains
 
-This module provides functionality to aggregate and process metadata from multiple chains,
-performing entity resolution, deduplication, and enrichment before outputting to BigQuery.
+Aggregates, deduplicates, and outputs harmonized chain metadata from all registered loaders.
 """
 
+import polars as pl
+
+from op_analytics.coreutils.bigquery.write import overwrite_unpartitioned_table
 from op_analytics.coreutils.logger import structlog
-from op_analytics.datapipeline.chains.mapping_utils import load_manual_mappings
+from op_analytics.datapipeline.chains.loaders.base import LoaderRegistry
+from op_analytics.datapipeline.chains.mapping_utils import load_manual_mappings, apply_mapping_rules
 
 log = structlog.get_logger()
 
@@ -16,167 +19,56 @@ def build_all_chains_metadata(
     manual_mappings_filepath: str,
     bq_project_id: str,
     bq_dataset_id: str,
-) -> None:
+    csv_path: str,
+) -> pl.DataFrame:
     """
-    Build aggregated metadata for all chains with comprehensive processing pipeline.
-
-    This function orchestrates the complete chain metadata aggregation pipeline,
-    including data loading, preprocessing, combination, entity resolution, deduplication,
-    enrichment, validation, and output to BigQuery.
+    Orchestrates the complete chain metadata aggregation pipeline.
 
     Args:
-        output_bq_table (str): Target BigQuery table name for aggregated metadata
-        manual_mappings_filepath (str): Path to manual mappings configuration file
-        bq_project_id (str): BigQuery project ID for data operations
-        bq_dataset_id (str): BigQuery dataset ID for table operations
+        output_bq_table: Target BigQuery table name
+        manual_mappings_filepath: Path to manual mappings configuration file
+        bq_project_id: BigQuery project ID for data operations
+        bq_dataset_id: BigQuery dataset ID for table operations
+        csv_path: Path to CSV file for CSV loader
 
     Returns:
-        None
+        Aggregated chain metadata DataFrame
     """
-    log.info("Pipeline execution started")
-
-    log.info(
-        "Pipeline configuration parameters",
-        output_bq_table=output_bq_table,
-        manual_mappings_filepath=manual_mappings_filepath,
-        bq_project_id=bq_project_id,
-        bq_dataset_id=bq_dataset_id,
-    )
-
-    # Load manual mappings configuration early in the pipeline
-    try:
-        manual_mappings = load_manual_mappings(manual_mappings_filepath)
-        log.info(
-            "Manual mappings loaded successfully",
-            mapping_count=len(manual_mappings),
-            filepath=manual_mappings_filepath,
-        )
-    except Exception as e:
-        log.error(
-            "Failed to load manual mappings",
-            error=str(e),
-            filepath=manual_mappings_filepath,
-        )
-        raise RuntimeError(
-            f"Failed to load manual mappings from {manual_mappings_filepath}: {e}"
-        ) from e
-
-    # TODO: Implement the following pipeline steps:
-
-    # Step 1: Load Data Sources
-    # Intent: Load chain metadata from multiple BigQuery data sources using standardized loaders:
-    # - op_stack_chain_metadata from api_table_uploads (OP Labs source, source_rank=1)
-    # - Goldsky chain usage data from daily_aggegate_l2_chain_usage_goldsky (OP Labs source, source_rank=1)
-    # - L2Beat activity data from daily_l2beat_l2_activity (L2Beat source, source_rank=2)
-    # - L2Beat metadata extended from l2beat_metadata_extended (L2Beat source, source_rank=1.9)
-    # - GrowThePie activity data from daily_growthepie_l2_activity (GrowThePie source, source_rank=3)
-    # - Dune transaction data from dune_all_txs (Dune source, source_rank=4.5)
-    # - DefiLlama chain TVL data from daily_defillama_chain_tvl (DefiLlama source, source_rank=5)
-    # - Manual mappings file for known corrections/overrides and special case handling
-    # Each source loader should return standardized DataFrames with consistent core columns
-    # Design allows easy addition of new data sources by implementing the standardized interface
-    # Note: Some sources will have unique metadata fields (e.g., L2Beat stage, DA layer) that others lack
-    log.info("Step 1: Load Data Sources - Not yet implemented")
-
-    # Step 2: Preprocess Individual Sources
-    # Intent: Clean and standardize each data source individually before combining:
-    # - Generate chain_key column (hash of chain_name for grouping similar names)
-    # - Standardize chain_id column (ensure consistent format, handle known collisions)
-    # - Apply source-specific transformations and data type conversions
-    # - Normalize chain names and symbols for consistency across sources
-    # - Apply "best display name" selection logic (e.g., prefer "Arbitrum One" over "Arbitrum")
-    # - Handle special cases through repeatable functions (e.g., Celo L1->L2 transition)
-    # This preprocessing ensures each source is standardized before entity resolution
-    log.info("Step 2: Preprocess Individual Sources - Not yet implemented")
-
-    # Step 3: Combine Preprocessed Sources
-    # Intent: Concatenate all preprocessed source DataFrames into unified dataset:
-    # - Union all preprocessed source_dfs into all_sources_df using pd.concat()
-    # - Maintain source attribution and ranking for later prioritization
-    # - Log total records combined from all sources
-    # - Result is a comprehensive dataset ready for entity resolution
-    log.info("Step 3: Combine Preprocessed Sources - Not yet implemented")
-
-    # Step 4: Entity Resolution (Most Complex Step)
-    # Intent: Generate unified_key and apply manual mappings to resolve chain entities:
-    # - Create unified_key column combining chain_id, chain_key, and display_name logic
-    # - Apply manual mappings to correct data inconsistencies and handle edge cases
-    # - Use metadata file approach for special cases (e.g., Celo -l2 suffix) rather than hardcoded logic
-    # - Handle chain_id collisions through sophisticated matching algorithms
-    # - Manual mappings applied BEFORE grouping to ensure correct entity resolution
-    # Success of this step depends heavily on quality of preprocessing in Steps 1-2
-    log.info("Step 4: Entity Resolution - Starting implementation with manual mappings")
-
-    # Example of how manual mappings would be integrated:
-    # When Step 4 is implemented, it would include:
-    # combined_df = ... # Result from Step 3
-    #
-    # # Apply manual mappings to resolve special cases and data inconsistencies
-    # resolved_df = apply_mapping_rules(combined_df, manual_mappings)
-    #
-    # # Continue with unified_key generation and entity resolution
-    # resolved_df = resolved_df.with_columns([
-    #     # Generate unified_key for entity resolution
-    #     pl.coalesce([
-    #         pl.col("chain_id"),
-    #         pl.col("chain_key"),
-    #         pl.col("display_name").str.to_lowercase().str.replace_all(r"[^\w]+", "")
-    #     ]).alias("unified_key")
-    # ])
-
-    log.info("Step 4: Entity Resolution - Manual mapping integration ready")
-
-    # Step 5: Deduplication and Attribute Merging
-    # Intent: Group by unified_key and merge attributes using source prioritization:
-    # - Group records by unified_key (represents same chain entity)
-    # - Within each group, sort by source_rank (lower rank = higher priority)
-    # - Select primary record from highest priority source
-    # - Aggregate temporal fields: min_dt_day (min), max_dt_day (max)
-    # - Concatenate tracking fields: data_sources, all_chain_keys
-    # - Apply secondary deduplication using is_dupe logic for remaining duplicates
-    # - Results in one canonical record per unique chain entity
-    log.info("Step 5: Deduplication and Attribute Merging - Not yet implemented")
-
-    # Step 6: Field Enrichment
-    # Intent: Enhance deduplicated data with additional metadata fields:
-    # - Load op_stack_chain_metadata specifically for enrichment (not as competing source)
-    # - Left join with deduplicated data on chain_id using flexible matching logic
-    # - Coalesce/add fields: gas_token, da_layer, public_mainnet_launch_date, etc.
-    # - Calculate derived fields: provider_entity_w_superchain, eth_eco_l2l3, etc.
-    # - Alternative approach: Join back to original datasets using chain aliases for field extraction
-    # - Integration point for registry ingestion and manual mapping outputs
-    # - Apply business logic overrides using metadata file configurations
-    log.info("Step 6: Field Enrichment - Not yet implemented")
-
-    # Step 7: Data Quality Validation
-    # Intent: Perform comprehensive error checking and data quality validation:
-    # - Detect potential duplicates (exact matches or high similarity across columns)
-    # - Validate data consistency and completeness
-    # - Check for unexpected chain_id collisions or mapping conflicts
-    # - Generate data quality reports and warnings
-    # - Flag records requiring manual review
-    # - Ensure final dataset meets quality standards before output
-    log.info("Step 7: Data Quality Validation - Not yet implemented")
-
-    # Step 8: Output to BigQuery
-    # Intent: Write the final validated metadata to BigQuery with comprehensive logging:
-    # - Format final DataFrame according to target BigQuery schema requirements
-    # - Perform final data validation before writing
-    # - Write to specified BigQuery table using write_full_df_to_bq with if_exists='replace'
-    # - Log detailed success/failure status and record counts
-    # - Update processing metadata and create comprehensive audit logs
-    # - Generate summary statistics and data lineage information
-    log.info("Step 8: Output to BigQuery - Not yet implemented")
-
-    log.info("Pipeline execution finished")
-
-
-if __name__ == "__main__":
-    # Example usage with enhanced manual mappings support
-    # In production, these would come from command line arguments or configuration
-    build_all_chains_metadata(
-        output_bq_table="aggregated_chains_metadata",
-        manual_mappings_filepath="src/op_analytics/datapipeline/chains/resources/manual_chain_mappings.csv",
-        bq_project_id="op-analytics-dev",
-        bq_dataset_id="chains_metadata",
-    )
+    loader_names = LoaderRegistry.list_loaders()
+    log.info(f"Running {len(loader_names)} loaders: {loader_names}")
+
+    # Load data from all loaders
+    dfs = []
+    for loader_name in loader_names:
+        loader_cls = LoaderRegistry.get_loader(loader_name)
+        if loader_cls is None:
+            log.warning(f"Loader {loader_name} not found, skipping")
+            continue
+
+        # Simple loader instantiation based on known types
+        if loader_name == "csv_loader":
+            loader = loader_cls(csv_path=csv_path)
+        elif loader_name in ["bq_chain_metadata", "goldsky"]:
+            loader = loader_cls(bq_project_id=bq_project_id, bq_dataset_id=bq_dataset_id)
+        else:
+            loader = loader_cls()
+
+        df = loader.run()
+        dfs.append(df)
+        log.info(f"Loaded {len(df)} records from {loader_name}")
+
+    # pl.concat() cannot combine an empty list; fail early with a message naming the cause.
+    if not dfs:
+        raise ValueError(
+            "No chain metadata was loaded: no loaders are registered or all were skipped"
+        )
+
+    # Concatenate and deduplicate
+    all_chains_df = pl.concat(dfs, how="vertical_relaxed")
+    # maintain_order=True keeps the output row order stable from run to run.
+    # NOTE(review): BaseChainMetadataLoader.add_metadata_columns() adds a "source" column;
+    # this assumes harmonize_to_canonical_schema() exposes it as "source_name" - confirm.
+    all_chains_df = all_chains_df.unique(
+        subset=["chain_key", "source_name"], keep="first", maintain_order=True
+    )
+    log.info(f"Aggregated {len(all_chains_df)} unique chain records")
+
+    # Apply manual mappings
+    mapping_rules = load_manual_mappings(manual_mappings_filepath)
+    all_chains_df = apply_mapping_rules(all_chains_df, mapping_rules)
+
+    # Write to BigQuery
+    # Accept both "dataset.table" and a bare table name; a bare name is written to
+    # bq_dataset_id instead of failing on the tuple unpacking.
+    dataset, separator, table_name = output_bq_table.partition(".")
+    if not separator:
+        dataset, table_name = bq_dataset_id, output_bq_table
+    overwrite_unpartitioned_table(all_chains_df, dataset, table_name)
+    log.info(f"Wrote {len(all_chains_df)} records to {output_bq_table}")
+
+    return all_chains_df
diff --git a/src/op_analytics/datapipeline/chains/loaders/base.py b/src/op_analytics/datapipeline/chains/loaders/base.py
@@ -0,0 +1,78 @@
+import polars as pl
+from op_analytics.coreutils.logger import structlog
+from op_analytics.datapipeline.chains.schemas import (
+    harmonize_to_canonical_schema,
+    generate_chain_key,
+)
+
+log = structlog.get_logger()
+
+
+class BaseChainMetadataLoader:
+    """Common base for chain metadata loaders.
+
+    Subclasses implement load_data(); callers invoke run(), which checks the
+    loaded frame and passes it through harmonize_to_canonical_schema().
+    """
+
+    def __init__(self, **kwargs):
+        # Expose every keyword argument as an instance attribute of the same name.
+        for attr_name, attr_value in kwargs.items():
+            setattr(self, attr_name, attr_value)
+
+    def run(self) -> pl.DataFrame:
+        """Load the source data, validate it, and return the harmonized frame.
+
+        Raises:
+            ValueError: If a non-empty frame from load_data() lacks 'chain_key'.
+        """
+        loaded = self.load_data()
+
+        # Only frames with rows are required to carry the chain_key column.
+        has_rows = loaded.height > 0
+        if has_rows and "chain_key" not in loaded.columns:
+            raise ValueError(f"Loader {self.__class__.__name__} must add 'chain_key' column")
+
+        return harmonize_to_canonical_schema(loaded)
+
+    def add_metadata_columns(
+        self, df: pl.DataFrame, chain_key_col: str, source: str, source_rank: int
+    ) -> pl.DataFrame:
+        """Attach the chain key, "source" and "source_rank" columns to a non-empty frame.
+
+        An empty frame is handed back untouched. The chain key expression comes
+        from generate_chain_key(chain_key_col).
+        """
+        if df.height > 0:
+            metadata_exprs = [
+                generate_chain_key(chain_key_col),
+                pl.lit(source).alias("source"),
+                pl.lit(source_rank).alias("source_rank"),
+            ]
+            return df.with_columns(metadata_exprs)
+        return df
+
+    def load_data(self) -> pl.DataFrame:
+        """Return the raw source data; concrete loaders must override this."""
+        raise NotImplementedError
+
+
+# Simple module-level registry: maps a loader name to its loader class.
+# Entries are added through register_loader().
+_LOADERS = {}
+
+
+def register_loader(name: str, loader_class):
+    """Store loader_class in the registry under name.
+
+    Registering a name that is already present replaces the earlier entry.
+    """
+    _LOADERS.update({name: loader_class})
+
+
+def get_loader(name: str):
+    """Return the loader class registered as name, or None if there is none."""
+    try:
+        return _LOADERS[name]
+    except KeyError:
+        return None
+
+
+def list_loaders():
+    """Return a new list holding the name of every registered loader."""
+    return [*_LOADERS]
+
+
+# For backward compatibility
+class LoaderRegistry:
+    """Class-style facade over the module-level registry functions."""
+
+    @classmethod
+    def register(cls, name: str, loader_cls):
+        """Register loader_cls under name via register_loader()."""
+        register_loader(name, loader_class=loader_cls)
+
+    @classmethod
+    def get_loader(cls, name: str):
+        """Return whatever the module-level get_loader() yields for name."""
+        found = get_loader(name)
+        return found
+
+    @classmethod
+    def list_loaders(cls):
+        """Return the names reported by the module-level list_loaders()."""
+        names = list_loaders()
+        return names