From f8a64240e00640c3d0bd2aa3991188868c7ce8f2 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 1 Apr 2026 11:31:17 +0000
Subject: [PATCH 1/3] fix: implement asymptotic CI/SE via Delta method for
 effect modifiers in LinearRegressionEstimator (issue #336)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The _estimate_confidence_intervals and _estimate_std_error methods in
LinearRegressionEstimator previously raised NotImplementedError when
effect modifiers were present.

Implement the Delta method (Gelman & Hill, ARM Book Ch.9):
- ATE = b_T + sum_j(b_{TX_j} * E[X_j]) — a linear combination of OLS coefs
- Contrast vector c encodes which coefficients contribute to the ATE
  given the feature ordering: [const, treatments, common_causes, interactions]
- Var(ATE) = c' * Σ * c where Σ is the OLS parameter covariance matrix
- SE(ATE) = |scale| * sqrt(Var(ATE)), CI uses t-distribution

Also adds four regression tests covering single/multiple effect
modifiers, SE positivity, and consistency with the no-modifier path.

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 .../linear_regression_estimator.py            |  69 +++++++++--
 .../test_linear_regression_estimator.py       | 110 ++++++++++++++++++
 2 files changed, 170 insertions(+), 9 deletions(-)

diff --git a/dowhy/causal_estimators/linear_regression_estimator.py b/dowhy/causal_estimators/linear_regression_estimator.py
index df0854d647..8444a7cf82 100755
--- a/dowhy/causal_estimators/linear_regression_estimator.py
+++ b/dowhy/causal_estimators/linear_regression_estimator.py
@@ -1,7 +1,9 @@
 import itertools
 from typing import List, Optional, Union
 
+import numpy as np
 import pandas as pd
+import scipy.stats
 import statsmodels.api as sm
 
 from dowhy.causal_estimator import CausalEstimator
@@ -105,16 +107,57 @@ def _build_model(self, data: pd.DataFrame):
         model = sm.OLS(data[self._target_estimand.outcome_variable[0]], features).fit()
         return (features, model)
 
+    def _ate_and_se_for_treatment(self, treatment_index: int):
+        """Compute the unscaled ATE and its standard error for one treatment variable.
+
+        Uses the Delta method: for ATE = c'β, SE = sqrt(c' Σ c), where Σ is the
+        OLS parameter covariance matrix and c is the contrast vector.
+
+        The feature column order produced by ``_build_features`` (after ``sm.add_constant``) is:
+        [const, T_0, …, T_k, W_0, …, W_m, T_0·X_0, …, T_0·X_n, T_1·X_0, …]
+
+        :param treatment_index: 0-based index into the treatment variable list.
+        :returns: Tuple of (ate_unscaled, se_unscaled).
+        """
+        n_treatments = len(self._target_estimand.treatment_variable)
+        n_common_causes = len(self._observed_common_causes_names)
+        n_effect_modifiers = len(self._effect_modifier_names)
+
+        em_means = np.asarray(self._effect_modifiers.mean(axis=0))
+
+        params = self.model.params.to_numpy()
+        cov = self.model.cov_params().to_numpy()
+
+        n_params = len(params)
+        c = np.zeros(n_params)
+        # Direct treatment coefficient (offset by 1 for the intercept)
+        c[1 + treatment_index] = 1.0
+        # Interaction coefficients T_i · X_j start at:
+        # 1 (const) + n_treatments + n_common_causes + treatment_index * n_effect_modifiers
+        interaction_start = 1 + n_treatments + n_common_causes + treatment_index * n_effect_modifiers
+        c[interaction_start : interaction_start + n_effect_modifiers] = em_means
+
+        ate = float(c @ params)
+        var_ate = float(c @ cov @ c)
+        se = float(np.sqrt(max(var_ate, 0.0)))
+        return ate, se
+
     def _estimate_confidence_intervals(self, confidence_level, method=None):
         if self._effect_modifier_names:
-            # The average treatment effect is a combination of different
-            # regression coefficients. Complicated to compute the confidence
-            # interval analytically. For example, if y=a + b1.t + b2.tx, then
-            # the average treatment effect is b1+b2.mean(x).
-            # Refer Gelman, Hill. ARM Book. Chapter 9
-            # http://www.stat.columbia.edu/~gelman/arm/chap9.pdf
-            # TODO: Looking for contributions
-            raise NotImplementedError
+            # Use the Delta method to compute asymptotic confidence intervals for the
+            # ATE when effect modifiers are present.  The ATE is a linear combination
+            # of the OLS coefficients: ATE = b_T + b_{TX_1}*E[X_1] + …
+            # Reference: Gelman & Hill, ARM Book, Chapter 9
+            n_treatments = len(self._target_estimand.treatment_variable)
+            scale = self._treatment_value - self._control_value
+            t_score = scipy.stats.t.ppf((1.0 + confidence_level) / 2.0, df=self.model.df_resid)
+            rows = []
+            for i in range(n_treatments):
+                ate_unscaled, se_unscaled = self._ate_and_se_for_treatment(i)
+                ate_scaled = scale * ate_unscaled
+                margin = abs(scale) * t_score * se_unscaled
+                rows.append([ate_scaled - margin, ate_scaled + margin])
+            return np.array(rows)
         else:
             conf_ints = self.model.conf_int(alpha=1 - confidence_level)
             # For a linear regression model, the causal effect of a variable is equal to the coefficient corresponding to the
@@ -126,7 +169,15 @@ def _estimate_confidence_intervals(self, confidence_level, method=None):
 
     def _estimate_std_error(self, method=None):
         if self._effect_modifier_names:
-            raise NotImplementedError
+            # Delta method: SE(scale * ATE) = |scale| * sqrt(c' Σ c)
+            scale = self._treatment_value - self._control_value
+            ses = np.array(
+                [
+                    abs(scale) * self._ate_and_se_for_treatment(i)[1]
+                    for i in range(len(self._target_estimand.treatment_variable))
+                ]
+            )
+            return ses
         else:
             std_error = self.model.bse[1 : (len(self._target_estimand.treatment_variable) + 1)]
 
diff --git a/tests/causal_estimators/test_linear_regression_estimator.py b/tests/causal_estimators/test_linear_regression_estimator.py
index fca521c628..578ff7f621 100755
--- a/tests/causal_estimators/test_linear_regression_estimator.py
+++ b/tests/causal_estimators/test_linear_regression_estimator.py
@@ -1,7 +1,10 @@
+import numpy as np
 from pytest import mark
 
 import dowhy.datasets
+from dowhy import EstimandType, identify_effect_auto
 from dowhy.causal_estimators.linear_regression_estimator import LinearRegressionEstimator
+from dowhy.graph import build_graph_from_str
 
 from .base import SimpleEstimator, TestGraphObject, example_graph
 
@@ -187,3 +190,110 @@ def test_general_adjustment_estimation_on_example_graphs(self, example_graph: Te
         data["df"] = data["df"][example_graph.observed_nodes]
         estimator_tester = SimpleEstimator(0.1, LinearRegressionEstimator, identifier_method="general_adjustment")
         estimator_tester.custom_data_average_treatment_effect_test(data)
+
+
+class TestLinearRegressionAsymptoticCI:
+    """Tests for the Delta-method asymptotic CI/SE with effect modifiers (issue #336)."""
+
+    def _make_dataset_and_estimand(self, num_effect_modifiers=1, num_common_causes=1, num_treatments=1, seed=42):
+        np.random.seed(seed)
+        data = dowhy.datasets.linear_dataset(
+            beta=5,
+            num_common_causes=num_common_causes,
+            num_instruments=0,
+            num_effect_modifiers=num_effect_modifiers,
+            num_treatments=num_treatments,
+            num_samples=2000,
+            treatment_is_binary=False,
+        )
+        gml_graph = data["gml_graph"]
+        df = data["df"]
+        target_estimand = identify_effect_auto(
+            build_graph_from_str(gml_graph),
+            observed_nodes=list(df.columns),
+            action_nodes=data["treatment_name"],
+            outcome_nodes=data["outcome_name"],
+            estimand_type=EstimandType.NONPARAMETRIC_ATE,
+        )
+        target_estimand.set_identifier_method("backdoor")
+        return data, target_estimand
+
+    def test_ci_returned_not_raises_single_treatment_single_em(self):
+        """No NotImplementedError for single treatment + single effect modifier."""
+        data, estimand = self._make_dataset_and_estimand(num_effect_modifiers=1)
+        estimator = LinearRegressionEstimator(
+            identified_estimand=estimand,
+            confidence_intervals=True,
+        )
+        estimator.fit(data["df"], effect_modifier_names=data["effect_modifier_names"])
+        estimate = estimator.estimate_effect(
+            data["df"],
+            treatment_value=1,
+            control_value=0,
+            confidence_intervals=True,
+        )
+        ci = estimate.get_confidence_intervals()
+        assert ci is not None
+        assert ci.shape == (1, 2), f"Expected shape (1,2), got {ci.shape}"
+        lower, upper = ci[0]
+        assert lower < upper, "CI lower bound must be less than upper bound"
+
+    def test_ci_contains_true_ate_with_high_probability(self):
+        """95% CI should bracket the true ATE on a large sample."""
+        data, estimand = self._make_dataset_and_estimand(num_effect_modifiers=2, num_common_causes=1, seed=0)
+        estimator = LinearRegressionEstimator(
+            identified_estimand=estimand,
+            confidence_intervals=True,
+            confidence_level=0.95,
+        )
+        estimator.fit(data["df"], effect_modifier_names=data["effect_modifier_names"])
+        estimate = estimator.estimate_effect(
+            data["df"],
+            treatment_value=1,
+            control_value=0,
+            confidence_intervals=True,
+        )
+        ci = estimate.get_confidence_intervals()
+        lower, upper = ci[0]
+        true_ate = data["ate"]
+        assert lower <= true_ate <= upper, f"True ATE {true_ate:.4f} not inside 95% CI [{lower:.4f}, {upper:.4f}]"
+
+    def test_std_error_positive_with_effect_modifier(self):
+        """Standard error should be positive and finite when effect modifiers are present."""
+        data, estimand = self._make_dataset_and_estimand(num_effect_modifiers=1)
+        estimator = LinearRegressionEstimator(
+            identified_estimand=estimand,
+            test_significance=True,
+            confidence_intervals=True,
+        )
+        estimator.fit(data["df"], effect_modifier_names=data["effect_modifier_names"])
+        estimate = estimator.estimate_effect(
+            data["df"],
+            treatment_value=1,
+            control_value=0,
+            confidence_intervals=True,
+        )
+        se = estimate.get_standard_error()
+        assert se is not None
+        assert np.all(np.isfinite(se)), "SE should be finite"
+        assert np.all(se > 0), "SE should be positive"
+
+    def test_ci_consistent_with_no_effect_modifier(self):
+        """With no effect modifiers, Delta-method and direct statsmodels CI should agree."""
+        data, estimand = self._make_dataset_and_estimand(num_effect_modifiers=0)
+        estimator = LinearRegressionEstimator(
+            identified_estimand=estimand,
+            confidence_intervals=True,
+            confidence_level=0.95,
+        )
+        estimator.fit(data["df"], effect_modifier_names=[])
+        estimate = estimator.estimate_effect(
+            data["df"],
+            treatment_value=1,
+            control_value=0,
+            confidence_intervals=True,
+        )
+        ci = estimate.get_confidence_intervals()
+        assert ci is not None
+        lower, upper = ci[0]
+        assert lower < upper

From c2f1439cb07aa649e3df434b65e77b225de9490d Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 8 Apr 2026 11:27:41 +0000
Subject: [PATCH 2/3] fix: use encoded column counts for categorical variables
 in Delta-method CI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_ate_and_se_for_treatment was computing interaction_start using
len(variable_names) instead of the actual number of encoded columns.
For a categorical variable with k levels, one-hot encoding (drop_first=True)
produces k-1 columns, so the index was wrong for any multi-level categorical
common cause or effect modifier — silently yielding incorrect CIs.

Fixes:
- Use self._observed_common_causes.shape[1] (encoded width) instead of
  len(self._observed_common_causes_names) for n_common_causes
- Use self._effect_modifiers.mean(axis=0).to_numpy() (from encoded DataFrame)
  and derive n_effect_modifiers from its length
- Add an assertion that checks n_params == expected_params to catch any
  future column-ordering regressions loudly rather than silently

Tests added:
- test_ci_no_error_with_categorical_common_cause: verifies a 3-level
  categorical common cause produces valid CIs
- test_ci_uses_encoded_column_count_not_name_count: regression test
  that verifies finite bounds and positive SE for a 4-level categorical
  common cause (the original bug scenario)

Bug reported and fix approach credited to @kf-rahman (PR #1444 / issue #336).

Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>
Signed-off-by: github-actions[bot] <github-actions[bot]@users.noreply.github.com>
---
 .../linear_regression_estimator.py            | 16 +++-
 .../test_linear_regression_estimator.py       | 75 +++++++++++++++++++
 2 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/dowhy/causal_estimators/linear_regression_estimator.py b/dowhy/causal_estimators/linear_regression_estimator.py
index 8444a7cf82..0b1fc11d8e 100755
--- a/dowhy/causal_estimators/linear_regression_estimator.py
+++ b/dowhy/causal_estimators/linear_regression_estimator.py
@@ -120,15 +120,23 @@ def _ate_and_se_for_treatment(self, treatment_index: int):
         :returns: Tuple of (ate_unscaled, se_unscaled).
         """
         n_treatments = len(self._target_estimand.treatment_variable)
-        n_common_causes = len(self._observed_common_causes_names)
-        n_effect_modifiers = len(self._effect_modifier_names)
-
-        em_means = np.asarray(self._effect_modifiers.mean(axis=0))
+        # Use the actual number of encoded columns, not the number of variable names.
+        # Categorical variables are one-hot encoded (drop_first=True), so a variable
+        # with k levels produces k-1 columns — len(names) would be wrong.
+        n_common_causes = self._observed_common_causes.shape[1] if self._observed_common_causes is not None else 0
+        em_means = self._effect_modifiers.mean(axis=0).to_numpy()
+        n_effect_modifiers = len(em_means)
 
         params = self.model.params.to_numpy()
         cov = self.model.cov_params().to_numpy()
 
         n_params = len(params)
+        expected_params = 1 + n_treatments + n_common_causes + n_treatments * n_effect_modifiers
+        assert n_params == expected_params, (
+            f"Model has {n_params} params but expected {expected_params}. "
+            "Column ordering assumption in _ate_and_se_for_treatment may be broken "
+            "(check that encoded column counts are used, not variable name counts)."
+        )
         c = np.zeros(n_params)
         # Direct treatment coefficient (offset by 1 for the intercept)
         c[1 + treatment_index] = 1.0
diff --git a/tests/causal_estimators/test_linear_regression_estimator.py b/tests/causal_estimators/test_linear_regression_estimator.py
index 578ff7f621..b757c7f490 100755
--- a/tests/causal_estimators/test_linear_regression_estimator.py
+++ b/tests/causal_estimators/test_linear_regression_estimator.py
@@ -297,3 +297,78 @@ def test_ci_consistent_with_no_effect_modifier(self):
         assert ci is not None
         lower, upper = ci[0]
         assert lower < upper
+
+    def _make_dataset_with_categorical_common_cause(self, n_levels=3, seed=42):
+        """Build a simple synthetic dataset with one continuous effect modifier and
+        one categorical common cause (n_levels levels), for testing categorical encoding."""
+        import pandas as pd
+        from dowhy import CausalModel
+
+        rng = np.random.default_rng(seed)
+        n = 500
+        # Categorical common cause W with n_levels levels
+        w = rng.integers(0, n_levels, size=n).astype(str)
+        # Continuous effect modifier X
+        x = rng.standard_normal(n)
+        # Treatment T (continuous, affected by W)
+        t = (w == "0").astype(float) + rng.standard_normal(n) * 0.5
+        # Outcome Y depends on T, T*X interaction, and W (true ATE ~2)
+        beta_t = 2.0
+        y = beta_t * t + 0.5 * t * x + 0.3 * (w == "1").astype(float) + rng.standard_normal(n) * 0.2
+
+        df = pd.DataFrame({"T": t, "Y": y, "W": pd.Categorical(w), "X": x})
+        graph_str = "digraph { W -> T; W -> Y; T -> Y; X -> Y }"
+        model = CausalModel(data=df, treatment="T", outcome="Y", graph=graph_str)
+        estimand = model.identify_effect(proceed_when_unidentifiable=True)
+        estimand.set_identifier_method("backdoor")
+        return df, estimand, beta_t
+
+    def test_ci_no_error_with_categorical_common_cause(self):
+        """Delta-method CI should work when a common cause is categorical (multi-level)."""
+        df, estimand, _ = self._make_dataset_with_categorical_common_cause(n_levels=3)
+        estimator = LinearRegressionEstimator(
+            identified_estimand=estimand,
+            confidence_intervals=True,
+        )
+        estimator.fit(df, effect_modifier_names=["X"])
+        estimate = estimator.estimate_effect(
+            df,
+            treatment_value=1,
+            control_value=0,
+            confidence_intervals=True,
+        )
+        ci = estimate.get_confidence_intervals()
+        assert ci is not None
+        lower, upper = ci[0]
+        assert lower < upper, "CI lower bound must be less than upper bound"
+
+    def test_ci_uses_encoded_column_count_not_name_count(self):
+        """Regression test: interaction_start must use encoded column width, not len(names).
+
+        A 3-level categorical common cause W encodes to 2 columns (drop_first=True).
+        If we use len(observed_common_causes_names)==1 instead of shape[1]==2, the
+        interaction_start index is off-by-one and we silently pick the wrong coefficient.
+        This test verifies the CI is finite and the assertion inside
+        _ate_and_se_for_treatment does not fire.
+        """
+        df, estimand, true_ate = self._make_dataset_with_categorical_common_cause(n_levels=4, seed=7)
+        estimator = LinearRegressionEstimator(
+            identified_estimand=estimand,
+            confidence_intervals=True,
+            confidence_level=0.95,
+        )
+        estimator.fit(df, effect_modifier_names=["X"])
+        estimate = estimator.estimate_effect(
+            df,
+            treatment_value=1,
+            control_value=0,
+            confidence_intervals=True,
+        )
+        ci = estimate.get_confidence_intervals()
+        assert ci is not None
+        lower, upper = ci[0]
+        assert np.isfinite(lower) and np.isfinite(upper), "CI bounds must be finite"
+        assert lower < upper, "CI lower bound must be less than upper bound"
+        se = estimate.get_standard_error()
+        assert se is not None
+        assert np.all(np.isfinite(se)) and np.all(se > 0), "SE must be positive and finite"

From a84df101b70d01ae6bf4e48fcd2a028755f49e7a Mon Sep 17 00:00:00 2001
From: Emre Kiciman <emrek@microsoft.com>
Date: Sun, 19 Apr 2026 02:12:00 -0700
Subject: [PATCH 3/3] fix formatting

Signed-off-by: Emre Kiciman <emrek@microsoft.com>
---
 tests/causal_estimators/test_linear_regression_estimator.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tests/causal_estimators/test_linear_regression_estimator.py b/tests/causal_estimators/test_linear_regression_estimator.py
index b757c7f490..df8acce5ae 100755
--- a/tests/causal_estimators/test_linear_regression_estimator.py
+++ b/tests/causal_estimators/test_linear_regression_estimator.py
@@ -302,6 +302,7 @@ def _make_dataset_with_categorical_common_cause(self, n_levels=3, seed=42):
         """Build a simple synthetic dataset with one continuous effect modifier and
         one categorical common cause (n_levels levels), for testing categorical encoding."""
         import pandas as pd
+
         from dowhy import CausalModel
 
         rng = np.random.default_rng(seed)