Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
108 changes: 65 additions & 43 deletions dowhy/causal_refuters/placebo_treatment_refuter.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,60 @@ def refute_estimate(self, show_progress_bar=False):
return refute


def _get_placebo_names(treatment_names: List[str]) -> List[str]:
"""Return placebo column name(s) for the given treatment name(s).

Single-treatment case uses ``"placebo"`` for backward compatibility;
multi-treatment case prefixes each name with ``"placebo_"``.
"""
if len(treatment_names) == 1:
return ["placebo"]
return ["placebo_" + t for t in treatment_names]


def _generate_random_placebo(data: pd.DataFrame, treatment_name: str, type_dict: Dict) -> pd.Series:
    """Generate a single random placebo column matching the dtype of *treatment_name*.

    :param data: Original dataset; used for its length/index and, for
        integer/categorical treatments, the observed value range / categories.
    :param treatment_name: Name of the treatment column whose dtype selects
        the sampling scheme.
    :param type_dict: Mapping of column name -> dtype (as produced by
        ``DataFrame.dtypes``).
    :returns: A ``pd.Series`` of random placebo values aligned on ``data.index``.
    :raises ValueError: If the treatment dtype is not float, bool, integer or
        categorical.
    """
    dtype = type_dict[treatment_name]
    n = data.shape[0]
    # Use the pandas dtype predicates instead of substring checks on the dtype
    # name: extension dtypes such as "Int64"/"Float64" differ in case from
    # "int64"/"float64" and would otherwise be rejected as unsupported.
    if pd.api.types.is_float_dtype(dtype):
        logger.info(
            "Using a Normal Distribution with Mean:{} and Variance:{}".format(
                DEFAULT_MEAN_OF_NORMAL,
                DEFAULT_STD_DEV_OF_NORMAL,
            )
        )
        return pd.Series(
            np.random.randn(n) * DEFAULT_STD_DEV_OF_NORMAL + DEFAULT_MEAN_OF_NORMAL,
            index=data.index,
        )
    elif pd.api.types.is_bool_dtype(dtype):
        logger.info(
            "Using a Binomial Distribution with {} trials and {} probability of success".format(
                DEFAULT_NUMBER_OF_TRIALS,
                DEFAULT_PROBABILITY_OF_BINOMIAL,
            )
        )
        return pd.Series(
            np.random.binomial(DEFAULT_NUMBER_OF_TRIALS, DEFAULT_PROBABILITY_OF_BINOMIAL, n).astype(bool),
            index=data.index,
        )
    elif pd.api.types.is_integer_dtype(dtype):
        logger.info(
            "Using a Discrete Uniform Distribution lying between {} and {}".format(
                data[treatment_name].min(), data[treatment_name].max()
            )
        )
        return pd.Series(
            np.random.randint(low=data[treatment_name].min(), high=data[treatment_name].max() + 1, size=n),
            index=data.index,
        )
    elif isinstance(dtype, pd.CategoricalDtype):
        # Sample from the column's declared categories (not just the observed
        # values) and rebuild the exact original categorical dtype, so the
        # placebo column keeps the same category set and ordering as the
        # source column and downstream estimators do not see a dtype change.
        treatment = data[treatment_name]
        categories = treatment.cat.categories
        logger.info("Using a Discrete Uniform Distribution with the following categories:{}".format(categories))
        return pd.Series(
            pd.Categorical(
                np.random.choice(categories, size=n),
                categories=categories,
                ordered=treatment.cat.ordered,
            ),
            index=data.index,
        )
    raise ValueError(
        "Unsupported treatment dtype '{}' for treatment '{}'.".format(getattr(dtype, "name", dtype), treatment_name)
    )


def _refute_once(
data: pd.DataFrame,
target_estimand: IdentifiedEstimand,
Expand All @@ -92,61 +146,29 @@ def _refute_once(
placebo_type: PlaceboType = PlaceboType.DEFAULT,
random_state: Optional[np.random.RandomState] = None,
):
placebo_names = _get_placebo_names(treatment_names)

if placebo_type == PlaceboType.PERMUTE:
permuted_idx = None
if random_state is None:
permuted_idx = np.random.choice(data.shape[0], size=data.shape[0], replace=False)

else:
permuted_idx = random_state.choice(data.shape[0], size=data.shape[0], replace=False)
new_treatment = data[treatment_names].iloc[permuted_idx].values
permuted_values = data[treatment_names].iloc[permuted_idx].values
new_data = data.copy()
for i, pname in enumerate(placebo_names):
col = permuted_values[:, i] if len(treatment_names) > 1 else permuted_values.ravel()
new_data[pname] = col
Comment on lines +156 to +160
Copy link

Copilot AI Apr 19, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

In the PERMUTE path, data[treatment_names].iloc[permuted_idx].values converts the permuted columns to a NumPy array, which can silently coerce dtypes (e.g., categorical -> object, mixed dtypes -> object). That can change estimator behavior compared to the original treatment columns. Consider permuting each treatment column as a pandas Series (preserving dtype) and then resetting its index to data.index before assignment so the permutation applies by position without index alignment undoing it.

Copilot uses AI. Check for mistakes.
if target_estimand.identifier_method.startswith("iv"):
new_instruments_values = data[estimate.estimator.estimating_instrument_names].iloc[permuted_idx].values
new_instruments_df = pd.DataFrame(
new_instruments_values,
columns=["placebo_" + s for s in data[estimate.estimator.estimating_instrument_names].columns],
)
else:
if "float" in type_dict[treatment_names[0]].name:
logger.info(
"Using a Normal Distribution with Mean:{} and Variance:{}".format(
DEFAULT_MEAN_OF_NORMAL,
DEFAULT_STD_DEV_OF_NORMAL,
)
)
new_treatment = np.random.randn(data.shape[0]) * DEFAULT_STD_DEV_OF_NORMAL + DEFAULT_MEAN_OF_NORMAL

elif "bool" in type_dict[treatment_names[0]].name:
logger.info(
"Using a Binomial Distribution with {} trials and {} probability of success".format(
DEFAULT_NUMBER_OF_TRIALS,
DEFAULT_PROBABILITY_OF_BINOMIAL,
)
)
new_treatment = np.random.binomial(
DEFAULT_NUMBER_OF_TRIALS,
DEFAULT_PROBABILITY_OF_BINOMIAL,
data.shape[0],
).astype(bool)

elif "int" in type_dict[treatment_names[0]].name:
logger.info(
"Using a Discrete Uniform Distribution lying between {} and {}".format(
data[treatment_names[0]].min(), data[treatment_names[0]].max()
)
)
new_treatment = np.random.randint(
low=data[treatment_names[0]].min(), high=data[treatment_names[0]].max() + 1, size=data.shape[0]
)

elif "category" in type_dict[treatment_names[0]].name:
categories = data[treatment_names[0]].unique()
logger.info("Using a Discrete Uniform Distribution with the following categories:{}".format(categories))
sample = np.random.choice(categories, size=data.shape[0])
new_treatment = pd.Series(sample, index=data.index).astype("category")
new_data = data.copy()
for t, pname in zip(treatment_names, placebo_names):
new_data[pname] = _generate_random_placebo(data, t, type_dict)

# Create a new column in the data by the name of placebo
new_data = data.assign(placebo=new_treatment)
if target_estimand.identifier_method.startswith("iv"):
new_data = pd.concat((new_data, new_instruments_df), axis=1)
# Sanity check the data
Expand Down Expand Up @@ -219,7 +241,7 @@ def refute_placebo_treatment(
# We make a copy as a safety measure, we don't want to change the
# original DataFrame
identified_estimand = copy.deepcopy(target_estimand)
identified_estimand.treatment_variable = ["placebo"]
identified_estimand.treatment_variable = _get_placebo_names(treatment_names)

if target_estimand.identifier_method.startswith("iv"):
identified_estimand.instrumental_variables = [
Expand Down
48 changes: 48 additions & 0 deletions tests/causal_refuters/test_placebo_refuter.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,3 +119,51 @@ def test_placebo_refuter_iv_with_explicit_instrument_name(self):
assert (
abs(ref_explicit.new_effect) < 5
), f"Placebo effect with implicit iv_instrument_name ({ref_implicit.new_effect:.2f}) is unexpectedly large"

@mark.parametrize("placebo_type", ["permute", "Random Data"])
def test_placebo_refuter_multiple_treatments(self, placebo_type):
    """Regression test for #251: placebo_treatment_refuter must not raise
    'Wrong number of items passed N, placement implies 1' when multiple treatments are used.

    Parameterized over both supported placebo generation modes
    ("permute" and "Random Data").
    """
    # Fixed seed keeps the synthetic dataset (and hence the estimate) reproducible.
    np.random.seed(42)
    n_treatments = 3
    # Minimal linear dataset with three binary treatments and no common causes,
    # instruments or effect modifiers, so the refuter path under test is isolated.
    data = dowhy.datasets.linear_dataset(
        num_samples=500,
        beta=10,
        num_common_causes=0,
        num_instruments=0,
        num_effect_modifiers=0,
        num_treatments=n_treatments,
        treatment_is_binary=True,
        outcome_is_binary=False,
        num_discrete_common_causes=0,
        num_discrete_effect_modifiers=0,
        one_hot_encode=False,
    )
    model = CausalModel(
        data=data["df"],
        treatment=data["treatment_name"],
        outcome=data["outcome_name"],
        graph=data["gml_graph"],
    )
    identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)
    # Estimate the effect of switching the last treatment only ([0,0,0] -> [0,0,1]).
    estimate = model.estimate_effect(
        identified_estimand,
        method_name="backdoor.linear_regression",
        control_value=[0] * n_treatments,
        treatment_value=[0, 0, 1],
        method_params={"need_conditional_estimates": False},
    )
    # Must not raise ValueError
    # num_simulations kept small so the test stays fast.
    result = model.refute_estimate(
        identified_estimand,
        estimate,
        method_name="placebo_treatment_refuter",
        placebo_type=placebo_type,
        num_simulations=10,
    )
    # The placebo new_effect should be near zero (no real causal link after permutation/randomisation)
    assert abs(result.new_effect) < abs(estimate.value), (
        f"Placebo new_effect ({result.new_effect:.3f}) is unexpectedly close to the original "
        f"estimate ({estimate.value:.3f}); the placebo is likely not severing the causal link."
    )