fix bug: obs_indx -> obs_ind + improve set_data

drbenvincent · drbenvincent · commit e6113d9aefb7 · 2025-04-21T13:14:00.000+01:00
diff --git a/causalpy/experiments/diff_in_diff.py b/causalpy/experiments/diff_in_diff.py
@@ -104,7 +104,7 @@ def __init__(
 
         # fit model
         if isinstance(self.model, PyMCModel):
-            COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
+            COORDS = {"coeffs": self.labels, "obs_ind": np.arange(self.X.shape[0])}
             self.model.fit(X=self.X, y=self.y, coords=COORDS)
         elif isinstance(self.model, RegressorMixin):
             self.model.fit(X=self.X, y=self.y)
diff --git a/causalpy/experiments/interrupted_time_series.py b/causalpy/experiments/interrupted_time_series.py
@@ -110,7 +110,7 @@ def __init__(
 
         # fit the model to the observed (pre-intervention) data
         if isinstance(self.model, PyMCModel):
-            COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.pre_X.shape[0])}
+            COORDS = {"coeffs": self.labels, "obs_ind": np.arange(self.pre_X.shape[0])}
             self.model.fit(X=self.pre_X, y=self.pre_y, coords=COORDS)
         elif isinstance(self.model, RegressorMixin):
             self.model.fit(X=self.pre_X, y=self.pre_y)
diff --git a/causalpy/experiments/prepostnegd.py b/causalpy/experiments/prepostnegd.py
@@ -113,7 +113,7 @@ def __init__(
 
         # fit the model to the observed (pre-intervention) data
         if isinstance(self.model, PyMCModel):
-            COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
+            COORDS = {"coeffs": self.labels, "obs_ind": np.arange(self.X.shape[0])}
             self.model.fit(X=self.X, y=self.y, coords=COORDS)
         elif isinstance(self.model, RegressorMixin):
             raise NotImplementedError("Not implemented for OLS model")
diff --git a/causalpy/experiments/regression_discontinuity.py b/causalpy/experiments/regression_discontinuity.py
@@ -124,7 +124,7 @@ def __init__(
         # fit model
         if isinstance(self.model, PyMCModel):
             # fit the model to the observed (pre-intervention) data
-            COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
+            COORDS = {"coeffs": self.labels, "obs_ind": np.arange(self.X.shape[0])}
             self.model.fit(X=self.X, y=self.y, coords=COORDS)
         elif isinstance(self.model, RegressorMixin):
             self.model.fit(X=self.X, y=self.y)
diff --git a/causalpy/experiments/regression_kink.py b/causalpy/experiments/regression_kink.py
@@ -84,7 +84,7 @@ def __init__(
         self.y, self.X = np.asarray(y), np.asarray(X)
         self.outcome_variable_name = y.design_info.column_names[0]
 
-        COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.X.shape[0])}
+        COORDS = {"coeffs": self.labels, "obs_ind": np.arange(self.X.shape[0])}
         self.model.fit(X=self.X, y=self.y, coords=COORDS)
 
         # score the goodness of fit to all data
diff --git a/causalpy/experiments/synthetic_control.py b/causalpy/experiments/synthetic_control.py
@@ -105,7 +105,7 @@ def __init__(
 
         # fit the model to the observed (pre-intervention) data
         if isinstance(self.model, PyMCModel):
-            COORDS = {"coeffs": self.labels, "obs_indx": np.arange(self.pre_X.shape[0])}
+            COORDS = {"coeffs": self.labels, "obs_ind": np.arange(self.pre_X.shape[0])}
             self.model.fit(X=self.pre_X, y=self.pre_y, coords=COORDS)
         elif isinstance(self.model, RegressorMixin):
             self.model.fit(X=self.pre_X, y=self.pre_y)
diff --git a/causalpy/pymc_models.py b/causalpy/pymc_models.py
@@ -87,9 +87,20 @@ def _data_setter(self, X) -> None:
 
         This method is used internally to register new data for the model for
         prediction.
+
+        NOTE: This replaces `X` in the model. The new matrix will often have a different
+        number of rows than the data used for fitting, so every data node in the model
+        must be resized to match. The `y` values are not used for prediction, so they
+        are set to 0. Currently the only data nodes are `X` and `y`; if future models
+        add more data nodes, all of them will need updating here - ideally
+        programmatically.
         """
+        new_no_of_observations = X.shape[0]
         with self:
-            pm.set_data({"X": X})
+            pm.set_data(
+                {"X": X, "y": np.zeros(new_no_of_observations)},
+                coords={"obs_ind": np.arange(new_no_of_observations)},
+            )
 
     def fit(self, X, y, coords: Optional[Dict[str, Any]] = None) -> None:
         """Draw samples from posterior, prior predictive, and posterior predictive
@@ -111,7 +122,7 @@ def fit(self, X, y, coords: Optional[Dict[str, Any]] = None) -> None:
             )
         return self.idata
 
-    def predict(self, X):
+    def predict(self, X: np.ndarray):
         """
         Predict data given input data `X`
 
@@ -206,7 +217,7 @@ class LinearRegression(PyMCModel):
     >>> lr = LinearRegression(sample_kwargs={"progressbar": False})
     >>> lr.fit(X, y, coords={
     ...                 'coeffs': ['x', 'treated'],
-    ...                 'obs_indx': np.arange(rd.shape[0])
+    ...                 'obs_ind': np.arange(rd.shape[0])
     ...                },
     ... )
     Inference data...
@@ -451,7 +462,7 @@ class PropensityScore(PyMCModel):
     >>> ps = PropensityScore(sample_kwargs={"progressbar": False})
     >>> ps.fit(X, t, coords={
     ...                 'coeffs': ['age', 'race'],
-    ...                 'obs_indx': np.arange(df.shape[0])
+    ...                 'obs_ind': np.arange(df.shape[0])
     ...                },
     ... )
     Inference...