Added offset option for fepois

bradhackinen · bradhackinen · commit 7f9cc5d9eafa · 2025-02-03T16:03:03.000-08:00
diff --git a/pyfixest/estimation/FixestMulti_.py b/pyfixest/estimation/FixestMulti_.py
@@ -135,6 +135,7 @@ def _prepare_estimation(
         ssc: Optional[dict[str, Union[str, bool]]] = None,
         fixef_rm: str = "none",
         drop_intercept: bool = False,
+        offset: Optional[Union[None, str]] = None,
     ) -> None:
         """
         Prepare model for estimation.
@@ -153,9 +154,13 @@ def _prepare_estimation(
             A string or dictionary specifying the type of variance-covariance
             matrix to use for inference.
             See `feols()` or `fepois()`.
-        weights : Union[None, np.ndarray], optional
-            An array of weights.
-            Either None or a 1D array of length N. Default is None.
+        weights : Union[None, str], optional
+            Default is None. Weights for WLS estimation. If None, all observations
+            are weighted equally. If a string, the name of the column in `data` that
+            contains the weights.
+        offset : Union[None, str], optional
+            Default is None. Offset variable for Poisson regression. If None, no offset.
+            If a string, the name of the column in `data` that contains the offset.
         ssc : dict[str, str], optional
             A dictionary specifying the type of standard errors to use for inference.
             See `feols()` or `fepois()`.
@@ -179,6 +184,7 @@ def _prepare_estimation(
         self._is_multiple_estimation = False
         self._drop_intercept = False
         self._weights = weights
+        self._offset = offset
         self._has_weights = False
         if weights is not None:
             self._has_weights = True
@@ -247,6 +253,7 @@ def _estimate_all_models(
         _ssc_dict = self._ssc_dict
         _drop_intercept = self._drop_intercept
         _weights = self._weights
+        _offset = self._offset
         _fixef_tol = self._fixef_tol
         _weights_type = self._weights_type
         _lean = self._lean
@@ -339,6 +346,7 @@ def _estimate_all_models(
                             drop_intercept=_drop_intercept,
                             weights=_weights,
                             weights_type=_weights_type,
+                            offset=_offset,
                             solver=solver,
                             demeaner_backend=demeaner_backend,
                             collin_tol=collin_tol,
diff --git a/pyfixest/estimation/estimation.py b/pyfixest/estimation/estimation.py
@@ -1,5 +1,5 @@
 from collections.abc import Mapping
-from typing import Any, Optional, Union
+from typing import Any, Optional, Union, Sequence
 
 import pandas as pd
 
@@ -503,6 +503,7 @@ def fepois(
     fml: str,
     data: DataFrameType,  # type: ignore
     vcov: Optional[Union[VcovTypeOptions, dict[str, str]]] = None,
+    offset:  Union[None, str] = None,
     ssc: Optional[dict[str, Union[str, bool]]] = None,
     fixef_rm: FixedRmOptions = "none",
     fixef_tol: float = 1e-08,
@@ -545,6 +546,10 @@ def fepois(
         Type of variance-covariance matrix for inference. Options include "iid",
         "hetero", "HC1", "HC2", "HC3", or a dictionary for CRV1/CRV3 inference.
 
+    offset : Union[None, str], optional
+        Default is None. Offset variable for Poisson regression. If None, no offset.
+        If a string, the name of the column in `data` that contains the offset.
+
     ssc : str
         A ssc object specifying the small sample correction for inference.
 
@@ -670,6 +675,7 @@ def fepois(
         data=data,
         vcov=vcov,
         weights=weights,
+        offset=offset,
         ssc=ssc,
         fixef_rm=fixef_rm,
         collin_tol=collin_tol,
@@ -702,7 +708,7 @@ def fepois(
     )
 
     fixest._prepare_estimation(
-        "fepois", fml, vcov, weights, ssc, fixef_rm, drop_intercept
+        "fepois", fml, vcov, weights, ssc, fixef_rm, drop_intercept, offset=offset
     )
     if fixest._is_iv:
         raise NotImplementedError(
@@ -1001,6 +1007,7 @@ def _estimation_input_checks(
     split: Optional[str],
     fsplit: Optional[str],
     separation_check: Optional[list[str]] = None,
+    offset: Optional[Union[None, str]] = None,
 ):
     if not isinstance(fml, str):
         raise TypeError("fml must be a string")
@@ -1027,6 +1034,9 @@ def _estimation_input_checks(
     if weights is not None:
         assert weights in data.columns, "weights must be a column in data"
 
+    if offset is not None:
+        assert offset in data.columns, "offset must be a column in data"
+
     bool_args = [copy_data, store_data, lean]
     for arg in bool_args:
         if not isinstance(arg, bool):
diff --git a/pyfixest/estimation/feols_.py b/pyfixest/estimation/feols_.py
@@ -229,6 +229,7 @@ def __init__(
         context: Union[int, Mapping[str, Any]] = 0,
         sample_split_var: Optional[str] = None,
         sample_split_value: Optional[Union[str, int, float]] = None,
+        offset: Optional[str] = None,
     ) -> None:
         self._sample_split_value = sample_split_value
         self._sample_split_var = sample_split_var
@@ -250,6 +251,7 @@ def __init__(
         self._drop_intercept = drop_intercept
         self._weights_name = weights
         self._weights_type = weights_type
+        self._offset_name = offset
         self._has_weights = weights is not None
         self._collin_tol = collin_tol
         self._fixef_tol = fixef_tol
@@ -347,6 +349,7 @@ def prepare_model_matrix(self):
             drop_singletons=self._drop_singletons,
             drop_intercept=self._drop_intercept,
             weights=self._weights_name,
+            offset=self._offset_name,
             context=self._context,
         )
 
@@ -357,6 +360,7 @@ def prepare_model_matrix(self):
         self._endogvar = mm_dict.get("endogvar")
         self._Z = mm_dict.get("Z")
         self._weights_df = mm_dict.get("weights_df")
+        self._offset_df = mm_dict.get("offset_df")
         self._na_index = mm_dict.get("na_index")
         self._na_index_str = mm_dict.get("na_index_str")
         self._icovars = mm_dict.get("icovars")
@@ -372,8 +376,10 @@ def prepare_model_matrix(self):
         self._data = _drop_cols(self._data, self._na_index)
 
         self._weights = self._set_weights()
+        self._offset = self._set_offset()
         self._N, self._N_rows = self._set_nobs()
 
+
     def _set_nobs(self) -> tuple[int, int]:
         """
         Fetch the number of observations used in fitting the regression model.
@@ -411,6 +417,27 @@ def _set_weights(self) -> np.ndarray:
             _weights = np.ones(N)
 
         return _weights.reshape((N, 1))
+    
+    def _set_offset(self) -> np.ndarray:
+        """
+        Return the offset used in the regression model.
+
+        Returns
+        -------
+        np.ndarray
+            The offset used in the regression model.
+            If no offset is used, returns an array of zeros
+            with the same length as the dependent variable array.
+        """
+
+        N = len(self._Y)
+
+        if self._offset_df is not None:
+            _offset = self._offset_df.to_numpy()
+        else:
+            _offset = np.zeros(N)
+
+        return _offset.reshape((N, 1))
 
     def demean(self):
         "Demean the dependent variable and covariates by the fixed effect(s)."
diff --git a/pyfixest/estimation/fepois_.py b/pyfixest/estimation/fepois_.py
@@ -80,6 +80,7 @@ def __init__(
         drop_intercept: bool,
         weights: Optional[str],
         weights_type: Optional[str],
+        offset: Optional[float],
         collin_tol: float,
         fixef_tol: float,
         lookup_demeaned_data: dict[str, pd.DataFrame],
@@ -105,6 +106,7 @@ def __init__(
             drop_intercept=drop_intercept,
             weights=weights,
             weights_type=weights_type,
+            offset=offset,
             collin_tol=collin_tol,
             fixef_tol=fixef_tol,
             lookup_demeaned_data=lookup_demeaned_data,
@@ -147,7 +149,7 @@ def prepare_model_matrix(self):
             raise ValueError(
                 "The dependent variable must be a weakly positive integer."
             )
-
+        
         # check for separation
         na_separation: list[int] = []
         if (
@@ -168,6 +170,7 @@ def prepare_model_matrix(self):
             self._Y.drop(na_separation, axis=0, inplace=True)
             self._X.drop(na_separation, axis=0, inplace=True)
             self._fe.drop(na_separation, axis=0, inplace=True)
+            self._offset = np.delete(self._offset,na_separation,axis=0) # _offset is a numpy array so we use delete instead of drop
             self._data.drop(na_separation, axis=0, inplace=True)
             self._N = self._Y.shape[0]
 
@@ -217,6 +220,7 @@ def get_fit(self) -> None:
         _Y = self._Y
         _X = self._X
         _fe = self._fe
+        _offset = self._offset
         _N = self._N
         _convergence = self.convergence  # False
         _maxiter = self.maxiter
@@ -251,13 +255,13 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray):
                 _mean = np.mean(_Y)
                 mu = (_Y + _mean) / 2
                 eta = np.log(mu)
-                Z = eta + _Y / mu - 1
+                Z = eta - _offset + _Y / mu - 1
                 reg_Z = Z.copy()
                 last = compute_deviance(_Y, mu)
 
             else:
                 # update w and Z
-                Z = eta + _Y / mu - 1  # eq (8)
+                Z = eta - _offset + _Y / mu - 1  # eq (8)
                 reg_Z = Z.copy()  # eq (9)
 
             # tighten HDFE tolerance - currently not possible with PyHDFE
@@ -294,7 +298,7 @@ def compute_deviance(_Y: np.ndarray, mu: np.ndarray):
 
             mu_old = mu.copy()
             # more updating
-            eta = Z - resid
+            eta = Z - resid + _offset
             mu = np.exp(eta)
 
             # same criterion as fixest
@@ -695,3 +699,4 @@ def _fepois_input_checks(drop_singletons: bool, tol: float, maxiter: int):
         raise TypeError("maxiter must be integer.")
     if maxiter <= 0:
         raise AssertionError("maxiter must be greater than 0.")
+        
diff --git a/pyfixest/estimation/model_matrix_fixest_.py b/pyfixest/estimation/model_matrix_fixest_.py
@@ -17,6 +17,7 @@ def model_matrix_fixest(
     data: pd.DataFrame,
     drop_singletons: bool = False,
     weights: Optional[str] = None,
+    offset: Optional[str] = None,
     drop_intercept=False,
     context: Union[int, Mapping[str, Any]] = 0,
 ) -> dict:
@@ -123,14 +124,15 @@ def model_matrix_fixest(
         **({"fml_first_stage": fml_first_stage} if _is_iv else {}),
         **({"fe": wrap_factorize(fval)} if fval != "0" else {}),
         **({"weights": weights} if weights is not None else {}),
+        **({"offset": offset} if offset is not None else {}),
     }
 
     FML = Formula(**fml_kwargs)
     _context = capture_context(context)
     mm = FML.get_model_matrix(
         data, output="pandas", context={"factorize": factorize, **_context}
     )
-    endogvar = Z = weights_df = fe = None
+    endogvar = Z = weights_df = offset_df = fe = None
 
     Y = mm["fml_second_stage"]["lhs"]
     X = mm["fml_second_stage"]["rhs"]
@@ -142,8 +144,10 @@ def model_matrix_fixest(
         fe = mm["fe"]
     if weights is not None:
         weights_df = mm["weights"]
+    if offset is not None:
+        offset_df = mm["offset"]
 
-    for df in [Y, X, Z, endogvar, weights_df]:
+    for df in [Y, X, Z, endogvar, weights_df,offset_df]:
         if df is not None:
             cols_to_convert = df.select_dtypes(exclude=["int64", "float64"]).columns
             if cols_to_convert.size > 0:
@@ -196,6 +200,8 @@ def model_matrix_fixest(
                 endogvar = endogvar[keep_idx]
             if weights_df is not None:
                 weights_df = weights_df[keep_idx]
+            if offset is not None:
+                offset_df = offset_df[keep_idx]
 
     na_index = _get_na_index(data.shape[0], Y.index)
     na_index_str = ",".join(str(x) for x in na_index)
@@ -213,6 +219,7 @@ def model_matrix_fixest(
         "endogvar": endogvar,
         "Z": Z,
         "weights_df": weights_df,
+        "offset_df": offset_df,
         "na_index": na_index,
         "na_index_str": na_index_str,
         "icovars": _icovars,