diff --git a/docs/source/user-guide/common-operations/joins.rst b/docs/source/user-guide/common-operations/joins.rst index 40d922150..035d7488d 100644 --- a/docs/source/user-guide/common-operations/joins.rst +++ b/docs/source/user-guide/common-operations/joins.rst @@ -101,4 +101,36 @@ the right table. .. ipython:: python - left.join(right, left_on="customer_id", right_on="id", how="anti") \ No newline at end of file + left.join(right, left_on="customer_id", right_on="id", how="anti") + +Duplicate Keys +-------------- + +It is common to join two DataFrames on a common column name. Starting in +version 51.0.0, ``datafusion-python`` will now drop duplicate column names by +default. This reduces problems with ambiguous column selection after joins. +You can disable this feature by setting the parameter ``drop_duplicate_keys`` +to ``False``. + +.. ipython:: python + + left = ctx.from_pydict( + { + "id": [1, 2, 3], + "customer": ["Alice", "Bob", "Charlie"], + } + ) + + right = ctx.from_pylist([ + {"id": 1, "name": "CityCabs"}, + {"id": 2, "name": "MetroRide"}, + {"id": 5, "name": "UrbanGo"}, + ]) + + left.join(right, "id", how="inner") + +In contrast to the above example, if we wish to get both columns: + +.. ipython:: python + + left.join(right, "id", how="inner", drop_duplicate_keys=False) diff --git a/python/datafusion/dataframe.py b/python/datafusion/dataframe.py index c6ff7eda5..b3b48e963 100644 --- a/python/datafusion/dataframe.py +++ b/python/datafusion/dataframe.py @@ -774,6 +774,7 @@ def join( left_on: None = None, right_on: None = None, join_keys: None = None, + drop_duplicate_keys: bool = True, ) -> DataFrame: ... @overload @@ -786,6 +787,7 @@ def join( left_on: str | Sequence[str], right_on: str | Sequence[str], join_keys: tuple[list[str], list[str]] | None = None, + drop_duplicate_keys: bool = True, ) -> DataFrame: ... 
@overload @@ -798,6 +800,7 @@ def join( join_keys: tuple[list[str], list[str]], left_on: None = None, right_on: None = None, + drop_duplicate_keys: bool = True, ) -> DataFrame: ... def join( @@ -809,6 +812,7 @@ def join( left_on: str | Sequence[str] | None = None, right_on: str | Sequence[str] | None = None, join_keys: tuple[list[str], list[str]] | None = None, + drop_duplicate_keys: bool = True, ) -> DataFrame: """Join this :py:class:`DataFrame` with another :py:class:`DataFrame`. @@ -821,11 +825,23 @@ def join( "right", "full", "semi", "anti". left_on: Join column of the left dataframe. right_on: Join column of the right dataframe. + drop_duplicate_keys: When True, the columns from the right DataFrame + that have identical names in the ``on`` fields to the left DataFrame + will be dropped. join_keys: Tuple of two lists of column names to join on. [Deprecated] Returns: DataFrame after join. """ + if join_keys is not None: + warnings.warn( + "`join_keys` is deprecated, use `on` or `left_on` with `right_on`", + category=DeprecationWarning, + stacklevel=2, + ) + left_on = join_keys[0] + right_on = join_keys[1] + # This check is to prevent breaking API changes where users prior to # DF 43.0.0 would pass the join_keys as a positional argument instead # of a keyword argument. 
@@ -836,18 +852,10 @@ def join( and isinstance(on[1], list) ): # We know this is safe because we've checked the types - join_keys = on # type: ignore[assignment] + left_on = on[0] + right_on = on[1] on = None - if join_keys is not None: - warnings.warn( - "`join_keys` is deprecated, use `on` or `left_on` with `right_on`", - category=DeprecationWarning, - stacklevel=2, - ) - left_on = join_keys[0] - right_on = join_keys[1] - if on is not None: if left_on is not None or right_on is not None: error_msg = "`left_on` or `right_on` should not provided with `on`" @@ -866,7 +874,9 @@ def join( if isinstance(right_on, str): right_on = [right_on] - return DataFrame(self.df.join(right.df, how, left_on, right_on)) + return DataFrame( + self.df.join(right.df, how, left_on, right_on, drop_duplicate_keys) + ) def join_on( self, diff --git a/python/datafusion/dataframe_formatter.py b/python/datafusion/dataframe_formatter.py index 4082ff4ec..bb53d323e 100644 --- a/python/datafusion/dataframe_formatter.py +++ b/python/datafusion/dataframe_formatter.py @@ -370,7 +370,7 @@ def _build_table_container_start(self) -> list[str]: f"max-height: {self.max_height}px; overflow: auto; border: " '1px solid #ccc;">' ) - html.append('