Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
34 changes: 33 additions & 1 deletion docs/source/user-guide/common-operations/joins.rst
Original file line number Diff line number Diff line change
Expand Up @@ -101,4 +101,36 @@ the right table.

.. ipython:: python

left.join(right, left_on="customer_id", right_on="id", how="anti")
left.join(right, left_on="customer_id", right_on="id", how="anti")

Duplicate Keys
--------------

It is common to join two DataFrames on a common column name. Starting in
version 51.0.0, ``datafusion-python``` will now drop duplicate column names by
default. This reduces problems with ambiguous column selection after joins.
You can disable this feature by setting the parameter ``drop_duplicate_keys``
to ``False``.

.. ipython:: python

left = ctx.from_pydict(
{
"id": [1, 2, 3],
"customer": ["Alice", "Bob", "Charlie"],
}
)

right = ctx.from_pylist([
{"id": 1, "name": "CityCabs"},
{"id": 2, "name": "MetroRide"},
{"id": 5, "name": "UrbanGo"},
])

left.join(right, "id", how="inner")

In contrast to the above example, if we wish to get both columns:

.. ipython:: python

left.join(right, "id", how="inner", drop_duplicate_keys=False)
32 changes: 21 additions & 11 deletions python/datafusion/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -774,6 +774,7 @@ def join(
left_on: None = None,
right_on: None = None,
join_keys: None = None,
drop_duplicate_keys: bool = True,
) -> DataFrame: ...

@overload
Expand All @@ -786,6 +787,7 @@ def join(
left_on: str | Sequence[str],
right_on: str | Sequence[str],
join_keys: tuple[list[str], list[str]] | None = None,
drop_duplicate_keys: bool = True,
) -> DataFrame: ...

@overload
Expand All @@ -798,6 +800,7 @@ def join(
join_keys: tuple[list[str], list[str]],
left_on: None = None,
right_on: None = None,
drop_duplicate_keys: bool = True,
) -> DataFrame: ...

def join(
Expand All @@ -809,6 +812,7 @@ def join(
left_on: str | Sequence[str] | None = None,
right_on: str | Sequence[str] | None = None,
join_keys: tuple[list[str], list[str]] | None = None,
drop_duplicate_keys: bool = True,
) -> DataFrame:
"""Join this :py:class:`DataFrame` with another :py:class:`DataFrame`.

Expand All @@ -821,11 +825,23 @@ def join(
"right", "full", "semi", "anti".
left_on: Join column of the left dataframe.
right_on: Join column of the right dataframe.
drop_duplicate_keys: When True, the columns from the right DataFrame
that have identical names in the ``on`` fields to the left DataFrame
will be dropped.
join_keys: Tuple of two lists of column names to join on. [Deprecated]

Returns:
DataFrame after join.
"""
if join_keys is not None:
warnings.warn(
"`join_keys` is deprecated, use `on` or `left_on` with `right_on`",
category=DeprecationWarning,
stacklevel=2,
)
left_on = join_keys[0]
right_on = join_keys[1]

# This check is to prevent breaking API changes where users prior to
# DF 43.0.0 would pass the join_keys as a positional argument instead
# of a keyword argument.
Expand All @@ -836,18 +852,10 @@ def join(
and isinstance(on[1], list)
):
# We know this is safe because we've checked the types
join_keys = on # type: ignore[assignment]
left_on = on[0]
right_on = on[1]
on = None

if join_keys is not None:
warnings.warn(
"`join_keys` is deprecated, use `on` or `left_on` with `right_on`",
category=DeprecationWarning,
stacklevel=2,
)
left_on = join_keys[0]
right_on = join_keys[1]

if on is not None:
if left_on is not None or right_on is not None:
error_msg = "`left_on` or `right_on` should not provided with `on`"
Expand All @@ -866,7 +874,9 @@ def join(
if isinstance(right_on, str):
right_on = [right_on]

return DataFrame(self.df.join(right.df, how, left_on, right_on))
return DataFrame(
self.df.join(right.df, how, left_on, right_on, drop_duplicate_keys)
)

def join_on(
self,
Expand Down
2 changes: 1 addition & 1 deletion python/datafusion/dataframe_formatter.py
Original file line number Diff line number Diff line change
Expand Up @@ -370,7 +370,7 @@ def _build_table_container_start(self) -> list[str]:
f"max-height: {self.max_height}px; overflow: auto; border: "
'1px solid #ccc;">'
)
html.append('<table style="border-collapse: collapse; min-width: 100%">')
html.append('<table style="border-collapse: collapse">')
return html

def _build_table_header(self, schema: Any) -> list[str]:
Expand Down
28 changes: 20 additions & 8 deletions python/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -647,7 +647,6 @@ def test_unnest_without_nulls(nested_df):
assert result.column(1) == pa.array([7, 8, 8, 9, 9, 9])


Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It would be good if new tests for keep_duplicate_keys=False or True were added. To ensure coverage, add tests verifying that passing keep_duplicate_keys=True preserves both columns.

@pytest.mark.filterwarnings("ignore:`join_keys`:DeprecationWarning")
def test_join():
ctx = SessionContext()

Expand All @@ -664,25 +663,38 @@ def test_join():
df1 = ctx.create_dataframe([[batch]], "r")

df2 = df.join(df1, on="a", how="inner")
df2.show()
df2 = df2.sort(column("l.a"))
table = pa.Table.from_batches(df2.collect())

expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
assert table.to_pydict() == expected

df2 = df.join(df1, left_on="a", right_on="a", how="inner")
df2.show()
# Test the default behavior for dropping duplicate keys
# Since we may have a duplicate column name and pa.Table()
# hides the fact, instead we need to explicitly check the
# resultant arrays.
df2 = df.join(df1, left_on="a", right_on="a", how="inner", drop_duplicate_keys=True)
df2 = df2.sort(column("l.a"))
table = pa.Table.from_batches(df2.collect())
result = df2.collect()[0]
assert result.num_columns == 3
assert result.column(0) == pa.array([1, 2], pa.int64())
assert result.column(1) == pa.array([4, 5], pa.int64())
assert result.column(2) == pa.array([8, 10], pa.int64())

expected = {"a": [1, 2], "c": [8, 10], "b": [4, 5]}
assert table.to_pydict() == expected
df2 = df.join(
df1, left_on="a", right_on="a", how="inner", drop_duplicate_keys=False
)
df2 = df2.sort(column("l.a"))
result = df2.collect()[0]
assert result.num_columns == 4
assert result.column(0) == pa.array([1, 2], pa.int64())
assert result.column(1) == pa.array([4, 5], pa.int64())
assert result.column(2) == pa.array([1, 2], pa.int64())
assert result.column(3) == pa.array([8, 10], pa.int64())

# Verify we don't make a breaking change to pre-43.0.0
# where users would pass join_keys as a positional argument
df2 = df.join(df1, (["a"], ["a"]), how="inner")
df2.show()
df2 = df2.sort(column("l.a"))
table = pa.Table.from_batches(df2.collect())

Expand Down
3 changes: 3 additions & 0 deletions python/tests/test_sql.py
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,9 @@ def test_register_parquet(ctx, tmp_path):
assert result.to_pydict() == {"cnt": [100]}


@pytest.mark.filterwarnings(
"ignore:using literals for table_partition_cols data types:DeprecationWarning"
)
@pytest.mark.parametrize(
("path_to_str", "legacy_data_type"), [(True, False), (False, False), (False, True)]
)
Expand Down
36 changes: 35 additions & 1 deletion src/dataframe.rs
Original file line number Diff line number Diff line change
Expand Up @@ -629,6 +629,7 @@ impl PyDataFrame {
how: &str,
left_on: Vec<PyBackedStr>,
right_on: Vec<PyBackedStr>,
drop_duplicate_keys: bool,
) -> PyDataFusionResult<Self> {
let join_type = match how {
"inner" => JoinType::Inner,
Expand All @@ -647,13 +648,46 @@ impl PyDataFrame {
let left_keys = left_on.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();
let right_keys = right_on.iter().map(|s| s.as_ref()).collect::<Vec<&str>>();

let df = self.df.as_ref().clone().join(
let mut df = self.df.as_ref().clone().join(
right.df.as_ref().clone(),
join_type,
&left_keys,
&right_keys,
None,
)?;

if drop_duplicate_keys {
let mutual_keys = left_keys
.iter()
.zip(right_keys.iter())
.filter(|(l, r)| l == r)
.map(|(key, _)| *key)
.collect::<Vec<_>>();

let fields_to_drop = mutual_keys
.iter()
.map(|name| {
df.logical_plan()
.schema()
.qualified_fields_with_unqualified_name(name)
})
.filter(|r| r.len() == 2)
.map(|r| r[1])
.collect::<Vec<_>>();

let expr: Vec<Expr> = df
.logical_plan()
.schema()
.fields()
.into_iter()
.enumerate()
.map(|(idx, _)| df.logical_plan().schema().qualified_field(idx))
.filter(|(qualifier, f)| !fields_to_drop.contains(&(*qualifier, f)))
.map(|(qualifier, field)| Expr::Column(Column::from((qualifier, field))))
.collect();
df = df.select(expr)?;
}

Ok(Self::new(df))
}

Expand Down