@@ -1480,7 +1480,6 @@ def build_dataframe(args, constructor):
14801480 # data_frame is PySpark: it does not support interchange protocol and it is not
14811481 # integrated in Narwhals. We use its native method to convert it to pandas.
14821482 elif hasattr (args ["data_frame" ], "toPandas" ):
1483- # data_frame is PySpark:
14841483 args ["data_frame" ] = nw .from_native (
14851484 args ["data_frame" ].toPandas (), eager_only = True
14861485 )
@@ -1603,8 +1602,10 @@ def build_dataframe(args, constructor):
16031602 value_name = _escape_col_name (columns , "value" , [])
16041603 var_name = _escape_col_name (columns , var_name , [])
16051604
1605+ # If the data_frame has interchange-only support level in Narwhals, then we need to
1606+ # convert it to a full support level backend.
1607+ # Hence we convert it from the Interchange level to PyArrow.
16061608 if needs_interchanging :
1607- # Interchange to PyArrow
16081609 if wide_mode :
16091610 args ["data_frame" ] = nw .from_native (
16101611 args ["data_frame" ].to_arrow (), eager_only = True
@@ -1908,25 +1909,38 @@ def process_dataframe_hierarchy(args):
19081909 n_bytes = 16 , columns = [* path , count_colname ]
19091910 )
19101911
1912+ # In theory, for discrete columns aggregation, we should have a way to do
1913+ # `.agg(nw.col(x).unique())` in group_by and successively unpack/parse it as:
1914+ # ```
1915+ # (nw.when(nw.col(x).list.len()==1)
1916+ # .then(nw.col(x).list.first())
1917+ # .otherwise(nw.lit("(?)"))
1918+ # )
1919+ # ```
1920+ # which replicates the original pandas-only codebase:
1921+ # ```
1922+ # def discrete_agg(x):
1923+ # uniques = x.unique()
1924+ # return uniques[0] if len(uniques) == 1 else "(?)"
1925+ #
1926+ # df.groupby(path[i:]).agg(...)
1927+ # ```
1928+ # However this is not possible, therefore the following workaround is provided.
1929+ # We make two aggregations for the same column:
1930+ # - take the max value
1931+ # - take the number of unique values
1932+ # Finally, after the group by statement, it is unpacked via:
1933+ # ```
1934+ # (nw.when(nw.col(col_n_unique) == 1)
1935+ # .then(nw.col(col_max_value)) # which is the unique value
1936+ # .otherwise(nw.lit("(?)"))
1937+ # )
1938+ # ```
1939+
19111940 if args ["color" ]:
19121941 if discrete_color :
19131942
19141943 discrete_aggs .append (args ["color" ])
1915- # Hack: In theory, we should have a way to do `.agg(nw.col(x).unique())` and
1916- # successively unpack/parse it as:
1917- # ```
1918- # (nw.when(nw.col(x).list.len()==1)
1919- # .then(nw.col(x).list.first())
1920- # .otherwise(nw.lit("(?)"))
1921- # )
1922- # ```
1923- # which replicates:
1924- # ```
1925- # def discrete_agg(x):
1926- # uniques = x.unique()
1927- # return uniques[0] if len(uniques) == 1 else "(?)"
1928- # ```
1929- # However we cannot do that just yet, therefore a workaround is provided
19301944 agg_f [args ["color" ]] = nw .col (args ["color" ]).max ()
19311945 agg_f [f'{ args ["color" ]} _{ n_unique_token } __' ] = (
19321946 nw .col (args ["color" ])
0 commit comments