Skip to content

Commit 0f48cb5

Browse files
committed
Add string formatter
1 parent 0e25450 commit 0f48cb5

File tree

2 files changed

+99
-63
lines changed

2 files changed

+99
-63
lines changed

python/datafusion/html_formatter.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -26,6 +26,8 @@
2626
runtime_checkable,
2727
)
2828

29+
from datafusion._internal import DataFrame as DataFrameInternal
30+
2931

3032
def _validate_positive_int(value: Any, param_name: str) -> None:
3133
"""Validate that a parameter is a positive integer.
@@ -345,6 +347,32 @@ def format_html(
345347

346348
return "\n".join(html)
347349

350+
def format_str(
351+
self,
352+
batches: list,
353+
schema: Any,
354+
has_more: bool = False,
355+
table_uuid: str | None = None,
356+
) -> str:
357+
"""Format record batches as a string.
358+
359+
This method is used by DataFrame's __repr__ implementation and can be
360+
called directly when string rendering is needed.
361+
362+
Args:
363+
batches: List of Arrow RecordBatch objects
364+
schema: Arrow Schema object
365+
has_more: Whether there are more batches not shown
366+
table_uuid: Unique ID for the table, used for JavaScript interactions
367+
368+
Returns:
369+
String representation of the data
370+
371+
Raises:
372+
TypeError: If schema is invalid and no batches are provided
373+
"""
374+
return DataFrameInternal.default_str_repr(batches, schema, has_more, table_uuid)
375+
348376
def _build_html_header(self) -> list[str]:
349377
"""Build the HTML header with CSS styles."""
350378
html = []

src/dataframe.rs

Lines changed: 71 additions & 63 deletions
Original file line numberDiff line numberDiff line change
@@ -24,6 +24,7 @@ use arrow::compute::can_cast_types;
2424
use arrow::error::ArrowError;
2525
use arrow::ffi::FFI_ArrowSchema;
2626
use arrow::ffi_stream::FFI_ArrowArrayStream;
27+
use arrow::pyarrow::FromPyArrow;
2728
use datafusion::arrow::datatypes::Schema;
2829
use datafusion::arrow::pyarrow::{PyArrowType, ToPyArrow};
2930
use datafusion::arrow::util::pretty;
@@ -301,68 +302,8 @@ impl PyDataFrame {
301302
batches: None,
302303
}
303304
}
304-
}
305-
306-
#[pymethods]
307-
impl PyDataFrame {
308-
/// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]`
309-
fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult<Self> {
310-
if let Ok(key) = key.extract::<PyBackedStr>() {
311-
// df[col]
312-
self.select_columns(vec![key])
313-
} else if let Ok(tuple) = key.downcast::<PyTuple>() {
314-
// df[col1, col2, col3]
315-
let keys = tuple
316-
.iter()
317-
.map(|item| item.extract::<PyBackedStr>())
318-
.collect::<PyResult<Vec<PyBackedStr>>>()?;
319-
self.select_columns(keys)
320-
} else if let Ok(keys) = key.extract::<Vec<PyBackedStr>>() {
321-
// df[[col1, col2, col3]]
322-
self.select_columns(keys)
323-
} else {
324-
let message = "DataFrame can only be indexed by string index or indices".to_string();
325-
Err(PyDataFusionError::Common(message))
326-
}
327-
}
328-
329-
fn __repr__(&mut self, py: Python) -> PyDataFusionResult<String> {
330-
// Get the Python formatter config
331-
let PythonFormatter {
332-
formatter: _,
333-
config,
334-
} = get_python_formatter_with_config(py)?;
335-
336-
let should_cache = *is_ipython_env(py) && self.batches.is_none();
337-
let (batches, has_more) = match self.batches.take() {
338-
Some(b) => b,
339-
None => wait_for_future(
340-
py,
341-
collect_record_batches_to_display(self.df.as_ref().clone(), config),
342-
)??,
343-
};
344-
345-
if batches.is_empty() {
346-
// This should not be reached, but do it for safety since we index into the vector below
347-
return Ok("No data to display".to_string());
348-
}
349-
350-
let batches_as_displ =
351-
pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?;
352-
353-
let additional_str = match has_more {
354-
true => "\nData truncated.",
355-
false => "",
356-
};
357-
358-
if should_cache {
359-
self.batches = Some((batches, has_more));
360-
}
361-
362-
Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}"))
363-
}
364305

365-
fn _repr_html_(&mut self, py: Python) -> PyDataFusionResult<String> {
306+
fn prepare_repr_string(&mut self, py: Python, as_html: bool) -> PyDataFusionResult<String> {
366307
// Get the Python formatter and config
367308
let PythonFormatter { formatter, config } = get_python_formatter_with_config(py)?;
368309

@@ -398,15 +339,82 @@ impl PyDataFrame {
398339
kwargs.set_item("has_more", has_more)?;
399340
kwargs.set_item("table_uuid", table_uuid)?;
400341

401-
let html_result = formatter.call_method("format_html", (), Some(&kwargs))?;
402-
let html_str: String = html_result.extract()?;
342+
let method_name = match as_html {
343+
true => "format_html",
344+
false => "format_str",
345+
};
403346

347+
let html_result = formatter.call_method(method_name, (), Some(&kwargs))?;
348+
let html_str: String = html_result.extract()?;
404349
if should_cache {
405350
self.batches = Some((batches, has_more));
406351
}
407352

408353
Ok(html_str)
409354
}
355+
}
356+
357+
#[pymethods]
358+
impl PyDataFrame {
359+
/// Enable selection for `df[col]`, `df[col1, col2, col3]`, and `df[[col1, col2, col3]]`
360+
fn __getitem__(&self, key: Bound<'_, PyAny>) -> PyDataFusionResult<Self> {
361+
if let Ok(key) = key.extract::<PyBackedStr>() {
362+
// df[col]
363+
self.select_columns(vec![key])
364+
} else if let Ok(tuple) = key.downcast::<PyTuple>() {
365+
// df[col1, col2, col3]
366+
let keys = tuple
367+
.iter()
368+
.map(|item| item.extract::<PyBackedStr>())
369+
.collect::<PyResult<Vec<PyBackedStr>>>()?;
370+
self.select_columns(keys)
371+
} else if let Ok(keys) = key.extract::<Vec<PyBackedStr>>() {
372+
// df[[col1, col2, col3]]
373+
self.select_columns(keys)
374+
} else {
375+
let message = "DataFrame can only be indexed by string index or indices".to_string();
376+
Err(PyDataFusionError::Common(message))
377+
}
378+
}
379+
380+
/// Python `__repr__`: plain-text rendering of the DataFrame.
///
/// Delegates to `prepare_repr_string` with HTML output disabled, which
/// dispatches to the Python formatter's `format_str` method.
fn __repr__(&mut self, py: Python) -> PyDataFusionResult<String> {
    let as_html = false;
    self.prepare_repr_string(py, as_html)
}
383+
384+
/// Python `_repr_html_`: HTML rendering of the DataFrame.
///
/// Delegates to `prepare_repr_string` with HTML output enabled, which
/// dispatches to the Python formatter's `format_html` method.
fn _repr_html_(&mut self, py: Python) -> PyDataFusionResult<String> {
    let as_html = true;
    self.prepare_repr_string(py, as_html)
}
387+
388+
#[staticmethod]
389+
#[expect(unused_variables)]
390+
fn default_str_repr<'py>(
391+
batches: Vec<Bound<'py, PyAny>>,
392+
schema: &Bound<'py, PyAny>,
393+
has_more: bool,
394+
table_uuid: &str,
395+
) -> PyResult<String> {
396+
let batches = batches
397+
.into_iter()
398+
.map(|batch| RecordBatch::from_pyarrow_bound(&batch))
399+
.collect::<PyResult<Vec<RecordBatch>>>()?
400+
.into_iter()
401+
.filter(|batch| batch.num_rows() > 0)
402+
.collect::<Vec<_>>();
403+
404+
if batches.is_empty() {
405+
return Ok("No data to display".to_owned());
406+
}
407+
408+
let batches_as_displ =
409+
pretty::pretty_format_batches(&batches).map_err(py_datafusion_err)?;
410+
411+
let additional_str = match has_more {
412+
true => "\nData truncated.",
413+
false => "",
414+
};
415+
416+
Ok(format!("DataFrame()\n{batches_as_displ}{additional_str}"))
417+
}
410418

411419
/// Calculate summary statistics for a DataFrame
412420
fn describe(&self, py: Python) -> PyDataFusionResult<Self> {

0 commit comments

Comments
 (0)