Skip to content

Commit 9c17fcf

Browse files
Make .arrow() from relation return record batch reader (#32)
Port of duckdb/duckdb#18742; see also duckdb/duckdb#18642.
2 parents c7b7a88 + 6aaf2bc commit 9c17fcf

File tree

4 files changed

+17
-7
lines changed

4 files changed

+17
-7
lines changed

duckdb/__init__.pyi

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -415,7 +415,7 @@ class DuckDBPyRelation:
415415
def variance(self, column: str, groups: str = ..., window_spec: str = ..., projected_columns: str = ...) -> DuckDBPyRelation: ...
416416
def list(self, column: str, groups: str = ..., window_spec: str = ..., projected_columns: str = ...) -> DuckDBPyRelation: ...
417417

418-
def arrow(self, batch_size: int = ...) -> pyarrow.lib.Table: ...
418+
def arrow(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ...
419419
def __arrow_c_stream__(self, requested_schema: Optional[object] = None) -> object: ...
420420
def create(self, table_name: str) -> None: ...
421421
def create_view(self, view_name: str, replace: bool = ...) -> DuckDBPyRelation: ...
@@ -448,6 +448,7 @@ class DuckDBPyRelation:
448448
def pl(self, rows_per_batch: int = ..., connection: DuckDBPyConnection = ...) -> polars.DataFrame: ...
449449
def query(self, virtual_table_name: str, sql_query: str) -> DuckDBPyRelation: ...
450450
def record_batch(self, batch_size: int = ...) -> pyarrow.lib.RecordBatchReader: ...
451+
def fetch_record_batch(self, rows_per_batch: int = 1000000, *, connection: DuckDBPyConnection = ...) -> pyarrow.lib.RecordBatchReader: ...
451452
def select_types(self, types: List[Union[str, DuckDBPyType]]) -> DuckDBPyRelation: ...
452453
def select_dtypes(self, types: List[Union[str, DuckDBPyType]]) -> DuckDBPyRelation: ...
453454
def set_alias(self, alias: str) -> DuckDBPyRelation: ...

duckdb/experimental/spark/sql/dataframe.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ def toArrow(self) -> "pa.Table":
7575
age: [[2,5]]
7676
name: [["Alice","Bob"]]
7777
"""
78-
return self.relation.arrow()
78+
return self.relation.to_arrow_table()
7979

8080
def createOrReplaceTempView(self, name: str) -> None:
8181
"""Creates or replaces a local temporary view with this :class:`DataFrame`.

src/duckdb_py/pyrelation/initialize.cpp

Lines changed: 13 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -61,7 +61,7 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
6161
py::arg("date_as_object") = false)
6262
.def("fetch_df_chunk", &DuckDBPyRelation::FetchDFChunk, "Execute and fetch a chunk of the rows",
6363
py::arg("vectors_per_chunk") = 1, py::kw_only(), py::arg("date_as_object") = false)
64-
.def("arrow", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table",
64+
.def("arrow", &DuckDBPyRelation::ToRecordBatch, "Execute and return an Arrow Record Batch Reader that yields all rows",
6565
py::arg("batch_size") = 1000000)
6666
.def("fetch_arrow_table", &DuckDBPyRelation::ToArrowTable, "Execute and fetch all rows as an Arrow Table",
6767
py::arg("batch_size") = 1000000)
@@ -78,10 +78,18 @@ static void InitializeConsumers(py::class_<DuckDBPyRelation> &m) {
7878
)";
7979
m.def("__arrow_c_stream__", &DuckDBPyRelation::ToArrowCapsule, capsule_docs,
8080
py::arg("requested_schema") = py::none());
81-
m.def("record_batch", &DuckDBPyRelation::ToRecordBatch,
82-
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000)
83-
.def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch,
84-
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000);
81+
m.def("fetch_record_batch", &DuckDBPyRelation::ToRecordBatch,
82+
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("rows_per_batch") = 1000000)
83+
.def("fetch_arrow_reader", &DuckDBPyRelation::ToRecordBatch,
84+
"Execute and return an Arrow Record Batch Reader that yields all rows", py::arg("batch_size") = 1000000)
85+
.def("record_batch",
86+
[](pybind11::object &self, idx_t rows_per_batch)
87+
{
88+
PyErr_WarnEx(PyExc_DeprecationWarning,
89+
"record_batch() is deprecated, use fetch_record_batch() instead.",
90+
0);
91+
return self.attr("fetch_record_batch")(rows_per_batch);
92+
}, py::arg("batch_size") = 1000000);
8593
}
8694

8795
static void InitializeAggregates(py::class_<DuckDBPyRelation> &m) {

tests/pytest.ini

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@
33
filterwarnings =
44
error
55
ignore::UserWarning
6+
ignore::DeprecationWarning
67
# Jupyter is throwing DeprecationWarnings
78
ignore:function ham\(\) is deprecated:DeprecationWarning
89
# Pyspark is throwing these warnings

0 commit comments

Comments (0)