chdb-io · wudidapaopao · Sep 28, 2025 · Sep 29, 2025 · Sep 29, 2025 · Oct 28, 2025
diff --git a/.github/workflows/build_linux_arm64_wheels-gh.yml b/.github/workflows/build_linux_arm64_wheels-gh.yml
@@ -124,7 +124,7 @@ jobs:
           which clang++-19
           clang++-19 --version
           sudo apt-get install -y make cmake ccache ninja-build yasm gawk wget
-          # Install WebAssembly linker (wasm-ld) 
+          # Install WebAssembly linker (wasm-ld)
           sudo apt-get install -y lld-19
           # Create symlink for wasm-ld
           if ! command -v wasm-ld &> /dev/null; then

diff --git a/.github/workflows/build_macos_x86_wheels.yml b/.github/workflows/build_macos_x86_wheels.yml
@@ -133,7 +133,6 @@ jobs:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
-          token: ${{ secrets.GH_TOKEN }}
       - name: Update submodules
         run: |
           git submodule update --init --recursive --jobs 4

diff --git a/.github/workflows/build_musllinux_arm64_wheels.yml b/.github/workflows/build_musllinux_arm64_wheels.yml
@@ -143,7 +143,7 @@ jobs:
             # Update version for release (if triggered by tag)
             if [ "${GITHUB_REF#refs/tags/v}" != "$GITHUB_REF" ]; then
               pyenv shell 3.9
-              
+
               # Install bump-my-version
               python -m pip install bump-my-version
               TAG_NAME=${GITHUB_REF#refs/tags/v}

diff --git a/chdb/__init__.py b/chdb/__init__.py
@@ -38,9 +38,8 @@ class ChdbError(Exception):
     """
 
 
-_arrow_format = set({"dataframe", "arrowtable"})
+_arrow_format = set({"arrowtable"})
 _process_result_format_funs = {
-    "dataframe": lambda x: to_df(x),
     "arrowtable": lambda x: to_arrowTable(x),
 }
 
@@ -108,33 +107,6 @@ def to_arrowTable(res):
     return pa.RecordBatchFileReader(memview.view()).read_all()
 
 
-# return pandas dataframe
-def to_df(r):
-    """Convert query result to pandas DataFrame.
-
-    Converts a chDB query result to a pandas DataFrame by first converting to
-    PyArrow Table and then to pandas using multi-threading for better performance.
-
-    Args:
-        r: chDB query result object containing binary Arrow data
-
-    Returns:
-        pd.DataFrame: pandas DataFrame containing the query results
-
-    Raises:
-        ImportError: If pyarrow or pandas are not installed
-
-    Example:
-        >>> result = chdb.query("SELECT 1 as id, 'hello' as msg", "Arrow")
-        >>> df = chdb.to_df(result)
-        >>> print(df)
-           id    msg
-        0   1  hello
-    """
-    t = to_arrowTable(r)
-    return t.to_pandas(use_threads=True)
-
-
 # global connection lock, for multi-threading use of legacy chdb.query()
 g_conn_lock = threading.Lock()
 
@@ -222,6 +194,11 @@ def query(sql, output_format="CSV", path="", udf_path=""):
     with g_conn_lock:
         conn = _chdb.connect(conn_str)
         res = conn.query(sql, output_format)
+
+        if lower_output_format == "dataframe":
+            conn.close()
+            return res
+
         if res.has_error():
             conn.close()
             raise ChdbError(res.error_message())

diff --git a/chdb/state/sqlitelike.py b/chdb/state/sqlitelike.py
@@ -10,9 +10,8 @@
     raise ImportError("Failed to import pyarrow") from None
 
 
-_arrow_format = set({"dataframe", "arrowtable"})
+_arrow_format = set({"arrowtable"})
 _process_result_format_funs = {
-    "dataframe": lambda x: to_df(x),
     "arrowtable": lambda x: to_arrowTable(x),
 }
 
@@ -67,47 +66,6 @@ def to_arrowTable(res):
     return pa.RecordBatchFileReader(memview.view()).read_all()
 
 
-# return pandas dataframe
-def to_df(r):
-    """Convert query result to Pandas DataFrame.
-
-    This function converts chdb query results to a Pandas DataFrame format
-    by first converting to PyArrow Table and then to DataFrame. This provides
-    convenient data analysis capabilities with Pandas API.
-
-    Args:
-        r: Query result object from chdb containing Arrow format data
-
-    Returns:
-        pandas.DataFrame: DataFrame containing the query results with
-        appropriate column names and data types
-
-    Raises:
-        ImportError: If pyarrow or pandas packages are not installed
-
-    .. note::
-        This function uses multi-threading for the Arrow to Pandas conversion
-        to improve performance on large datasets.
-
-    .. seealso::
-        :func:`to_arrowTable` - For PyArrow Table format conversion
-
-    Examples:
-        >>> import chdb
-        >>> result = chdb.query("SELECT 1 as num, 'hello' as text", "Arrow")
-        >>> df = to_df(result)
-        >>> print(df)
-           num   text
-        0    1  hello
-        >>> print(df.dtypes)
-        num      int64
-        text    object
-        dtype: object
-    """
-    t = to_arrowTable(r)
-    return t.to_pandas(use_threads=True)
-
-
 class StreamingResult:
     def __init__(self, c_result, conn, result_func, supports_record_batch):
         self._result = c_result

diff --git a/programs/local/CMakeLists.txt b/programs/local/CMakeLists.txt
@@ -25,13 +25,19 @@ endif()
 if (USE_PYTHON)
     set (CHDB_SOURCES
         chdb.cpp
+        ChunkCollectorOutputFormat.cpp
+        FieldToPython.cpp
         FormatHelper.cpp
         ListScan.cpp
         LocalChdb.cpp
         LocalServer.cpp
+        NumpyArray.cpp
+        NumpyNestedTypes.cpp
         NumpyType.cpp
+        ObjectToPython.cpp
         PandasAnalyzer.cpp
         PandasDataFrame.cpp
+        PandasDataFrameBuilder.cpp
         PandasScan.cpp
         PyArrowStreamFactory.cpp
         PyArrowTable.cpp

diff --git a/programs/local/ChunkCollectorOutputFormat.cpp b/programs/local/ChunkCollectorOutputFormat.cpp
@@ -0,0 +1,119 @@
+#include "ChunkCollectorOutputFormat.h"
+#include "PandasDataFrameBuilder.h"
+
+#include <memory>
+#include <utility>
+
+#include <IO/NullWriteBuffer.h>
+#include <Processors/Port.h>
+#include <Client/ClientBase.h>
+#include <Common/Exception.h>
+#include <base/defines.h>
+
+namespace DB::ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+using namespace DB;
+
+namespace CHDB
+{
+
+/// Write buffer handed to the IOutputFormat base class; nothing in this file writes to it.
+NullWriteBuffer ChunkCollectorOutputFormat::out;
+
+/// Binds the format to `builder`, which receives every buffered chunk in finalizeImpl().
+/// Only a reference is kept, so `builder` must stay alive for as long as this format is used.
+ChunkCollectorOutputFormat::ChunkCollectorOutputFormat(
+    SharedHeader shared_header,
+    PandasDataFrameBuilder & builder)
+    : IOutputFormat(shared_header, out)
+    , dataframe_builder(builder)
+{}
+
+/// Buffers a data chunk; nothing is handed to the builder until finalizeImpl().
+void ChunkCollectorOutputFormat::consume(Chunk chunk)
+{
+    chunks.emplace_back(std::move(chunk));
+}
+
+/// Buffers the totals chunk in the same list as regular data chunks.
+/// NOTE(review): totals therefore reach the builder like ordinary chunks - confirm this is intended.
+void ChunkCollectorOutputFormat::consumeTotals(Chunk totals)
+{
+    chunks.emplace_back(std::move(totals));
+}
+
+/// Buffers the extremes chunk in the same list as regular data chunks.
+/// NOTE(review): extremes therefore reach the builder like ordinary chunks - confirm this is intended.
+void ChunkCollectorOutputFormat::consumeExtremes(Chunk extremes)
+{
+    chunks.emplace_back(std::move(extremes));
+}
+
+/// Passes the buffered chunks to the builder in arrival order, finalizes it and frees the buffer.
+void ChunkCollectorOutputFormat::finalizeImpl()
+{
+    for (const auto & chunk : chunks)
+        dataframe_builder.addChunk(chunk);
+
+    /// Finalize the DataFrame generation
+    dataframe_builder.finalize();
+
+    chunks.clear();
+}
+
+/// Global dataframe builder, installed by setGlobalDataFrameBuilder() / createDataFrameOutputFormat().
+/// NOTE(review): accessed without any locking here - presumably callers serialize queries; confirm.
+static std::shared_ptr<PandasDataFrameBuilder> g_dataframe_builder = nullptr;
+
+/// Returns the current global builder.
+/// Throws LOGICAL_ERROR instead of dereferencing a null pointer when no builder is set
+/// (before the first DataFrame format is created, or after resetGlobalDataFrameBuilder()).
+PandasDataFrameBuilder & getGlobalDataFrameBuilder()
+{
+    if (!g_dataframe_builder)
+        throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Global DataFrame builder is not set");
+
+    return *g_dataframe_builder;
+}
+
+/// Installs `builder` as the global builder, dropping this file's reference to the previous one.
+/// NOTE(review): unlike resetGlobalDataFrameBuilder(), a previous builder released here is not
+/// released under the GIL - confirm callers always reset before a new format is created.
+void setGlobalDataFrameBuilder(std::shared_ptr<PandasDataFrameBuilder> builder)
+{
+    g_dataframe_builder = std::move(builder);
+}
+
+/// Drops the global builder, if any, while holding the Python GIL.
+void resetGlobalDataFrameBuilder()
+{
+    if (g_dataframe_builder)
+    {
+        py::gil_scoped_acquire acquire;
+        g_dataframe_builder.reset();
+    }
+}
+
+/// Creates a ChunkCollectorOutputFormat for use with a function pointer (see registerDataFrameOutputFormat()).
+/// A fresh PandasDataFrameBuilder is built from `header` and published as the global builder;
+/// the returned format feeds that same builder.
+std::shared_ptr<IOutputFormat> createDataFrameOutputFormat(SharedHeader header)
+{
+    auto dataframe_builder = std::make_shared<PandasDataFrameBuilder>(*header);
+    setGlobalDataFrameBuilder(dataframe_builder);
+
+    /// The global shared_ptr owns the builder; the format only borrows it by reference.
+    return std::make_shared<ChunkCollectorOutputFormat>(header, *dataframe_builder);
+}
+
+/// Registration function to be called during initialization:
+/// hands createDataFrameOutputFormat to ClientBase as the DataFrame format creator.
+void registerDataFrameOutputFormat()
+{
+    ClientBase::setDataFrameFormatCreator(&createDataFrameOutputFormat);
+}
+
+}
diff --git a/programs/local/ChunkCollectorOutputFormat.h b/programs/local/ChunkCollectorOutputFormat.h
@@ -0,0 +1,72 @@
+#pragma once
+
+#include <memory>
+#include <vector>
+#include <Core/NamesAndTypes.h>
+#include <Processors/Formats/IOutputFormat.h>
+#include <Processors/Port.h>
+
+namespace DB
+{
+class NullWriteBuffer;
+}
+
+namespace CHDB
+{
+
+class PandasDataFrameBuilder;
+
+/// OutputFormat that collects all chunks into memory for further processing.
+/// Does not write to WriteBuffer, instead accumulates data for conversion to pandas DataFrame objects.
+class ChunkCollectorOutputFormat : public DB::IOutputFormat
+{
+public:
+    /// `builder` is stored by reference and receives the collected chunks in finalizeImpl(),
+    /// so it must outlive this format.
+    ChunkCollectorOutputFormat(DB::SharedHeader shared_header, PandasDataFrameBuilder & builder);
+
+    String getName() const override { return "ChunkCollectorOutputFormat"; }
+
+    /// Discards the chunks buffered so far; the builder is not touched.
+    void onCancel() noexcept override
+    {
+        chunks.clear();
+    }
+
+protected:
+    /// Buffers a data chunk.
+    void consume(DB::Chunk chunk) override;
+
+    /// Buffers the totals chunk alongside the data chunks.
+    void consumeTotals(DB::Chunk totals) override;
+
+    /// Buffers the extremes chunk alongside the data chunks.
+    void consumeExtremes(DB::Chunk extremes) override;
+
+    /// Hands the buffered chunks to the builder and finalizes it.
+    void finalizeImpl() override;
+
+private:
+    /// Chunks received so far, in arrival order.
+    std::vector<DB::Chunk> chunks;
+
+    /// Non-owning reference to the builder that is filled in finalizeImpl().
+    PandasDataFrameBuilder & dataframe_builder;
+
+    /// Write buffer passed to the DB::IOutputFormat base class, shared by all instances.
+    static DB::NullWriteBuffer out;
+};
+
+/// Registration function to be called during initialization
+void registerDataFrameOutputFormat();
+
+/// Get the global dataframe builder (must only be called while a builder is set)
+PandasDataFrameBuilder & getGlobalDataFrameBuilder();
+
+/// Set the global dataframe builder
+void setGlobalDataFrameBuilder(std::shared_ptr<PandasDataFrameBuilder> builder);
+
+/// Reset the global dataframe builder
+void resetGlobalDataFrameBuilder();
+
+}