Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
39 commits
Select commit Hold shift + click to select a range
5496746
chore: add ChunkCollectorOutputFormat.cpp
wudidapaopao Sep 28, 2025
bd998e2
chore: adjust Python-C interface to support direct DataFrame return
wudidapaopao Sep 29, 2025
4b0276b
chore: update NumpyType.cpp
wudidapaopao Sep 29, 2025
ef1d123
Merge remote-tracking branch 'origin/main' into support_export_dataframe
wudidapaopao Oct 28, 2025
fbaf1c7
chore: add NumpyArray
wudidapaopao Oct 29, 2025
8391e2f
chore: update NumpyArray
wudidapaopao Oct 29, 2025
8af2274
chore: update NumpyArray
wudidapaopao Oct 29, 2025
0255b90
chore: update NumpyArray
wudidapaopao Oct 30, 2025
1b6dade
chore: support timezone
wudidapaopao Oct 31, 2025
2a09093
chore: add more CH types
wudidapaopao Nov 1, 2025
af9cc16
chore: support time and time64 types
wudidapaopao Nov 2, 2025
373bd5e
chore: support more types
wudidapaopao Nov 2, 2025
dd55a08
chore: support nested types
wudidapaopao Nov 3, 2025
afd902a
chore: support converting filed to python object
wudidapaopao Nov 3, 2025
670d87c
chore: support more types
wudidapaopao Nov 4, 2025
b5b4de6
chore: support map type
wudidapaopao Nov 4, 2025
f9f1970
chore: support more nested type
wudidapaopao Nov 4, 2025
dab8450
chore: support more nested type
wudidapaopao Nov 4, 2025
f585a22
Merge remote-tracking branch 'origin/main' into support_export_dataframe
wudidapaopao Nov 4, 2025
9894ace
chore: support object type
wudidapaopao Nov 4, 2025
95ad2d4
fix: fix build issues
wudidapaopao Nov 4, 2025
7ab8fcb
test: update workflow
wudidapaopao Nov 5, 2025
f0406e6
fix: fix test issues
wudidapaopao Nov 5, 2025
c6d370d
test: add more test cases
wudidapaopao Nov 5, 2025
bffe614
test: add more test cases
wudidapaopao Nov 6, 2025
2395a8e
fix: fix enum types
wudidapaopao Nov 6, 2025
d5c2538
test: add uuid types
wudidapaopao Nov 7, 2025
bf03bfa
test: add ip types
wudidapaopao Nov 9, 2025
5837851
fix: fix test cases
wudidapaopao Nov 10, 2025
5bd84e8
test: add more test cases
wudidapaopao Nov 10, 2025
0e18c2d
fix: fix test cases
wudidapaopao Nov 11, 2025
7345d2e
test: add more test cases
wudidapaopao Nov 11, 2025
69cfbcf
test: add more test cases
wudidapaopao Nov 11, 2025
5c3bc24
test: add more test cases
wudidapaopao Nov 11, 2025
c6aa7bb
fix: fix test cases
wudidapaopao Nov 11, 2025
0585c5d
test: update workflow
wudidapaopao Nov 11, 2025
7332601
test: add more test cases
wudidapaopao Nov 11, 2025
2287862
test: update workflow
wudidapaopao Nov 12, 2025
012622f
Merge branch 'main' into support_export_dataframe
wudidapaopao Nov 13, 2025
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/build_linux_arm64_wheels-gh.yml
Original file line number Diff line number Diff line change
Expand Up @@ -124,7 +124,7 @@ jobs:
which clang++-19
clang++-19 --version
sudo apt-get install -y make cmake ccache ninja-build yasm gawk wget
# Install WebAssembly linker (wasm-ld)
# Install WebAssembly linker (wasm-ld)
sudo apt-get install -y lld-19
# Create symlink for wasm-ld
if ! command -v wasm-ld &> /dev/null; then
Expand Down
1 change: 0 additions & 1 deletion .github/workflows/build_macos_x86_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -133,7 +133,6 @@ jobs:
- uses: actions/checkout@v3
with:
fetch-depth: 0
token: ${{ secrets.GH_TOKEN }}
- name: Update submodules
run: |
git submodule update --init --recursive --jobs 4
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/build_musllinux_arm64_wheels.yml
Original file line number Diff line number Diff line change
Expand Up @@ -143,7 +143,7 @@ jobs:
# Update version for release (if triggered by tag)
if [ "${GITHUB_REF#refs/tags/v}" != "$GITHUB_REF" ]; then
pyenv shell 3.9

# Install bump-my-version
python -m pip install bump-my-version
TAG_NAME=${GITHUB_REF#refs/tags/v}
Expand Down
35 changes: 6 additions & 29 deletions chdb/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,9 +38,8 @@ class ChdbError(Exception):
"""


_arrow_format = set({"dataframe", "arrowtable"})
_arrow_format = set({"arrowtable"})
_process_result_format_funs = {
"dataframe": lambda x: to_df(x),
"arrowtable": lambda x: to_arrowTable(x),
}

Expand Down Expand Up @@ -108,33 +107,6 @@ def to_arrowTable(res):
return pa.RecordBatchFileReader(memview.view()).read_all()


# return pandas dataframe
def to_df(r):
"""Convert query result to pandas DataFrame.

Converts a chDB query result to a pandas DataFrame by first converting to
PyArrow Table and then to pandas using multi-threading for better performance.

Args:
r: chDB query result object containing binary Arrow data

Returns:
pd.DataFrame: pandas DataFrame containing the query results

Raises:
ImportError: If pyarrow or pandas are not installed

Example:
>>> result = chdb.query("SELECT 1 as id, 'hello' as msg", "Arrow")
>>> df = chdb.to_df(result)
>>> print(df)
id msg
0 1 hello
"""
t = to_arrowTable(r)
return t.to_pandas(use_threads=True)


# global connection lock, for multi-threading use of legacy chdb.query()
g_conn_lock = threading.Lock()

Expand Down Expand Up @@ -222,6 +194,11 @@ def query(sql, output_format="CSV", path="", udf_path=""):
with g_conn_lock:
conn = _chdb.connect(conn_str)
res = conn.query(sql, output_format)

if lower_output_format == "dataframe":
conn.close()
return res

if res.has_error():
conn.close()
raise ChdbError(res.error_message())
Expand Down
44 changes: 1 addition & 43 deletions chdb/state/sqlitelike.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,8 @@
raise ImportError("Failed to import pyarrow") from None


_arrow_format = set({"dataframe", "arrowtable"})
_arrow_format = set({"arrowtable"})
_process_result_format_funs = {
"dataframe": lambda x: to_df(x),
"arrowtable": lambda x: to_arrowTable(x),
}

Expand Down Expand Up @@ -67,47 +66,6 @@ def to_arrowTable(res):
return pa.RecordBatchFileReader(memview.view()).read_all()


# return pandas dataframe
def to_df(r):
"""Convert query result to Pandas DataFrame.

This function converts chdb query results to a Pandas DataFrame format
by first converting to PyArrow Table and then to DataFrame. This provides
convenient data analysis capabilities with Pandas API.

Args:
r: Query result object from chdb containing Arrow format data

Returns:
pandas.DataFrame: DataFrame containing the query results with
appropriate column names and data types

Raises:
ImportError: If pyarrow or pandas packages are not installed

.. note::
This function uses multi-threading for the Arrow to Pandas conversion
to improve performance on large datasets.

.. seealso::
:func:`to_arrowTable` - For PyArrow Table format conversion

Examples:
>>> import chdb
>>> result = chdb.query("SELECT 1 as num, 'hello' as text", "Arrow")
>>> df = to_df(result)
>>> print(df)
num text
0 1 hello
>>> print(df.dtypes)
num int64
text object
dtype: object
"""
t = to_arrowTable(r)
return t.to_pandas(use_threads=True)


class StreamingResult:
def __init__(self, c_result, conn, result_func, supports_record_batch):
self._result = c_result
Expand Down
6 changes: 6 additions & 0 deletions programs/local/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,19 @@ endif()
if (USE_PYTHON)
set (CHDB_SOURCES
chdb.cpp
ChunkCollectorOutputFormat.cpp
FieldToPython.cpp
FormatHelper.cpp
ListScan.cpp
LocalChdb.cpp
LocalServer.cpp
NumpyArray.cpp
NumpyNestedTypes.cpp
NumpyType.cpp
ObjectToPython.cpp
PandasAnalyzer.cpp
PandasDataFrame.cpp
PandasDataFrameBuilder.cpp
PandasScan.cpp
PyArrowStreamFactory.cpp
PyArrowTable.cpp
Expand Down
91 changes: 91 additions & 0 deletions programs/local/ChunkCollectorOutputFormat.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#include "ChunkCollectorOutputFormat.h"
#include "PandasDataFrameBuilder.h"

#include <IO/NullWriteBuffer.h>
#include <Processors/Port.h>
#include <Client/ClientBase.h>
#include <base/defines.h>

using namespace DB;

namespace CHDB
{

NullWriteBuffer ChunkCollectorOutputFormat::out;

ChunkCollectorOutputFormat::ChunkCollectorOutputFormat(
SharedHeader shared_header,
PandasDataFrameBuilder & builder)
: IOutputFormat(shared_header, out)
, dataframe_builder(builder)
{}

void ChunkCollectorOutputFormat::consume(Chunk chunk)
{
chunks.emplace_back(std::move(chunk));
}

void ChunkCollectorOutputFormat::consumeTotals(Chunk totals)
{
chunks.emplace_back(std::move(totals));
}

void ChunkCollectorOutputFormat::consumeExtremes(Chunk extremes)
{
chunks.emplace_back(std::move(extremes));
}

void ChunkCollectorOutputFormat::finalizeImpl()
{
// Add all collected chunks to the builder
for (const auto & chunk : chunks)
{
dataframe_builder.addChunk(chunk);
}

// Finalize the DataFrame generation
dataframe_builder.finalize();

chunks.clear();
}

/// Global dataframe builder
static std::shared_ptr<PandasDataFrameBuilder> g_dataframe_builder = nullptr;

PandasDataFrameBuilder & getGlobalDataFrameBuilder()
{
return *g_dataframe_builder;
}

void setGlobalDataFrameBuilder(std::shared_ptr<PandasDataFrameBuilder> builder)
{
g_dataframe_builder = builder;
}

void resetGlobalDataFrameBuilder()
{
if (g_dataframe_builder)
{
py::gil_scoped_acquire acquire;
g_dataframe_builder.reset();
}
}

/// create ChunkCollectorOutputFormat for use with function pointer
std::shared_ptr<IOutputFormat> createDataFrameOutputFormat(SharedHeader header)
{
/// Create a PandasDataFrameBuilder and set it globally
auto dataframe_builder = std::make_shared<PandasDataFrameBuilder>(*header);
setGlobalDataFrameBuilder(dataframe_builder);

/// Create and return the format with the builder
return std::make_shared<ChunkCollectorOutputFormat>(header, getGlobalDataFrameBuilder());
}

/// Registration function to be called during initialization
void registerDataFrameOutputFormat()
{
ClientBase::setDataFrameFormatCreator(&createDataFrameOutputFormat);
}

}
61 changes: 61 additions & 0 deletions programs/local/ChunkCollectorOutputFormat.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
#pragma once

#include <vector>
#include <Core/NamesAndTypes.h>
#include <Processors/Formats/IOutputFormat.h>
#include <Processors/Port.h>

namespace DB
{
class NullWriteBuffer;
}

namespace CHDB
{

class PandasDataFrameBuilder;

/// OutputFormat that collects all chunks into memory for further processing
/// Does not write to WriteBuffer, instead accumulates data for conversion to pandas DataFrame objects
class ChunkCollectorOutputFormat : public DB::IOutputFormat
{
public:
ChunkCollectorOutputFormat(DB::SharedHeader shared_header, PandasDataFrameBuilder & builder);

String getName() const override { return "ChunkCollectorOutputFormat"; }

void onCancel() noexcept override
{
chunks.clear();
}

protected:
void consume(DB::Chunk chunk) override;

void consumeTotals(DB::Chunk totals) override;

void consumeExtremes(DB::Chunk extremes) override;

void finalizeImpl() override;

private:
std::vector<DB::Chunk> chunks;

PandasDataFrameBuilder & dataframe_builder;

static DB::NullWriteBuffer out;
};

/// Registration function to be called during initialization
void registerDataFrameOutputFormat();

/// Get the global dataframe builder
PandasDataFrameBuilder & getGlobalDataFrameBuilder();

/// Set the global dataframe builder
void setGlobalDataFrameBuilder(std::shared_ptr<PandasDataFrameBuilder> builder);

/// Reset the global dataframe builder
void resetGlobalDataFrameBuilder();

}
Loading
Loading