diff --git a/.github/workflows/build_linux_arm64_wheels-gh.yml b/.github/workflows/build_linux_arm64_wheels-gh.yml
index 3bfdf772c6d..303fcd371c0 100644
--- a/.github/workflows/build_linux_arm64_wheels-gh.yml
+++ b/.github/workflows/build_linux_arm64_wheels-gh.yml
@@ -124,7 +124,7 @@ jobs:
           which clang++-19
           clang++-19 --version
           sudo apt-get install -y make cmake ccache ninja-build yasm gawk wget
-          # Install WebAssembly linker (wasm-ld)
+          # Install WebAssembly linker (wasm-ld)
           sudo apt-get install -y lld-19
           # Create symlink for wasm-ld
           if ! command -v wasm-ld &> /dev/null; then
diff --git a/.github/workflows/build_macos_x86_wheels.yml b/.github/workflows/build_macos_x86_wheels.yml
index 5f9cf520d80..f58e2230c86 100644
--- a/.github/workflows/build_macos_x86_wheels.yml
+++ b/.github/workflows/build_macos_x86_wheels.yml
@@ -133,7 +133,6 @@ jobs:
       - uses: actions/checkout@v3
         with:
           fetch-depth: 0
-          token: ${{ secrets.GH_TOKEN }}
      - name: Update submodules
        run: |
          git submodule update --init --recursive --jobs 4
diff --git a/.github/workflows/build_musllinux_arm64_wheels.yml b/.github/workflows/build_musllinux_arm64_wheels.yml
index b76f3ab6f02..4eb06543427 100644
--- a/.github/workflows/build_musllinux_arm64_wheels.yml
+++ b/.github/workflows/build_musllinux_arm64_wheels.yml
@@ -143,7 +143,7 @@ jobs:
           # Update version for release (if triggered by tag)
           if [ "${GITHUB_REF#refs/tags/v}" != "$GITHUB_REF" ]; then
             pyenv shell 3.9
-
+
             # Install bump-my-version
             python -m pip install bump-my-version
             TAG_NAME=${GITHUB_REF#refs/tags/v}
diff --git a/chdb/__init__.py b/chdb/__init__.py
index 0094323643e..6d4d516a3e7 100644
--- a/chdb/__init__.py
+++ b/chdb/__init__.py
@@ -38,9 +38,8 @@ class ChdbError(Exception):
     """
 
-_arrow_format = set({"dataframe", "arrowtable"})
+_arrow_format = set({"arrowtable"})
 _process_result_format_funs = {
-    "dataframe": lambda x: to_df(x),
     "arrowtable": lambda x: to_arrowTable(x),
 }
 
@@ -108,33 +107,6 @@ def to_arrowTable(res):
     return pa.RecordBatchFileReader(memview.view()).read_all()
 
 
-# return pandas dataframe
-def to_df(r):
-    """Convert query result to pandas DataFrame.
-
-    Converts a chDB query result to a pandas DataFrame by first converting to
-    PyArrow Table and then to pandas using multi-threading for better performance.
-
-    Args:
-        r: chDB query result object containing binary Arrow data
-
-    Returns:
-        pd.DataFrame: pandas DataFrame containing the query results
-
-    Raises:
-        ImportError: If pyarrow or pandas are not installed
-
-    Example:
-        >>> result = chdb.query("SELECT 1 as id, 'hello' as msg", "Arrow")
-        >>> df = chdb.to_df(result)
-        >>> print(df)
-           id    msg
-        0   1  hello
-    """
-    t = to_arrowTable(r)
-    return t.to_pandas(use_threads=True)
-
-
 # global connection lock, for multi-threading use of legacy chdb.query()
 g_conn_lock = threading.Lock()
@@ -222,6 +194,11 @@ def query(sql, output_format="CSV", path="", udf_path=""):
     with g_conn_lock:
         conn = _chdb.connect(conn_str)
         res = conn.query(sql, output_format)
+
+        if lower_output_format == "dataframe":
+            conn.close()
+            return res
+
         if res.has_error():
             conn.close()
             raise ChdbError(res.error_message())
diff --git a/chdb/state/sqlitelike.py b/chdb/state/sqlitelike.py
index 3b138e0adbb..afaf2c5cbdd 100644
--- a/chdb/state/sqlitelike.py
+++ b/chdb/state/sqlitelike.py
@@ -10,9 +10,8 @@
     raise ImportError("Failed to import pyarrow") from None
 
-_arrow_format = set({"dataframe", "arrowtable"})
+_arrow_format = set({"arrowtable"})
 _process_result_format_funs = {
-    "dataframe": lambda x: to_df(x),
     "arrowtable": lambda x: to_arrowTable(x),
 }
@@ -67,47 +66,6 @@ def to_arrowTable(res):
     return pa.RecordBatchFileReader(memview.view()).read_all()
 
 
-# return pandas dataframe
-def to_df(r):
-    """Convert query result to Pandas DataFrame.
-
-    This function converts chdb query results to a Pandas DataFrame format
-    by first converting to PyArrow Table and then to DataFrame. This provides
-    convenient data analysis capabilities with Pandas API.
-
-    Args:
-        r: Query result object from chdb containing Arrow format data
-
-    Returns:
-        pandas.DataFrame: DataFrame containing the query results with
-            appropriate column names and data types
-
-    Raises:
-        ImportError: If pyarrow or pandas packages are not installed
-
-    .. note::
-        This function uses multi-threading for the Arrow to Pandas conversion
-        to improve performance on large datasets.
-
-    .. seealso::
-        :func:`to_arrowTable` - For PyArrow Table format conversion
-
-    Examples:
-        >>> import chdb
-        >>> result = chdb.query("SELECT 1 as num, 'hello' as text", "Arrow")
-        >>> df = to_df(result)
-        >>> print(df)
-           num   text
-        0    1  hello
-        >>> print(df.dtypes)
-        num      int64
-        text    object
-        dtype: object
-    """
-    t = to_arrowTable(r)
-    return t.to_pandas(use_threads=True)
-
-
 class StreamingResult:
     def __init__(self, c_result, conn, result_func, supports_record_batch):
         self._result = c_result
diff --git a/programs/local/CMakeLists.txt b/programs/local/CMakeLists.txt
index f84770e6392..2cade0b59be 100644
--- a/programs/local/CMakeLists.txt
+++ b/programs/local/CMakeLists.txt
@@ -25,13 +25,19 @@ endif()
 if (USE_PYTHON)
     set (CHDB_SOURCES
         chdb.cpp
+        ChunkCollectorOutputFormat.cpp
+        FieldToPython.cpp
         FormatHelper.cpp
         ListScan.cpp
         LocalChdb.cpp
         LocalServer.cpp
+        NumpyArray.cpp
+        NumpyNestedTypes.cpp
         NumpyType.cpp
+        ObjectToPython.cpp
         PandasAnalyzer.cpp
         PandasDataFrame.cpp
+        PandasDataFrameBuilder.cpp
         PandasScan.cpp
         PyArrowStreamFactory.cpp
         PyArrowTable.cpp
diff --git a/programs/local/ChunkCollectorOutputFormat.cpp b/programs/local/ChunkCollectorOutputFormat.cpp
new file mode 100644
index 00000000000..8faa54a7ef4
--- /dev/null
+++ b/programs/local/ChunkCollectorOutputFormat.cpp
@@ -0,0 +1,91 @@
+#include "ChunkCollectorOutputFormat.h"
+#include "PandasDataFrameBuilder.h"
+
+#include
+#include
+#include
+#include
+
+using namespace DB;
+
+namespace CHDB
+{
+
+NullWriteBuffer ChunkCollectorOutputFormat::out;
+
+ChunkCollectorOutputFormat::ChunkCollectorOutputFormat(
+    SharedHeader shared_header,
+    PandasDataFrameBuilder & builder)
+    : IOutputFormat(shared_header, out)
+    , dataframe_builder(builder)
+{}
+
+void ChunkCollectorOutputFormat::consume(Chunk chunk)
+{
+    chunks.emplace_back(std::move(chunk));
+}
+
+void ChunkCollectorOutputFormat::consumeTotals(Chunk totals)
+{
+    chunks.emplace_back(std::move(totals));
+}
+
+void ChunkCollectorOutputFormat::consumeExtremes(Chunk extremes)
+{
+    chunks.emplace_back(std::move(extremes));
+}
+
+void ChunkCollectorOutputFormat::finalizeImpl()
+{
+    // Add all collected chunks to the builder
+    for (const auto & chunk : chunks)
+    {
+        dataframe_builder.addChunk(chunk);
+    }
+
+    // Finalize the DataFrame generation
+    dataframe_builder.finalize();
+
+    chunks.clear();
+}
+
+/// Global dataframe builder
+static std::shared_ptr<PandasDataFrameBuilder> g_dataframe_builder = nullptr;
+
+PandasDataFrameBuilder & getGlobalDataFrameBuilder()
+{
+    return *g_dataframe_builder;
+}
+
+void setGlobalDataFrameBuilder(std::shared_ptr<PandasDataFrameBuilder> builder)
+{
+    g_dataframe_builder = builder;
+}
+
+void resetGlobalDataFrameBuilder()
+{
+    if (g_dataframe_builder)
+    {
+        py::gil_scoped_acquire acquire;
+        g_dataframe_builder.reset();
+    }
+}
+
+/// create ChunkCollectorOutputFormat for use with function pointer
+std::shared_ptr<IOutputFormat> createDataFrameOutputFormat(SharedHeader header)
+{
+    /// Create a PandasDataFrameBuilder and set it globally
+    auto dataframe_builder = std::make_shared<PandasDataFrameBuilder>(*header);
+    setGlobalDataFrameBuilder(dataframe_builder);
+
+    /// Create and return the format with the builder
+    return std::make_shared<ChunkCollectorOutputFormat>(header, getGlobalDataFrameBuilder());
+}
+
+/// Registration function to be called during initialization
+void registerDataFrameOutputFormat()
+{
+    ClientBase::setDataFrameFormatCreator(&createDataFrameOutputFormat);
+}
+
+}
diff --git a/programs/local/ChunkCollectorOutputFormat.h b/programs/local/ChunkCollectorOutputFormat.h
new file mode 100644
index 00000000000..7dc2fe26127
--- /dev/null
+++
b/programs/local/ChunkCollectorOutputFormat.h @@ -0,0 +1,61 @@ +#pragma once + +#include +#include +#include +#include + +namespace DB +{ +class NullWriteBuffer; +} + +namespace CHDB +{ + +class PandasDataFrameBuilder; + +/// OutputFormat that collects all chunks into memory for further processing +/// Does not write to WriteBuffer, instead accumulates data for conversion to pandas DataFrame objects +class ChunkCollectorOutputFormat : public DB::IOutputFormat +{ +public: + ChunkCollectorOutputFormat(DB::SharedHeader shared_header, PandasDataFrameBuilder & builder); + + String getName() const override { return "ChunkCollectorOutputFormat"; } + + void onCancel() noexcept override + { + chunks.clear(); + } + +protected: + void consume(DB::Chunk chunk) override; + + void consumeTotals(DB::Chunk totals) override; + + void consumeExtremes(DB::Chunk extremes) override; + + void finalizeImpl() override; + +private: + std::vector chunks; + + PandasDataFrameBuilder & dataframe_builder; + + static DB::NullWriteBuffer out; +}; + +/// Registration function to be called during initialization +void registerDataFrameOutputFormat(); + +/// Get the global dataframe builder +PandasDataFrameBuilder & getGlobalDataFrameBuilder(); + +/// Set the global dataframe builder +void setGlobalDataFrameBuilder(std::shared_ptr builder); + +/// Reset the global dataframe builder +void resetGlobalDataFrameBuilder(); + +} diff --git a/programs/local/FieldToPython.cpp b/programs/local/FieldToPython.cpp new file mode 100644 index 00000000000..ce859754a71 --- /dev/null +++ b/programs/local/FieldToPython.cpp @@ -0,0 +1,767 @@ +#include "FieldToPython.h" +#include "PythonImporter.h" +#include "ObjectToPython.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int NOT_IMPLEMENTED; +extern const int LOGICAL_ERROR; +} + +} + +namespace CHDB +{ + +using namespace DB; + +py::object convertTimeFieldToPython(const Field & field) +{ + auto & import_cache = PythonImporter::ImportCache(); + auto time_seconds = field.safeGet(); + + if (time_seconds < 0) + { + return py::str(toString(field)); + } + + /// Handle time overflow (should be within 24 hours) + /// ClickHouse Time range is [-999:59:59, 999:59:59] + time_seconds = time_seconds % 86400; + + int hour = static_cast(time_seconds / 3600); + int minute = static_cast((time_seconds % 3600) / 60); + int second = static_cast(time_seconds % 60); + int microsecond = 0; + + try + { + return import_cache.datetime.time()(hour, minute, second, microsecond); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } +} + +py::object convertTime64FieldToPython(const Field & field) +{ + auto & import_cache = PythonImporter::ImportCache(); + auto time64_field = field.safeGet>(); + auto time64_value = time64_field.getValue(); + Int64 time64_ticks = time64_value.value; + + if (time64_ticks < 0) + { + return py::str(toString(field)); + } + + UInt32 scale = time64_field.getScale(); + Int64 scale_multiplier = DecimalUtils::scaleMultiplier(scale); + + /// Convert to seconds and fractional part within a day + Int64 total_seconds = time64_ticks / scale_multiplier; + Int64 fractional = time64_ticks % scale_multiplier; + + /// Handle 
time overflow (should be within 24 hours) + /// ClickHouse Time range is [-999:59:59, 999:59:59] + total_seconds = total_seconds % 86400; + + int hour = static_cast(total_seconds / 3600); + int minute = static_cast((total_seconds % 3600) / 60); + int second = static_cast(total_seconds % 60); + int microsecond = static_cast((fractional * 1000000) / scale_multiplier); + + try + { + return import_cache.datetime.time()(hour, minute, second, microsecond); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } +} + +static bool canTypeBeUsedAsDictKey(const DataTypePtr & type) +{ + DataTypePtr actual_type = removeLowCardinalityAndNullable(type); + + switch (actual_type->getTypeId()) + { + case TypeIndex::Nothing: + case TypeIndex::Int8: + case TypeIndex::UInt8: + case TypeIndex::Int16: + case TypeIndex::UInt16: + case TypeIndex::Int32: + case TypeIndex::UInt32: + case TypeIndex::Int64: + case TypeIndex::UInt64: + case TypeIndex::Float32: + case TypeIndex::Float64: + case TypeIndex::Int128: + case TypeIndex::Int256: + case TypeIndex::UInt128: + case TypeIndex::UInt256: + case TypeIndex::BFloat16: + case TypeIndex::Date: + case TypeIndex::Date32: + case TypeIndex::DateTime: + case TypeIndex::DateTime64: + case TypeIndex::Time: + case TypeIndex::Time64: + case TypeIndex::String: + case TypeIndex::FixedString: + case TypeIndex::Enum8: + case TypeIndex::Enum16: + case TypeIndex::Decimal32: + case TypeIndex::Decimal64: + case TypeIndex::Decimal128: + case TypeIndex::Decimal256: + case TypeIndex::UUID: + case TypeIndex::Interval: + case TypeIndex::IPv4: + case TypeIndex::IPv6: + return true; + + case TypeIndex::Array: + case TypeIndex::Tuple: + case TypeIndex::Map: + case TypeIndex::Object: + case TypeIndex::Dynamic: + return false; + + case TypeIndex::Variant: + { + const auto * variant_type = typeid_cast(type.get()); + chassert(variant_type); + + const auto & variants = variant_type->getVariants(); + for (const auto & variant : variants) + { + if (!canTypeBeUsedAsDictKey(variant)) + return false; + } + return true; + } + + case TypeIndex::Set: + case TypeIndex::JSONPaths: + case TypeIndex::ObjectDeprecated: + case TypeIndex::Function: + case TypeIndex::AggregateFunction: + case TypeIndex::LowCardinality: + case TypeIndex::Nullable: + default: + return false; + } +} + +static py::object convertLocalDateToPython(const LocalDate & local_date, auto & import_cache, const Field & field) +{ + auto year = local_date.year(); + auto month = local_date.month(); + auto day = local_date.day(); + + try + { + return import_cache.datetime.date()(year, month, day); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } +} + +py::object convertFieldToPython( + const IColumn & column, + const DataTypePtr & type, + size_t index) +{ + if (column.isNullAt(index)) + { + return py::none(); + } + + DataTypePtr actual_type = removeLowCardinalityAndNullable(type); + + auto & import_cache = PythonImporter::ImportCache(); + + switch (actual_type->getTypeId()) + { + case TypeIndex::Nothing: + return py::none(); + + case TypeIndex::Int8: + { + auto field = column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::UInt8: + { + auto field = column[index]; + auto is_bool = isBool(actual_type); + if (is_bool) + { + bool val = field.safeGet(); + return py::cast(val); + } + + return py::cast(field.safeGet()); + } + + case TypeIndex::Int16: + { + auto field = column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::UInt16: + { + auto field = 
column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::Int32: + { + auto field = column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::UInt32: + { + auto field = column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::Int64: + { + auto field = column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::UInt64: + { + auto field = column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::Float32: + { + auto field = column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::Float64: + { + auto field = column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::Int128: + { + auto field = column[index]; + return py::cast((double)field.safeGet()); + } + + case TypeIndex::Int256: + { + auto field = column[index]; + return py::cast((double)field.safeGet()); + } + + case TypeIndex::UInt128: + { + auto field = column[index]; + return py::cast((double)field.safeGet()); + } + + case TypeIndex::UInt256: + { + auto field = column[index]; + return py::cast((double)field.safeGet()); + } + + case TypeIndex::BFloat16: + { + auto field = column[index]; + return py::cast((double)field.safeGet()); + } + + case TypeIndex::Date: + { + auto field = column[index]; + auto days = field.safeGet(); + LocalDate local_date(DayNum(static_cast(days))); + return convertLocalDateToPython(local_date, import_cache, field); + } + + case TypeIndex::Date32: + { + auto field = column[index]; + auto days = field.safeGet(); + LocalDate local_date(ExtendedDayNum(static_cast(days))); + return convertLocalDateToPython(local_date, import_cache, field); + } + + case TypeIndex::DateTime: + { + auto field = column[index]; + auto seconds = field.safeGet(); + + const auto * datetime_type = typeid_cast(actual_type.get()); + const auto & utc_time_zone = DateLUT::instance("UTC"); + const auto & time_zone = datetime_type ? datetime_type->getTimeZone() : utc_time_zone; + + time_t timestamp = static_cast(seconds); + LocalDateTime local_dt(timestamp, time_zone); + + int year = local_dt.year(); + int month = local_dt.month(); + int day = local_dt.day(); + int hour = local_dt.hour(); + int minute = local_dt.minute(); + int second = local_dt.second(); + int microsecond = 0; + + try + { + py::object timestamp_object = import_cache.datetime.datetime()( + year, month, day, hour, minute, second, microsecond + ); + + const String & tz_name = time_zone.getTimeZone(); + auto tz_obj = import_cache.pytz.timezone()(tz_name); + return tz_obj.attr("localize")(timestamp_object); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } + } + + case TypeIndex::DateTime64: + { + auto field = column[index]; + auto datetime64_field = field.safeGet>(); + auto datetime64_value = datetime64_field.getValue(); + Int64 datetime64_ticks = datetime64_value.value; + + const auto * datetime64_type = typeid_cast(actual_type.get()); + const auto & utc_time_zone = DateLUT::instance("UTC"); + const auto & time_zone = datetime64_type ? 
datetime64_type->getTimeZone() : utc_time_zone; + + UInt32 scale = datetime64_field.getScale(); + Int64 scale_multiplier = DecimalUtils::scaleMultiplier(scale); + + auto seconds = static_cast(datetime64_ticks / scale_multiplier); + auto fractional = datetime64_ticks % scale_multiplier; + + LocalDateTime local_dt(seconds, time_zone); + + int year = local_dt.year(); + int month = local_dt.month(); + int day = local_dt.day(); + int hour = local_dt.hour(); + int minute = local_dt.minute(); + int second = local_dt.second(); + int microsecond = static_cast((fractional * 1000000) / scale_multiplier); + + try + { + py::object timestamp_object = import_cache.datetime.datetime()( + year, month, day, hour, minute, second, microsecond + ); + + const String & tz_name = time_zone.getTimeZone(); + auto tz_obj = import_cache.pytz.timezone()(tz_name); + return tz_obj.attr("localize")(timestamp_object); + } + catch (py::error_already_set &) + { + return py::str(toString(field)); + } + } + + case TypeIndex::Time: + { + auto field = column[index]; + return convertTimeFieldToPython(field); + } + + case TypeIndex::Time64: + { + auto field = column[index]; + return convertTime64FieldToPython(field); + } + + case TypeIndex::String: + case TypeIndex::FixedString: + { + auto field = column[index]; + return py::cast(field.safeGet()); + } + + case TypeIndex::Enum8: + { + auto field = column[index]; + try + { + const auto & enum_type = typeid_cast(*type); + auto it = enum_type.findByValue(static_cast(field.safeGet())); + String enum_name(it->second.data, it->second.size); + return py::cast(enum_name); + } + catch (...) + { + return py::cast(toString(field)); + } + } + + case TypeIndex::Enum16: + { + auto field = column[index]; + try + { + const auto & enum_type = typeid_cast(*type); + auto it = enum_type.findByValue(static_cast(field.safeGet())); + String enum_name(it->second.data, it->second.size); + return py::cast(enum_name); + } + catch (...) 
+ { + return py::cast(toString(field)); + } + } + + case TypeIndex::Decimal32: + { + auto field = column[index]; + auto decimal_field = field.safeGet>(); + auto decimal_value = decimal_field.getValue(); + UInt32 scale = decimal_field.getScale(); + double result = DecimalUtils::convertTo(decimal_value, scale); + return py::cast(result); + } + + case TypeIndex::Decimal64: + { + auto field = column[index]; + auto decimal_field = field.safeGet>(); + auto decimal_value = decimal_field.getValue(); + UInt32 scale = decimal_field.getScale(); + double result = DecimalUtils::convertTo(decimal_value, scale); + return py::cast(result); + } + + case TypeIndex::Decimal128: + { + auto field = column[index]; + auto decimal_field = field.safeGet>(); + auto decimal_value = decimal_field.getValue(); + UInt32 scale = decimal_field.getScale(); + double result = DecimalUtils::convertTo(decimal_value, scale); + return py::cast(result); + } + + case TypeIndex::Decimal256: + { + auto field = column[index]; + auto decimal_field = field.safeGet>(); + auto decimal_value = decimal_field.getValue(); + UInt32 scale = decimal_field.getScale(); + double result = DecimalUtils::convertTo(decimal_value, scale); + return py::cast(result); + } + + case TypeIndex::UUID: + { + auto field = column[index]; + auto uuid_value = field.safeGet(); + const auto formatted_uuid = formatUUID(uuid_value); + return import_cache.uuid.UUID()(String(formatted_uuid.data(), formatted_uuid.size())); + } + + case TypeIndex::Array: + { + const auto & array_column = typeid_cast(column); + + const auto * array_type = typeid_cast(actual_type.get()); + chassert(array_type); + + const auto & element_type = array_type->getNestedType(); + const auto & offsets = array_column.getOffsets(); + const auto & nested_column = array_column.getDataPtr(); + + size_t start_offset = (index == 0) ? 
0 : offsets[index - 1]; + size_t end_offset = offsets[index]; + + py::list python_list; + for (size_t i = start_offset; i < end_offset; ++i) + { + auto python_element = convertFieldToPython(*nested_column, element_type, i); + python_list.append(python_element); + } + + return python_list; + } + + case TypeIndex::Tuple: + { + const auto & tuple_column = typeid_cast(column); + + const auto * tuple_type = typeid_cast(actual_type.get()); + chassert(tuple_type); + + const auto & element_types = tuple_type->getElements(); + const auto & tuple_columns = tuple_column.getColumns(); + + py::tuple python_tuple(tuple_columns.size()); + for (size_t i = 0; i < tuple_columns.size(); ++i) + { + auto python_element = convertFieldToPython(*(tuple_columns[i]), element_types[i], index); + python_tuple[i] = python_element; + } + + return python_tuple; + } + + case TypeIndex::Interval: + { + auto field = column[index]; + auto interval_value = field.safeGet(); + const auto * interval_type = typeid_cast(actual_type.get()); + chassert(interval_type); + IntervalKind::Kind interval_kind = interval_type->getKind(); + + switch (interval_kind) + { + case IntervalKind::Kind::Nanosecond: + return import_cache.datetime.timedelta()(py::arg("microseconds") = interval_value / 1000); + case IntervalKind::Kind::Microsecond: + return import_cache.datetime.timedelta()(py::arg("microseconds") = interval_value); + case IntervalKind::Kind::Millisecond: + return import_cache.datetime.timedelta()(py::arg("milliseconds") = interval_value); + case IntervalKind::Kind::Second: + return import_cache.datetime.timedelta()(py::arg("seconds") = interval_value); + case IntervalKind::Kind::Minute: + return import_cache.datetime.timedelta()(py::arg("minutes") = interval_value); + case IntervalKind::Kind::Hour: + return import_cache.datetime.timedelta()(py::arg("hours") = interval_value); + case IntervalKind::Kind::Day: + return import_cache.datetime.timedelta()(py::arg("days") = interval_value); + case IntervalKind::Kind::Week: + return import_cache.datetime.timedelta()(py::arg("weeks") = interval_value); + case IntervalKind::Kind::Month: + /// Approximate: 1 month = 30 days + return import_cache.datetime.timedelta()(py::arg("days") = interval_value * 30); + case IntervalKind::Kind::Quarter: + /// 1 quarter = 3 months = 90 days + return import_cache.datetime.timedelta()(py::arg("days") = interval_value * 90); + case IntervalKind::Kind::Year: + /// 1 year = 365 days + return import_cache.datetime.timedelta()(py::arg("days") = interval_value * 365); + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unsupported interval kind"); + } + } + + case TypeIndex::Map: + { + const auto & map_column = typeid_cast(column); + + const auto * map_type = typeid_cast(actual_type.get()); + chassert(map_type); + + const auto & key_type = map_type->getKeyType(); + const auto & value_type = map_type->getValueType(); + + /// Get the nested array column containing tuples + const auto & nested_array = map_column.getNestedColumn(); + const auto & array_column = typeid_cast(nested_array); + + const auto & offsets = array_column.getOffsets(); + const auto & tuple_column_ptr = array_column.getDataPtr(); + const auto & tuple_column = typeid_cast(*tuple_column_ptr); + + size_t start_offset = (index == 0) ? 
0 : offsets[index - 1]; + size_t end_offset = offsets[index]; + + const auto & key_column = tuple_column.getColumn(0); + const auto & value_column = tuple_column.getColumn(1); + + bool use_dict = canTypeBeUsedAsDictKey(key_type); + + if (use_dict) + { + py::dict python_dict; + for (size_t i = start_offset; i < end_offset; ++i) + { + auto python_key = convertFieldToPython(key_column, key_type, i); + auto python_value = convertFieldToPython(value_column, value_type, i); + + python_dict[std::move(python_key)] = std::move(python_value); + } + + return python_dict; + } + else + { + py::list keys_list; + py::list values_list; + for (size_t i = start_offset; i < end_offset; ++i) + { + auto python_key = convertFieldToPython(key_column, key_type, i); + auto python_value = convertFieldToPython(value_column, value_type, i); + + keys_list.append(std::move(python_key)); + values_list.append(std::move(python_value)); + } + + py::dict python_dict; + python_dict["keys"] = std::move(keys_list); + python_dict["values"] = std::move(values_list); + + return python_dict; + } + } + + case TypeIndex::Variant: + { + const auto & variant_column = typeid_cast(column); + auto discriminator = variant_column.globalDiscriminatorAt(index); + if (discriminator == ColumnVariant::NULL_DISCRIMINATOR) + { + return py::none(); + } + + const auto & variant_type = typeid_cast(*actual_type); + const auto & variants = variant_type.getVariants(); + const auto & variant_data_type = variants[discriminator]; + + auto offset = variant_column.offsetAt(index); + const auto & variant_inner_column = variant_column.getVariantByGlobalDiscriminator(discriminator); + + return convertFieldToPython(variant_inner_column, variant_data_type, offset); + } + + + case TypeIndex::Dynamic: + { + const auto & dynamic_column = typeid_cast(column); + const auto & variant_column = dynamic_column.getVariantColumn(); + + /// Check if this row has value in shared variant + if (variant_column.globalDiscriminatorAt(index) == dynamic_column.getSharedVariantDiscriminator()) + { + /// Get data from shared variant and deserialize it + auto value = dynamic_column.getSharedVariant().getDataAt(variant_column.offsetAt(index)); + ReadBufferFromMemory buf(value.data, value.size); + auto variant_type = decodeDataType(buf); + auto tmp_variant_column = variant_type->createColumn(); + auto variant_serialization = variant_type->getDefaultSerialization(); + variant_serialization->deserializeBinary(*tmp_variant_column, buf, FormatSettings{}); + + /// Convert the deserialized value + return convertFieldToPython(*tmp_variant_column, variant_type, 0); + } + else + { + /// Use variant conversion logic directly + return convertFieldToPython(variant_column, dynamic_column.getVariantInfo().variant_type, index); + } + } + + case TypeIndex::Object: + { + return convertObjectToPython(column, actual_type, index); + } + + case TypeIndex::IPv4: + { + auto field = column[index]; + auto ipv4_value = field.safeGet(); + + char ipv4_str[IPV4_MAX_TEXT_LENGTH]; + char * ptr = ipv4_str; + formatIPv4(reinterpret_cast(&ipv4_value), ptr); + const size_t ipv4_str_len = ptr - ipv4_str; + + return import_cache.ipaddress.ipv4_address()(String(ipv4_str, ipv4_str_len)); + } + + case TypeIndex::IPv6: + { + auto field = column[index]; + auto ipv6_value = field.safeGet(); + + char ipv6_str[IPV6_MAX_TEXT_LENGTH]; + char * ptr = ipv6_str; + formatIPv6(reinterpret_cast(&ipv6_value), ptr); + const size_t ipv6_str_len = ptr - ipv6_str; + + return import_cache.ipaddress.ipv6_address()(String(ipv6_str, 
ipv6_str_len)); + } + + /// Set types are used only in WHERE clauses for IN operations, not in actual data storage + case TypeIndex::Set: + /// JSONPaths is an internal type used only for JSON schema inference, + case TypeIndex::JSONPaths: + /// Deprecated type, should not appear in normal data processing + case TypeIndex::ObjectDeprecated: + /// Function types are not actual data types, should not appear here + case TypeIndex::Function: + /// Aggregate function types are not actual data types, should not appear here + case TypeIndex::AggregateFunction: + /// LowCardinality should be unwrapped before reaching this point + case TypeIndex::LowCardinality: + /// Nullable cannot contain another Nullable type, so this should not appear in nested conversion + case TypeIndex::Nullable: + /// QBit type is supported in newer versions of ClickHouse + /// case TypeIndex::QBit: + default: + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", type->getName()); + } +} + +} // namespace CHDB diff --git a/programs/local/FieldToPython.h b/programs/local/FieldToPython.h new file mode 100644 index 00000000000..f175ceb0866 --- /dev/null +++ b/programs/local/FieldToPython.h @@ -0,0 +1,20 @@ +#pragma once + +#include +#include +#include +#include + +namespace CHDB +{ + +pybind11::object convertTimeFieldToPython(const DB::Field & field); + +pybind11::object convertTime64FieldToPython(const DB::Field & field); + +pybind11::object convertFieldToPython( + const DB::IColumn & column, + const DB::DataTypePtr & type, + size_t index); + +} // namespace CHDB diff --git a/programs/local/IPAddressCacheItem.h b/programs/local/IPAddressCacheItem.h new file mode 100644 index 00000000000..2d51a1a3e43 --- /dev/null +++ b/programs/local/IPAddressCacheItem.h @@ -0,0 +1,25 @@ +#pragma once + +#include "PythonImportCacheItem.h" + +namespace CHDB { + +struct IPAddressCacheItem : public PythonImportCacheItem +{ +public: + static constexpr const char * Name = "ipaddress"; + + IPAddressCacheItem() + : PythonImportCacheItem("ipaddress") + , ipv4_address("IPv4Address", this) + , ipv6_address("IPv6Address", this) + { + } + + ~IPAddressCacheItem() override = default; + + PythonImportCacheItem ipv4_address; + PythonImportCacheItem ipv6_address; +}; + +} // namespace CHDB diff --git a/programs/local/LocalChdb.cpp b/programs/local/LocalChdb.cpp index a35f6669c56..61c931270e3 100644 --- a/programs/local/LocalChdb.cpp +++ b/programs/local/LocalChdb.cpp @@ -1,14 +1,14 @@ #include "LocalChdb.h" -#include +#include "chdb-internal.h" +#include "PandasDataFrameBuilder.h" +#include "ChunkCollectorOutputFormat.h" #include "PythonImporter.h" #include "PythonTableCache.h" #include "StoragePython.h" -#include "chdb-internal.h" -#include "chdb.h" #include #include - +#include #include #if USE_JEMALLOC # include @@ -79,13 +79,26 @@ chdb_result * queryToBuffer( // Pybind11 will take over the ownership of the `query_result` object // using smart ptr will cause early free of the object -query_result * query( +py::object query( const std::string & queryStr, const std::string & output_format = "CSV", const std::string & path = {}, const std::string & udfPath = {}) { - return new query_result(queryToBuffer(queryStr, output_format, path, udfPath)); + auto * result = queryToBuffer(queryStr, output_format, path, udfPath); + + if (Poco::toLower(output_format) == "dataframe") + { + chdb_destroy_query_result(result); + + auto & builder = CHDB::getGlobalDataFrameBuilder(); + auto ret = builder.getDataFrame(); + CHDB::resetGlobalDataFrameBuilder(); + 
return ret; + } + + // Default behavior - return query_result + return py::cast(new query_result(result)); } // The `query_result` and `memoryview_wrapper` will hold `local_result_wrapper` with shared_ptr @@ -263,25 +276,39 @@ void connection_wrapper::commit() // do nothing } -query_result * connection_wrapper::query(const std::string & query_str, const std::string & format) +py::object connection_wrapper::query(const std::string & query_str, const std::string & format) { CHDB::PythonTableCache::findQueryableObjFromQuery(query_str); - py::gil_scoped_release release; - auto * result = chdb_query_n(*conn, query_str.data(), query_str.size(), format.data(), format.size()); - if (chdb_result_length(result)) + chdb_result * result = nullptr; { - LOG_DEBUG(getLogger("CHDB"), "Empty result returned for query: {}", query_str); - } + py::gil_scoped_release release; + result = chdb_query_n(*conn, query_str.data(), query_str.size(), format.data(), format.size()); + auto error_msg = CHDB::chdb_result_error_string(result); + if (!error_msg.empty()) + { + std::string msg_copy(error_msg); + chdb_destroy_query_result(result); + CHDB::resetGlobalDataFrameBuilder(); + throw std::runtime_error(msg_copy); + } - auto error_msg = CHDB::chdb_result_error_string(result); - if (!error_msg.empty()) - { - std::string msg_copy(error_msg); - chdb_destroy_query_result(result); - throw std::runtime_error(msg_copy); + if (Poco::toLower(format) == "dataframe") + { + chdb_destroy_query_result(result); + auto & builder = CHDB::getGlobalDataFrameBuilder(); + auto ret = builder.getDataFrame(); + CHDB::resetGlobalDataFrameBuilder(); + return ret; + } + + if (chdb_result_length(result)) + { + LOG_DEBUG(getLogger("CHDB"), "Empty result returned for query: {}", query_str); + } } - return new query_result(result, false); + + return py::cast(new query_result(result, false)); } streaming_query_result * connection_wrapper::send_query(const std::string & query_str, const std::string & format) @@ -483,7 +510,7 @@ PYBIND11_MODULE(_chdb, m) &connection_wrapper::query, py::arg("query_str"), py::arg("format") = "CSV", - "Execute a query and return a query_result object") + "Execute a query and return a query_result object or DataFrame") .def( "send_query", &connection_wrapper::send_query, @@ -509,7 +536,7 @@ PYBIND11_MODULE(_chdb, m) py::kw_only(), py::arg("path") = "", py::arg("udf_path") = "", - "Query chDB and return a query_result object"); + "Query chDB and return a query_result object or DataFrame"); auto destroy_import_cache = []() { diff --git a/programs/local/LocalChdb.h b/programs/local/LocalChdb.h index 5cf30058135..076103cca19 100644 --- a/programs/local/LocalChdb.h +++ b/programs/local/LocalChdb.h @@ -30,7 +30,7 @@ class connection_wrapper cursor_wrapper * cursor(); void commit(); void close(); - query_result * query(const std::string & query_str, const std::string & format = "CSV"); + py::object query(const std::string & query_str, const std::string & format = "CSV"); streaming_query_result * send_query(const std::string & query_str, const std::string & format = "CSV"); query_result * streaming_fetch_result(streaming_query_result * streaming_result); void streaming_cancel_query(streaming_query_result * streaming_result); diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index af5590c8186..886fe374be0 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -1,7 +1,7 @@ #include "LocalServer.h" - #include "chdb-internal.h" #if USE_PYTHON +#include 
"ChunkCollectorOutputFormat.h" #include "StoragePython.h" #include "TableFunctionPython.h" #else @@ -10,7 +10,6 @@ #endif #include #include - #include #include #include @@ -671,6 +670,7 @@ try auto & storage_factory = StorageFactory::instance(); #if USE_PYTHON registerStoragePython(storage_factory); + CHDB::registerDataFrameOutputFormat(); #else registerStorageArrowStream(storage_factory); #endif diff --git a/programs/local/NumpyArray.cpp b/programs/local/NumpyArray.cpp new file mode 100644 index 00000000000..8d672c11de1 --- /dev/null +++ b/programs/local/NumpyArray.cpp @@ -0,0 +1,937 @@ +#include "NumpyArray.h" +#include "NumpyType.h" +#include "NumpyNestedTypes.h" +#include "PythonImporter.h" +#include "FieldToPython.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int NOT_IMPLEMENTED; + extern const int LOGICAL_ERROR; +} + +} + +using namespace DB; + +namespace CHDB +{ + +struct RegularConvert +{ + template + static NUMPYTYPE convertValue(CHTYPE val, NumpyAppendData & append_data) + { + (void)append_data; + return (NUMPYTYPE)val; + } + + template + static NUMPYTYPE nullValue(bool & set_mask) + { + set_mask = true; + return 0; + } +}; + +struct DateConvert +{ + template + static NUMPYTYPE convertValue(CHTYPE val, NumpyAppendData & append_data) + { + (void)append_data; + return (NUMPYTYPE)val * 3600 * 24; + } + + template + static NUMPYTYPE nullValue(bool & set_mask) + { + set_mask = true; + return 0; + } +}; + +struct TimeConvert +{ + template + static NUMPYTYPE convertValue(CHTYPE val, NumpyAppendData & append_data) + { + chassert(append_data.type); + + Field field(static_cast(val)); + auto time_object = convertTimeFieldToPython(field); + return time_object.release().ptr(); + } + + template + static NUMPYTYPE nullValue(bool & set_mask) + { + set_mask = true; + return nullptr; + } +}; + +struct Time64Convert +{ + template + static NUMPYTYPE convertValue(CHTYPE val, NumpyAppendData & append_data) + { + chassert(append_data.type); + + const auto & time64_type = typeid_cast(*append_data.type); + UInt32 scale = time64_type.getScale(); + DecimalField decimal_field(static_cast(val), scale); + Field field(decimal_field); + + auto time64_object = convertTime64FieldToPython(field); + return time64_object.release().ptr(); + } + + template + static NUMPYTYPE nullValue(bool & set_mask) + { + set_mask = true; + return nullptr; + } +}; + +struct Enum8Convert +{ + template + static NUMPYTYPE convertValue(CHTYPE val, NumpyAppendData & append_data) + { + const auto & enum_type = typeid_cast(*append_data.type); + + try + { + auto it = enum_type.findByValue(static_cast(val)); + String enum_name(it->second.data, it->second.size); + return py::str(enum_name).release().ptr(); + } + catch (...) + { + return py::str(toString(static_cast(val))).release().ptr(); + } + } + + template + static NUMPYTYPE nullValue(bool & set_mask) + { + set_mask = true; + return nullptr; + } +}; + +struct Enum16Convert +{ + template + static NUMPYTYPE convertValue(CHTYPE val, NumpyAppendData & append_data) + { + const auto & enum_type = typeid_cast(*append_data.type); + try + { + auto it = enum_type.findByValue(static_cast(val)); + String enum_name(it->second.data, it->second.size); + return py::str(enum_name).release().ptr(); + } + catch (...) 
+ { + return py::str(toString(static_cast(val))).release().ptr(); + } + } + + template + static NUMPYTYPE nullValue(bool & set_mask) + { + set_mask = true; + return nullptr; + } +}; + +template +static bool TransformColumn(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * tmp_ptr = static_cast(data_column)->getRawDataBegin(); + const auto * src_ptr = reinterpret_cast(tmp_ptr); + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.src_count; i++) + { + size_t src_index = append_data.src_offset + i; + size_t dest_index = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(src_index)) + { + dest_ptr[dest_index] = CONVERT::template nullValue(mask_ptr[dest_index]); + has_null = has_null || mask_ptr[dest_index]; + } + else + { + dest_ptr[dest_index] = CONVERT::template convertValue(src_ptr[src_index], append_data); + mask_ptr[dest_index] = false; + } + } + + return has_null; +} + +template +static bool CHColumnToNumpyArray(NumpyAppendData & append_data) +{ + return TransformColumn(append_data); +} + +template +static bool CHColumnDecimalToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * decimal_column = typeid_cast *>(data_column); + if (!decimal_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnDecimal"); + + /// Get scale from data type to convert integer to actual decimal value + const auto * decimal_type = typeid_cast *>(data_type.get()); + if (!decimal_type) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected DataTypeDecimal"); + + UInt32 scale = decimal_type->getScale(); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.src_count; i++) + { + size_t src_index = append_data.src_offset + i; + size_t dest_index = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(src_index)) + { + /// Set to 0.0 for null values + dest_ptr[dest_index] = 0.0; + mask_ptr[dest_index] = true; + has_null = true; + } + else + { + auto decimal_value = decimal_column->getElement(src_index); + dest_ptr[dest_index] = DecimalUtils::convertTo(decimal_value, scale); + mask_ptr[dest_index] = false; + } + } + + return has_null; +} + +static bool CHColumnDateTime64ToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * decimal_column = typeid_cast *>(data_column); + if (!decimal_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnDecimal"); + + auto * dest_ptr = 
reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.src_count; i++) + { + size_t src_index = append_data.src_offset + i; + size_t dest_index = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(src_index)) + { + dest_ptr[dest_index] = 0; + mask_ptr[dest_index] = true; + has_null = true; + } + else + { + /// Get the DateTime64 value and convert to nanoseconds + Int64 raw_value = decimal_column->getInt(src_index); + auto scale = decimal_column->getScale(); + + Int64 ns_value; + chassert(scale <= 9); + Int64 multiplier = common::exp10_i32(9 - scale); + ns_value = raw_value * multiplier; + + dest_ptr[dest_index] = ns_value; + mask_ptr[dest_index] = false; + } + } + + return has_null; +} + +static bool CHColumnIntervalToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto & int64_column = typeid_cast &>(*data_column); + const auto & inteval_type = typeid_cast(*append_data.type); + IntervalKind kind = inteval_type.getKind(); + size_t multiplier = 1; + + switch (kind) + { + case IntervalKind::Kind::Year: + multiplier = 3600 * 24 * 365; + break; + case IntervalKind::Kind::Quarter: + multiplier = 3600 * 24 * 30 * 3; + break; + case IntervalKind::Kind::Month: + multiplier = 3600 * 24 * 30; + break; + case IntervalKind::Kind::Week: + multiplier = 3600 * 24 * 7; + break; + case IntervalKind::Kind::Day: + multiplier = 3600 * 24; + break; + case IntervalKind::Kind::Hour: + multiplier = 3600; + break; + case IntervalKind::Kind::Minute: + multiplier = 60; + break; + default: + break; + } + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.src_count; i++) + { + size_t src_index = append_data.src_offset + i; + size_t dest_index = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(src_index)) + { + dest_ptr[dest_index] = 0; + mask_ptr[dest_index] = true; + has_null = true; + } + else + { + Int64 interval_value = int64_column.getElement(src_index); + interval_value *= multiplier; + + dest_ptr[dest_index] = interval_value; + mask_ptr[dest_index] = false; + } + } + + return has_null; +} + +static bool CHColumnUUIDToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * uuid_column = typeid_cast *>(data_column); + if (!uuid_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnVector"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.src_count; i++) + { + size_t src_index = append_data.src_offset + i; + size_t dest_index = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(src_index)) + { + dest_ptr[dest_index] = nullptr; + has_null = true; + mask_ptr[dest_index] = true; + } + else + { + /// Convert UUID to Python 
uuid.UUID object + UUID uuid_value = uuid_column->getElement(src_index); + const auto formatted_uuid = formatUUID(uuid_value); + const char * uuid_str = formatted_uuid.data(); + const size_t uuid_str_len = formatted_uuid.size(); + + /// Create Python uuid.UUID object + auto & import_cache = PythonImporter::ImportCache(); + py::handle uuid_handle = import_cache.uuid.UUID()(String(uuid_str, uuid_str_len)).release(); + dest_ptr[dest_index] = uuid_handle.ptr(); + mask_ptr[dest_index] = false; + } + } + + return has_null; +} + +static bool CHColumnIPv4ToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * ipv4_column = typeid_cast *>(data_column); + if (!ipv4_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnVector"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.src_count; i++) + { + size_t src_index = append_data.src_offset + i; + size_t dest_index = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(src_index)) + { + dest_ptr[dest_index] = nullptr; + has_null = true; + mask_ptr[dest_index] = true; + } + else + { + /// Convert IPv4 to Python ipaddress.IPv4Address object + IPv4 ipv4_value = ipv4_column->getElement(src_index); + + char ipv4_str[IPV4_MAX_TEXT_LENGTH]; + char * ptr = ipv4_str; + formatIPv4(reinterpret_cast(&ipv4_value), ptr); + const size_t ipv4_str_len = ptr - ipv4_str; + + /// Create Python ipaddress.IPv4Address object + auto & import_cache = PythonImporter::ImportCache(); + py::handle ipv4_handle = import_cache.ipaddress.ipv4_address()(String(ipv4_str, ipv4_str_len)).release(); + dest_ptr[dest_index] = ipv4_handle.ptr(); + mask_ptr[dest_index] = false; + } + } + + return has_null; +} + +static bool CHColumnIPv6ToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * ipv6_column = typeid_cast *>(data_column); + if (!ipv6_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected ColumnVector"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.src_count; i++) + { + size_t src_index = append_data.src_offset + i; + size_t dest_index = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(src_index)) + { + dest_ptr[dest_index] = nullptr; + has_null = true; + mask_ptr[dest_index] = true; + } + else + { + /// Convert IPv6 to Python ipaddress.IPv6Address object + IPv6 ipv6_value = ipv6_column->getElement(src_index); + + /// Use ClickHouse's built-in IPv6 formatting function + char ipv6_str[IPV6_MAX_TEXT_LENGTH]; + char * ptr = ipv6_str; + formatIPv6(reinterpret_cast(&ipv6_value), ptr); + const size_t ipv6_str_len = ptr - ipv6_str; + + /// Create Python ipaddress.IPv6Address object + auto & import_cache = PythonImporter::ImportCache(); + py::handle ipv6_handle = 
import_cache.ipaddress.ipv6_address()(String(ipv6_str, ipv6_str_len)).release(); + dest_ptr[dest_index] = ipv6_handle.ptr(); + mask_ptr[dest_index] = false; + } + } + + return has_null; +} + +template +static bool CHColumnStringToNumpyArray(NumpyAppendData & append_data) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + /// Check if column is nullable + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * string_column = typeid_cast(data_column); + if (!string_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected String ColumnType"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + + for (size_t i = 0; i < append_data.src_count; i++) + { + size_t src_index = append_data.src_offset + i; + size_t dest_index = append_data.dest_offset + i; + if (nullable_column && nullable_column->isNullAt(src_index)) + { + Py_INCREF(Py_None); + dest_ptr[dest_index] = Py_None; + } + else + { + StringRef str_ref = string_column->getDataAt(src_index); + auto * str_ptr = const_cast(str_ref.data); + auto str_size = str_ref.size; + dest_ptr[dest_index] = PyUnicode_FromStringAndSize(str_ptr, str_size); + } + } + + return has_null; +} + +NumpyAppendData::NumpyAppendData( + const DB::IColumn & column_, + const DB::DataTypePtr & type_) + : column(column_) + , type(type_) + , src_offset(0) + , src_count(0) + , dest_offset(0) + , target_data(nullptr) + , target_mask(nullptr) +{ +} + +InternalNumpyArray::InternalNumpyArray(const DataTypePtr & type_) + : data(nullptr) + , type(type_) + , count(0) +{ +} + +void InternalNumpyArray::init(size_t capacity) +{ + String type_str = DataTypeToNumpyTypeStr(type); + + array = py::array(py::dtype(type_str), capacity); + data = reinterpret_cast(array.mutable_data()); +} + +void InternalNumpyArray::resize(size_t capacity) +{ + std::vector new_shape {py::ssize_t(capacity)}; + + array.resize(new_shape, false); + data = reinterpret_cast(array.mutable_data()); +} + +NumpyArray::NumpyArray(const DataTypePtr & type_) + : hava_null(false) +{ + data_array = std::make_unique(type_); +} + +void NumpyArray::init(size_t capacity, bool may_have_null) +{ + data_array->init(capacity); + + if (may_have_null) + { + if (!mask_array) + mask_array = std::make_unique(DataTypeFactory::instance().get("Bool")); + + mask_array->init(capacity); + } +} + +void NumpyArray::resize(size_t capacity, bool may_have_null) +{ + data_array->resize(capacity); + + if (may_have_null) + { + if (!mask_array) + mask_array = std::make_unique(DataTypeFactory::instance().get("Bool")); + + mask_array->resize(capacity); + } +} + +static bool CHColumnNothingToNumpyArray(NumpyAppendData & append_data) +{ + /// Nothing type represents columns with no actual values, so we fill all positions with None + bool has_null = true; + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = 0; i < append_data.src_count; i++) + { + size_t dest_index = append_data.dest_offset + i; + + dest_ptr[dest_index] = nullptr; + mask_ptr[dest_index] = true; + } + + return has_null; +} + +void NumpyArray::append(const ColumnPtr & column) +{ + append(column, 0, column->size()); +} + +void NumpyArray::append( + const ColumnPtr & column, + size_t offset, + size_t count) +{ + auto actual_column = column->convertToFullColumnIfLowCardinality(); + DataTypePtr actual_type = 
removeLowCardinalityAndNullable(data_array->type); + + chassert(data_array); + chassert(mask_array); + + auto * data_ptr = data_array->data; + auto * mask_ptr = reinterpret_cast(mask_array->data); + chassert(data_ptr); + chassert(mask_ptr); + chassert(actual_column->isNullable() || actual_column->getDataType() == actual_type->getColumnType()); + + data_array->count += count; + mask_array->count += count; + bool may_have_null = false; + + NumpyAppendData append_data(*actual_column, actual_type); + append_data.src_offset = offset; + append_data.src_count = count; + append_data.target_data = data_ptr; + append_data.target_mask = mask_ptr; + append_data.dest_offset = data_array->count - count; + + switch (actual_type->getTypeId()) + { + case TypeIndex::Nothing: + may_have_null = CHColumnNothingToNumpyArray(append_data); + break; + + case TypeIndex::Int8: + may_have_null = CHColumnToNumpyArray(append_data); + break; + + case TypeIndex::UInt8: + { + auto is_bool = isBool(actual_type); + if (is_bool) + may_have_null = CHColumnToNumpyArray(append_data); + else + may_have_null = CHColumnToNumpyArray(append_data); + } + break; + + case TypeIndex::Int16: + may_have_null = CHColumnToNumpyArray(append_data); + break; + + case TypeIndex::UInt16: + may_have_null = CHColumnToNumpyArray(append_data); + break; + + case TypeIndex::Int32: + may_have_null = CHColumnToNumpyArray(append_data); + break; + + case TypeIndex::UInt32: + may_have_null = CHColumnToNumpyArray(append_data); + break; + + case TypeIndex::Int64: + may_have_null = CHColumnToNumpyArray(append_data); + break; + + case TypeIndex::UInt64: + may_have_null = CHColumnToNumpyArray(append_data); + break; + + case TypeIndex::Float32: + may_have_null = CHColumnToNumpyArray(append_data); + break; + + case TypeIndex::Float64: + may_have_null = CHColumnToNumpyArray(append_data); + break; + + case TypeIndex::Int128: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::Int256: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::UInt128: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::UInt256: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::BFloat16: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::Date: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::Date32: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::DateTime: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::DateTime64: + may_have_null = CHColumnDateTime64ToNumpyArray(append_data); + break; + + case TypeIndex::Time: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::Time64: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::String: + may_have_null = CHColumnStringToNumpyArray(append_data); + break; + + case TypeIndex::FixedString: + may_have_null = CHColumnStringToNumpyArray(append_data); + break; + + case TypeIndex::Enum8: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::Enum16: + may_have_null = TransformColumn(append_data); + break; + + case TypeIndex::Decimal32: + may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); + break; + + case TypeIndex::Decimal64: + may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); + break; + + case TypeIndex::Decimal128: + may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); + break; + + case TypeIndex::Decimal256: + 
may_have_null = CHColumnDecimalToNumpyArray(append_data, actual_type); + break; + + case TypeIndex::UUID: + may_have_null = CHColumnUUIDToNumpyArray(append_data); + break; + + case TypeIndex::Array: + may_have_null = CHColumnArrayToNumpyArray(append_data, actual_type); + break; + + case TypeIndex::Tuple: + may_have_null = CHColumnTupleToNumpyArray(append_data, actual_type); + break; + + case TypeIndex::Interval: + may_have_null = CHColumnIntervalToNumpyArray(append_data); + break; + + case TypeIndex::Map: + may_have_null = CHColumnMapToNumpyArray(append_data, actual_type); + break; + + case TypeIndex::Object: + may_have_null = CHColumnObjectToNumpyArray(append_data, actual_type); + break; + + case TypeIndex::IPv4: + may_have_null = CHColumnIPv4ToNumpyArray(append_data); + break; + + case TypeIndex::IPv6: + may_have_null = CHColumnIPv6ToNumpyArray(append_data); + break; + + case TypeIndex::Variant: + may_have_null = CHColumnVariantToNumpyArray(append_data, actual_type); + break; + + case TypeIndex::Dynamic: + may_have_null = CHColumnDynamicToNumpyArray(append_data, actual_type); + break; + + /// Set types are used only in WHERE clauses for IN operations, not in actual data storage + case TypeIndex::Set: + /// JSONPaths is an internal type used only for JSON schema inference, + case TypeIndex::JSONPaths: + /// Deprecated type, should not appear in normal data processing + case TypeIndex::ObjectDeprecated: + /// Function types are not actual data types, should not appear here + case TypeIndex::Function: + /// Aggregate function types are not actual data types, should not appear here + case TypeIndex::AggregateFunction: + /// LowCardinality should be unwrapped before reaching this point + case TypeIndex::LowCardinality: + /// Nullable cannot contain another Nullable type, so this should not appear in nested conversion + case TypeIndex::Nullable: + /// QBit type is supported in newer versions of ClickHouse + /// case TypeIndex::QBit: + default: + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", data_array->type->getName()); + } + + if (may_have_null) + { + hava_null = true; + } +} + +void NumpyArray::append( + const DB::IColumn & column, + const DB::DataTypePtr & type, + size_t index) +{ + chassert(data_array); + chassert(!mask_array); + + auto * data_ptr = data_array->data; + chassert(data_ptr); + + auto * dest_ptr = reinterpret_cast(data_ptr) + data_array->count; + + *dest_ptr = convertFieldToPython(column, type, index); + + data_array->count += 1; +} + +py::object NumpyArray::toArray() const +{ + chassert(data_array); + + data_array->resize(data_array->count); + if (!hava_null) + { + return std::move(data_array->array); + } + + chassert(mask_array); + + mask_array->resize(mask_array->count); + auto data_values = std::move(data_array->array); + auto null_values = std::move(mask_array->array); + + auto masked_array = py::module::import("numpy.ma").attr("masked_array")(data_values, null_values); + return masked_array; +} + +} // namespace CHDB diff --git a/programs/local/NumpyArray.h b/programs/local/NumpyArray.h new file mode 100644 index 00000000000..ca2af0ae6bd --- /dev/null +++ b/programs/local/NumpyArray.h @@ -0,0 +1,74 @@ +#pragma once + +#include "PybindWrapper.h" + +#include +#include +#include +#include + +namespace CHDB +{ + +/// Data structure for appending column data to numpy arrays +class NumpyAppendData +{ +public: + explicit NumpyAppendData( + const DB::IColumn & column_, + const DB::DataTypePtr & type_); + + const DB::IColumn & column; + const DB::DataTypePtr 
& type;
+
+    size_t src_offset;
+    size_t src_count;
+    size_t dest_offset;
+    UInt8 * target_data;
+    bool * target_mask;
+};
+
+class InternalNumpyArray
+{
+public:
+    explicit InternalNumpyArray(const DB::DataTypePtr & type);
+
+    void init(size_t capacity);
+
+    void resize(size_t capacity);
+
+    py::array array;
+    UInt8 * data;
+    DB::DataTypePtr type;
+    size_t count;
+};
+
+class NumpyArray {
+public:
+    explicit NumpyArray(const DB::DataTypePtr & type_);
+
+    void init(size_t capacity, bool may_have_null = true);
+
+    void resize(size_t capacity, bool may_have_null = true);
+
+    void append(const DB::ColumnPtr & column);
+
+    void append(
+        const DB::ColumnPtr & column,
+        size_t offset,
+        size_t count);
+
+    void append(
+        const DB::IColumn & column,
+        const DB::DataTypePtr & type,
+        size_t index);
+
+    py::object toArray() const;
+
+private:
+    bool hava_null;
+    std::unique_ptr data_array;
+    std::unique_ptr mask_array;
+};
+
+} // namespace CHDB
diff --git a/programs/local/NumpyCacheItem.h b/programs/local/NumpyCacheItem.h
new file mode 100644
index 00000000000..5d75cc5ed0a
--- /dev/null
+++ b/programs/local/NumpyCacheItem.h
@@ -0,0 +1,66 @@
+#pragma once
+
+#include "PythonImportCacheItem.h"
+
+namespace CHDB {
+
+struct NumpyMaCacheItem : public PythonImportCacheItem
+{
+public:
+    NumpyMaCacheItem(PythonImportCacheItem * parent)
+        : PythonImportCacheItem("ma", parent), masked("masked", this), masked_array("masked_array", this) {
+    }
+    ~NumpyMaCacheItem() override = default;
+
+    PythonImportCacheItem masked;
+    PythonImportCacheItem masked_array;
+};
+
+struct NumpyCacheItem : public PythonImportCacheItem
+{
+public:
+    static constexpr const char * Name = "numpy";
+
+    NumpyCacheItem()
+        : PythonImportCacheItem("numpy"), ma(this), ndarray("ndarray", this), datetime64("datetime64", this),
+          generic("generic", this), int64("int64", this), bool_("bool_", this), byte("byte", this),
+          ubyte("ubyte", this), short_("short", this), ushort_("ushort", this), intc("intc", this),
+          uintc("uintc", this), int_("int_", this), uint("uint", this), longlong("longlong", this),
+          ulonglong("ulonglong", this), half("half", this), float16("float16", this), single("single", this),
+          longdouble("longdouble", this), csingle("csingle", this), cdouble("cdouble", this),
+          clongdouble("clongdouble", this) {
+    }
+    ~NumpyCacheItem() override = default;
+
+    NumpyMaCacheItem ma;
+    PythonImportCacheItem ndarray;
+    PythonImportCacheItem datetime64;
+    PythonImportCacheItem generic;
+    PythonImportCacheItem int64;
+    PythonImportCacheItem bool_;
+    PythonImportCacheItem byte;
+    PythonImportCacheItem ubyte;
+    PythonImportCacheItem short_;
+    PythonImportCacheItem ushort_;
+    PythonImportCacheItem intc;
+    PythonImportCacheItem uintc;
+    PythonImportCacheItem int_;
+    PythonImportCacheItem uint;
+    PythonImportCacheItem longlong;
+    PythonImportCacheItem ulonglong;
+    PythonImportCacheItem half;
+    PythonImportCacheItem float16;
+    PythonImportCacheItem single;
+    PythonImportCacheItem longdouble;
+    PythonImportCacheItem csingle;
+    PythonImportCacheItem cdouble;
+    PythonImportCacheItem clongdouble;
+
+protected:
+    bool IsRequired() const override final
+    {
+        return false;
+    }
+};
+
+} // namespace CHDB
diff --git a/programs/local/NumpyNestedTypes.cpp b/programs/local/NumpyNestedTypes.cpp
new file mode 100644
index 00000000000..633f0d89c8d
--- /dev/null
+++ b/programs/local/NumpyNestedTypes.cpp
@@ -0,0 +1,205 @@
+#include "NumpyNestedTypes.h"
+#include "NumpyArray.h"
+#include "FieldToPython.h"
+
+#include
+#include
+#include
+#include
+#include
+#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ +extern const int LOGICAL_ERROR; +extern const int NOT_IMPLEMENTED; +} + +} + +namespace CHDB +{ + +using namespace DB; + +template +struct ColumnTraits; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeArray; + + static py::object convertElement(const ColumnArray * column, const DataTypePtr & data_type, size_t index) + { + const auto & offsets = column->getOffsets(); + const auto & nested_column = column->getDataPtr(); + + size_t start_offset = (index == 0) ? 0 : offsets[index - 1]; + size_t end_offset = offsets[index]; + size_t array_size = end_offset - start_offset; + + /// Extract the nested element type from DataTypeArray + const auto & array_data_type = typeid_cast(*data_type); + const DataTypePtr & nested_data_type = array_data_type.getNestedType(); + + NumpyArray numpy_array(nested_data_type); + numpy_array.init(array_size); + numpy_array.append(nested_column, start_offset, array_size); + + return numpy_array.toArray(); + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeTuple; + + static py::object convertElement(const ColumnTuple * column, const DataTypePtr & data_type, size_t index) + { + const auto & tuple_data_type = typeid_cast(*data_type); + + const auto & element_types = tuple_data_type.getElements(); + size_t tuple_size = column->tupleSize(); + + NumpyArray numpy_array({}); + numpy_array.init(tuple_size, false); + + for (size_t i = 0; i < tuple_size; ++i) + { + numpy_array.append(column->getColumn(i), element_types[i], index); + } + + return numpy_array.toArray(); + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeMap; + + static py::object convertElement(const ColumnMap * column, const DataTypePtr & data_type, size_t index) + { + return convertFieldToPython(*column, data_type, index); + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeObject; + + static py::object convertElement(const ColumnObject * column, const DataTypePtr & data_type, size_t index) + { + return convertFieldToPython(*column, data_type, index); + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeVariant; + + static py::object convertElement(const ColumnVariant * column, const DataTypePtr & data_type, size_t index) + { + return convertFieldToPython(*column, data_type, index); + } +}; + +template <> +struct ColumnTraits +{ + using DataType = DataTypeDynamic; + + static py::object convertElement(const ColumnDynamic * column, const DataTypePtr & data_type, size_t index) + { + return convertFieldToPython(*column, data_type, index); + } +}; + +template +bool CHNestedColumnToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + bool has_null = false; + const IColumn * data_column = &append_data.column; + const ColumnNullable * nullable_column = nullptr; + + if (const auto * nullable = typeid_cast(&append_data.column)) + { + nullable_column = nullable; + data_column = &nullable->getNestedColumn(); + } + + const auto * typed_column = typeid_cast(data_column); + if (!typed_column) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Expected specific column type"); + + auto * dest_ptr = reinterpret_cast(append_data.target_data); + auto * mask_ptr = append_data.target_mask; + + for (size_t i = append_data.src_offset; i < append_data.src_offset + append_data.src_count; i++) + { + size_t offset = append_data.dest_offset + i; + if 
(nullable_column && nullable_column->isNullAt(i)) + { + dest_ptr[offset] = py::none(); + mask_ptr[offset] = true; + has_null = true; + } + else + { + dest_ptr[offset] = ColumnTraits::convertElement(typed_column, data_type, i); + mask_ptr[offset] = false; + } + } + + return has_null; +} + +bool CHColumnArrayToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnTupleToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnMapToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnObjectToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnVariantToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +bool CHColumnDynamicToNumpyArray(NumpyAppendData & append_data, const DataTypePtr & data_type) +{ + return CHNestedColumnToNumpyArray(append_data, data_type); +} + +} // namespace CHDB diff --git a/programs/local/NumpyNestedTypes.h b/programs/local/NumpyNestedTypes.h new file mode 100644 index 00000000000..b3e0a68520e --- /dev/null +++ b/programs/local/NumpyNestedTypes.h @@ -0,0 +1,20 @@ +#pragma once + +#include "NumpyArray.h" + +namespace CHDB +{ + +bool CHColumnArrayToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnTupleToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnMapToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnObjectToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnVariantToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +bool CHColumnDynamicToNumpyArray(NumpyAppendData & append_data, const DB::DataTypePtr & data_type); + +} // namespace CHDB diff --git a/programs/local/NumpyType.cpp b/programs/local/NumpyType.cpp index cf92c8dece9..4c113b905c9 100644 --- a/programs/local/NumpyType.cpp +++ b/programs/local/NumpyType.cpp @@ -1,11 +1,14 @@ #include "NumpyType.h" +#include "PythonImporter.h" -#include #include #include +#include +#include #include #include #include +#include using namespace DB; @@ -138,7 +141,6 @@ static NumpyNullableType ConvertNumpyTypeInternal(const String & col_type_str) {"Float64", NumpyNullableType::FLOAT_64}, {"string", NumpyNullableType::STRING}, {"object", NumpyNullableType::OBJECT}, - {"timedelta64[ns]", NumpyNullableType::TIMEDELTA}, {"category", NumpyNullableType::CATEGORY}, }; @@ -154,6 +156,8 @@ static NumpyNullableType ConvertNumpyTypeInternal(const String & col_type_str) return NumpyNullableType::DATETIME_MS; if (startsWith(col_type_str, "datetime64[s")) return NumpyNullableType::DATETIME_S; + if (startsWith(col_type_str, "timedelta64[")) + return NumpyNullableType::TIMEDELTA; /// Legacy datetime type indicators if (startsWith(col_type_str, " NumpyToDataType(const NumpyType & col_type) } } +String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type) +{ + if (!data_type) + return "object"; + + auto actual_data_type = removeLowCardinalityAndNullable(data_type); + + TypeIndex type_id = actual_data_type->getTypeId(); + switch (type_id) + { + case 
TypeIndex::Nothing: + return "object"; + + case TypeIndex::Int8: + return "int8"; + + case TypeIndex::UInt8: + /// Special case: UInt8 could be Bool type, need to check getName() + { + auto is_bool = isBool(actual_data_type); + return is_bool ? "bool" : "uint8"; + } + + case TypeIndex::Int16: + return "int16"; + + case TypeIndex::UInt16: + return "uint16"; + + case TypeIndex::Int32: + return "int32"; + + case TypeIndex::UInt32: + return "uint32"; + + case TypeIndex::Int64: + return "int64"; + + case TypeIndex::UInt64: + return "uint64"; + + case TypeIndex::BFloat16: + case TypeIndex::Float32: + return "float32"; + + case TypeIndex::Int256: + case TypeIndex::UInt256: + case TypeIndex::Int128: + case TypeIndex::UInt128: + case TypeIndex::Float64: + return "float64"; + + case TypeIndex::String: + case TypeIndex::FixedString: + return "object"; + + case TypeIndex::DateTime: + return "datetime64[s]"; + + case TypeIndex::DateTime64: + { + if (const auto * dt64 = typeid_cast(actual_data_type.get())) + { + UInt32 scale = dt64->getScale(); + if (scale <= 9) + return "datetime64[ns]"; + + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}, scale {}", actual_data_type->getName(), scale); + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected datetime64 type {}", actual_data_type->getName()); + } + + case TypeIndex::Date: + case TypeIndex::Date32: + return "datetime64[s]"; // pandas converts datetime64[D] to datetime64[s] internally + + case TypeIndex::Time: + case TypeIndex::Time64: + return "object"; + + case TypeIndex::Interval: + { + if (const auto * interval = typeid_cast(actual_data_type.get())) + { + IntervalKind kind = interval->getKind(); + switch (kind.kind) + { + case IntervalKind::Kind::Nanosecond: + return "timedelta64[ns]"; + case IntervalKind::Kind::Microsecond: + return "timedelta64[us]"; + case IntervalKind::Kind::Millisecond: + return "timedelta64[ms]"; + case IntervalKind::Kind::Second: + return "timedelta64[s]"; + case IntervalKind::Kind::Minute: + case IntervalKind::Kind::Hour: + case IntervalKind::Kind::Day: + case IntervalKind::Kind::Week: + case IntervalKind::Kind::Month: + case IntervalKind::Kind::Quarter: + case IntervalKind::Kind::Year: + return "timedelta64[s]"; + default: + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected interval kind {}", kind.kind); + } + } + throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected interval type {}", actual_data_type->getName()); + } + + case TypeIndex::UUID: + case TypeIndex::IPv4: + case TypeIndex::IPv6: + return "object"; + + case TypeIndex::Decimal32: + case TypeIndex::Decimal64: + case TypeIndex::Decimal128: + case TypeIndex::Decimal256: + return "float64"; + + case TypeIndex::Array: + case TypeIndex::Tuple: + case TypeIndex::Map: + case TypeIndex::Set: + case TypeIndex::Dynamic: + case TypeIndex::Variant: + case TypeIndex::Object: + return "object"; + + case TypeIndex::Enum8: + case TypeIndex::Enum16: + return "object"; + + case TypeIndex::Nullable: + default: + throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Unsupported type {}", actual_data_type->getName()); + } +} + +py::object ConvertNumpyDtype(const py::handle & numpy_array) +{ + chassert(py::gil_check()); + + auto & import_cache = PythonImporter::ImportCache(); + + auto dtype = numpy_array.attr("dtype"); + if (!py::isinstance(numpy_array, import_cache.numpy.ma.masked_array())) + { + return dtype; + } + + auto numpy_type = ConvertNumpyType(dtype); + switch (numpy_type.type) + { + case NumpyNullableType::BOOL: + return 
import_cache.pandas.BooleanDtype()(); + case NumpyNullableType::UINT_8: + return import_cache.pandas.UInt8Dtype()(); + case NumpyNullableType::UINT_16: + return import_cache.pandas.UInt16Dtype()(); + case NumpyNullableType::UINT_32: + return import_cache.pandas.UInt32Dtype()(); + case NumpyNullableType::UINT_64: + return import_cache.pandas.UInt64Dtype()(); + case NumpyNullableType::INT_8: + return import_cache.pandas.Int8Dtype()(); + case NumpyNullableType::INT_16: + return import_cache.pandas.Int16Dtype()(); + case NumpyNullableType::INT_32: + return import_cache.pandas.Int32Dtype()(); + case NumpyNullableType::INT_64: + return import_cache.pandas.Int64Dtype()(); + case NumpyNullableType::FLOAT_32: + return import_cache.pandas.Float32Dtype()(); + case NumpyNullableType::FLOAT_64: + return import_cache.pandas.Float64Dtype()(); + case NumpyNullableType::FLOAT_16: + default: + return dtype; + } +} + } // namespace CHDB diff --git a/programs/local/NumpyType.h b/programs/local/NumpyType.h index c58fee13768..da8ccd5eafe 100644 --- a/programs/local/NumpyType.h +++ b/programs/local/NumpyType.h @@ -48,6 +48,11 @@ enum class NumpyObjectType : uint8_t { }; NumpyType ConvertNumpyType(const py::handle & col_type); + std::shared_ptr NumpyToDataType(const NumpyType & col_type); +String DataTypeToNumpyTypeStr(const std::shared_ptr & data_type); + +py::object ConvertNumpyDtype(const py::handle & numpy_array); + } // namespace CHDB diff --git a/programs/local/ObjectToPython.cpp b/programs/local/ObjectToPython.cpp new file mode 100644 index 00000000000..7bf2626e9ce --- /dev/null +++ b/programs/local/ObjectToPython.cpp @@ -0,0 +1,156 @@ +#include "ObjectToPython.h" +#include "FieldToPython.h" + +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int LOGICAL_ERROR; +} + +} + +namespace CHDB +{ + +using namespace DB; +namespace py = pybind11; + +struct PathElements +{ + explicit PathElements(const String & path) + { + const char * start = path.data(); + const char * end = start + path.size(); + const char * pos = start; + const char * last_dot_pos = pos - 1; + for (pos = start; pos != end; ++pos) + { + if (*pos == '.') + { + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); + last_dot_pos = pos; + } + } + + elements.emplace_back(last_dot_pos + 1, size_t(pos - last_dot_pos - 1)); + } + + size_t size() const { return elements.size(); } + + std::vector elements; +}; + +py::object convertObjectToPython( + const IColumn & column, + const DataTypePtr & type, + size_t index) +{ + const IColumn * data_column = &column; + if (const auto * nullable = typeid_cast(&column)) + { + data_column = &nullable->getNestedColumn(); + } + + const auto & column_object = typeid_cast(*data_column); + const auto & typed_paths = column_object.getTypedPaths(); + const auto & dynamic_paths = column_object.getDynamicPaths(); + const auto & shared_data_offsets = column_object.getSharedDataOffsets(); + const auto [shared_data_paths, shared_data_values] = column_object.getSharedDataPathsAndValues(); + + size_t shared_data_offset = shared_data_offsets[static_cast(index) - 1]; + size_t shared_data_end = shared_data_offsets[static_cast(index)]; + + const auto & object_type = typeid_cast(*type); + const auto & specific_typed_paths = object_type.getTypedPaths(); + const auto & dynamic_data_type = object_type.getDynamicType(); + + std::vector> path_values; + path_values.reserve(typed_paths.size() + dynamic_paths.size() + (shared_data_end - 
shared_data_offset)); + + for (const auto & [path, column_ptr] : typed_paths) + { + auto iter = specific_typed_paths.find(path); + if (iter == specific_typed_paths.end()) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Path {} not found in typed paths", path); + + const auto & specific_data_type = iter->second; + auto python_value = convertFieldToPython(*column_ptr, specific_data_type, index); + path_values.emplace_back(path, python_value); + } + + for (const auto & [path, dynamic_column] : dynamic_paths) + { + if (!dynamic_column->isNullAt(index)) + { + auto python_value = convertFieldToPython(*dynamic_column, dynamic_data_type, index); + path_values.emplace_back(path, python_value); + } + } + + size_t index_in_shared_data_values = shared_data_offset; + for (size_t i = shared_data_offset; i != shared_data_end; ++i) + { + auto path = shared_data_paths->getDataAt(i).toString(); + + auto tmp_dynamic_column = ColumnDynamic::create(); + tmp_dynamic_column->reserve(1); + ColumnObject::deserializeValueFromSharedData(shared_data_values, index_in_shared_data_values++, *tmp_dynamic_column); + + auto python_value = convertFieldToPython(*tmp_dynamic_column, dynamic_data_type, 0); + path_values.emplace_back(path, python_value); + } + + py::dict result; + + for (const auto & [path, value] : path_values) + { + PathElements path_elements(path); + + if (path_elements.size() == 1) + { + String key(path_elements.elements[0]); + result[key.c_str()] = value; + } + else + { + py::dict current_dict = result; + + for (size_t i = 0; i < path_elements.size() - 1; ++i) + { + String key(path_elements.elements[i]); + + if (current_dict.contains(key.c_str())) + { + py::object nested = (*current_dict)[key.c_str()]; + current_dict = nested.cast(); + } + else + { + py::dict new_dict; + current_dict[key.c_str()] = new_dict; + current_dict = new_dict; + } + } + + String final_key(path_elements.elements[path_elements.size() - 1]); + current_dict[final_key.c_str()] = value; + } + } + + /// Return None if the result dictionary is empty + if (result.empty()) + return py::none(); + + return result; +} + +} // namespace CHDB diff --git a/programs/local/ObjectToPython.h b/programs/local/ObjectToPython.h new file mode 100644 index 00000000000..64d79e218fd --- /dev/null +++ b/programs/local/ObjectToPython.h @@ -0,0 +1,15 @@ +#pragma once + +#include +#include +#include + +namespace CHDB +{ + +pybind11::object convertObjectToPython( + const DB::IColumn & column, + const DB::DataTypePtr & type, + size_t index); + +} // namespace CHDB diff --git a/programs/local/PandasAnalyzer.cpp b/programs/local/PandasAnalyzer.cpp index f1c97c96772..57d6140c692 100644 --- a/programs/local/PandasAnalyzer.cpp +++ b/programs/local/PandasAnalyzer.cpp @@ -38,7 +38,7 @@ PandasAnalyzer::PandasAnalyzer(const DB::Settings & settings) bool PandasAnalyzer::Analyze(py::object column) { #if USE_JEMALLOC - ::Memory::MemoryCheckScope memory_check_scope; + ::Memory::MemoryCheckScope memory_check_scope; #endif if (sample_size == 0) return false; diff --git a/programs/local/PandasDataFrameBuilder.cpp b/programs/local/PandasDataFrameBuilder.cpp new file mode 100644 index 00000000000..7b570cebfcb --- /dev/null +++ b/programs/local/PandasDataFrameBuilder.cpp @@ -0,0 +1,176 @@ +#include "PandasDataFrameBuilder.h" +#include "PythonImporter.h" +#include "NumpyType.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +namespace 
ErrorCodes +{ +extern const int LOGICAL_ERROR; +} + +} + +using namespace DB; + +namespace CHDB +{ + +PandasDataFrameBuilder::PandasDataFrameBuilder(const Block & sample) +{ + column_names.reserve(sample.columns()); + column_types.reserve(sample.columns()); + + for (const auto & column : sample) + { + column_names.push_back(column.name); + column_types.push_back(column.type); + + /// Record timezone for timezone-aware types + if (const auto * dt = typeid_cast(column.type.get())) + column_timezones[column.name] = dt->getTimeZone().getTimeZone(); + else if (const auto * dt64 = typeid_cast(column.type.get())) + column_timezones[column.name] = dt64->getTimeZone().getTimeZone(); + } +} + +void PandasDataFrameBuilder::addChunk(const Chunk & chunk) +{ + if (chunk.hasRows()) + { + chunks.push_back(chunk.clone()); + total_rows += chunk.getNumRows(); + } +} + +py::object PandasDataFrameBuilder::genDataFrame(const py::handle & dict) +{ + auto & import_cache = PythonImporter::ImportCache(); + auto pandas = import_cache.pandas(); + if (!pandas) + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "Pandas is not installed"); + } + + py::object items = dict.attr("items")(); + for (const py::handle & item : items) { + auto key_value = py::cast(item); + py::handle key = key_value[0]; + py::handle value = key_value[1]; + + if (py::isinstance(value, import_cache.numpy.ma.masked_array())) + { + auto dtype = ConvertNumpyDtype(value); + auto series = pandas.attr("Series")(value.attr("data"), py::arg("dtype") = dtype); + series.attr("__setitem__")(value.attr("mask"), import_cache.pandas.NA()); + dict.attr("__setitem__")(key, series); + } + } + + auto df = pandas.attr("DataFrame").attr("from_dict")(dict); + + /// Apply timezone conversion for timezone-aware columns + changeToTZType(df); + + return df; +} + +void PandasDataFrameBuilder::changeToTZType(py::object & df) +{ + if (column_timezones.empty()) + return; + + for (const auto & [column_name, timezone_str] : column_timezones) + { + /// Check if column exists in DataFrame + if (!df.attr("__contains__")(column_name).cast()) + continue; + + /// Get the column + auto column = df[column_name.c_str()]; + + /// First localize to UTC (assuming the timestamps are in UTC) + auto utc_localized = column.attr("dt").attr("tz_localize")("UTC"); + + /// Then convert to the target timezone + auto tz_converted = utc_localized.attr("dt").attr("tz_convert")(timezone_str); + + /// Update the column in DataFrame + df.attr("__setitem__")(column_name.c_str(), tz_converted); + } +} + +void PandasDataFrameBuilder::finalize() +{ + if (is_finalized) + return; + + columns_data.reserve(column_types.size()); + + py::gil_scoped_acquire acquire; + + for (const auto & type : column_types) + { + columns_data.emplace_back(type); + } + + for (auto & column_data : columns_data) + { + column_data.init(total_rows); + } + + /// Process all chunks and append column data + for (const auto & chunk : chunks) + { + const auto & columns = chunk.getColumns(); + for (size_t col_idx = 0; col_idx < columns.size(); ++col_idx) + { + auto column = columns[col_idx]; + + columns_data[col_idx].append(column); + } + } + + chunks.clear(); + + /// Create pandas DataFrame + py::dict res; + for (size_t col_idx = 0; col_idx < column_names.size(); ++col_idx) { + auto & name = column_names[col_idx]; + auto & column_data = columns_data[col_idx]; + res[name.c_str()] = column_data.toArray(); + } + final_dataframe = genDataFrame(res); + + is_finalized = true; +} + +py::object PandasDataFrameBuilder::getDataFrame() +{ + 
chassert(is_finalized); + + py::gil_scoped_acquire acquire; + + columns_data.clear(); + return std::move(final_dataframe); +} +} diff --git a/programs/local/PandasDataFrameBuilder.h b/programs/local/PandasDataFrameBuilder.h new file mode 100644 index 00000000000..4c6d395e0a5 --- /dev/null +++ b/programs/local/PandasDataFrameBuilder.h @@ -0,0 +1,53 @@ +#pragma once + +#include "PybindWrapper.h" +#include "NumpyArray.h" + +#include +#include +#include +#include +#include + +namespace CHDB +{ + +/// Builder class to convert ClickHouse Chunks to Pandas DataFrame +/// Accumulates chunks and provides conversion to Python pandas DataFrame object +class PandasDataFrameBuilder +{ +public: + explicit PandasDataFrameBuilder(const DB::Block & sample); + + ~PandasDataFrameBuilder() = default; + + /// Add data chunk + void addChunk(const DB::Chunk & chunk); + + /// Finalize and build pandas DataFrame from all collected chunks + void finalize(); + + /// Get the finalized pandas DataFrame + pybind11::object getDataFrame(); + +private: + pybind11::object genDataFrame(const pybind11::handle & dict); + void changeToTZType(pybind11::object & df); + + std::vector column_names; + std::vector column_types; + + /// Map column name to timezone string for timezone-aware types + std::unordered_map column_timezones; + + std::vector chunks; + std::vector columns_data; + + size_t total_rows = 0; + bool is_finalized = false; + pybind11::object final_dataframe; + + Poco::Logger * log = &Poco::Logger::get("PandasDataFrameBuilder"); +}; + +} diff --git a/programs/local/PythonImportCache.cpp b/programs/local/PythonImportCache.cpp index 6e24b35e934..85f30a9a732 100644 --- a/programs/local/PythonImportCache.cpp +++ b/programs/local/PythonImportCache.cpp @@ -53,7 +53,7 @@ py::handle PythonImportCacheItem::AddCache(PythonImportCache & cache, py::object void PythonImportCacheItem::LoadModule(PythonImportCache & cache) { #if USE_JEMALLOC - ::Memory::MemoryCheckScope memory_check_scope; + ::Memory::MemoryCheckScope memory_check_scope; #endif try { diff --git a/programs/local/PythonImportCache.h b/programs/local/PythonImportCache.h index 6bdf5cf7c8f..598069a60e2 100644 --- a/programs/local/PythonImportCache.h +++ b/programs/local/PythonImportCache.h @@ -2,9 +2,13 @@ #include "DatetimeCacheItem.h" #include "DecimalCacheItem.h" +#include "NumpyCacheItem.h" #include "PandasCacheItem.h" #include "PyArrowCacheItem.h" #include "PythonImportCacheItem.h" +#include "UUIDCacheItem.h" +#include "IPAddressCacheItem.h" +#include "PytzCacheItem.h" #include @@ -13,7 +17,8 @@ namespace CHDB { struct PythonImportCache; using PythonImportCachePtr = std::shared_ptr; -struct PythonImportCache { +struct PythonImportCache +{ public: explicit PythonImportCache() = default; @@ -23,6 +28,10 @@ struct PythonImportCache { PyarrowCacheItem pyarrow; DatetimeCacheItem datetime; DecimalCacheItem decimal; + NumpyCacheItem numpy; + UUIDCacheItem uuid; + IPAddressCacheItem ipaddress; + PytzCacheItem pytz; py::handle AddCache(py::object item); diff --git a/programs/local/PytzCacheItem.h b/programs/local/PytzCacheItem.h new file mode 100644 index 00000000000..3c6fccbe858 --- /dev/null +++ b/programs/local/PytzCacheItem.h @@ -0,0 +1,19 @@ +#pragma once + +#include "PythonImportCacheItem.h" + +namespace CHDB { + +struct PytzCacheItem : public PythonImportCacheItem +{ +public: + static constexpr const char *Name = "pytz"; + + PytzCacheItem() : PythonImportCacheItem("pytz"), timezone("timezone", this) {} + + ~PytzCacheItem() override = default; + + PythonImportCacheItem 
timezone; +}; + +} // namespace CHDB diff --git a/programs/local/QueryResult.h b/programs/local/QueryResult.h index ebd79ec042e..bbd924e3931 100644 --- a/programs/local/QueryResult.h +++ b/programs/local/QueryResult.h @@ -64,6 +64,9 @@ class MaterializedQueryResult : public QueryResult { String string() { + if (!result_buffer) + return {}; + return String(result_buffer->begin(), result_buffer->end()); } diff --git a/programs/local/UUIDCacheItem.h b/programs/local/UUIDCacheItem.h new file mode 100644 index 00000000000..ee21b48ca22 --- /dev/null +++ b/programs/local/UUIDCacheItem.h @@ -0,0 +1,21 @@ +#pragma once + +#include "PythonImportCacheItem.h" + +namespace CHDB { + +struct UUIDCacheItem : public PythonImportCacheItem +{ +public: + static constexpr const char * Name = "uuid"; + + UUIDCacheItem() : PythonImportCacheItem("uuid"), UUID("UUID", this) + { + } + + ~UUIDCacheItem() override = default; + + PythonImportCacheItem UUID; +}; + +} // namespace CHDB diff --git a/src/Client/ClientBase.cpp b/src/Client/ClientBase.cpp index 4980f8d610f..aad4ad78e78 100644 --- a/src/Client/ClientBase.cpp +++ b/src/Client/ClientBase.cpp @@ -151,6 +151,11 @@ namespace ErrorCodes extern const int CANNOT_WRITE_TO_FILE; } +#if USE_PYTHON +/// Custom DataFrame format creator function pointer +static CustomOutputFormatCreator g_dataframe_format_creator = nullptr; +#endif + } namespace ProfileEvents @@ -644,6 +649,22 @@ try { if (!output_format) { +#if USE_PYTHON + if (Poco::toLower(default_output_format) == "dataframe") + { + auto creator = getDataFrameFormatCreator(); + if (creator) + { + output_format = creator(std::make_shared(block)); + return; + } + else + { + throw Exception(ErrorCodes::LOGICAL_ERROR, "DataFrame output format creator not set"); + } + } +#endif + /// Ignore all results when fuzzing as they can be huge. if (query_fuzzer_runs) { @@ -4035,4 +4056,16 @@ void ClientBase::showClientVersion() output_stream << VERSION_NAME << " " + getName() + " version " << VERSION_STRING << VERSION_OFFICIAL << "." << std::endl; } +#if USE_PYTHON +void ClientBase::setDataFrameFormatCreator(CustomOutputFormatCreator creator) +{ + g_dataframe_format_creator = std::move(creator); +} + +CustomOutputFormatCreator ClientBase::getDataFrameFormatCreator() +{ + return g_dataframe_format_creator; +} +#endif + } diff --git a/src/Client/ClientBase.h b/src/Client/ClientBase.h index 8e08e60e541..86fdb78d798 100644 --- a/src/Client/ClientBase.h +++ b/src/Client/ClientBase.h @@ -94,6 +94,11 @@ struct StreamingQueryContext StreamingQueryContext() = default; }; +#if USE_PYTHON +/// Function pointer type for creating custom output formats (e.g. DataFrame) +using CustomOutputFormatCreator = std::function(SharedHeader)>; +#endif + /** * The base class which encapsulates the core functionality of a client. * Can be used in a standalone application (clickhouse-client or clickhouse-local), @@ -353,6 +358,14 @@ class ClientBase String appendSmileyIfNeeded(const String & prompt); +#if USE_PYTHON + /// Set custom DataFrame format creator + static void setDataFrameFormatCreator(CustomOutputFormatCreator creator); + + /// Get custom DataFrame format creator + static CustomOutputFormatCreator getDataFrameFormatCreator(); +#endif + /// Should be one of the first, to be destroyed the last, /// since other members can use them. /// This holder may not be initialized in case if we run the client in the embedded mode (SSH). 
diff --git a/tests/test_complex_pyobj.py b/tests/test_complex_pyobj.py index 241fffc0105..e393f042e70 100644 --- a/tests/test_complex_pyobj.py +++ b/tests/test_complex_pyobj.py @@ -42,14 +42,43 @@ def test_df_with_na(self): self.assertEqual(ret.dtypes["E"], "object") self.assertEqual(ret.dtypes["F"], "object") self.assertEqual(ret.dtypes["G"], "object") - self.assertEqual( - str(ret), - """ A B C D E F G -0 1 4.0 True a [1, 2] {"a": 1, "b": 2} -1 2 5.0 False b [3, 4] {"c": 3, "d": 4} -2 3 6.0 True c [5, 6] {"e": 5, "f": 6} -3 """, - ) + self.assertEqual(ret.shape, (4, 7)) + + # Row 0 + self.assertEqual(ret.iloc[0]["A"], '1') + self.assertEqual(ret.iloc[0]["B"], '4.0') + self.assertEqual(ret.iloc[0]["C"], 'True') + self.assertEqual(ret.iloc[0]["D"], 'a') + self.assertEqual(ret.iloc[0]["E"], '') + self.assertEqual(ret.iloc[0]["F"], '[1, 2]') + self.assertEqual(ret.iloc[0]["G"], '{"a": 1, "b": 2}') + + # Row 1 + self.assertEqual(ret.iloc[1]["A"], '2') + self.assertEqual(ret.iloc[1]["B"], '5.0') + self.assertEqual(ret.iloc[1]["C"], 'False') + self.assertEqual(ret.iloc[1]["D"], 'b') + self.assertEqual(ret.iloc[1]["E"], '') + self.assertEqual(ret.iloc[1]["F"], '[3, 4]') + self.assertEqual(ret.iloc[1]["G"], '{"c": 3, "d": 4}') + + # Row 2 + self.assertEqual(ret.iloc[2]["A"], '3') + self.assertEqual(ret.iloc[2]["B"], '6.0') + self.assertEqual(ret.iloc[2]["C"], 'True') + self.assertEqual(ret.iloc[2]["D"], 'c') + self.assertEqual(ret.iloc[2]["E"], '') + self.assertEqual(ret.iloc[2]["F"], '[5, 6]') + self.assertEqual(ret.iloc[2]["G"], '{"e": 5, "f": 6}') + + # Row 3 + self.assertEqual(ret.iloc[3]["A"], '') + self.assertEqual(ret.iloc[3]["B"], '') + self.assertEqual(ret.iloc[3]["C"], '') + self.assertEqual(ret.iloc[3]["D"], '') + self.assertEqual(ret.iloc[3]["E"], '') + self.assertEqual(ret.iloc[3]["F"], '') + self.assertEqual(ret.iloc[3]["G"], '') def test_df_without_na(self): ret = chdb.query( @@ -65,14 +94,44 @@ def test_df_without_na(self): self.assertEqual(ret.dtypes["E"], "object") self.assertEqual(ret.dtypes["F"], "object") self.assertEqual(ret.dtypes["G"], "object") - self.assertEqual( - str(ret), - """ A B C D E F G -0 1 4.0 1 a a [1, 2] {"a": 1, "b": 2} -1 2 5.0 0 b b [3, 4] {"c": 3, "d": 4} -2 3 6.0 1 c c [5, 6] {"e": 5, "f": 6} -3 4 7.0 0 d d [7, 8] {"g": 7, "h": 8}""", - ) + + self.assertEqual(ret.shape, (4, 7)) + + # Row 0 + self.assertEqual(ret.iloc[0]["A"], 1) + self.assertEqual(ret.iloc[0]["B"], 4.0) + self.assertEqual(ret.iloc[0]["C"], 1) + self.assertEqual(ret.iloc[0]["D"], "a") + self.assertEqual(ret.iloc[0]["E"], "a") + self.assertEqual(ret.iloc[0]["F"], '[1, 2]') + self.assertEqual(ret.iloc[0]["G"], {"a": 1, "b": 2}) + + # Row 1 + self.assertEqual(ret.iloc[1]["A"], 2) + self.assertEqual(ret.iloc[1]["B"], 5.0) + self.assertEqual(ret.iloc[1]["C"], 0) + self.assertEqual(ret.iloc[1]["D"], "b") + self.assertEqual(ret.iloc[1]["E"], "b") + self.assertEqual(ret.iloc[1]["F"], '[3, 4]') + self.assertEqual(ret.iloc[1]["G"], {"c": 3, "d": 4}) + + # Row 2 + self.assertEqual(ret.iloc[2]["A"], 3) + self.assertEqual(ret.iloc[2]["B"], 6.0) + self.assertEqual(ret.iloc[2]["C"], 1) + self.assertEqual(ret.iloc[2]["D"], "c") + self.assertEqual(ret.iloc[2]["E"], "c") + self.assertEqual(ret.iloc[2]["F"], '[5, 6]') + self.assertEqual(ret.iloc[2]["G"], {"e": 5, "f": 6}) + + # Row 3 + self.assertEqual(ret.iloc[3]["A"], 4) + self.assertEqual(ret.iloc[3]["B"], 7.0) + self.assertEqual(ret.iloc[3]["C"], 0) + self.assertEqual(ret.iloc[3]["D"], "d") + self.assertEqual(ret.iloc[3]["E"], "d") + 
self.assertEqual(ret.iloc[3]["F"], '[7, 8]') + self.assertEqual(ret.iloc[3]["G"], {"g": 7, "h": 8}) if __name__ == "__main__": diff --git a/tests/test_dataframe_column_types_1.py b/tests/test_dataframe_column_types_1.py new file mode 100644 index 00000000000..17c96140934 --- /dev/null +++ b/tests/test_dataframe_column_types_1.py @@ -0,0 +1,2065 @@ +#!/usr/bin/env python3 + +import unittest +import pandas as pd +import chdb +from datetime import datetime, date, timezone, timedelta +import numpy as np +import math +import uuid +import ipaddress + + +class TestDataFrameColumnTypesOne(unittest.TestCase): + + def setUp(self): + self.session = chdb.session.Session() + self.shanghai_tz = timezone(timedelta(hours=8)) + + def tearDown(self): + self.session.close() + + def test_integer_types(self): + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toInt8(-128) as int8_val, + toInt16(-32768) as int16_val, + toInt32(-2147483648) as int32_val, + toInt64(-9223372036854775808) as int64_val, + toInt128('-170141183460469231731687303715884105728') as int128_val, + toInt256('-57896044618658097711785492504343953926634992332820282019728792003956564819968') as int256_val, + toUInt8(255) as UInt8_val, + toUInt16(65535) as uint16_val, + toUInt32(4294967295) as uint32_val, + toUInt64(18446744073709551615) as uint64_val, + toUInt128('340282366920938463463374607431768211455') as uint128_val, + toUInt256('115792089237316195423570985008687907853269984665640564039457584007913129639935') as uint256_val + UNION ALL + SELECT + 2 as row_id, + toInt8(127) as int8_val, + toInt16(32767) as int16_val, + toInt32(2147483647) as int32_val, + toInt64(9223372036854775807) as int64_val, + toInt128('170141183460469231731687303715884105727') as int128_val, + toInt256('57896044618658097711785492504343953926634992332820282019728792003956564819967') as int256_val, + toUInt8(254) as UInt8_val, + toUInt16(65534) as uint16_val, + toUInt32(4294967294) as uint32_val, + toUInt64(18446744073709551614) as uint64_val, + toUInt128('340282366920938463463374607431768211454') as uint128_val, + toUInt256('115792089237316195423570985008687907853269984665640564039457584007913129639934') as uint256_val + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row (minimum/maximum values) + self.assertEqual(ret.iloc[0]["int8_val"], -128) + self.assertEqual(ret.iloc[0]["int16_val"], -32768) + self.assertEqual(ret.iloc[0]["int32_val"], -2147483648) + self.assertEqual(ret.iloc[0]["int64_val"], -9223372036854775808) + self.assertEqual(ret.iloc[0]["int128_val"], float(-170141183460469231731687303715884105728)) + self.assertEqual(ret.iloc[0]["int256_val"], float(-57896044618658097711785492504343953926634992332820282019728792003956564819968)) + self.assertEqual(ret.iloc[0]["UInt8_val"], 255) + self.assertEqual(ret.iloc[0]["uint16_val"], 65535) + self.assertEqual(ret.iloc[0]["uint32_val"], 4294967295) + self.assertEqual(ret.iloc[0]["uint64_val"], 18446744073709551615) + self.assertEqual(ret.iloc[0]["uint128_val"], float(340282366920938463463374607431768211455)) + self.assertEqual(ret.iloc[0]["uint256_val"], float(115792089237316195423570985008687907853269984665640564039457584007913129639935)) + + # Test second row (maximum/near-maximum values) + self.assertEqual(ret.iloc[1]["int8_val"], 127) + self.assertEqual(ret.iloc[1]["int16_val"], 32767) + self.assertEqual(ret.iloc[1]["int32_val"], 2147483647) + 
self.assertEqual(ret.iloc[1]["int64_val"], 9223372036854775807) + self.assertEqual(ret.iloc[1]["int128_val"], float(170141183460469231731687303715884105727)) + self.assertEqual(ret.iloc[1]["int256_val"], float(57896044618658097711785492504343953926634992332820282019728792003956564819967)) + self.assertEqual(ret.iloc[1]["UInt8_val"], 254) + self.assertEqual(ret.iloc[1]["uint16_val"], 65534) + self.assertEqual(ret.iloc[1]["uint32_val"], 4294967294) + self.assertEqual(ret.iloc[1]["uint64_val"], 18446744073709551614) + self.assertEqual(ret.iloc[1]["uint128_val"], float(340282366920938463463374607431768211454)) + self.assertEqual(ret.iloc[1]["uint256_val"], float(115792089237316195423570985008687907853269984665640564039457584007913129639934)) + + # Precise data type validation + expected_types = { + "int8_val": "int8", + "int16_val": "int16", + "int32_val": "int32", + "int64_val": "int64", + "int128_val": "float64", # Int128 mapped to float64 in ClickHouse->pandas conversion + "int256_val": "float64", # Int256 mapped to float64 in ClickHouse->pandas conversion + "UInt8_val": "uint8", + "uint16_val": "uint16", + "uint32_val": "uint32", + "uint64_val": "uint64", + "uint128_val": "float64", # UInt128 mapped to float64 in ClickHouse->pandas conversion + "uint256_val": "float64" # UInt256 mapped to float64 in ClickHouse->pandas conversion + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_float_types(self): + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toFloat32(3.14159265) as float32_val, + toFloat32(-3.40282347e+38) as float32_min, + toFloat32(3.40282347e+38) as float32_max, + toFloat64(2.718281828459045) as float64_val, + toFloat64(-1.7976931348623157e+308) as float64_min, + toFloat64(1.7976931348623157e+308) as float64_max, + toBFloat16(1.5) as bfloat16_val, + toBFloat16(-3.389531389e+38) as bfloat16_min, + toBFloat16(3.389531389e+38) as bfloat16_max + UNION ALL + SELECT + 2 as row_id, + toFloat32(0.0) as float32_val, + toFloat32(1.175494351e-38) as float32_min, + toFloat32(-1.175494351e-38) as float32_max, + toFloat64(0.0) as float64_val, + toFloat64(2.2250738585072014e-308) as float64_min, + toFloat64(-2.2250738585072014e-308) as float64_max, + toBFloat16(0.0) as bfloat16_val, + toBFloat16(1.175494351e-38) as bfloat16_min, + toBFloat16(-1.175494351e-38) as bfloat16_max + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[1][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - regular and extreme values + self.assertAlmostEqual(ret.iloc[0]["float32_val"], 3.14159265, places=6) + self.assertAlmostEqual(ret.iloc[0]["float32_min"], -3.40282347e+38, delta=1e30) + self.assertAlmostEqual(ret.iloc[0]["float32_max"], 3.40282347e+38, delta=1e30) + self.assertAlmostEqual(ret.iloc[0]["float64_val"], 2.718281828459045, places=15) + self.assertAlmostEqual(ret.iloc[0]["float64_min"], -1.7976931348623157e+308, delta=1e300) + self.assertAlmostEqual(ret.iloc[0]["float64_max"], 1.7976931348623157e+308, delta=1e300) + self.assertAlmostEqual(ret.iloc[0]["bfloat16_val"], 1.5, places=2) + self.assertAlmostEqual(ret.iloc[0]["bfloat16_min"], -3.389531389e+38, delta=1e30) + self.assertAlmostEqual(ret.iloc[0]["bfloat16_max"], 3.389531389e+38, delta=1e30) + + # Test second row - zero and small values + self.assertEqual(ret.iloc[1]["float32_val"], 0.0) + 
self.assertAlmostEqual(ret.iloc[1]["float32_min"], 1.175494351e-38, delta=1e-40) + self.assertAlmostEqual(ret.iloc[1]["float32_max"], -1.175494351e-38, delta=1e-40) + self.assertEqual(ret.iloc[1]["float64_val"], 0.0) + self.assertAlmostEqual(ret.iloc[1]["float64_min"], 2.2250738585072014e-308, delta=1e-310) + self.assertAlmostEqual(ret.iloc[1]["float64_max"], -2.2250738585072014e-308, delta=1e-310) + self.assertEqual(ret.iloc[1]["bfloat16_val"], 0.0) + self.assertAlmostEqual(ret.iloc[1]["bfloat16_min"], 1.175494351e-38, delta=1e-40) + self.assertAlmostEqual(ret.iloc[1]["bfloat16_max"], -1.175494351e-38, delta=1e-40) + + # Precise data type validation + expected_types = { + "float32_val": "float32", + "float32_min": "float32", + "float32_max": "float32", + "float64_val": "float64", + "float64_min": "float64", + "float64_max": "float64", + "bfloat16_val": "float32", # BFloat16 typically mapped to float32 in pandas + "bfloat16_min": "float32", + "bfloat16_max": "float32" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_float_special_values(self): + """Test Infinity and NaN values for all float types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toFloat32(1.0/0.0) as float32_pos_inf, + toFloat32(-1.0/0.0) as float32_neg_inf, + toFloat32(0.0/0.0) as float32_nan, + toFloat64(1.0/0.0) as float64_pos_inf, + toFloat64(-1.0/0.0) as float64_neg_inf, + toFloat64(0.0/0.0) as float64_nan, + toBFloat16(1.0/0.0) as bfloat16_pos_inf, + toBFloat16(-1.0/0.0) as bfloat16_neg_inf, + toBFloat16(0.0/0.0) as bfloat16_nan + UNION ALL + SELECT + 2 as row_id, + toFloat32(1.0/0.0) as float32_pos_inf, + toFloat32(-1.0/0.0) as float32_neg_inf, + toFloat32(0.0/0.0) as float32_nan, + toFloat64(1.0/0.0) as float64_pos_inf, + toFloat64(-1.0/0.0) as float64_neg_inf, + toFloat64(0.0/0.0) as float64_nan, + toBFloat16(1.0/0.0) as bfloat16_pos_inf, + toBFloat16(-1.0/0.0) as bfloat16_neg_inf, + toBFloat16(0.0/0.0) as bfloat16_nan + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test Float32 special values + self.assertTrue(math.isinf(ret.iloc[0]["float32_pos_inf"])) + self.assertTrue(ret.iloc[0]["float32_pos_inf"] > 0) # positive infinity + self.assertTrue(math.isinf(ret.iloc[0]["float32_neg_inf"])) + self.assertTrue(ret.iloc[0]["float32_neg_inf"] < 0) # negative infinity + self.assertTrue(math.isnan(ret.iloc[0]["float32_nan"])) + + # Test Float64 special values + self.assertTrue(math.isinf(ret.iloc[0]["float64_pos_inf"])) + self.assertTrue(ret.iloc[0]["float64_pos_inf"] > 0) # positive infinity + self.assertTrue(math.isinf(ret.iloc[0]["float64_neg_inf"])) + self.assertTrue(ret.iloc[0]["float64_neg_inf"] < 0) # negative infinity + self.assertTrue(math.isnan(ret.iloc[0]["float64_nan"])) + + # Test BFloat16 special values + self.assertTrue(math.isinf(ret.iloc[0]["bfloat16_pos_inf"])) + self.assertTrue(ret.iloc[0]["bfloat16_pos_inf"] > 0) # positive infinity + self.assertTrue(math.isinf(ret.iloc[0]["bfloat16_neg_inf"])) + self.assertTrue(ret.iloc[0]["bfloat16_neg_inf"] < 0) # negative infinity + self.assertTrue(math.isnan(ret.iloc[0]["bfloat16_nan"])) + + # Test second row (same values, consistency check) + self.assertTrue(math.isinf(ret.iloc[1]["float32_pos_inf"])) + self.assertTrue(ret.iloc[1]["float32_pos_inf"] > 0) + 
self.assertTrue(math.isinf(ret.iloc[1]["float32_neg_inf"])) + self.assertTrue(ret.iloc[1]["float32_neg_inf"] < 0) + self.assertTrue(math.isnan(ret.iloc[1]["float32_nan"])) + + self.assertTrue(math.isinf(ret.iloc[1]["float64_pos_inf"])) + self.assertTrue(ret.iloc[1]["float64_pos_inf"] > 0) + self.assertTrue(math.isinf(ret.iloc[1]["float64_neg_inf"])) + self.assertTrue(ret.iloc[1]["float64_neg_inf"] < 0) + self.assertTrue(math.isnan(ret.iloc[1]["float64_nan"])) + + self.assertTrue(math.isinf(ret.iloc[1]["bfloat16_pos_inf"])) + self.assertTrue(ret.iloc[1]["bfloat16_pos_inf"] > 0) + self.assertTrue(math.isinf(ret.iloc[1]["bfloat16_neg_inf"])) + self.assertTrue(ret.iloc[1]["bfloat16_neg_inf"] < 0) + self.assertTrue(math.isnan(ret.iloc[1]["bfloat16_nan"])) + + # Precise data type validation + expected_types = { + "float32_pos_inf": "float32", + "float32_neg_inf": "float32", + "float32_nan": "float32", + "float64_pos_inf": "float64", + "float64_neg_inf": "float64", + "float64_nan": "float64", + "bfloat16_pos_inf": "float32", # BFloat16 typically mapped to float32 in pandas + "bfloat16_neg_inf": "float32", + "bfloat16_nan": "float32" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_decimal_types(self): + """Test Decimal32, Decimal64, Decimal128, Decimal256 types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toDecimal32('123.456', 3) as decimal32_val, + toDecimal32('-999999.999', 3) as decimal32_min, + toDecimal32('999999.999', 3) as decimal32_max, + toDecimal64('123456.789012', 6) as decimal64_val, + toDecimal64('-999999999999.999999', 6) as decimal64_min, + toDecimal64('999999999999.999999', 6) as decimal64_max, + toDecimal128('12345678901234567890123456789.123456789', 9) as decimal128_val, + toDecimal128('-12345678901234567890123456789.123456789', 9) as decimal128_min, + toDecimal128('12345678901234567890123456789.123456789', 9) as decimal128_max, + toDecimal256('1234567890123456789012345678901234567890123456789012345678.123456789012345678', 18) as decimal256_val, + toDecimal256('-1234567890123456789012345678901234567890123456789012345678.123456789012345678', 18) as decimal256_min, + toDecimal256('1234567890123456789012345678901234567890123456789012345678.123456789012345678', 18) as decimal256_max + UNION ALL + SELECT + 2 as row_id, + toDecimal32('0.001', 3) as decimal32_val, + toDecimal32('0.000', 3) as decimal32_min, + toDecimal32('1.000', 3) as decimal32_max, + toDecimal64('0.000001', 6) as decimal64_val, + toDecimal64('0.000000', 6) as decimal64_min, + toDecimal64('1.000000', 6) as decimal64_max, + toDecimal128('0.000000001', 9) as decimal128_val, + toDecimal128('0.000000000', 9) as decimal128_min, + toDecimal128('1.000000000', 9) as decimal128_max, + toDecimal256('0.000000000000000001', 18) as decimal256_val, + toDecimal256('0.000000000000000000', 18) as decimal256_min, + toDecimal256('1.000000000000000000', 18) as decimal256_max + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - regular and extreme decimal values (converted to float64) + self.assertAlmostEqual(ret.iloc[0]["decimal32_val"], 123.456, places=3) + self.assertAlmostEqual(ret.iloc[0]["decimal32_min"], -999999.999, places=3) + self.assertAlmostEqual(ret.iloc[0]["decimal32_max"], 999999.999, places=3) + + 
self.assertAlmostEqual(ret.iloc[0]["decimal64_val"], 123456.789012, places=6) + self.assertAlmostEqual(ret.iloc[0]["decimal64_min"], -999999999999.999999, places=6) + self.assertAlmostEqual(ret.iloc[0]["decimal64_max"], 999999999999.999999, places=6) + + self.assertAlmostEqual(ret.iloc[0]["decimal128_val"], 12345678901234567890123456789.123456789, delta=1e20) + self.assertAlmostEqual(ret.iloc[0]["decimal128_min"], -12345678901234567890123456789.123456789, delta=1e20) + self.assertAlmostEqual(ret.iloc[0]["decimal128_max"], 12345678901234567890123456789.123456789, delta=1e20) + + self.assertAlmostEqual(ret.iloc[0]["decimal256_val"], 1234567890123456789012345678901234567890123456789012345678.123456789012345678, delta=1e50) + self.assertAlmostEqual(ret.iloc[0]["decimal256_min"], -1234567890123456789012345678901234567890123456789012345678.123456789012345678, delta=1e50) + self.assertAlmostEqual(ret.iloc[0]["decimal256_max"], 1234567890123456789012345678901234567890123456789012345678.123456789012345678, delta=1e50) + + # Test second row - small decimal values (converted to float64) + self.assertAlmostEqual(ret.iloc[1]["decimal32_val"], 0.001, places=3) + self.assertEqual(ret.iloc[1]["decimal32_min"], 0.000) + self.assertAlmostEqual(ret.iloc[1]["decimal32_max"], 1.000, places=3) + + self.assertAlmostEqual(ret.iloc[1]["decimal64_val"], 0.000001, places=6) + self.assertEqual(ret.iloc[1]["decimal64_min"], 0.000000) + self.assertAlmostEqual(ret.iloc[1]["decimal64_max"], 1.000000, places=6) + + self.assertAlmostEqual(ret.iloc[1]["decimal128_val"], 0.000000001, places=9) + self.assertEqual(ret.iloc[1]["decimal128_min"], 0.000000000) + self.assertAlmostEqual(ret.iloc[1]["decimal128_max"], 1.000000000, places=9) + + self.assertAlmostEqual(ret.iloc[1]["decimal256_val"], 0.000000000000000001, places=18) + self.assertEqual(ret.iloc[1]["decimal256_min"], 0.000000000000000000) + self.assertAlmostEqual(ret.iloc[1]["decimal256_max"], 1.000000000000000000, places=18) + + # Precise data type validation + expected_types = { + "decimal32_val": "float64", # Decimal types mapped to float64 in ClickHouse->pandas conversion + "decimal32_min": "float64", + "decimal32_max": "float64", + "decimal64_val": "float64", + "decimal64_min": "float64", + "decimal64_max": "float64", + "decimal128_val": "float64", + "decimal128_min": "float64", + "decimal128_max": "float64", + "decimal256_val": "float64", + "decimal256_min": "float64", + "decimal256_max": "float64" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_string_types(self): + """Test String, FixedString, and LowCardinality string types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toString('Hello World') as string_val, + toFixedString('Fixed', 10) as fixed_string_val, + toLowCardinality('Category A') as low_cardinality_val, + toString('') as empty_string, + toString('Unicode: 🌍 éñáíóú') as unicode_string, + toString('Special chars: \\t\\n\\r\\"\\\'') as special_chars, + toString('Very long string with many characters to test maximum length handling and memory allocation behavior') as long_string, + toFixedString('ABC', 5) as fixed_string_short, + toLowCardinality('') as low_cardinality_empty + UNION ALL + SELECT + 2 as row_id, + toString('Another string') as string_val, + toFixedString('Test123', 10) as fixed_string_val, + toLowCardinality('Category B') as low_cardinality_val, + toString('Non-empty') as empty_string, + toString('More 
Unicode: 🚀 ñáéíóú àèìòù') as unicode_string, + toString('Line breaks:\\nTab:\\tQuote:\\"') as special_chars, + toString('Short') as long_string, + toFixedString('XYZZZ', 5) as fixed_string_short, + toLowCardinality('Option 2') as low_cardinality_empty + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - various string types + self.assertEqual(ret.iloc[0]["string_val"], "Hello World") + self.assertEqual(ret.iloc[0]["fixed_string_val"], "Fixed\x00\x00\x00\x00\x00") # FixedString pads with null bytes + self.assertEqual(ret.iloc[0]["low_cardinality_val"], "Category A") + self.assertEqual(ret.iloc[0]["empty_string"], "") + self.assertEqual(ret.iloc[0]["unicode_string"], "Unicode: 🌍 éñáíóú") + self.assertEqual(ret.iloc[0]["special_chars"], "Special chars: \t\n\r\"'") # ClickHouse interprets escape sequences + self.assertEqual(ret.iloc[0]["long_string"], "Very long string with many characters to test maximum length handling and memory allocation behavior") + self.assertEqual(ret.iloc[0]["fixed_string_short"], "ABC\x00\x00") # Padded to 5 chars + self.assertEqual(ret.iloc[0]["low_cardinality_empty"], "") + + # Test second row - different string values + self.assertEqual(ret.iloc[1]["string_val"], "Another string") + self.assertEqual(ret.iloc[1]["fixed_string_val"], "Test123\x00\x00\x00") # Padded to 10 chars + self.assertEqual(ret.iloc[1]["low_cardinality_val"], "Category B") + self.assertEqual(ret.iloc[1]["empty_string"], "Non-empty") + self.assertEqual(ret.iloc[1]["unicode_string"], "More Unicode: 🚀 ñáéíóú àèìòù") + self.assertEqual(ret.iloc[1]["special_chars"], "Line breaks:\nTab:\tQuote:\"") # ClickHouse interprets escape sequences + self.assertEqual(ret.iloc[1]["long_string"], "Short") + self.assertEqual(ret.iloc[1]["fixed_string_short"], "XYZZZ") # Exactly 5 chars, no padding + self.assertEqual(ret.iloc[1]["low_cardinality_empty"], "Option 2") + + # Precise data type validation + expected_types = { + "string_val": "object", # String types mapped to object in pandas + "fixed_string_val": "object", + "low_cardinality_val": "object", + "empty_string": "object", + "unicode_string": "object", + "special_chars": "object", + "long_string": "object", + "fixed_string_short": "object", + "low_cardinality_empty": "object" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_date_types(self): + """Test Date and Date32 types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toDate('2023-12-25') as date_val, + toDate('1970-01-01') as date_min, + toDate('2149-06-06') as date_max, + toDate32('2023-12-25') as date32_val, + toDate32('1900-01-01') as date32_min, + toDate32('2299-12-31') as date32_max, + toDate('2000-02-29') as date_leap_year, + toDate32('2000-02-29') as date32_leap_year, + toDate32('1950-06-15') as date32_negative_1, + toDate32('1960-12-31') as date32_negative_2, + toDate32('1969-12-31') as date32_before_epoch + UNION ALL + SELECT + 2 as row_id, + toDate('1970-01-01') as date_val, + toDate('2023-01-01') as date_min, + toDate('2023-12-31') as date_max, + toDate32('1970-01-01') as date32_val, + toDate32('2023-01-01') as date32_min, + toDate32('2023-12-31') as date32_max, + toDate('2024-02-29') as date_leap_year, + toDate32('2024-02-29') as date32_leap_year, + toDate32('1945-05-08') as date32_negative_1, + 
toDate32('1955-03-20') as date32_negative_2, + toDate32('1968-07-20') as date32_before_epoch + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - specific dates (Date types include time component 00:00:00) + self.assertIn("2023-12-25", str(ret.iloc[0]["date_val"])) + self.assertIn("1970-01-01", str(ret.iloc[0]["date_min"])) + self.assertIn("2149-06-06", str(ret.iloc[0]["date_max"])) + self.assertIn("2023-12-25", str(ret.iloc[0]["date32_val"])) + self.assertIn("1900-01-01", str(ret.iloc[0]["date32_min"])) + self.assertIn("2299-12-31", str(ret.iloc[0]["date32_max"])) + self.assertIn("2000-02-29", str(ret.iloc[0]["date_leap_year"])) + self.assertIn("2000-02-29", str(ret.iloc[0]["date32_leap_year"])) + # Test Date32 negative values (before 1970 epoch) + self.assertIn("1950-06-15", str(ret.iloc[0]["date32_negative_1"])) + self.assertIn("1960-12-31", str(ret.iloc[0]["date32_negative_2"])) + self.assertIn("1969-12-31", str(ret.iloc[0]["date32_before_epoch"])) + + # Test second row - different dates + self.assertIn("1970-01-01", str(ret.iloc[1]["date_val"])) + self.assertIn("2023-01-01", str(ret.iloc[1]["date_min"])) + self.assertIn("2023-12-31", str(ret.iloc[1]["date_max"])) + self.assertIn("1970-01-01", str(ret.iloc[1]["date32_val"])) + self.assertIn("2023-01-01", str(ret.iloc[1]["date32_min"])) + self.assertIn("2023-12-31", str(ret.iloc[1]["date32_max"])) + self.assertIn("2024-02-29", str(ret.iloc[1]["date_leap_year"])) + self.assertIn("2024-02-29", str(ret.iloc[1]["date32_leap_year"])) + # Test Date32 negative values (before 1970 epoch) - second row + self.assertIn("1945-05-08", str(ret.iloc[1]["date32_negative_1"])) + self.assertIn("1955-03-20", str(ret.iloc[1]["date32_negative_2"])) + self.assertIn("1968-07-20", str(ret.iloc[1]["date32_before_epoch"])) + + # Precise data type validation + expected_types = { + "date_val": "datetime64[s]", # Date types mapped to datetime64[s] in pandas + "date_min": "datetime64[s]", + "date_max": "datetime64[s]", + "date32_val": "datetime64[s]", + "date32_min": "datetime64[s]", + "date32_max": "datetime64[s]", + "date_leap_year": "datetime64[s]", + "date32_leap_year": "datetime64[s]", + "date32_negative_1": "datetime64[s]", # Date32 negative values (before 1970) + "date32_negative_2": "datetime64[s]", + "date32_before_epoch": "datetime64[s]" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_time_types(self): + """Test Time and Time64 types""" + # Enable Time and Time64 types + self.session.query("SET enable_time_time64_type = 1") + + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + CAST('14:30:45' AS Time) as time_val, + CAST('00:00:00' AS Time) as time_min, + CAST('23:59:59' AS Time) as time_max, + CAST('14:30:45.123456' AS Time64(6)) as time64_val, + CAST('00:00:00.000000' AS Time64(6)) as time64_min, + CAST('23:59:59.999999' AS Time64(6)) as time64_max, + CAST('12:00:00.123' AS Time64(3)) as time64_ms, + CAST('18:45:30.987654321' AS Time64(9)) as time64_ns + UNION ALL + SELECT + 2 as row_id, + CAST('09:15:30' AS Time) as time_val, + CAST('12:00:00' AS Time) as time_min, + CAST('18:45:15' AS Time) as time_max, + CAST('09:15:30.654321' AS Time64(6)) as time64_val, + CAST('12:30:45.500000' AS Time64(6)) as time64_min, + CAST('20:15:30.111111' AS Time64(6)) as time64_max, + 
CAST('08:30:15.500' AS Time64(3)) as time64_ms, + CAST('16:20:10.123456789' AS Time64(9)) as time64_ns + UNION ALL + SELECT + 3 as row_id, + CAST(-3600 AS Time) as time_val, -- -1 hour as negative seconds + CAST(-7200 AS Time) as time_min, -- -2 hours as negative seconds + CAST(-1800 AS Time) as time_max, -- -30 minutes as negative seconds + CAST(-3661.123456 AS Time64(6)) as time64_val, -- -1h 1m 1.123456s + CAST(-7322.500000 AS Time64(6)) as time64_min, -- -2h 2m 2.5s + CAST(-1801.999999 AS Time64(6)) as time64_max, -- -30m 1.999999s + CAST(-3723.500 AS Time64(3)) as time64_ms, -- -1h 2m 3.5s + CAST(-5434.123456789 AS Time64(9)) as time64_ns -- -1h 30m 34.123456789s + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - time values + self.assertIn("14:30:45", str(ret.iloc[0]["time_val"])) + self.assertIn("00:00:00", str(ret.iloc[0]["time_min"])) + self.assertIn("23:59:59", str(ret.iloc[0]["time_max"])) + self.assertIn("14:30:45", str(ret.iloc[0]["time64_val"])) + self.assertIn("00:00:00", str(ret.iloc[0]["time64_min"])) + self.assertIn("23:59:59", str(ret.iloc[0]["time64_max"])) + self.assertIn("12:00:00", str(ret.iloc[0]["time64_ms"])) + self.assertIn("18:45:30", str(ret.iloc[0]["time64_ns"])) + + # Test second row - different time values + self.assertIn("09:15:30", str(ret.iloc[1]["time_val"])) + self.assertIn("12:00:00", str(ret.iloc[1]["time_min"])) + self.assertIn("18:45:15", str(ret.iloc[1]["time_max"])) + self.assertIn("09:15:30", str(ret.iloc[1]["time64_val"])) + self.assertIn("12:30:45", str(ret.iloc[1]["time64_min"])) + self.assertIn("20:15:30", str(ret.iloc[1]["time64_max"])) + self.assertIn("08:30:15", str(ret.iloc[1]["time64_ms"])) + self.assertIn("16:20:10", str(ret.iloc[1]["time64_ns"])) + + # Test third row - negative time values (should be returned as string numbers) + # Since Python time types don't support negative values, they are returned as numeric strings + self.assertEqual(ret.iloc[2]["time_val"], "-3600") # -1 hour + self.assertEqual(ret.iloc[2]["time_min"], "-7200") # -2 hours + self.assertEqual(ret.iloc[2]["time_max"], "-1800") # -30 minutes + self.assertEqual(ret.iloc[2]["time64_val"], "-3661.123456") # -1h 1m 1.123456s + self.assertEqual(ret.iloc[2]["time64_min"], "-7322.5") # -2h 2m 2.5s + self.assertEqual(ret.iloc[2]["time64_max"], "-1801.999999") # -30m 1.999999s + self.assertEqual(ret.iloc[2]["time64_ms"], "-3723.5") # -1h 2m 3.5s + self.assertEqual(ret.iloc[2]["time64_ns"], "-5434.123456789") # -1h 30m 34.123456789s + + # Verify negative values are returned as strings (object dtype) + for col in ["time_val", "time_min", "time_max", "time64_val", "time64_min", "time64_max", "time64_ms", "time64_ns"]: + self.assertIsInstance(ret.iloc[2][col], str, f"{col} should be string for negative values") + + # Precise data type validation + expected_types = { + "time_val": "object", # Time types mapped to object in pandas + "time_min": "object", + "time_max": "object", + "time64_val": "object", + "time64_min": "object", + "time64_max": "object", + "time64_ms": "object", + "time64_ns": "object" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_datetime_types(self): + """Test DateTime and DateTime64 types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toDateTime('2023-12-25 14:30:45', 
'Asia/Shanghai') as datetime_val, + toDateTime('1970-01-02 00:00:00', 'Asia/Shanghai') as datetime_min, + toDateTime('2106-02-07 06:28:15', 'Asia/Shanghai') as datetime_max, + toDateTime64('2023-12-25 14:30:45.123456', 6, 'Asia/Shanghai') as datetime64_val, + toDateTime64('1902-01-01 00:00:00.000000', 6, 'Asia/Shanghai') as datetime64_min, + toDateTime64('2099-12-31 10:59:59.999999', 6, 'Asia/Shanghai') as datetime64_max, + toDateTime64('2023-12-25 14:30:45.123456789', 9, 'Asia/Shanghai') as datetime64_ns, + toDateTime('2023-06-15 12:00:00', 'UTC') as datetime_utc, + toDateTime('2023-06-15 15:30:00', 'Europe/London') as datetime_london, + toDateTime64('2023-06-15 12:00:00.123', 3, 'Asia/Shanghai') as datetime64_tz_sh, + toDateTime64('2023-06-15 12:00:00.456', 3, 'America/New_York') as datetime64_tz_ny + UNION ALL + SELECT + 2 as row_id, + toDateTime('2000-02-29 09:15:30', 'Asia/Shanghai') as datetime_val, + toDateTime('2023-01-01 12:30:45', 'Asia/Shanghai') as datetime_min, + toDateTime('2023-12-31 18:45:15', 'Asia/Shanghai') as datetime_max, + toDateTime64('2000-02-29 09:15:30.654321', 6, 'Asia/Shanghai') as datetime64_val, + toDateTime64('2023-01-01 08:00:00.111111', 6, 'Asia/Shanghai') as datetime64_min, + toDateTime64('2023-12-31 20:30:45.888888', 6, 'Asia/Shanghai') as datetime64_max, + toDateTime64('2000-02-29 09:15:30.987654321', 9, 'Asia/Shanghai') as datetime64_ns, + toDateTime('2024-01-15 08:30:00', 'UTC') as datetime_utc, + toDateTime('2024-01-15 20:00:00', 'Europe/London') as datetime_london, + toDateTime64('2024-01-15 16:45:30.789', 3, 'Asia/Shanghai') as datetime64_tz_sh, + toDateTime64('2024-01-15 09:15:45.987', 3, 'America/New_York') as datetime64_tz_ny + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Note: Historical timezone offsets vary for the same location across different periods. + # For example, in 1900, Shanghai had a UTC offset of +8:05:43 (8 hours 5 minutes 43 seconds). 
+        # So executing session.query("select toDateTime64('1900-01-01 00:00:00.000000', 6, 'Asia/Shanghai')", "DataFrame")
+        # would output 1900-01-01 00:00:17+08:06 in pandas instead of the standard +08:00
+
+
+        # Test first row - exact datetime values
+        # DateTime (second precision) - ClickHouse uses the server timezone
+        # These assertions assume the ClickHouse session timezone is Asia/Shanghai
+        actual_tz = "Asia/Shanghai"
+
+        self.assertEqual(ret.iloc[0]["datetime_val"], pd.Timestamp('2023-12-25 14:30:45', tz=actual_tz))
+        self.assertEqual(ret.iloc[0]["datetime_min"], pd.Timestamp('1970-01-02 00:00:00', tz=actual_tz))
+        self.assertEqual(ret.iloc[0]["datetime_max"], pd.Timestamp('2106-02-07 06:28:15', tz=actual_tz))
+
+        # DateTime64 (microsecond precision) - should use same timezone as ClickHouse server
+        self.assertEqual(ret.iloc[0]["datetime64_val"], pd.Timestamp('2023-12-25 14:30:45.123456', tz=actual_tz))
+        self.assertEqual(ret.iloc[0]["datetime64_min"], pd.Timestamp('1902-01-01 00:00:00.000000', tz=actual_tz))
+        self.assertEqual(ret.iloc[0]["datetime64_max"], pd.Timestamp('2099-12-31 10:59:59.999999', tz=actual_tz))
+
+        # DateTime64 (nanosecond precision) - should use same timezone as ClickHouse server
+        self.assertEqual(ret.iloc[0]["datetime64_ns"], pd.Timestamp('2023-12-25 14:30:45.123456789', tz=actual_tz))
+
+        # UTC timezone datetime
+        expected_utc = pd.Timestamp('2023-06-15 12:00:00', tz='UTC')
+        actual_utc = ret.iloc[0]["datetime_utc"]
+        self.assertEqual(actual_utc, expected_utc)
+
+        # Europe/London timezone datetime
+        expected_london = pd.Timestamp('2023-06-15 15:30:00', tz='Europe/London')
+        actual_london = ret.iloc[0]["datetime_london"]
+        self.assertEqual(actual_london, expected_london)
+
+        # Timezone-aware datetime64 - Asia/Shanghai
+        expected_sh = pd.Timestamp('2023-06-15 12:00:00.123', tz='Asia/Shanghai')
+        actual_sh = ret.iloc[0]["datetime64_tz_sh"]
+        self.assertEqual(actual_sh, expected_sh)
+
+        # Timezone-aware datetime64 - America/New_York
+        expected_ny = pd.Timestamp('2023-06-15 12:00:00.456', tz='America/New_York')
+        actual_ny = ret.iloc[0]["datetime64_tz_ny"]
+        self.assertEqual(actual_ny, expected_ny)
+
+        # Test second row - exact datetime values with ClickHouse server timezone
+        self.assertEqual(ret.iloc[1]["datetime_val"], pd.Timestamp('2000-02-29 09:15:30', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime_min"], pd.Timestamp('2023-01-01 12:30:45', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime_max"], pd.Timestamp('2023-12-31 18:45:15', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime64_val"], pd.Timestamp('2000-02-29 09:15:30.654321', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime64_min"], pd.Timestamp('2023-01-01 08:00:00.111111', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime64_max"], pd.Timestamp('2023-12-31 20:30:45.888888', tz=actual_tz))
+        self.assertEqual(ret.iloc[1]["datetime64_ns"], pd.Timestamp('2000-02-29 09:15:30.987654321', tz=actual_tz))
+
+        # Second row timezone datetime tests
+        expected_utc_2 = pd.Timestamp('2024-01-15 08:30:00', tz='UTC')
+        actual_utc_2 = ret.iloc[1]["datetime_utc"]
+        self.assertEqual(actual_utc_2, expected_utc_2)
+
+        expected_london_2 = pd.Timestamp('2024-01-15 20:00:00', tz='Europe/London')
+        actual_london_2 = ret.iloc[1]["datetime_london"]
+        self.assertEqual(actual_london_2, expected_london_2)
+
+        # Second row timezone tests (already converted by C++ code)
+        expected_sh_2 = pd.Timestamp('2024-01-15 16:45:30.789', tz='Asia/Shanghai')
+        actual_sh_2 = ret.iloc[1]["datetime64_tz_sh"]
+        self.assertEqual(actual_sh_2, expected_sh_2)
+
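+        # For reference, the timezone handling checked above and the dtype mapping
+        # asserted further below can be inspected directly on the returned frame.
+        # A minimal sketch (hypothetical REPL output, assuming the same query and
+        # session timezone settings as this test):
+        #
+        #     >>> ret.iloc[0]["datetime_utc"]
+        #     Timestamp('2023-06-15 12:00:00+0000', tz='UTC')
+        #     >>> str(ret.dtypes["datetime_utc"])
+        #     'datetime64[s, UTC]'
+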
expected_ny_2 = pd.Timestamp('2024-01-15 09:15:45.987', tz='America/New_York') + actual_ny_2 = ret.iloc[1]["datetime64_tz_ny"] + self.assertEqual(actual_ny_2, expected_ny_2) + + # Precise data type validation + expected_types = { + "row_id": "uint8", + "datetime_val": "datetime64[s, Asia/Shanghai]", # DateTime types mapped to datetime64[s] (second precision) + "datetime_min": "datetime64[s, Asia/Shanghai]", + "datetime_max": "datetime64[s, Asia/Shanghai]", + "datetime64_val": "datetime64[ns, Asia/Shanghai]", # DateTime64 types mapped to datetime64[ns] (nanosecond precision) + "datetime64_min": "datetime64[ns, Asia/Shanghai]", + "datetime64_max": "datetime64[ns, Asia/Shanghai]", + "datetime64_ns": "datetime64[ns, Asia/Shanghai]", # DateTime64 with 9-digit precision (nanoseconds) + "datetime_utc": "datetime64[s, UTC]", # DateTime with timezone -> datetime64[s] + "datetime64_tz_sh": "datetime64[ns, Asia/Shanghai]", # DateTime64 with Asia/Shanghai timezone + "datetime64_tz_ny": "datetime64[ns, America/New_York]" # DateTime64 with America/New_York timezone + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_enum_types(self): + """Test Enum8 and Enum16 types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + CAST('hello' AS Enum8('hello' = 1, 'world' = 2)) as enum8_val, + CAST('small' AS Enum8('small' = -128, 'medium' = 0, 'large' = 127)) as enum8_range, + CAST('active' AS Enum16('active' = 1, 'inactive' = 2, 'pending' = 3, 'deleted' = -1)) as enum16_val, + CAST('north' AS Enum16('north' = 1, 'south' = 2, 'east' = 3, 'west' = 4, 'center' = 0)) as enum16_direction + UNION ALL + SELECT + 2 as row_id, + CAST('world' AS Enum8('hello' = 1, 'world' = 2)) as enum8_val, + CAST('large' AS Enum8('small' = -128, 'medium' = 0, 'large' = 127)) as enum8_range, + CAST('deleted' AS Enum16('active' = 1, 'inactive' = 2, 'pending' = 3, 'deleted' = -1)) as enum16_val, + CAST('south' AS Enum16('north' = 1, 'south' = 2, 'east' = 3, 'west' = 4, 'center' = 0)) as enum16_direction + UNION ALL + SELECT + 3 as row_id, + CAST('hello' AS Enum8('hello' = 1, 'world' = 2)) as enum8_val, + CAST('medium' AS Enum8('small' = -128, 'medium' = 0, 'large' = 127)) as enum8_range, + CAST('pending' AS Enum16('active' = 1, 'inactive' = 2, 'pending' = 3, 'deleted' = -1)) as enum16_val, + CAST('center' AS Enum16('north' = 1, 'south' = 2, 'east' = 3, 'west' = 4, 'center' = 0)) as enum16_direction + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row values + self.assertEqual(ret.iloc[0]["enum8_val"], "hello") + self.assertEqual(ret.iloc[0]["enum8_range"], "small") + self.assertEqual(ret.iloc[0]["enum16_val"], "active") + self.assertEqual(ret.iloc[0]["enum16_direction"], "north") + + # Test second row values + self.assertEqual(ret.iloc[1]["enum8_val"], "world") + self.assertEqual(ret.iloc[1]["enum8_range"], "large") + self.assertEqual(ret.iloc[1]["enum16_val"], "deleted") + self.assertEqual(ret.iloc[1]["enum16_direction"], "south") + + # Test third row values + self.assertEqual(ret.iloc[2]["enum8_val"], "hello") + self.assertEqual(ret.iloc[2]["enum8_range"], "medium") + self.assertEqual(ret.iloc[2]["enum16_val"], "pending") + self.assertEqual(ret.iloc[2]["enum16_direction"], "center") + + # Verify data types - Enum types should be mapped to object (string) dtype in 
pandas + expected_types = { + "row_id": "uint8", + "enum8_val": "object", # Enum8 mapped to object (string) dtype + "enum8_range": "object", # Enum8 with negative/positive range + "enum16_val": "object", # Enum16 mapped to object (string) dtype + "enum16_direction": "object" # Enum16 with multiple values + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + # Verify all enum values are strings + for col in ["enum8_val", "enum8_range", "enum16_val", "enum16_direction"]: + for i in range(len(ret)): + self.assertIsInstance(ret.iloc[i][col], str, f"Row {i}, column {col} should be string") + + def test_uuid_types(self): + """Test UUID data type""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toUUID('550e8400-e29b-41d4-a716-446655440000') as uuid_fixed1, + toUUID('6ba7b810-9dad-11d1-80b4-00c04fd430c8') as uuid_fixed2, + generateUUIDv4() as uuid_random1, + generateUUIDv4() as uuid_random2 + UNION ALL + SELECT + 2 as row_id, + toUUID('123e4567-e89b-12d3-a456-426614174000') as uuid_fixed1, + toUUID('6ba7b811-9dad-11d1-80b4-00c04fd430c8') as uuid_fixed2, + generateUUIDv4() as uuid_random1, + generateUUIDv4() as uuid_random2 + UNION ALL + SELECT + 3 as row_id, + toUUID('00000000-0000-0000-0000-000000000000') as uuid_fixed1, + toUUID('ffffffff-ffff-ffff-ffff-ffffffffffff') as uuid_fixed2, + generateUUIDv4() as uuid_random1, + generateUUIDv4() as uuid_random2 + ) + ORDER BY row_id + """, "DataFrame") + + # Verify we have 3 rows and 5 columns + self.assertEqual(len(ret), 3) + self.assertEqual(len(ret.columns), 5) + + # Test first row fixed UUID values + self.assertEqual(ret.iloc[0]["uuid_fixed1"], uuid.UUID("550e8400-e29b-41d4-a716-446655440000")) + self.assertEqual(ret.iloc[0]["uuid_fixed2"], uuid.UUID("6ba7b810-9dad-11d1-80b4-00c04fd430c8")) + + # Test second row fixed UUID values + self.assertEqual(ret.iloc[1]["uuid_fixed1"], uuid.UUID("123e4567-e89b-12d3-a456-426614174000")) + self.assertEqual(ret.iloc[1]["uuid_fixed2"], uuid.UUID("6ba7b811-9dad-11d1-80b4-00c04fd430c8")) + + # Test third row special UUID values (all zeros and all F's) + self.assertEqual(ret.iloc[2]["uuid_fixed1"], uuid.UUID("00000000-0000-0000-0000-000000000000")) + self.assertEqual(ret.iloc[2]["uuid_fixed2"], uuid.UUID("ffffffff-ffff-ffff-ffff-ffffffffffff")) + + # Verify data types - UUID types should be mapped to object dtype in pandas + expected_types = { + "row_id": "uint8", + "uuid_fixed1": "object", # UUID mapped to object dtype (contains UUID objects) + "uuid_fixed2": "object", # UUID mapped to object dtype (contains UUID objects) + "uuid_random1": "object", # Generated UUID mapped to object dtype (contains UUID objects) + "uuid_random2": "object" # Generated UUID mapped to object dtype (contains UUID objects) + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + # Verify all UUID values are UUID objects and have valid format + for col in ["uuid_fixed1", "uuid_fixed2", "uuid_random1", "uuid_random2"]: + for i in range(len(ret)): + uuid_value = ret.iloc[i][col] + self.assertIsInstance(uuid_value, uuid.UUID, f"Row {i}, column {col} should be UUID object") + # Verify UUID string representation has correct format + uuid_str = str(uuid_value) + self.assertEqual(len(uuid_str), 36, f"Row {i}, column {col} UUID string should be 36 characters") + self.assertEqual(uuid_str.count('-'), 4, f"Row {i}, column {col} UUID should 
have 4 hyphens") + + def test_ipv4_types(self): + """Test IPv4 data type""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toIPv4('192.168.1.1') as ipv4_private, + toIPv4('8.8.8.8') as ipv4_public, + toIPv4('127.0.0.1') as ipv4_localhost, + toIPv4('0.0.0.0') as ipv4_zero, + toIPv4('255.255.255.255') as ipv4_broadcast + UNION ALL + SELECT + 2 as row_id, + toIPv4('10.0.0.1') as ipv4_private, + toIPv4('1.1.1.1') as ipv4_public, + toIPv4('127.0.0.2') as ipv4_localhost, + toIPv4('172.16.0.1') as ipv4_zero, + toIPv4('203.0.113.1') as ipv4_broadcast + UNION ALL + SELECT + 3 as row_id, + toIPv4('192.0.2.1') as ipv4_private, + toIPv4('208.67.222.222') as ipv4_public, + toIPv4('169.254.1.1') as ipv4_localhost, + toIPv4('224.0.0.1') as ipv4_zero, + toIPv4('239.255.255.255') as ipv4_broadcast + ) + ORDER BY row_id + """, "DataFrame") + + # Verify we have 3 rows and 6 columns + self.assertEqual(len(ret), 3) + self.assertEqual(len(ret.columns), 6) + + # Test first row IPv4 values + self.assertEqual(ret.iloc[0]["ipv4_private"], ipaddress.IPv4Address("192.168.1.1")) + self.assertEqual(ret.iloc[0]["ipv4_public"], ipaddress.IPv4Address("8.8.8.8")) + self.assertEqual(ret.iloc[0]["ipv4_localhost"], ipaddress.IPv4Address("127.0.0.1")) + self.assertEqual(ret.iloc[0]["ipv4_zero"], ipaddress.IPv4Address("0.0.0.0")) + self.assertEqual(ret.iloc[0]["ipv4_broadcast"], ipaddress.IPv4Address("255.255.255.255")) + + # Test second row IPv4 values + self.assertEqual(ret.iloc[1]["ipv4_private"], ipaddress.IPv4Address("10.0.0.1")) + self.assertEqual(ret.iloc[1]["ipv4_public"], ipaddress.IPv4Address("1.1.1.1")) + self.assertEqual(ret.iloc[1]["ipv4_localhost"], ipaddress.IPv4Address("127.0.0.2")) + self.assertEqual(ret.iloc[1]["ipv4_zero"], ipaddress.IPv4Address("172.16.0.1")) + self.assertEqual(ret.iloc[1]["ipv4_broadcast"], ipaddress.IPv4Address("203.0.113.1")) + + # Test third row IPv4 values + self.assertEqual(ret.iloc[2]["ipv4_private"], ipaddress.IPv4Address("192.0.2.1")) + self.assertEqual(ret.iloc[2]["ipv4_public"], ipaddress.IPv4Address("208.67.222.222")) + self.assertEqual(ret.iloc[2]["ipv4_localhost"], ipaddress.IPv4Address("169.254.1.1")) + self.assertEqual(ret.iloc[2]["ipv4_zero"], ipaddress.IPv4Address("224.0.0.1")) + self.assertEqual(ret.iloc[2]["ipv4_broadcast"], ipaddress.IPv4Address("239.255.255.255")) + + # Verify data types - IPv4 types should be mapped to object dtype in pandas + expected_types = { + "row_id": "uint8", + "ipv4_private": "object", # IPv4Address mapped to object dtype + "ipv4_public": "object", + "ipv4_localhost": "object", + "ipv4_zero": "object", + "ipv4_broadcast": "object" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + # Verify all IPv4 values are IPv4Address objects + for col in ["ipv4_private", "ipv4_public", "ipv4_localhost", "ipv4_zero", "ipv4_broadcast"]: + for i in range(len(ret)): + ipv4_value = ret.iloc[i][col] + self.assertIsInstance(ipv4_value, ipaddress.IPv4Address, f"Row {i}, column {col} should be IPv4Address object") + # Verify IPv4 string representation is valid + ipv4_str = str(ipv4_value) + self.assertEqual(len(ipv4_str.split('.')), 4, f"Row {i}, column {col} IPv4 should have 4 octets") + + def test_ipv6_types(self): + """Test IPv6 data type""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toIPv6('2001:db8::1') as ipv6_standard, + toIPv6('::1') as ipv6_localhost, + toIPv6('::') as ipv6_zero, + 
toIPv6('2001:db8:85a3::8a2e:370:7334') as ipv6_full, + toIPv6('fe80::1') as ipv6_link_local + UNION ALL + SELECT + 2 as row_id, + toIPv6('2001:db8::2') as ipv6_standard, + toIPv6('::2') as ipv6_localhost, + toIPv6('2001:db8::') as ipv6_zero, + toIPv6('2001:db8:85a3:0:0:8a2e:370:7335') as ipv6_full, + toIPv6('fe80::2') as ipv6_link_local + UNION ALL + SELECT + 3 as row_id, + toIPv6('2001:0db8:0000:0000:0000:ff00:0042:8329') as ipv6_standard, + toIPv6('::ffff:192.0.2.1') as ipv6_localhost, + toIPv6('2001:db8:85a3::8a2e:370:7336') as ipv6_zero, + toIPv6('ff02::1') as ipv6_full, + toIPv6('2001:db8:85a3:8d3:1319:8a2e:370:7348') as ipv6_link_local + ) + ORDER BY row_id + """, "DataFrame") + + # Verify we have 3 rows and 6 columns + self.assertEqual(len(ret), 3) + self.assertEqual(len(ret.columns), 6) + + # Test first row IPv6 values + self.assertEqual(ret.iloc[0]["ipv6_standard"], ipaddress.IPv6Address("2001:db8::1")) + self.assertEqual(ret.iloc[0]["ipv6_localhost"], ipaddress.IPv6Address("::1")) + self.assertEqual(ret.iloc[0]["ipv6_zero"], ipaddress.IPv6Address("::")) + self.assertEqual(ret.iloc[0]["ipv6_full"], ipaddress.IPv6Address("2001:db8:85a3::8a2e:370:7334")) + self.assertEqual(ret.iloc[0]["ipv6_link_local"], ipaddress.IPv6Address("fe80::1")) + + # Test second row IPv6 values + self.assertEqual(ret.iloc[1]["ipv6_standard"], ipaddress.IPv6Address("2001:db8::2")) + self.assertEqual(ret.iloc[1]["ipv6_localhost"], ipaddress.IPv6Address("::2")) + self.assertEqual(ret.iloc[1]["ipv6_zero"], ipaddress.IPv6Address("2001:db8::")) + self.assertEqual(ret.iloc[1]["ipv6_full"], ipaddress.IPv6Address("2001:db8:85a3::8a2e:370:7335")) + self.assertEqual(ret.iloc[1]["ipv6_link_local"], ipaddress.IPv6Address("fe80::2")) + + # Test third row IPv6 values + self.assertEqual(ret.iloc[2]["ipv6_standard"], ipaddress.IPv6Address("2001:db8::ff00:42:8329")) + self.assertEqual(ret.iloc[2]["ipv6_localhost"], ipaddress.IPv6Address("::ffff:192.0.2.1")) + self.assertEqual(ret.iloc[2]["ipv6_zero"], ipaddress.IPv6Address("2001:db8:85a3::8a2e:370:7336")) + self.assertEqual(ret.iloc[2]["ipv6_full"], ipaddress.IPv6Address("ff02::1")) + self.assertEqual(ret.iloc[2]["ipv6_link_local"], ipaddress.IPv6Address("2001:db8:85a3:8d3:1319:8a2e:370:7348")) + + # Verify data types - IPv6 types should be mapped to object dtype in pandas + expected_types = { + "row_id": "uint8", + "ipv6_standard": "object", # IPv6Address mapped to object dtype + "ipv6_localhost": "object", + "ipv6_zero": "object", + "ipv6_full": "object", + "ipv6_link_local": "object" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + # Verify all IPv6 values are IPv6Address objects + for col in ["ipv6_standard", "ipv6_localhost", "ipv6_zero", "ipv6_full", "ipv6_link_local"]: + for i in range(len(ret)): + ipv6_value = ret.iloc[i][col] + self.assertIsInstance(ipv6_value, ipaddress.IPv6Address, f"Row {i}, column {col} should be IPv6Address object") + # Verify IPv6 address is valid by checking it can be converted back to string + ipv6_str = str(ipv6_value) + self.assertIn(":", ipv6_str, f"Row {i}, column {col} IPv6 should contain colons") + + def test_bool_types(self): + """Test Bool and Nullable(Bool) types with various values""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + true as bool_true, + false as bool_false, + true::Bool as explicit_bool_true, + false::Bool as explicit_bool_false, + NULL::Nullable(Bool) as nullable_bool_null, + 
true::Nullable(Bool) as nullable_bool_true, + false::Nullable(Bool) as nullable_bool_false + UNION ALL + SELECT + 2 as row_id, + false as bool_true, + true as bool_false, + false::Bool as explicit_bool_true, + true::Bool as explicit_bool_false, + true::Nullable(Bool) as nullable_bool_null, + NULL::Nullable(Bool) as nullable_bool_true, + true::Nullable(Bool) as nullable_bool_false + UNION ALL + SELECT + 3 as row_id, + 1 = 1 as bool_true, -- expression result + 1 = 0 as bool_false, -- expression result + (1 > 0)::Bool as explicit_bool_true, + (1 < 0)::Bool as explicit_bool_false, + false::Nullable(Bool) as nullable_bool_null, + false::Nullable(Bool) as nullable_bool_true, + NULL::Nullable(Bool) as nullable_bool_false + ) + ORDER BY row_id + """, "DataFrame") + + # Verify we have 3 rows and 8 columns + self.assertEqual(len(ret), 3) + self.assertEqual(len(ret.columns), 8) + + # Test first row - basic Boolean values + self.assertTrue(ret.iloc[0]["bool_true"]) + self.assertFalse(ret.iloc[0]["bool_false"]) + self.assertTrue(ret.iloc[0]["explicit_bool_true"]) + self.assertFalse(ret.iloc[0]["explicit_bool_false"]) + self.assertTrue(pd.isna(ret.iloc[0]["nullable_bool_null"])) + self.assertTrue(ret.iloc[0]["nullable_bool_true"]) + self.assertFalse(ret.iloc[0]["nullable_bool_false"]) + + # Test second row - inverted Boolean values + self.assertFalse(ret.iloc[1]["bool_true"]) + self.assertTrue(ret.iloc[1]["bool_false"]) + self.assertFalse(ret.iloc[1]["explicit_bool_true"]) + self.assertTrue(ret.iloc[1]["explicit_bool_false"]) + self.assertTrue(ret.iloc[1]["nullable_bool_null"]) + self.assertTrue(pd.isna(ret.iloc[1]["nullable_bool_true"])) + self.assertTrue(ret.iloc[1]["nullable_bool_false"]) + + # Test third row - expression results + self.assertTrue(ret.iloc[2]["bool_true"]) # 1 = 1 is true + self.assertFalse(ret.iloc[2]["bool_false"]) # 1 = 0 is false + self.assertTrue(ret.iloc[2]["explicit_bool_true"]) # 1 > 0 is true + self.assertFalse(ret.iloc[2]["explicit_bool_false"]) # 1 < 0 is false + self.assertFalse(ret.iloc[2]["nullable_bool_null"]) + self.assertFalse(ret.iloc[2]["nullable_bool_true"]) + self.assertTrue(pd.isna(ret.iloc[2]["nullable_bool_false"])) + + # Test Python types - Bool values should be boolean types (Python bool or numpy bool_) + for i in range(len(ret)): + for col in ["bool_true", "bool_false", "explicit_bool_true", "explicit_bool_false"]: + value = ret.iloc[i][col] + # Accept both Python bool and numpy bool_ types + self.assertTrue(isinstance(value, (bool, np.bool_)), f"Row {i}, column {col} should be boolean type, got {type(value)}") + + # Test nullable Bool columns - should be bool/numpy.bool_ or null + for col in ["nullable_bool_null", "nullable_bool_true", "nullable_bool_false"]: + if (pd.isna(ret.iloc[i][col])): + continue + + value = ret.iloc[i][col] + self.assertTrue(isinstance(value, (bool, np.bool_)), + f"Row {i}, column {col} should be boolean type, got {type(value)}") + + # Verify data types - Bool types should be mapped to bool dtype in pandas + expected_types = { + "row_id": "uint8", + "bool_true": "bool", + "bool_false": "bool", + "explicit_bool_true": "bool", + "explicit_bool_false": "bool", + "nullable_bool_null": "boolean", + "nullable_bool_true": "boolean", + "nullable_bool_false": "boolean" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type, f"Column {col} type mismatch") + + def test_tuple_types(self): + """Test Tuple types with various element combinations""" + ret = 
self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + (1, 'hello') as tuple_int_str, + (true, false, true) as tuple_bool, + (1, 2.5, 'test') as tuple_mixed, + tuple(42, 'world', false) as tuple_explicit, + (1, (2, 3)) as tuple_nested, + ('a', 'b', 'c') as tuple_string, + (NULL, 1)::Tuple(Nullable(Int32), Int32) as tuple_nullable, + tuple() as tuple_empty + UNION ALL + SELECT + 2 as row_id, + (100, 'goodbye') as tuple_int_str, + (false, true, false) as tuple_bool, + (10, -3.14, 'data') as tuple_mixed, + tuple(-5, 'universe', true) as tuple_explicit, + (5, (6, 7)) as tuple_nested, + ('x', 'y', 'z') as tuple_string, + (42, NULL)::Tuple(Int32, Nullable(Int32)) as tuple_nullable, + tuple() as tuple_empty + UNION ALL + SELECT + 3 as row_id, + (-1, '') as tuple_int_str, + (true, false, false) as tuple_bool, + (0, 0.0, '') as tuple_mixed, + tuple(2147483647, 'edge_case', false) as tuple_explicit, + (99, (100, 101)) as tuple_nested, + ('🌍', 'Unicode', 'Test') as tuple_string, + (NULL, NULL)::Tuple(Nullable(Int32), Nullable(Int32)) as tuple_nullable, + tuple() as tuple_empty + ) + ORDER BY row_id + """, "DataFrame") + + # Verify we have 3 rows and 9 columns + self.assertEqual(len(ret), 3) + self.assertEqual(len(ret.columns), 9) + + # Test first row - basic tuple values + tuple_int_str = ret.iloc[0]["tuple_int_str"] + self.assertIsInstance(tuple_int_str, np.ndarray) + self.assertEqual(len(tuple_int_str), 2) + self.assertEqual(tuple_int_str[0], 1) + self.assertEqual(tuple_int_str[1], 'hello') + + tuple_bool = ret.iloc[0]["tuple_bool"] + self.assertIsInstance(tuple_bool, np.ndarray) + self.assertEqual(len(tuple_bool), 3) + self.assertTrue(tuple_bool[0]) + self.assertFalse(tuple_bool[1]) + self.assertTrue(tuple_bool[2]) + + tuple_mixed = ret.iloc[0]["tuple_mixed"] + self.assertIsInstance(tuple_mixed, np.ndarray) + self.assertEqual(len(tuple_mixed), 3) + self.assertEqual(tuple_mixed[0], 1) + self.assertEqual(tuple_mixed[1], 2.5) + self.assertEqual(tuple_mixed[2], 'test') + + # Test nested tuples + tuple_nested = ret.iloc[0]["tuple_nested"] + self.assertIsInstance(tuple_nested, np.ndarray) + self.assertEqual(len(tuple_nested), 2) + self.assertEqual(tuple_nested[0], 1) + self.assertIsInstance(tuple_nested[1], tuple) + self.assertEqual(tuple_nested[1][0], 2) + self.assertEqual(tuple_nested[1][1], 3) + + # Test nullable tuples + tuple_nullable = ret.iloc[0]["tuple_nullable"] + self.assertIsInstance(tuple_nullable, np.ndarray) + self.assertEqual(len(tuple_nullable), 2) + self.assertTrue(pd.isna(tuple_nullable[0])) # NULL value + self.assertEqual(tuple_nullable[1], 1) + + # Test empty tuple + tuple_empty = ret.iloc[0]["tuple_empty"] + self.assertIsInstance(tuple_empty, np.ndarray) + self.assertEqual(len(tuple_empty), 0) + + # Test second row - different values + tuple_int_str_2 = ret.iloc[1]["tuple_int_str"] + self.assertEqual(tuple_int_str_2[0], 100) + self.assertEqual(tuple_int_str_2[1], 'goodbye') + + tuple_nullable_2 = ret.iloc[1]["tuple_nullable"] + self.assertEqual(tuple_nullable_2[0], 42) + self.assertTrue(pd.isna(tuple_nullable_2[1])) # NULL value + + # Test third row - edge cases + tuple_bool_3 = ret.iloc[2]["tuple_bool"] + self.assertIsInstance(tuple_bool_3, np.ndarray) + self.assertEqual(len(tuple_bool_3), 3) + self.assertTrue(tuple_bool_3[0]) # true + self.assertFalse(tuple_bool_3[1]) # false + self.assertFalse(tuple_bool_3[2]) # false + + tuple_nullable_3 = ret.iloc[2]["tuple_nullable"] + self.assertTrue(pd.isna(tuple_nullable_3[0])) # Both NULL + 
self.assertTrue(pd.isna(tuple_nullable_3[1])) + + # Test string tuple with Unicode + tuple_string_3 = ret.iloc[2]["tuple_string"] + self.assertEqual(tuple_string_3[0], '🌍') + self.assertEqual(tuple_string_3[1], 'Unicode') + self.assertEqual(tuple_string_3[2], 'Test') + + # Test tuple element types + for i in range(len(ret)): + tuple_val = ret.iloc[i]["tuple_int_str"] + self.assertIsInstance(tuple_val, np.ndarray, f"Row {i} tuple_int_str should be tuple") + if len(tuple_val) >= 2: + self.assertIsInstance(tuple_val[0], (int, np.integer), f"Row {i} first element should be integer") + self.assertIsInstance(tuple_val[1], str, f"Row {i} second element should be string") + + # Verify data types - Tuple types should be mapped to object dtype in pandas + expected_types = { + "row_id": "uint8", + "tuple_int_str": "object", # Tuple mapped to object dtype + "tuple_bool": "object", # Tuple mapped to object dtype + "tuple_mixed": "object", # Tuple mapped to object dtype + "tuple_explicit": "object", # Tuple mapped to object dtype + "tuple_nested": "object", # Nested Tuple mapped to object dtype + "tuple_string": "object", # Tuple mapped to object dtype + "tuple_nullable": "object", # Tuple with nullable elements mapped to object dtype + "tuple_empty": "object" # Empty Tuple mapped to object dtype + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type, f"Column {col} type mismatch") + + # Test named tuples + named_tuple_ret = self.session.query(""" + SELECT + tuple(1, 'John', 25) as person_tuple, + (42, 3.14159, 'pi') as unnamed_tuple + """, "DataFrame") + + person_tuple = named_tuple_ret.iloc[0]["person_tuple"] + self.assertIsInstance(person_tuple, np.ndarray) + self.assertEqual(len(person_tuple), 3) + self.assertEqual(person_tuple[0], 1) + self.assertEqual(person_tuple[1], 'John') + self.assertEqual(person_tuple[2], 25) + + unnamed_tuple = named_tuple_ret.iloc[0]["unnamed_tuple"] + self.assertIsInstance(unnamed_tuple, np.ndarray) + self.assertEqual(len(unnamed_tuple), 3) + self.assertEqual(unnamed_tuple[0], 42) + self.assertAlmostEqual(unnamed_tuple[1], 3.14159, places=5) + self.assertEqual(unnamed_tuple[2], 'pi') + + def test_array_types(self): + """Test Array types with various element types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + [1, 2, 3, 4, 5] as array_int32, + [1, 2, 3, 4, 5]::Array(UInt64) as array_uint64, + [1.1, 2.2, 3.3, 4.4, 5.5] as array_float64, + ['hello', 'world', 'clickhouse', 'array'] as array_string, + [true, false, true, false] as array_bool, + [toDate('2023-01-01'), toDate('2023-02-01'), toDate('2023-03-01')] as array_date, + [toDateTime('2023-01-01 10:00:00', 'Asia/Shanghai'), toDateTime('2023-01-01 11:00:00', 'Asia/Shanghai')] as array_datetime, + [[1, 2], [3, 4], [5, 6]] as array_nested_int, + [[100, 200], [300, 400], [500, 600]]::Array(Array(UInt32)) as array_nested_uint32, + [['a', 'b'], ['c', 'd']] as array_nested_string, + [] as array_empty_int, + ['']::Array(String) as array_empty_string_element, + [NULL, 1, NULL, 3]::Array(Nullable(Int32)) as array_nullable_int, + [NULL, 'test', NULL]::Array(Nullable(String)) as array_nullable_string + UNION ALL + SELECT + 2 as row_id, + [10, 20, 30] as array_int32, + [100, 200, 300]::Array(UInt64) as array_uint64, + [10.5, 20.5] as array_float64, + ['test', 'array', 'data'] as array_string, + [false, false, true] as array_bool, + [toDate('2024-01-01'), toDate('2024-12-31')] as array_date, + [toDateTime('2024-06-15 
14:30:00', 'Asia/Shanghai')] as array_datetime, + [[7, 8, 9], [10]] as array_nested_int, + [[700, 800], [900]]::Array(Array(UInt32)) as array_nested_uint32, + [['x'], ['y', 'z', 'w']] as array_nested_string, + [42] as array_empty_int, + ['single'] as array_empty_string_element, + [1, 2, 3]::Array(Nullable(Int32)) as array_nullable_int, + ['a', 'b']::Array(Nullable(String)) as array_nullable_string + UNION ALL + SELECT + 3 as row_id, + [-1, 0, 1, 2147483647, -2147483648] as array_int32, + [0, 18446744073709551615]::Array(UInt64) as array_uint64, + [0.0, -1.5, 1.0/0.0, -1.0/0.0, 0.0/0.0] as array_float64, + ['Unicode: 🌍', 'Special: \t\n"''', ''] as array_string, + [true] as array_bool, + [toDate('1970-01-01'), toDate('2149-06-06')] as array_date, + [toDateTime('1970-01-02 00:00:00', 'Asia/Shanghai'), toDateTime('2106-02-07 06:28:15', 'Asia/Shanghai')] as array_datetime, + [[], [1], [2, 3, 4, 5]] as array_nested_int, + [[], [1000], [2000, 3000, 4000]]::Array(Array(UInt32)) as array_nested_uint32, + [[], ['single'], ['a', 'b', 'c']] as array_nested_string, + []::Array(Int32) as array_empty_int, + []::Array(String) as array_empty_string_element, + [NULL]::Array(Nullable(Int32)) as array_nullable_int, + [NULL, NULL]::Array(Nullable(String)) as array_nullable_string + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - basic arrays (converted to numpy arrays) + np.testing.assert_array_equal(ret.iloc[0]["array_int32"], [1, 2, 3, 4, 5]) + np.testing.assert_array_equal(ret.iloc[0]["array_uint64"], [1, 2, 3, 4, 5]) + np.testing.assert_array_equal(ret.iloc[0]["array_float64"], [1.1, 2.2, 3.3, 4.4, 5.5]) + np.testing.assert_array_equal(ret.iloc[0]["array_string"], ['hello', 'world', 'clickhouse', 'array']) + np.testing.assert_array_equal(ret.iloc[0]["array_bool"], [True, False, True, False]) + + # Test date arrays (converted to numpy array of pandas timestamps) + date_array = ret.iloc[0]["array_date"] + self.assertIsInstance(date_array, np.ndarray) + self.assertEqual(len(date_array), 3) + self.assertEqual(date_array[0], pd.Timestamp('2023-01-01')) + self.assertEqual(date_array[1], pd.Timestamp('2023-02-01')) + self.assertEqual(date_array[2], pd.Timestamp('2023-03-01')) + + # Test datetime arrays (converted to numpy array of numpy.datetime64 in UTC) + datetime_array = ret.iloc[0]["array_datetime"] + self.assertIsInstance(datetime_array, np.ndarray) + self.assertEqual(len(datetime_array), 2) + # ClickHouse converts Asia/Shanghai time to UTC: 10:00:00 +0800 -> 02:00:00 UTC + self.assertEqual(datetime_array[0], np.datetime64('2023-01-01T02:00:00')) + self.assertEqual(datetime_array[1], np.datetime64('2023-01-01T03:00:00')) + + # Test nested arrays (numpy arrays containing numpy arrays) + nested_int = ret.iloc[0]["array_nested_int"] + self.assertIsInstance(nested_int, np.ndarray) + self.assertEqual(len(nested_int), 3) + np.testing.assert_array_equal(nested_int[0], [1, 2]) + np.testing.assert_array_equal(nested_int[1], [3, 4]) + np.testing.assert_array_equal(nested_int[2], [5, 6]) + + nested_uint32 = ret.iloc[0]["array_nested_uint32"] + self.assertIsInstance(nested_uint32, np.ndarray) + self.assertEqual(len(nested_uint32), 3) + np.testing.assert_array_equal(nested_uint32[0], [100, 200]) + np.testing.assert_array_equal(nested_uint32[1], [300, 400]) + np.testing.assert_array_equal(nested_uint32[2], [500, 600]) + + nested_string = 
ret.iloc[0]["array_nested_string"] + self.assertIsInstance(nested_string, np.ndarray) + self.assertEqual(len(nested_string), 2) + np.testing.assert_array_equal(nested_string[0], ['a', 'b']) + np.testing.assert_array_equal(nested_string[1], ['c', 'd']) + + # Test empty arrays and arrays with empty string elements + empty_int_array = ret.iloc[0]["array_empty_int"] + self.assertIsInstance(empty_int_array, np.ndarray) + self.assertEqual(len(empty_int_array), 0) + + string_element_array = ret.iloc[0]["array_empty_string_element"] + self.assertIsInstance(string_element_array, np.ndarray) + np.testing.assert_array_equal(string_element_array, ['']) + + # Test nullable arrays (numpy arrays with None values) + nullable_int = ret.iloc[0]["array_nullable_int"] + self.assertIsInstance(nullable_int, np.ndarray) + self.assertEqual(len(nullable_int), 4) + self.assertTrue(nullable_int.mask[0]) + self.assertEqual(nullable_int[1], 1) + self.assertTrue(nullable_int.mask[2]) + self.assertEqual(nullable_int[3], 3) + + nullable_string = ret.iloc[0]["array_nullable_string"] + self.assertIsInstance(nullable_string, np.ndarray) + self.assertEqual(len(nullable_string), 3) + # self.assertTrue(nullable_string.mask[0]) + self.assertIsNone(nullable_string[0]) + self.assertEqual(nullable_string[1], 'test') + # self.assertTrue(nullable_string.mask[2]) + self.assertIsNone(nullable_string[2]) + + # Test second row - different arrays (numpy arrays) + np.testing.assert_array_equal(ret.iloc[1]["array_int32"], [10, 20, 30]) + np.testing.assert_array_equal(ret.iloc[1]["array_uint64"], [100, 200, 300]) + np.testing.assert_array_equal(ret.iloc[1]["array_float64"], [10.5, 20.5]) + np.testing.assert_array_equal(ret.iloc[1]["array_string"], ['test', 'array', 'data']) + np.testing.assert_array_equal(ret.iloc[1]["array_bool"], [False, False, True]) + + # Test second row datetime array: 14:30:00 +0800 -> 06:30:00 UTC + datetime_array_2 = ret.iloc[1]["array_datetime"] + self.assertEqual(len(datetime_array_2), 1) + self.assertEqual(datetime_array_2[0], np.datetime64('2024-06-15T06:30:00')) + + # Test third row - edge cases (numpy arrays) + np.testing.assert_array_equal(ret.iloc[2]["array_int32"], [-1, 0, 1, 2147483647, -2147483648]) + np.testing.assert_array_equal(ret.iloc[2]["array_uint64"], [0, 18446744073709551615]) + + # Test third row datetime array: Asia/Shanghai times converted to UTC + datetime_array_3 = ret.iloc[2]["array_datetime"] + self.assertEqual(len(datetime_array_3), 2) + # 1970-01-02 00:00:00 +0800 -> 1970-01-01 16:00:00 UTC + self.assertEqual(datetime_array_3[0], np.datetime64('1970-01-01T16:00:00')) + # 2106-02-07 06:28:15 +0800 -> 2106-02-06 22:28:15 UTC + self.assertEqual(datetime_array_3[1], np.datetime64('2106-02-06T22:28:15')) + + # Test float special values in array + float_array = ret.iloc[2]["array_float64"] + self.assertEqual(float_array[0], 0.0) + self.assertEqual(float_array[1], -1.5) + self.assertTrue(math.isinf(float_array[2])) # positive infinity + self.assertTrue(math.isinf(float_array[3])) # negative infinity + self.assertTrue(math.isnan(float_array[4])) # NaN + + # Test string array with special characters (numpy array) + string_array = ret.iloc[2]["array_string"] + self.assertIsInstance(string_array, np.ndarray) + self.assertEqual(string_array[0], 'Unicode: 🌍') + self.assertEqual(string_array[1], "Special: \t\n\"'") # ClickHouse interprets escape sequences + self.assertEqual(string_array[2], '') + + # Test nested arrays with empty elements (numpy arrays) + nested_int_3 = ret.iloc[2]["array_nested_int"] 
+ self.assertIsInstance(nested_int_3, np.ndarray) + self.assertEqual(len(nested_int_3[0]), 0) # empty array + np.testing.assert_array_equal(nested_int_3[1], [1]) # single element + np.testing.assert_array_equal(nested_int_3[2], [2, 3, 4, 5]) # multiple elements + + nested_uint32_3 = ret.iloc[2]["array_nested_uint32"] + self.assertIsInstance(nested_uint32_3, np.ndarray) + self.assertEqual(len(nested_uint32_3[0]), 0) # empty array + np.testing.assert_array_equal(nested_uint32_3[1], [1000]) # single element + np.testing.assert_array_equal(nested_uint32_3[2], [2000, 3000, 4000]) # multiple elements + + # Test empty typed arrays + self.assertEqual(len(ret.iloc[2]["array_empty_int"]), 0) + self.assertEqual(len(ret.iloc[2]["array_empty_string_element"]), 0) + + # Test nullable arrays with only NULL values + self.assertEqual(len(ret.iloc[2]["array_nullable_int"]), 1) + self.assertTrue(ret.iloc[2]["array_nullable_int"].mask[0]) + + self.assertEqual(len(ret.iloc[2]["array_nullable_string"]), 2) + # self.assertTrue(ret.iloc[2]["array_nullable_string"].mask[0]) + # self.assertTrue(ret.iloc[2]["array_nullable_string"].mask[1]) + self.assertIsNone(ret.iloc[2]["array_nullable_string"][0]) + self.assertIsNone(ret.iloc[2]["array_nullable_string"][1]) + + # Precise data type validation - Arrays should be mapped to object dtype in pandas + expected_types = { + "row_id": "uint8", + "array_int32": "object", # Array(Int32) mapped to object dtype + "array_uint64": "object", # Array(UInt64) mapped to object dtype + "array_float64": "object", # Array(Float64) mapped to object dtype + "array_string": "object", # Array(String) mapped to object dtype + "array_bool": "object", # Array(Bool) mapped to object dtype + "array_date": "object", # Array(Date) mapped to object dtype + "array_datetime": "object", # Array(DateTime) mapped to object dtype + "array_nested_int": "object", # Array(Array(Int32)) mapped to object dtype + "array_nested_uint32": "object", # Array(Array(UInt32)) mapped to object dtype + "array_nested_string": "object", # Array(Array(String)) mapped to object dtype + "array_empty_int": "object", # Empty Array(Int32) mapped to object dtype + "array_empty_string_element": "object", # Array(String) with empty string mapped to object dtype + "array_nullable_int": "object", # Array(Nullable(Int32)) mapped to object dtype + "array_nullable_string": "object" # Array(Nullable(String)) mapped to object dtype + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + # Verify all array columns contain numpy arrays + array_columns = [col for col in ret.columns if col.startswith('array_')] + for col in array_columns: + for i in range(len(ret)): + array_value = ret.iloc[i][col] + # Check if it's a numpy array + self.assertIsInstance(array_value, np.ndarray, f"Row {i}, column {col} should be numpy array") + # Verify numpy array properties + self.assertTrue(hasattr(array_value, '__len__'), f"Row {i}, column {col} should have length") + self.assertTrue(hasattr(array_value, '__getitem__'), f"Row {i}, column {col} should be indexable") + + def test_map_types(self): + """Test Map(K,V) types where K and V can be any types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + -- Basic primitive type combinations with multiple key-value pairs + map('str_key1', 42, 'str_key2', 100, 'str_key3', -50)::Map(String, Int32) as map_str_int, + map(100, 'int_key1', 200, 'int_key2', -10, 'negative_key')::Map(Int32, String) as 
map_int_str, + map(true, 'bool_true', false, 'bool_false')::Map(Bool, String) as map_bool_str, + map('pi', 3.14, 'e', 2.718, 'phi', 1.618)::Map(String, Float64) as map_str_float, + + -- DateTime and Date types as values with multiple pairs + map('created', toTimeZone('2023-01-15 10:30:00'::DateTime('Asia/Shanghai'), 'Asia/Shanghai'), 'updated', toTimeZone('2024-03-20 14:45:30'::DateTime('Asia/Shanghai'), 'Asia/Shanghai'), 'archived', toTimeZone('2024-12-01 09:15:00'::DateTime('Asia/Shanghai'), 'Asia/Shanghai'))::Map(String, DateTime('Asia/Shanghai')) as map_str_datetime, + map('birth_date', '1990-05-15'::Date, 'start_date', '2020-01-01'::Date, 'end_date', '2025-12-31'::Date)::Map(String, Date) as map_str_date, + map('precise_time1', toTimeZone('2023-01-15 10:30:00.123456'::DateTime64(6, 'Asia/Shanghai'), 'Asia/Shanghai'), 'precise_time2', toTimeZone('2024-03-20 14:45:30.987654'::DateTime64(6, 'Asia/Shanghai'), 'Asia/Shanghai'), 'precise_time3', toTimeZone('2024-12-01 09:15:00.555555'::DateTime64(6, 'Asia/Shanghai'), 'Asia/Shanghai'))::Map(String, DateTime64(6, 'Asia/Shanghai')) as map_str_datetime64, + map('event_id', '1001', 'timestamp', '2023-06-10 16:20:45', 'event_id2', '1002', 'timestamp2', '2023-06-11 17:30:15')::Map(String, String) as map_mixed_datetime, + + -- Decimal types as values with multiple pairs + map('price1', 99.99::Decimal(10,2), 'price2', 149.50::Decimal(10,2), 'discount', 15.75::Decimal(10,2))::Map(String, Decimal(10,2)) as map_str_decimal, + + -- Array as Key and Value types + map([1,2], 'array_key')::Map(Array(Int32), String) as map_array_str, + map('array_val1', [10,20,30], 'array_val2', [40,50], 'empty_array', [])::Map(String, Array(Int32)) as map_str_array, + + -- Tuple as Key and Value types + map((1,'tuple'), 'tuple_key')::Map(Tuple(Int32, String), String) as map_tuple_str, + map('tuple_val1', (100, 'data1'), 'tuple_val2', (200, 'data2'))::Map(String, Tuple(Int32, String)) as map_str_tuple, + + -- Nested Map as Value with multiple entries + map('config1', map('timeout', 30, 'retries', 3), 'config2', map('timeout', 60, 'retries', 5))::Map(String, Map(String, Int32)) as map_nested, + + -- Nullable types with multiple pairs + map('nullable1', NULL, 'nullable2', 'has_value', 'nullable3', NULL)::Map(String, Nullable(String)) as map_nullable + UNION ALL + SELECT + 2 as row_id, + -- Different values with multiple pairs + map('key_a', 999, 'key_b', 888, 'key_c', 777)::Map(String, Int32) as map_str_int, + map(300, 'triple', 400, 'quad', 500, 'penta')::Map(Int32, String) as map_int_str, + map(false, 'false_key', true, 'true_key')::Map(Bool, String) as map_bool_str, + map('sqrt2', 1.414, 'sqrt3', 1.732, 'sqrt5', 2.236)::Map(String, Float64) as map_str_float, + + -- Different datetime values + map('morning', toTimeZone('2024-01-01 08:00:00'::DateTime('Asia/Shanghai'), 'Asia/Shanghai'), 'noon', toTimeZone('2024-01-01 12:00:00'::DateTime('Asia/Shanghai'), 'Asia/Shanghai'), 'evening', toTimeZone('2024-01-01 18:00:00'::DateTime('Asia/Shanghai'), 'Asia/Shanghai'))::Map(String, DateTime('Asia/Shanghai')) as map_str_datetime, + map('monday', '2024-01-01'::Date, 'friday', '2024-01-05'::Date, 'sunday', '2024-01-07'::Date)::Map(String, Date) as map_str_date, + map('morning_precise', toTimeZone('2024-01-01 08:00:00.111111'::DateTime64(6, 'Asia/Shanghai'), 'Asia/Shanghai'), 'noon_precise', toTimeZone('2024-01-01 12:00:00.222222'::DateTime64(6, 'Asia/Shanghai'), 'Asia/Shanghai'), 'evening_precise', toTimeZone('2024-01-01 18:00:00.333333'::DateTime64(6, 'Asia/Shanghai'), 
'Asia/Shanghai'))::Map(String, DateTime64(6, 'Asia/Shanghai')) as map_str_datetime64, + map('log_entry1', 'ERROR: 2024-02-15 10:30:00', 'log_entry2', 'INFO: 2024-02-15 10:31:00')::Map(String, String) as map_mixed_datetime, + + -- Different decimal values + map('tax', 8.25::Decimal(10,2), 'shipping', 12.99::Decimal(10,2), 'total', 199.99::Decimal(10,2))::Map(String, Decimal(10,2)) as map_str_decimal, + + map([5,6,7], 'different_array')::Map(Array(Int32), String) as map_array_str, + map('values1', [100,200], 'values2', [300,400,500])::Map(String, Array(Int32)) as map_str_array, + + map((2,'another'), 'another_tuple')::Map(Tuple(Int32, String), String) as map_tuple_str, + map('tuple_a', (200, 'test_a'), 'tuple_b', (300, 'test_b'))::Map(String, Tuple(Int32, String)) as map_str_tuple, + + map('db_config', map('host', 1, 'port', 5432), 'cache_config', map('ttl', 300, 'size', 1000))::Map(String, Map(String, Int32)) as map_nested, + + map('active', 'yes', 'inactive', NULL, 'pending', 'maybe')::Map(String, Nullable(String)) as map_nullable + UNION ALL + SELECT + 3 as row_id, + -- Edge cases and special values with multiple pairs + map('min_int', -2147483648, 'max_int', 2147483647, 'zero', 0)::Map(String, Int32) as map_str_int, + map(-50, 'negative_int', 0, 'zero_int', 1000000, 'million')::Map(Int32, String) as map_int_str, + map(true, 'always_true', false, 'always_false')::Map(Bool, String) as map_bool_str, + map('inf', 1.0/0.0, 'neg_inf', -1.0/0.0, 'nan', 0.0/0.0)::Map(String, Float64) as map_str_float, + + -- Extreme datetime values + map('epoch', toTimeZone('1970-01-01 00:00:00'::DateTime('Asia/Shanghai'), 'Asia/Shanghai'), 'y2k', toTimeZone('2000-01-01 00:00:00'::DateTime('Asia/Shanghai'), 'Asia/Shanghai'), 'future', toTimeZone('2099-12-31 23:59:59'::DateTime('Asia/Shanghai'), 'Asia/Shanghai'))::Map(String, DateTime('Asia/Shanghai')) as map_str_datetime, + map('past', '1900-01-01'::Date, 'present', today(), 'future', '2100-01-01'::Date)::Map(String, Date) as map_str_date, + map('epoch_precise', toTimeZone('1970-01-01 08:00:00.000001'::DateTime64(6, 'Asia/Shanghai'), 'Asia/Shanghai'), 'y2k_precise', toTimeZone('2000-01-01 00:00:00.999999'::DateTime64(6, 'Asia/Shanghai'), 'Asia/Shanghai'), 'future_precise', toTimeZone('2099-12-31 23:59:59.123456'::DateTime64(6, 'Asia/Shanghai'), 'Asia/Shanghai'))::Map(String, DateTime64(6, 'Asia/Shanghai')) as map_str_datetime64, + map('debug1', 'TRACE: 1970-01-01 00:00:01', 'debug2', 'DEBUG: 2038-01-19 03:14:07')::Map(String, String) as map_mixed_datetime, + + -- Extreme decimal values + map('min_decimal', 0.01::Decimal(10,2), 'max_decimal', 99999999.99::Decimal(10,2), 'zero_decimal', 0.00::Decimal(10,2))::Map(String, Decimal(10,2)) as map_str_decimal, + + map([], 'empty_array')::Map(Array(Int32), String) as map_array_str, + map('empty_val', [], 'single_val', [42], 'multi_val', [1,2,3,4,5])::Map(String, Array(Int32)) as map_str_array, + + map((0,'zero'), 'zero_tuple')::Map(Tuple(Int32, String), String) as map_tuple_str, + map('empty_like', (0, ''), 'full_like', (999, 'full_string'))::Map(String, Tuple(Int32, String)) as map_str_tuple, + + map('triple_nested', map('level2', 999))::Map(String, Map(String, Int32)) as map_nested, + + map('null_again', NULL)::Map(String, Nullable(String)) as map_nullable + ) + ORDER BY row_id + """, "DataFrame") + + # Verify we have 3 rows and 16 columns + self.assertEqual(len(ret), 3) + self.assertEqual(len(ret.columns), 16) + + # Test Row 1 - Basic primitive type combinations with multiple key-value pairs + # Map(String, Int32) 
with multiple pairs + map_str_int = ret.iloc[0]["map_str_int"] + self.assertIsInstance(map_str_int, dict) + self.assertEqual(len(map_str_int), 3) # Should have 3 key-value pairs + self.assertEqual(map_str_int['str_key1'], 42) + self.assertEqual(map_str_int['str_key2'], 100) + self.assertEqual(map_str_int['str_key3'], -50) + + # Map(Int32, String) with multiple pairs + map_int_str = ret.iloc[0]["map_int_str"] + self.assertIsInstance(map_int_str, dict) + self.assertEqual(len(map_int_str), 3) # Should have 3 key-value pairs + self.assertEqual(map_int_str[100], 'int_key1') + self.assertEqual(map_int_str[200], 'int_key2') + self.assertEqual(map_int_str[-10], 'negative_key') + + # Map(Bool, String) with both true and false keys + map_bool_str = ret.iloc[0]["map_bool_str"] + self.assertIsInstance(map_bool_str, dict) + self.assertEqual(len(map_bool_str), 2) # Should have 2 key-value pairs + self.assertEqual(map_bool_str[True], 'bool_true') + self.assertEqual(map_bool_str[False], 'bool_false') + + # Map(String, Float64) with multiple mathematical constants + map_str_float = ret.iloc[0]["map_str_float"] + self.assertIsInstance(map_str_float, dict) + self.assertEqual(len(map_str_float), 3) # Should have 3 key-value pairs + self.assertAlmostEqual(map_str_float['pi'], 3.14, places=2) + self.assertAlmostEqual(map_str_float['e'], 2.718, places=3) + self.assertAlmostEqual(map_str_float['phi'], 1.618, places=3) + + # Test DateTime and Date types as values + # Map(String, DateTime) with multiple datetime values + map_str_datetime = ret.iloc[0]["map_str_datetime"] + self.assertIsInstance(map_str_datetime, dict) + self.assertEqual(len(map_str_datetime), 3) # Should have 3 key-value pairs + # Verify datetime values (converted to python datetime objects with Shanghai timezone) + self.assertIsInstance(map_str_datetime['created'], datetime) + self.assertEqual(map_str_datetime['created'], datetime(2023, 1, 15, 10, 30, 0, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime['updated'], datetime) + self.assertEqual(map_str_datetime['updated'], datetime(2024, 3, 20, 14, 45, 30, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime['archived'], datetime) + self.assertEqual(map_str_datetime['archived'], datetime(2024, 12, 1, 9, 15, 0, tzinfo=self.shanghai_tz)) + + # Map(String, Date) with multiple date values + map_str_date = ret.iloc[0]["map_str_date"] + self.assertIsInstance(map_str_date, dict) + self.assertEqual(len(map_str_date), 3) # Should have 3 key-value pairs + # Verify date values (converted to python date objects) + self.assertIsInstance(map_str_date['birth_date'], date) + self.assertEqual(map_str_date['birth_date'], date(1990, 5, 15)) + self.assertIsInstance(map_str_date['start_date'], date) + self.assertEqual(map_str_date['start_date'], date(2020, 1, 1)) + self.assertIsInstance(map_str_date['end_date'], date) + self.assertEqual(map_str_date['end_date'], date(2025, 12, 31)) + + # Test DateTime64 with microsecond precision + # Map(String, DateTime64) with multiple datetime64 values + map_str_datetime64 = ret.iloc[0]["map_str_datetime64"] + self.assertIsInstance(map_str_datetime64, dict) + self.assertEqual(len(map_str_datetime64), 3) # Should have 3 key-value pairs + # Verify datetime64 values (converted to python datetime objects with Shanghai timezone and microseconds) + self.assertIsInstance(map_str_datetime64['precise_time1'], datetime) + self.assertEqual(map_str_datetime64['precise_time1'], datetime(2023, 1, 15, 10, 30, 0, 123456, tzinfo=self.shanghai_tz)) + 
self.assertIsInstance(map_str_datetime64['precise_time2'], datetime) + self.assertEqual(map_str_datetime64['precise_time2'], datetime(2024, 3, 20, 14, 45, 30, 987654, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime64['precise_time3'], datetime) + self.assertEqual(map_str_datetime64['precise_time3'], datetime(2024, 12, 1, 9, 15, 0, 555555, tzinfo=self.shanghai_tz)) + + # Map(String, String) with mixed datetime strings + map_mixed_datetime = ret.iloc[0]["map_mixed_datetime"] + self.assertIsInstance(map_mixed_datetime, dict) + self.assertEqual(len(map_mixed_datetime), 4) # Should have 4 key-value pairs + self.assertEqual(map_mixed_datetime['event_id'], '1001') + self.assertIn('2023-06-10 16:20:45', map_mixed_datetime['timestamp']) + + # Map(String, Decimal) with multiple decimal values + map_str_decimal = ret.iloc[0]["map_str_decimal"] + self.assertIsInstance(map_str_decimal, dict) + self.assertEqual(len(map_str_decimal), 3) # Should have 3 key-value pairs + # Verify decimal values (should be converted to float or Decimal) + self.assertAlmostEqual(float(map_str_decimal['price1']), 99.99, places=2) + self.assertAlmostEqual(float(map_str_decimal['price2']), 149.50, places=2) + self.assertAlmostEqual(float(map_str_decimal['discount']), 15.75, places=2) + + # Test Array as Key/Value types + # Map(Array(Int32), String) - Array as Key (non-hashable, uses keys/values structure) + map_array_str = ret.iloc[0]["map_array_str"] + self.assertIsInstance(map_array_str, dict) + # Non-hashable keys create {keys: [...], values: [...]} structure + self.assertIn('keys', map_array_str) + self.assertIn('values', map_array_str) + self.assertEqual(len(map_array_str['keys']), 1) + self.assertEqual(len(map_array_str['values']), 1) + # Verify the array key and its corresponding value + array_key = map_array_str['keys'][0] + self.assertIsInstance(array_key, list) + np.testing.assert_array_equal(array_key, [1, 2]) + self.assertEqual(map_array_str['values'][0], 'array_key') + + # Map(String, Array(Int32)) - Array as Value with multiple pairs (hashable key, normal dict) + map_str_array = ret.iloc[0]["map_str_array"] + self.assertIsInstance(map_str_array, dict) + self.assertEqual(len(map_str_array), 3) # Should have 3 key-value pairs + # Verify multiple array values + array_value1 = map_str_array['array_val1'] + self.assertIsInstance(array_value1, list) + np.testing.assert_array_equal(array_value1, [10, 20, 30]) + array_value2 = map_str_array['array_val2'] + self.assertIsInstance(array_value2, list) + np.testing.assert_array_equal(array_value2, [40, 50]) + empty_array = map_str_array['empty_array'] + self.assertIsInstance(empty_array, list) + self.assertEqual(len(empty_array), 0) + + # Test Tuple as Key/Value types + # Map(Tuple(Int32, String), String) - Tuple as Key (non-hashable, uses keys/values structure) + map_tuple_str = ret.iloc[0]["map_tuple_str"] + self.assertIsInstance(map_tuple_str, dict) + # Non-hashable keys create {keys: [...], values: [...]} structure + self.assertIn('keys', map_tuple_str) + self.assertIn('values', map_tuple_str) + self.assertEqual(len(map_tuple_str['keys']), 1) + self.assertEqual(len(map_tuple_str['values']), 1) + # Verify the tuple key and its corresponding value + tuple_key = map_tuple_str['keys'][0] + self.assertIsInstance(tuple_key, tuple) + self.assertEqual(map_tuple_str['values'][0], 'tuple_key') + + # Map(String, Tuple(Int32, String)) - Tuple as Value with multiple pairs (hashable key, normal dict) + map_str_tuple = ret.iloc[0]["map_str_tuple"] + 
self.assertIsInstance(map_str_tuple, dict) + self.assertEqual(len(map_str_tuple), 2) # Should have 2 key-value pairs + # Verify multiple tuple values + tuple_value1 = map_str_tuple['tuple_val1'] + self.assertIsInstance(tuple_value1, tuple) + self.assertEqual(tuple_value1, (100, 'data1')) + tuple_value2 = map_str_tuple['tuple_val2'] + self.assertIsInstance(tuple_value2, tuple) + self.assertEqual(tuple_value2, (200, 'data2')) + + # Test Nested Map with multiple entries - Map(String, Map(String, Int32)) + map_nested = ret.iloc[0]["map_nested"] + self.assertIsInstance(map_nested, dict) + self.assertEqual(len(map_nested), 2) # Should have 2 key-value pairs + # Verify first nested map + inner_map1 = map_nested['config1'] + self.assertIsInstance(inner_map1, dict) + self.assertEqual(inner_map1['timeout'], 30) + self.assertEqual(inner_map1['retries'], 3) + # Verify second nested map + inner_map2 = map_nested['config2'] + self.assertIsInstance(inner_map2, dict) + self.assertEqual(inner_map2['timeout'], 60) + self.assertEqual(inner_map2['retries'], 5) + + # Test Nullable Value with multiple pairs - Map(String, Nullable(String)) + map_nullable = ret.iloc[0]["map_nullable"] + self.assertIsInstance(map_nullable, dict) + self.assertEqual(len(map_nullable), 3) # Should have 3 key-value pairs + # Verify mixed null and non-null values + self.assertTrue(pd.isna(map_nullable['nullable1'])) + self.assertEqual(map_nullable['nullable2'], 'has_value') + self.assertTrue(pd.isna(map_nullable['nullable3'])) + + # Test Row 2 - Different values with multiple pairs + # Test Map(String, Int32) with different data (hashable key -> normal dict) + map_str_int_2 = ret.iloc[1]["map_str_int"] + self.assertIsInstance(map_str_int_2, dict) + self.assertEqual(len(map_str_int_2), 3) # Should have 3 key-value pairs + self.assertEqual(map_str_int_2['key_a'], 999) + self.assertEqual(map_str_int_2['key_b'], 888) + self.assertEqual(map_str_int_2['key_c'], 777) + + # Test Map(Bool, String) with both keys (hashable key -> normal dict) + map_bool_str_2 = ret.iloc[1]["map_bool_str"] + self.assertIsInstance(map_bool_str_2, dict) + self.assertEqual(len(map_bool_str_2), 2) # Should have 2 key-value pairs + self.assertEqual(map_bool_str_2[False], 'false_key') + self.assertEqual(map_bool_str_2[True], 'true_key') + + # Test DateTime values in row 2 + map_str_datetime_2 = ret.iloc[1]["map_str_datetime"] + self.assertIsInstance(map_str_datetime_2, dict) + self.assertEqual(len(map_str_datetime_2), 3) # Should have 3 key-value pairs + self.assertIsInstance(map_str_datetime_2['morning'], datetime) + self.assertEqual(map_str_datetime_2['morning'], datetime(2024, 1, 1, 8, 0, 0, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime_2['noon'], datetime) + self.assertEqual(map_str_datetime_2['noon'], datetime(2024, 1, 1, 12, 0, 0, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime_2['evening'], datetime) + self.assertEqual(map_str_datetime_2['evening'], datetime(2024, 1, 1, 18, 0, 0, tzinfo=self.shanghai_tz)) + + # Test Date values in row 2 + map_str_date_2 = ret.iloc[1]["map_str_date"] + self.assertIsInstance(map_str_date_2, dict) + self.assertEqual(len(map_str_date_2), 3) # Should have 3 key-value pairs + self.assertIsInstance(map_str_date_2['monday'], date) + self.assertEqual(map_str_date_2['monday'], date(2024, 1, 1)) + self.assertIsInstance(map_str_date_2['friday'], date) + self.assertEqual(map_str_date_2['friday'], date(2024, 1, 5)) + self.assertIsInstance(map_str_date_2['sunday'], date) + 
self.assertEqual(map_str_date_2['sunday'], date(2024, 1, 7)) + + # Test DateTime64 values in row 2 + map_str_datetime64_2 = ret.iloc[1]["map_str_datetime64"] + self.assertIsInstance(map_str_datetime64_2, dict) + self.assertEqual(len(map_str_datetime64_2), 3) # Should have 3 key-value pairs + self.assertIsInstance(map_str_datetime64_2['morning_precise'], datetime) + self.assertEqual(map_str_datetime64_2['morning_precise'], datetime(2024, 1, 1, 8, 0, 0, 111111, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime64_2['noon_precise'], datetime) + self.assertEqual(map_str_datetime64_2['noon_precise'], datetime(2024, 1, 1, 12, 0, 0, 222222, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime64_2['evening_precise'], datetime) + self.assertEqual(map_str_datetime64_2['evening_precise'], datetime(2024, 1, 1, 18, 0, 0, 333333, tzinfo=self.shanghai_tz)) + + # Test Decimal values in row 2 + map_str_decimal_2 = ret.iloc[1]["map_str_decimal"] + self.assertIsInstance(map_str_decimal_2, dict) + self.assertEqual(len(map_str_decimal_2), 3) # Should have 3 key-value pairs + self.assertAlmostEqual(float(map_str_decimal_2['tax']), 8.25, places=2) + self.assertAlmostEqual(float(map_str_decimal_2['shipping']), 12.99, places=2) + self.assertAlmostEqual(float(map_str_decimal_2['total']), 199.99, places=2) + + # Test Map with nullable that has mixed values (hashable key -> normal dict) + map_nullable_2 = ret.iloc[1]["map_nullable"] + self.assertIsInstance(map_nullable_2, dict) + self.assertEqual(len(map_nullable_2), 3) # Should have 3 key-value pairs + self.assertEqual(map_nullable_2['active'], 'yes') + self.assertTrue(pd.isna(map_nullable_2['inactive'])) + self.assertEqual(map_nullable_2['pending'], 'maybe') + + # Test Array as key in row 2 (non-hashable -> keys/values structure) + map_array_str_2 = ret.iloc[1]["map_array_str"] + self.assertIn('keys', map_array_str_2) + self.assertIn('values', map_array_str_2) + array_key_2 = map_array_str_2['keys'][0] + np.testing.assert_array_equal(array_key_2, [5, 6, 7]) + self.assertEqual(map_array_str_2['values'][0], 'different_array') + + # Test Array values in row 2 with multiple pairs + map_str_array_2 = ret.iloc[1]["map_str_array"] + self.assertIsInstance(map_str_array_2, dict) + self.assertEqual(len(map_str_array_2), 2) # Should have 2 key-value pairs + np.testing.assert_array_equal(map_str_array_2['values1'], [100, 200]) + np.testing.assert_array_equal(map_str_array_2['values2'], [300, 400, 500]) + + # Test Row 3 - Edge cases and special values with multiple pairs + # Test extreme integer values (hashable keys -> normal dict) + map_str_int_3 = ret.iloc[2]["map_str_int"] + self.assertIsInstance(map_str_int_3, dict) + self.assertEqual(len(map_str_int_3), 3) # Should have 3 key-value pairs + self.assertEqual(map_str_int_3['min_int'], -2147483648) + self.assertEqual(map_str_int_3['max_int'], 2147483647) + self.assertEqual(map_str_int_3['zero'], 0) + + map_int_str_3 = ret.iloc[2]["map_int_str"] + self.assertIsInstance(map_int_str_3, dict) + self.assertEqual(len(map_int_str_3), 3) # Should have 3 key-value pairs + self.assertEqual(map_int_str_3[-50], 'negative_int') + self.assertEqual(map_int_str_3[0], 'zero_int') + self.assertEqual(map_int_str_3[1000000], 'million') + + # Test special float values (infinity, negative infinity, NaN) + map_str_float_3 = ret.iloc[2]["map_str_float"] + self.assertIsInstance(map_str_float_3, dict) + self.assertEqual(len(map_str_float_3), 3) # Should have 3 key-value pairs + 
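The special values checked next come straight from the ClickHouse expressions used in the SELECT above (1.0/0.0, -1.0/0.0, 0.0/0.0); a minimal standalone sketch of the same round trip, assuming chdb is installed (the column aliases are illustrative):

    # Float64 division by zero in ClickHouse yields IEEE inf/-inf/nan, and these
    # values survive the DataFrame round trip; verified with math.isinf/isnan.
    import math
    import chdb

    df = chdb.query(
        "SELECT 1.0/0.0 AS pos_inf, -1.0/0.0 AS neg_inf, 0.0/0.0 AS not_a_num",
        "DataFrame",
    )
    row = df.iloc[0]
    print(math.isinf(row["pos_inf"]), row["pos_inf"] > 0)   # True True
    print(math.isinf(row["neg_inf"]), row["neg_inf"] < 0)   # True True
    print(math.isnan(row["not_a_num"]))                     # True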
self.assertTrue(math.isinf(map_str_float_3['inf'])) + self.assertTrue(map_str_float_3['inf'] > 0) # Positive infinity + self.assertTrue(math.isinf(map_str_float_3['neg_inf'])) + self.assertTrue(map_str_float_3['neg_inf'] < 0) # Negative infinity + self.assertTrue(math.isnan(map_str_float_3['nan'])) + + # Test extreme datetime values in row 3 + map_str_datetime_3 = ret.iloc[2]["map_str_datetime"] + self.assertIsInstance(map_str_datetime_3, dict) + self.assertEqual(len(map_str_datetime_3), 3) # Should have 3 key-value pairs + self.assertIsInstance(map_str_datetime_3['epoch'], datetime) + self.assertEqual(map_str_datetime_3['epoch'], datetime(1970, 1, 1, 8, 0, 0, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime_3['y2k'], datetime) + print(map_str_datetime_3['y2k']) + self.assertEqual(map_str_datetime_3['y2k'], datetime(2000, 1, 1, 0, 0, 0, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime_3['future'], datetime) + self.assertEqual(map_str_datetime_3['future'], datetime(2099, 12, 31, 23, 59, 59, tzinfo=self.shanghai_tz)) + + # Test extreme date values in row 3 + map_str_date_3 = ret.iloc[2]["map_str_date"] + self.assertIsInstance(map_str_date_3, dict) + self.assertEqual(len(map_str_date_3), 3) # Should have 3 key-value pairs + self.assertIsInstance(map_str_date_3['past'], date) + self.assertEqual(map_str_date_3['past'], date(1970, 1, 1)) + self.assertIsInstance(map_str_date_3['present'], date) + # Note: 'present' uses today() so we just check it's a date, not exact value + self.assertIsInstance(map_str_date_3['future'], date) + self.assertEqual(map_str_date_3['future'], date(2100, 1, 1)) + + # Test extreme DateTime64 values in row 3 + map_str_datetime64_3 = ret.iloc[2]["map_str_datetime64"] + self.assertIsInstance(map_str_datetime64_3, dict) + self.assertEqual(len(map_str_datetime64_3), 3) # Should have 3 key-value pairs + self.assertIsInstance(map_str_datetime64_3['epoch_precise'], datetime) + self.assertEqual(map_str_datetime64_3['epoch_precise'], datetime(1970, 1, 1, 8, 0, 0, 1, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime64_3['y2k_precise'], datetime) + self.assertEqual(map_str_datetime64_3['y2k_precise'], datetime(2000, 1, 1, 0, 0, 0, 999999, tzinfo=self.shanghai_tz)) + self.assertIsInstance(map_str_datetime64_3['future_precise'], datetime) + self.assertEqual(map_str_datetime64_3['future_precise'], datetime(2099, 12, 31, 23, 59, 59, 123456, tzinfo=self.shanghai_tz)) + + # Test extreme decimal values in row 3 + map_str_decimal_3 = ret.iloc[2]["map_str_decimal"] + self.assertIsInstance(map_str_decimal_3, dict) + self.assertEqual(len(map_str_decimal_3), 3) # Should have 3 key-value pairs + self.assertAlmostEqual(float(map_str_decimal_3['min_decimal']), 0.01, places=2) + self.assertAlmostEqual(float(map_str_decimal_3['max_decimal']), 99999999.99, places=2) + self.assertAlmostEqual(float(map_str_decimal_3['zero_decimal']), 0.00, places=2) + + # Test Array values in row 3 with multiple pairs including edge cases + map_str_array_3 = ret.iloc[2]["map_str_array"] + self.assertIsInstance(map_str_array_3, dict) + self.assertEqual(len(map_str_array_3), 3) # Should have 3 key-value pairs + # Empty array + empty_array = map_str_array_3['empty_val'] + self.assertIsInstance(empty_array, list) + self.assertEqual(len(empty_array), 0) + # Single element array + single_array = map_str_array_3['single_val'] + self.assertIsInstance(single_array, list) + np.testing.assert_array_equal(single_array, [42]) + # Multi element array + multi_array = 
map_str_array_3['multi_val'] + self.assertIsInstance(multi_array, list) + np.testing.assert_array_equal(multi_array, [1, 2, 3, 4, 5]) + + # Test Tuple values in row 3 with multiple pairs + map_str_tuple_3 = ret.iloc[2]["map_str_tuple"] + self.assertIsInstance(map_str_tuple_3, dict) + self.assertEqual(len(map_str_tuple_3), 2) # Should have 2 key-value pairs + # Empty-like tuple + empty_like_tuple = map_str_tuple_3['empty_like'] + self.assertIsInstance(empty_like_tuple, tuple) + self.assertEqual(empty_like_tuple, (0, '')) + # Full tuple + full_like_tuple = map_str_tuple_3['full_like'] + self.assertIsInstance(full_like_tuple, tuple) + self.assertEqual(full_like_tuple, (999, 'full_string')) + + # Test empty arrays (non-hashable key -> keys/values structure) + map_array_str_3 = ret.iloc[2]["map_array_str"] + self.assertIn('keys', map_array_str_3) + self.assertIn('values', map_array_str_3) + empty_array_key = map_array_str_3['keys'][0] + self.assertIsInstance(empty_array_key, list) + self.assertEqual(len(empty_array_key), 0) # Empty array + self.assertEqual(map_array_str_3['values'][0], 'empty_array') + + # Test empty array as value (hashable key -> normal dict) + map_str_array_3 = ret.iloc[2]["map_str_array"] + empty_array_value = map_str_array_3['empty_val'] + self.assertIsInstance(empty_array_value, list) + self.assertEqual(len(empty_array_value), 0) + + # Comprehensive type validation for all Map variations + for i in range(len(ret)): + # Verify all Maps return dict objects + for col in ['map_str_int', 'map_int_str', 'map_bool_str', 'map_str_float', + 'map_array_str', 'map_str_array', 'map_tuple_str', 'map_str_tuple', + 'map_nested', 'map_nullable']: + map_value = ret.iloc[i][col] + self.assertIsInstance(map_value, dict, f"Row {i}, column {col} should be dict") + + # Verify Map(String, Int32) key-value types + str_int_map = ret.iloc[i]["map_str_int"] + for key, value in str_int_map.items(): + self.assertIsInstance(key, str, f"Row {i} map_str_int key should be string") + self.assertIsInstance(value, (int, np.integer), f"Row {i} map_str_int value should be integer") + + # Verify Map(Int32, String) key-value types + int_str_map = ret.iloc[i]["map_int_str"] + for key, value in int_str_map.items(): + self.assertIsInstance(key, (int, np.integer), f"Row {i} map_int_str key should be integer") + self.assertIsInstance(value, str, f"Row {i} map_int_str value should be string") + + # Verify Map(Bool, String) key-value types + bool_str_map = ret.iloc[i]["map_bool_str"] + for key, value in bool_str_map.items(): + self.assertIsInstance(key, (bool, np.bool_), f"Row {i} map_bool_str key should be bool") + self.assertIsInstance(value, str, f"Row {i} map_bool_str value should be string") + + # Verify data types - All Map types should be mapped to object dtype in pandas + expected_types = { + "row_id": "uint8", + "map_str_int": "object", # Map(String, Int32) mapped to object dtype + "map_int_str": "object", # Map(Int32, String) mapped to object dtype + "map_bool_str": "object", # Map(Bool, String) mapped to object dtype + "map_str_float": "object", # Map(String, Float64) mapped to object dtype + "map_array_str": "object", # Map(Array(Int32), String) mapped to object dtype + "map_str_array": "object", # Map(String, Array(Int32)) mapped to object dtype + "map_tuple_str": "object", # Map(Tuple(Int32, String), String) mapped to object dtype + "map_str_tuple": "object", # Map(String, Tuple(Int32, String)) mapped to object dtype + "map_nested": "object", # Map(String, Map(String, Int32)) mapped to object dtype + 
"map_nullable": "object" # Map(String, Nullable(String)) mapped to object dtype + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type, f"Column {col} type mismatch") + + # Test Map functions and operations + map_ops_ret = self.session.query(""" + SELECT + map('a', 1, 'b', 2, 'c', 3) as test_map, + mapKeys(map('x', 10, 'y', 20)) as map_keys, + mapValues(map('p', 100, 'q', 200)) as map_values, + length(map('one', 1, 'two', 2, 'three', 3)) as map_length + """, "DataFrame") + + test_map = map_ops_ret.iloc[0]["test_map"] + self.assertIsInstance(test_map, dict) + self.assertEqual(len(test_map), 3) + + # mapKeys should return an array + map_keys = map_ops_ret.iloc[0]["map_keys"] + self.assertIsInstance(map_keys, np.ndarray) + self.assertEqual(len(map_keys), 2) + self.assertIn('x', map_keys) + self.assertIn('y', map_keys) + + # mapValues should return an array + map_values = map_ops_ret.iloc[0]["map_values"] + self.assertIsInstance(map_values, np.ndarray) + self.assertEqual(len(map_values), 2) + self.assertIn(100, map_values) + self.assertIn(200, map_values) + + # length should return integer + map_length = map_ops_ret.iloc[0]["map_length"] + self.assertEqual(map_length, 3) + + +if __name__ == "__main__": + unittest.main() diff --git a/tests/test_dataframe_column_types_2.py b/tests/test_dataframe_column_types_2.py new file mode 100644 index 00000000000..715d30bd5fa --- /dev/null +++ b/tests/test_dataframe_column_types_2.py @@ -0,0 +1,1357 @@ +#!/usr/bin/env python3 + +import unittest +import uuid +import pandas as pd +import chdb +import json +import numpy as np +import datetime +from datetime import date, timedelta + + +class TestDataFrameColumnTypesTwo(unittest.TestCase): + + def setUp(self): + self.session = chdb.session.Session() + + def tearDown(self): + self.session.close() + + def test_variant_types(self): + """Test Variant type with mixed data types""" + # Enable suspicious variant types to allow similar types like Int32 and Float64 + self.session.query("SET allow_suspicious_variant_types = 1") + + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + NULL::Variant(UInt64, String, Array(UInt64)) as variant_basic, + NULL::Variant(Float64, String, Bool) as variant_mixed, + NULL::Variant(String, Array(String), Tuple(String, Int32)) as variant_complex + UNION ALL + SELECT + 2 as row_id, + 42::Variant(UInt64, String, Array(UInt64)) as variant_basic, + -100.0::Variant(Float64, String, Bool) as variant_mixed, + 'Hello World'::Variant(String, Array(String), Tuple(String, Int32)) as variant_complex + UNION ALL + SELECT + 3 as row_id, + 'Hello, World!'::Variant(UInt64, String, Array(UInt64)) as variant_basic, + 3.14159::Variant(Float64, String, Bool) as variant_mixed, + ['a', 'b', 'c']::Variant(String, Array(String), Tuple(String, Int32)) as variant_complex + UNION ALL + SELECT + 4 as row_id, + [1, 2, 3]::Variant(UInt64, String, Array(UInt64)) as variant_basic, + true::Variant(Float64, String, Bool) as variant_mixed, + ('tuple_str', 123)::Variant(String, Array(String), Tuple(String, Int32)) as variant_complex + UNION ALL + SELECT + 5 as row_id, + 9223372036854775807::Variant(UInt64, String, Array(UInt64)) as variant_basic, + 'mixed_string'::Variant(Float64, String, Bool) as variant_mixed, + 'Simple String'::Variant(String, Array(String), Tuple(String, Int32)) as variant_complex + UNION ALL + SELECT + 6 as row_id, + 'Another String'::Variant(UInt64, String, Array(UInt64)) as variant_basic, + 
false::Variant(Float64, String, Bool) as variant_mixed, + ['x', 'y']::Variant(String, Array(String), Tuple(String, Int32)) as variant_complex + UNION ALL + SELECT + 7 as row_id, + [10, 20, 30, 40]::Variant(UInt64, String, Array(UInt64)) as variant_basic, + -2.71828::Variant(Float64, String, Bool) as variant_mixed, + ('another', 456)::Variant(String, Array(String), Tuple(String, Int32)) as variant_complex + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Verify we have 7 rows and 4 columns + self.assertEqual(len(ret), 7) + self.assertEqual(len(ret.columns), 4) + + # Test first row - all NULL values + self.assertTrue(pd.isna(ret.iloc[0]["variant_basic"])) + self.assertTrue(pd.isna(ret.iloc[0]["variant_mixed"])) + self.assertTrue(pd.isna(ret.iloc[0]["variant_complex"])) + + # Test second row - basic types (UInt64, Float64, String) + self.assertEqual(ret.iloc[1]["variant_basic"], 42) # UInt64 + self.assertEqual(ret.iloc[1]["variant_mixed"], -100.0) # Float64 + self.assertEqual(ret.iloc[1]["variant_complex"], "Hello World") # String + + # Test third row - different types (String, Float64, Array(String)) + self.assertEqual(ret.iloc[2]["variant_basic"], "Hello, World!") # String + self.assertAlmostEqual(ret.iloc[2]["variant_mixed"], 3.14159, places=5) # Float64 + # Array may be returned as numpy array or list + array_val = ret.iloc[2]["variant_complex"] + if isinstance(array_val, np.ndarray): + np.testing.assert_array_equal(array_val, ['a', 'b', 'c']) + else: + self.assertEqual(array_val, ['a', 'b', 'c']) + + # Test fourth row - Array(UInt64), Bool, Tuple(String, Int32) + array_basic = ret.iloc[3]["variant_basic"] + if isinstance(array_basic, np.ndarray): + np.testing.assert_array_equal(array_basic, [1, 2, 3]) + else: + self.assertEqual(array_basic, [1, 2, 3]) + self.assertEqual(ret.iloc[3]["variant_mixed"], True) # Bool + # Tuple may be returned as numpy array or tuple + tuple_val = ret.iloc[3]["variant_complex"] + if isinstance(tuple_val, np.ndarray): + self.assertEqual(tuple_val[0], 'tuple_str') + self.assertEqual(tuple_val[1], 123) + else: + self.assertEqual(tuple_val, ('tuple_str', 123)) + + # Test fifth row - large UInt64, String, String + self.assertEqual(ret.iloc[4]["variant_basic"], 9223372036854775807) # Large UInt64 + self.assertEqual(ret.iloc[4]["variant_mixed"], "mixed_string") # String + self.assertEqual(ret.iloc[4]["variant_complex"], "Simple String") # String + + # Test sixth row - String, Bool, Array(String) + self.assertEqual(ret.iloc[5]["variant_basic"], "Another String") # String + self.assertEqual(ret.iloc[5]["variant_mixed"], False) # Bool + array_val_6 = ret.iloc[5]["variant_complex"] + if isinstance(array_val_6, np.ndarray): + np.testing.assert_array_equal(array_val_6, ['x', 'y']) + else: + self.assertEqual(array_val_6, ['x', 'y']) + + # Test seventh row - Array(UInt64), Float64, Tuple(String, Int32) + array_val_7 = ret.iloc[6]["variant_basic"] + if isinstance(array_val_7, np.ndarray): + np.testing.assert_array_equal(array_val_7, [10, 20, 30, 40]) + else: + self.assertEqual(array_val_7, [10, 20, 30, 40]) + self.assertAlmostEqual(ret.iloc[6]["variant_mixed"], -2.71828, places=5) # Float64 + tuple_val_7 = ret.iloc[6]["variant_complex"] + if isinstance(tuple_val_7, np.ndarray): + self.assertEqual(tuple_val_7[0], 'another') + self.assertEqual(tuple_val_7[1], 456) + else: + self.assertEqual(tuple_val_7, ('another', 456)) + + # Data type 
validation - Variant types mapped to object in pandas + expected_types = { + "row_id": "uint8", + "variant_basic": "object", # Variant(UInt64, String, Array(UInt64)) + "variant_mixed": "object", # Variant(Float64, String, Bool) + "variant_complex": "object" # Variant(String, Array(String), Tuple(String, Int32)) + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_dynamic_types(self): + """Test Dynamic type with schema evolution""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + 'Static String'::Dynamic as dynamic_string, + 100::Dynamic as dynamic_number, + [1, 2, 3, 4, 5]::Dynamic as dynamic_array, + 'Alice'::Dynamic as dynamic_object, + NULL::Dynamic as dynamic_null + UNION ALL + SELECT + 2 as row_id, + 'Another Dynamic String'::Dynamic as dynamic_string, + -500::Dynamic as dynamic_number, + ['x', 'y', 'z', 'w']::Dynamic as dynamic_array, + 'Engineer'::Dynamic as dynamic_object, + 'Now not null'::Dynamic as dynamic_null + UNION ALL + SELECT + 3 as row_id, + 'evolvedSchema'::Dynamic as dynamic_string, -- Schema evolution: string in string field + [10, 20, 30]::Dynamic as dynamic_number, -- Schema evolution: array in number field + 'Now a string'::Dynamic as dynamic_array, -- Schema evolution: string in array field + 42::Dynamic as dynamic_object, -- Schema evolution: number in object field + 'nested_value'::Dynamic as dynamic_null -- Schema evolution: string value + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - initial dynamic types + self.assertEqual(ret.iloc[0]["dynamic_string"], "Static String") + self.assertEqual(ret.iloc[0]["dynamic_number"], '100') + self.assertIn("[1,2,3,4,5]", str(ret.iloc[0]["dynamic_array"]).replace(" ", "")) + self.assertEqual(ret.iloc[0]["dynamic_object"], "Alice") + self.assertTrue(pd.isna(ret.iloc[0]["dynamic_null"])) + + # Test second row - different dynamic values + self.assertEqual(ret.iloc[1]["dynamic_string"], "Another Dynamic String") + self.assertEqual(ret.iloc[1]["dynamic_number"], '-500') + self.assertIn("['x','y','z','w']", str(ret.iloc[1]["dynamic_array"]).replace(" ", "").replace('"', "'")) + self.assertEqual(ret.iloc[1]["dynamic_object"], "Engineer") + self.assertEqual(ret.iloc[1]["dynamic_null"], "Now not null") + + # Test third row - schema evolution + self.assertEqual(ret.iloc[2]["dynamic_string"], "evolvedSchema") + self.assertIn("[10,20,30]", str(ret.iloc[2]["dynamic_number"]).replace(" ", "")) + self.assertEqual(ret.iloc[2]["dynamic_array"], "Now a string") + self.assertEqual(ret.iloc[2]["dynamic_object"], '42') + self.assertEqual(ret.iloc[2]["dynamic_null"], "nested_value") + + # Data type validation - Dynamic types may be mapped to object in pandas + expected_types = { + "dynamic_string": "object", + "dynamic_number": "object", + "dynamic_array": "object", + "dynamic_object": "object", + "dynamic_null": "object" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_json_types(self): + """Test JSON type with complex nested structures""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + '{"name": "John", "age": 30, "city": "New York"}'::JSON as json_simple, + '{"items": [1, 2, 3], "metadata": {"created": "2023-01-01", "version": 
1.0}}'::JSON as json_nested, + '{"array": [1, 2, 3, 4, 5]}'::JSON as json_array, + '{"active": true, "score": null, "tags": ["urgent", "new"]}'::JSON as json_mixed, + 'null'::JSON as json_null, + '{"value": "simple string"}'::JSON as json_string, + '{"value": 42}'::JSON as json_number, + '{"value": true}'::JSON as json_boolean + UNION ALL + SELECT + 2 as row_id, + '{"product": "laptop", "price": 1299.99, "in_stock": true}'::JSON as json_simple, + '{"users": [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}], "total": 2}'::JSON as json_nested, + '{"mixed": ["text", 3.14, true, null]}'::JSON as json_array, + '{"config": {"timeout": 30, "retries": 3}, "enabled": false}'::JSON as json_mixed, + '{"complex": null, "array": [null, {"nested": null}]}'::JSON as json_null, + '{"value": "Unicode: 🌍 éñáíóú"}'::JSON as json_string, + '{"value": -123.456}'::JSON as json_number, + '{"value": false}'::JSON as json_boolean + UNION ALL + SELECT + 3 as row_id, + '{"very": {"deeply": {"nested": {"structure": {"with": {"many": {"levels": "value"}}}}}}}'::JSON as json_simple, + '{"matrix": [[1, 2], [3, 4]], "config": {"debug": false, "timeout": 30}}'::JSON as json_nested, + '{"objects": [{"a": 1}, {"b": 2}, {"c": [1, 2, 3]}]}'::JSON as json_array, + '{"key1": "value1", "key2": 42, "key3": [1, 2, 3], "key4": {"nested": true}}'::JSON as json_mixed, + '{"nullValue": null, "notNull": "value", "arrayWithNull": [1, null, 3]}'::JSON as json_null, + '{"value": "Special chars with escapes"}'::JSON as json_string, + '{"value": 1.7976931348623157e+308}'::JSON as json_number, + '{"value": true}'::JSON as json_boolean + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - basic JSON structures + json_simple_1 = ret.iloc[0]["json_simple"] + self.assertEqual(json_simple_1['name'], 'John') + self.assertEqual(json_simple_1['age'], 30) + self.assertEqual(json_simple_1['city'], 'New York') + + json_nested_1 = ret.iloc[0]["json_nested"] + self.assertEqual(json_nested_1['items'], [1, 2, 3]) + self.assertIn('metadata', json_nested_1) + created_date = json_nested_1['metadata']['created'] + self.assertIsInstance(created_date, datetime.date) + self.assertEqual(created_date, datetime.date(2023, 1, 1)) + + json_array_1 = ret.iloc[0]["json_array"] + self.assertEqual(json_array_1['array'], [1, 2, 3, 4, 5]) + + json_mixed_1 = ret.iloc[0]["json_mixed"] + self.assertIsInstance(json_mixed_1['active'], bool) + self.assertTrue(json_mixed_1['active']) + self.assertNotIn('score', json_mixed_1) # null values don't create keys + self.assertEqual(json_mixed_1['tags'], ['urgent', 'new']) + + self.assertIsNone(ret.iloc[0]["json_null"]) + json_string_1 = ret.iloc[0]["json_string"] + self.assertEqual(json_string_1['value'], 'simple string') + json_number_1 = ret.iloc[0]["json_number"] + self.assertEqual(json_number_1['value'], 42) + json_boolean_1 = ret.iloc[0]["json_boolean"] + self.assertEqual(json_boolean_1['value'], True) + + # Test second row - more complex JSON structures + json_simple_2 = ret.iloc[1]["json_simple"] + self.assertEqual(json_simple_2['product'], 'laptop') + self.assertEqual(json_simple_2['price'], 1299.99) + self.assertEqual(json_simple_2['in_stock'], True) + + json_nested_2 = ret.iloc[1]["json_nested"] + expected_users = [{"id": 1, "name": "Alice"}, {"id": 2, "name": "Bob"}] + self.assertEqual(json_nested_2['users'], expected_users) + self.assertEqual(json_nested_2['total'], 2) + + 
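The null handling these JSON assertions depend on (a member whose value is JSON null simply does not appear as a key) can be seen in isolation with a minimal sketch, assuming chdb is installed and the JSON type is enabled as in the queries above:

    # JSON cells come back as Python dicts; null members are omitted rather than
    # returned as None, which is why the tests use assertNotIn for them.
    import chdb

    df = chdb.query("""SELECT '{"a": 1, "b": null}'::JSON AS j""", "DataFrame")
    j = df.iloc[0]["j"]
    print(j)           # expected: {'a': 1}
    print("b" in j)    # expected: False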
json_array_2 = ret.iloc[1]["json_array"] + expected_mixed = ['text', 3.14, True, None] + self.assertEqual(json_array_2['mixed'], expected_mixed) + + json_mixed_2 = ret.iloc[1]["json_mixed"] + self.assertEqual(json_mixed_2['config']['timeout'], 30) + self.assertEqual(json_mixed_2['config']['retries'], 3) + self.assertEqual(json_mixed_2['enabled'], False) + + json_null_2 = ret.iloc[1]["json_null"] + self.assertNotIn('complex', json_null_2) # complex is null, so key doesn't exist + expected_array = [None, None] + self.assertEqual(json_null_2['array'], expected_array) + + json_string_2 = ret.iloc[1]["json_string"] + self.assertIn("Unicode", json_string_2['value']) + self.assertIn("🌍", json_string_2['value']) + json_number_2 = ret.iloc[1]["json_number"] + self.assertEqual(json_number_2['value'], -123.456) + + json_boolean_2 = ret.iloc[1]["json_boolean"] + self.assertEqual(json_boolean_2['value'], False) + + # Test third row - very complex and edge cases + json_simple_3 = ret.iloc[2]["json_simple"] + self.assertEqual(json_simple_3['very']['deeply']['nested']['structure']['with']['many']['levels'], 'value') + + json_nested_3 = ret.iloc[2]["json_nested"] + self.assertEqual(json_nested_3['matrix'], [[1, 2], [3, 4]]) + self.assertEqual(json_nested_3['config']['debug'], False) + self.assertEqual(json_nested_3['config']['timeout'], 30) + + json_array_3 = ret.iloc[2]["json_array"] + expected_objects = [{'a': 1}, {'b': 2}, {'c': [1, 2, 3]}] + self.assertEqual(json_array_3['objects'], expected_objects) + + json_mixed_3 = ret.iloc[2]["json_mixed"] + self.assertEqual(json_mixed_3['key1'], 'value1') + self.assertEqual(json_mixed_3['key2'], 42) + self.assertEqual(json_mixed_3['key3'], [1, 2, 3]) + self.assertEqual(json_mixed_3['key4']['nested'], True) + + json_null_3 = ret.iloc[2]["json_null"] + self.assertNotIn('nullValue', json_null_3) # null values don't create keys + self.assertEqual(json_null_3['notNull'], 'value') + self.assertEqual(json_null_3['arrayWithNull'], [1, None, 3]) + + json_string_3 = ret.iloc[2]["json_string"] + self.assertIn("Special chars with escapes", json_string_3['value']) + + json_number_3 = ret.iloc[2]["json_number"] + self.assertEqual(json_number_3['value'], 1.7976931348623157e+308) # Large number representation + + json_boolean_3 = ret.iloc[2]["json_boolean"] + self.assertEqual(json_boolean_3['value'], True) + + # Data type validation - JSON types mapped to object in pandas + expected_types = { + "row_id": "uint8", + "json_simple": "object", + "json_nested": "object", + "json_array": "object", + "json_mixed": "object", + "json_null": "object", + "json_string": "object", + "json_number": "object", + "json_boolean": "object" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_nested_types(self): + """Test Nested type with structured data""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + [(1, 'Alice', 25.5), (2, 'Bob', 30.0), (3, 'Charlie', 35.5)]::Array(Tuple(UInt32, String, Float64))::Nested(id UInt32, name String, salary Float64) as employees, + [(100, 'Engineering'), (200, 'Marketing'), (300, 'Finance')]::Array(Tuple(UInt32, String))::Nested(dept_id UInt32, dept_name String) as departments, + [('2023-01-01', 1000.0), ('2023-02-01', 1500.0)]::Array(Tuple(Date, Float64))::Nested(date Date, amount Float64) as transactions + UNION ALL + SELECT + 2 as row_id, + [(4, 'Diana', 45.0), (5, 'Eve', 28.5)]::Array(Tuple(UInt32, String, Float64))::Nested(id UInt32, 
name String, salary Float64) as employees, + [(400, 'Sales'), (500, 'HR')]::Array(Tuple(UInt32, String))::Nested(dept_id UInt32, dept_name String) as departments, + [('2023-03-01', 2000.0), ('2023-04-01', 2500.0), ('2023-05-01', 1800.0)]::Array(Tuple(Date, Float64))::Nested(date Date, amount Float64) as transactions + UNION ALL + SELECT + 3 as row_id, + [(6, 'Frank', 55.0)]::Array(Tuple(UInt32, String, Float64))::Nested(id UInt32, name String, salary Float64) as employees, + [(600, 'Operations')]::Array(Tuple(UInt32, String))::Nested(dept_id UInt32, dept_name String) as departments, + [('2023-06-01', 3000.0)]::Array(Tuple(Date, Float64))::Nested(date Date, amount Float64) as transactions + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - multiple employees (Nested types are returned as numpy arrays of tuples) + employees_1 = ret.iloc[0]["employees"] + # Extract values by iterating through tuples and accessing by position + ids_1 = [emp[0] for emp in employees_1] + names_1 = [emp[1] for emp in employees_1] + salaries_1 = [emp[2] for emp in employees_1] + self.assertEqual(ids_1, [1, 2, 3]) + self.assertEqual(names_1, ['Alice', 'Bob', 'Charlie']) + self.assertEqual(salaries_1, [25.5, 30.0, 35.5]) + + departments_1 = ret.iloc[0]["departments"] + # Extract dept_id and dept_name by position + dept_ids_1 = [dept[0] for dept in departments_1] + dept_names_1 = [dept[1] for dept in departments_1] + self.assertEqual(dept_ids_1, [100, 200, 300]) + self.assertEqual(dept_names_1, ['Engineering', 'Marketing', 'Finance']) + + transactions_1 = ret.iloc[0]["transactions"] + # Extract date and amount by position + dates_1 = [trans[0] for trans in transactions_1] + amounts_1 = [trans[1] for trans in transactions_1] + expected_dates_1 = [datetime.date(2023, 1, 1), datetime.date(2023, 2, 1)] + self.assertEqual(dates_1, expected_dates_1) + self.assertEqual(amounts_1, [1000.0, 1500.0]) + + # Test second row - different data + employees_2 = ret.iloc[1]["employees"] + ids_2 = [emp[0] for emp in employees_2] + names_2 = [emp[1] for emp in employees_2] + salaries_2 = [emp[2] for emp in employees_2] + self.assertEqual(ids_2, [4, 5]) + self.assertEqual(names_2, ['Diana', 'Eve']) + self.assertEqual(salaries_2, [45.0, 28.5]) + + departments_2 = ret.iloc[1]["departments"] + dept_ids_2 = [dept[0] for dept in departments_2] + dept_names_2 = [dept[1] for dept in departments_2] + self.assertEqual(dept_ids_2, [400, 500]) + self.assertEqual(dept_names_2, ['Sales', 'HR']) + + transactions_2 = ret.iloc[1]["transactions"] + dates_2 = [trans[0] for trans in transactions_2] + amounts_2 = [trans[1] for trans in transactions_2] + expected_dates_2 = [datetime.date(2023, 3, 1), datetime.date(2023, 4, 1), datetime.date(2023, 5, 1)] + self.assertEqual(dates_2, expected_dates_2) + self.assertEqual(amounts_2, [2000.0, 2500.0, 1800.0]) + + # Test third row - single employee + employees_3 = ret.iloc[2]["employees"] + ids_3 = [emp[0] for emp in employees_3] + names_3 = [emp[1] for emp in employees_3] + salaries_3 = [emp[2] for emp in employees_3] + self.assertEqual(ids_3, [6]) + self.assertEqual(names_3, ['Frank']) + self.assertEqual(salaries_3, [55.0]) + + departments_3 = ret.iloc[2]["departments"] + dept_ids_3 = [dept[0] for dept in departments_3] + dept_names_3 = [dept[1] for dept in departments_3] + self.assertEqual(dept_ids_3, [600]) + self.assertEqual(dept_names_3, ['Operations']) + + 
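Because each Nested cell arrives as an array of per-row tuples (the shape the extraction-by-position above relies on), it can also be flattened into its own DataFrame when that is more convenient; a small usage sketch with sample data shaped like the employees cells (column names are illustrative):

    # Flatten one Nested cell, e.g. [(id, name, salary), ...], into a DataFrame.
    import pandas as pd

    employees_cell = [(1, "Alice", 25.5), (2, "Bob", 30.0), (3, "Charlie", 35.5)]
    emp_df = pd.DataFrame(employees_cell, columns=["id", "name", "salary"])
    print(emp_df.to_dict("records"))
    # [{'id': 1, 'name': 'Alice', 'salary': 25.5},
    #  {'id': 2, 'name': 'Bob', 'salary': 30.0},
    #  {'id': 3, 'name': 'Charlie', 'salary': 35.5}]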
transactions_3 = ret.iloc[2]["transactions"] + dates_3 = [trans[0] for trans in transactions_3] + amounts_3 = [trans[1] for trans in transactions_3] + expected_dates_3 = [datetime.date(2023, 6, 1)] + self.assertEqual(dates_3, expected_dates_3) + self.assertEqual(amounts_3, [3000.0]) + + # Data type validation - Nested types should be mapped to object in pandas + expected_types = { + "row_id": "uint8", + "employees": "object", + "departments": "object", + "transactions": "object" + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type) + + def test_interval_types(self): + """Test various Interval types - time intervals in ClickHouse""" + + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + INTERVAL 30 SECOND as interval_seconds, + INTERVAL 15 MINUTE as interval_minutes, + INTERVAL 3 HOUR as interval_hours, + INTERVAL 7 DAY as interval_days, + INTERVAL 2 WEEK as interval_weeks, + INTERVAL 6 MONTH as interval_months, + INTERVAL 1 QUARTER as interval_quarters, + INTERVAL 2 YEAR as interval_years + UNION ALL + SELECT + 2 as row_id, + INTERVAL 90 SECOND as interval_seconds, + INTERVAL 45 MINUTE as interval_minutes, + INTERVAL 12 HOUR as interval_hours, + INTERVAL 14 DAY as interval_days, + INTERVAL 4 WEEK as interval_weeks, + INTERVAL 18 MONTH as interval_months, + INTERVAL 2 QUARTER as interval_quarters, + INTERVAL 5 YEAR as interval_years + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - basic intervals + self.assertEqual(ret.iloc[0]["row_id"], 1) + + # Interval values are typically returned as timedelta objects or integers representing units + # Let's test the actual values + intervals_1 = { + "interval_seconds": ret.iloc[0]["interval_seconds"], + "interval_minutes": ret.iloc[0]["interval_minutes"], + "interval_hours": ret.iloc[0]["interval_hours"], + "interval_days": ret.iloc[0]["interval_days"], + "interval_weeks": ret.iloc[0]["interval_weeks"], + "interval_months": ret.iloc[0]["interval_months"], + "interval_quarters": ret.iloc[0]["interval_quarters"], + "interval_years": ret.iloc[0]["interval_years"] + } + + # Check interval values are not None and have expected types + # Basic intervals should return timedelta64 type + for interval_name, interval_value in intervals_1.items(): + self.assertIsNotNone(interval_value, f"{interval_name} should not be None") + self.assertEqual(type(interval_value).__name__, 'Timedelta', + f"{interval_name} should be timedelta64, got {type(interval_value).__name__}") + + # Test second row - different interval values + self.assertEqual(ret.iloc[1]["row_id"], 2) + + intervals_2 = { + "interval_seconds": ret.iloc[1]["interval_seconds"], + "interval_minutes": ret.iloc[1]["interval_minutes"], + "interval_hours": ret.iloc[1]["interval_hours"], + "interval_days": ret.iloc[1]["interval_days"], + "interval_weeks": ret.iloc[1]["interval_weeks"], + "interval_months": ret.iloc[1]["interval_months"], + "interval_quarters": ret.iloc[1]["interval_quarters"], + "interval_years": ret.iloc[1]["interval_years"] + } + + # Check second row interval values - should also be timedelta64 + for interval_name, interval_value in intervals_2.items(): + self.assertIsNotNone(interval_value, f"{interval_name} should not be None") + self.assertEqual(type(interval_value).__name__, 'Timedelta', + f"{interval_name} should be timedelta64, 
got {type(interval_value).__name__}") + + # Data type validation - Intervals should be mapped according to C++ NumpyType.cpp + expected_interval_types = { + "row_id": "uint8", + "interval_seconds": "timedelta64[s]", # Second -> timedelta64[s] + "interval_minutes": "timedelta64[s]", # Minute -> timedelta64[m] + "interval_hours": "timedelta64[s]", # Hour -> timedelta64[h] + "interval_days": "timedelta64[s]", # Day -> timedelta64[D] + "interval_weeks": "timedelta64[s]", # Week -> timedelta64[W] + "interval_months": "timedelta64[s]", # Month -> timedelta64[M] + "interval_quarters": "timedelta64[s]", # Quarter -> timedelta64[M] (numpy doesn't have quarter) + "interval_years": "timedelta64[s]" # Year -> timedelta64[Y] + } + + for col, expected_type in expected_interval_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type, f"{col} dtype should be {expected_type}, got {actual_type}") + + # Value assertions for first row + self.assertEqual(intervals_1["interval_seconds"], pd.Timedelta(seconds=30)) + self.assertEqual(intervals_1["interval_minutes"], pd.Timedelta(minutes=15)) + self.assertEqual(intervals_1["interval_hours"], pd.Timedelta(hours=3)) + self.assertEqual(intervals_1["interval_days"], pd.Timedelta(days=7)) + self.assertEqual(intervals_1["interval_weeks"], pd.Timedelta(weeks=2)) + self.assertEqual(intervals_1["interval_months"], pd.Timedelta(days=180)) # Approximate months as days + self.assertEqual(intervals_1["interval_quarters"], pd.Timedelta(days=1*90)) # Approximate quarters as days + self.assertEqual(intervals_1["interval_years"], pd.Timedelta(days=2*365)) # Approximate years as days + + # Value assertions for second row + self.assertEqual(intervals_2["interval_seconds"], pd.Timedelta(seconds=90)) + self.assertEqual(intervals_2["interval_minutes"], pd.Timedelta(minutes=45)) + self.assertEqual(intervals_2["interval_hours"], pd.Timedelta(hours=12)) + self.assertEqual(intervals_2["interval_days"], pd.Timedelta(days=14)) + self.assertEqual(intervals_2["interval_weeks"], pd.Timedelta(weeks=4)) + self.assertEqual(intervals_2["interval_months"], pd.Timedelta(days=540)) # Approximate months as days + self.assertEqual(intervals_2["interval_quarters"], pd.Timedelta(days=180)) # Approximate quarters as days + self.assertEqual(intervals_2["interval_years"], pd.Timedelta(days=365 * 5)) # Approximate years as days + + def test_nested_interval_types(self): + """Test Interval types in nested structures like tuples - should return timedelta""" + + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + (INTERVAL 1000 NANOSECOND, INTERVAL 5000 NANOSECOND) as tuple_intervals_nanoseconds, + (INTERVAL 500 MICROSECOND, INTERVAL 1500 MICROSECOND) as tuple_intervals_microseconds, + (INTERVAL 100 MILLISECOND, INTERVAL 500 MILLISECOND) as tuple_intervals_milliseconds, + (INTERVAL 30 SECOND, INTERVAL 90 SECOND) as tuple_intervals_seconds, + (INTERVAL 15 MINUTE, INTERVAL 45 MINUTE) as tuple_intervals_minutes, + (INTERVAL 2 HOUR, INTERVAL 6 HOUR) as tuple_intervals_hours, + (INTERVAL 1 DAY, INTERVAL 3 DAY) as tuple_intervals_days, + (INTERVAL 1 WEEK, INTERVAL 2 WEEK) as tuple_intervals_weeks, + (INTERVAL 1 MONTH, INTERVAL 3 MONTH) as tuple_intervals_months, + (INTERVAL 1 QUARTER, INTERVAL 2 QUARTER) as tuple_intervals_quarters, + (INTERVAL 1 YEAR, INTERVAL 2 YEAR) as tuple_intervals_years + UNION ALL + SELECT + 2 as row_id, + (INTERVAL 2000 NANOSECOND, INTERVAL 8000 NANOSECOND) as tuple_intervals_nanoseconds, + (INTERVAL 800 MICROSECOND, INTERVAL 2000 
MICROSECOND) as tuple_intervals_microseconds, + (INTERVAL 200 MILLISECOND, INTERVAL 800 MILLISECOND) as tuple_intervals_milliseconds, + (INTERVAL 60 SECOND, INTERVAL 120 SECOND) as tuple_intervals_seconds, + (INTERVAL 30 MINUTE, INTERVAL 60 MINUTE) as tuple_intervals_minutes, + (INTERVAL 4 HOUR, INTERVAL 8 HOUR) as tuple_intervals_hours, + (INTERVAL 2 DAY, INTERVAL 5 DAY) as tuple_intervals_days, + (INTERVAL 3 WEEK, INTERVAL 4 WEEK) as tuple_intervals_weeks, + (INTERVAL 6 MONTH, INTERVAL 12 MONTH) as tuple_intervals_months, + (INTERVAL 3 QUARTER, INTERVAL 4 QUARTER) as tuple_intervals_quarters, + (INTERVAL 3 YEAR, INTERVAL 5 YEAR) as tuple_intervals_years + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - nested intervals in tuples + self.assertEqual(ret.iloc[0]["row_id"], 1) + + # Nested intervals should return timedelta objects when in tuples + tuple_intervals_1 = { + "tuple_intervals_nanoseconds": ret.iloc[0]["tuple_intervals_nanoseconds"], + "tuple_intervals_microseconds": ret.iloc[0]["tuple_intervals_microseconds"], + "tuple_intervals_milliseconds": ret.iloc[0]["tuple_intervals_milliseconds"], + "tuple_intervals_seconds": ret.iloc[0]["tuple_intervals_seconds"], + "tuple_intervals_minutes": ret.iloc[0]["tuple_intervals_minutes"], + "tuple_intervals_hours": ret.iloc[0]["tuple_intervals_hours"], + "tuple_intervals_days": ret.iloc[0]["tuple_intervals_days"], + "tuple_intervals_weeks": ret.iloc[0]["tuple_intervals_weeks"], + "tuple_intervals_months": ret.iloc[0]["tuple_intervals_months"], + "tuple_intervals_quarters": ret.iloc[0]["tuple_intervals_quarters"], + "tuple_intervals_years": ret.iloc[0]["tuple_intervals_years"] + } + + # Check nested interval tuples - elements should be timedelta + for tuple_name, tuple_value in tuple_intervals_1.items(): + self.assertIsNotNone(tuple_value, f"{tuple_name} should not be None") + # Should be a tuple containing interval values + self.assertTrue(hasattr(tuple_value, '__iter__'), f"{tuple_name} should be iterable") + + # Check individual elements in the tuple - should be timedelta + for i, interval_elem in enumerate(tuple_value): + self.assertIsNotNone(interval_elem, f"{tuple_name}[{i}] should not be None") + self.assertEqual(type(interval_elem).__name__, 'timedelta', + f"{tuple_name}[{i}] should be timedelta, got {type(interval_elem).__name__}") + + # Test second row + self.assertEqual(ret.iloc[1]["row_id"], 2) + + tuple_intervals_2 = { + "tuple_intervals_nanoseconds": ret.iloc[1]["tuple_intervals_nanoseconds"], + "tuple_intervals_microseconds": ret.iloc[1]["tuple_intervals_microseconds"], + "tuple_intervals_milliseconds": ret.iloc[1]["tuple_intervals_milliseconds"], + "tuple_intervals_seconds": ret.iloc[1]["tuple_intervals_seconds"], + "tuple_intervals_minutes": ret.iloc[1]["tuple_intervals_minutes"], + "tuple_intervals_hours": ret.iloc[1]["tuple_intervals_hours"], + "tuple_intervals_days": ret.iloc[1]["tuple_intervals_days"], + "tuple_intervals_weeks": ret.iloc[1]["tuple_intervals_weeks"], + "tuple_intervals_months": ret.iloc[1]["tuple_intervals_months"], + "tuple_intervals_quarters": ret.iloc[1]["tuple_intervals_quarters"], + "tuple_intervals_years": ret.iloc[1]["tuple_intervals_years"] + } + + # Check second row nested interval tuples + for tuple_name, tuple_value in tuple_intervals_2.items(): + self.assertIsNotNone(tuple_value, f"{tuple_name} should not be None") + 
self.assertTrue(hasattr(tuple_value, '__iter__'), f"{tuple_name} should be iterable") + + for i, interval_elem in enumerate(tuple_value): + self.assertIsNotNone(interval_elem, f"{tuple_name}[{i}] should not be None") + self.assertEqual(type(interval_elem).__name__, 'timedelta', + f"{tuple_name}[{i}] should be timedelta, got {type(interval_elem).__name__}") + + # Data type validation - Tuple intervals should be object type containing tuples + expected_nested_interval_types = { + "row_id": "uint8", + "tuple_intervals_nanoseconds": "object", + "tuple_intervals_microseconds": "object", + "tuple_intervals_milliseconds": "object", + "tuple_intervals_seconds": "object", + "tuple_intervals_minutes": "object", + "tuple_intervals_hours": "object", + "tuple_intervals_days": "object", + "tuple_intervals_weeks": "object", + "tuple_intervals_months": "object", + "tuple_intervals_quarters": "object", + "tuple_intervals_years": "object" + } + + for col, expected_type in expected_nested_interval_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type, f"{col} dtype should be {expected_type}, got {actual_type}") + + + # Nanoseconds: (1000ns, 5000ns) -> microseconds = value / 1000 + self.assertEqual(tuple_intervals_1["tuple_intervals_nanoseconds"][0], timedelta(microseconds=1000/1000)) + self.assertEqual(tuple_intervals_1["tuple_intervals_nanoseconds"][1], timedelta(microseconds=5000/1000)) + + # Microseconds: (500us, 1500us) + self.assertEqual(tuple_intervals_1["tuple_intervals_microseconds"][0], timedelta(microseconds=500)) + self.assertEqual(tuple_intervals_1["tuple_intervals_microseconds"][1], timedelta(microseconds=1500)) + + # Milliseconds: (100ms, 500ms) + self.assertEqual(tuple_intervals_1["tuple_intervals_milliseconds"][0], timedelta(milliseconds=100)) + self.assertEqual(tuple_intervals_1["tuple_intervals_milliseconds"][1], timedelta(milliseconds=500)) + + # Seconds: (30s, 90s) + self.assertEqual(tuple_intervals_1["tuple_intervals_seconds"][0], timedelta(seconds=30)) + self.assertEqual(tuple_intervals_1["tuple_intervals_seconds"][1], timedelta(seconds=90)) + + # Minutes: (15m, 45m) + self.assertEqual(tuple_intervals_1["tuple_intervals_minutes"][0], timedelta(minutes=15)) + self.assertEqual(tuple_intervals_1["tuple_intervals_minutes"][1], timedelta(minutes=45)) + + # Hours: (2h, 6h) + self.assertEqual(tuple_intervals_1["tuple_intervals_hours"][0], timedelta(hours=2)) + self.assertEqual(tuple_intervals_1["tuple_intervals_hours"][1], timedelta(hours=6)) + + # Days: (1d, 3d) + self.assertEqual(tuple_intervals_1["tuple_intervals_days"][0], timedelta(days=1)) + self.assertEqual(tuple_intervals_1["tuple_intervals_days"][1], timedelta(days=3)) + + # Weeks: (1w, 2w) + self.assertEqual(tuple_intervals_1["tuple_intervals_weeks"][0], timedelta(weeks=1)) + self.assertEqual(tuple_intervals_1["tuple_intervals_weeks"][1], timedelta(weeks=2)) + + # Months: (1 month, 3 months) -> days = value * 30 + self.assertEqual(tuple_intervals_1["tuple_intervals_months"][0], timedelta(days=1*30)) + self.assertEqual(tuple_intervals_1["tuple_intervals_months"][1], timedelta(days=3*30)) + + # Quarters: (1 quarter, 2 quarters) -> days = value * 90 + self.assertEqual(tuple_intervals_1["tuple_intervals_quarters"][0], timedelta(days=1*90)) + self.assertEqual(tuple_intervals_1["tuple_intervals_quarters"][1], timedelta(days=2*90)) + + # Years: (1 year, 2 years) -> days = value * 365 + self.assertEqual(tuple_intervals_1["tuple_intervals_years"][0], timedelta(days=1*365)) + 
self.assertEqual(tuple_intervals_1["tuple_intervals_years"][1], timedelta(days=2*365)) + + # Value assertions for second row tuples + # Nanoseconds: (2000ns, 8000ns) -> microseconds = value / 1000 + self.assertEqual(tuple_intervals_2["tuple_intervals_nanoseconds"][0], timedelta(microseconds=2000/1000)) + self.assertEqual(tuple_intervals_2["tuple_intervals_nanoseconds"][1], timedelta(microseconds=8000/1000)) + + # Microseconds: (800us, 2000us) + self.assertEqual(tuple_intervals_2["tuple_intervals_microseconds"][0], timedelta(microseconds=800)) + self.assertEqual(tuple_intervals_2["tuple_intervals_microseconds"][1], timedelta(microseconds=2000)) + + # Milliseconds: (200ms, 800ms) + self.assertEqual(tuple_intervals_2["tuple_intervals_milliseconds"][0], timedelta(milliseconds=200)) + self.assertEqual(tuple_intervals_2["tuple_intervals_milliseconds"][1], timedelta(milliseconds=800)) + + # Seconds: (60s, 120s) + self.assertEqual(tuple_intervals_2["tuple_intervals_seconds"][0], timedelta(seconds=60)) + self.assertEqual(tuple_intervals_2["tuple_intervals_seconds"][1], timedelta(seconds=120)) + + # Minutes: (30m, 60m) + self.assertEqual(tuple_intervals_2["tuple_intervals_minutes"][0], timedelta(minutes=30)) + self.assertEqual(tuple_intervals_2["tuple_intervals_minutes"][1], timedelta(minutes=60)) + + # Hours: (4h, 8h) + self.assertEqual(tuple_intervals_2["tuple_intervals_hours"][0], timedelta(hours=4)) + self.assertEqual(tuple_intervals_2["tuple_intervals_hours"][1], timedelta(hours=8)) + + # Days: (2d, 5d) + self.assertEqual(tuple_intervals_2["tuple_intervals_days"][0], timedelta(days=2)) + self.assertEqual(tuple_intervals_2["tuple_intervals_days"][1], timedelta(days=5)) + + # Weeks: (3w, 4w) + self.assertEqual(tuple_intervals_2["tuple_intervals_weeks"][0], timedelta(weeks=3)) + self.assertEqual(tuple_intervals_2["tuple_intervals_weeks"][1], timedelta(weeks=4)) + + # Months: (6 months, 12 months) -> days = value * 30 + self.assertEqual(tuple_intervals_2["tuple_intervals_months"][0], timedelta(days=6*30)) + self.assertEqual(tuple_intervals_2["tuple_intervals_months"][1], timedelta(days=12*30)) + + # Quarters: (3 quarters, 4 quarters) -> days = value * 90 + self.assertEqual(tuple_intervals_2["tuple_intervals_quarters"][0], timedelta(days=3*90)) + self.assertEqual(tuple_intervals_2["tuple_intervals_quarters"][1], timedelta(days=4*90)) + + # Years: (3 years, 5 years) -> days = value * 365 + self.assertEqual(tuple_intervals_2["tuple_intervals_years"][0], timedelta(days=3*365)) + self.assertEqual(tuple_intervals_2["tuple_intervals_years"][1], timedelta(days=5*365)) + + def test_nothing_types(self): + """Test Nothing type - represents absence of value""" + ret = self.session.query(""" + SELECT + 1 as row_id, + array() as nothing_val + FROM numbers(1) + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + nothing_val = ret.iloc[0]["nothing_val"] + + # Check if it's an empty array + self.assertEqual(len(nothing_val), 0, "Should be empty array") + + # Data type validation + expected_types = { + "row_id": "uint8", + "nothing_val": "object", + } + + for col, expected_type in expected_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type, f"{col} dtype should be {expected_type}, got {actual_type}") + + def test_geo_types(self): + """Test native Point and Ring geo types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + (0.0, 
0.0)::Point as point_origin, + (37.7749, -122.4194)::Point as point_sf, + [(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0), (0.0, 0.0)]::Ring as ring_square + UNION ALL + SELECT + 2 as row_id, + (-74.006, 40.7128)::Point as point_origin, + (51.5074, -0.1278)::Point as point_sf, + [(-1.0, -1.0), (0.0, -1.0), (0.0, 0.0), (-1.0, 0.0), (-1.0, -1.0)]::Ring as ring_square + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row geo values + self.assertEqual(ret.iloc[0]["row_id"], 1) + + # Point is Tuple(Float64, Float64) - should be tuples with 2 float coordinates + point_origin = ret.iloc[0]["point_origin"] + self.assertIsInstance(point_origin, np.ndarray, "Point should be tuple") + self.assertEqual(len(point_origin), 2, "Point should have 2 coordinates") + self.assertEqual(point_origin[0], 0.0) + self.assertEqual(point_origin[1], 0.0) + + point_sf = ret.iloc[0]["point_sf"] + self.assertAlmostEqual(point_sf[0], 37.7749, places=4) + self.assertAlmostEqual(point_sf[1], -122.4194, places=4) + + # Ring is Array(Point) - should be array of points + ring_square = ret.iloc[0]["ring_square"] + self.assertTrue(hasattr(ring_square, '__iter__'), "Ring should be iterable") + self.assertEqual(len(ring_square), 5, "Square ring should have 5 points (closed)") + + # Each point in ring should be a tuple + for point in ring_square: + self.assertIsInstance(point, np.ndarray, "Each point in ring should be tuple") + self.assertEqual(len(point), 2, "Each point should have 2 coordinates") + + # Test second row + self.assertEqual(ret.iloc[1]["row_id"], 2) + point_nyc = ret.iloc[1]["point_origin"] + self.assertAlmostEqual(point_nyc[0], -74.006, places=3) + self.assertAlmostEqual(point_nyc[1], 40.7128, places=4) + + # Data type validation - Geo types should be object + expected_geo_types = { + "row_id": "uint8", + "point_origin": "object", + "point_sf": "object", + "ring_square": "object" + } + + for col, expected_type in expected_geo_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type, f"{col} dtype should be {expected_type}, got {actual_type}") + + def test_nested_geo_types(self): + """Test Geo types nested in tuples""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + ((0.0, 0.0)::Point, (1.0, 1.0)::Point) as tuple_two_points, + ((37.7749, -122.4194)::Point, 'San Francisco') as tuple_point_with_name, + ([(0.0, 0.0), (1.0, 0.0), (1.0, 1.0), (0.0, 1.0), (0.0, 0.0)]::Ring, 'square') as tuple_ring_with_name + UNION ALL + SELECT + 2 as row_id, + ((-74.006, 40.7128)::Point, (51.5074, -0.1278)::Point) as tuple_two_points, + ((40.7589, -73.9851)::Point, 'Times Square') as tuple_point_with_name, + ([(-1.0, -1.0), (0.0, -1.0), (0.0, 0.0), (-1.0, 0.0), (-1.0, -1.0)]::Ring, 'negative_square') as tuple_ring_with_name + ) + ORDER BY row_id + """, "DataFrame") + + # Test nested geo in tuples + self.assertEqual(ret.iloc[0]["row_id"], 1) + + tuple_two_points = ret.iloc[0]["tuple_two_points"] + self.assertIsInstance(tuple_two_points, np.ndarray, "Should be tuple") + self.assertEqual(len(tuple_two_points), 2, "Should have 2 points") + self.assertEqual(tuple_two_points[0], (0.0, 0.0)) + self.assertEqual(tuple_two_points[1], (1.0, 1.0)) + + tuple_point_with_name = ret.iloc[0]["tuple_point_with_name"] + self.assertEqual(len(tuple_point_with_name), 2) + self.assertAlmostEqual(tuple_point_with_name[0][0], 37.7749, places=4) + 
self.assertEqual(tuple_point_with_name[1], 'San Francisco') + + tuple_ring_with_name = ret.iloc[0]["tuple_ring_with_name"] + self.assertEqual(len(tuple_ring_with_name), 2) + self.assertEqual(len(tuple_ring_with_name[0]), 5) # Ring with 5 points + self.assertEqual(tuple_ring_with_name[1], 'square') + + def test_simple_aggregate_function_types(self): + """Test SimpleAggregateFunction types with sum and max functions""" + # Create a table using SimpleAggregateFunction + self.session.query("DROP TABLE IF EXISTS test_simple_agg") + self.session.query(""" + CREATE TABLE IF NOT EXISTS test_simple_agg ( + id UInt32, + sum_val SimpleAggregateFunction(sum, UInt64), + max_val SimpleAggregateFunction(max, Float64) + ) ENGINE = AggregatingMergeTree() ORDER BY id + """) + + # Insert test data + self.session.query(""" + INSERT INTO test_simple_agg VALUES + (1, 100, 10.5), + (1, 200, 20.3), + (2, 50, 5.7), + (2, 150, 15.2) + """) + + # Query the data + ret = self.session.query(""" + SELECT + id, + sum(sum_val) as total_sum, + max(max_val) as total_max + FROM test_simple_agg + GROUP BY id + ORDER BY id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Verify aggregated results + self.assertEqual(ret.iloc[0]["id"], 1) + self.assertEqual(ret.iloc[0]["total_sum"], 300) # 100 + 200 + self.assertAlmostEqual(ret.iloc[0]["total_max"], 20.3, places=1) + + self.assertEqual(ret.iloc[1]["id"], 2) + self.assertEqual(ret.iloc[1]["total_sum"], 200) # 50 + 150 + self.assertAlmostEqual(ret.iloc[1]["total_max"], 15.2, places=1) + + self.session.query("DROP TABLE IF EXISTS test_simple_agg") + + def test_aggregate_function_types(self): + """Test AggregateFunction types with uniq and avgState functions""" + # Create a table using AggregateFunction + self.session.query("DROP TABLE IF EXISTS test_agg_func") + self.session.query(""" + CREATE TABLE IF NOT EXISTS test_agg_func ( + id UInt32, + uniq_state AggregateFunction(uniq, String), + avg_state AggregateFunction(avgState, Float64) + ) ENGINE = AggregatingMergeTree() ORDER BY id + """) + + # Insert aggregate states + self.session.query(""" + INSERT INTO test_agg_func + SELECT + 1 as id, + uniqState('a') as uniq_state, + avgState(10.5) as avg_state + UNION ALL + SELECT + 1 as id, + uniqState('b') as uniq_state, + avgState(20.3) as avg_state + UNION ALL + SELECT + 2 as id, + uniqState('c') as uniq_state, + avgState(5.7) as avg_state + """) + + # Query finalized results + ret = self.session.query(""" + SELECT + id, + uniqMerge(uniq_state) as unique_count, + avgMerge(avg_state) as average_value + FROM test_agg_func + GROUP BY id + ORDER BY id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Verify aggregated results + self.assertEqual(ret.iloc[0]["id"], 1) + self.assertEqual(ret.iloc[0]["unique_count"], 2) # 'a' and 'b' + self.assertAlmostEqual(ret.iloc[0]["average_value"], 15.4, places=1) # (10.5 + 20.3) / 2 + + self.assertEqual(ret.iloc[1]["id"], 2) + self.assertEqual(ret.iloc[1]["unique_count"], 1) # 'c' + self.assertAlmostEqual(ret.iloc[1]["average_value"], 5.7, places=1) + + self.session.query("DROP TABLE IF EXISTS test_agg_func") + + def test_low_cardinality_types(self): + """Test LowCardinality types with String and various integer types""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + toLowCardinality('red') as 
lc_string, + toLowCardinality(toInt8(1)) as lc_int8, + toLowCardinality(toInt32(100)) as lc_int32, + toLowCardinality(toUInt16(65535)) as lc_uint16, + toLowCardinality(toFloat32(3.14)) as lc_float32 + UNION ALL + SELECT + 2 as row_id, + toLowCardinality('blue') as lc_string, + toLowCardinality(toInt8(2)) as lc_int8, + toLowCardinality(toInt32(200)) as lc_int32, + toLowCardinality(toUInt16(32768)) as lc_uint16, + toLowCardinality(toFloat32(2.71)) as lc_float32 + UNION ALL + SELECT + 3 as row_id, + toLowCardinality('green') as lc_string, + toLowCardinality(toInt8(1)) as lc_int8, -- Repeat value to show low cardinality + toLowCardinality(toInt32(100)) as lc_int32, -- Repeat value + toLowCardinality(toUInt16(65535)) as lc_uint16, -- Repeat value + toLowCardinality(toFloat32(3.14)) as lc_float32 -- Repeat value + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test values + self.assertEqual(ret.iloc[0]["lc_string"], 'red') + self.assertEqual(ret.iloc[0]["lc_int8"], 1) + self.assertEqual(ret.iloc[0]["lc_int32"], 100) + self.assertEqual(ret.iloc[0]["lc_uint16"], 65535) + self.assertAlmostEqual(ret.iloc[0]["lc_float32"], 3.14, places=2) + + self.assertEqual(ret.iloc[1]["lc_string"], 'blue') + self.assertEqual(ret.iloc[1]["lc_int8"], 2) + self.assertEqual(ret.iloc[1]["lc_int32"], 200) + self.assertEqual(ret.iloc[1]["lc_uint16"], 32768) + self.assertAlmostEqual(ret.iloc[1]["lc_float32"], 2.71, places=2) + + # Test repeated values (showing low cardinality) + self.assertEqual(ret.iloc[2]["lc_string"], 'green') + self.assertEqual(ret.iloc[2]["lc_int8"], 1) # Same as row 0 + self.assertEqual(ret.iloc[2]["lc_int32"], 100) # Same as row 0 + + # Data type validation - LowCardinality should typically be object for strings, specific types for numbers + expected_lc_types = { + "row_id": "uint8", + "lc_string": "object", + "lc_int8": "int8", + "lc_int32": "int32", + "lc_uint16": "uint16", + "lc_float32": "float32" + } + + for col, expected_type in expected_lc_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type, f"{col} dtype should be {expected_type}, got {actual_type}") + + def test_nullable_types(self): + """Test Nullable(T) with comprehensive type coverage from both test files""" + ret = self.session.query(""" + SELECT * FROM ( + SELECT + 1 as row_id, + -- Integer types + toNullable(toInt8(127)) as nullable_int8, + toNullable(toInt32(-2147483648)) as nullable_int32, + toNullable(toInt64(9223372036854775807)) as nullable_int64, + toNullable(toUInt16(65535)) as nullable_uint16, + toNullable(toUInt64(18446744073709551615)) as nullable_uint64, + -- Float types + toNullable(toFloat32(3.14159)) as nullable_float32, + toNullable(toFloat64(2.718281828)) as nullable_float64, + -- Decimal types + toNullable(toDecimal32(123.45, 2)) as nullable_decimal32, + toNullable(toDecimal64(987654.321, 3)) as nullable_decimal64, + -- String types + toNullable('Hello World') as nullable_string, + toNullable(toFixedString('Fixed', 5)) as nullable_fixed_string, + -- Date/Time types + toNullable(toDate('2023-12-25')) as nullable_date, + toNullable(toDateTime('2023-12-25 18:30:45', 'Asia/Shanghai')) as nullable_datetime, + toNullable(toDateTime64('2023-12-25 18:30:45.123', 3, 'Asia/Shanghai')) as nullable_datetime64, + -- Enum types + toNullable(CAST('red', 'Enum8(''red''=1, ''green''=2, ''blue''=3)')) as nullable_enum8, + -- UUID type + 
toNullable(toUUID('550e8400-e29b-41d4-a716-446655440000')) as nullable_uuid, + -- IPv4/IPv6 types + toNullable(toIPv4('192.168.1.1')) as nullable_ipv4, + toNullable(toIPv6('2001:0db8:85a3:0000:0000:8a2e:0370:7334')) as nullable_ipv6, + -- Bool type + toNullable(true) as nullable_bool, + -- JSON type + toNullable(CAST('{"name": "Alice", "age": 30, "active": true}', 'JSON')) as nullable_json, + -- Interval types + toNullable(INTERVAL 3 YEAR) as nullable_interval_year, + toNullable(INTERVAL 6 MONTH) as nullable_interval_month, + toNullable(INTERVAL 15 DAY) as nullable_interval_day, + toNullable(INTERVAL 2 HOUR) as nullable_interval_hour + UNION ALL + SELECT + 2 as row_id, + -- Mix of NULL and non-NULL values + NULL as nullable_int8, + toNullable(toInt32(2147483647)) as nullable_int32, + NULL as nullable_int64, + toNullable(toUInt16(32768)) as nullable_uint16, + NULL as nullable_uint64, + toNullable(toFloat32(-3.14159)) as nullable_float32, + NULL as nullable_float64, + toNullable(toDecimal32(-456.78, 2)) as nullable_decimal32, + NULL as nullable_decimal64, + NULL as nullable_string, + toNullable(toFixedString('NULL ', 5)) as nullable_fixed_string, + NULL as nullable_date, + toNullable(toDateTime('2024-01-01 00:00:00', 'Asia/Shanghai')) as nullable_datetime, + NULL as nullable_datetime64, + toNullable(CAST('blue', 'Enum8(''red''=1, ''green''=2, ''blue''=3)')) as nullable_enum8, + NULL as nullable_uuid, + toNullable(toIPv4('10.0.0.1')) as nullable_ipv4, + NULL as nullable_ipv6, + toNullable(false) as nullable_bool, + toNullable(CAST('{"name": "Bob", "age": 25, "active": false}', 'JSON')) as nullable_json, + -- Interval types + toNullable(INTERVAL 1 YEAR) as nullable_interval_year, + NULL as nullable_interval_month, + toNullable(INTERVAL 7 DAY) as nullable_interval_day, + NULL as nullable_interval_hour + UNION ALL + SELECT + 3 as row_id, + -- All NULL values to test NULL handling + NULL as nullable_int8, + NULL as nullable_int32, + NULL as nullable_int64, + NULL as nullable_uint16, + NULL as nullable_uint64, + NULL as nullable_float32, + NULL as nullable_float64, + NULL as nullable_decimal32, + NULL as nullable_decimal64, + NULL as nullable_string, + NULL as nullable_fixed_string, + NULL as nullable_date, + NULL as nullable_datetime, + NULL as nullable_datetime64, + NULL as nullable_enum8, + NULL as nullable_uuid, + NULL as nullable_ipv4, + NULL as nullable_ipv6, + NULL as nullable_bool, + NULL as nullable_json, + -- Interval types + NULL as nullable_interval_year, + NULL as nullable_interval_month, + NULL as nullable_interval_day, + NULL as nullable_interval_hour + ) + ORDER BY row_id + """, "DataFrame") + + for col in ret.columns: + print(f"{col}: {ret.dtypes[col]} (actual value: {ret.iloc[0][col]}, Python type: {type(ret.iloc[0][col])})") + + # Test first row - all non-NULL values + self.assertEqual(ret.iloc[0]["row_id"], 1) + + # Integer types + self.assertEqual(ret.iloc[0]["nullable_int8"], 127) + self.assertEqual(ret.iloc[0]["nullable_int32"], -2147483648) + self.assertEqual(ret.iloc[0]["nullable_int64"], 9223372036854775807) + self.assertEqual(ret.iloc[0]["nullable_uint16"], 65535) + self.assertEqual(ret.iloc[0]["nullable_uint64"], 18446744073709551615) + + # Float types + self.assertAlmostEqual(ret.iloc[0]["nullable_float32"], 3.14159, places=5) + self.assertAlmostEqual(ret.iloc[0]["nullable_float64"], 2.718281828, places=9) + + # Decimal types + self.assertAlmostEqual(ret.iloc[0]["nullable_decimal32"], 123.45, places=2) + self.assertAlmostEqual(ret.iloc[0]["nullable_decimal64"], 
987654.321, places=3) + + # String types + self.assertEqual(ret.iloc[0]["nullable_string"], 'Hello World') + self.assertEqual(ret.iloc[0]["nullable_fixed_string"], 'Fixed') + + # Date/Time types + nullable_date = ret.iloc[0]["nullable_date"] + self.assertIsInstance(nullable_date, pd.Timestamp) + self.assertEqual(nullable_date.date(), date(2023, 12, 25)) + + nullable_datetime = ret.iloc[0]["nullable_datetime"] + self.assertIsInstance(nullable_datetime, pd.Timestamp) + + # Check if timezone info is preserved (may be naive depending on implementation) + if nullable_datetime.tz is not None: + self.assertEqual(nullable_datetime, pd.Timestamp('2023-12-25 18:30:45', tz='Asia/Shanghai')) + else: + # If timezone is lost, just check the datetime value without timezone + self.assertEqual(nullable_datetime, pd.Timestamp('2023-12-25 10:30:45')) + + nullable_datetime64 = ret.iloc[0]["nullable_datetime64"] + self.assertIsInstance(nullable_datetime64, pd.Timestamp) + # Check if timezone info is preserved for DateTime64 + if nullable_datetime64.tz is not None: + self.assertEqual(nullable_datetime64, pd.Timestamp('2023-12-25 18:30:45.123', tz='Asia/Shanghai')) + else: + # If timezone is lost, just check the datetime value without timezone + self.assertEqual(nullable_datetime64, pd.Timestamp('2023-12-25 10:30:45.123')) + + # Enum, UUID, IP types + self.assertEqual(ret.iloc[0]["nullable_enum8"], 'red') + self.assertIsInstance(ret.iloc[0]["nullable_uuid"], uuid.UUID) + self.assertEqual(ret.iloc[0]["nullable_uuid"], uuid.UUID('550e8400-e29b-41d4-a716-446655440000')) + + # IP types + self.assertEqual(str(ret.iloc[0]["nullable_ipv4"]), '192.168.1.1') + ipv6_str = str(ret.iloc[0]["nullable_ipv6"]) + self.assertIn('2001', ipv6_str) + + # Bool type + self.assertEqual(ret.iloc[0]["nullable_bool"], True) + + # JSON type + json_val = ret.iloc[0]["nullable_json"] + self.assertIsInstance(json_val, dict) + self.assertEqual(json_val["name"], "Alice") + self.assertEqual(json_val["age"], 30) + self.assertEqual(json_val["active"], True) + + # Interval types + self.assertEqual(ret.iloc[0]["nullable_interval_year"], timedelta(days=3*365)) + self.assertEqual(ret.iloc[0]["nullable_interval_month"], timedelta(days=6*30)) + self.assertEqual(ret.iloc[0]["nullable_interval_day"], timedelta(days=15)) + self.assertEqual(ret.iloc[0]["nullable_interval_hour"], timedelta(hours=2)) + + # Test second row - mix of NULL and non-NULL values + self.assertEqual(ret.iloc[1]["row_id"], 2) + + # Test NULL values + self.assertTrue(pd.isna(ret.iloc[1]["nullable_int8"]), "Should be NULL/NaN") + self.assertTrue(pd.isna(ret.iloc[1]["nullable_int64"]), "Should be NULL/NaN") + self.assertTrue(pd.isna(ret.iloc[1]["nullable_uint64"]), "Should be NULL/NaN") + self.assertTrue(pd.isna(ret.iloc[1]["nullable_float64"]), "Should be NULL/NaN") + self.assertTrue(pd.isna(ret.iloc[1]["nullable_string"]), "Should be NULL/NaN") + + # Test non-NULL values in second row + self.assertEqual(ret.iloc[1]["nullable_int32"], 2147483647) + self.assertEqual(ret.iloc[1]["nullable_uint16"], 32768) + self.assertAlmostEqual(ret.iloc[1]["nullable_float32"], -3.14159, places=5) + self.assertEqual(ret.iloc[1]["nullable_fixed_string"], 'NULL ') + self.assertEqual(ret.iloc[1]["nullable_bool"], False) + + # JSON type for second row + json_val_2 = ret.iloc[1]["nullable_json"] + self.assertIsInstance(json_val_2, dict) + self.assertEqual(json_val_2["name"], "Bob") + self.assertEqual(json_val_2["age"], 25) + self.assertEqual(json_val_2["active"], False) + + # Interval types for second row + 
self.assertEqual(ret.iloc[1]["nullable_interval_year"], timedelta(days=1*365)) + self.assertTrue(pd.isna(ret.iloc[1]["nullable_interval_month"]), "Should be NULL/NaN") + self.assertEqual(ret.iloc[1]["nullable_interval_day"], timedelta(days=7)) + self.assertTrue(pd.isna(ret.iloc[1]["nullable_interval_hour"]), "Should be NULL/NaN") + + # Test third row - all NULL values + self.assertEqual(ret.iloc[2]["row_id"], 3) + + # All nullable columns should be NULL in third row + nullable_columns = [col for col in ret.columns if col.startswith('nullable_')] + for col in nullable_columns: + value = ret.iloc[2][col] + self.assertTrue(pd.isna(value) or value is None, f"{col} should be NULL/NaN in row 3") + + # Data type validation - Nullable types should maintain their underlying types or be object + expected_nullable_types = { + "row_id": "uint8", + "nullable_int8": "Int8", + "nullable_int32": "Int32", + "nullable_int64": "Int64", + "nullable_uint16": "UInt16", + "nullable_uint64": "UInt64", + "nullable_float32": "Float32", + "nullable_float64": "Float64", + "nullable_decimal32": "Float64", + "nullable_decimal64": "Float64", + "nullable_string": "object", + "nullable_fixed_string": "object", + "nullable_date": "datetime64[s]", + "nullable_datetime": "datetime64[s]", + "nullable_datetime64": "datetime64[ns]", + "nullable_enum8": "object", + "nullable_uuid": "object", + "nullable_ipv4": "object", + "nullable_ipv6": "object", + "nullable_bool": "boolean", + "nullable_json": "object", + "nullable_interval_year": "timedelta64[s]", + "nullable_interval_month": "timedelta64[s]", + "nullable_interval_day": "timedelta64[s]", + "nullable_interval_hour": "timedelta64[s]", + } + + for col, expected_type in expected_nullable_types.items(): + actual_type = str(ret.dtypes[col]) + self.assertEqual(actual_type, expected_type, f"{col} dtype should be {expected_type}, got {actual_type}") + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_dataframe_large_scale_1.py b/tests/test_dataframe_large_scale_1.py new file mode 100644 index 00000000000..9e9a97416ab --- /dev/null +++ b/tests/test_dataframe_large_scale_1.py @@ -0,0 +1,263 @@ +#!/usr/bin/env python3 + +import os +import unittest +import time +from urllib.request import urlretrieve +import pandas as pd +import chdb +import json +import numpy as np +from datetime import timedelta + + +class TestDataFrameLargeScale(unittest.TestCase): + """Test DataFrame generation with large scale data (1M rows) and diverse data types""" + + def setUp(self): + self.session = chdb.session.Session() + + def tearDown(self): + self.session.close() + + def test_large_scale_dataframe_generation(self): + """Test generating 1M rows DataFrame with diverse data types and validate correctness""" + + print("Starting 1M row DataFrame generation test...") + start_time = time.time() + + ret = self.session.query(""" + SELECT + number as row_id, + + -- Integer types + toInt8(number % 127) as int8_col, + toInt16(number % 32767) as int16_col, + toInt32(number % 2147483647) as int32_col, + toInt64(number) as int64_col, + toUInt8(number % 255) as uint8_col, + toUInt16(number % 65535) as uint16_col, + toUInt32(number % 4294967295) as uint32_col, + toUInt64(number) as uint64_col, + + -- Float types + toFloat32(number * 3.14159 / 1000000) as float32_col, + toFloat64(number * 2.718281828 / 1000000) as float64_col, + + -- Decimal types + toDecimal32(number % 1000000 / 100.0, 2) as decimal32_col, + toDecimal64(number / 1000000.0, 6) as decimal64_col, + + -- String types + concat('row_', 
toString(number)) as string_col, + toFixedString(concat('fix', toString(number % 100)), 10) as fixed_string_col, + + -- Date and DateTime types + toDate('2024-01-01') + (number % 365) as date_col, + toDateTime('2024-01-01 00:00:00', 'Asia/Shanghai') + (number % 86400) as datetime_col, + + -- Boolean type + toBool((number % 2) = 0) as bool_col, + + -- UUID type + generateUUIDv4() as uuid_col, + + -- Nullable types + if(number % 10 = 0, NULL, toInt32(number % 1000)) as nullable_int_col, + if(number % 15 = 0, NULL, toFloat64(number / 1000.0)) as nullable_float_col, + if(number % 20 = 0, NULL, concat('nullable_', toString(number))) as nullable_string_col, + + -- Array types + [toInt32(number % 100), toInt32((number + 1) % 100), toInt32((number + 2) % 100)] as array_int_col, + [toString(number % 10), toString((number + 1) % 10)] as array_string_col, + [toFloat64(number / 1000.0), toFloat64((number + 1) / 1000.0)] as array_float_col, + + -- Tuple types + (toInt32(number % 1000), concat('tuple_', toString(number % 100))) as tuple_int_string_col, + (toFloat64(number / 1000000.0), toDate('2024-01-01') + (number % 30), number % 2 = 0) as tuple_mixed_col, + + -- JSON type (simulate with Map) + map( + 'id', toString(number), + 'name', concat('user_', toString(number % 10000)), + 'score', toString(toFloat64(number % 100) / 10.0), + 'active', toString(number % 3 = 0) + ) as json_col, + + -- Interval types + INTERVAL (number % 3600) SECOND as interval_seconds_col, + INTERVAL (number % 24) HOUR as interval_hours_col, + INTERVAL (number % 30) DAY as interval_days_col, + + -- Enum simulation with LowCardinality + toLowCardinality( + multiIf( + number % 5 = 0, 'Level_A', + number % 5 = 1, 'Level_B', + number % 5 = 2, 'Level_C', + number % 5 = 3, 'Level_D', + 'Level_E' + ) + ) as enum_col + FROM numbers(1000000) + """, "DataFrame") + + query_time = time.time() - start_time + print(f"Query execution time: {query_time:.2f} seconds") + + # Validate DataFrame structure + self.assertEqual(len(ret), 1000000, "Should have exactly 1M rows") + self.assertEqual(len(ret.columns), 32, "Should have exactly 32 columns") + + # Validate data types + expected_types = { + 'row_id': ['uint64'], + 'int8_col': ['int8'], + 'int16_col': ['int16'], + 'int32_col': ['int32'], + 'int64_col': ['int64'], + 'uint8_col': ['uint8'], + 'uint16_col': ['uint16'], + 'uint32_col': ['uint32'], + 'uint64_col': ['uint64'], + 'float32_col': ['float32'], + 'float64_col': ['float64'], + 'string_col': ['object'], + 'bool_col': ['bool'], + 'date_col': ['datetime64[s]'], + 'datetime_col': ['datetime64[s, Asia/Shanghai]'], + 'nullable_int_col': ['Int32'], + 'array_int_col': ['object'], + 'array_string_col': ['object'], + 'tuple_int_string_col': ['object'], + 'json_col': ['object'], + 'enum_col': ['object'], + 'interval_seconds_col': ['timedelta64[s]'], + } + + print("\nData type validation:") + for col, allowed_types in expected_types.items(): + if col in ret.columns: + actual_type = str(ret.dtypes[col]) + self.assertIn(actual_type, allowed_types, + f"Column {col} has unexpected type {actual_type}, expected one of {allowed_types}") + print(f" {col}: {actual_type}") + + # Validate sample data correctness + print("\nData correctness validation:") + + # Check first row (number = 0) + first_row = ret.iloc[0] + self.assertEqual(first_row['row_id'], 0) + self.assertEqual(first_row['int8_col'], 0) # 0 % 127 = 0 + self.assertEqual(first_row['int16_col'], 0) # 0 % 32767 = 0 + self.assertEqual(first_row['uint8_col'], 0) # 0 % 255 = 0 + 
self.assertEqual(first_row['string_col'], 'row_0') + self.assertEqual(first_row['bool_col'], True) # 0 % 2 == 0 + # Check nullable column - might be NaN instead of None + self.assertTrue(pd.isna(first_row['nullable_int_col']), f"nullable_int_col should be NULL/NaN, got {first_row['nullable_int_col']}") # 0 % 10 == 0 -> NULL + self.assertEqual(first_row['float32_col'], 0.0) # 0 * 3.14159 / 1000000 = 0 + print("First row data validation passed") + + # Check middle row (number = 500000) + middle_row = ret.iloc[500000] + self.assertEqual(middle_row['row_id'], 500000) + self.assertEqual(middle_row['int8_col'], 500000 % 127) # 500000 % 127 = 1 + self.assertEqual(middle_row['uint8_col'], 500000 % 255) # 500000 % 255 = 200 + self.assertEqual(middle_row['string_col'], 'row_500000') + self.assertEqual(middle_row['bool_col'], True) # 500000 % 2 == 0 + # 500000 % 10 == 0, so should be NULL + self.assertTrue(pd.isna(middle_row['nullable_int_col']), "nullable_int_col should be NULL/NaN") + # Check enum value: 500000 % 5 = 0 -> 'Level_A' + self.assertEqual(middle_row['enum_col'], 'Level_A') + print("Middle row data validation passed") + + # Check last row (number = 999999) + last_row = ret.iloc[999999] + self.assertEqual(last_row['row_id'], 999999) + self.assertEqual(last_row['int8_col'], 999999 % 127) # 999999 % 127 = 1 + self.assertEqual(last_row['uint8_col'], 999999 % 255) # 999999 % 255 = 144 + self.assertEqual(last_row['string_col'], 'row_999999') + self.assertEqual(last_row['bool_col'], False) # 999999 % 2 == 1 + # 999999 % 10 != 0, so should have value + self.assertFalse(pd.isna(last_row['nullable_int_col']), "nullable_int_col should not be NULL/NaN") + self.assertEqual(last_row['nullable_int_col'], 999999 % 1000) # 999 + # Check enum value: 999999 % 5 = 4 -> 'Level_E' + self.assertEqual(last_row['enum_col'], 'Level_E') + print("Last row data validation passed") + + # Validate nullable columns have some NULL values + null_count_nullable_int = ret['nullable_int_col'].isna().sum() + null_count_nullable_float = ret['nullable_float_col'].isna().sum() + null_count_nullable_string = ret['nullable_string_col'].isna().sum() + + self.assertEqual(null_count_nullable_int, 100000, "nullable_int_col should have exactly 100k NULLs (every 10th row)") + self.assertEqual(null_count_nullable_float, 66667, "nullable_float_col should have exactly 66,667 NULLs (every 15th row)") + self.assertEqual(null_count_nullable_string, 50000, "nullable_string_col should have exactly 50k NULLs (every 20th row)") + + print(f"NULL value validation: int({null_count_nullable_int}), float({null_count_nullable_float}), string({null_count_nullable_string})") + + # Validate array columns (using row 100, number = 100) + sample_array_int = ret.iloc[100]['array_int_col'] + sample_array_string = ret.iloc[100]['array_string_col'] + sample_array_float = ret.iloc[100]['array_float_col'] + + self.assertIsInstance(sample_array_int, np.ndarray, "array_int_col should be array-like") + self.assertIsInstance(sample_array_string, np.ndarray, "array_string_col should be array-like") + self.assertEqual(len(sample_array_int), 3, "array_int_col should have 3 elements") + self.assertEqual(len(sample_array_string), 2, "array_string_col should have 2 elements") + + # Validate specific array values for row 100 (number = 100) + expected_int_array = [100 % 100, 101 % 100, 102 % 100] # [0, 1, 2] + expected_string_array = ['0', '1'] # [toString(100 % 10), toString(101 % 10)] + + np.testing.assert_array_equal(sample_array_int, expected_int_array) + 
np.testing.assert_array_equal(sample_array_string, expected_string_array) + + print("Array column validation passed") + + # Validate tuple columns (using row 200, number = 200) + sample_tuple = ret.iloc[200]['tuple_int_string_col'] + sample_tuple_mixed = ret.iloc[200]['tuple_mixed_col'] + + self.assertIsInstance(sample_tuple, np.ndarray, "tuple_int_string_col should be array-like") + self.assertEqual(len(sample_tuple), 2, "tuple should have 2 elements") + + # Validate tuple values for row 200 (number = 200) + expected_tuple_int = 200 % 1000 # 200 + expected_tuple_string = 'tuple_0' # concat('tuple_', toString(200 % 100)) + + self.assertEqual(sample_tuple[0], expected_tuple_int) + self.assertEqual(sample_tuple[1], expected_tuple_string) + + # Validate mixed tuple: (float, date, bool) + self.assertEqual(len(sample_tuple_mixed), 3, "Mixed tuple should have 3 elements") + expected_float = 200 / 1000000.0 # 0.0002 + expected_bool = (200 % 2 == 0) # True + + self.assertAlmostEqual(sample_tuple_mixed[0], expected_float, places=7) + self.assertEqual(sample_tuple_mixed[2], expected_bool) + + print("Tuple column validation passed") + + # Validate JSON-like column (Map) (using row 300, number = 300) + sample_json = ret.iloc[300]['json_col'] + self.assertIsInstance(sample_json, dict, "json_col should be dict-like") + self.assertIn('id', sample_json, "JSON should have 'id' key") + self.assertIn('name', sample_json, "JSON should have 'name' key") + self.assertIn('score', sample_json, "JSON should have 'score' key") + self.assertIn('active', sample_json, "JSON should have 'active' key") + + # Validate specific JSON values for row 300 (number = 300) + self.assertEqual(sample_json['id'], '300') + self.assertEqual(sample_json['name'], 'user_300') # concat('user_', toString(300 % 10000)) + self.assertEqual(sample_json['score'], '0') # toString(toFloat64(300 % 100) / 10.0) = toString(0.0) + self.assertEqual(sample_json['active'], '1') # toString(300 % 3 = 0) = toString(true) + + print("JSON column validation passed") + + print("Large scale DataFrame test completed successfully!") + + +if __name__ == '__main__': + unittest.main() diff --git a/tests/test_dataframe_large_scale_2.py b/tests/test_dataframe_large_scale_2.py new file mode 100644 index 00000000000..f2339d16365 --- /dev/null +++ b/tests/test_dataframe_large_scale_2.py @@ -0,0 +1,238 @@ +#!/usr/bin/env python3 + +import os +import unittest +import time +from urllib.request import urlretrieve +import pandas as pd +import chdb +import json +import numpy as np +from datetime import timedelta + + +class TestDataFrameLargeScale(unittest.TestCase): + """Test DataFrame generation with large scale data (1M rows) and diverse data types""" + + @classmethod + def setUpClass(cls): + cls.parquet_file = "hits_0.parquet" + if not os.path.exists(cls.parquet_file): + print(f"Downloading {cls.parquet_file}...") + url = "https://datasets.clickhouse.com/hits_compatible/athena_partitioned/hits_0.parquet" + urlretrieve(url, cls.parquet_file) + print("Download complete!") + + @classmethod + def tearDownClass(cls): + pass + + def setUp(self): + self.session = chdb.session.Session("./tmp") + + def tearDown(self): + self.session.close() + + def test_pandas_chdb_dataframe_consistency(self): + """Compare pandas and chdb DataFrame consistency when reading the same parquet file""" + + print(f"Testing DataFrame consistency between pandas and chdb for {self.parquet_file}") + + # 1. 
Read with pandas + print("Reading parquet file with pandas...") + pandas_start = time.time() + df_pandas = pd.read_parquet(self.parquet_file) + pandas_time = time.time() - pandas_start + print(f"Pandas read time: {pandas_time:.3f} seconds") + + # 2. Read with chdb + print("Reading parquet file with chdb...") + chdb_start = time.time() + df_chdb = self.session.query(f""" + SELECT * FROM file('{self.parquet_file}') + """, "DataFrame") + chdb_time = time.time() - chdb_start + print(f"chdb read time: {chdb_time:.3f} seconds") + + # 3. Compare basic structure + print("\n=== Structure Comparison ===") + pandas_rows, pandas_cols = df_pandas.shape + chdb_rows, chdb_cols = df_chdb.shape + + print(f"Pandas DataFrame: {pandas_rows:,} rows × {pandas_cols} columns") + print(f"chdb DataFrame: {chdb_rows:,} rows × {chdb_cols} columns") + + # Assert row and column counts match + self.assertEqual(pandas_rows, chdb_rows, f"Row count mismatch: pandas={pandas_rows}, chdb={chdb_rows}") + self.assertEqual(pandas_cols, chdb_cols, f"Column count mismatch: pandas={pandas_cols}, chdb={chdb_cols}") + + # 4. Compare column names + print("\n=== Column Names Comparison ===") + pandas_columns = set(df_pandas.columns) + chdb_columns = set(df_chdb.columns) + + missing_in_chdb = pandas_columns - chdb_columns + missing_in_pandas = chdb_columns - pandas_columns + + if missing_in_chdb: + print(f"Columns missing in chdb: {missing_in_chdb}") + if missing_in_pandas: + print(f"Columns missing in pandas: {missing_in_pandas}") + + # Assert column names match + self.assertEqual(pandas_columns, chdb_columns, "Column names don't match between pandas and chdb") + print("✓ Column names match") + + # 5. Compare data types + print("\n=== Data Types Comparison ===") + + common_columns = list(pandas_columns.intersection(chdb_columns)) + self.assertEqual(len(common_columns), len(pandas_columns), "Column names don't match between pandas and chdb") + + print(f"Comparing data types for {len(common_columns)} columns:") + print("-" * 80) + + dtype_mismatches = [] + for col in common_columns: + pandas_dtype = str(df_pandas[col].dtype) + chdb_dtype = str(df_chdb[col].dtype) + + # Print each column's data types + match_status = "✓" if pandas_dtype == chdb_dtype else "✗" + print(f"{match_status} {col:<20} | pandas: {pandas_dtype:<20} | chdb: {chdb_dtype:<20}") + + if pandas_dtype != chdb_dtype: + dtype_mismatches.append({ + 'column': col, + 'pandas': pandas_dtype, + 'chdb': chdb_dtype + }) + + print("-" * 80) + + if dtype_mismatches: + print(f"\nData type differences found in {len(dtype_mismatches)} columns:") + for mismatch in dtype_mismatches: + print(f" ✗ {mismatch['column']}: pandas={mismatch['pandas']} vs chdb={mismatch['chdb']}") + self.fail("Data type differences found between pandas and chdb") + else: + print("\n✓ All data types match perfectly!") + + # 6. 
Compare data values every 1000 rows + print("\n=== Data Values Comparison (every 1000 rows) ===") + + # Sort both DataFrames by WatchID (unique identifier) to ensure consistent ordering + sort_col = 'WatchID' if 'WatchID' in common_columns else common_columns[0] + print(f"Sorting by column: {sort_col}") + df_pandas = df_pandas.sort_values(by=sort_col).reset_index(drop=True) + df_chdb = df_chdb.sort_values(by=sort_col).reset_index(drop=True) + self.assertEqual(len(df_pandas), len(df_chdb)) + + total_rows = len(df_pandas) + sample_interval = 1000 + sample_indices = list(range(0, total_rows, sample_interval)) + + # Add the last row if it's not already included + if total_rows - 1 not in sample_indices: + sample_indices.append(total_rows - 1) + + print(f"Comparing {len(sample_indices)} sample rows (every {sample_interval} rows)") + + data_mismatches = [] + for idx in sample_indices: + row_mismatches = [] + + for col in common_columns: + pandas_val = df_pandas.iloc[idx][col] + chdb_val = df_chdb.iloc[idx][col] + + # Handle different ways of representing None/NaN + pandas_is_na = pd.isna(pandas_val) + chdb_is_na = pd.isna(chdb_val) + + if pandas_is_na and chdb_is_na: + continue # Both are NaN, considered equal + elif pandas_is_na != chdb_is_na: + row_mismatches.append({ + 'column': col, + 'pandas': pandas_val, + 'chdb': chdb_val, + 'reason': 'null_mismatch' + }) + elif pandas_val != chdb_val: + # Handle bytes vs string comparison + if isinstance(pandas_val, bytes) and isinstance(chdb_val, str): + # Convert string to bytes for comparison (preserve original binary data) + try: + chdb_bytes = chdb_val.encode('utf-8') + if pandas_val == chdb_bytes: + continue # Values are equivalent after conversion + except: + print(f"Failed to encode string to bytes: {chdb_val}") + self.fail("Failed to encode string to bytes") + elif isinstance(pandas_val, str) and isinstance(chdb_val, bytes): + # Convert string to bytes for comparison (preserve original binary data) + try: + pandas_bytes = pandas_val.encode('utf-8') + if pandas_bytes == chdb_val: + continue # Values are equivalent after conversion + except: + print(f"Failed to encode string to bytes: {pandas_val}") + self.fail("Failed to encode string to bytes") + + # For floating point numbers, use approximate comparison + if isinstance(pandas_val, (float, int)) and isinstance(chdb_val, (float, int)): + if abs(float(pandas_val) - float(chdb_val)) > 1e-10: + row_mismatches.append({ + 'column': col, + 'pandas': pandas_val, + 'chdb': chdb_val, + 'reason': 'value_mismatch' + }) + else: + # Check if this is a bytes vs string type issue + is_bytes_string_mismatch = ( + (isinstance(pandas_val, bytes) and isinstance(chdb_val, str)) or + (isinstance(pandas_val, str) and isinstance(chdb_val, bytes)) + ) + + row_mismatches.append({ + 'column': col, + 'pandas': pandas_val, + 'chdb': chdb_val, + 'reason': 'bytes_string_mismatch' if is_bytes_string_mismatch else 'value_mismatch' + }) + + if row_mismatches: + data_mismatches.append({ + 'row_index': idx, + 'mismatches': row_mismatches + }) + + # 7. 
Report results + print(f"\n=== Summary ===") + print(f"✓ Row count: {pandas_rows:,}") + print(f"✓ Column count: {pandas_cols}") + print(f"✓ Sample rows checked: {len(sample_indices)}") + + if data_mismatches: + print(f"Data mismatches found in {len(data_mismatches)} rows") + + # Show first few mismatches for debugging + for i, mismatch in enumerate(data_mismatches[:3]): + print(f"\nRow {mismatch['row_index']} mismatches:") + for col_mismatch in mismatch['mismatches'][:5]: # Show first 5 column mismatches + print(f" {col_mismatch['column']}: pandas='{col_mismatch['pandas']}' vs chdb='{col_mismatch['chdb']}'") + + if len(data_mismatches) > 3: + print(f"... and {len(data_mismatches) - 3} more rows with mismatches") + + self.fail(f"Data mismatches found in {len(data_mismatches)} rows") + else: + print("All sampled data values match") + + print("\nDataFrame consistency test completed!") + + +if __name__ == '__main__': + unittest.main()
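The exact NULL counts asserted in test_large_scale_dataframe_generation follow from counting multiples of 10, 15 and 20 among numbers 0..999999, i.e. ceil(1000000 / k) rows per nullable column. A minimal standalone sketch of that arithmetic (plain Python, illustrative only, not part of the patch above):

import math

TOTAL_ROWS = 1_000_000

# number % k == 0 holds for the multiples of k in 0..TOTAL_ROWS-1,
# and there are ceil(TOTAL_ROWS / k) of them (0 included).
for k, expected_nulls in [(10, 100_000), (15, 66_667), (20, 50_000)]:
    assert math.ceil(TOTAL_ROWS / k) == expected_nulls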