From 71db832790ed55936e2e40f58462ca0ed0b08fc8 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Sat, 6 May 2023 10:43:15 +0800 Subject: [PATCH 1/4] GH-34375: [C++][Parquet] Ignore page header stats when page index enabled --- .../parquet/arrow/arrow_reader_writer_test.cc | 21 +++++++++++++++++-- cpp/src/parquet/column_writer.cc | 12 +++++++++-- 2 files changed, 29 insertions(+), 4 deletions(-) diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ad33ca296a2..ba3702ede4d 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5252,17 +5252,34 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test { auto row_group_index_reader = page_index_reader->RowGroup(rg); ASSERT_NE(row_group_index_reader, nullptr); + auto row_group_reader = reader->RowGroup(rg); + ASSERT_NE(row_group_reader, nullptr); + for (int col = 0; col < metadata->num_columns(); ++col) { auto column_index = row_group_index_reader->GetColumnIndex(col); column_indexes_.emplace_back(column_index.get()); + bool expect_no_page_index = + expect_columns_without_index.find(col) != expect_columns_without_index.cend(); + auto offset_index = row_group_index_reader->GetOffsetIndex(col); - if (expect_columns_without_index.find(col) != - expect_columns_without_index.cend()) { + if (expect_no_page_index) { ASSERT_EQ(offset_index, nullptr); } else { CheckOffsetIndex(offset_index.get(), expect_num_pages, &offset_lower_bound); } + + // Verify page stats are not written to page header if page index is enabled. + auto page_reader = row_group_reader->GetColumnPageReader(col); + ASSERT_NE(page_reader, nullptr); + std::shared_ptr page = nullptr; + while ((page = page_reader->NextPage()) != nullptr) { + if (page->type() == PageType::DATA_PAGE || + page->type() == PageType::DATA_PAGE_V2) { + ASSERT_EQ(std::static_pointer_cast(page)->statistics().is_set(), + expect_no_page_index); + } + } } } } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f298f282fe7..33e9f8f6658 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -460,7 +460,11 @@ class SerializedPageWriter : public PageWriter { ToThrift(page.definition_level_encoding())); data_page_header.__set_repetition_level_encoding( ToThrift(page.repetition_level_encoding())); - data_page_header.__set_statistics(ToThrift(page.statistics())); + + // Write page statistics only when page index is not enabled. + if (column_index_builder_ == nullptr) { + data_page_header.__set_statistics(ToThrift(page.statistics())); + } page_header.__set_type(format::PageType::DATA_PAGE); page_header.__set_data_page_header(data_page_header); @@ -479,7 +483,11 @@ class SerializedPageWriter : public PageWriter { page.repetition_levels_byte_length()); data_page_header.__set_is_compressed(page.is_compressed()); - data_page_header.__set_statistics(ToThrift(page.statistics())); + + // Write page statistics only when page index is not enabled. + if (column_index_builder_ == nullptr) { + data_page_header.__set_statistics(ToThrift(page.statistics())); + } page_header.__set_type(format::PageType::DATA_PAGE_V2); page_header.__set_data_page_header_v2(data_page_header); From 9c2f00279074028445dba696ca7798e28939580c Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Fri, 26 May 2023 13:26:37 +0800 Subject: [PATCH 2/4] update docstring --- cpp/src/parquet/properties.h | 5 ++++- docs/source/cpp/parquet.rst | 18 +++++++----------- 2 files changed, 11 insertions(+), 12 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 0a9864de626..c7272cd0633 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -525,7 +525,8 @@ class PARQUET_EXPORT WriterProperties { /// Enable writing page index in general for all columns. Default disabled. /// /// Page index contains statistics for data pages and can be used to skip pages - /// when scanning data in ordered and unordered columns. + /// when scanning data in ordered and unordered columns. Note that it does not + /// write statistics to the page header once page index is enabled. /// /// Please check the link below for more details: /// https://github.com/apache/parquet-format/blob/master/PageIndex.md @@ -541,6 +542,8 @@ class PARQUET_EXPORT WriterProperties { } /// Enable writing page index for column specified by `path`. Default disabled. + /// Note that it does not write statistics to the page header once page index is + /// enabled. Builder* enable_write_page_index(const std::string& path) { page_index_enabled_[path] = true; return this; diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 0ea6063b2a2..75e6f9bf289 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -304,6 +304,8 @@ Statistics are enabled by default for all columns. You can disable statistics fo all columns or specific columns using ``disable_statistics`` on the builder. There is a ``max_statistics_size`` which limits the maximum number of bytes that may be used for min and max values, useful for types like strings or binary blobs. +If a column has enabled page index using ``enable_write_page_index``, then it does +not write statistics to the page header because it is duplicated in the ColumnIndex. There are also Arrow-specific settings that can be configured with :class:`parquet::ArrowWriterProperties`: @@ -573,20 +575,14 @@ Miscellaneous +--------------------------+----------+----------+---------+ | Feature | Reading | Writing | Notes | +==========================+==========+==========+=========+ -| Column Index | ✓ | | \(1) | +| Column Index | ✓ | ✓ | | +--------------------------+----------+----------+---------+ -| Offset Index | ✓ | | \(1) | +| Offset Index | ✓ | ✓ | | +--------------------------+----------+----------+---------+ -| Bloom Filter | ✓ | ✓ | \(2) | +| Bloom Filter | ✓ | ✓ | \(1) | +--------------------------+----------+----------+---------+ -| CRC checksums | ✓ | ✓ | \(3) | +| CRC checksums | ✓ | ✓ | | +--------------------------+----------+----------+---------+ -* \(1) Access to the Column and Offset Index structures is provided, but - data read APIs do not currently make any use of them. - -* \(2) APIs are provided for creating, serializing and deserializing Bloom +* \(1) APIs are provided for creating, serializing and deserializing Bloom Filters, but they are not integrated into data read APIs. - -* \(3) For now, only the checksums of V1 Data Pages and Dictionary Pages - are computed. From cd7414db178529db46d69ac67afa671471812724 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 6 Jun 2023 17:28:29 +0800 Subject: [PATCH 3/4] modify comment and doc --- cpp/src/parquet/properties.h | 8 +++++--- docs/source/cpp/parquet.rst | 11 +++++++---- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index c7272cd0633..0a8d776d39c 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -524,9 +524,11 @@ class PARQUET_EXPORT WriterProperties { /// Enable writing page index in general for all columns. Default disabled. /// - /// Page index contains statistics for data pages and can be used to skip pages - /// when scanning data in ordered and unordered columns. Note that it does not - /// write statistics to the page header once page index is enabled. + /// Writing statistics to the page index disables the old method of writing + /// statistics to each data page header. + /// The page index makes filtering more efficient than the page header, as + /// it gathers all the statistics for a Parquet file in a single place, + /// avoiding scattered I/O. /// /// Please check the link below for more details: /// https://github.com/apache/parquet-format/blob/master/PageIndex.md diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 75e6f9bf289..95f2d8d98dc 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -575,14 +575,17 @@ Miscellaneous +--------------------------+----------+----------+---------+ | Feature | Reading | Writing | Notes | +==========================+==========+==========+=========+ -| Column Index | ✓ | ✓ | | +| Column Index | ✓ | ✓ | \(1) | +--------------------------+----------+----------+---------+ -| Offset Index | ✓ | ✓ | | +| Offset Index | ✓ | ✓ | \(1) | +--------------------------+----------+----------+---------+ -| Bloom Filter | ✓ | ✓ | \(1) | +| Bloom Filter | ✓ | ✓ | \(2) | +--------------------------+----------+----------+---------+ | CRC checksums | ✓ | ✓ | | +--------------------------+----------+----------+---------+ -* \(1) APIs are provided for creating, serializing and deserializing Bloom +* \(1) Access to the Column and Offset Index structures is provided, but + data read APIs do not currently make any use of them. + +* \(2) APIs are provided for creating, serializing and deserializing Bloom Filters, but they are not integrated into data read APIs. From 336058c84ad8c128823239a6a46edee304b5e411 Mon Sep 17 00:00:00 2001 From: Gang Wu Date: Tue, 6 Jun 2023 17:37:01 +0800 Subject: [PATCH 4/4] remove confusing line --- cpp/src/parquet/properties.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 0a8d776d39c..d1012fccf0c 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -544,8 +544,6 @@ class PARQUET_EXPORT WriterProperties { } /// Enable writing page index for column specified by `path`. Default disabled. - /// Note that it does not write statistics to the page header once page index is - /// enabled. Builder* enable_write_page_index(const std::string& path) { page_index_enabled_[path] = true; return this;