diff --git a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc index ad33ca296a2..ba3702ede4d 100644 --- a/cpp/src/parquet/arrow/arrow_reader_writer_test.cc +++ b/cpp/src/parquet/arrow/arrow_reader_writer_test.cc @@ -5252,17 +5252,34 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test { auto row_group_index_reader = page_index_reader->RowGroup(rg); ASSERT_NE(row_group_index_reader, nullptr); + auto row_group_reader = reader->RowGroup(rg); + ASSERT_NE(row_group_reader, nullptr); + for (int col = 0; col < metadata->num_columns(); ++col) { auto column_index = row_group_index_reader->GetColumnIndex(col); column_indexes_.emplace_back(column_index.get()); + bool expect_no_page_index = + expect_columns_without_index.find(col) != expect_columns_without_index.cend(); + auto offset_index = row_group_index_reader->GetOffsetIndex(col); - if (expect_columns_without_index.find(col) != - expect_columns_without_index.cend()) { + if (expect_no_page_index) { ASSERT_EQ(offset_index, nullptr); } else { CheckOffsetIndex(offset_index.get(), expect_num_pages, &offset_lower_bound); } + + // Verify page stats are not written to page header if page index is enabled. + auto page_reader = row_group_reader->GetColumnPageReader(col); + ASSERT_NE(page_reader, nullptr); + std::shared_ptr page = nullptr; + while ((page = page_reader->NextPage()) != nullptr) { + if (page->type() == PageType::DATA_PAGE || + page->type() == PageType::DATA_PAGE_V2) { + ASSERT_EQ(std::static_pointer_cast(page)->statistics().is_set(), + expect_no_page_index); + } + } } } } diff --git a/cpp/src/parquet/column_writer.cc b/cpp/src/parquet/column_writer.cc index f298f282fe7..33e9f8f6658 100644 --- a/cpp/src/parquet/column_writer.cc +++ b/cpp/src/parquet/column_writer.cc @@ -460,7 +460,11 @@ class SerializedPageWriter : public PageWriter { ToThrift(page.definition_level_encoding())); data_page_header.__set_repetition_level_encoding( ToThrift(page.repetition_level_encoding())); - data_page_header.__set_statistics(ToThrift(page.statistics())); + + // Write page statistics only when page index is not enabled. + if (column_index_builder_ == nullptr) { + data_page_header.__set_statistics(ToThrift(page.statistics())); + } page_header.__set_type(format::PageType::DATA_PAGE); page_header.__set_data_page_header(data_page_header); @@ -479,7 +483,11 @@ class SerializedPageWriter : public PageWriter { page.repetition_levels_byte_length()); data_page_header.__set_is_compressed(page.is_compressed()); - data_page_header.__set_statistics(ToThrift(page.statistics())); + + // Write page statistics only when page index is not enabled. + if (column_index_builder_ == nullptr) { + data_page_header.__set_statistics(ToThrift(page.statistics())); + } page_header.__set_type(format::PageType::DATA_PAGE_V2); page_header.__set_data_page_header_v2(data_page_header); diff --git a/cpp/src/parquet/properties.h b/cpp/src/parquet/properties.h index 0a9864de626..d1012fccf0c 100644 --- a/cpp/src/parquet/properties.h +++ b/cpp/src/parquet/properties.h @@ -524,8 +524,11 @@ class PARQUET_EXPORT WriterProperties { /// Enable writing page index in general for all columns. Default disabled. /// - /// Page index contains statistics for data pages and can be used to skip pages - /// when scanning data in ordered and unordered columns. + /// Writing statistics to the page index disables the old method of writing + /// statistics to each data page header. + /// The page index makes filtering more efficient than the page header, as + /// it gathers all the statistics for a Parquet file in a single place, + /// avoiding scattered I/O. /// /// Please check the link below for more details: /// https://github.com/apache/parquet-format/blob/master/PageIndex.md diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst index 0ea6063b2a2..95f2d8d98dc 100644 --- a/docs/source/cpp/parquet.rst +++ b/docs/source/cpp/parquet.rst @@ -304,6 +304,8 @@ Statistics are enabled by default for all columns. You can disable statistics fo all columns or specific columns using ``disable_statistics`` on the builder. There is a ``max_statistics_size`` which limits the maximum number of bytes that may be used for min and max values, useful for types like strings or binary blobs. +If a column has enabled page index using ``enable_write_page_index``, then it does +not write statistics to the page header because it is duplicated in the ColumnIndex. There are also Arrow-specific settings that can be configured with :class:`parquet::ArrowWriterProperties`: @@ -573,13 +575,13 @@ Miscellaneous +--------------------------+----------+----------+---------+ | Feature | Reading | Writing | Notes | +==========================+==========+==========+=========+ -| Column Index | ✓ | | \(1) | +| Column Index | ✓ | ✓ | \(1) | +--------------------------+----------+----------+---------+ -| Offset Index | ✓ | | \(1) | +| Offset Index | ✓ | ✓ | \(1) | +--------------------------+----------+----------+---------+ | Bloom Filter | ✓ | ✓ | \(2) | +--------------------------+----------+----------+---------+ -| CRC checksums | ✓ | ✓ | \(3) | +| CRC checksums | ✓ | ✓ | | +--------------------------+----------+----------+---------+ * \(1) Access to the Column and Offset Index structures is provided, but @@ -587,6 +589,3 @@ Miscellaneous * \(2) APIs are provided for creating, serializing and deserializing Bloom Filters, but they are not integrated into data read APIs. - -* \(3) For now, only the checksums of V1 Data Pages and Dictionary Pages - are computed.