Skip to content

Commit 7f8ccb5

Browse files
authored
GH-34375: [C++][Parquet] Ignore page header stats when page index enabled (#35455)
### Rationale for this change Page-level statistics are probably not used in production, and after adding column indexes they are useless. parquet-mr already stopped writing them in https://issues.apache.org/jira/browse/PARQUET-1365. ### What changes are included in this PR? Once page index is enabled for one column, it does not write page stats to the header any more. ### Are these changes tested? Added a test to check page stats have been skipped. ### Are there any user-facing changes? Yes (behavior change when page index is enabled). * Closes: #34375 Authored-by: Gang Wu <[email protected]> Signed-off-by: Antoine Pitrou <[email protected]>
1 parent 68cbc6f commit 7f8ccb5

File tree

4 files changed

+39
-12
lines changed

4 files changed

+39
-12
lines changed

cpp/src/parquet/arrow/arrow_reader_writer_test.cc

Lines changed: 19 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -5252,17 +5252,34 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test {
52525252
auto row_group_index_reader = page_index_reader->RowGroup(rg);
52535253
ASSERT_NE(row_group_index_reader, nullptr);
52545254

5255+
auto row_group_reader = reader->RowGroup(rg);
5256+
ASSERT_NE(row_group_reader, nullptr);
5257+
52555258
for (int col = 0; col < metadata->num_columns(); ++col) {
52565259
auto column_index = row_group_index_reader->GetColumnIndex(col);
52575260
column_indexes_.emplace_back(column_index.get());
52585261

5262+
bool expect_no_page_index =
5263+
expect_columns_without_index.find(col) != expect_columns_without_index.cend();
5264+
52595265
auto offset_index = row_group_index_reader->GetOffsetIndex(col);
5260-
if (expect_columns_without_index.find(col) !=
5261-
expect_columns_without_index.cend()) {
5266+
if (expect_no_page_index) {
52625267
ASSERT_EQ(offset_index, nullptr);
52635268
} else {
52645269
CheckOffsetIndex(offset_index.get(), expect_num_pages, &offset_lower_bound);
52655270
}
5271+
5272+
// Verify page stats are not written to page header if page index is enabled.
5273+
auto page_reader = row_group_reader->GetColumnPageReader(col);
5274+
ASSERT_NE(page_reader, nullptr);
5275+
std::shared_ptr<Page> page = nullptr;
5276+
while ((page = page_reader->NextPage()) != nullptr) {
5277+
if (page->type() == PageType::DATA_PAGE ||
5278+
page->type() == PageType::DATA_PAGE_V2) {
5279+
ASSERT_EQ(std::static_pointer_cast<DataPage>(page)->statistics().is_set(),
5280+
expect_no_page_index);
5281+
}
5282+
}
52665283
}
52675284
}
52685285
}

cpp/src/parquet/column_writer.cc

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -460,7 +460,11 @@ class SerializedPageWriter : public PageWriter {
460460
ToThrift(page.definition_level_encoding()));
461461
data_page_header.__set_repetition_level_encoding(
462462
ToThrift(page.repetition_level_encoding()));
463-
data_page_header.__set_statistics(ToThrift(page.statistics()));
463+
464+
// Write page statistics only when page index is not enabled.
465+
if (column_index_builder_ == nullptr) {
466+
data_page_header.__set_statistics(ToThrift(page.statistics()));
467+
}
464468

465469
page_header.__set_type(format::PageType::DATA_PAGE);
466470
page_header.__set_data_page_header(data_page_header);
@@ -479,7 +483,11 @@ class SerializedPageWriter : public PageWriter {
479483
page.repetition_levels_byte_length());
480484

481485
data_page_header.__set_is_compressed(page.is_compressed());
482-
data_page_header.__set_statistics(ToThrift(page.statistics()));
486+
487+
// Write page statistics only when page index is not enabled.
488+
if (column_index_builder_ == nullptr) {
489+
data_page_header.__set_statistics(ToThrift(page.statistics()));
490+
}
483491

484492
page_header.__set_type(format::PageType::DATA_PAGE_V2);
485493
page_header.__set_data_page_header_v2(data_page_header);

cpp/src/parquet/properties.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -524,8 +524,11 @@ class PARQUET_EXPORT WriterProperties {
524524

525525
/// Enable writing page index in general for all columns. Default disabled.
526526
///
527-
/// Page index contains statistics for data pages and can be used to skip pages
528-
/// when scanning data in ordered and unordered columns.
527+
/// Writing statistics to the page index disables the old method of writing
528+
/// statistics to each data page header.
529+
/// The page index makes filtering more efficient than the page header, as
530+
/// it gathers all the statistics for a Parquet file in a single place,
531+
/// avoiding scattered I/O.
529532
///
530533
/// Please check the link below for more details:
531534
/// https://github.com/apache/parquet-format/blob/master/PageIndex.md

docs/source/cpp/parquet.rst

Lines changed: 5 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -304,6 +304,8 @@ Statistics are enabled by default for all columns. You can disable statistics fo
304304
all columns or specific columns using ``disable_statistics`` on the builder.
305305
There is a ``max_statistics_size`` which limits the maximum number of bytes that
306306
may be used for min and max values, useful for types like strings or binary blobs.
307+
If a column has enabled page index using ``enable_write_page_index``, then it does
308+
not write statistics to the page header because it is duplicated in the ColumnIndex.
307309

308310
There are also Arrow-specific settings that can be configured with
309311
:class:`parquet::ArrowWriterProperties`:
@@ -573,20 +575,17 @@ Miscellaneous
573575
+--------------------------+----------+----------+---------+
574576
| Feature | Reading | Writing | Notes |
575577
+==========================+==========+==========+=========+
576-
| Column Index || | \(1) |
578+
| Column Index || | \(1) |
577579
+--------------------------+----------+----------+---------+
578-
| Offset Index || | \(1) |
580+
| Offset Index || | \(1) |
579581
+--------------------------+----------+----------+---------+
580582
| Bloom Filter ||| \(2) |
581583
+--------------------------+----------+----------+---------+
582-
| CRC checksums ||| \(3) |
584+
| CRC checksums ||| |
583585
+--------------------------+----------+----------+---------+
584586

585587
* \(1) Access to the Column and Offset Index structures is provided, but
586588
data read APIs do not currently make any use of them.
587589

588590
* \(2) APIs are provided for creating, serializing and deserializing Bloom
589591
Filters, but they are not integrated into data read APIs.
590-
591-
* \(3) For now, only the checksums of V1 Data Pages and Dictionary Pages
592-
are computed.

0 commit comments

Comments
 (0)