Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 19 additions & 2 deletions cpp/src/parquet/arrow/arrow_reader_writer_test.cc
Original file line number Diff line number Diff line change
Expand Up @@ -5252,17 +5252,34 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test {
auto row_group_index_reader = page_index_reader->RowGroup(rg);
ASSERT_NE(row_group_index_reader, nullptr);

auto row_group_reader = reader->RowGroup(rg);
ASSERT_NE(row_group_reader, nullptr);

for (int col = 0; col < metadata->num_columns(); ++col) {
auto column_index = row_group_index_reader->GetColumnIndex(col);
column_indexes_.emplace_back(column_index.get());

bool expect_no_page_index =
expect_columns_without_index.find(col) != expect_columns_without_index.cend();

auto offset_index = row_group_index_reader->GetOffsetIndex(col);
if (expect_columns_without_index.find(col) !=
expect_columns_without_index.cend()) {
if (expect_no_page_index) {
ASSERT_EQ(offset_index, nullptr);
} else {
CheckOffsetIndex(offset_index.get(), expect_num_pages, &offset_lower_bound);
}

// Verify page stats are not written to page header if page index is enabled.
auto page_reader = row_group_reader->GetColumnPageReader(col);
ASSERT_NE(page_reader, nullptr);
std::shared_ptr<Page> page = nullptr;
while ((page = page_reader->NextPage()) != nullptr) {
if (page->type() == PageType::DATA_PAGE ||
page->type() == PageType::DATA_PAGE_V2) {
ASSERT_EQ(std::static_pointer_cast<DataPage>(page)->statistics().is_set(),
expect_no_page_index);
}
}
}
}
}
Expand Down
12 changes: 10 additions & 2 deletions cpp/src/parquet/column_writer.cc
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,11 @@ class SerializedPageWriter : public PageWriter {
ToThrift(page.definition_level_encoding()));
data_page_header.__set_repetition_level_encoding(
ToThrift(page.repetition_level_encoding()));
data_page_header.__set_statistics(ToThrift(page.statistics()));

// Write page statistics only when page index is not enabled.
if (column_index_builder_ == nullptr) {
data_page_header.__set_statistics(ToThrift(page.statistics()));
}

page_header.__set_type(format::PageType::DATA_PAGE);
page_header.__set_data_page_header(data_page_header);
Expand All @@ -479,7 +483,11 @@ class SerializedPageWriter : public PageWriter {
page.repetition_levels_byte_length());

data_page_header.__set_is_compressed(page.is_compressed());
data_page_header.__set_statistics(ToThrift(page.statistics()));

// Write page statistics only when page index is not enabled.
if (column_index_builder_ == nullptr) {
data_page_header.__set_statistics(ToThrift(page.statistics()));
}

page_header.__set_type(format::PageType::DATA_PAGE_V2);
page_header.__set_data_page_header_v2(data_page_header);
Expand Down
7 changes: 5 additions & 2 deletions cpp/src/parquet/properties.h
Original file line number Diff line number Diff line change
Expand Up @@ -524,8 +524,11 @@ class PARQUET_EXPORT WriterProperties {

/// Enable writing page index in general for all columns. Default disabled.
///
/// Page index contains statistics for data pages and can be used to skip pages
/// when scanning data in ordered and unordered columns.
/// Writing statistics to the page index disables the old method of writing
/// statistics to each data page header.
/// The page index makes filtering more efficient than the page header, as
/// it gathers all the statistics for a Parquet file in a single place,
/// avoiding scattered I/O.
///
/// Please check the link below for more details:
/// https://github.com/apache/parquet-format/blob/master/PageIndex.md
Expand Down
11 changes: 5 additions & 6 deletions docs/source/cpp/parquet.rst
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,8 @@ Statistics are enabled by default for all columns. You can disable statistics fo
all columns or specific columns using ``disable_statistics`` on the builder.
There is a ``max_statistics_size`` which limits the maximum number of bytes that
may be used for min and max values, useful for types like strings or binary blobs.
If the page index is enabled for a column using ``enable_write_page_index``, statistics
are not written to the page header because they are duplicated in the ColumnIndex.

There are also Arrow-specific settings that can be configured with
:class:`parquet::ArrowWriterProperties`:
Expand Down Expand Up @@ -573,20 +575,17 @@ Miscellaneous
+--------------------------+----------+----------+---------+
| Feature | Reading | Writing | Notes |
+==========================+==========+==========+=========+
| Column Index | ✓ | | \(1) |
| Column Index | ✓ | | \(1) |
+--------------------------+----------+----------+---------+
| Offset Index | ✓ | | \(1) |
| Offset Index | ✓ | | \(1) |
+--------------------------+----------+----------+---------+
| Bloom Filter | ✓ | ✓ | \(2) |
+--------------------------+----------+----------+---------+
| CRC checksums | ✓ | ✓ | \(3) |
| CRC checksums | ✓ | ✓ | |
+--------------------------+----------+----------+---------+

* \(1) Access to the Column and Offset Index structures is provided, but
data read APIs do not currently make any use of them.
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why remove this note? AFAIU it is still valid (we do not expose any high-level filtering feature).


* \(2) APIs are provided for creating, serializing and deserializing Bloom
Filters, but they are not integrated into data read APIs.

* \(3) For now, only the checksums of V1 Data Pages and Dictionary Pages
are computed.