apache · pitrou · Jun 6, 2023 · May 6, 2023 · May 26, 2023 · Jun 6, 2023
@@ -5252,17 +5252,34 @@ class ParquetPageIndexRoundTripTest : public ::testing::Test {
       auto row_group_index_reader = page_index_reader->RowGroup(rg);
       ASSERT_NE(row_group_index_reader, nullptr);
 
+      auto row_group_reader = reader->RowGroup(rg);
+      ASSERT_NE(row_group_reader, nullptr);
+
       for (int col = 0; col < metadata->num_columns(); ++col) {
         auto column_index = row_group_index_reader->GetColumnIndex(col);
         column_indexes_.emplace_back(column_index.get());
 
+        bool expect_no_page_index =
+            expect_columns_without_index.find(col) != expect_columns_without_index.cend();
+
         auto offset_index = row_group_index_reader->GetOffsetIndex(col);
-        if (expect_columns_without_index.find(col) !=
-            expect_columns_without_index.cend()) {
+        if (expect_no_page_index) {
           ASSERT_EQ(offset_index, nullptr);
         } else {
           CheckOffsetIndex(offset_index.get(), expect_num_pages, &offset_lower_bound);
         }
+
+        // Verify page stats are written to the page header only if the column has no page index.
+        auto page_reader = row_group_reader->GetColumnPageReader(col);
+        ASSERT_NE(page_reader, nullptr);
+        std::shared_ptr<Page> page = nullptr;
+        while ((page = page_reader->NextPage()) != nullptr) {
+          if (page->type() == PageType::DATA_PAGE ||
+              page->type() == PageType::DATA_PAGE_V2) {
+            ASSERT_EQ(std::static_pointer_cast<DataPage>(page)->statistics().is_set(),
+                      expect_no_page_index);
+          }
+        }
       }
     }
   }

@@ -460,7 +460,11 @@ class SerializedPageWriter : public PageWriter {
         ToThrift(page.definition_level_encoding()));
     data_page_header.__set_repetition_level_encoding(
         ToThrift(page.repetition_level_encoding()));
-    data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+    // Write page statistics only when page index is not enabled.
+    if (column_index_builder_ == nullptr) {
+      data_page_header.__set_statistics(ToThrift(page.statistics()));
+    }
 
     page_header.__set_type(format::PageType::DATA_PAGE);
     page_header.__set_data_page_header(data_page_header);
@@ -479,7 +483,11 @@ class SerializedPageWriter : public PageWriter {
         page.repetition_levels_byte_length());
 
     data_page_header.__set_is_compressed(page.is_compressed());
-    data_page_header.__set_statistics(ToThrift(page.statistics()));
+
+    // Write page statistics only when page index is not enabled.
+    if (column_index_builder_ == nullptr) {
+      data_page_header.__set_statistics(ToThrift(page.statistics()));
+    }
 
     page_header.__set_type(format::PageType::DATA_PAGE_V2);
     page_header.__set_data_page_header_v2(data_page_header);

@@ -524,8 +524,11 @@ class PARQUET_EXPORT WriterProperties {
 
     /// Enable writing page index in general for all columns. Default disabled.
     ///
-    /// Page index contains statistics for data pages and can be used to skip pages
-    /// when scanning data in ordered and unordered columns.
+    /// Writing statistics to the page index disables the old method of writing
+    /// statistics to each data page header.
+    /// The page index makes filtering more efficient than the page header, as
+    /// it gathers all the statistics for a Parquet file in a single place,
+    /// avoiding scattered I/O.
     ///
     /// Please check the link below for more details:
     /// https://github.com/apache/parquet-format/blob/master/PageIndex.md

diff --git a/docs/source/cpp/parquet.rst b/docs/source/cpp/parquet.rst
@@ -304,6 +304,8 @@ Statistics are enabled by default for all columns. You can disable statistics fo
 all columns or specific columns using ``disable_statistics`` on the builder.
 There is a ``max_statistics_size`` which limits the maximum number of bytes that
 may be used for min and max values, useful for types like strings or binary blobs.
+If the page index is enabled for a column using ``enable_write_page_index``, its
+statistics are not written to the page headers because they duplicate the ColumnIndex.
 
 There are also Arrow-specific settings that can be configured with
 :class:`parquet::ArrowWriterProperties`:
@@ -573,20 +575,17 @@ Miscellaneous
 +--------------------------+----------+----------+---------+
 | Feature                  | Reading  | Writing  | Notes   |
 +==========================+==========+==========+=========+
-| Column Index             | ✓        |          | \(1)    |
+| Column Index             | ✓        | ✓        | \(1)    |
 +--------------------------+----------+----------+---------+
-| Offset Index             | ✓        |          | \(1)    |
+| Offset Index             | ✓        | ✓        | \(1)    |
 +--------------------------+----------+----------+---------+
 | Bloom Filter             | ✓        | ✓        | \(2)    |
 +--------------------------+----------+----------+---------+
-| CRC checksums            | ✓        | ✓        | \(3)    |
+| CRC checksums            | ✓        | ✓        |         |
 +--------------------------+----------+----------+---------+
 
 * \(1) Access to the Column and Offset Index structures is provided, but
   data read APIs do not currently make any use of them.
 
 * \(2) APIs are provided for creating, serializing and deserializing Bloom
   Filters, but they are not integrated into data read APIs.
-
-* \(3) For now, only the checksums of V1 Data Pages and Dictionary Pages
-  are computed.