From 6d1dae7ac5dfb23fa1ac1fed5b77d3b919fbb5f8 Mon Sep 17 00:00:00 2001 From: Raz Luvaton <16746759+rluvaton@users.noreply.github.com> Date: Mon, 29 Sep 2025 16:01:41 +0300 Subject: [PATCH] Add backward compat nested file --- data/README.md | 1 + data/backward_compat_nested.md | 305 ++++++++++++++++++++++++++++ data/backward_compat_nested.parquet | Bin 0 -> 17719 bytes 3 files changed, 306 insertions(+) create mode 100644 data/backward_compat_nested.md create mode 100644 data/backward_compat_nested.parquet diff --git a/data/README.md b/data/README.md index 912f3ef..00deef0 100644 --- a/data/README.md +++ b/data/README.md @@ -61,6 +61,7 @@ | unknown-logical-type.parquet | A file containing a column annotated with a LogicalType whose identifier has been set to an abitrary high value to check the behaviour of an old reader reading a file written by a new writer containing an unsupported type (see [related issue](https://github.com/apache/arrow/issues/41764)). | | int96_from_spark.parquet | Single column of (deprecated) int96 values that originated as Apache Spark microsecond-resolution timestamps. Some values are outside the range typically representable by 64-bit nanosecond-resolution timestamps. See [int96_from_spark.md](int96_from_spark.md) for details. | | binary_truncated_min_max.parquet | A file containing six columns with exact, fully-truncated and partially-truncated max and min statistics and with the expected is_{min/max}_value_exact. (see [note](Binary-truncated-min-and-max-statistics)).| +| backward_compat_nested.parquet | A file with a deeply nested schema (optional and repeated groups) that was written by an older Parquet writer, used to test that readers remain backward compatible with such files. See [backward_compat_nested.md](backward_compat_nested.md) for details. | TODO: Document what each file is in the table above. 
diff --git a/data/backward_compat_nested.md b/data/backward_compat_nested.md new file mode 100644 index 0000000..8d3b6c8 --- /dev/null +++ b/data/backward_compat_nested.md @@ -0,0 +1,305 @@ +# Backward compat list +Explanation for [`./backward_compat_nested.parquet`](./backward_compat_nested.parquet) + +This file was generated using older Parquet libraries to ensure backward compatibility with older writers. + +## Generation + +`build.sbt`: +```sbt +import sbt.Keys.libraryDependencies + +import scala.collection.Seq + +ThisBuild / version := "0.1.0-SNAPSHOT" + +ThisBuild / scalaVersion := "2.12.20" + +lazy val root = (project in file(".")) + .settings( + name := "generate-parquet", + libraryDependencies ++= Seq( + "org.apache.parquet" % "parquet-hadoop" % "1.12.0", + "org.apache.parquet" % "parquet-common" % "1.12.0", + "org.apache.parquet" % "parquet-column" % "1.12.0", + "org.apache.hadoop" % "hadoop-client" % "3.3.1" + ) + + ) +``` + + +```scala +import org.apache.parquet.hadoop.ParquetWriter +import org.apache.parquet.hadoop.metadata.CompressionCodecName +import org.apache.parquet.schema._ +import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName._ +import org.apache.parquet.schema.Type.Repetition._ +import org.apache.parquet.schema.OriginalType._ +import org.apache.hadoop.fs.Path +import org.apache.hadoop.conf.Configuration +import org.apache.parquet.example.data.{Group, GroupWriter} +import org.apache.parquet.example.data.simple.SimpleGroupFactory +import org.apache.parquet.hadoop.example.{ExampleParquetWriter, GroupWriteSupport} + +object ParquetWriterApp { + + def buildSchema(): MessageType = { + new MessageType("MySchema", + // col_1 group: optional struct of scalar leaves plus a REPEATED ENUM-annotated binary (col_14) and a REPEATED group (col_15) + new GroupType(OPTIONAL, "col_1", + new PrimitiveType(OPTIONAL, INT64, "col_2"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_3"), + new PrimitiveType(OPTIONAL, BINARY, "col_4"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_5"), + new PrimitiveType(OPTIONAL, INT32, "col_6"), + new
PrimitiveType(OPTIONAL, BINARY, "col_7"), + new GroupType(OPTIONAL, "col_8", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_9") + ), + new PrimitiveType(OPTIONAL, BOOLEAN, "col_10"), + new GroupType(OPTIONAL, "col_11", + new PrimitiveType(OPTIONAL, BINARY, "col_12") + ), + new PrimitiveType(OPTIONAL, BOOLEAN, "col_13"), + Types.primitive(BINARY, REPEATED).as(ENUM).named("col_14"), + new GroupType(REPEATED, "col_15", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_16"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_17"), + new GroupType(OPTIONAL, "col_18", + new PrimitiveType(OPTIONAL, INT64, "col_19"), + new PrimitiveType(OPTIONAL, INT32, "col_20") + ) + ) + ), + // col_21 group: same shape as col_1 with one extra INT32 leaf (col_27) and without the repeated ENUM leaf + new GroupType(OPTIONAL, "col_21", + new PrimitiveType(OPTIONAL, INT64, "col_22"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_23"), + new PrimitiveType(OPTIONAL, BINARY, "col_24"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_25"), + new PrimitiveType(OPTIONAL, INT32, "col_26"), + new PrimitiveType(OPTIONAL, INT32, "col_27"), + new PrimitiveType(OPTIONAL, BINARY, "col_28"), + new GroupType(OPTIONAL, "col_29", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_30") + ), + new PrimitiveType(OPTIONAL, BOOLEAN, "col_31"), + new GroupType(OPTIONAL, "col_32", + new PrimitiveType(OPTIONAL, BINARY, "col_33") + ), + new PrimitiveType(OPTIONAL, BOOLEAN, "col_34"), + new GroupType(REPEATED, "col_35", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_36"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_37"), + new GroupType(OPTIONAL, "col_38", + new PrimitiveType(OPTIONAL, INT64, "col_39"), + new PrimitiveType(OPTIONAL, INT32, "col_40") + ) + ) + ), + // col_41 group: two optional sub-groups, each an (INT64, INT32) pair + new GroupType(OPTIONAL, "col_41", + new GroupType(OPTIONAL, "col_42", + new PrimitiveType(OPTIONAL, INT64, "col_43"), + new PrimitiveType(OPTIONAL, INT32, "col_44") + ), + new GroupType(OPTIONAL, "col_45", + new PrimitiveType(OPTIONAL, INT64, "col_46"), + new
PrimitiveType(OPTIONAL, INT32, "col_47") + ) + ), + // col_48: top-level optional ENUM-annotated binary + Types.primitive(BINARY, OPTIONAL).as(ENUM).named("col_48"), + // col_49 group: numeric leaves, one ENUM leaf (col_58) and two single-field sub-groups + new GroupType(OPTIONAL, "col_49", + new PrimitiveType(OPTIONAL, INT32, "col_50"), + new PrimitiveType(OPTIONAL, INT64, "col_51"), + new PrimitiveType(OPTIONAL, FLOAT, "col_52"), + new PrimitiveType(OPTIONAL, INT32, "col_53"), + new PrimitiveType(OPTIONAL, INT32, "col_54"), + new PrimitiveType(OPTIONAL, INT32, "col_55"), + new PrimitiveType(OPTIONAL, INT64, "col_56"), + new PrimitiveType(OPTIONAL, INT64, "col_57"), + Types.primitive(BINARY, OPTIONAL).as(ENUM).named("col_58"), + new GroupType(OPTIONAL, "col_59", + new PrimitiveType(OPTIONAL, INT32, "col_60") + ), + new GroupType(OPTIONAL, "col_61", + new PrimitiveType(OPTIONAL, INT32, "col_62") + ), + new PrimitiveType(OPTIONAL, DOUBLE, "col_63"), + new PrimitiveType(OPTIONAL, DOUBLE, "col_64"), + new PrimitiveType(OPTIONAL, INT32, "col_65") + ), + // col_66 group: same field layout as col_49 + new GroupType(OPTIONAL, "col_66", + new PrimitiveType(OPTIONAL, INT32, "col_67"), + new PrimitiveType(OPTIONAL, INT64, "col_68"), + new PrimitiveType(OPTIONAL, FLOAT, "col_69"), + new PrimitiveType(OPTIONAL, INT32, "col_70"), + new PrimitiveType(OPTIONAL, INT32, "col_71"), + new PrimitiveType(OPTIONAL, INT32, "col_72"), + new PrimitiveType(OPTIONAL, INT64, "col_73"), + new PrimitiveType(OPTIONAL, INT64, "col_74"), + Types.primitive(BINARY, OPTIONAL).as(ENUM).named("col_75"), + new GroupType(OPTIONAL, "col_76", + new PrimitiveType(OPTIONAL, INT32, "col_77") + ), + new GroupType(OPTIONAL, "col_78", + new PrimitiveType(OPTIONAL, INT32, "col_79") + ), + new PrimitiveType(OPTIONAL, DOUBLE, "col_80"), + new PrimitiveType(OPTIONAL, DOUBLE, "col_81"), + new PrimitiveType(OPTIONAL, INT32, "col_82") + ), + // col_83 group: alternating BOOLEAN / INT32 leaves + new GroupType(OPTIONAL, "col_83", + new PrimitiveType(OPTIONAL, BOOLEAN, "col_84"), + new PrimitiveType(OPTIONAL, INT32, "col_85"), + new PrimitiveType(OPTIONAL, BOOLEAN, "col_86"), + new
PrimitiveType(OPTIONAL, INT32, "col_87"), + new PrimitiveType(OPTIONAL, BOOLEAN, "col_88"), + new PrimitiveType(OPTIONAL, INT32, "col_89") + ), + // col_90 group: widest struct - scalar leaves, one REPEATED group (col_98) and many small optional sub-groups, some nested one level further (col_106, col_112) + new GroupType(OPTIONAL, "col_90", + new PrimitiveType(OPTIONAL, INT64, "col_91"), + Types.primitive(BINARY, OPTIONAL).as(ENUM).named("col_92"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_93"), + new PrimitiveType(OPTIONAL, INT64, "col_94"), + new PrimitiveType(OPTIONAL, INT64, "col_95"), + new PrimitiveType(OPTIONAL, INT64, "col_96"), + new PrimitiveType(OPTIONAL, INT64, "col_97"), + new GroupType(REPEATED, "col_98", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_99"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_100") + ), + new GroupType(OPTIONAL, "col_101", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_102") + ), + new GroupType(OPTIONAL, "col_103", + new PrimitiveType(OPTIONAL, BINARY, "col_104"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_105"), + new GroupType(OPTIONAL, "col_106", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_107") + ) + ), + new GroupType(OPTIONAL, "col_108", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_109"), + new PrimitiveType(OPTIONAL, BINARY, "col_110"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_111"), + new GroupType(OPTIONAL, "col_112", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_113") + ) + ), + new GroupType(OPTIONAL, "col_114", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_115"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_116") + ), + new GroupType(OPTIONAL, "col_117", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_118") + ), + new GroupType(OPTIONAL, "col_119", + new PrimitiveType(OPTIONAL, INT64, "col_120") + ), + new GroupType(OPTIONAL, "col_121", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_122") + ), + new GroupType(OPTIONAL, "col_123", + new PrimitiveType(OPTIONAL, INT64, "col_124"), + Types.primitive(BINARY,
OPTIONAL).as(UTF8).named("col_125") + ), + new GroupType(OPTIONAL, "col_126", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_127") + ), + new GroupType(OPTIONAL, "col_128", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_129") + ), + new GroupType(OPTIONAL, "col_130", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_131"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_132") + ), + new GroupType(OPTIONAL, "col_133", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_134") + ), + new GroupType(OPTIONAL, "col_135", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_136") + ), + new GroupType(OPTIONAL, "col_137", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_138") + ), + new GroupType(OPTIONAL, "col_139", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_140") + ), + new GroupType(OPTIONAL, "col_141", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_142") + ), + new GroupType(OPTIONAL, "col_143", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_144") + ), + new GroupType(OPTIONAL, "col_145", + new PrimitiveType(OPTIONAL, INT64, "col_146") + ), + new PrimitiveType(OPTIONAL, BOOLEAN, "col_147") + ), + // Remaining top-level fields: optional scalars interleaved with small optional groups + new PrimitiveType(OPTIONAL, BOOLEAN, "col_148"), + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_149"), + new PrimitiveType(OPTIONAL, BOOLEAN, "col_150"), + new GroupType(OPTIONAL, "col_151", + new PrimitiveType(OPTIONAL, INT64, "col_152"), + new PrimitiveType(OPTIONAL, INT32, "col_153") + ), + new PrimitiveType(OPTIONAL, BOOLEAN, "col_154"), + new GroupType(OPTIONAL, "col_155", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_156") + ), + new GroupType(OPTIONAL, "col_157", + Types.primitive(BINARY, OPTIONAL).as(UTF8).named("col_158"), + new PrimitiveType(OPTIONAL, INT32, "col_159") + ), + new PrimitiveType(OPTIONAL, BINARY, "col_160"), + new PrimitiveType(OPTIONAL, BINARY, "col_161"), + new GroupType(OPTIONAL, "col_162", + new PrimitiveType(OPTIONAL,
INT32, "col_163"), + new PrimitiveType(OPTIONAL, INT64, "col_164") + ) + ) + } + + def main(args: Array[String]): Unit = { + val outputPath = if (args.length > 0) new Path(args(0)) else new Path("output.parquet") + val schema = buildSchema() + val conf = new Configuration() + + GroupWriteSupport.setSchema(schema, conf) + + val writer = ExampleParquetWriter.builder(outputPath) + .withConf(conf) + .withCompressionCodec(CompressionCodecName.SNAPPY) + .withWriteMode(org.apache.parquet.hadoop.ParquetFileWriter.Mode.OVERWRITE) + .build() + + try { + val factory = new SimpleGroupFactory(schema) + + // Build one minimal record for the nested schema returned by buildSchema() + // Every field is declared OPTIONAL or REPEATED, so the record may leave almost all of them unset + val record = factory.newGroup() + + // Populate only col_1.col_2, so the single record written carries exactly one explicitly set leaf value + // The schema structure is what readers are exercised against; the data itself is incidental + val col1 = record.addGroup("col_1") + col1.add("col_2", 1L) + + writer.write(record) + + println(s"Successfully wrote Parquet file with problematic schema to: ${outputPath}") + println(s"Schema has ${schema.getFieldCount} top-level fields") + } finally { + writer.close() + } + } +} +``` \ No newline at end of file diff --git a/data/backward_compat_nested.parquet b/data/backward_compat_nested.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d9c69c3c657f2eb319f63216c271d7ddf4561745 GIT binary patch literal 17719 zcmeHPeTY=o6(7fS-I(<=zR9lHnAPZJ)o5nk$IKgJZIXW3)TB0P8~d5AacyihNo|s* z|B1-3$g(WMAU1?e5F;WYX%PEZM6B5DMiyC?X(}RC5)mt6#ReG`k)Ct!efQjVW?oj3 zO8T)$aL+q)f4}p)=i|P&&mn6beyQ8V;NAKbD+!nNRzo5&-Jh69Bqk;k z6W|B_10PJYW?7Zr^}S&%o7^OpP56KQcn&1+uV)G$ObGK*pGZtiCn0AO;a~VuWarN!xw*BF7C22&Q=E{h5G9nh_HkV;pN6-HG?-N5dD8H(!F8lcBpFKZV`My!2z?#|S^h$78X7>gUZflg&*H;cvNyX(gLeiA3|v3C&Fj_zy3W za=%|%)%sVwF)spW?#MbUaq&ERttj3+AK6!k16i}nT5N^OJ-pTdC4r?O zFqOPhwX)(gZJwK^l^vL_&C%%^);~j=qh$8aB=1a}onV&sR>%zQAupN1Z1R#BSmY%$
zNRgM!;9l~Q88nfX%%GXPWCnA{OJ;B%dC3e~$V+B$KY7Uv=8~7p-~sZI8MKm@%wQgQ z$qeR`m(1Wn@{$=WATOE0Lh_Osw2`+>x5kNwwAtIP7YvixyNJAttS3}g7Z+>q2${tt z?`X2~WmnI(t3WR_g=l3DV|>*--A zX0$yn?q!FFV|~NlG*jBUOT8vc*4K=?u1nhPyP1U7hde# z++i(9c*&+oFTA|z`HTg>0Msp1S9Zc`3~gs&Vs#6?!O$Mpu)XlpWVk+Se45kzzLhlm z`4E)=RK19w#--BCglpv`h@oW;vec=C?!t!Et3jF z;+k1;{jky@GHU`QVtC$A``Vh$5YKw+T&#=x)9@Nl<*&q1`Gvq@ZKY1}J6h@zN=$b4~ zxp0VK1e8agcIw)u6H0lU@KjURs$A+SJY9%U6wHWeOtixI@C695d39YwX$A^N9GMJP zmC*-|)H*UQPPf4kDm8uK0Ys==c#tVxzbdkrIV_D_A&+TgvuxT`*|eST%u$F) zHtkTO)?>45+Ev-K^}S6ohS}5{PCA+iC(Ndi38zY_<3uyzxI7bZnyb62jI57yip;X< zRAtlAcNxjyR&^+x?2#yLl~R`}b*q%ZNe>)UGZ#h(XIRBl6Roabq^WD%Fq_7DIPUch zJ-qK2%;AAAx$wlNIB?_>j^)m)>O?OWtr{Hu>PZL>9K{c39%+T+-@FocXSpsbZis}# zI{;-P>n2jk=9cE0ueUX{Sar=_Lw%yTT|SqIKVjBV$<*r9nzn`&^eUIERH9wK;3Y5o z)(hV;fb%htNZowSnw{!eVXaKGt!=Ryt3yBuA;j|3%C@yDvhW(1puKAWIIO@bER>?X>bola07WB!~)3BUT?L?}Bf-0Dd-H7>!r?O>rkt&$Xs16|2 zkf16U6E7s#vaYtQ>x$H4ZG{E~KaTLr0zLrnu(cy_Sy+S*aQJD2cRhwVDcltUm!%ca z?_$xfB7Coa55&RcP>bjszWql4KOv*z*P_&1M8wOc5YeZw=x-wYBN;soF8e`5=kWIs zzV&gj_q${E4)=2yY7zZz7JUHWeF8o}aM?Ry;qqcDaL*4}1~|)EWH};5G}2s}DRAJX z8;WRTBCaF+j1&Rz@NkO2Z7IU&0?yL)V_>OB5mRGC7+o-xiReT4&L=RD0{~aU8POmo z+@T2{;3S8T%v1G##>f*BeGE_GX%8eBQ#t}GKiKdEAma+dC&0LuZv(mz9wOHCH4 zR{sp9WjC^%lv;qLrWRQ)mu7F2vjoU;NotuHqeZdM2=>opT23KL;j1Dqz@pl>NhH~o zvxj%HQI5NSxIF?F%+k@@B~A{VM?)9PVqr%Seq6vS01uzCB5)eI3WxXp6yWD&^m|&Y z2J_f*RdhPCgL_!?y$HW0qXRs0>VW1IR5;5qWEqws?yb#|l`{>G`r$AK_cARXBFj~&1z4ukXripq z1sr|_;oaX5nP`fE8(q-EqHp^d!1oJy1>m)MFl%%Hhwno8po|W`9z@{TteB%Ll`&{$ z(GMd0qKppkNE3x{Dr1GiPau5zDzW!DZ2m&HJlOHZ9n4|T&merafL8!sOMK2KKZjpK z_%Rv%z8E+i9>IMqdZ7d0=VbIaIJHfM!{0*qO&PtVcItA}HbD!E{yxIr{3fQZ5(lTY zsc`rp!UF*h?q~C-GM}Ti3GQdn&msIn86Dt}`3vE)dver070&VrvV0;%%#C4@Go3TK zU@p_LV;!&*pA?w@mdN}mExyqO9R3c%-xKiQff%^a1rM<3hY@~CMhAGU9`uba;P4@Y zUy;#U*#n#Ji23uWj6o}l{t?1=d`s*d;F0JdoXS|?@Cw5B33xCsCc1GCn8%{`ya4b4 z86Ds?qZ{St@I45>AfwNZiB5+{FrP*5NBFkYV($R28J*gu!r{XRe@DQB2ig4P(<1YE zqwWu~=ob-wR7MAQWyey-)#YCsJ3EEin%LuH!{gTt(+3lb1Z|6&u#A 
z%z65dXXke?-=$?SdTcds#=ExCGN$Jc@*EO8W#Fl)$5#Dq)4Y{A&q?GNk$N79(-ZO- z^?HQqIgdQoq@GwFHD@**jb+Yr19^I$5&22S=~4ZN-$YRFCrW9ir~5_VIUsn-aXgMX z2aK0mWzMr3c}_|_nHW8qht9`RhUp2A=aSSD%cJt_81>>jr;w-ctjLcYr^l0hs9tel zAC_#U=K}KV5j^Es9yJ&6Rf{msWzI8-JjbOTCr*!=3-wk?=y8~y-i^R>Uh0YCan)SF z*EqBu&a)SJZb?0Ej2>6z*`+*}T&Cv;^1St3%ySudYR;Fd@@%}DDs!F@G!G?@>9~aW(^3cF)nD@&ucgWycng6y1hAOZi3ovma`1gtG0P&BegVMJ_i*RK z091cFqk!s776OMka6bay7r>sJPFcI`MNievquTZ4Sma|!Ff0g80YUgU%n0b2@)Rfd z5DBhI1%8!uS#6Zw}4+-eN8{z~9k>H|Kuv{xpWmB(^aPQsAS>zK4-2NQy{bV&zJ~|-q zBnO^B;BEozdX!7fs%Hh`y;0YrEb=uZI3@@Nfgt=9KxSPN(2x#tg2E;sIEMtOeSRw0 z){xq|Vm5xOx@qQbwrqU0bIY>LFR$<1wCtsh&7Dn?JKyNo{K}@zMAMV6bZq(6Yn`ty z-Mpps_0BE7-uUuMt(j#Rds(`*ecfvtH?417zb@x>Wc`e{!Cu#qS)b|b^z-X89r<)e n=Q=x|-H_Sfbod+W^dk7XarhGA_