
Commit d2f6ce9

catalinii authored and anuvedverma committed

[SPARK-28098][SQL] Support read partitioned Hive tables with subdirectories (#40)

(cherry picked from commit 984bf78)

1 parent 7f24cf7 commit d2f6ce9

File tree

4 files changed: +44 -3 lines changed

core/src/test/scala/org/apache/spark/util/LyftUtilsSuite.scala
sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala
sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala
sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

core/src/test/scala/org/apache/spark/util/LyftUtilsSuite.scala

Lines changed: 1 addition & 1 deletion

@@ -17,7 +17,7 @@
 
 package org.apache.spark.util
 
-import org.apache.spark.{SparkFunSuite}
+import org.apache.spark.SparkFunSuite
 import org.apache.spark.internal.Logging
 
 object TestObjectLyftUtils {

sql/catalyst/src/main/scala/org/apache/spark/sql/internal/SQLConf.scala

Lines changed: 10 additions & 0 deletions

@@ -4311,6 +4311,13 @@ object SQLConf {
       .booleanConf
       .createWithDefault(false)
 
+  val READ_PARTITION_WITH_SUBDIRECTORY_ENABLED =
+    buildConf("spark.sql.sources.readPartitionWithSubdirectory.enabled")
+      .doc("When set to true, Spark SQL could read the files of " +
+        " partitioned hive table from subdirectories under root path of table")
+      .booleanConf
+      .createWithDefault(true)
+
   val LEGACY_AVRO_ALLOW_INCOMPATIBLE_SCHEMA =
     buildConf("spark.sql.legacy.avro.allowIncompatibleSchema")
       .internal()

@@ -5254,6 +5261,9 @@ class SQLConf extends Serializable with Logging with SqlApiConf {
 
   def maxConcurrentOutputFileWriters: Int = getConf(SQLConf.MAX_CONCURRENT_OUTPUT_FILE_WRITERS)
 
+  def readPartitionWithSubdirectoryEnabled: Boolean =
+    getConf(READ_PARTITION_WITH_SUBDIRECTORY_ENABLED)
+
   def plannedWriteEnabled: Boolean = getConf(SQLConf.PLANNED_WRITE_ENABLED)
 
   def inferDictAsStruct: Boolean = getConf(SQLConf.INFER_NESTED_DICT_AS_STRUCT)
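
For context, a minimal usage sketch of the new flag (not part of the commit; `spark` is an active SparkSession and `events` is a hypothetical partitioned Hive table whose partition directories contain subdirectories):

// The flag defaults to true in this commit, so subdirectory listing is on.
// It can be toggled per session to fall back to the previous behaviour.
spark.conf.set("spark.sql.sources.readPartitionWithSubdirectory.enabled", "false")
spark.conf.set("spark.sql.sources.readPartitionWithSubdirectory.enabled", "true")

// With the flag on, files nested under the partition directories of the
// hypothetical `events` table are picked up by the scan.
spark.table("events").count()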

sql/core/src/main/scala/org/apache/spark/sql/execution/datasources/InMemoryFileIndex.scala

Lines changed: 19 additions & 1 deletion

@@ -61,6 +61,9 @@ class InMemoryFileIndex(
   override val rootPaths =
     rootPathsSpecified.filterNot(FileStreamSink.ancestorIsMetadataDirectory(_, hadoopConf))
 
+  val readPartitionWithSubdirectoryEnabled =
+    sparkSession.sessionState.conf.readPartitionWithSubdirectoryEnabled
+
   @volatile private var cachedLeafFiles: mutable.LinkedHashMap[Path, FileStatus] = _
   @volatile private var cachedLeafDirToChildrenFiles: Map[Path, Array[FileStatus]] = _
   @volatile private var cachedPartitionSpec: PartitionSpec = _

@@ -96,10 +99,25 @@ class InMemoryFileIndex(
     val files = listLeafFiles(rootPaths)
     cachedLeafFiles =
       new mutable.LinkedHashMap[Path, FileStatus]() ++= files.map(f => f.getPath -> f)
-    cachedLeafDirToChildrenFiles = files.toArray.groupBy(_.getPath.getParent)
+    cachedLeafDirToChildrenFiles =
+      if (readPartitionWithSubdirectoryEnabled) {
+        files.toArray.groupBy(file => getRootPathsLeafDir(file.getPath.getParent, file.getPath))
+      } else {
+        files.toArray.groupBy(_.getPath.getParent)
+      }
     cachedPartitionSpec = null
   }
 
+  private def getRootPathsLeafDir(path: Path, child: Path): Path = {
+    if (rootPaths.contains(child)) {
+      path
+    } else if (rootPaths.contains(path)) {
+      path
+    } else {
+      getRootPathsLeafDir(path.getParent, path)
+    }
+  }
+
   override def equals(other: Any): Boolean = other match {
     case hdfs: InMemoryFileIndex => rootPaths.toSet == hdfs.rootPaths.toSet
     case _ => false
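
To illustrate the grouping change, a self-contained sketch (not part of the commit; the paths are hypothetical) of how the helper maps a file nested in a subdirectory back to the root path that contains it:

import org.apache.hadoop.fs.Path

// Hypothetical root paths; in InMemoryFileIndex these are the table's
// (partition) directories.
val rootPaths = Seq(new Path("/warehouse/tbl/dt=2019-06-18"))

// Same shape as the private helper above: climb from a file's parent directory
// toward the root until one of the rootPaths is reached.
def getRootPathsLeafDir(path: Path, child: Path): Path = {
  if (rootPaths.contains(child)) path
  else if (rootPaths.contains(path)) path
  else getRootPathsLeafDir(path.getParent, path)
}

val file = new Path("/warehouse/tbl/dt=2019-06-18/sub_dir/part-00000")
// Returns /warehouse/tbl/dt=2019-06-18: the file inside sub_dir is grouped
// under the partition directory itself, so partition inference still sees it.
getRootPathsLeafDir(file.getParent, file)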

sql/hive/src/main/scala/org/apache/spark/sql/hive/HiveMetastoreCatalog.scala

Lines changed: 14 additions & 1 deletion

@@ -25,6 +25,7 @@ import com.google.common.util.concurrent.Striped
 import org.apache.hadoop.fs.Path
 
 import org.apache.spark.SparkException
+import org.apache.spark.deploy.SparkHadoopUtil
 import org.apache.spark.internal.Logging
 import org.apache.spark.sql.{AnalysisException, SparkSession}
 import org.apache.spark.sql.catalyst.{QualifiedTableName, TableIdentifier}

@@ -283,7 +284,7 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
         LogicalRelation(
           DataSource(
             sparkSession = sparkSession,
-            paths = rootPath.toString :: Nil,
+            paths = getDirectoryPathSeq(rootPath),
             userSpecifiedSchema = Option(updatedTable.dataSchema),
             bucketSpec = hiveBucketSpec,
             // Do not interpret the 'path' option at all when tables are read using the Hive

@@ -321,6 +322,18 @@ private[hive] class HiveMetastoreCatalog(sparkSession: SparkSession) extends Log
     result.copy(output = newOutput)
   }
 
+  private def getDirectoryPathSeq(rootPath: Path): Seq[String] = {
+    val enableSupportSubDirectories =
+      sparkSession.sessionState.conf.readPartitionWithSubdirectoryEnabled
+
+    if (enableSupportSubDirectories) {
+      val fs = rootPath.getFileSystem(sparkSession.sessionState.newHadoopConf())
+      SparkHadoopUtil.get.listLeafDirStatuses(fs, rootPath).map(_.getPath.toString)
+    } else {
+      rootPath.toString :: Nil
+    }
+  }
+
   private def inferIfNeeded(
       relation: HiveTableRelation,
       options: Map[String, String],
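
As a rough illustration of what getDirectoryPathSeq hands to DataSource when the flag is on, a standalone sketch using only the Hadoop FileSystem API (SparkHadoopUtil is internal to Spark, so it is swapped out here); the table root path and directory layout are hypothetical:

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.fs.{FileSystem, Path}

// Collect every leaf directory (a directory with no sub-directories) under the
// table root, which is roughly what SparkHadoopUtil.listLeafDirStatuses returns.
def listLeafDirs(fs: FileSystem, dir: Path): Seq[Path] = {
  val subDirs = fs.listStatus(dir).filter(_.isDirectory).map(_.getPath).toSeq
  if (subDirs.isEmpty) Seq(dir) else subDirs.flatMap(listLeafDirs(fs, _))
}

val rootPath = new Path("/warehouse/tbl")  // hypothetical table root
val fs = rootPath.getFileSystem(new Configuration())

// With the flag on, DataSource receives every leaf directory instead of just
// rootPath.toString :: Nil, so files nested in subdirectories are scanned.
val paths: Seq[String] = listLeafDirs(fs, rootPath).map(_.toString)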
