Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -470,6 +470,12 @@ class VeloxSparkPlanExecApi extends SparkPlanExecApi with Logging {
}
}
}
case _: KeyGroupedPartitioning =>
FallbackTags.add(
shuffle,
ValidationResult.failed(
"KeyGroupedPartitioning is not supported by Gluten native shuffle"))
shuffle.withNewChildren(child :: Nil)
case _ =>
ColumnarShuffleExchangeExec(shuffle, child, null)
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ package org.apache.spark.sql.execution.utils
import org.apache.gluten.backendsapi.BackendsApiManager
import org.apache.gluten.columnarbatch.{ColumnarBatches, VeloxColumnarBatches}
import org.apache.gluten.config.ShuffleWriterType
import org.apache.gluten.exception.GlutenNotSupportException
import org.apache.gluten.iterator.Iterators
import org.apache.gluten.memory.arrow.alloc.ArrowBufferAllocators
import org.apache.gluten.runtime.Runtimes
Expand Down Expand Up @@ -172,6 +173,9 @@ object ExecUtil {
// range partitioning fall back to row-based partition id computation
case RangePartitioning(orders, n) =>
new NativePartitioning(GlutenShuffleUtils.RangePartitioningShortName, n)
case other =>
throw new GlutenNotSupportException(
s"Partitioning $other is not supported by native shuffle")
}

val isRoundRobin = newPartitioning.isInstanceOf[RoundRobinPartitioning] &&
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import org.apache.gluten.execution.SortMergeJoinExecTransformer

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, GlutenSQLTestsBaseTrait, Row}
import org.apache.spark.sql.catalyst.plans.physical.KeyGroupedPartitioning
import org.apache.spark.sql.connector.catalog.{Column, Identifier, InMemoryTableCatalog}
import org.apache.spark.sql.connector.distributions.Distributions
import org.apache.spark.sql.connector.expressions.Expressions.{bucket, days, identity, years}
Expand Down Expand Up @@ -1072,6 +1073,51 @@ class GlutenKeyGroupedPartitioningSuite
}
}

testGluten(
  "GLUTEN-10992: KeyGroupedPartitioning shuffle falls back to vanilla Spark") {
  // Items is created with an identity("id") partition spec; purchases is created with an
  // empty one, so only the items side of the join can report a partitioning.
  val itemPartitioning = Array(identity("id"))
  createTable(items, itemsColumns, itemPartitioning)
  sql(
    s"INSERT INTO testcat.ns.$items VALUES " +
      "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
      "(3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
      "(4, 'cc', 15.5, cast('2020-02-01' as timestamp))")

  createTable(purchases, purchasesColumns, Array.empty)
  sql(
    s"INSERT INTO testcat.ns.$purchases VALUES " +
      "(1, 42.0, cast('2020-01-01' as timestamp)), " +
      "(3, 19.5, cast('2020-02-01' as timestamp))")

  // Scenario under test (GLUTEN-10992): when V2 bucketing shuffle is on and just one join
  // side reports its partitioning, Spark plans a ShuffleExchangeExec for the other side
  // whose output partitioning is KeyGroupedPartitioning. Native shuffle in Gluten has no
  // support for that partitioning, so the exchange has to remain a vanilla Spark node;
  // turning it into a ColumnarShuffleExchangeExec would fail with a scala.MatchError in
  // ExecUtil.genShuffleDependency.
  withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") {
    val joined = createJoinTestDF(Seq("id" -> "item_id"))
    val executed = joined.queryExecution.executedPlan

    // 1) At least one key-grouped exchange is still the row-based Spark operator.
    val fallbackExchanges = collect(executed) {
      case exchange: ShuffleExchangeExec
          if exchange.outputPartitioning.isInstanceOf[KeyGroupedPartitioning] =>
        exchange
    }
    assert(
      fallbackExchanges.nonEmpty,
      "KeyGroupedPartitioning shuffle should fall back to a vanilla ShuffleExchangeExec")

    // 2) None of the shuffles returned by collectAllShuffles is key-grouped.
    // NOTE(review): this relies on collectAllShuffles excluding the vanilla exchanges
    // matched above -- confirm against the helper's definition in this suite.
    val offloadedExchanges = collectAllShuffles(executed).filter {
      shuffle =>
        shuffle.outputPartitioning match {
          case _: KeyGroupedPartitioning => true
          case _ => false
        }
    }
    assert(
      offloadedExchanges.isEmpty,
      "KeyGroupedPartitioning must not be offloaded to ColumnarShuffleExchangeExec")

    // 3) The join still returns the expected rows once the exchange has fallen back.
    val expectedRows = Seq(Row(1, "aa", 40.0, 42.0), Row(3, "bb", 10.0, 19.5))
    checkAnswer(joined, expectedRows)
  }
}

testGluten("SPARK-41471: shuffle one side: only one side reports partitioning") {
val items_partitions = Array(identity("id"))
createTable(items, itemsColumns, items_partitions)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@ import org.apache.gluten.execution.SortMergeJoinExecTransformer

import org.apache.spark.SparkConf
import org.apache.spark.sql.{DataFrame, GlutenSQLTestsBaseTrait, Row}
import org.apache.spark.sql.catalyst.plans.physical.KeyGroupedPartitioning
import org.apache.spark.sql.connector.catalog.{Column, Identifier, InMemoryTableCatalog}
import org.apache.spark.sql.connector.distributions.Distributions
import org.apache.spark.sql.connector.expressions.Expressions.{bucket, days, identity, years}
Expand Down Expand Up @@ -1072,6 +1073,51 @@ class GlutenKeyGroupedPartitioningSuite
}
}

testGluten(
  "GLUTEN-10992: KeyGroupedPartitioning shuffle falls back to vanilla Spark") {
  // Items is created with an identity("id") partition spec; purchases is created with an
  // empty one, so only the items side of the join can report a partitioning.
  val itemPartitioning = Array(identity("id"))
  createTable(items, itemsColumns, itemPartitioning)
  sql(
    s"INSERT INTO testcat.ns.$items VALUES " +
      "(1, 'aa', 40.0, cast('2020-01-01' as timestamp)), " +
      "(3, 'bb', 10.0, cast('2020-01-01' as timestamp)), " +
      "(4, 'cc', 15.5, cast('2020-02-01' as timestamp))")

  createTable(purchases, purchasesColumns, Array.empty)
  sql(
    s"INSERT INTO testcat.ns.$purchases VALUES " +
      "(1, 42.0, cast('2020-01-01' as timestamp)), " +
      "(3, 19.5, cast('2020-02-01' as timestamp))")

  // Scenario under test (GLUTEN-10992): when V2 bucketing shuffle is on and just one join
  // side reports its partitioning, Spark plans a ShuffleExchangeExec for the other side
  // whose output partitioning is KeyGroupedPartitioning. Native shuffle in Gluten has no
  // support for that partitioning, so the exchange has to remain a vanilla Spark node;
  // turning it into a ColumnarShuffleExchangeExec would fail with a scala.MatchError in
  // ExecUtil.genShuffleDependency.
  withSQLConf(SQLConf.V2_BUCKETING_SHUFFLE_ENABLED.key -> "true") {
    val joined = createJoinTestDF(Seq("id" -> "item_id"))
    val executed = joined.queryExecution.executedPlan

    // 1) At least one key-grouped exchange is still the row-based Spark operator.
    val fallbackExchanges = collect(executed) {
      case exchange: ShuffleExchangeExec
          if exchange.outputPartitioning.isInstanceOf[KeyGroupedPartitioning] =>
        exchange
    }
    assert(
      fallbackExchanges.nonEmpty,
      "KeyGroupedPartitioning shuffle should fall back to a vanilla ShuffleExchangeExec")

    // 2) None of the shuffles returned by collectAllShuffles is key-grouped.
    // NOTE(review): this relies on collectAllShuffles excluding the vanilla exchanges
    // matched above -- confirm against the helper's definition in this suite.
    val offloadedExchanges = collectAllShuffles(executed).filter {
      shuffle =>
        shuffle.outputPartitioning match {
          case _: KeyGroupedPartitioning => true
          case _ => false
        }
    }
    assert(
      offloadedExchanges.isEmpty,
      "KeyGroupedPartitioning must not be offloaded to ColumnarShuffleExchangeExec")

    // 3) The join still returns the expected rows once the exchange has fallen back.
    val expectedRows = Seq(Row(1, "aa", 40.0, 42.0), Row(3, "bb", 10.0, 19.5))
    checkAnswer(joined, expectedRows)
  }
}

testGluten("SPARK-41471: shuffle one side: only one side reports partitioning") {
val items_partitions = Array(identity("id"))
createTable(items, itemsColumns, items_partitions)
Expand Down
Loading