diff --git a/backends-velox/src-delta33/test/scala/org/apache/gluten/execution/GlutenDeltaStatsSuite.scala b/backends-velox/src-delta33/test/scala/org/apache/gluten/execution/GlutenDeltaStatsSuite.scala
new file mode 100644
index 0000000000..70da8cc317
--- /dev/null
+++ b/backends-velox/src-delta33/test/scala/org/apache/gluten/execution/GlutenDeltaStatsSuite.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.execution
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
+import org.apache.spark.sql.test.SharedSparkSession
+
+import java.io.File
+
+/**
+ * Guards the Gluten Delta per-file statistics tracker against a write-path regression.
+ *
+ * When the min/max statistics aggregation of a Delta write cannot be offloaded to Velox (a
+ * TIMESTAMP_NTZ column is one such case), the native tracker used to fail the write task with a
+ * ClassCastException (ProjectExec cannot be cast to WholeStageTransformer) because it assumed the
+ * aggregation always collapses into a WholeStageTransformer. The expected behavior is a fallback
+ * to row-based statistics collection.
+ */
+class GlutenDeltaStatsSuite extends QueryTest with SharedSparkSession with DeltaSQLCommandTest {
+
+  import testImplicits._
+
+  test("TIMESTAMP_NTZ stats fall back instead of crashing the write") {
+    withTempDir {
+      tmpDir =>
+        // A TIMESTAMP_NTZ this close to Long.MaxValue makes the maxValue statistic trigger the
+        // per-file statistics aggregation that cannot be offloaded to Velox.
+        val micros = Long.MaxValue - 999L
+        val ntzExpr = "CAST(TIMESTAMP_MICROS(micros) AS TIMESTAMP_NTZ) AS ts"
+        val frame = Seq(micros).toDF("micros").selectExpr("micros AS id", ntzExpr)
+        val tablePath = new File(tmpDir, "ntz-stats").getCanonicalPath
+
+        // Pre-fix, statistics collection failed here with a ClassCastException (ProjectExec cannot
+        // be cast to WholeStageTransformer); the row-based fallback tracker now lets it succeed.
+        frame.coalesce(1).write.format("delta").save(tablePath)
+
+        // Counting rows avoids materializing the TIMESTAMP_NTZ column, which is an unrelated
+        // read-path limitation.
+        val rowCount = spark.read.format("delta").load(tablePath).count()
+        assert(rowCount === 1)
+    }
+  }
+}
diff --git a/backends-velox/src-delta40/test/scala/org/apache/gluten/execution/GlutenDeltaStatsSuite.scala b/backends-velox/src-delta40/test/scala/org/apache/gluten/execution/GlutenDeltaStatsSuite.scala
new file mode 100644
index 0000000000..70da8cc317
--- /dev/null
+++ b/backends-velox/src-delta40/test/scala/org/apache/gluten/execution/GlutenDeltaStatsSuite.scala
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.gluten.execution
+
+import org.apache.spark.sql.QueryTest
+import org.apache.spark.sql.delta.test.DeltaSQLCommandTest
+import org.apache.spark.sql.test.SharedSparkSession
+
+import java.io.File
+
+/**
+ * Guards the Gluten Delta per-file statistics tracker against a write-path regression.
+ *
+ * When the min/max statistics aggregation of a Delta write cannot be offloaded to Velox (a
+ * TIMESTAMP_NTZ column is one such case), the native tracker used to fail the write task with a
+ * ClassCastException (ProjectExec cannot be cast to WholeStageTransformer) because it assumed the
+ * aggregation always collapses into a WholeStageTransformer. The expected behavior is a fallback
+ * to row-based statistics collection.
+ */
+class GlutenDeltaStatsSuite extends QueryTest with SharedSparkSession with DeltaSQLCommandTest {
+
+  import testImplicits._
+
+  test("TIMESTAMP_NTZ stats fall back instead of crashing the write") {
+    withTempDir {
+      tmpDir =>
+        // A TIMESTAMP_NTZ this close to Long.MaxValue makes the maxValue statistic trigger the
+        // per-file statistics aggregation that cannot be offloaded to Velox.
+        val micros = Long.MaxValue - 999L
+        val ntzExpr = "CAST(TIMESTAMP_MICROS(micros) AS TIMESTAMP_NTZ) AS ts"
+        val frame = Seq(micros).toDF("micros").selectExpr("micros AS id", ntzExpr)
+        val tablePath = new File(tmpDir, "ntz-stats").getCanonicalPath
+
+        // Pre-fix, statistics collection failed here with a ClassCastException (ProjectExec cannot
+        // be cast to WholeStageTransformer); the row-based fallback tracker now lets it succeed.
+        frame.coalesce(1).write.format("delta").save(tablePath)
+
+        // Counting rows avoids materializing the TIMESTAMP_NTZ column, which is an unrelated
+        // read-path limitation.
+        val rowCount = spark.read.format("delta").load(tablePath).count()
+        assert(rowCount === 1)
+    }
+  }
+}