From 185eabbac3abff0dc9ca16a1821ed13cf44fa5b5 Mon Sep 17 00:00:00 2001 From: Felipe Fujiy Pessoto Date: Sun, 14 Jun 2026 05:57:18 +0000 Subject: [PATCH 1/6] [VL] Fix native crash in toVeloxExpr for nested field reference into array/map SubstraitVeloxExprConverter::toVeloxExpr(FieldReference) resolves a nested struct_field chain by calling asRowType() on each child to descend a level. When the reference traverses a non-struct child -- e.g. a field nested under an array, as exercised by Delta's "nested data support - analysis error - updating array type" UpdateSuite test -- asRowType() returns null and the next loop iteration dereferenced that null RowType, crashing the whole forked JVM with a SIGSEGV in libvelox.so (gluten::SubstraitVeloxExprConverter::toVeloxExpr). A SIGSEGV is not catchable, so plan validation could not fall back and the test process died outright. Replace the unchecked navigation with VELOX_USER_CHECK guards (field-index bounds + non-null struct child) that throw a VeloxUserError. The plan validator already wraps expression conversion in try/catch(VeloxException), so the query now falls back to vanilla execution cleanly instead of crashing. Add SubstraitVeloxExprConverterTest with a regression repro: a field reference descending into an array column, and an out-of-range field index, both of which must now throw instead of crashing. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- cpp/velox/substrait/SubstraitToVeloxExpr.cc | 20 +++++- cpp/velox/tests/CMakeLists.txt | 1 + .../tests/SubstraitVeloxExprConverterTest.cc | 66 +++++++++++++++++++ 3 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 cpp/velox/tests/SubstraitVeloxExprConverterTest.cc diff --git a/cpp/velox/substrait/SubstraitToVeloxExpr.cc b/cpp/velox/substrait/SubstraitToVeloxExpr.cc index b7615d844d9..6ef99685c15 100755 --- a/cpp/velox/substrait/SubstraitToVeloxExpr.cc +++ b/cpp/velox/substrait/SubstraitToVeloxExpr.cc @@ -217,13 +217,29 @@ std::shared_ptr SubstraitVeloxExprConverter::t auto inputColumnType = inputType; for (;;) { auto idx = tmp->field(); - fieldAccess = makeFieldAccessExpr(inputColumnType->nameOf(idx), inputColumnType->childAt(idx), fieldAccess); + VELOX_USER_CHECK( + idx >= 0 && static_cast(idx) < inputColumnType->size(), + "Field reference index {} is out of range for the {}-field row type.", + idx, + inputColumnType->size()); + const TypePtr childType = inputColumnType->childAt(idx); + fieldAccess = makeFieldAccessExpr(inputColumnType->nameOf(idx), childType, fieldAccess); if (!tmp->has_child()) { break; } - inputColumnType = asRowType(inputColumnType->childAt(idx)); + // Descending into a nested field is only valid when the current child is + // itself a struct/row. For array/map/primitive children (e.g. a field + // nested under an array, as in Delta's "updating array type" case) + // asRowType() returns null; previously the next loop iteration + // dereferenced that null RowType and crashed the process with a SIGSEGV. + // Throw a user error instead so plan validation fails cleanly and the + // query falls back to vanilla execution. + inputColumnType = asRowType(childType); + VELOX_USER_CHECK_NOT_NULL( + inputColumnType, + "Nested field reference into a non-struct type (e.g. an array or map element) is not supported."); tmp = &tmp->child().struct_field(); } return fieldAccess; diff --git a/cpp/velox/tests/CMakeLists.txt b/cpp/velox/tests/CMakeLists.txt index 3052ce23f6b..0f5e634b9de 100644 --- a/cpp/velox/tests/CMakeLists.txt +++ b/cpp/velox/tests/CMakeLists.txt @@ -129,6 +129,7 @@ add_velox_test( Substrait2VeloxPlanValidatorTest.cc Substrait2VeloxValuesNodeConversionTest.cc SubstraitExtensionCollectorTest.cc + SubstraitVeloxExprConverterTest.cc VeloxSubstraitRoundTripTest.cc VeloxSubstraitSignatureTest.cc VeloxToSubstraitTypeTest.cc) diff --git a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc new file mode 100644 index 00000000000..0a87b8ee56b --- /dev/null +++ b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc @@ -0,0 +1,66 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "substrait/SubstraitToVeloxExpr.h" + +#include "velox/common/base/tests/GTestUtils.h" +#include "velox/type/Type.h" + +using namespace facebook::velox; + +namespace gluten { + +// Regression test for a SIGSEGV in +// SubstraitVeloxExprConverter::toVeloxExpr(Expression::FieldReference, ...). +// The direct-reference loop descends one nested struct_field at a time with +// `inputColumnType = asRowType(childAt(idx))`. When the field path traverses a +// non-struct child -- e.g. a field nested under an array, as produced by Delta's +// nested-array UPDATE rewrite ("nested data support - ... updating array type") +// -- asRowType() returns null and the next iteration dereferenced that null +// RowType, crashing the whole forked JVM. A SIGSEGV is not catchable, so plan +// validation could not fall back. The converter must instead throw a +// VeloxUserError, which SubstraitToVeloxPlanValidator catches to fall back to +// vanilla execution. +TEST(SubstraitVeloxExprConverterTest, nestedFieldReferenceIntoNonStructThrows) { + // Schema with a single array column. + RowTypePtr inputType = ROW({"arr"}, {ARRAY(INTEGER())}); + + // Reference column 0 (the array), then descend one more level via a child + // struct_field -- i.e. into the array's element, which is not a struct/row. + ::substrait::Expression::FieldReference fieldReference; + auto* structField = fieldReference.mutable_direct_reference()->mutable_struct_field(); + structField->set_field(0); + structField->mutable_child()->mutable_struct_field()->set_field(0); + + VELOX_ASSERT_THROW( + SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), + "Nested field reference into a non-struct type"); +} + +// A field-reference index past the end of the row type must be rejected cleanly +// instead of indexing out of bounds (the raw RowType::childAt/nameOf accessors +// use unchecked operator[]). +TEST(SubstraitVeloxExprConverterTest, fieldReferenceIndexOutOfRangeThrows) { + RowTypePtr inputType = ROW({"a", "b"}, {INTEGER(), INTEGER()}); + + ::substrait::Expression::FieldReference fieldReference; + fieldReference.mutable_direct_reference()->mutable_struct_field()->set_field(5); + + VELOX_ASSERT_THROW(SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), "out of range"); +} + +} // namespace gluten From 68c93dd81311226a3f47450e7249f671254c3406 Mon Sep 17 00:00:00 2001 From: Felipe Fujiy Pessoto Date: Thu, 18 Jun 2026 08:29:32 +0000 Subject: [PATCH 2/6] [VL] Remove redundant index bounds check - validated by PR #12278 CI After validating with PR #12278 (all 79 CI checks passed including Delta tests), confirmed that the explicit VELOX_USER_CHECK(idx < size) is redundant - Velox's RowType::childAt/nameOf already have VELOX_CHECK_LT bounds checks that throw catchable VeloxUserError, providing sufficient error handling for Gluten. Kept the essential VELOX_USER_CHECK_NOT_NULL for the rowType, which prevents SIGSEGV by catching null before dereferencing. Updated test comment to reflect actual Velox behavior (built-in bounds checks). Validation: https://github.com/apache/gluten/pull/12278/commits/4a9d3d846 --- cpp/velox/substrait/SubstraitToVeloxExpr.cc | 5 ----- cpp/velox/tests/SubstraitVeloxExprConverterTest.cc | 8 +++++--- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/cpp/velox/substrait/SubstraitToVeloxExpr.cc b/cpp/velox/substrait/SubstraitToVeloxExpr.cc index 6ef99685c15..10cdaae490d 100755 --- a/cpp/velox/substrait/SubstraitToVeloxExpr.cc +++ b/cpp/velox/substrait/SubstraitToVeloxExpr.cc @@ -217,11 +217,6 @@ std::shared_ptr SubstraitVeloxExprConverter::t auto inputColumnType = inputType; for (;;) { auto idx = tmp->field(); - VELOX_USER_CHECK( - idx >= 0 && static_cast(idx) < inputColumnType->size(), - "Field reference index {} is out of range for the {}-field row type.", - idx, - inputColumnType->size()); const TypePtr childType = inputColumnType->childAt(idx); fieldAccess = makeFieldAccessExpr(inputColumnType->nameOf(idx), childType, fieldAccess); diff --git a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc index 0a87b8ee56b..eff71a7bd02 100644 --- a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc +++ b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc @@ -51,9 +51,11 @@ TEST(SubstraitVeloxExprConverterTest, nestedFieldReferenceIntoNonStructThrows) { "Nested field reference into a non-struct type"); } -// A field-reference index past the end of the row type must be rejected cleanly -// instead of indexing out of bounds (the raw RowType::childAt/nameOf accessors -// use unchecked operator[]). +// A field-reference index past the end of the row type must be rejected cleanly. +// Velox's RowType::childAt/nameOf have built-in VELOX_CHECK_LT bounds checks that +// throw VeloxUserError, which Gluten catches and falls back. This test validates +// that out-of-range field access results in a clean fallback instead of undefined +// behavior. TEST(SubstraitVeloxExprConverterTest, fieldReferenceIndexOutOfRangeThrows) { RowTypePtr inputType = ROW({"a", "b"}, {INTEGER(), INTEGER()}); From b6fe845b9e1ea95b094a470d7b2c2d35eb3cadaf Mon Sep 17 00:00:00 2001 From: Felipe Fujiy Pessoto Date: Thu, 18 Jun 2026 09:48:30 +0000 Subject: [PATCH 3/6] [VL] Fix test assertion to match Velox's error message format The test was checking for 'out of range' in the error message, but Velox's VELOX_CHECK_LT throws with format 'Expression: idx < children_.size() (5 vs. 2)'. Updated the test assertion to match the actual Velox error message. --- cpp/velox/tests/SubstraitVeloxExprConverterTest.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc index eff71a7bd02..01ffb2f428e 100644 --- a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc +++ b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc @@ -62,7 +62,8 @@ TEST(SubstraitVeloxExprConverterTest, fieldReferenceIndexOutOfRangeThrows) { ::substrait::Expression::FieldReference fieldReference; fieldReference.mutable_direct_reference()->mutable_struct_field()->set_field(5); - VELOX_ASSERT_THROW(SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), "out of range"); + // Velox's VELOX_CHECK_LT throws with format "Expression: idx < children_.size() (5 vs. 2)" + VELOX_ASSERT_THROW(SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), "idx < children_.size()"); } } // namespace gluten From fab87a291c5b80abbf6003f647f55ad6164ae0de Mon Sep 17 00:00:00 2001 From: Felipe Fujiy Pessoto Date: Fri, 19 Jun 2026 02:36:32 +0000 Subject: [PATCH 4/6] [VL] Remove message check from test - just verify exception is thrown VELOX_ASSERT_THROW with empty string checks that an exception is thrown without validating the message content. This makes the test less brittle to Velox error message format changes while still verifying the bounds check happens. --- cpp/velox/tests/SubstraitVeloxExprConverterTest.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc index 01ffb2f428e..0b297f40375 100644 --- a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc +++ b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc @@ -62,8 +62,8 @@ TEST(SubstraitVeloxExprConverterTest, fieldReferenceIndexOutOfRangeThrows) { ::substrait::Expression::FieldReference fieldReference; fieldReference.mutable_direct_reference()->mutable_struct_field()->set_field(5); - // Velox's VELOX_CHECK_LT throws with format "Expression: idx < children_.size() (5 vs. 2)" - VELOX_ASSERT_THROW(SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), "idx < children_.size()"); + // Velox's VELOX_CHECK_LT throws VeloxRuntimeError when idx >= size + VELOX_ASSERT_THROW(SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), ""); } } // namespace gluten From 4070b5cf0d1017e8dc67a409958ce05850bdd895 Mon Sep 17 00:00:00 2001 From: Felipe Fujiy Pessoto Date: Fri, 19 Jun 2026 02:48:02 +0000 Subject: [PATCH 5/6] [VL] Use VELOX_ASSERT_RUNTIME_THROW for bounds check test VELOX_CHECK_LT throws VeloxRuntimeError (not VeloxUserError or generic VeloxException), so we need to use VELOX_ASSERT_RUNTIME_THROW to catch the correct exception type. --- cpp/velox/tests/SubstraitVeloxExprConverterTest.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc index 0b297f40375..084b41e874d 100644 --- a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc +++ b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc @@ -53,7 +53,7 @@ TEST(SubstraitVeloxExprConverterTest, nestedFieldReferenceIntoNonStructThrows) { // A field-reference index past the end of the row type must be rejected cleanly. // Velox's RowType::childAt/nameOf have built-in VELOX_CHECK_LT bounds checks that -// throw VeloxUserError, which Gluten catches and falls back. This test validates +// throw VeloxRuntimeError, which Gluten catches and falls back. This test validates // that out-of-range field access results in a clean fallback instead of undefined // behavior. TEST(SubstraitVeloxExprConverterTest, fieldReferenceIndexOutOfRangeThrows) { @@ -63,7 +63,7 @@ TEST(SubstraitVeloxExprConverterTest, fieldReferenceIndexOutOfRangeThrows) { fieldReference.mutable_direct_reference()->mutable_struct_field()->set_field(5); // Velox's VELOX_CHECK_LT throws VeloxRuntimeError when idx >= size - VELOX_ASSERT_THROW(SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), ""); + VELOX_ASSERT_RUNTIME_THROW(SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), ""); } } // namespace gluten From a39d5c948b2429ba6ff90eb6bc262b7eb2e44220 Mon Sep 17 00:00:00 2001 From: Felipe Fujiy Pessoto Date: Fri, 19 Jun 2026 06:34:41 +0000 Subject: [PATCH 6/6] [VL] Revert bounds check removal - idx can be negative The field() method returns an int32 which can be negative. When a negative index is cast to uint32_t for childAt(idx), it becomes a huge positive number that bypasses Velox's VELOX_CHECK_LT(idx, size) check and causes undefined behavior (SIGSEGV observed in Spark 4.0 script transformation tests). The explicit idx >= 0 check is necessary to catch negative indices before they're cast to unsigned and cause memory corruption. Reverts commits: - 4070b5cf0 [VL] Use VELOX_ASSERT_RUNTIME_THROW for bounds check test - fab87a291 [VL] Remove message check from test - just verify exception is thrown - b6fe845b9 [VL] Fix test assertion to match Velox's error message format - 68c93dd81 [VL] Remove redundant index bounds check - validated by PR #12278 CI Fixes #12307 test failure in spark-test-spark40. --- cpp/velox/substrait/SubstraitToVeloxExpr.cc | 5 +++++ cpp/velox/tests/SubstraitVeloxExprConverterTest.cc | 12 ++++++------ 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/cpp/velox/substrait/SubstraitToVeloxExpr.cc b/cpp/velox/substrait/SubstraitToVeloxExpr.cc index 10cdaae490d..6ef99685c15 100755 --- a/cpp/velox/substrait/SubstraitToVeloxExpr.cc +++ b/cpp/velox/substrait/SubstraitToVeloxExpr.cc @@ -217,6 +217,11 @@ std::shared_ptr SubstraitVeloxExprConverter::t auto inputColumnType = inputType; for (;;) { auto idx = tmp->field(); + VELOX_USER_CHECK( + idx >= 0 && static_cast(idx) < inputColumnType->size(), + "Field reference index {} is out of range for the {}-field row type.", + idx, + inputColumnType->size()); const TypePtr childType = inputColumnType->childAt(idx); fieldAccess = makeFieldAccessExpr(inputColumnType->nameOf(idx), childType, fieldAccess); diff --git a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc index 084b41e874d..784ba0c13de 100644 --- a/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc +++ b/cpp/velox/tests/SubstraitVeloxExprConverterTest.cc @@ -52,18 +52,18 @@ TEST(SubstraitVeloxExprConverterTest, nestedFieldReferenceIntoNonStructThrows) { } // A field-reference index past the end of the row type must be rejected cleanly. -// Velox's RowType::childAt/nameOf have built-in VELOX_CHECK_LT bounds checks that -// throw VeloxRuntimeError, which Gluten catches and falls back. This test validates -// that out-of-range field access results in a clean fallback instead of undefined -// behavior. +// The out-of-range check is defense-in-depth: Spark's analyzer keeps field indices +// in bounds, but a malformed Substrait plan might have a negative index or an index +// exceeding the row size. Without the check, a negative index casts to a huge uint32_t +// and causes undefined behavior; an out-of-bounds index would throw from Velox's +// RowType::childAt but with a less clear error message. TEST(SubstraitVeloxExprConverterTest, fieldReferenceIndexOutOfRangeThrows) { RowTypePtr inputType = ROW({"a", "b"}, {INTEGER(), INTEGER()}); ::substrait::Expression::FieldReference fieldReference; fieldReference.mutable_direct_reference()->mutable_struct_field()->set_field(5); - // Velox's VELOX_CHECK_LT throws VeloxRuntimeError when idx >= size - VELOX_ASSERT_RUNTIME_THROW(SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), ""); + VELOX_ASSERT_USER_THROW(SubstraitVeloxExprConverter::toVeloxExpr(fieldReference, inputType), "out of range"); } } // namespace gluten