From e4f2a0417bca63bab0366d1a979f74d0baf4df4c Mon Sep 17 00:00:00 2001 From: Manikandan Ganesan Date: Fri, 20 Mar 2026 16:05:35 +0530 Subject: [PATCH 1/3] Change to throw error when column name is missed in where condition before IN clause --- .../resources/error/error-conditions.json | 8 +++++ .../sql/catalyst/analysis/CheckAnalysis.scala | 4 +++ .../catalyst/analysis/FunctionRegistry.scala | 25 +++++++++++++- .../sql/errors/QueryCompilationErrors.scala | 8 +++++ .../analysis/AnalysisErrorSuite.scala | 33 +++++++++++++++++++ 5 files changed, 77 insertions(+), 1 deletion(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 2f7d50fe764ae..8439ef4735813 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -4372,6 +4372,14 @@ "IN predicate requires at least one value. Empty IN clauses like 'IN ()' are not allowed. Consider using 'WHERE FALSE' if you need an always-false condition, or provide at least one value in the IN list." ] }, + "MISSING_COLUMN_BEFORE_IN" : { + "message" : [ + "Column name is missing before IN clause.", + "Expected syntax: IN (value1, value2, ...)", + "Found: IN (values)", + "For example: SELECT * FROM t WHERE id IN (1, 2)" + ] + }, "EMPTY_PARTITION_VALUE" : { "message" : [ "Partition key must set value." 
diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala index 94c40215ab59f..739a74e451060 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/CheckAnalysis.scala @@ -302,6 +302,10 @@ trait CheckAnalysis extends LookupCatalog with QueryErrorsBase with PlanToString case u: UnresolvedRelation => u.tableNotFound(u.multipartIdentifier) + // Rare: identifier "in" as column + IN (list) when "in" is a valid unquoted identifier. + case f @ Filter(In(UnresolvedAttribute(Seq(name)), _), _) if name.equalsIgnoreCase("in") => + throw QueryCompilationErrors.missingColumnBeforeInError(f.condition.origin) + case u: UnresolvedFunctionName => val catalogPath = currentCatalog.name +: catalogManager.currentNamespace val searchPath = SQLConf.get.resolutionSearchPath(catalogPath.toSeq) diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala index cadb7750c2460..74576faff9e64 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/catalyst/analysis/FunctionRegistry.scala @@ -924,7 +924,7 @@ object FunctionRegistry { // predicates expression[Between]("between"), expression[And]("and"), - expression[In]("in"), + expressionBuilder("in", InPredicateExpressionBuilder), expression[Not]("not"), expression[Or]("or"), @@ -1345,6 +1345,29 @@ object TableFunctionRegistry { */ trait ExpressionBuilder extends FunctionBuilderBase[Expression] +/** + * SQL `in(col, v1, ...)` as a function. Rejects: + * - `in(v)` — how `WHERE in (v)` is parsed (column omitted before IN list). 
+ * - `in(v1, v2, ...)` when every argument is a [[Literal]] — how + * `WHERE IN ('a','b','c')` / `DELETE ... WHERE IN (...)` is parsed (no column). + * Legitimate `in(col, v1, v2)` has a non-literal first argument (the column). + */ +private[analysis] object InPredicateExpressionBuilder extends ExpressionBuilder { + override def build(funcName: String, expressions: Seq[Expression]): Expression = { + expressions.length match { + case 0 => + throw QueryCompilationErrors.wrongNumArgsError(funcName, Seq(2), 0) + case 1 => + throw QueryCompilationErrors.missingColumnBeforeInError(expressions.head.origin) + case _ => + if (expressions.forall(_.isInstanceOf[Literal])) { + throw QueryCompilationErrors.missingColumnBeforeInError(expressions.head.origin) + } + In(expressions.head, expressions.tail) + } + } +} + /** * This is a trait used for table valued functions that defines how their expression * representations are constructed in [[TableFunctionRegistry]]. diff --git a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala index 89b574fa61fcc..a48dd350714c9 100644 --- a/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala +++ b/sql/catalyst/src/main/scala/org/apache/spark/sql/errors/QueryCompilationErrors.scala @@ -453,6 +453,14 @@ private[sql] object QueryCompilationErrors extends QueryErrorsBase with Compilat messageParameters = commonParam ++ proposalParam) } + def missingColumnBeforeInError(origin: Origin): Throwable = { + new AnalysisException( + errorClass = "INVALID_SQL_SYNTAX.MISSING_COLUMN_BEFORE_IN", + messageParameters = Map.empty, + origin = origin + ) + } + def unresolvedFieldError( fieldName: String, columnPath: Seq[String], diff --git a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala 
b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala index ee644fc62a1ab..ac18e1be19a2e 100644 --- a/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala +++ b/sql/catalyst/src/test/scala/org/apache/spark/sql/catalyst/analysis/AnalysisErrorSuite.scala @@ -1062,6 +1062,39 @@ class AnalysisErrorSuite extends AnalysisTest with DataTypeErrorsBase { "expressionList" -> "max(DISTINCT b)")) } + test("Error when column name is missing before IN in WHERE clause") { + val a = AttributeReference("a", StringType)() + val plan = Filter( + In(UnresolvedAttribute("in"), Seq(Literal("2024-07-05"))), + LocalRelation(a)) + assertAnalysisErrorCondition(plan, + expectedErrorCondition = "INVALID_SQL_SYNTAX.MISSING_COLUMN_BEFORE_IN", + expectedMessageParameters = Map.empty) + } + + test("Error when WHERE uses in(v) as function — same as Spark SQL WHERE in (1)") { + val a = AttributeReference("id", IntegerType)() + val plan = Filter( + UnresolvedFunction("in", Seq(Literal(1)), isDistinct = false), + LocalRelation(a)) + assertAnalysisErrorCondition(plan, + expectedErrorCondition = "INVALID_SQL_SYNTAX.MISSING_COLUMN_BEFORE_IN", + expectedMessageParameters = Map.empty) + } + + test("Error when in(...) has only literals — no column before IN list (e.g. 
WHERE IN (v1,v2,...))") { + val a = AttributeReference("id", IntegerType)() + val plan = Filter( + UnresolvedFunction( + "in", + Seq(Literal(1), Literal(2), Literal(3)), + isDistinct = false), + LocalRelation(a)) + assertAnalysisErrorCondition(plan, + expectedErrorCondition = "INVALID_SQL_SYNTAX.MISSING_COLUMN_BEFORE_IN", + expectedMessageParameters = Map.empty) + } + test("SPARK-30811: CTE should not cause stack overflow when " + "it refers to non-existent table with same name") { val plan = UnresolvedWith( From a3e232aa5525cb222e288b54a1369d847fb16695 Mon Sep 17 00:00:00 2001 From: Manikandan Ganesan Date: Sat, 21 Mar 2026 00:19:42 +0530 Subject: [PATCH 2/3] modified error clause as per guidelines --- common/utils/src/main/resources/error/error-conditions.json | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index 8439ef4735813..e2ae8d9f04413 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -4374,10 +4374,8 @@ }, "MISSING_COLUMN_BEFORE_IN" : { "message" : [ - "Column name is missing before IN clause.", - "Expected syntax: IN (value1, value2, ...)", - "Found: IN (values)", - "For example: SELECT * FROM t WHERE id IN (1, 2)" + "Invalid IN predicate: a column or expression must appear before IN and its parenthesized value list.", + "For example, use `WHERE id IN (1, 2)` instead of `WHERE IN (1, 2)` or `WHERE in (1)` with no expression before `in`." 
] }, "EMPTY_PARTITION_VALUE" : { From b18b514974dc921e458f033fc00dc37fac0a4ee0 Mon Sep 17 00:00:00 2001 From: Manikandan Ganesan Date: Mon, 23 Mar 2026 17:58:03 +0530 Subject: [PATCH 3/3] modified clause order in error-conditions.json --- .../src/main/resources/error/error-conditions.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/common/utils/src/main/resources/error/error-conditions.json b/common/utils/src/main/resources/error/error-conditions.json index e2ae8d9f04413..b4c0ade7cd1f3 100644 --- a/common/utils/src/main/resources/error/error-conditions.json +++ b/common/utils/src/main/resources/error/error-conditions.json @@ -4372,12 +4372,6 @@ "IN predicate requires at least one value. Empty IN clauses like 'IN ()' are not allowed. Consider using 'WHERE FALSE' if you need an always-false condition, or provide at least one value in the IN list." ] }, - "MISSING_COLUMN_BEFORE_IN" : { - "message" : [ - "Invalid IN predicate: a column or expression must appear before IN and its parenthesized value list.", - "For example, use `WHERE id IN (1, 2)` instead of `WHERE IN (1, 2)` or `WHERE in (1)` with no expression before `in`." - ] - }, "EMPTY_PARTITION_VALUE" : { "message" : [ "Partition key must set value." @@ -4413,6 +4407,12 @@ "LATERAL can only be used with subquery and table-valued functions." ] }, + "MISSING_COLUMN_BEFORE_IN" : { + "message" : [ + "Invalid IN predicate: a column or expression must appear before IN and its parenthesized value list.", + "For example, use `WHERE id IN (1, 2)` instead of `WHERE IN (1, 2)` or `WHERE in (1)` with no expression before `in`." + ] + }, "MULTI_PART_NAME" : { "message" : [ "<statement> with multiple part name(<name>) is not allowed."