From d0fb0396bdc7f9ee57017e6667be599a029368c4 Mon Sep 17 00:00:00 2001 From: Richard Wooding Date: Sun, 14 Jun 2026 18:26:39 +0200 Subject: [PATCH] refactor(dialect): extract shared RegexSafety and SqlEmitters helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The five RE2-style dialects each carried a byte-identical copy of the ReDoS-safety validation (length/group/nesting limits, nested-quantifier and quantified-alternation detection, and the three helper methods), and the six dialect implementations duplicated several SQL-fragment emitters (EXTRACT, array join, regex match, JSON path probe, json_each membership, binary-function split, arrow JSON access). Consolidate both into two focused helpers in the dialect package: - RegexSafety: checkLength + checkReDoS plus the shared limits/patterns. Also normalizes the rejection message — MySQL/BigQuery previously emitted "Invalid regex pattern" while the others emitted "Invalid pattern in expression"; all dialects now use the latter consistently. - SqlEmitters: writeBinaryCall, writeArrayJoin, writeJsonEachMembership, writeJsonPathProbe, writeInfixRegex, writeStandardExtract / writeExtractWithPostgresDow, and writeArrowJsonAccess. Per-dialect field-name escaping is threaded through as a method reference so BigQuery's distinct escaping is preserved. Each dialect keeps its own override and delegates the body; dialects whose output genuinely differs keep their inline implementation. Net ~878 fewer lines across the 11 dialect files. No behavioral change — the per-dialect SQL-output tests pass unchanged. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- .../cel2sql/dialect/RegexSafety.java | 183 ++++++++++++++++++ .../cel2sql/dialect/SqlEmitters.java | 147 ++++++++++++++ .../dialect/bigquery/BigQueryDialect.java | 33 +--- .../dialect/bigquery/BigQueryRegex.java | 149 +------------- .../cel2sql/dialect/duckdb/DuckDbDialect.java | 70 +------ .../cel2sql/dialect/duckdb/DuckDbRegex.java | 149 +------------- .../cel2sql/dialect/mysql/MySqlDialect.java | 14 +- .../cel2sql/dialect/mysql/MySqlRegex.java | 149 +------------- .../dialect/postgres/PostgresDialect.java | 50 +---- .../dialect/postgres/PostgresRegex.java | 151 +-------------- .../cel2sql/dialect/spark/SparkDialect.java | 39 +--- .../cel2sql/dialect/spark/SparkRegex.java | 111 +---------- .../cel2sql/dialect/sqlite/SqliteDialect.java | 21 +- 13 files changed, 388 insertions(+), 878 deletions(-) create mode 100644 src/main/java/com/spandigital/cel2sql/dialect/RegexSafety.java create mode 100644 src/main/java/com/spandigital/cel2sql/dialect/SqlEmitters.java diff --git a/src/main/java/com/spandigital/cel2sql/dialect/RegexSafety.java b/src/main/java/com/spandigital/cel2sql/dialect/RegexSafety.java new file mode 100644 index 0000000..a42be8d --- /dev/null +++ b/src/main/java/com/spandigital/cel2sql/dialect/RegexSafety.java @@ -0,0 +1,183 @@ +package com.spandigital.cel2sql.dialect; + +import com.spandigital.cel2sql.error.ConversionException; + +import java.util.regex.Pattern; + +/** + * Shared ReDoS-safety validation for the RE2-style regex dialects + * (PostgreSQL, MySQL, DuckDB, BigQuery, Spark). + * + *

<p>Every dialect that accepts regular expressions enforces the same structural + * limits to prevent catastrophic-backtracking attacks (CWE-1333): a maximum + * pattern length, capture-group count, and nesting depth, plus heuristics that + * reject nested quantifiers and quantified alternation. Those checks are + * dialect-agnostic — they operate on the RE2 source pattern before any + * dialect-specific conversion — so they live here once instead of being copied + * into each {@code XxxRegex} class.</p>

+ * + *

<p>Dialect-specific concerns (unsupported-feature detection, the actual + * RE2-to-native conversion) remain in the per-dialect classes.</p>

+ */ +public final class RegexSafety { + + /** Maximum allowed regex pattern length. */ + public static final int MAX_PATTERN_LENGTH = 500; + + /** Maximum allowed capture groups in a pattern. */ + public static final int MAX_GROUPS = 20; + + /** Maximum allowed nesting depth of parenthesized groups. */ + public static final int MAX_NESTING_DEPTH = 10; + + private static final Pattern NESTED_QUANTIFIERS = Pattern.compile("[*+][*+]"); + private static final Pattern QUANTIFIED_ALTERNATION = Pattern.compile("\\([^)]*\\|[^)]*\\)[*+]"); + + private RegexSafety() { + } + + /** + * Enforces the maximum pattern-length limit. + * + * @param pattern the RE2 regex pattern + * @throws ConversionException if the pattern exceeds {@link #MAX_PATTERN_LENGTH} + */ + public static void checkLength(String pattern) throws ConversionException { + if (pattern.length() > MAX_PATTERN_LENGTH) { + throw new ConversionException( + "Invalid pattern in expression", + String.format("pattern length %d exceeds limit of %d characters", + pattern.length(), MAX_PATTERN_LENGTH)); + } + } + + /** + * Runs the shared structural ReDoS checks against a pattern, in order: + *
<ol>
+ *   <li>simple back-to-back quantifiers ({@code a*+}, {@code a++})</li>
+ *   <li>quantified groups that themselves contain inner quantifiers ({@code (a+)+})</li>
+ *   <li>capture-group count limit</li>
+ *   <li>quantified alternation ({@code (a|b)+})</li>
+ *   <li>group nesting-depth limit</li>
+ * </ol>
+ * + * @param pattern the RE2 regex pattern (after any case-insensitivity flag has been stripped) + * @throws ConversionException if any limit is exceeded or a catastrophic construct is detected + */ + public static void checkReDoS(String pattern) throws ConversionException { + if (NESTED_QUANTIFIERS.matcher(pattern).find()) { + throw new ConversionException( + "Invalid pattern in expression", + "regex contains catastrophic nested quantifiers that could cause ReDoS"); + } + + validateNoNestedQuantifiers(pattern); + + int groupCount = countUnescapedParens(pattern); + if (groupCount > MAX_GROUPS) { + throw new ConversionException( + "Invalid pattern in expression", + String.format("regex contains %d capture groups, exceeds limit of %d", + groupCount, MAX_GROUPS)); + } + + if (QUANTIFIED_ALTERNATION.matcher(pattern).find()) { + throw new ConversionException( + "Invalid pattern in expression", + "regex contains quantified alternation that could cause ReDoS"); + } + + int maxDepth = computeMaxNestingDepth(pattern); + if (maxDepth > MAX_NESTING_DEPTH) { + throw new ConversionException( + "Invalid pattern in expression", + String.format("nesting depth %d exceeds limit of %d", maxDepth, MAX_NESTING_DEPTH)); + } + } + + /** + * Validates that no quantified groups contain inner quantifiers (nested quantifiers). + * This detects patterns like {@code (a+)+} that can cause catastrophic backtracking. 
+ */ + private static void validateNoNestedQuantifiers(String pattern) throws ConversionException { + int depth = 0; + boolean[] groupHasQuantifier = new boolean[pattern.length() + 1]; // oversized but safe + int stackTop = -1; + + for (int i = 0; i < pattern.length(); i++) { + char ch = pattern.charAt(i); + + // Skip escaped characters + if (i > 0 && pattern.charAt(i - 1) == '\\') { + continue; + } + + switch (ch) { + case '(' -> { + depth++; + stackTop++; + groupHasQuantifier[stackTop] = false; + } + case ')' -> { + if (depth > 0) { + depth--; + if (i + 1 < pattern.length()) { + char next = pattern.charAt(i + 1); + if (next == '*' || next == '+' || next == '?' || next == '{') { + if (stackTop >= 0 && groupHasQuantifier[stackTop]) { + throw new ConversionException( + "Invalid pattern in expression", + "regex contains catastrophic nested quantifiers that could cause ReDoS"); + } + } + } + if (stackTop > 0 && groupHasQuantifier[stackTop]) { + groupHasQuantifier[stackTop - 1] = true; + } + if (stackTop >= 0) { + stackTop--; + } + } + } + case '*', '+', '?', '{' -> { + if (stackTop >= 0) { + groupHasQuantifier[stackTop] = true; + } + } + } + } + } + + /** + * Counts the number of unescaped opening parentheses in the pattern. + */ + private static int countUnescapedParens(String pattern) { + int count = 0; + for (int i = 0; i < pattern.length(); i++) { + if (pattern.charAt(i) == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { + count++; + } + } + return count; + } + + /** + * Computes the maximum nesting depth of parenthesized groups in the pattern. 
+ */ + private static int computeMaxNestingDepth(String pattern) { + int maxDepth = 0; + int currentDepth = 0; + for (int i = 0; i < pattern.length(); i++) { + char ch = pattern.charAt(i); + if (ch == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { + currentDepth++; + if (currentDepth > maxDepth) { + maxDepth = currentDepth; + } + } else if (ch == ')' && (i == 0 || pattern.charAt(i - 1) != '\\')) { + currentDepth--; + } + } + return maxDepth; + } +} diff --git a/src/main/java/com/spandigital/cel2sql/dialect/SqlEmitters.java b/src/main/java/com/spandigital/cel2sql/dialect/SqlEmitters.java new file mode 100644 index 0000000..188af4f --- /dev/null +++ b/src/main/java/com/spandigital/cel2sql/dialect/SqlEmitters.java @@ -0,0 +1,147 @@ +package com.spandigital.cel2sql.dialect; + +import com.spandigital.cel2sql.error.ConversionException; + +import java.util.List; +import java.util.function.UnaryOperator; + +/** + * Shared SQL-emission helpers for the recurring fragment shapes that several + * dialects render identically. + * + *

<p>Each dialect still declares its own {@code Dialect} override (so the + * per-dialect behaviour stays explicit and greppable), but the dialects that + * happen to share a shape delegate the actual {@link StringBuilder} writing + * here instead of copy-pasting the body. Dialects whose output genuinely + * differs keep their own inline implementation.</p>

+ */ +public final class SqlEmitters { + + private SqlEmitters() { + } + + /** + * Writes a two-argument function call: {@code func(a, b)}. + */ + public static void writeBinaryCall(StringBuilder w, String func, SqlWriter writeA, SqlWriter writeB) + throws ConversionException { + w.append(func).append('('); + writeA.write(); + w.append(", "); + writeB.write(); + w.append(')'); + } + + /** + * Writes an array-to-string join: {@code func(array, delim)} where a null + * delimiter falls back to the empty string, optionally followed by a + * trailing empty-string argument (PostgreSQL's {@code ARRAY_TO_STRING} + * null-replacement parameter). + */ + public static void writeArrayJoin(StringBuilder w, String func, SqlWriter writeArray, + SqlWriter writeDelim, boolean trailingEmptyArg) + throws ConversionException { + w.append(func).append('('); + writeArray.write(); + w.append(", "); + if (writeDelim != null) { + writeDelim.write(); + } else { + w.append("''"); + } + if (trailingEmptyArg) { + w.append(", ''"); + } + w.append(')'); + } + + /** + * Writes the {@code json_each} membership idiom shared by SQLite and DuckDB: + * {@code EXISTS (SELECT 1 FROM json_each(array) WHERE value = elem)}. + */ + public static void writeJsonEachMembership(StringBuilder w, SqlWriter writeArray, SqlWriter writeElem) + throws ConversionException { + w.append("EXISTS (SELECT 1 FROM json_each("); + writeArray.write(); + w.append(") WHERE value = "); + writeElem.write(); + w.append(')'); + } + + /** + * Writes a JSON path-existence probe: {@code func(root, '$.seg.seg...')} + * followed by {@code suffix} (e.g. {@code " IS NOT NULL"}). Each path + * segment is escaped via {@code escape}. 
+ */ + public static void writeJsonPathProbe(StringBuilder w, String func, SqlWriter writeRoot, + List pathSegments, String suffix, UnaryOperator escape) + throws ConversionException { + w.append(func).append('('); + writeRoot.write(); + w.append(", '$"); + for (String segment : pathSegments) { + w.append('.').append(escape.apply(segment)); + } + w.append("')").append(suffix); + } + + /** + * Writes an infix regex match: {@code target 'pattern'} with the + * pattern's single quotes doubled for SQL-string escaping. Used by dialects + * whose regex operator is a binary infix token ({@code ~}/{@code ~*}, + * {@code REGEXP}, {@code RLIKE}). + */ + public static void writeInfixRegex(StringBuilder w, SqlWriter writeTarget, String op, String pattern) + throws ConversionException { + writeTarget.write(); + w.append(op); + w.append('\'').append(pattern.replace("'", "''")).append('\''); + } + + /** + * Writes a standard SQL {@code EXTRACT(part FROM expr [AT TIME ZONE tz])} + * clause. The day-of-week conversion wrapping that some dialects apply is + * left to the caller. + */ + public static void writeStandardExtract(StringBuilder w, String part, SqlWriter writeExpr, SqlWriter writeTZ) + throws ConversionException { + w.append("EXTRACT(").append(part).append(" FROM "); + writeExpr.write(); + if (writeTZ != null) { + w.append(" AT TIME ZONE "); + writeTZ.write(); + } + w.append(')'); + } + + /** + * Writes a standard {@code EXTRACT}, applying the PostgreSQL/DuckDB + * day-of-week remapping {@code (EXTRACT(DOW FROM ...) + 6) % 7} when + * {@code part} is {@code "DOW"}. Both engines share this exact convention. 
+ */ + public static void writeExtractWithPostgresDow(StringBuilder w, String part, SqlWriter writeExpr, SqlWriter writeTZ) + throws ConversionException { + boolean isDOW = "DOW".equals(part); + if (isDOW) { + w.append('('); + } + writeStandardExtract(w, part, writeExpr, writeTZ); + if (isDOW) { + w.append(" + 6) % 7"); + } + } + + /** + * Writes PostgreSQL/DuckDB arrow-operator JSON field access: + * {@code base->>'field'} (final/text extraction) or {@code base->'field'} + * (intermediate/json extraction). The field name is escaped via + * {@code escape}. + */ + public static void writeArrowJsonAccess(StringBuilder w, SqlWriter writeBase, String fieldName, + boolean isFinal, UnaryOperator escape) + throws ConversionException { + writeBase.write(); + w.append(isFinal ? "->>'" : "->'"); + w.append(escape.apply(fieldName)).append('\''); + } +} diff --git a/src/main/java/com/spandigital/cel2sql/dialect/bigquery/BigQueryDialect.java b/src/main/java/com/spandigital/cel2sql/dialect/bigquery/BigQueryDialect.java index db5ec41..c4dfe41 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/bigquery/BigQueryDialect.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/bigquery/BigQueryDialect.java @@ -7,6 +7,7 @@ import com.spandigital.cel2sql.dialect.IndexRecommendation; import com.spandigital.cel2sql.dialect.PatternType; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.SqlEmitters; import com.spandigital.cel2sql.dialect.SqlWriter; import com.spandigital.cel2sql.error.ConversionException; @@ -195,13 +196,7 @@ public void writeJSONArrayLength(StringBuilder w, SqlWriter writeExpr) throws Co @Override public void writeJSONExtractPath(StringBuilder w, List pathSegments, SqlWriter writeRoot) throws ConversionException { - w.append("JSON_VALUE("); - writeRoot.write(); - w.append(", '$"); - for (String segment : pathSegments) { - w.append('.').append(escapeJSONFieldName(segment)); - } - w.append("') IS NOT NULL"); + 
SqlEmitters.writeJsonPathProbe(w, "JSON_VALUE", writeRoot, pathSegments, " IS NOT NULL", BigQueryDialect::escapeJSONFieldName); } @Override @@ -246,13 +241,7 @@ public void writeExtract(StringBuilder w, String part, SqlWriter writeExpr, SqlW } w.append(") - 1)"); } else { - w.append("EXTRACT(").append(part).append(" FROM "); - writeExpr.write(); - if (writeTZ != null) { - w.append(" AT TIME ZONE "); - writeTZ.write(); - } - w.append(')'); + SqlEmitters.writeStandardExtract(w, part, writeExpr, writeTZ); } } @@ -282,11 +271,7 @@ public void writeContains(StringBuilder w, SqlWriter writeHaystack, SqlWriter wr @Override public void writeSplit(StringBuilder w, SqlWriter writeStr, SqlWriter writeDelim) throws ConversionException { - w.append("SPLIT("); - writeStr.write(); - w.append(", "); - writeDelim.write(); - w.append(')'); + SqlEmitters.writeBinaryCall(w, "SPLIT", writeStr, writeDelim); } @Override @@ -300,15 +285,7 @@ public void writeSplitWithLimit(StringBuilder w, SqlWriter writeStr, SqlWriter w @Override public void writeJoin(StringBuilder w, SqlWriter writeArray, SqlWriter writeDelim) throws ConversionException { - w.append("ARRAY_TO_STRING("); - writeArray.write(); - w.append(", "); - if (writeDelim != null) { - writeDelim.write(); - } else { - w.append("''"); - } - w.append(')'); + SqlEmitters.writeArrayJoin(w, "ARRAY_TO_STRING", writeArray, writeDelim, false); } @Override diff --git a/src/main/java/com/spandigital/cel2sql/dialect/bigquery/BigQueryRegex.java b/src/main/java/com/spandigital/cel2sql/dialect/bigquery/BigQueryRegex.java index fa3f397..d09488f 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/bigquery/BigQueryRegex.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/bigquery/BigQueryRegex.java @@ -1,6 +1,7 @@ package com.spandigital.cel2sql.dialect.bigquery; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.RegexSafety; import com.spandigital.cel2sql.error.ConversionException; import 
java.util.regex.Pattern; @@ -15,18 +16,6 @@ */ final class BigQueryRegex { - /** Maximum allowed regex pattern length. */ - static final int MAX_PATTERN_LENGTH = 500; - - /** Maximum allowed capture groups in a pattern. */ - static final int MAX_GROUPS = 20; - - /** Maximum allowed nesting depth of parenthesized groups. */ - static final int MAX_NESTING_DEPTH = 10; - - private static final Pattern NESTED_QUANTIFIERS = Pattern.compile("[*+][*+]"); - private static final Pattern QUANTIFIED_ALTERNATION = Pattern.compile("\\([^)]*\\|[^)]*\\)[*+]"); - private BigQueryRegex() { } @@ -57,12 +46,7 @@ private BigQueryRegex() { */ static RegexResult convertRE2ToBigQuery(String re2Pattern) throws ConversionException { // 1. Check pattern length - if (re2Pattern.length() > MAX_PATTERN_LENGTH) { - throw ConversionException.of( - "Invalid regex pattern", - String.format("pattern length %d exceeds limit of %d characters", - re2Pattern.length(), MAX_PATTERN_LENGTH)); - } + RegexSafety.checkLength(re2Pattern); // 2. Validate pattern compiles try { @@ -90,39 +74,9 @@ static RegexResult convertRE2ToBigQuery(String re2Pattern) throws ConversionExce "named capture groups (?P...) are not supported in BigQuery regex"); } - // 4. Detect catastrophic nested quantifiers - if (NESTED_QUANTIFIERS.matcher(re2Pattern).find()) { - throw ConversionException.of( - "Invalid regex pattern", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - - // 5. Check nested quantifiers in groups - validateNoNestedQuantifiers(re2Pattern); - - // 6. Count and limit capture groups - int groupCount = countUnescapedParens(re2Pattern); - if (groupCount > MAX_GROUPS) { - throw ConversionException.of( - "Invalid regex pattern", - String.format("regex contains %d capture groups, exceeds limit of %d", - groupCount, MAX_GROUPS)); - } - - // 7. 
Detect exponential alternation patterns - if (QUANTIFIED_ALTERNATION.matcher(re2Pattern).find()) { - throw ConversionException.of( - "Invalid regex pattern", - "regex contains quantified alternation that could cause ReDoS"); - } - - // 8. Check nesting depth - int maxDepth = computeMaxNestingDepth(re2Pattern); - if (maxDepth > MAX_NESTING_DEPTH) { - throw ConversionException.of( - "Invalid regex pattern", - String.format("nesting depth %d exceeds limit of %d", maxDepth, MAX_NESTING_DEPTH)); - } + // 4-8. Shared ReDoS safety checks (nested quantifiers, group count, + // quantified alternation, nesting depth) + RegexSafety.checkReDoS(re2Pattern); // 9. Handle (?i) flag -> set caseInsensitive=true, strip prefix boolean caseInsensitive = false; @@ -146,97 +100,4 @@ static RegexResult convertRE2ToBigQuery(String re2Pattern) throws ConversionExce return new RegexResult(pattern, caseInsensitive); } - - /** - * Validates that no quantified groups contain inner quantifiers (nested quantifiers). - * This detects patterns like {@code (a+)+} that can cause catastrophic backtracking. - */ - private static void validateNoNestedQuantifiers(String pattern) throws ConversionException { - int depth = 0; - boolean[] groupHasQuantifier = new boolean[pattern.length()]; // oversized but safe - int stackTop = -1; - - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - - // Skip escaped characters - if (i > 0 && pattern.charAt(i - 1) == '\\') { - continue; - } - - switch (ch) { - case '(' -> { - depth++; - stackTop++; - groupHasQuantifier[stackTop] = false; - } - case ')' -> { - if (depth > 0) { - depth--; - if (i + 1 < pattern.length()) { - char next = pattern.charAt(i + 1); - if (next == '*' || next == '+' || next == '?' 
|| next == '{') { - if (stackTop >= 0 && groupHasQuantifier[stackTop]) { - throw ConversionException.of( - "Invalid regex pattern", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - } - } - if (stackTop > 0) { - if (groupHasQuantifier[stackTop]) { - groupHasQuantifier[stackTop - 1] = true; - } - } - if (stackTop >= 0) { - stackTop--; - } - } - } - case '*', '+', '?' -> { - if (stackTop >= 0) { - groupHasQuantifier[stackTop] = true; - } - } - case '{' -> { - if (stackTop >= 0) { - groupHasQuantifier[stackTop] = true; - } - } - } - } - } - - /** - * Counts the number of unescaped opening parentheses in the pattern. - */ - private static int countUnescapedParens(String pattern) { - int count = 0; - for (int i = 0; i < pattern.length(); i++) { - if (pattern.charAt(i) == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - count++; - } - } - return count; - } - - /** - * Computes the maximum nesting depth of parenthesized groups in the pattern. - */ - private static int computeMaxNestingDepth(String pattern) { - int maxDepth = 0; - int currentDepth = 0; - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - if (ch == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth++; - if (currentDepth > maxDepth) { - maxDepth = currentDepth; - } - } else if (ch == ')' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth--; - } - } - return maxDepth; - } } diff --git a/src/main/java/com/spandigital/cel2sql/dialect/duckdb/DuckDbDialect.java b/src/main/java/com/spandigital/cel2sql/dialect/duckdb/DuckDbDialect.java index b76557a..bf23194 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/duckdb/DuckDbDialect.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/duckdb/DuckDbDialect.java @@ -7,6 +7,7 @@ import com.spandigital.cel2sql.dialect.IndexRecommendation; import com.spandigital.cel2sql.dialect.PatternType; import com.spandigital.cel2sql.dialect.RegexResult; +import 
com.spandigital.cel2sql.dialect.SqlEmitters; import com.spandigital.cel2sql.dialect.SqlWriter; import com.spandigital.cel2sql.error.ConversionException; @@ -61,14 +62,7 @@ public void writeStringConcat(StringBuilder w, SqlWriter writeLHS, SqlWriter wri @Override public void writeRegexMatch(StringBuilder w, SqlWriter writeTarget, String pattern, boolean caseInsensitive) throws ConversionException { - writeTarget.write(); - if (caseInsensitive) { - w.append(" ~* "); - } else { - w.append(" ~ "); - } - String escaped = pattern.replace("'", "''"); - w.append('\'').append(escaped).append('\''); + SqlEmitters.writeInfixRegex(w, writeTarget, caseInsensitive ? " ~* " : " ~ ", pattern); } @Override @@ -160,14 +154,7 @@ public void writeEmptyTypedArray(StringBuilder w, String typeName) { @Override public void writeJSONFieldAccess(StringBuilder w, SqlWriter writeBase, String fieldName, boolean isFinal) throws ConversionException { - writeBase.write(); - String escapedField = escapeJSONFieldName(fieldName); - if (isFinal) { - w.append("->>'"); - } else { - w.append("->'"); - } - w.append(escapedField).append('\''); + SqlEmitters.writeArrowJsonAccess(w, writeBase, fieldName, isFinal, DuckDbDialect::escapeJSONFieldName); } @Override @@ -194,31 +181,17 @@ public void writeJSONArrayLength(StringBuilder w, SqlWriter writeExpr) throws Co @Override public void writeJSONExtractPath(StringBuilder w, List pathSegments, SqlWriter writeRoot) throws ConversionException { - w.append("json_exists("); - writeRoot.write(); - w.append(", '$"); - for (String segment : pathSegments) { - w.append('.').append(escapeJSONFieldName(segment)); - } - w.append("')"); + SqlEmitters.writeJsonPathProbe(w, "json_exists", writeRoot, pathSegments, "", DuckDbDialect::escapeJSONFieldName); } @Override public void writeJSONArrayMembership(StringBuilder w, String jsonFunc, SqlWriter writeElem, SqlWriter writeArray) throws ConversionException { - w.append("EXISTS (SELECT 1 FROM json_each("); - writeArray.write(); - 
w.append(") WHERE value = "); - writeElem.write(); - w.append(')'); + SqlEmitters.writeJsonEachMembership(w, writeArray, writeElem); } @Override public void writeNestedJSONArrayMembership(StringBuilder w, SqlWriter writeElem, SqlWriter writeArray) throws ConversionException { - w.append("EXISTS (SELECT 1 FROM json_each("); - writeArray.write(); - w.append(") WHERE value = "); - writeElem.write(); - w.append(')'); + SqlEmitters.writeJsonEachMembership(w, writeArray, writeElem); } // --- Timestamps --- @@ -237,20 +210,7 @@ public void writeInterval(StringBuilder w, SqlWriter writeValue, String unit) th @Override public void writeExtract(StringBuilder w, String part, SqlWriter writeExpr, SqlWriter writeTZ) throws ConversionException { - boolean isDOW = "DOW".equals(part); - if (isDOW) { - w.append('('); - } - w.append("EXTRACT(").append(part).append(" FROM "); - writeExpr.write(); - if (writeTZ != null) { - w.append(" AT TIME ZONE "); - writeTZ.write(); - } - w.append(')'); - if (isDOW) { - w.append(" + 6) % 7"); - } + SqlEmitters.writeExtractWithPostgresDow(w, part, writeExpr, writeTZ); } @Override @@ -273,11 +233,7 @@ public void writeContains(StringBuilder w, SqlWriter writeHaystack, SqlWriter wr @Override public void writeSplit(StringBuilder w, SqlWriter writeStr, SqlWriter writeDelim) throws ConversionException { - w.append("STRING_SPLIT("); - writeStr.write(); - w.append(", "); - writeDelim.write(); - w.append(')'); + SqlEmitters.writeBinaryCall(w, "STRING_SPLIT", writeStr, writeDelim); } @Override @@ -291,15 +247,7 @@ public void writeSplitWithLimit(StringBuilder w, SqlWriter writeStr, SqlWriter w @Override public void writeJoin(StringBuilder w, SqlWriter writeArray, SqlWriter writeDelim) throws ConversionException { - w.append("ARRAY_TO_STRING("); - writeArray.write(); - w.append(", "); - if (writeDelim != null) { - writeDelim.write(); - } else { - w.append("''"); - } - w.append(')'); + SqlEmitters.writeArrayJoin(w, "ARRAY_TO_STRING", writeArray, writeDelim, 
false); } @Override diff --git a/src/main/java/com/spandigital/cel2sql/dialect/duckdb/DuckDbRegex.java b/src/main/java/com/spandigital/cel2sql/dialect/duckdb/DuckDbRegex.java index 3bfe38e..3ae028c 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/duckdb/DuckDbRegex.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/duckdb/DuckDbRegex.java @@ -1,6 +1,7 @@ package com.spandigital.cel2sql.dialect.duckdb; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.RegexSafety; import com.spandigital.cel2sql.error.ConversionException; import java.util.regex.Pattern; @@ -15,18 +16,6 @@ */ final class DuckDbRegex { - /** Maximum allowed regex pattern length. */ - static final int MAX_PATTERN_LENGTH = 500; - - /** Maximum allowed capture groups in a pattern. */ - static final int MAX_GROUPS = 20; - - /** Maximum allowed nesting depth of parenthesized groups. */ - static final int MAX_NESTING_DEPTH = 10; - - private static final Pattern NESTED_QUANTIFIERS = Pattern.compile("[*+][*+]"); - private static final Pattern QUANTIFIED_ALTERNATION = Pattern.compile("\\([^)]*\\|[^)]*\\)[*+]"); - private DuckDbRegex() { } @@ -56,12 +45,7 @@ private DuckDbRegex() { */ static RegexResult convertRE2ToDuckDB(String re2Pattern) throws ConversionException { // 1. Check pattern length - if (re2Pattern.length() > MAX_PATTERN_LENGTH) { - throw new ConversionException( - "Invalid pattern in expression", - String.format("pattern length %d exceeds limit of %d characters", - re2Pattern.length(), MAX_PATTERN_LENGTH)); - } + RegexSafety.checkLength(re2Pattern); // 2. Validate pattern compiles try { @@ -89,39 +73,9 @@ static RegexResult convertRE2ToDuckDB(String re2Pattern) throws ConversionExcept "named capture groups (?P...) are not supported in DuckDB regex"); } - // 4. 
Detect catastrophic nested quantifiers - if (NESTED_QUANTIFIERS.matcher(re2Pattern).find()) { - throw new ConversionException( - "Invalid pattern in expression", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - - // 5. Check for groups with quantifiers that are themselves quantified - validateNoNestedQuantifiers(re2Pattern); - - // 6. Count and limit capture groups - int groupCount = countUnescapedParens(re2Pattern); - if (groupCount > MAX_GROUPS) { - throw new ConversionException( - "Invalid pattern in expression", - String.format("regex contains %d capture groups, exceeds limit of %d", - groupCount, MAX_GROUPS)); - } - - // 7. Detect exponential alternation patterns - if (QUANTIFIED_ALTERNATION.matcher(re2Pattern).find()) { - throw new ConversionException( - "Invalid pattern in expression", - "regex contains quantified alternation that could cause ReDoS"); - } - - // 8. Check nesting depth - int maxDepth = computeMaxNestingDepth(re2Pattern); - if (maxDepth > MAX_NESTING_DEPTH) { - throw new ConversionException( - "Invalid pattern in expression", - String.format("nesting depth %d exceeds limit of %d", maxDepth, MAX_NESTING_DEPTH)); - } + // 4-8. Shared ReDoS safety checks (nested quantifiers, group count, + // quantified alternation, nesting depth) + RegexSafety.checkReDoS(re2Pattern); // 9. Handle (?i) flag boolean caseInsensitive = false; @@ -143,97 +97,4 @@ static RegexResult convertRE2ToDuckDB(String re2Pattern) throws ConversionExcept return new RegexResult(pattern, caseInsensitive); } - - /** - * Validates that no quantified groups contain inner quantifiers (nested quantifiers). - * This detects patterns like {@code (a+)+} that can cause catastrophic backtracking. 
- */ - private static void validateNoNestedQuantifiers(String pattern) throws ConversionException { - int depth = 0; - boolean[] groupHasQuantifier = new boolean[pattern.length()]; // oversized but safe - int stackTop = -1; - - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - - // Skip escaped characters - if (i > 0 && pattern.charAt(i - 1) == '\\') { - continue; - } - - switch (ch) { - case '(' -> { - depth++; - stackTop++; - groupHasQuantifier[stackTop] = false; - } - case ')' -> { - if (depth > 0) { - depth--; - if (i + 1 < pattern.length()) { - char next = pattern.charAt(i + 1); - if (next == '*' || next == '+' || next == '?' || next == '{') { - if (stackTop >= 0 && groupHasQuantifier[stackTop]) { - throw new ConversionException( - "Invalid pattern in expression", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - } - } - if (stackTop > 0) { - if (groupHasQuantifier[stackTop]) { - groupHasQuantifier[stackTop - 1] = true; - } - } - if (stackTop >= 0) { - stackTop--; - } - } - } - case '*', '+', '?' -> { - if (stackTop >= 0) { - groupHasQuantifier[stackTop] = true; - } - } - case '{' -> { - if (stackTop >= 0) { - groupHasQuantifier[stackTop] = true; - } - } - } - } - } - - /** - * Counts the number of unescaped opening parentheses in the pattern. - */ - private static int countUnescapedParens(String pattern) { - int count = 0; - for (int i = 0; i < pattern.length(); i++) { - if (pattern.charAt(i) == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - count++; - } - } - return count; - } - - /** - * Computes the maximum nesting depth of parenthesized groups in the pattern. 
- */ - private static int computeMaxNestingDepth(String pattern) { - int maxDepth = 0; - int currentDepth = 0; - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - if (ch == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth++; - if (currentDepth > maxDepth) { - maxDepth = currentDepth; - } - } else if (ch == ')' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth--; - } - } - return maxDepth; - } } diff --git a/src/main/java/com/spandigital/cel2sql/dialect/mysql/MySqlDialect.java b/src/main/java/com/spandigital/cel2sql/dialect/mysql/MySqlDialect.java index 2f2e838..0c84394 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/mysql/MySqlDialect.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/mysql/MySqlDialect.java @@ -7,6 +7,7 @@ import com.spandigital.cel2sql.dialect.IndexRecommendation; import com.spandigital.cel2sql.dialect.PatternType; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.SqlEmitters; import com.spandigital.cel2sql.dialect.SqlWriter; import com.spandigital.cel2sql.error.ConversionException; @@ -63,10 +64,7 @@ public void writeStringConcat(StringBuilder w, SqlWriter writeLHS, SqlWriter wri @Override public void writeRegexMatch(StringBuilder w, SqlWriter writeTarget, String pattern, boolean caseInsensitive) throws ConversionException { - writeTarget.write(); - w.append(" REGEXP "); - String escaped = pattern.replace("'", "''"); - w.append('\'').append(escaped).append('\''); + SqlEmitters.writeInfixRegex(w, writeTarget, " REGEXP ", pattern); } @Override @@ -247,13 +245,7 @@ public void writeExtract(StringBuilder w, String part, SqlWriter writeExpr, SqlW } w.append(") + 5) % 7"); } else { - w.append("EXTRACT(").append(part).append(" FROM "); - writeExpr.write(); - if (writeTZ != null) { - w.append(" AT TIME ZONE "); - writeTZ.write(); - } - w.append(')'); + SqlEmitters.writeStandardExtract(w, part, writeExpr, writeTZ); } } diff --git 
a/src/main/java/com/spandigital/cel2sql/dialect/mysql/MySqlRegex.java b/src/main/java/com/spandigital/cel2sql/dialect/mysql/MySqlRegex.java index fab8b50..b9da7f0 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/mysql/MySqlRegex.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/mysql/MySqlRegex.java @@ -1,6 +1,7 @@ package com.spandigital.cel2sql.dialect.mysql; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.RegexSafety; import com.spandigital.cel2sql.error.ConversionException; import java.util.regex.Pattern; @@ -14,18 +15,6 @@ */ final class MySqlRegex { - /** Maximum allowed regex pattern length. */ - static final int MAX_PATTERN_LENGTH = 500; - - /** Maximum allowed capture groups in a pattern. */ - static final int MAX_GROUPS = 20; - - /** Maximum allowed nesting depth of parenthesized groups. */ - static final int MAX_NESTING_DEPTH = 10; - - private static final Pattern NESTED_QUANTIFIERS = Pattern.compile("[*+][*+]"); - private static final Pattern QUANTIFIED_ALTERNATION = Pattern.compile("\\([^)]*\\|[^)]*\\)[*+]"); - private MySqlRegex() { } @@ -56,12 +45,7 @@ private MySqlRegex() { */ static RegexResult convertRE2ToMySQL(String re2Pattern) throws ConversionException { // 1. Check pattern length - if (re2Pattern.length() > MAX_PATTERN_LENGTH) { - throw ConversionException.of( - "Invalid regex pattern", - String.format("pattern length %d exceeds limit of %d characters", - re2Pattern.length(), MAX_PATTERN_LENGTH)); - } + RegexSafety.checkLength(re2Pattern); // 2. Validate pattern compiles try { @@ -89,39 +73,9 @@ static RegexResult convertRE2ToMySQL(String re2Pattern) throws ConversionExcepti "named capture groups (?P...) are not supported in MySQL regex"); } - // 4. 
Detect catastrophic nested quantifiers - if (NESTED_QUANTIFIERS.matcher(re2Pattern).find()) { - throw ConversionException.of( - "Invalid regex pattern", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - - // 5. Check nested quantifiers in groups - validateNoNestedQuantifiers(re2Pattern); - - // 6. Count and limit capture groups - int groupCount = countUnescapedParens(re2Pattern); - if (groupCount > MAX_GROUPS) { - throw ConversionException.of( - "Invalid regex pattern", - String.format("regex contains %d capture groups, exceeds limit of %d", - groupCount, MAX_GROUPS)); - } - - // 7. Detect exponential alternation patterns - if (QUANTIFIED_ALTERNATION.matcher(re2Pattern).find()) { - throw ConversionException.of( - "Invalid regex pattern", - "regex contains quantified alternation that could cause ReDoS"); - } - - // 8. Check nesting depth - int maxDepth = computeMaxNestingDepth(re2Pattern); - if (maxDepth > MAX_NESTING_DEPTH) { - throw ConversionException.of( - "Invalid regex pattern", - String.format("nesting depth %d exceeds limit of %d", maxDepth, MAX_NESTING_DEPTH)); - } + // 4-8. Shared ReDoS safety checks (nested quantifiers, group count, + // quantified alternation, nesting depth) + RegexSafety.checkReDoS(re2Pattern); // 9. Handle (?i) flag -> set caseInsensitive=true, strip prefix boolean caseInsensitive = false; @@ -146,97 +100,4 @@ static RegexResult convertRE2ToMySQL(String re2Pattern) throws ConversionExcepti // 13. Return result return new RegexResult(pattern, caseInsensitive); } - - /** - * Validates that no quantified groups contain inner quantifiers (nested quantifiers). - * This detects patterns like {@code (a+)+} that can cause catastrophic backtracking. 
- */ - private static void validateNoNestedQuantifiers(String pattern) throws ConversionException { - int depth = 0; - boolean[] groupHasQuantifier = new boolean[pattern.length()]; // oversized but safe - int stackTop = -1; - - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - - // Skip escaped characters - if (i > 0 && pattern.charAt(i - 1) == '\\') { - continue; - } - - switch (ch) { - case '(' -> { - depth++; - stackTop++; - groupHasQuantifier[stackTop] = false; - } - case ')' -> { - if (depth > 0) { - depth--; - if (i + 1 < pattern.length()) { - char next = pattern.charAt(i + 1); - if (next == '*' || next == '+' || next == '?' || next == '{') { - if (stackTop >= 0 && groupHasQuantifier[stackTop]) { - throw ConversionException.of( - "Invalid regex pattern", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - } - } - if (stackTop > 0) { - if (groupHasQuantifier[stackTop]) { - groupHasQuantifier[stackTop - 1] = true; - } - } - if (stackTop >= 0) { - stackTop--; - } - } - } - case '*', '+', '?' -> { - if (stackTop >= 0) { - groupHasQuantifier[stackTop] = true; - } - } - case '{' -> { - if (stackTop >= 0) { - groupHasQuantifier[stackTop] = true; - } - } - } - } - } - - /** - * Counts the number of unescaped opening parentheses in the pattern. - */ - private static int countUnescapedParens(String pattern) { - int count = 0; - for (int i = 0; i < pattern.length(); i++) { - if (pattern.charAt(i) == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - count++; - } - } - return count; - } - - /** - * Computes the maximum nesting depth of parenthesized groups in the pattern. 
- */ - private static int computeMaxNestingDepth(String pattern) { - int maxDepth = 0; - int currentDepth = 0; - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - if (ch == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth++; - if (currentDepth > maxDepth) { - maxDepth = currentDepth; - } - } else if (ch == ')' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth--; - } - } - return maxDepth; - } } diff --git a/src/main/java/com/spandigital/cel2sql/dialect/postgres/PostgresDialect.java b/src/main/java/com/spandigital/cel2sql/dialect/postgres/PostgresDialect.java index 7479882..d880f76 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/postgres/PostgresDialect.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/postgres/PostgresDialect.java @@ -7,6 +7,7 @@ import com.spandigital.cel2sql.dialect.IndexRecommendation; import com.spandigital.cel2sql.dialect.PatternType; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.SqlEmitters; import com.spandigital.cel2sql.dialect.SqlWriter; import com.spandigital.cel2sql.error.ConversionException; @@ -61,14 +62,7 @@ public void writeStringConcat(StringBuilder w, SqlWriter writeLHS, SqlWriter wri @Override public void writeRegexMatch(StringBuilder w, SqlWriter writeTarget, String pattern, boolean caseInsensitive) throws ConversionException { - writeTarget.write(); - if (caseInsensitive) { - w.append(" ~* "); - } else { - w.append(" ~ "); - } - String escaped = pattern.replace("'", "''"); - w.append('\'').append(escaped).append('\''); + SqlEmitters.writeInfixRegex(w, writeTarget, caseInsensitive ? 
" ~* " : " ~ ", pattern); } @Override @@ -162,14 +156,7 @@ public void writeEmptyTypedArray(StringBuilder w, String typeName) { @Override public void writeJSONFieldAccess(StringBuilder w, SqlWriter writeBase, String fieldName, boolean isFinal) throws ConversionException { - writeBase.write(); - String escapedField = escapeJSONFieldName(fieldName); - if (isFinal) { - w.append("->>'"); - } else { - w.append("->'"); - } - w.append(escapedField).append('\''); + SqlEmitters.writeArrowJsonAccess(w, writeBase, fieldName, isFinal, PostgresDialect::escapeJSONFieldName); } @Override @@ -243,20 +230,7 @@ public void writeInterval(StringBuilder w, SqlWriter writeValue, String unit) th @Override public void writeExtract(StringBuilder w, String part, SqlWriter writeExpr, SqlWriter writeTZ) throws ConversionException { - boolean isDOW = "DOW".equals(part); - if (isDOW) { - w.append('('); - } - w.append("EXTRACT(").append(part).append(" FROM "); - writeExpr.write(); - if (writeTZ != null) { - w.append(" AT TIME ZONE "); - writeTZ.write(); - } - w.append(')'); - if (isDOW) { - w.append(" + 6) % 7"); - } + SqlEmitters.writeExtractWithPostgresDow(w, part, writeExpr, writeTZ); } @Override @@ -279,11 +253,7 @@ public void writeContains(StringBuilder w, SqlWriter writeHaystack, SqlWriter wr @Override public void writeSplit(StringBuilder w, SqlWriter writeStr, SqlWriter writeDelim) throws ConversionException { - w.append("STRING_TO_ARRAY("); - writeStr.write(); - w.append(", "); - writeDelim.write(); - w.append(')'); + SqlEmitters.writeBinaryCall(w, "STRING_TO_ARRAY", writeStr, writeDelim); } @Override @@ -297,15 +267,7 @@ public void writeSplitWithLimit(StringBuilder w, SqlWriter writeStr, SqlWriter w @Override public void writeJoin(StringBuilder w, SqlWriter writeArray, SqlWriter writeDelim) throws ConversionException { - w.append("ARRAY_TO_STRING("); - writeArray.write(); - w.append(", "); - if (writeDelim != null) { - writeDelim.write(); - } else { - w.append("''"); - } - w.append(", 
'')"); + SqlEmitters.writeArrayJoin(w, "ARRAY_TO_STRING", writeArray, writeDelim, true); } @Override diff --git a/src/main/java/com/spandigital/cel2sql/dialect/postgres/PostgresRegex.java b/src/main/java/com/spandigital/cel2sql/dialect/postgres/PostgresRegex.java index edc3e1d..4f25d90 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/postgres/PostgresRegex.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/postgres/PostgresRegex.java @@ -1,10 +1,9 @@ package com.spandigital.cel2sql.dialect.postgres; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.RegexSafety; import com.spandigital.cel2sql.error.ConversionException; -import java.util.regex.Pattern; - /** * Converts RE2 regex patterns to POSIX ERE format for PostgreSQL. * Performs security validation to prevent ReDoS attacks (CWE-1333). @@ -13,18 +12,6 @@ */ final class PostgresRegex { - /** Maximum allowed regex pattern length. */ - static final int MAX_PATTERN_LENGTH = 500; - - /** Maximum allowed capture groups in a pattern. */ - static final int MAX_GROUPS = 20; - - /** Maximum allowed nesting depth of parenthesized groups. */ - static final int MAX_NESTING_DEPTH = 10; - - private static final Pattern NESTED_QUANTIFIERS = Pattern.compile("[*+][*+]"); - private static final Pattern QUANTIFIED_ALTERNATION = Pattern.compile("\\([^)]*\\|[^)]*\\)[*+]"); - private PostgresRegex() { } @@ -61,12 +48,7 @@ private PostgresRegex() { */ static RegexResult convertRE2ToPOSIX(String re2Pattern) throws ConversionException { // 1. Check pattern length - if (re2Pattern.length() > MAX_PATTERN_LENGTH) { - throw new ConversionException( - "Invalid pattern in expression", - String.format("pattern length %d exceeds limit of %d characters", - re2Pattern.length(), MAX_PATTERN_LENGTH)); - } + RegexSafety.checkLength(re2Pattern); // 2. 
Extract case-insensitive flag boolean caseInsensitive = false; @@ -98,39 +80,9 @@ static RegexResult convertRE2ToPOSIX(String re2Pattern) throws ConversionExcepti "inline flags other than (?i) are not supported in PostgreSQL POSIX regex"); } - // 4. Detect catastrophic nested quantifiers - if (NESTED_QUANTIFIERS.matcher(pattern).find()) { - throw new ConversionException( - "Invalid pattern in expression", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - - // Check for groups with quantifiers that are themselves quantified - validateNoNestedQuantifiers(pattern); - - // 5. Count and limit capture groups - int groupCount = countUnescapedParens(pattern); - if (groupCount > MAX_GROUPS) { - throw new ConversionException( - "Invalid pattern in expression", - String.format("regex contains %d capture groups, exceeds limit of %d", - groupCount, MAX_GROUPS)); - } - - // 6. Detect exponential alternation patterns - if (QUANTIFIED_ALTERNATION.matcher(pattern).find()) { - throw new ConversionException( - "Invalid pattern in expression", - "regex contains quantified alternation that could cause ReDoS"); - } - - // 7. Check nesting depth - int maxDepth = computeMaxNestingDepth(pattern); - if (maxDepth > MAX_NESTING_DEPTH) { - throw new ConversionException( - "Invalid pattern in expression", - String.format("nesting depth %d exceeds limit of %d", maxDepth, MAX_NESTING_DEPTH)); - } + // 4-7. Shared ReDoS safety checks (nested quantifiers, group count, + // quantified alternation, nesting depth) + RegexSafety.checkReDoS(pattern); // 8. Convert RE2 to POSIX String posix = pattern; @@ -146,97 +98,4 @@ static RegexResult convertRE2ToPOSIX(String re2Pattern) throws ConversionExcepti return new RegexResult(posix, caseInsensitive); } - - /** - * Validates that no quantified groups contain inner quantifiers (nested quantifiers). - * This detects patterns like {@code (a+)+} that can cause catastrophic backtracking. 
- */ - private static void validateNoNestedQuantifiers(String pattern) throws ConversionException { - int depth = 0; - boolean[] groupHasQuantifier = new boolean[pattern.length()]; // oversized but safe - int stackTop = -1; - - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - - // Skip escaped characters - if (i > 0 && pattern.charAt(i - 1) == '\\') { - continue; - } - - switch (ch) { - case '(' -> { - depth++; - stackTop++; - groupHasQuantifier[stackTop] = false; - } - case ')' -> { - if (depth > 0) { - depth--; - if (i + 1 < pattern.length()) { - char next = pattern.charAt(i + 1); - if (next == '*' || next == '+' || next == '?' || next == '{') { - if (stackTop >= 0 && groupHasQuantifier[stackTop]) { - throw new ConversionException( - "Invalid pattern in expression", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - } - } - if (stackTop > 0) { - if (groupHasQuantifier[stackTop]) { - groupHasQuantifier[stackTop - 1] = true; - } - } - if (stackTop >= 0) { - stackTop--; - } - } - } - case '*', '+', '?' -> { - if (stackTop >= 0) { - groupHasQuantifier[stackTop] = true; - } - } - case '{' -> { - if (stackTop >= 0) { - groupHasQuantifier[stackTop] = true; - } - } - } - } - } - - /** - * Counts the number of unescaped opening parentheses in the pattern. - */ - private static int countUnescapedParens(String pattern) { - int count = 0; - for (int i = 0; i < pattern.length(); i++) { - if (pattern.charAt(i) == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - count++; - } - } - return count; - } - - /** - * Computes the maximum nesting depth of parenthesized groups in the pattern. 
- */ - private static int computeMaxNestingDepth(String pattern) { - int maxDepth = 0; - int currentDepth = 0; - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - if (ch == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth++; - if (currentDepth > maxDepth) { - maxDepth = currentDepth; - } - } else if (ch == ')' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth--; - } - } - return maxDepth; - } } diff --git a/src/main/java/com/spandigital/cel2sql/dialect/spark/SparkDialect.java b/src/main/java/com/spandigital/cel2sql/dialect/spark/SparkDialect.java index efeefe5..d6e8bd2 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/spark/SparkDialect.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/spark/SparkDialect.java @@ -7,6 +7,7 @@ import com.spandigital.cel2sql.dialect.IndexRecommendation; import com.spandigital.cel2sql.dialect.PatternType; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.SqlEmitters; import com.spandigital.cel2sql.dialect.SqlWriter; import com.spandigital.cel2sql.error.ConversionException; @@ -80,11 +81,7 @@ public void writeStringConcat(StringBuilder w, SqlWriter writeLHS, SqlWriter wri public void writeRegexMatch(StringBuilder w, SqlWriter writeTarget, String pattern, boolean caseInsensitive) throws ConversionException { // Spark regex uses Java pattern syntax; (?i) inline flag is honoured by the // engine, so caseInsensitive is folded into the pattern by SparkRegex. 
- writeTarget.write(); - w.append(" RLIKE '"); - String escaped = pattern.replace("'", "''"); - w.append(escaped); - w.append('\''); + SqlEmitters.writeInfixRegex(w, writeTarget, " RLIKE ", pattern); } @Override @@ -215,13 +212,7 @@ public void writeJSONArrayLength(StringBuilder w, SqlWriter writeExpr) throws Co @Override public void writeJSONExtractPath(StringBuilder w, List pathSegments, SqlWriter writeRoot) throws ConversionException { - w.append("get_json_object("); - writeRoot.write(); - w.append(", '$"); - for (String segment : pathSegments) { - w.append('.').append(escapeJSONFieldName(segment)); - } - w.append("') IS NOT NULL"); + SqlEmitters.writeJsonPathProbe(w, "get_json_object", writeRoot, pathSegments, " IS NOT NULL", SparkDialect::escapeJSONFieldName); } /** @@ -286,13 +277,7 @@ public void writeExtract(StringBuilder w, String part, SqlWriter writeExpr, SqlW w.append(") - 1)"); return; } - w.append("EXTRACT(").append(part).append(" FROM "); - writeExpr.write(); - if (writeTZ != null) { - w.append(" AT TIME ZONE "); - writeTZ.write(); - } - w.append(')'); + SqlEmitters.writeStandardExtract(w, part, writeExpr, writeTZ); } @Override @@ -316,11 +301,7 @@ public void writeContains(StringBuilder w, SqlWriter writeHaystack, SqlWriter wr @Override public void writeSplit(StringBuilder w, SqlWriter writeStr, SqlWriter writeDelim) throws ConversionException { - w.append("split("); - writeStr.write(); - w.append(", "); - writeDelim.write(); - w.append(')'); + SqlEmitters.writeBinaryCall(w, "split", writeStr, writeDelim); } @Override @@ -335,15 +316,7 @@ public void writeSplitWithLimit(StringBuilder w, SqlWriter writeStr, SqlWriter w @Override public void writeJoin(StringBuilder w, SqlWriter writeArray, SqlWriter writeDelim) throws ConversionException { - w.append("array_join("); - writeArray.write(); - w.append(", "); - if (writeDelim != null) { - writeDelim.write(); - } else { - w.append("''"); - } - w.append(')'); + SqlEmitters.writeArrayJoin(w, "array_join", 
writeArray, writeDelim, false); } @Override diff --git a/src/main/java/com/spandigital/cel2sql/dialect/spark/SparkRegex.java b/src/main/java/com/spandigital/cel2sql/dialect/spark/SparkRegex.java index 9867807..60e28da 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/spark/SparkRegex.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/spark/SparkRegex.java @@ -1,6 +1,7 @@ package com.spandigital.cel2sql.dialect.spark; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.RegexSafety; import com.spandigital.cel2sql.error.ConversionException; import java.util.regex.Pattern; @@ -20,18 +21,6 @@ */ final class SparkRegex { - /** Maximum allowed regex pattern length. */ - static final int MAX_PATTERN_LENGTH = 500; - - /** Maximum allowed capture groups in a pattern. */ - static final int MAX_GROUPS = 20; - - /** Maximum allowed nesting depth of parenthesized groups. */ - static final int MAX_NESTING_DEPTH = 10; - - private static final Pattern NESTED_QUANTIFIERS = Pattern.compile("[*+][*+]"); - private static final Pattern QUANTIFIED_ALTERNATION = Pattern.compile("\\([^)]*\\|[^)]*\\)[*+]"); - private SparkRegex() {} /** @@ -41,12 +30,7 @@ private SparkRegex() {} * — the engine will honour the inline flag if present. */ static RegexResult convertRE2ToSpark(String re2Pattern) throws ConversionException { - if (re2Pattern.length() > MAX_PATTERN_LENGTH) { - throw new ConversionException( - "Invalid pattern in expression", - String.format("pattern length %d exceeds limit of %d characters", - re2Pattern.length(), MAX_PATTERN_LENGTH)); - } + RegexSafety.checkLength(re2Pattern); try { Pattern.compile(re2Pattern); } catch (PatternSyntaxException e) { @@ -69,31 +53,10 @@ static RegexResult convertRE2ToSpark(String re2Pattern) throws ConversionExcepti "Invalid pattern in expression", "named capture groups (?P...) 
are not supported in Spark regex"); } - if (NESTED_QUANTIFIERS.matcher(re2Pattern).find()) { - throw new ConversionException( - "Invalid pattern in expression", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - validateNoNestedQuantifiers(re2Pattern); + // Shared ReDoS safety checks (nested quantifiers, group count, + // quantified alternation, nesting depth) + RegexSafety.checkReDoS(re2Pattern); - int groupCount = countUnescapedParens(re2Pattern); - if (groupCount > MAX_GROUPS) { - throw new ConversionException( - "Invalid pattern in expression", - String.format("regex contains %d capture groups, exceeds limit of %d", - groupCount, MAX_GROUPS)); - } - if (QUANTIFIED_ALTERNATION.matcher(re2Pattern).find()) { - throw new ConversionException( - "Invalid pattern in expression", - "regex contains quantified alternation that could cause ReDoS"); - } - int maxDepth = computeMaxNestingDepth(re2Pattern); - if (maxDepth > MAX_NESTING_DEPTH) { - throw new ConversionException( - "Invalid pattern in expression", - String.format("nesting depth %d exceeds limit of %d", maxDepth, MAX_NESTING_DEPTH)); - } if (re2Pattern.contains("(?m") || re2Pattern.contains("(?s") || re2Pattern.contains("(?-")) { throw new ConversionException( "Invalid pattern in expression", @@ -101,68 +64,4 @@ static RegexResult convertRE2ToSpark(String re2Pattern) throws ConversionExcepti } return new RegexResult(re2Pattern, false); } - - private static void validateNoNestedQuantifiers(String pattern) throws ConversionException { - int depth = 0; - boolean[] groupHasQuantifier = new boolean[pattern.length() + 1]; - int stackTop = -1; - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - if (i > 0 && pattern.charAt(i - 1) == '\\') continue; - switch (ch) { - case '(' -> { - depth++; - stackTop++; - groupHasQuantifier[stackTop] = false; - } - case ')' -> { - if (depth > 0) { - depth--; - if (i + 1 < pattern.length()) { - char next = pattern.charAt(i + 1); 
- if (next == '*' || next == '+' || next == '?' || next == '{') { - if (stackTop >= 0 && groupHasQuantifier[stackTop]) { - throw new ConversionException( - "Invalid pattern in expression", - "regex contains catastrophic nested quantifiers that could cause ReDoS"); - } - } - } - if (stackTop > 0 && groupHasQuantifier[stackTop]) { - groupHasQuantifier[stackTop - 1] = true; - } - if (stackTop >= 0) stackTop--; - } - } - case '*', '+', '?', '{' -> { - if (stackTop >= 0) groupHasQuantifier[stackTop] = true; - } - } - } - } - - private static int countUnescapedParens(String pattern) { - int count = 0; - for (int i = 0; i < pattern.length(); i++) { - if (pattern.charAt(i) == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - count++; - } - } - return count; - } - - private static int computeMaxNestingDepth(String pattern) { - int maxDepth = 0; - int currentDepth = 0; - for (int i = 0; i < pattern.length(); i++) { - char ch = pattern.charAt(i); - if (ch == '(' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth++; - if (currentDepth > maxDepth) maxDepth = currentDepth; - } else if (ch == ')' && (i == 0 || pattern.charAt(i - 1) != '\\')) { - currentDepth--; - } - } - return maxDepth; - } } diff --git a/src/main/java/com/spandigital/cel2sql/dialect/sqlite/SqliteDialect.java b/src/main/java/com/spandigital/cel2sql/dialect/sqlite/SqliteDialect.java index d0cd8ab..98aac74 100644 --- a/src/main/java/com/spandigital/cel2sql/dialect/sqlite/SqliteDialect.java +++ b/src/main/java/com/spandigital/cel2sql/dialect/sqlite/SqliteDialect.java @@ -7,6 +7,7 @@ import com.spandigital.cel2sql.dialect.IndexRecommendation; import com.spandigital.cel2sql.dialect.PatternType; import com.spandigital.cel2sql.dialect.RegexResult; +import com.spandigital.cel2sql.dialect.SqlEmitters; import com.spandigital.cel2sql.dialect.SqlWriter; import com.spandigital.cel2sql.error.ConversionException; @@ -185,31 +186,17 @@ public void writeJSONArrayLength(StringBuilder w, SqlWriter writeExpr) throws 
Co @Override public void writeJSONExtractPath(StringBuilder w, List pathSegments, SqlWriter writeRoot) throws ConversionException { - w.append("json_type("); - writeRoot.write(); - w.append(", '$"); - for (String segment : pathSegments) { - w.append('.').append(escapeJSONFieldName(segment)); - } - w.append("') IS NOT NULL"); + SqlEmitters.writeJsonPathProbe(w, "json_type", writeRoot, pathSegments, " IS NOT NULL", SqliteDialect::escapeJSONFieldName); } @Override public void writeJSONArrayMembership(StringBuilder w, String jsonFunc, SqlWriter writeElem, SqlWriter writeArray) throws ConversionException { - w.append("EXISTS (SELECT 1 FROM json_each("); - writeArray.write(); - w.append(") WHERE value = "); - writeElem.write(); - w.append(')'); + SqlEmitters.writeJsonEachMembership(w, writeArray, writeElem); } @Override public void writeNestedJSONArrayMembership(StringBuilder w, SqlWriter writeElem, SqlWriter writeArray) throws ConversionException { - w.append("EXISTS (SELECT 1 FROM json_each("); - writeArray.write(); - w.append(") WHERE value = "); - writeElem.write(); - w.append(')'); + SqlEmitters.writeJsonEachMembership(w, writeArray, writeElem); } // --- Timestamps ---