From efc34086b047acf0b8381680e7f47d6e4f1c5d58 Mon Sep 17 00:00:00 2001 From: matt rice Date: Sat, 2 May 2026 17:42:01 -0700 Subject: [PATCH] Stop trimming escaped spaces off the end regex --- lrlex/src/lib/parser.rs | 49 +++++++++++++++++++++++- lrpar/cttests/src/regex_trailing_ws.test | 19 +++++++++ 2 files changed, 67 insertions(+), 1 deletion(-) create mode 100644 lrpar/cttests/src/regex_trailing_ws.test diff --git a/lrlex/src/lib/parser.rs b/lrlex/src/lib/parser.rs index 784a83f11..170e929a6 100644 --- a/lrlex/src/lib/parser.rs +++ b/lrlex/src/lib/parser.rs @@ -490,7 +490,7 @@ where if !dupe { let (start_states, re_str) = - self.parse_start_states(i, line[..rspace].trim_end_matches(matches_whitespace))?; + self.parse_start_states(i, trim_end_unescaped(&line[..rspace]))?; let rules_len = self.rules.len(); let tok_id = LexerTypesT::StorageT::try_from(rules_len) .unwrap_or_else(|_| panic!("StorageT::try_from \ @@ -685,6 +685,20 @@ where } } +fn trim_end_unescaped(s: &str) -> &str { + let trimmed = s.trim_end_matches(matches_whitespace); + if trimmed.len() == s.len() { + return s; + } + // If the number of backslashes is odd then the first space in the trimmed portion is escaped so re-add it. + if trimmed.chars().rev().take_while(|&c| c == '\\').count() % 2 == 1 { + // Panic safety: the trimmed portion is at least one char long. + &s[..trimmed.len() + s[trimmed.len()..].chars().next().unwrap().len_utf8()] + } else { + trimmed + } +} + #[cfg(test)] mod test { use super::*; @@ -1826,4 +1840,37 @@ b "A" 18, ); } + + #[test] + fn unescaped_trim() { + let escapes = [ + (r#"\ "#, r#"\ "#), + (r#"\ "#, r#"\ "#), + (r#"\\ "#, r#"\\"#), + (r#"\\ "#, r#"\\"#), + (r#"\\\ "#, r#"\\\ "#), + (r#"\\\ "#, r#"\\\ "#), + (r#"\\\\ "#, r#"\\\\"#), + (r#"\\\\ "#, r#"\\\\"#), + (r#"x"#, r#"x"#), + (r#"x\ "#, r#"x\ "#), + (r#"x\ "#, r#"x\ "#), + (r#"x\\ "#, r#"x\\"#), + (r#"x\\ "#, r#"x\\"#), + (r#"x\\\ "#, r#"x\\\ "#), + (r#"x\\\ "#, r#"x\\\ "#), + (r#"x\\\\ "#, r#"x\\\\"#), + (r#"x\\\\ "#, r#"x\\\\"#), + (r#"x\ y "#, r#"x\ y"#), + (r#"x\ y\ "#, r#"x\ y\ "#), + (r#"x\ y\\ "#, r#"x\ y\\"#), + (r#"x\ y "#, r#"x\ y"#), + (r#"x\ y\ "#, r#"x\ y\ "#), + (r#"x\ y\\ "#, r#"x\ y\\"#), + ]; + for (escaped, expected) in escapes { + let trimmed = trim_end_unescaped(escaped); + assert_eq!(expected, trimmed) + } + } } diff --git a/lrpar/cttests/src/regex_trailing_ws.test b/lrpar/cttests/src/regex_trailing_ws.test new file mode 100644 index 000000000..ce5b880c6 --- /dev/null +++ b/lrpar/cttests/src/regex_trailing_ws.test @@ -0,0 +1,19 @@ +name: Test with regex containing trailing ws +grammar: | + %grmtools { + yacckind: Original(YaccOriginalActionKind::NoAction), + recoverer: RecoveryKind::None, + test_files: ["*.input_trailing_ws"], + } + %start Expr + %% + Expr: "trailing"; + +lexer: | + %% + [a-zA-Z]\ "trailing" + [\n\t] ; + +extra_files: + input1.input_trailing_ws: | + a