From b8592188fd881b1bf51795564606e699a3092db0 Mon Sep 17 00:00:00 2001 From: Jan Niestadt Date: Fri, 21 Jun 2024 10:56:31 +0200 Subject: [PATCH] Add raw strings to BCQL. If you want to match a literal string, not a regular expression, you had to escape various special characters using a backslash. Now, you can use a raw string to achieve the same: [lemma=r'<'] This frees clients who want to match literally from having to know the exact escaping rules for Lucene's regex engine. --- .../corpusql/CorpusQueryLanguageParser.java | 11 +- .../GeneratedCorpusQueryLanguageParser.java | 22 +- ...tedCorpusQueryLanguageParserConstants.java | 16 +- ...CorpusQueryLanguageParserTokenManager.java | 232 +++++++++--------- .../inl/blacklab/queryParser/corpusql/cql.jj | 4 +- site/docs/guide/corpus-query-language.md | 12 +- .../src/main/java/nl/inl/util/StringUtil.java | 4 +- 7 files changed, 163 insertions(+), 138 deletions(-) diff --git a/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/CorpusQueryLanguageParser.java b/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/CorpusQueryLanguageParser.java index 290563ee2..fc2cc1bcf 100644 --- a/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/CorpusQueryLanguageParser.java +++ b/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/CorpusQueryLanguageParser.java @@ -66,6 +66,10 @@ String chopEnds(String input) { } String getStringBetweenQuotes(String input) throws SingleQuotesException { + boolean isRaw = input.charAt(0) == 'r'; + if (isRaw) + input = input.substring(1); + String quoteUsed = input.substring(0, 1); input = chopEnds(input); // eliminate quotes if (!allowSingleQuotes && quoteUsed.equals("\'")) @@ -73,7 +77,12 @@ String getStringBetweenQuotes(String input) throws SingleQuotesException { // Unescape ONLY the quotes found around this string // Leave other escaped characters as-is for Lucene's regex engine - return StringUtil.unescapeQuote(input, quoteUsed); + String 
quotedUnescaped = StringUtil.unescapeQuote(input, quoteUsed); + if (isRaw) { + // We want to find this string as-is; create a regex that will match this + return StringUtil.escapeLuceneRegexCharacters(quotedUnescaped); + } + return quotedUnescaped; } TextPatternTerm simplePattern(String str) { diff --git a/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParser.java b/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParser.java index f8f36b9f1..67c423f15 100644 --- a/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParser.java +++ b/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParser.java @@ -82,10 +82,10 @@ public class GeneratedCorpusQueryLanguageParser implements GeneratedCorpusQueryL } case DEFAULT_VALUE: case ROOT_DEP_OP: - case NAME: - case NUMBER: case QUOTED_STRING: case SINGLE_QUOTED_STRING: + case NAME: + case NUMBER: case 25: case 26: case 28: @@ -652,9 +652,9 @@ public class GeneratedCorpusQueryLanguageParser implements GeneratedCorpusQueryL Integer[] rep = null; switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case DEFAULT_VALUE: - case NAME: case QUOTED_STRING: case SINGLE_QUOTED_STRING: + case NAME: case 25: case 26: case 42:{ @@ -724,10 +724,10 @@ public class GeneratedCorpusQueryLanguageParser implements GeneratedCorpusQueryL switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) { case DEFAULT_VALUE: case ROOT_DEP_OP: - case NAME: - case NUMBER: case QUOTED_STRING: case SINGLE_QUOTED_STRING: + case NAME: + case NUMBER: case 25: case 26: case 28: @@ -1103,9 +1103,9 @@ private boolean jj_3R_40() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(17)) { + if (jj_scan_token(13)) { jj_scanpos = xsp; - if (jj_scan_token(18)) return true; + if (jj_scan_token(14)) return true; } return false; } @@ -1191,9 +1191,9 @@ private boolean jj_3R_4() { Token xsp; xsp = jj_scanpos; - if (jj_scan_token(13)) { + if 
(jj_scan_token(15)) { jj_scanpos = xsp; - if (jj_scan_token(15)) return true; + if (jj_scan_token(17)) return true; } return false; } @@ -1600,7 +1600,7 @@ private boolean jj_3R_26() if (jj_scan_token(38)) return true; Token xsp; xsp = jj_scanpos; - if (jj_scan_token(15)) jj_scanpos = xsp; + if (jj_scan_token(17)) jj_scanpos = xsp; if (jj_scan_token(37)) return true; return false; } @@ -1706,7 +1706,7 @@ private boolean jj_3R_46() jj_la1_init_1(); } private static void jj_la1_init_0() { - jj_la1_0 = new int[] {0x1607a600,0x80000,0x0,0x3f00000,0x3f00000,0x20000000,0x14060000,0x180,0x180,0xa000,0x1800,0xa000,0x0,0x2000,0x0,0x60000,0x8000,0x0,0x0,0x0,0x0,0xa000,0x6062200,0x16062200,0x1606a600,0x0,0xa000,0x4000,0x14002000,0x60200,0x0,0x14000000,0x4000,0x4000,0x2000,0x0,}; + jj_la1_0 = new int[] {0x1606e600,0x80000,0x0,0x3f00000,0x3f00000,0x20000000,0x14006000,0x180,0x180,0x28000,0x1800,0x28000,0x0,0x8000,0x0,0x6000,0x20000,0x0,0x0,0x0,0x0,0x28000,0x600e200,0x1600e200,0x1602e600,0x0,0x28000,0x10000,0x14008000,0x6200,0x0,0x14000000,0x10000,0x10000,0x8000,0x0,}; } private static void jj_la1_init_1() { jj_la1_1 = new int[] {0x400,0x0,0x380,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x1,0x0,0x0,0xe,0x10,0x380,0x380,0x0,0x400,0x400,0x400,0x40,0x0,0x0,0x0,0x400,0x380,0x0,0x0,0x0,0x0,0x1,}; diff --git a/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParserConstants.java b/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParserConstants.java index 6889704d4..51f7eedd1 100644 --- a/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParserConstants.java +++ b/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParserConstants.java @@ -27,17 +27,17 @@ public interface GeneratedCorpusQueryLanguageParserConstants { /** RegularExpression Id. */ int ALIGNMENT_OP = 12; /** RegularExpression Id. 
*/ - int NAME = 13; + int QUOTED_STRING = 13; /** RegularExpression Id. */ - int FLAGS = 14; + int SINGLE_QUOTED_STRING = 14; /** RegularExpression Id. */ - int NUMBER = 15; + int NAME = 15; /** RegularExpression Id. */ - int SETTINGS_OP = 16; + int FLAGS = 16; /** RegularExpression Id. */ - int QUOTED_STRING = 17; + int NUMBER = 17; /** RegularExpression Id. */ - int SINGLE_QUOTED_STRING = 18; + int SETTINGS_OP = 18; /** Lexical state. */ int DEFAULT = 0; @@ -57,12 +57,12 @@ public interface GeneratedCorpusQueryLanguageParserConstants { "", "", "", + "", + "", "", "", "", "", - "", - "", "\"::\"", "\"=\"", "\"!=\"", diff --git a/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParserTokenManager.java b/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParserTokenManager.java index 588ba1660..b146c8161 100644 --- a/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParserTokenManager.java +++ b/query-parser/src/main/java/nl/inl/blacklab/queryParser/corpusql/GeneratedCorpusQueryLanguageParserTokenManager.java @@ -393,7 +393,7 @@ private int jjMoveNfa_0(int startState, int curPos) catch(java.io.IOException e) { throw new Error("Internal Error"); } curPos = 0; int startsAt = 0; - jjnewStateCnt = 61; + jjnewStateCnt = 62; int i = 1; jjstateSet[0] = startState; int kind = 0x7fffffff; @@ -411,16 +411,16 @@ private int jjMoveNfa_0(int startState, int curPos) case 0: if ((0x3ff000000000000L & l) != 0L) { - if (kind > 15) - kind = 15; - { jjCheckNAdd(42); } + if (kind > 17) + kind = 17; + { jjCheckNAdd(52); } } + else if (curChar == 37) + jjstateSet[jjnewStateCnt++] = 51; else if (curChar == 39) { jjCheckNAddStates(0, 2); } else if (curChar == 34) { jjCheckNAddStates(3, 5); } - else if (curChar == 37) - jjstateSet[jjnewStateCnt++] = 41; else if (curChar == 61) { jjCheckNAddTwoStates(31, 37); } else if (curChar == 45) @@ -601,87 +601,87 @@ else if 
(curChar == 35) if (curChar == 61) jjstateSet[jjnewStateCnt++] = 35; break; + case 38: + if (curChar == 34) + { jjCheckNAddStates(3, 5); } + break; case 39: - if ((0x3ff200000000000L & l) == 0L) - break; - if (kind > 13) - kind = 13; - jjstateSet[jjnewStateCnt++] = 39; + if ((0xfffffffbffffffffL & l) != 0L) + { jjCheckNAddStates(3, 5); } break; - case 40: - if (curChar == 37) - jjstateSet[jjnewStateCnt++] = 41; + case 41: + { jjCheckNAddStates(3, 5); } break; case 42: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 15) - kind = 15; - { jjCheckNAdd(42); } + if (curChar == 34 && kind > 13) + kind = 13; break; - case 44: - if ((0x3ff000000000000L & l) != 0L) - { jjAddStates(21, 22); } + case 43: + if (curChar == 39) + { jjCheckNAddStates(0, 2); } break; - case 45: - if (curChar == 61) - { jjCheckNAdd(46); } + case 44: + if ((0xffffff7fffffffffL & l) != 0L) + { jjCheckNAddStates(0, 2); } break; case 46: - if ((0x3ff000000000000L & l) == 0L) - break; - if (kind > 16) - kind = 16; - { jjCheckNAddTwoStates(46, 47); } + { jjCheckNAddStates(0, 2); } break; case 47: - if (curChar == 44) - { jjCheckNAdd(48); } - break; - case 48: - if ((0x3ff000000000000L & l) != 0L) - { jjCheckNAddTwoStates(48, 49); } + if (curChar == 39 && kind > 14) + kind = 14; break; case 49: - if (curChar == 61) - { jjCheckNAdd(50); } - break; - case 50: - if ((0x3ff000000000000L & l) == 0L) + if ((0x3ff200000000000L & l) == 0L) break; - if (kind > 16) - kind = 16; - { jjCheckNAddTwoStates(47, 50); } + if (kind > 15) + kind = 15; + jjstateSet[jjnewStateCnt++] = 49; break; - case 51: - if (curChar == 34) - { jjCheckNAddStates(3, 5); } + case 50: + if (curChar == 37) + jjstateSet[jjnewStateCnt++] = 51; break; case 52: - if ((0xfffffffbffffffffL & l) != 0L) - { jjCheckNAddStates(3, 5); } + if ((0x3ff000000000000L & l) == 0L) + break; + if (kind > 17) + kind = 17; + { jjCheckNAdd(52); } break; case 54: - { jjCheckNAddStates(3, 5); } + if ((0x3ff000000000000L & l) != 0L) + { jjAddStates(21, 22); 
} break; case 55: - if (curChar == 34 && kind > 17) - kind = 17; + if (curChar == 61) + { jjCheckNAdd(56); } break; case 56: - if (curChar == 39) - { jjCheckNAddStates(0, 2); } + if ((0x3ff000000000000L & l) == 0L) + break; + if (kind > 18) + kind = 18; + { jjCheckNAddTwoStates(56, 57); } break; case 57: - if ((0xffffff7fffffffffL & l) != 0L) - { jjCheckNAddStates(0, 2); } + if (curChar == 44) + { jjCheckNAdd(58); } + break; + case 58: + if ((0x3ff000000000000L & l) != 0L) + { jjCheckNAddTwoStates(58, 59); } break; case 59: - { jjCheckNAddStates(0, 2); } + if (curChar == 61) + { jjCheckNAdd(60); } break; case 60: - if (curChar == 39 && kind > 18) + if ((0x3ff000000000000L & l) == 0L) + break; + if (kind > 18) kind = 18; + { jjCheckNAddTwoStates(57, 60); } break; default : break; } @@ -697,14 +697,16 @@ else if (curChar < 128) case 0: if ((0x7fffffe87fffffeL & l) != 0L) { - if (kind > 13) - kind = 13; - { jjCheckNAdd(39); } + if (kind > 15) + kind = 15; + { jjCheckNAdd(49); } } else if (curChar == 64) - { jjCheckNAdd(44); } + { jjCheckNAdd(54); } else if (curChar == 94) jjstateSet[jjnewStateCnt++] = 12; + if ((0x4000000040000L & l) != 0L) + { jjAddStates(23, 24); } break; case 1: if (kind > 5) @@ -764,68 +766,72 @@ else if (curChar == 94) kind = 12; jjstateSet[jjnewStateCnt++] = 36; break; - case 38: case 39: - if ((0x7fffffe87fffffeL & l) == 0L) - break; - if (kind > 13) - kind = 13; - { jjCheckNAdd(39); } + if ((0xffffffffefffffffL & l) != 0L) + { jjCheckNAddStates(3, 5); } break; - case 41: - if ((0x101800001018L & l) == 0L) - break; - if (kind > 14) - kind = 14; - jjstateSet[jjnewStateCnt++] = 41; + case 40: + if (curChar == 92) + jjstateSet[jjnewStateCnt++] = 41; break; - case 43: - if (curChar == 64) - { jjCheckNAdd(44); } + case 41: + { jjCheckNAddStates(3, 5); } break; case 44: - if ((0x7fffffe87fffffeL & l) != 0L) - { jjCheckNAddTwoStates(44, 45); } + if ((0xffffffffefffffffL & l) != 0L) + { jjCheckNAddStates(0, 2); } + break; + case 45: + if (curChar == 
92) + jjstateSet[jjnewStateCnt++] = 46; break; case 46: - if ((0x7fffffe87fffffeL & l) == 0L) - break; - if (kind > 16) - kind = 16; - { jjCheckNAddTwoStates(46, 47); } + { jjCheckNAddStates(0, 2); } break; case 48: - if ((0x7fffffe87fffffeL & l) != 0L) - { jjAddStates(23, 24); } - break; - case 50: + case 49: if ((0x7fffffe87fffffeL & l) == 0L) break; + if (kind > 15) + kind = 15; + { jjCheckNAdd(49); } + break; + case 51: + if ((0x101800001018L & l) == 0L) + break; if (kind > 16) kind = 16; - { jjCheckNAddTwoStates(47, 50); } - break; - case 52: - if ((0xffffffffefffffffL & l) != 0L) - { jjCheckNAddStates(3, 5); } + jjstateSet[jjnewStateCnt++] = 51; break; case 53: - if (curChar == 92) - jjstateSet[jjnewStateCnt++] = 54; + if (curChar == 64) + { jjCheckNAdd(54); } break; case 54: - { jjCheckNAddStates(3, 5); } + if ((0x7fffffe87fffffeL & l) != 0L) + { jjCheckNAddTwoStates(54, 55); } break; - case 57: - if ((0xffffffffefffffffL & l) != 0L) - { jjCheckNAddStates(0, 2); } + case 56: + if ((0x7fffffe87fffffeL & l) == 0L) + break; + if (kind > 18) + kind = 18; + { jjCheckNAddTwoStates(56, 57); } break; case 58: - if (curChar == 92) - jjstateSet[jjnewStateCnt++] = 59; + if ((0x7fffffe87fffffeL & l) != 0L) + { jjAddStates(25, 26); } break; - case 59: - { jjCheckNAddStates(0, 2); } + case 60: + if ((0x7fffffe87fffffeL & l) == 0L) + break; + if (kind > 18) + kind = 18; + { jjCheckNAddTwoStates(57, 60); } + break; + case 61: + if ((0x4000000040000L & l) != 0L) + { jjAddStates(23, 24); } break; default : break; } @@ -872,13 +878,13 @@ else if (curChar == 94) if (jjCanMove_0(hiByte, i1, i2, l1, l2)) { jjCheckNAddStates(18, 20); } break; - case 52: - case 54: + case 39: + case 41: if (jjCanMove_0(hiByte, i1, i2, l1, l2)) { jjCheckNAddStates(3, 5); } break; - case 57: - case 59: + case 44: + case 46: if (jjCanMove_0(hiByte, i1, i2, l1, l2)) { jjCheckNAddStates(0, 2); } break; @@ -893,7 +899,7 @@ else if (curChar == 94) kind = 0x7fffffff; } ++curPos; - if ((i = jjnewStateCnt) 
== (startsAt = 61 - (jjnewStateCnt = startsAt))) + if ((i = jjnewStateCnt) == (startsAt = 62 - (jjnewStateCnt = startsAt))) break; try { curChar = input_stream.readChar(); } catch(java.io.IOException e) { break; } @@ -919,8 +925,8 @@ else if (jjmatchedPos == strPos && jjmatchedKind > strKind) return toRet; } static final int[] jjnextStates = { - 57, 58, 60, 52, 53, 55, 1, 2, 4, 6, 7, 10, 14, 15, 19, 24, - 25, 29, 32, 33, 37, 44, 45, 48, 49, + 44, 45, 47, 39, 40, 42, 1, 2, 4, 6, 7, 10, 14, 15, 19, 24, + 25, 29, 32, 33, 37, 54, 55, 38, 43, 58, 59, }; private static final boolean jjCanMove_0(int hiByte, int i1, int i2, long l1, long l2) { @@ -1103,7 +1109,7 @@ private void ReInitRounds() { int i; jjround = 0x80000001; - for (i = 61; i-- > 0;) + for (i = 62; i-- > 0;) jjrounds[i] = 0x80000000; } @@ -1139,8 +1145,8 @@ public void SwitchTo(int lexState) }; protected JavaCharStream input_stream; - private final int[] jjrounds = new int[61]; - private final int[] jjstateSet = new int[2 * 61]; + private final int[] jjrounds = new int[62]; + private final int[] jjstateSet = new int[2 * 62]; protected int curChar; diff --git a/query-parser/src/main/javacc/nl/inl/blacklab/queryParser/corpusql/cql.jj b/query-parser/src/main/javacc/nl/inl/blacklab/queryParser/corpusql/cql.jj index 6a5bac3a6..44559586c 100644 --- a/query-parser/src/main/javacc/nl/inl/blacklab/queryParser/corpusql/cql.jj +++ b/query-parser/src/main/javacc/nl/inl/blacklab/queryParser/corpusql/cql.jj @@ -100,13 +100,13 @@ TOKEN [IGNORE_CASE]: | ", " ", "'", "\"", "]"] )* )? "->" (["A"-"Z","a"-"z","_","-","0"-"9"])* > | ", " ", "'", "\"", "]"] )* )? "->" (["A"-"Z","a"-"z","_","-","0"-"9"])* > | ", " ", "'", "\"", "]"] )* )? 
"=>" (["A"-"Z","a"-"z","_","-","0"-"9"])* > +| +| | | | | -| -| } // --- Grammar rules start here --- diff --git a/site/docs/guide/corpus-query-language.md b/site/docs/guide/corpus-query-language.md index ad5918f13..b9b58444b 100644 --- a/site/docs/guide/corpus-query-language.md +++ b/site/docs/guide/corpus-query-language.md @@ -60,6 +60,16 @@ And to find lemmas starting with _under_, use: Explaining regular expression syntax is beyond the scope of this document, but for a complete overview, see [regular-expressions.info](http://www.regular-expressions.info/). +::: details Escaping and raw strings +To find characters with special meaning in a regular expression, such as the period, you need to escape them with a backslash: + + [lemma='etc\.'] + +Alternatively, you can use a "raw string" by prefixing the string with an `r`: + + [lemma=r'etc.'] +::: + #### Matching any token Sometimes you want to match any token, regardless of its value. @@ -653,7 +663,7 @@ BlackLab's CQL syntax and behaviour differs in a few ways from CWBs, although th For now, here's what you should know: * Case-insensitive search is the default in BlackLab, while CWB and Sketch Engine use case-sensitive search as the default. If you want to match a term case-sensitively, use `'(?-i)..'` or `'(?c)..'`. -* If you want to match a string literally, not as a regular expression, use backslash escaping: `'e\.g\.'`. +* If you want to match a string literally, not as a regular expression, use backslash escaping (`'e\.g\.'`) or a raw string (`r'e.g.'`) * BlackLab supports result set manipulation such as: sorting (including on specific context words), grouping/frequency distribution, subsets, sampling, setting context size, etc. However, these are supported through the REST and Java APIs, not through a command interface like in CWB. See [BlackLab Server overview](/server/overview.md)). 
* Querying XML elements and attributes looks natural in BlackLab: `<s/>` means "sentences", `<s>` means "starts of sentences", `<s type='A'>` means "sentence tags with a type attribute with value A". This natural syntax differs from CWBs in some places, however, particularly when matching XML attributes. * In capture constraints (expressions occurring after `::`), only literal matching (no regex matching) is currently supported. diff --git a/util/src/main/java/nl/inl/util/StringUtil.java b/util/src/main/java/nl/inl/util/StringUtil.java index d6bb23ff1..9aef4a03b 100644 --- a/util/src/main/java/nl/inl/util/StringUtil.java +++ b/util/src/main/java/nl/inl/util/StringUtil.java @@ -36,8 +36,8 @@ private StringUtil() { private static final Pattern PATT_REGEX_CHARACTERS_JAVA = Pattern.compile("([|\\\\?!*+()<>\\[\\]\\-=^${}.])"); /** Any characters that should be escaped when constructing a Lucene regular expression matching a value. - (compared to Java, doesn't escape <>-=!^$) but does escape ") */ - private static final Pattern PATT_REGEX_CHARACTERS_LUCENE = Pattern.compile("([|\\\\?*+()\\[\\]\\{}.\"])"); + (compared to Java, doesn't escape >-=!^$, but does escape " and <) */ + private static final Pattern PATT_REGEX_CHARACTERS_LUCENE = Pattern.compile("([|\\\\?*+()\\[\\]\\{}\\<.\"])"); /** * Escape regex special characters