Skip to content

Commit

Permalink
Add raw strings to BCQL.
Browse files Browse the repository at this point in the history
Previously, if you wanted to match a literal string rather than a regular
expression, you had to escape various special characters using a backslash.
Now you can use a raw string to achieve the same:

  [lemma=r'<']

This frees clients who want to match literally from having to know
the exact escaping rules for Lucene's regex engine.
  • Loading branch information
jan-niestadt committed Jun 21, 2024
1 parent 6fc58ea commit b859218
Show file tree
Hide file tree
Showing 7 changed files with 163 additions and 138 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -66,14 +66,23 @@ String chopEnds(String input) {
}

/**
 * Extracts the contents of a quoted string token, with optional raw-string support.
 *
 * <p>A token starting with {@code r} (e.g. {@code r'<'}) is treated as a raw string:
 * its contents are escaped so that the resulting regex matches them as-is.
 * For a regular string, only escaped occurrences of the surrounding quote character
 * are unescaped; other escaped characters are left as-is for Lucene's regex engine.
 *
 * @param input the token text, including its surrounding quotes and, for raw strings,
 *              the leading {@code r}
 * @return the string between the quotes, in the form of a regex
 * @throws SingleQuotesException if the string is single-quoted while single quotes
 *                               are not allowed
 */
String getStringBetweenQuotes(String input) throws SingleQuotesException {
    boolean isRaw = input.charAt(0) == 'r';
    if (isRaw) {
        // Drop the raw-string prefix so the quote character is first again
        input = input.substring(1);
    }

    String quoteUsed = input.substring(0, 1);
    input = chopEnds(input); // eliminate quotes
    if (!allowSingleQuotes && quoteUsed.equals("\'")) {
        throw new SingleQuotesException();
    }

    // Unescape ONLY the quotes found around this string
    // Leave other escaped characters as-is for Lucene's regex engine
    String quotedUnescaped = StringUtil.unescapeQuote(input, quoteUsed);
    if (isRaw) {
        // We want to find this string as-is; create a regex that will match this
        return StringUtil.escapeLuceneRegexCharacters(quotedUnescaped);
    }
    return quotedUnescaped;
}

TextPatternTerm simplePattern(String str) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,10 +82,10 @@ public class GeneratedCorpusQueryLanguageParser implements GeneratedCorpusQueryL
}
case DEFAULT_VALUE:
case ROOT_DEP_OP:
case NAME:
case NUMBER:
case QUOTED_STRING:
case SINGLE_QUOTED_STRING:
case NAME:
case NUMBER:
case 25:
case 26:
case 28:
Expand Down Expand Up @@ -652,9 +652,9 @@ public class GeneratedCorpusQueryLanguageParser implements GeneratedCorpusQueryL
Integer[] rep = null;
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
case DEFAULT_VALUE:
case NAME:
case QUOTED_STRING:
case SINGLE_QUOTED_STRING:
case NAME:
case 25:
case 26:
case 42:{
Expand Down Expand Up @@ -724,10 +724,10 @@ public class GeneratedCorpusQueryLanguageParser implements GeneratedCorpusQueryL
switch ((jj_ntk==-1)?jj_ntk_f():jj_ntk) {
case DEFAULT_VALUE:
case ROOT_DEP_OP:
case NAME:
case NUMBER:
case QUOTED_STRING:
case SINGLE_QUOTED_STRING:
case NAME:
case NUMBER:
case 25:
case 26:
case 28:
Expand Down Expand Up @@ -1103,9 +1103,9 @@ private boolean jj_3R_40()
{
Token xsp;
xsp = jj_scanpos;
if (jj_scan_token(17)) {
if (jj_scan_token(13)) {
jj_scanpos = xsp;
if (jj_scan_token(18)) return true;
if (jj_scan_token(14)) return true;
}
return false;
}
Expand Down Expand Up @@ -1191,9 +1191,9 @@ private boolean jj_3R_4()
{
Token xsp;
xsp = jj_scanpos;
if (jj_scan_token(13)) {
if (jj_scan_token(15)) {
jj_scanpos = xsp;
if (jj_scan_token(15)) return true;
if (jj_scan_token(17)) return true;
}
return false;
}
Expand Down Expand Up @@ -1600,7 +1600,7 @@ private boolean jj_3R_26()
if (jj_scan_token(38)) return true;
Token xsp;
xsp = jj_scanpos;
if (jj_scan_token(15)) jj_scanpos = xsp;
if (jj_scan_token(17)) jj_scanpos = xsp;
if (jj_scan_token(37)) return true;
return false;
}
Expand Down Expand Up @@ -1706,7 +1706,7 @@ private boolean jj_3R_46()
jj_la1_init_1();
}
private static void jj_la1_init_0() {
jj_la1_0 = new int[] {0x1607a600,0x80000,0x0,0x3f00000,0x3f00000,0x20000000,0x14060000,0x180,0x180,0xa000,0x1800,0xa000,0x0,0x2000,0x0,0x60000,0x8000,0x0,0x0,0x0,0x0,0xa000,0x6062200,0x16062200,0x1606a600,0x0,0xa000,0x4000,0x14002000,0x60200,0x0,0x14000000,0x4000,0x4000,0x2000,0x0,};
jj_la1_0 = new int[] {0x1606e600,0x80000,0x0,0x3f00000,0x3f00000,0x20000000,0x14006000,0x180,0x180,0x28000,0x1800,0x28000,0x0,0x8000,0x0,0x6000,0x20000,0x0,0x0,0x0,0x0,0x28000,0x600e200,0x1600e200,0x1602e600,0x0,0x28000,0x10000,0x14008000,0x6200,0x0,0x14000000,0x10000,0x10000,0x8000,0x0,};
}
private static void jj_la1_init_1() {
jj_la1_1 = new int[] {0x400,0x0,0x380,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x0,0x1,0x0,0x1,0x0,0x0,0xe,0x10,0x380,0x380,0x0,0x400,0x400,0x400,0x40,0x0,0x0,0x0,0x400,0x380,0x0,0x0,0x0,0x0,0x1,};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,17 +27,17 @@ public interface GeneratedCorpusQueryLanguageParserConstants {
/** RegularExpression Id. */
int ALIGNMENT_OP = 12;
/** RegularExpression Id. */
int NAME = 13;
int QUOTED_STRING = 13;
/** RegularExpression Id. */
int FLAGS = 14;
int SINGLE_QUOTED_STRING = 14;
/** RegularExpression Id. */
int NUMBER = 15;
int NAME = 15;
/** RegularExpression Id. */
int SETTINGS_OP = 16;
int FLAGS = 16;
/** RegularExpression Id. */
int QUOTED_STRING = 17;
int NUMBER = 17;
/** RegularExpression Id. */
int SINGLE_QUOTED_STRING = 18;
int SETTINGS_OP = 18;

/** Lexical state. */
int DEFAULT = 0;
Expand All @@ -57,12 +57,12 @@ public interface GeneratedCorpusQueryLanguageParserConstants {
"<ROOT_DEP_OP>",
"<DEP_OP>",
"<ALIGNMENT_OP>",
"<QUOTED_STRING>",
"<SINGLE_QUOTED_STRING>",
"<NAME>",
"<FLAGS>",
"<NUMBER>",
"<SETTINGS_OP>",
"<QUOTED_STRING>",
"<SINGLE_QUOTED_STRING>",
"\"::\"",
"\"=\"",
"\"!=\"",
Expand Down
Loading

0 comments on commit b859218

Please sign in to comment.