Skip to content

Commit

Permalink
Specialize Regex codegen for ? (#1898)
Browse files Browse the repository at this point in the history
We currently generate the code for a `?` (zero-or-one) quantifier applied to a one / notone / set node as a loop, when it can actually just be a conditional check. This adds a special path for that case.

(I've also added another reduction test for alternation, and tweaked the codegen for a successful match to touch fields less.)
  • Loading branch information
stephentoub committed Jan 21, 2020
1 parent c26a653 commit dc17345
Show file tree
Hide file tree
Showing 3 changed files with 80 additions and 5 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -1591,22 +1591,24 @@ private bool TryGenerateNonBacktrackingGo(RegexNode node)
EmitNode(node);

// Success:
// this.runtextpos = runtextpos + textSpanPos;
// runtextpos += textSpanPos;
// this.runtextpos = runtextpos;
// Capture(0, originalruntextpos, runtextpos);
MarkLabel(stopSuccessLabel);
Ldthis();
Ldloc(runtextposLocal);
if (textSpanPos > 0)
{
Ldc(textSpanPos);
Add();
Stloc(runtextposLocal);
Ldloc(runtextposLocal);
}
Stfld(s_runtextposField);

// Capture(0, originalruntextposLocal, this.runtextpos);
Ldthis();
Ldc(0);
Ldloc(originalruntextposLocal);
Ldthisfld(s_runtextposField);
Ldloc(runtextposLocal);
Callvirt(s_captureMethod);

// If the graph contained captures, undo any remaining to handle failed matches.
Expand Down Expand Up @@ -2658,13 +2660,20 @@ void EmitAtomicSingleCharLoop(RegexNode node)
node.Type == RegexNode.Setloopatomic);
Debug.Assert(node.M < int.MaxValue);

// First generate the code to handle the required number of iterations.
// If this is actually a repeater, emit that instead.
if (node.M == node.N)
{
EmitSingleCharRepeater(node);
return;
}

// If this is actually an optional single char, emit that instead.
if (node.M == 0 && node.N == 1)
{
EmitAtomicSingleCharZeroOrOne(node);
return;
}

Debug.Assert(node.N > node.M);
int minIterations = node.M;
int maxIterations = node.N;
Expand Down Expand Up @@ -2812,13 +2821,71 @@ void EmitAtomicSingleCharLoop(RegexNode node)
ReturnInt32Local(iterationLocal);
}

// Emits the code to handle a non-backtracking optional zero-or-one loop
// (node.M == 0 && node.N == 1) over a single character, a negated single
// character, or a character set.
//
// The emitted code inspects the character at textSpan[textSpanPos]. If it
// satisfies the node, that one character is consumed; otherwise nothing is
// consumed. Every branch emitted directly by this method targets
// skipUpdatesLabel, which is marked at the very end, so execution continues
// with whatever is emitted next in both the "matched" and "not matched" cases.
void EmitAtomicSingleCharZeroOrOne(RegexNode node)
{
// Only the three atomic single-char loop kinds are supported, and only with
// bounds {0,1}; other bounds are handled by the caller's other paths.
Debug.Assert(
node.Type == RegexNode.Oneloopatomic ||
node.Type == RegexNode.Notoneloopatomic ||
node.Type == RegexNode.Setloopatomic);
Debug.Assert(node.M == 0 && node.N == 1);

// Target for "the optional character is absent": jumps past the updates below.
Label skipUpdatesLabel = DefineLabel();

// if ((uint)textSpanPos >= (uint)textSpan.Length) goto skipUpdatesLabel;
// (No input left at this offset, so the optional character cannot be present.)
Ldc(textSpanPos);
Ldloca(textSpanLocal);
Call(s_spanGetLengthMethod);
BgeUnFar(skipUpdatesLabel);

// Load textSpan[textSpanPos] and test it against the node:
// if the character does not satisfy the node, goto skipUpdatesLabel;
Ldloca(textSpanLocal);
Ldc(textSpanPos);
Call(s_spanGetItemMethod);
LdindU2();
switch (node.Type)
{
case RegexNode.Oneloopatomic:
// if (ch != node.Ch) goto skipUpdatesLabel;
// For case-insensitive nodes the input char is lowered first.
// NOTE(review): presumably node.Ch is already lower-cased in that case - confirm.
if (IsCaseInsensitive(node)) CallToLower();
Ldc(node.Ch);
BneFar(skipUpdatesLabel);
break;
case RegexNode.Notoneloopatomic:
// if (ch == node.Ch) goto skipUpdatesLabel;
// Inverse of the case above: the loop matches any char other than node.Ch.
if (IsCaseInsensitive(node)) CallToLower();
Ldc(node.Ch);
BeqFar(skipUpdatesLabel);
break;
case RegexNode.Setloopatomic:
// if (!<ch is in the set node.Str>) goto skipUpdatesLabel;
// NOTE(review): assumes EmitMatchCharacterClass leaves a boolean-like result
// on the evaluation stack for BrfalseFar to consume - confirm against its definition.
// The rented Int32 local is scratch space for that helper and is returned
// as soon as the helper's code has been emitted.
LocalBuilder setScratchLocal = RentInt32Local();
EmitMatchCharacterClass(node.Str!, IsCaseInsensitive(node), setScratchLocal);
ReturnInt32Local(setScratchLocal);
BrfalseFar(skipUpdatesLabel);
break;
}

// The character matched: consume it by advancing both the span and runtextpos
// by one. textSpanPos (the offset baked into the emitted code) is left unchanged.
// textSpan = textSpan.Slice(1);
Ldloca(textSpanLocal);
Ldc(1);
Call(s_spanSliceIntMethod);
Stloc(textSpanLocal);

// runtextpos++;
Ldloc(runtextposLocal);
Ldc(1);
Add();
Stloc(runtextposLocal);

// Both the "consumed one char" and "consumed nothing" paths rejoin here.
MarkLabel(skipUpdatesLabel);
}

// Emits the code to handle a non-backtracking, variable-length loop around another node.
void EmitAtomicNodeLoop(RegexNode node)
{
Debug.Assert(node.Type == RegexNode.Loop);
Debug.Assert(node.M == node.N || (node.Next != null && node.Next.Type == RegexNode.Atomic));
Debug.Assert(node.M < int.MaxValue);

// If this is actually a repeater, emit that instead.
if (node.M == node.N)
{
EmitNodeRepeater(node);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -674,6 +674,8 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { null, @"(?:abcd|efghj{2,}|j[klm]o+)i", "efghjjjjji", RegexOptions.None, new string[] { "efghjjjjji" } };
yield return new object[] { null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiii", RegexOptions.None, new string[] { "efghiii" } };
yield return new object[] { null, @"(?:abcd|efghi{2,}|j[klm]o+)i", "efghiiiiiiii", RegexOptions.None, new string[] { "efghiiiiiiii" } };
yield return new object[] { null, @"a?ba?ba?ba?b", "abbabab", RegexOptions.None, new string[] { "abbabab" } };
yield return new object[] { null, @"a?ba?ba?ba?b", "abBAbab", RegexOptions.IgnoreCase, new string[] { "abBAbab" } };
// Implicitly upgrading (or not) notoneloop to be atomic
yield return new object[] { null, @"[^b]*b", "aaab", RegexOptions.None, new string[] { "aaab" } };
yield return new object[] { null, @"[^b]*b+", "aaab", RegexOptions.None, new string[] { "aaab" } };
Expand All @@ -684,6 +686,8 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { null, @"(?:abc[^b]*|efgh)i", "efghi", RegexOptions.None, new string[] { "efghi" } }; // can't upgrade
yield return new object[] { null, @"(?:abcd|efg[^b]*)b", "efgb", RegexOptions.None, new string[] { "efgb" } };
yield return new object[] { null, @"(?:abcd|efg[^b]*)i", "efgi", RegexOptions.None, new string[] { "efgi" } }; // can't upgrade
yield return new object[] { null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "baababa", RegexOptions.None, new string[] { "baababa" } };
yield return new object[] { null, @"[^a]?a[^a]?a[^a]?a[^a]?a", "BAababa", RegexOptions.IgnoreCase, new string[] { "BAababa" } };
// Implicitly upgrading (or not) setloop to be atomic
yield return new object[] { null, @"[ac]*", "aaa", RegexOptions.None, new string[] { "aaa" } };
yield return new object[] { null, @"[ac]*b", "aaab", RegexOptions.None, new string[] { "aaab" } };
Expand All @@ -710,6 +714,8 @@ public static IEnumerable<object[]> Groups_Basic_TestData()
yield return new object[] { null, @"(?:abcd|efg[hij]*)h", "efgh", RegexOptions.None, new string[] { "efgh" } }; // can't upgrade
yield return new object[] { null, @"(?:abcd|efg[hij]*)ih", "efgjih", RegexOptions.None, new string[] { "efgjih" } }; // can't upgrade
yield return new object[] { null, @"(?:abcd|efg[hij]*)k", "efgjk", RegexOptions.None, new string[] { "efgjk" } };
yield return new object[] { null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cbbabeb", RegexOptions.None, new string[] { "cbbabeb" } };
yield return new object[] { null, @"[ace]?b[ace]?b[ace]?b[ace]?b", "cBbAbEb", RegexOptions.IgnoreCase, new string[] { "cBbAbEb" } };
// Implicitly upgrading (or not) concat loops to be atomic
yield return new object[] { null, @"(?:[ab]c[de]f)*", "", RegexOptions.None, new string[] { "" } };
yield return new object[] { null, @"(?:[ab]c[de]f)*", "acdf", RegexOptions.None, new string[] { "acdf" } };
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -273,12 +273,14 @@ private static int GetMinRequiredLength(Regex r)
[InlineData("a*b+", "(?>a*)b+")]
[InlineData("a*b{3,4}", "(?>a*)b{3,4}")]
[InlineData("a+b", "(?>a+)b")]
[InlineData("a?b", "(?>a?)b")]
[InlineData("[^\n]*\n", "(?>[^\n]*)\n")]
[InlineData("[^\n]*\n+", "(?>[^\n]*)\n+")]
[InlineData("(a+)b", "((?>a+))b")]
[InlineData("a*(?:bcd|efg)", "(?>a*)(?:bcd|efg)")]
[InlineData("\\w*\\b", "(?>\\w*)\\b")]
[InlineData("\\d*\\b", "(?>\\d*)\\b")]
[InlineData("(?:abc*|def*)g", "(?:ab(?>c*)|de(?>f*))g")]
[InlineData("(?:a[ce]*|b*)g", "(?:a(?>[ce]*)|(?>b*))g")]
[InlineData("(?:a[ce]*|b*)c", "(?:a[ce]*|(?>b*))c")]
public void PatternsReduceIdentically(string pattern1, string pattern2)
Expand Down

0 comments on commit dc17345

Please sign in to comment.