From dfe7a6c9749ce3c9be9bf99f544fb13c04c61742 Mon Sep 17 00:00:00 2001 From: Dan Moseley Date: Mon, 17 Jul 2023 21:27:04 -0500 Subject: [PATCH] Feedback on regex parser PR (#89060) --- .../Text/RegularExpressions/RegexParser.cs | 54 ++++++++++--------- 1 file changed, 30 insertions(+), 24 deletions(-) diff --git a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs index bc1005472888c..1cc517713121c 100644 --- a/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs +++ b/src/libraries/System.Text.RegularExpressions/src/System/Text/RegularExpressions/RegexParser.cs @@ -521,19 +521,17 @@ private RegexNode ScanReplacement() int startpos = _pos; _pos = _pattern.IndexOf('$', _pos); - if (_pos == -1) + if (_pos < 0) + { _pos = _pattern.Length; + } AddToConcatenate(startpos, _pos - startpos, isReplacement: true); if (_pos < _pattern.Length) { - if (_pattern[_pos++] == '$') - { - _unit = ScanDollar(); - } - - _concatenation.AddChild(_unit!); + _pos++; + _concatenation.AddChild(ScanDollar()); _unit = null; } } @@ -864,9 +862,9 @@ private RegexNode ScanReplacement() { string capname = ScanCapname(); - if (_capnames != null && _capnames.ContainsKey(capname)) + if (_capnames?[capname] is int tmpCapnum) { - capnum = (int)_capnames![capname]!; + capnum = tmpCapnum; } // check if we have bogus character after the name @@ -911,9 +909,9 @@ private RegexNode ScanReplacement() { string uncapname = ScanCapname(); - if (_capnames != null && _capnames.ContainsKey(uncapname)) + if (_capnames?[uncapname] is int tmpCapnum) { - uncapnum = (int)_capnames![uncapname]!; + uncapnum = tmpCapnum; } else { @@ -971,9 +969,9 @@ private RegexNode ScanReplacement() { string capname = ScanCapname(); - if (_capnames != null && _capnames.ContainsKey(capname) && _pos < _pattern.Length && _pattern[_pos++] == ')') + if (_capnames?[capname] is int tmpCapnum && _pos < _pattern.Length && _pattern[_pos++] == ')') { - return new RegexNode(RegexNodeKind.BackreferenceConditional, _options, (int)_capnames![capname]!); + return new RegexNode(RegexNodeKind.BackreferenceConditional, _options, tmpCapnum); } } } @@ -1052,13 +1050,15 @@ private void ScanBlank() if ((_options & RegexOptions.IgnorePatternWhitespace) != 0 && _pos < _pattern.Length && _pattern[_pos] == '#') { _pos = _pattern.IndexOf('\n', _pos); - if (_pos == -1) + if (_pos < 0) + { _pos = _pattern.Length; + } } else if (_pos + 2 < _pattern.Length && _pattern[_pos + 2] == '#' && _pattern[_pos + 1] == '?' && _pattern[_pos] == '(') { _pos = _pattern.IndexOf(')', _pos); - if (_pos == -1) + if (_pos < 0) { _pos = _pattern.Length; throw MakeException(RegexParseError.UnterminatedComment, SR.UnterminatedComment); @@ -1076,6 +1076,8 @@ private void ScanBlank() /// Scans chars following a '\' (not counting the '\'), and returns a RegexNode for the type of atom scanned private RegexNode? ScanBackslash(bool scanOnly) { + Debug.Assert(_pos < _pattern.Length, "The current reading position must not be at the end of the pattern"); + char ch; switch (ch = _pattern[_pos]) { @@ -1149,6 +1151,8 @@ private void ScanBlank() /// Scans \-style backreferences and character escapes private RegexNode? ScanBasicBackslash(bool scanOnly) { + Debug.Assert(_pos < _pattern.Length, "The current reading position must not be at the end of the pattern"); + int backpos = _pos; char close = '\0'; bool angled = false; @@ -1263,7 +1267,7 @@ private void ScanBlank() { return scanOnly ? null : - _capnames != null && _capnames.ContainsKey(capname) ? new RegexNode(RegexNodeKind.Backreference, _options, (int)_capnames![capname]!) : + _capnames?[capname] is int tmpCapnum ? new RegexNode(RegexNodeKind.Backreference, _options, tmpCapnum) : throw MakeException(RegexParseError.UndefinedNamedReference, SR.Format(SR.UndefinedNamedReference, capname)); } } @@ -1359,9 +1363,9 @@ private RegexNode ScanDollar() string capname = ScanCapname(); if (_pos < _pattern.Length && _pattern[_pos++] == '}') { - if (_capnames != null && _capnames.ContainsKey(capname)) + if (_capnames?[capname] is int tmpCapnum) { - return new RegexNode(RegexNodeKind.Backreference, _options, (int)_capnames![capname]!); + return new RegexNode(RegexNodeKind.Backreference, _options, tmpCapnum); } } } @@ -1480,14 +1484,14 @@ private char ScanHex(int c) { for (; c > 0; c -= 1) { - int d; char ch = _pattern[_pos++]; - if ((uint)(d = ch - '0') <= 9) - i = (i * 0x10) + d; - else if ((uint)(d = (ch | 0x20) - 'a') <= 5) - i = (i * 0x10) + d + 0xa; - else + int result = HexConverter.FromChar(ch); + if (result == 0xFF) + { break; + } + + i = (i * 0x10) + result; } } @@ -1968,6 +1972,8 @@ private static int IndexOfMetachar(ReadOnlySpan input) private readonly bool IsTrueQuantifier() { + Debug.Assert(_pos < _pattern.Length, "The current reading position must not be at the end of the pattern"); + int startpos = _pos; char ch = _pattern[startpos]; if (ch != '{')