Skip to content

Commit

Permalink
AG-26623 Improve Extended CSS tokenization
Browse files Browse the repository at this point in the history
Merge in ADGUARD-FILTERS/tsurlfilter from feature/AG-26623-6 to feature/AG-26623-1

Squashed commit of the following:

commit 7d9c9ed
Author: scripthunter7 <d.tota@adguard.com>
Date:   Wed Oct 18 10:13:29 2023 +0200

    Fix nits

commit 9d5dd3e
Merge: c18515d 02fb7f7
Author: scripthunter7 <d.tota@adguard.com>
Date:   Tue Oct 17 18:02:48 2023 +0200

    Merge branch 'feature/AG-26623-1' into feature/AG-26623-6

commit c18515d
Author: scripthunter7 <d.tota@adguard.com>
Date:   Tue Oct 17 17:50:29 2023 +0200

    More robust Extended CSS tests

commit 146b808
Author: scripthunter7 <d.tota@adguard.com>
Date:   Tue Oct 17 17:49:07 2023 +0200

    Improve Extended CSS tokenization
  • Loading branch information
scripthunter7 committed Oct 18, 2023
1 parent 02fb7f7 commit ca47032
Show file tree
Hide file tree
Showing 8 changed files with 424 additions and 145 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import { type TokenizerContext } from '../../common/context';
import { CodePoint } from '../../common/enums/code-points';
import { TokenType } from '../../common/enums/token-types';
import { type TokenizerContextFunction } from '../../common/types/function-prototypes';
import { consumeDelimToken } from '../consumers/delim-token';
import { isWhitespace } from '../definitions';

/**
* Generic handler for the Extended CSS's pseudo-classes
Expand All @@ -16,14 +18,44 @@ export const handleRegularExtendedCssPseudo: TokenizerContextFunction = (context
// Save the current offset, because we will need it later
const start = context.offset;

// Consume as much whitespace as possible
while (isWhitespace(context.code())) {
context.consumeCodePoint();
}

// If the first non-whitespace code point is an apostrophe or a quotation mark, it means that we are dealing
// with a string parameter.
// In this case, we simply abort the custom handler here, and let the standard tokenizer handle the string and
// everything that comes after it as specified in the spec.
// This behavior is similar to the standard CSS's url() function, it is also handled differently if its parameter
// is a string.
if (context.code() === CodePoint.Apostrophe || context.code() === CodePoint.QuotationMark) {
// Report whitespace tokens (if any)
// It is important to report them, because we already consumed them - and the report is faster here than
// a re-consume
if (context.offset > start) {
context.onToken(TokenType.Whitespace, start, context.offset);
}

// We simply abort the custom handler
return;
}

// Otherwise, we need to find the closing parenthesis based on the parenthesis balance
// Parenthesis balance: 1, because we start after the opening parenthesis:
// :contains(param)
// ^ we starts from here
// ^ we start from here, so we already have one open parenthesis
let balance = 1;
let end = context.offset;

for (; context.offset < context.source.length; context.consumeCodePoint()) {
// TODO: handle newlines - they are not allowed within the pseudo-class
// Don't forget to report already consumed whitespace chars as delim-tokens (if any)
// Note: we handle the parameter characters as delim-tokens, this is why we don't need to report them here
// as whitespace-tokens
for (let i = start; i < context.offset; i += 1) {
context.onToken(TokenType.Delim, i, i + 1);
}

// Consume until we find the closing parenthesis or we reach the end of the source
while (!context.isEof()) {
if (
context.code() === CodePoint.LeftParenthesis
&& context.source.charCodeAt(context.offset - 1) !== CodePoint.ReverseSolidus
Expand All @@ -39,19 +71,11 @@ export const handleRegularExtendedCssPseudo: TokenizerContextFunction = (context

// If the balance is 0, it means that we found the closing parenthesis
if (balance === 0) {
end = context.offset;
break;
}
}
}

// If the balance is not 0, it means that we reached the end of the source code
// without finding the closing parenthesis
// If the balance is 0, it means that we found the closing parenthesis, so we need to report tokens between
// the start and the end offsets
if (balance === 0) {
for (let i = start; i < end; i += 1) {
context.onToken(TokenType.Delim, i, i + 1);
}
// Consume the current character as a delim-token
consumeDelimToken(context);
}
};
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import { type TokenizerContext } from '../../common/context';
import { CodePoint } from '../../common/enums/code-points';
import { TokenType } from '../../common/enums/token-types';
import { type TokenizerContextFunction } from '../../common/types/function-prototypes';
import { consumeDelimToken } from '../consumers/delim-token';
import { isWhitespace } from '../definitions';

/**
* Handler for the Extended CSS's `:xpath()` pseudo-class
Expand All @@ -18,15 +20,50 @@ export const handleXpathExtendedCssPseudo: TokenizerContextFunction = (context:
// Save the current offset, because we will need it later
const start = context.offset;

// Consume as much whitespace as possible
while (isWhitespace(context.code())) {
context.consumeCodePoint();
}

// If the first non-whitespace code point is an apostrophe or a quotation mark, it means that we are dealing
// with a string parameter.
// In this case, we simply abort the custom handler here, and let the standard tokenizer handle the string and
// everything that comes after it as specified in the spec.
// This behavior is similar to the standard CSS's url() function, it is also handled differently if its parameter
// is a string.
if (context.code() === CodePoint.Apostrophe || context.code() === CodePoint.QuotationMark) {
// Report whitespace tokens (if any)
// It is important to report them, because we already consumed them - and the report is faster here than
// a re-consume
if (context.offset > start) {
context.onToken(TokenType.Whitespace, start, context.offset);
}

// We simply abort the custom handler
return;
}

// Otherwise, we need to find the closing parenthesis based on the parenthesis balance
// Parenthesis balance: 1, because we start after the opening parenthesis:
// :xpath(param)
// ^ we starts from here
// :xpath(param)
// ^ we start from here, so we already have one open parenthesis
let balance = 1;
let end = context.offset;

// Don't forget to report already consumed whitespace chars as delim-tokens (if any)
// Note: we handle the parameter characters as delim-tokens, this is why we don't need to report them here
// as whitespace-tokens
for (let i = start; i < context.offset; i += 1) {
context.onToken(TokenType.Delim, i, i + 1);
}

// :xpath() is a bit tricky, because it can contain unescaped parentheses inside strings in the XPath expression,
// like this:
// :xpath(//div[@class="foo(bar)"])
// but in this case, the whole XPath expression is not required to be a string
let inString = false;

for (; context.offset < context.source.length; context.consumeCodePoint()) {
// TODO: handle newlines - they are not allowed within the pseudo-class
// Consume until we find the closing parenthesis or we reach the end of the source
while (!context.isEof()) {
// If we find an unescaped quote mark, we toggle the "inString" flag
// It is important, because we should omit parentheses inside strings.
if (
Expand Down Expand Up @@ -54,17 +91,12 @@ export const handleXpathExtendedCssPseudo: TokenizerContextFunction = (context:
// If the balance is 0, it means that we found the closing parenthesis of the
// pseudo-class
if (balance === 0) {
end = context.offset;
break;
}
}
}
}

// If the balance is not 0, it means that we reached the end of the source code
if (balance === 0) {
for (let i = start; i < end; i += 1) {
context.onToken(TokenType.Delim, i, i + 1);
}
// Consume the current character as a delim-token
consumeDelimToken(context);
}
};
172 changes: 113 additions & 59 deletions packages/css-tokenizer/test/extended-css-tokenizer/contains.test.ts
Original file line number Diff line number Diff line change
@@ -1,73 +1,127 @@
import type { TokenData } from '../helpers/test-interfaces';
import { tokenizeExtended } from '../../src/extended-css-tokenizer';
import { TokenType } from '../../src/common/enums/token-types';
import { ExtendedCssPseudo } from '../../src/common/enums/extended-css-pseudos';
import { testTokenization } from '../helpers/test-utils';
import { createTests, type PseudoValues } from './helpers/test-creator';
import { generateDelimStream } from './helpers/delim-generator';

const PSEUDO_NAMES = [
ExtendedCssPseudo.Contains,
ExtendedCssPseudo.HasText,
ExtendedCssPseudo.AbpContains,
];

const PSEUDO_VALUES = [
String.raw``, // empty
String.raw` `, // single space
String.raw` `, // multiple spaces
String.raw`a`, // single character
String.raw`a b`, // multiple characters
String.raw`a b`, // multiple characters with multiple spaces
String.raw` a`, // single character with single space
String.raw` a b`, // multiple characters with single space
String.raw` a b`, // multiple characters with multiple spaces
String.raw`a `, // single character with single space
String.raw`a b `, // multiple characters with single space
String.raw`a b `, // multiple characters with multiple spaces
String.raw`(a)`, // single character with balanced parentheses
String.raw`(a b)`, // multiple characters with balanced parentheses
String.raw`a\(b`, // escaped left parenthesis
String.raw`a\)b`, // escaped right parenthesis
String.raw`a\(b\)c`, // escaped parentheses
String.raw`/a/`, // regular expression
String.raw`/a/i`, // regular expression with flags
String.raw`/a\/b/`, // regular expression with escaped forward slash
String.raw`/a(b|c)/`, // regular expression with balanced parentheses
String.raw`/^(a|b){3,}$/`, // regular expression with balanced parentheses and quantifiers
String.raw`/a\(\)/i`, // regular expression with escaped parentheses
String.raw`'`, // orphaned single quote
String.raw`a'`, // orphaned single quote
String.raw`'b`, // orphaned single quote
String.raw`a'b`, // orphaned single quote
String.raw`"`, // orphaned double quote
String.raw`a"`, // orphaned double quote
String.raw`"b`, // orphaned double quote
String.raw`a"b`, // orphaned double quote
String.raw`'a'`, // single quoted string
String.raw`"a"`, // double quoted string
String.raw`a'b"c`, // mixed orphaned quotes
String.raw`a'b"c'd`, // single quoted string with orphaned double quote in the middle
];
const PSEUDO_VALUES: PseudoValues = {
...generateDelimStream([
String.raw``, // empty
String.raw` `, // single space
String.raw` `, // multiple spaces
String.raw`a`, // single character
String.raw`ab`, // multiple characters
String.raw`a b`, // multiple characters with single space
String.raw`a b`, // multiple characters with multiple spaces
String.raw` a`, // single character preceded by single space
String.raw` a`, // single character preceded by multiple spaces
String.raw`a `, // single character followed by single space
String.raw`a `, // single character followed by multiple spaces
String.raw` a `, // single character surrounded by single spaces
String.raw` a `, // single character surrounded by multiple spaces
String.raw` a b `, // multiple characters surrounded by single spaces
String.raw` a b `, // multiple characters surrounded by multiple spaces
String.raw`a b c`, // multiple characters with multiple spaces
String.raw`\(`, // escaped left parenthesis
String.raw`\)`, // escaped right parenthesis
String.raw`\(\)`, // escaped parentheses
String.raw`\)\(`, // escaped parentheses (reversed)
String.raw`()`, // balanced parentheses
String.raw`()(())`, // multiple balanced parentheses
String.raw`(a)`, // single character with balanced parentheses
String.raw`(a)(())`, // single character with multiple balanced parentheses
String.raw`(ab)`, // multiple characters with balanced parentheses
String.raw`a(\))(\()b`, // escaped parentheses with balanced parentheses
String.raw`/a/`, // simple regular expression
String.raw`/a/i`, // regular expression with flags
String.raw`/a/ig`, // regular expression with multiple flags
String.raw`/a\/b/`, // regular expression with escaped forward slash
String.raw`/(a|b)/`, // regular expression with balanced parentheses
String.raw`/^(a|b){3,}$/ig`, // regular expression with balanced parentheses and quantifiers and flags
String.raw`/a\(\)/i`, // regular expression with escaped parentheses
]),

// 1-length string
[String.raw`'a'`]: [
[TokenType.String, 0, 3],
],
[String.raw`"a"`]: [
[TokenType.String, 0, 3],
],

// 2-length string
[String.raw`'ab'`]: [
[TokenType.String, 0, 4],
],
[String.raw`"ab"`]: [
[TokenType.String, 0, 4],
],

// ) in string
[String.raw`'a)'`]: [
[TokenType.String, 0, 4],
],
[String.raw`"a)"`]: [
[TokenType.String, 0, 4],
],

// ( in string
[String.raw`'a('`]: [
[TokenType.String, 0, 4],
],
[String.raw`"a("`]: [
[TokenType.String, 0, 4],
],

// ( and ) in string
[String.raw`'a()b'`]: [
[TokenType.String, 0, 6],
],
[String.raw`"a()b"`]: [
[TokenType.String, 0, 6],
],

// string + something
[String.raw`'a' 12px`]: [
[TokenType.String, 0, 3],
[TokenType.Whitespace, 3, 4],
[TokenType.Dimension, 4, 8],
],

// single space + string
[String.raw` 'a'`]: [
[TokenType.Whitespace, 0, 1],
[TokenType.String, 1, 4],
],

// multiple spaces + string
[String.raw` 'a'`]: [
[TokenType.Whitespace, 0, 2],
[TokenType.String, 2, 5],
],

// string + single space
[String.raw`'a' `]: [
[TokenType.String, 0, 3],
[TokenType.Whitespace, 3, 4],
],

const tests = PSEUDO_NAMES.map((name: string) => (
PSEUDO_VALUES.map((param) => ({
actual: `:${name}(${param})`,
expected: [
// :name(
[TokenType.Colon, 0, 1],
[TokenType.Function, 1, name.length + 2],
// parameter splitted into delim tokens
...param.split('').map((_, index) => (
[TokenType.Delim, name.length + 2 + index, name.length + 3 + index]
)),
// )
[TokenType.CloseParenthesis, 1 + name.length + param.length + 1, 1 + name.length + param.length + 2],
] as TokenData[],
}))
)).flat();
// string + multiple spaces
[String.raw`'a' `]: [
[TokenType.String, 0, 3],
[TokenType.Whitespace, 3, 5],
],
};

describe(`Extended CSS's :${PSEUDO_NAMES.join(', :')}`, () => {
test.each(tests)("should tokenize '$actual'", ({ actual, expected }) => {
const tokens: TokenData[] = [];
tokenizeExtended(actual, (...args) => tokens.push(args));
expect(tokens).toEqual(expected);
});
test.each(
createTests(PSEUDO_NAMES, PSEUDO_VALUES),
)("should tokenize '$actual' as $as", (testData) => testTokenization(testData, tokenizeExtended));
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { TokenType } from '../../../src/common/enums/token-types';
import { type TokenData } from '../../helpers/test-interfaces';
import { type PseudoValues } from './test-creator';

/**
* Helper function to generate token expectations for values that should be tokenized as delim tokens.
*
* @param inputs Inputs to generate delim pseudo values for.
* @returns Expected token data for each input.
*/
/**
 * Builds the expected token list for inputs whose every character is emitted
 * by the tokenizer as a separate delim token.
 *
 * @param inputs Parameter strings to build delim expectations for.
 * @returns Map from each input string to its expected per-character delim token data.
 */
export const generateDelimStream = (inputs: string[]): PseudoValues => {
    const expectations: PseudoValues = {};

    inputs.forEach((input) => {
        const tokens: TokenData[] = [];

        // One delim token per UTF-16 code unit, spanning [index, index + 1)
        for (let index = 0; index < input.length; index += 1) {
            tokens.push([TokenType.Delim, index, index + 1] as TokenData);
        }

        expectations[input] = tokens;
    });

    return expectations;
};
Loading

0 comments on commit ca47032

Please sign in to comment.