Improve Extended CSS tokenization

AdguardTeam · Oct 17, 2023 · 146b808 · 146b808
1 parent bc5d53f
commit 146b808
Show file tree

Hide file tree

Showing 2 changed files with 82 additions and 26 deletions.
diff --git a/packages/css-tokenizer/src/algorithms/extended-css-consumers/extended-css-generic.ts b/packages/css-tokenizer/src/algorithms/extended-css-consumers/extended-css-generic.ts
@@ -6,6 +6,8 @@ import { type TokenizerContext } from '../../common/context';
 import { CodePoint } from '../../common/enums/code-points';
 import { TokenType } from '../../common/enums/token-types';
 import { type TokenizerContextFunction } from '../../common/types/function-prototypes';
+import { consumeDelimToken } from '../consumers/delim-token';
+import { isWhitespace } from '../definitions';
 
 /**
  * Generic handler for the Extended CSS's pseudo-classes
@@ -16,14 +18,44 @@ export const handleRegularExtendedCssPseudo: TokenizerContextFunction = (context
     // Save the current offset, because we will need it later
     const start = context.offset;
 
+    // Consume as much whitespace as possible
+    while (isWhitespace(context.code())) {
+        context.consumeCodePoint();
+    }
+
+    // If the first non-whitespace code point is an apostrophe or a quotation mark, it means that we are dealing
+    // with a string parameter.
+    // In this case, we simply abort the custom handler here, and let the standard tokenizer handle the string and
+    // everything that comes after it as specified in the spec.
+    // This behavior is similar to the standard CSS url() function, which is also handled differently if its
+    // parameter is a string.
+    if (context.code() === CodePoint.Apostrophe || context.code() === CodePoint.QuotationMark) {
+        // Report whitespace tokens (if any)
+        // It is important to report them, because we already consumed them - and the report is faster here than
+        // a re-consume
+        if (context.offset > start) {
+            context.onToken(TokenType.Whitespace, start, context.offset);
+        }
+
+        // We simply abort the custom handler
+        return;
+    }
+
+    // Otherwise, we need to find the closing parenthesis based on the parenthesis balance
     // Parenthesis balance: 1, because we start after the opening parenthesis:
     // :contains(param)
-    //           ^ we starts from here
+    //           ^ we start from here, so we already have 1 open parenthesis
     let balance = 1;
-    let end = context.offset;
 
-    for (; context.offset < context.source.length; context.consumeCodePoint()) {
-        // TODO: handle newlines - they are not allowed within the pseudo-class
+    // Don't forget to report already consumed whitespace chars as delim-tokens (if any)
+    // Note: we handle the parameter characters as delim-tokens, this is why we don't need to report them here
+    // as whitespace-tokens
+    for (let i = start; i < context.offset; i += 1) {
+        context.onToken(TokenType.Delim, i, i + 1);
+    }
+
+    // Consume until we find the closing parenthesis or we reach the end of the source
+    while (!context.isEof()) {
         if (
             context.code() === CodePoint.LeftParenthesis
             && context.source.charCodeAt(context.offset - 1) !== CodePoint.ReverseSolidus
@@ -39,19 +71,11 @@ export const handleRegularExtendedCssPseudo: TokenizerContextFunction = (context
 
             // If the balance is 0, it means that we found the closing parenthesis
             if (balance === 0) {
-                end = context.offset;
                 break;
             }
         }
-    }
 
-    // If the balance is not 0, it means that we reached the end of the source code
-    // without finding the closing parenthesis
-    // If the balance is 0, it means that we found the closing parenthesis, so we need to report tokens between
-    // the start and the end offsets
-    if (balance === 0) {
-        for (let i = start; i < end; i += 1) {
-            context.onToken(TokenType.Delim, i, i + 1);
-        }
+        // Consume the current character as a delim-token
+        consumeDelimToken(context);
     }
 };
diff --git a/packages/css-tokenizer/src/algorithms/extended-css-consumers/extended-css-xpath.ts b/packages/css-tokenizer/src/algorithms/extended-css-consumers/extended-css-xpath.ts
@@ -8,6 +8,8 @@ import { type TokenizerContext } from '../../common/context';
 import { CodePoint } from '../../common/enums/code-points';
 import { TokenType } from '../../common/enums/token-types';
 import { type TokenizerContextFunction } from '../../common/types/function-prototypes';
+import { consumeDelimToken } from '../consumers/delim-token';
+import { isWhitespace } from '../definitions';
 
 /**
  * Handler for the Extended CSS's `:xpath()` pseudo-class
@@ -18,15 +20,50 @@ export const handleXpathExtendedCssPseudo: TokenizerContextFunction = (context:
     // Save the current offset, because we will need it later
     const start = context.offset;
 
+    // Consume as much whitespace as possible
+    while (isWhitespace(context.code())) {
+        context.consumeCodePoint();
+    }
+
+    // If the first non-whitespace code point is an apostrophe or a quotation mark, it means that we are dealing
+    // with a string parameter.
+    // In this case, we simply abort the custom handler here, and let the standard tokenizer handle the string and
+    // everything that comes after it as specified in the spec.
+    // This behavior is similar to the standard CSS url() function, which is also handled differently if its
+    // parameter is a string.
+    if (context.code() === CodePoint.Apostrophe || context.code() === CodePoint.QuotationMark) {
+        // Report whitespace tokens (if any)
+        // It is important to report them, because we already consumed them - and the report is faster here than
+        // a re-consume
+        if (context.offset > start) {
+            context.onToken(TokenType.Whitespace, start, context.offset);
+        }
+
+        // We simply abort the custom handler
+        return;
+    }
+
+    // Otherwise, we need to find the closing parenthesis based on the parenthesis balance
     // Parenthesis balance: 1, because we start after the opening parenthesis:
-    // :xpath(param)
-    //        ^ we starts from here
+    // :xpath(param)
+    //        ^ we start from here, so we already have 1 open parenthesis
     let balance = 1;
-    let end = context.offset;
+
+    // Don't forget to report already consumed whitespace chars as delim-tokens (if any)
+    // Note: we handle the parameter characters as delim-tokens, this is why we don't need to report them here
+    // as whitespace-tokens
+    for (let i = start; i < context.offset; i += 1) {
+        context.onToken(TokenType.Delim, i, i + 1);
+    }
+
+    // :xpath() is a bit tricky, because it can contain unescaped parentheses inside strings in the XPath expression,
+    // like this:
+    // :xpath(//div[@class="foo(bar)"])
+    // but in this case, the whole XPath expression is not required to be a string
     let inString = false;
 
-    for (; context.offset < context.source.length; context.consumeCodePoint()) {
-        // TODO: handle newlines - they are not allowed within the pseudo-class
+    // Consume until we find the closing parenthesis or we reach the end of the source
+    while (!context.isEof()) {
         // If we find an unescaped quote mark, we toggle the "inString" flag
         // It is important, because we should omit parentheses inside strings.
         if (
@@ -54,17 +91,12 @@ export const handleXpathExtendedCssPseudo: TokenizerContextFunction = (context:
                 // If the balance is 0, it means that we found the closing parenthesis of the
                 // pseudo-class
                 if (balance === 0) {
-                    end = context.offset;
                     break;
                 }
             }
         }
-    }
 
-    // If the balance is not 0, it means that we reached the end of the source code
-    if (balance === 0) {
-        for (let i = start; i < end; i += 1) {
-            context.onToken(TokenType.Delim, i, i + 1);
-        }
+        // Consume the current character as a delim-token
+        consumeDelimToken(context);
     }
 };