Skip to content

Commit

Permalink
AG-26623 Improve Extended CSS tokenization
Browse files Browse the repository at this point in the history
Merge in ADGUARD-FILTERS/tsurlfilter from feature/AG-26623-6 to feature/AG-26623-1

Squashed commit of the following:

commit 7d9c9ed
Author: scripthunter7 <d.tota@adguard.com>
Date:   Wed Oct 18 10:13:29 2023 +0200

    Fix nits

commit 9d5dd3e
Merge: c18515d 02fb7f7
Author: scripthunter7 <d.tota@adguard.com>
Date:   Tue Oct 17 18:02:48 2023 +0200

    Merge branch 'feature/AG-26623-1' into feature/AG-26623-6

commit c18515d
Author: scripthunter7 <d.tota@adguard.com>
Date:   Tue Oct 17 17:50:29 2023 +0200

    More robust Extended CSS tests

commit 146b808
Author: scripthunter7 <d.tota@adguard.com>
Date:   Tue Oct 17 17:49:07 2023 +0200

    Improve Extended CSS tokenization
  • Loading branch information
scripthunter7 committed Oct 18, 2023
1 parent 02fb7f7 commit ca47032
Show file tree
Hide file tree
Showing 8 changed files with 424 additions and 145 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ import { type TokenizerContext } from '../../common/context';
import { CodePoint } from '../../common/enums/code-points';
import { TokenType } from '../../common/enums/token-types';
import { type TokenizerContextFunction } from '../../common/types/function-prototypes';
import { consumeDelimToken } from '../consumers/delim-token';
import { isWhitespace } from '../definitions';

/**
* Generic handler for the Extended CSS's pseudo-classes
Expand All @@ -16,14 +18,44 @@ export const handleRegularExtendedCssPseudo: TokenizerContextFunction = (context
// Save the current offset, because we will need it later
const start = context.offset;

// Consume as much whitespace as possible
while (isWhitespace(context.code())) {
context.consumeCodePoint();
}

// If the first non-whitespace code point is an apostrophe or a quotation mark, it means that we are dealing
// with a string parameter.
// In this case, we simply abort the custom handler here, and let the standard tokenizer handle the string and
// everything that comes after it as specified in the spec.
// This behavior is similar to the standard CSS's url() function, it is also handled differently if its parameter
// is a string.
if (context.code() === CodePoint.Apostrophe || context.code() === CodePoint.QuotationMark) {
// Report whitespace tokens (if any)
// It is important to report them, because we already consumed them - and the report is faster here than
// a re-consume
if (context.offset > start) {
context.onToken(TokenType.Whitespace, start, context.offset);
}

// We simply abort the custom handler
return;
}

// Otherwise, we need to find the closing parenthesis based on the parenthesis balance
// Parenthesis balance: 1, because we start after the opening parenthesis:
// :contains(param)
// ^ we starts from here
// ^ we start from here, so we already have one open parenthesis
let balance = 1;
let end = context.offset;

for (; context.offset < context.source.length; context.consumeCodePoint()) {
// TODO: handle newlines - they are not allowed within the pseudo-class
// Don't forget to report already consumed whitespace chars as delim-tokens (if any)
// Note: we handle the parameter characters as delim-tokens, this is why we don't need to report them here
// as whitespace-tokens
for (let i = start; i < context.offset; i += 1) {
context.onToken(TokenType.Delim, i, i + 1);
}

// Consume until we find the closing parenthesis or we reach the end of the source
while (!context.isEof()) {
if (
context.code() === CodePoint.LeftParenthesis
&& context.source.charCodeAt(context.offset - 1) !== CodePoint.ReverseSolidus
Expand All @@ -39,19 +71,11 @@ export const handleRegularExtendedCssPseudo: TokenizerContextFunction = (context

// If the balance is 0, it means that we found the closing parenthesis
if (balance === 0) {
end = context.offset;
break;
}
}
}

// If the balance is not 0, it means that we reached the end of the source code
// without finding the closing parenthesis
// If the balance is 0, it means that we found the closing parenthesis, so we need to report tokens between
// the start and the end offsets
if (balance === 0) {
for (let i = start; i < end; i += 1) {
context.onToken(TokenType.Delim, i, i + 1);
}
// Consume the current character as a delim-token
consumeDelimToken(context);
}
};
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,8 @@ import { type TokenizerContext } from '../../common/context';
import { CodePoint } from '../../common/enums/code-points';
import { TokenType } from '../../common/enums/token-types';
import { type TokenizerContextFunction } from '../../common/types/function-prototypes';
import { consumeDelimToken } from '../consumers/delim-token';
import { isWhitespace } from '../definitions';

/**
* Handler for the Extended CSS's `:xpath()` pseudo-class
Expand All @@ -18,15 +20,50 @@ export const handleXpathExtendedCssPseudo: TokenizerContextFunction = (context:
// Save the current offset, because we will need it later
const start = context.offset;

// Consume as much whitespace as possible
while (isWhitespace(context.code())) {
context.consumeCodePoint();
}

// If the first non-whitespace code point is an apostrophe or a quotation mark, it means that we are dealing
// with a string parameter.
// In this case, we simply abort the custom handler here, and let the standard tokenizer handle the string and
// everything that comes after it as specified in the spec.
// This behavior is similar to the standard CSS's url() function, it is also handled differently if its parameter
// is a string.
if (context.code() === CodePoint.Apostrophe || context.code() === CodePoint.QuotationMark) {
// Report whitespace tokens (if any)
// It is important to report them, because we already consumed them - and the report is faster here than
// a re-consume
if (context.offset > start) {
context.onToken(TokenType.Whitespace, start, context.offset);
}

// We simply abort the custom handler
return;
}

// Otherwise, we need to find the closing parenthesis based on the parenthesis balance
// Parenthesis balance: 1, because we start after the opening parenthesis:
// :xpath(param)
// ^ we starts from here
// :xpath(param)
// ^ we start from here, so we already have one open parenthesis
let balance = 1;
let end = context.offset;

// Don't forget to report already consumed whitespace chars as delim-tokens (if any)
// Note: we handle the parameter characters as delim-tokens, this is why we don't need to report them here
// as whitespace-tokens
for (let i = start; i < context.offset; i += 1) {
context.onToken(TokenType.Delim, i, i + 1);
}

// :xpath() is a bit tricky, because it can contain unescaped parentheses inside strings in the XPath expression,
// like this:
// :xpath(//div[@class="foo(bar)"])
// but in this case, the whole XPath expression is not required to be a string
let inString = false;

for (; context.offset < context.source.length; context.consumeCodePoint()) {
// TODO: handle newlines - they are not allowed within the pseudo-class
// Consume until we find the closing parenthesis or we reach the end of the source
while (!context.isEof()) {
// If we find an unescaped quote mark, we toggle the "inString" flag
// It is important, because we should omit parentheses inside strings.
if (
Expand Down Expand Up @@ -54,17 +91,12 @@ export const handleXpathExtendedCssPseudo: TokenizerContextFunction = (context:
// If the balance is 0, it means that we found the closing parenthesis of the
// pseudo-class
if (balance === 0) {
end = context.offset;
break;
}
}
}
}

// If the balance is not 0, it means that we reached the end of the source code
if (balance === 0) {
for (let i = start; i < end; i += 1) {
context.onToken(TokenType.Delim, i, i + 1);
}
// Consume the current character as a delim-token
consumeDelimToken(context);
}
};
172 changes: 113 additions & 59 deletions packages/css-tokenizer/test/extended-css-tokenizer/contains.test.ts
Original file line number Diff line number Diff line change
@@ -1,73 +1,127 @@
import type { TokenData } from '../helpers/test-interfaces';
import { tokenizeExtended } from '../../src/extended-css-tokenizer';
import { TokenType } from '../../src/common/enums/token-types';
import { ExtendedCssPseudo } from '../../src/common/enums/extended-css-pseudos';
import { testTokenization } from '../helpers/test-utils';
import { createTests, type PseudoValues } from './helpers/test-creator';
import { generateDelimStream } from './helpers/delim-generator';

const PSEUDO_NAMES = [
ExtendedCssPseudo.Contains,
ExtendedCssPseudo.HasText,
ExtendedCssPseudo.AbpContains,
];

const PSEUDO_VALUES = [
String.raw``, // empty
String.raw` `, // single space
String.raw` `, // multiple spaces
String.raw`a`, // single character
String.raw`a b`, // multiple characters
String.raw`a b`, // multiple characters with multiple spaces
String.raw` a`, // single character with single space
String.raw` a b`, // multiple characters with single space
String.raw` a b`, // multiple characters with multiple spaces
String.raw`a `, // single character with single space
String.raw`a b `, // multiple characters with single space
String.raw`a b `, // multiple characters with multiple spaces
String.raw`(a)`, // single character with balanced parentheses
String.raw`(a b)`, // multiple characters with balanced parentheses
String.raw`a\(b`, // escaped left parenthesis
String.raw`a\)b`, // escaped right parenthesis
String.raw`a\(b\)c`, // escaped parentheses
String.raw`/a/`, // regular expression
String.raw`/a/i`, // regular expression with flags
String.raw`/a\/b/`, // regular expression with escaped forward slash
String.raw`/a(b|c)/`, // regular expression with balanced parentheses
String.raw`/^(a|b){3,}$/`, // regular expression with balanced parentheses and quantifiers
String.raw`/a\(\)/i`, // regular expression with escaped parentheses
String.raw`'`, // orphaned single quote
String.raw`a'`, // orphaned single quote
String.raw`'b`, // orphaned single quote
String.raw`a'b`, // orphaned single quote
String.raw`"`, // orphaned double quote
String.raw`a"`, // orphaned double quote
String.raw`"b`, // orphaned double quote
String.raw`a"b`, // orphaned double quote
String.raw`'a'`, // single quoted string
String.raw`"a"`, // double quoted string
String.raw`a'b"c`, // mixed orphaned quotes
String.raw`a'b"c'd`, // single quoted string with orphaned double quote in the middle
];
const PSEUDO_VALUES: PseudoValues = {
...generateDelimStream([
String.raw``, // empty
String.raw` `, // single space
String.raw` `, // multiple spaces
String.raw`a`, // single character
String.raw`ab`, // multiple characters
String.raw`a b`, // multiple characters with single space
String.raw`a b`, // multiple characters with multiple spaces
String.raw` a`, // single character preceded by single space
String.raw` a`, // single character preceded by multiple spaces
String.raw`a `, // single character followed by single space
String.raw`a `, // single character followed by multiple spaces
String.raw` a `, // single character surrounded by single spaces
String.raw` a `, // single character surrounded by multiple spaces
String.raw` a b `, // multiple characters surrounded by single spaces
String.raw` a b `, // multiple characters surrounded by multiple spaces
String.raw`a b c`, // multiple characters with multiple spaces
String.raw`\(`, // escaped left parenthesis
String.raw`\)`, // escaped right parenthesis
String.raw`\(\)`, // escaped parentheses
String.raw`\)\(`, // escaped parentheses (reversed)
String.raw`()`, // balanced parentheses
String.raw`()(())`, // multiple balanced parentheses
String.raw`(a)`, // single character with balanced parentheses
String.raw`(a)(())`, // single character with multiple balanced parentheses
String.raw`(ab)`, // multiple characters with balanced parentheses
String.raw`a(\))(\()b`, // escaped parentheses with balanced parentheses
String.raw`/a/`, // simple regular expression
String.raw`/a/i`, // regular expression with flags
String.raw`/a/ig`, // regular expression with multiple flags
String.raw`/a\/b/`, // regular expression with escaped forward slash
String.raw`/(a|b)/`, // regular expression with balanced parentheses
String.raw`/^(a|b){3,}$/ig`, // regular expression with balanced parentheses and quantifiers and flags
String.raw`/a\(\)/i`, // regular expression with escaped parentheses
]),

// 1-length string
[String.raw`'a'`]: [
[TokenType.String, 0, 3],
],
[String.raw`"a"`]: [
[TokenType.String, 0, 3],
],

// 2-length string
[String.raw`'ab'`]: [
[TokenType.String, 0, 4],
],
[String.raw`"ab"`]: [
[TokenType.String, 0, 4],
],

// ) in string
[String.raw`'a)'`]: [
[TokenType.String, 0, 4],
],
[String.raw`"a)"`]: [
[TokenType.String, 0, 4],
],

// ( in string
[String.raw`'a('`]: [
[TokenType.String, 0, 4],
],
[String.raw`"a("`]: [
[TokenType.String, 0, 4],
],

// ( and ) in string
[String.raw`'a()b'`]: [
[TokenType.String, 0, 6],
],
[String.raw`"a()b"`]: [
[TokenType.String, 0, 6],
],

// string + something
[String.raw`'a' 12px`]: [
[TokenType.String, 0, 3],
[TokenType.Whitespace, 3, 4],
[TokenType.Dimension, 4, 8],
],

// single space + string
[String.raw` 'a'`]: [
[TokenType.Whitespace, 0, 1],
[TokenType.String, 1, 4],
],

// multiple spaces + string
[String.raw` 'a'`]: [
[TokenType.Whitespace, 0, 2],
[TokenType.String, 2, 5],
],

// string + single space
[String.raw`'a' `]: [
[TokenType.String, 0, 3],
[TokenType.Whitespace, 3, 4],
],

const tests = PSEUDO_NAMES.map((name: string) => (
PSEUDO_VALUES.map((param) => ({
actual: `:${name}(${param})`,
expected: [
// :name(
[TokenType.Colon, 0, 1],
[TokenType.Function, 1, name.length + 2],
// parameter splitted into delim tokens
...param.split('').map((_, index) => (
[TokenType.Delim, name.length + 2 + index, name.length + 3 + index]
)),
// )
[TokenType.CloseParenthesis, 1 + name.length + param.length + 1, 1 + name.length + param.length + 2],
] as TokenData[],
}))
)).flat();
// string + multiple spaces
[String.raw`'a' `]: [
[TokenType.String, 0, 3],
[TokenType.Whitespace, 3, 5],
],
};

describe(`Extended CSS's :${PSEUDO_NAMES.join(', :')}`, () => {
test.each(tests)("should tokenize '$actual'", ({ actual, expected }) => {
const tokens: TokenData[] = [];
tokenizeExtended(actual, (...args) => tokens.push(args));
expect(tokens).toEqual(expected);
});
test.each(
createTests(PSEUDO_NAMES, PSEUDO_VALUES),
)("should tokenize '$actual' as $as", (testData) => testTokenization(testData, tokenizeExtended));
});
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { TokenType } from '../../../src/common/enums/token-types';
import { type TokenData } from '../../helpers/test-interfaces';
import { type PseudoValues } from './test-creator';

/**
* Helper function to generate token expectations for values that should be tokenized as delim tokens.
*
* @param inputs Inputs to generate delim pseudo values for.
* @returns Expected token data for each input.
*/
/**
 * Builds the expected token list for inputs whose every character is emitted
 * by the tokenizer as a separate delim token.
 *
 * @param inputs Parameter strings to build delim expectations for.
 * @returns Map from each input string to its expected per-character delim token data.
 */
export const generateDelimStream = (inputs: string[]): PseudoValues => {
    const expectations: PseudoValues = {};

    inputs.forEach((input) => {
        const tokens: TokenData[] = [];

        // One delim token per UTF-16 code unit, spanning [index, index + 1)
        for (let index = 0; index < input.length; index += 1) {
            tokens.push([TokenType.Delim, index, index + 1] as TokenData);
        }

        expectations[input] = tokens;
    });

    return expectations;
};
Loading

0 comments on commit ca47032

Please sign in to comment.