Merge branch 'feature/AG-26623-1' into feature/AG-26623-4

AdguardTeam · Oct 20, 2023 · b3ebae5 · b3ebae5
2 parents 6275efb + 993e1cc
commit b3ebae5
Show file tree

Hide file tree

Showing 47 changed files with 2,961 additions and 349 deletions.
diff --git a/packages/css-tokenizer/.markdownlint.json b/packages/css-tokenizer/.markdownlint.json
@@ -0,0 +1,15 @@
+{
+    "ul-indent": { "indent": 4 },
+    "line-length": {
+        "stern": true,
+        "line_length": 120
+    },
+    "no-multiple-blanks": { "maximum": 2 },
+    "no-inline-html": { "allowed_elements": ["a", "details", "summary", "img"]},
+    "no-duplicate-header": { "siblings_only": true },
+    "no-blanks-blockquote": false,
+    "no-bare-urls": false,
+    "ul-style": { "style": "dash" },
+    "blanks-around-fences": { "list_items": false },
+    "emphasis-style": { "style": "asterisk" }
+}
diff --git a/packages/css-tokenizer/.markdownlintignore b/packages/css-tokenizer/.markdownlintignore
@@ -0,0 +1 @@
+node_modules/
diff --git a/packages/css-tokenizer/README.md b/packages/css-tokenizer/README.md
@@ -1,3 +1,146 @@
 # CSS / Extended CSS Tokenizer
 
-TODO: Write a project description
+[![npm-badge]][npm-url] [![install-size-badge]][install-size-url] [![license-badge]][license-url]
+
+This library provides two distinct CSS tokenizers:
+
+1. **Standard CSS Tokenizer**: This tokenizer strictly adheres to the CSS Syntax Level 3 specification outlined by the
+[W3C][css-syntax].
+1. **Extended CSS Tokenizer**: Designed to extend the capabilities of the standard tokenizer, this component introduces
+support for special pseudo-classes like `:contains()` and `:xpath()`.
+
+## Motivation
+
+To appreciate the necessity for a custom tokenizer, it's essential to understand the concept of Extended CSS, recognize
+the challenges it poses, and discover how we can effectively address these issues.
+
+### What is Extended CSS?
+
+Extended CSS is a superset of CSS used by adblockers to provide more robust filtering capabilities. In practical terms,
+Extended CSS introduces additional pseudo-classes that are not defined in the CSS specification. Notable examples
+include `:contains()` and `:xpath()`:
+
+- `:contains()`: Empowers the selection of elements based on specific text within their `innerText` property.
+- `:xpath()`: Enables selection based on an [XPath expression][xpath-mdn].
+
+### Why do we need a custom tokenizer?
+
+The standard CSS tokenizer cannot consistently handle these special pseudo-classes. Therefore, a custom tokenizer is
+required to manage them correctly.
+
+For example, the `:contains()` pseudo-class can have the following syntax:
+
+```css
+div:contains(aaa'bbb)
+```
+
+A standard CSS tokenizer interprets the single quotation mark (`'`) as the start of a string, which then swallows the
+closing `)` character. As a result, the function is never terminated and the selector cannot be parsed correctly.
+
+The `:xpath()` pseudo-class poses a similar challenge for a standard CSS tokenizer, as it can have syntax like this:
+
+```css
+div:xpath(//*...)
+```
+
+A standard tokenizer mistakenly identifies the `/*` sequence as the start of a comment, leading to incorrect parsing.
+
+## The solution: Custom function handlers
+
+We've designed the standard CSS tokenizer to rigorously adhere to the CSS Syntax Level 3 specification. However, we've
+also introduced the ability to handle certain pseudo-classes in a custom manner, akin to how the `<url-token>` is
+managed in the CSS specs. When the tokenizer encounters a function token (pattern: `function-name(`), it searches for a
+handler function in the `functionHandlers` map based on the function name and calls the custom handler if it exists.
+
+The custom handler receives a single argument: the shared tokenizer context object, which can be used to manage the
+function, similar to how other tokens are handled in the library.
+
+This approach allows us to maintain a native, specification-compliant CSS tokenizer with minimal overhead while also
+providing the flexibility to manage special pseudo-classes in a custom way.
+
+In essence, the Extended CSS tokenizer is a standard CSS tokenizer with custom function handlers for special
+pseudo-classes.
+
+### No new token types
+
+It's crucial to emphasize that our implementation remains committed to the token types specified in the CSS W3C
+standards. We do not introduce new token types, ensuring that our tokenizer stays in harmony with the official CSS
+Syntax Level 3 specification. This dedication to adhering to industry standards and best practices guarantees that our
+library maintains compatibility and consistency with CSS-related tools and workflows.
+
+By preserving the standard CSS token types, we aim to provide users with a reliable and seamless experience while
+working with CSS, upholding the integrity of the language as defined by the W3C.
+
+## Example
+
+Here's a straightforward example of how to use the library:
+
+```js
+// `tokenize` is a regular CSS tokenizer (and doesn't support Extended CSS)
+// `tokenizeExtended` is an Extended CSS tokenizer
+const { tokenize, tokenizeExtended, getFormattedTokenName } = require('@adguard/css-tokenizer');
+
+// Input to tokenize
+const css = `div:contains(aa'bb) { display: none !important; }`;
+
+// Prepare table
+const rows = [];
+rows.push(['Token', 'Start', 'End', 'Fragment']);
+
+// Tokenize the input - feel free to try `tokenize` and `tokenizeExtended`
+tokenizeExtended(css, (token, start, end) => {
+    rows.push([getFormattedTokenName(token), start, end, css.substring(start, end)]);
+});
+
+// Print the tokenization result as a table
+console.table(rows);
+```
+
+## API
+
+Tokenization is accomplished by calling the `tokenize` or `tokenizeExtended` function. Both functions accept the
+following arguments:
+
+- `source`: The CSS source string to tokenize.
+- `onToken`: A callback function invoked for each token found in the source string, with the following arguments:
+    <!-- TODO: Add link -->
+    - `token`: The token type (you can see token types here).
+    - `start`: The starting index of the token in the source string.
+    - `end`: The ending index of the token in the source string.
+- `onError`: A callback function called when an error occurs during tokenization. Errors do not break the tokenization
+process, as the tokenizer is tolerant and attempts to recover from errors in line with the CSS Syntax Level 3
+specification. The callback function accepts the following arguments:
+    - `message`: The error message.
+    - `start`: The starting index of the error in the source string.
+    - `end`: The ending index of the error in the source string.
+- `functionHandlers`: This allows for the customized handling of functions. Map keys correspond to function names,
+while the values are void callback functions serving as "tokenizer context" functions. These functions can be used to
+manage pseudo-classes and have only one argument: the shared tokenizer context object.
+
+> [!NOTE]
+> Our API and token list are also compatible with [CSSTree][css-tree-repo]'s tokenizer API, and in the long term, we
+> plan to integrate this library into CSSTree via our [ECSSTree library][ecss-tree-repo], see
+> [this issue][css-tree-issue] for more details.
+
+## Ideas & Questions
+
+If you have any questions or ideas for new features, please [open an issue][new-issue-url] or a
+[discussion][discussions-url]. We will be happy to discuss it with you.
+
+## License
+
+This project is licensed under the MIT license. See the [LICENSE][license-url] file for details.
+
+[css-syntax]: https://www.w3.org/TR/css-syntax-3/
+[css-tree-issue]: https://github.com/csstree/csstree/issues/253
+[css-tree-repo]: https://github.com/csstree/csstree
+[discussions-url]: https://github.com/AdguardTeam/tsurlfilter/discussions
+[ecss-tree-repo]: https://github.com/AdguardTeam/ecsstree
+[install-size-badge]: https://packagephobia.com/badge?p=@adguard/css-tokenizer
+[install-size-url]: https://packagephobia.com/result?p=@adguard/css-tokenizer
+[license-badge]: https://img.shields.io/npm/l/@adguard/css-tokenizer
+[license-url]: https://github.com/AdguardTeam/tsurlfilter/blob/master/packages/css-tokenizer/LICENSE
+[new-issue-url]: https://github.com/AdguardTeam/tsurlfilter/issues/new
+[npm-badge]: https://img.shields.io/npm/v/@adguard/css-tokenizer
+[npm-url]: https://www.npmjs.com/package/@adguard/css-tokenizer
+[xpath-mdn]: https://developer.mozilla.org/en-US/docs/Web/XPath
diff --git a/packages/css-tokenizer/benchmark/README.md b/packages/css-tokenizer/benchmark/README.md
@@ -0,0 +1,13 @@
+# CSS Tokenizer benchmark
+
+This benchmark is used to compare the performance of the CSS Tokenizers.
+
+## Usage
+
+Simply run the following command to run the benchmark:
+
+```sh
+yarn benchmark
+```
+
+This will run the build for the library and then run the benchmark.
diff --git a/packages/css-tokenizer/benchmark/config.ts b/packages/css-tokenizer/benchmark/config.ts
@@ -0,0 +1,128 @@
+/* eslint-disable no-console */
+/* eslint-disable import/no-extraneous-dependencies */
+/**
+ * @file Configuration for the benchmarking script
+ *
+ * @see {@link https://github.com/stylelint/css-parser/issues/1}
+ */
+
+import * as CssTree from 'css-tree';
+import * as CssToolsCssTokenizer from '@csstools/css-tokenizer';
+import * as CssToolsTokenizer from '@csstools/tokenizer';
+// Note: this module has no types
+import * as parseCss from 'parse-css/parse-css';
+// Note: this is an ES module, but we use esbuild to make a bundle on the fly before running the benchmark
+import * as cssLex from 'csslex';
+
+// eslint-disable-next-line import/no-relative-packages, import/extensions
+import * as AdGuardCssTokenizer from '../dist/csstokenizer';
+import { type Resource, type Tokenizer } from './interfaces';
+
+// Augment the `css-tree` module typings with the `tokenize` function that the benchmark calls
+declare module 'css-tree' {
+    export function tokenize(css: string, callback: (token: number, start: number, end: number) => void): void;
+}
+
+/**
+ * Resources to benchmark, identified by URL; entries flagged with `adblock: true` point to adblock filter lists
+ */
+export const resources: Resource[] = [
+    {
+        name: 'Bootstrap CSS',
+        url: 'https://cdn.jsdelivr.net/npm/bootstrap@latest/dist/css/bootstrap.css',
+    },
+    {
+        name: 'Bulma CSS',
+        url: 'https://cdn.jsdelivr.net/npm/bulma@latest/css/bulma.css',
+    },
+    {
+        name: 'AdGuard Base List',
+        url: 'https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_2_Base/filter.txt',
+        adblock: true,
+    },
+    {
+        name: 'uBlock Base List',
+        url: 'https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/filters.txt',
+        adblock: true,
+    },
+];
+
+/**
+ * Tokenizers to benchmark; each `tokenize` wrapper runs one library on the input and returns the token count
+ */
+export const tokenizers: Tokenizer[] = [
+    {
+        // https://github.com/AdguardTeam/tsurlfilter/tree/master/packages/css-tokenizer
+        name: '@adguard/css-tokenizer',
+        tokenize: (css: string) => {
+            let count = 0;
+            AdGuardCssTokenizer.tokenize(css, () => { count += 1; });
+            return count;
+        },
+    },
+    {
+        // https://github.com/AdguardTeam/tsurlfilter/tree/master/packages/css-tokenizer
+        name: '@adguard/css-tokenizer (extended)',
+        tokenize: (css: string) => {
+            let count = 0;
+            AdGuardCssTokenizer.tokenizeExtended(css, () => { count += 1; });
+            return count;
+        },
+    },
+    {
+        // https://github.com/csstree/csstree
+        name: 'css-tree',
+        tokenize: (css: string) => {
+            let count = 0;
+            CssTree.tokenize(css, () => { count += 1; });
+            return count;
+        },
+    },
+    {
+        // https://github.com/csstools/tokenizer
+        name: '@csstools/tokenizer',
+        tokenize: (css: string) => {
+            let count = 1; // first token — NOTE(review): may over-count by one if `done` only follows the last token
+            const tokenizer = CssToolsTokenizer.tokenize(css);
+            while (!tokenizer().done) {
+                count += 1;
+            }
+            return count;
+        },
+    },
+    {
+        // https://github.com/csstools/postcss-plugins/tree/main/packages/css-tokenizer
+        name: '@csstools/css-tokenizer',
+        tokenize: (css: string) => {
+            return CssToolsCssTokenizer.tokenize({ css }).length;
+        },
+    },
+    {
+        // https://github.com/tabatkins/parse-css
+        name: 'parse-css',
+        tokenize: (css: string) => {
+            // This tokenizer uses console.log while consuming numbers, so silence it for the duration of the run
+            const { log } = console;
+            console.log = () => {};
+
+            let tokens = 0;
+
+            try {
+                // Run the tokenizer
+                tokens = parseCss.tokenize(css).length;
+            } finally {
+                // Restore console.log
+                console.log = log;
+            }
+
+            return tokens;
+        },
+    },
+    {
+        // https://github.com/keithamus/csslex
+        name: 'csslex',
+        tokenize: (css: string) => {
+            return Array.from(cssLex.lex(css)).length;
+        },
+    },
+];