-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge branch 'feature/AG-26623-1' into feature/AG-26623-4
- Loading branch information
Showing
47 changed files
with
2,961 additions
and
349 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,15 @@ | ||
{ | ||
"ul-indent": { "indent": 4 }, | ||
"line-length": { | ||
"stern": true, | ||
"line_length": 120 | ||
}, | ||
"no-multiple-blanks": { "maximum": 2 }, | ||
"no-inline-html": { "allowed_elements": ["a", "details", "summary", "img"]}, | ||
"no-duplicate-header": { "siblings_only": true }, | ||
"no-blanks-blockquote": false, | ||
"no-bare-urls": false, | ||
"ul-style": { "style": "dash" }, | ||
"blanks-around-fences": { "list_items": false }, | ||
"emphasis-style": { "style": "asterisk" } | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1 @@ | ||
node_modules/ |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,146 @@ | ||
# CSS / Extended CSS Tokenizer | ||
|
||
TODO: Write a project description | ||
[![npm-badge]][npm-url] [![install-size-badge]][install-size-url] [![license-badge]][license-url] | ||
|
||
This library provides two distinct CSS tokenizers: | ||
|
||
1. **Standard CSS Tokenizer**: This tokenizer strictly adheres to the CSS Syntax Level 3 specification outlined by the | ||
[W3C][css-syntax]. | ||
1. **Extended CSS Tokenizer**: Designed to extend the capabilities of the standard tokenizer, this component introduces | ||
support for special pseudo-classes like `:contains()` and `:xpath()`. | ||
|
||
## Motivation | ||
|
||
To appreciate the necessity for a custom tokenizer, it's essential to understand the concept of Extended CSS, recognize | ||
the challenges it poses, and discover how we can effectively address these issues. | ||
|
||
### What is Extended CSS? | ||
|
||
Extended CSS is a superset of CSS used by adblockers to provide more robust filtering capabilities. In practical terms, | ||
Extended CSS introduces additional pseudo-classes that are not defined in the CSS specification. Notable examples | ||
include `:contains()` and `:xpath()`: | ||
|
||
- `:contains()`: Empowers the selection of elements based on specific text within their `innerText` property. | ||
- `:xpath()`: Enables selection based on an [XPath expression][xpath-mdn]. | ||
|
||
### Why do we need a custom tokenizer? | ||
|
||
The standard CSS tokenizer cannot consistently handle these special pseudo-classes. Therefore, a custom tokenizer is | ||
required to manage them correctly. | ||
|
||
For example, the `:contains()` pseudo-class can have the following syntax: | ||
|
||
```css | ||
div:contains(aaa'bbb) | ||
``` | ||
|
||
A standard CSS tokenizer interprets the single quotation mark (`'`) as a string delimiter, causing an error due to the | ||
lack of a closing `)` character. This deviation from the expected syntax results in a parsing issue. | ||
|
||
The `:xpath()` pseudo-class poses a similar challenge for a standard CSS tokenizer, as it can have syntax like this: | ||
|
||
```css | ||
div:xpath(//*...) | ||
``` | ||
A standard tokenizer mistakenly identifies the `/*` sequence as the start of a comment, leading to incorrect parsing. | ||
## The solution: Custom function handlers | ||
We've designed the standard CSS tokenizer to rigorously adhere to the CSS Syntax Level 3 specification. However, we've | ||
also introduced the ability to handle certain pseudo-classes in a custom manner, akin to how the `<url-token>` is | ||
managed in the CSS specs. When the tokenizer encounters a function token (pattern: `function-name(`), it searches for a | ||
handler function in the `functionHandlers` map based on the function name and calls the custom handler if it exists. | ||
The custom handler receives a single argument: the shared tokenizer context object, which can be used to manage the | ||
function, similar to how other tokens are handled in the library. | ||
This approach allows us to maintain a native, specification-compliant CSS tokenizer with minimal overhead while also | ||
providing the flexibility to manage special pseudo-classes in a custom way. | ||
In essence, the Extended CSS tokenizer is a standard CSS tokenizer with custom function handlers for special | ||
pseudo-classes. | ||
### No new token types | ||
It's crucial to emphasize that our implementation remains committed to the token types specified in the CSS W3C | ||
standards. We do not introduce new token types, ensuring that our tokenizer stays in harmony with the official CSS | ||
Syntax Level 3 specification. This dedication to adhering to industry standards and best practices guarantees that our | ||
library maintains compatibility and consistency with CSS-related tools and workflows. | ||
By preserving the standard CSS token types, we aim to provide users with a reliable and seamless experience while | ||
working with CSS, upholding the integrity of the language as defined by the W3C. | ||
## Example | ||
Here's a straightforward example of how to use the library: | ||
```js | ||
// `tokenize` is a regular CSS tokenizer (and doesn't support Extended CSS) | ||
// `tokenizeExtended` is an Extended CSS tokenizer | ||
const { tokenize, tokenizeExtended, getFormattedTokenName } = require('@adguard/css-tokenizer'); | ||
// Input to tokenize | ||
const css = `div:contains(aa'bb) { display: none !important; }`; | ||
// Prepare table | ||
const rows = []; | ||
rows.push(['Token', 'Start', 'End', 'Fragment']); | ||
// Tokenize the input - feel free to try `tokenize` and `tokenizeExtended` | ||
tokenizeExtended(css, (token, start, end) => { | ||
rows.push([getFormattedTokenName(token), start, end, css.substring(start, end)]); | ||
}); | ||
// Print the tokenization result as a table | ||
console.table(rows); | ||
``` | ||
## API | ||
Tokenization is accomplished by calling the `tokenize` or `tokenizeExtended` function. Both functions accept the following | ||
arguments: | ||
- `source`: The CSS source string to tokenize. | ||
- `onToken`: A callback function invoked for each token found in the source string, with the following arguments: | ||
<!-- TODO: Add link --> | ||
- `token`: The token type (you can see token types here). | ||
- `start`: The starting index of the token in the source string. | ||
- `end`: The ending index of the token in the source string. | ||
- `onError`: A callback function called when an error occurs during tokenization. Errors do not break the tokenization | ||
process, as the tokenizer is tolerant and attempts to recover from errors in line with the CSS Syntax Level 3 | ||
specification. The callback function accepts the following arguments: | ||
- `message`: The error message. | ||
- `start`: The starting index of the error in the source string. | ||
- `end`: The ending index of the error in the source string. | ||
- `functionHandlers`: This allows for the customized handling of functions. Map keys correspond to function names, | ||
while the values are void callback functions serving as "tokenizer context" functions. These functions can be used to | ||
manage pseudo-classes and have only one argument: the shared tokenizer context object. | ||
> [!NOTE] | ||
> Our API and token list are also compatible with [CSSTree][css-tree-repo]'s tokenizer API, and in the long term, we | ||
> plan to integrate this library into CSSTree via our [ECSSTree library][ecss-tree-repo], see | ||
> [this issue][css-tree-issue] for more details. | ||
## Ideas & Questions | ||
If you have any questions or ideas for new features, please [open an issue][new-issue-url] or a | ||
[discussion][discussions-url]. We will be happy to discuss it with you. | ||
## License | ||
This project is licensed under the MIT license. See the [LICENSE][license-url] file for details. | ||
[css-syntax]: https://www.w3.org/TR/css-syntax-3/ | ||
[css-tree-issue]: https://github.com/csstree/csstree/issues/253 | ||
[css-tree-repo]: https://github.com/csstree/csstree | ||
[discussions-url]: https://github.com/AdguardTeam/tsurlfilter/discussions | ||
[ecss-tree-repo]: https://github.com/AdguardTeam/ecsstree | ||
[install-size-badge]: https://packagephobia.com/badge?p=@adguard/css-tokenizer | ||
[install-size-url]: https://packagephobia.com/result?p=@adguard/css-tokenizer | ||
[license-badge]: https://img.shields.io/npm/l/@adguard/css-tokenizer | ||
[license-url]: https://github.com/AdguardTeam/tsurlfilter/blob/master/packages/css-tokenizer/LICENSE | ||
[new-issue-url]: https://github.com/AdguardTeam/tsurlfilter/issues/new | ||
[npm-badge]: https://img.shields.io/npm/v/@adguard/css-tokenizer | ||
[npm-url]: https://www.npmjs.com/package/@adguard/css-tokenizer | ||
[xpath-mdn]: https://developer.mozilla.org/en-US/docs/Web/XPath |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
# CSS Tokenizer benchmark | ||
|
||
This benchmark is used to compare the performance of this library's tokenizers with other CSS tokenizers. | ||
|
||
## Usage | ||
|
||
Simply run the following command to run the benchmark: | ||
|
||
```sh | ||
yarn benchmark | ||
``` | ||
|
||
This will run the build for the library and then run the benchmark. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,128 @@ | ||
/* eslint-disable no-console */ | ||
/* eslint-disable import/no-extraneous-dependencies */ | ||
/** | ||
* @file Configuration for the benchmarking script | ||
* | ||
* @see {@link https://github.com/stylelint/css-parser/issues/1} | ||
*/ | ||
|
||
import * as CssTree from 'css-tree'; | ||
import * as CssToolsCssTokenizer from '@csstools/css-tokenizer'; | ||
import * as CssToolsTokenizer from '@csstools/tokenizer'; | ||
// Note: this module has no types | ||
import * as parseCss from 'parse-css/parse-css'; | ||
// Note: this is an ES module, but we use esbuild to make a bundle on the fly before running the benchmark | ||
import * as cssLex from 'csslex'; | ||
|
||
// eslint-disable-next-line import/no-relative-packages, import/extensions | ||
import * as AdGuardCssTokenizer from '../dist/csstokenizer'; | ||
import { type Resource, type Tokenizer } from './interfaces'; | ||
|
||
// Add `tokenize` function to the `CssTree` module | ||
// This is a type-only module augmentation: it declares `tokenize(css, callback)` on 'css-tree' so that the | ||
// `CssTree.tokenize(...)` call in the `tokenizers` list below type-checks. The callback is declared to receive | ||
// three numbers (`token`, `start`, `end`) and the function itself returns nothing. | ||
// NOTE(review): presumably needed because the typings in use do not expose `tokenize` - confirm. | ||
declare module 'css-tree' { | ||
export function tokenize(css: string, callback: (token: number, start: number, end: number) => void): void; | ||
} | ||
|
||
/** | ||
* Resources to benchmark | ||
* | ||
* Each entry pairs a display `name` with the `url` of the resource. | ||
* The two filter-list entries additionally set `adblock: true`; presumably this marks the resource as an | ||
* adblock filter list rather than a plain CSS file (confirm against the `Resource` type in `./interfaces`). | ||
* | ||
* Note that the URLs point at `@latest` / `master`, so the referenced content is not pinned to a version. | ||
*/ | ||
export const resources: Resource[] = [ | ||
{ | ||
name: 'Bootstrap CSS', | ||
url: 'https://cdn.jsdelivr.net/npm/bootstrap@latest/dist/css/bootstrap.css', | ||
}, | ||
{ | ||
name: 'Bulma CSS', | ||
url: 'https://cdn.jsdelivr.net/npm/bulma@latest/css/bulma.css', | ||
}, | ||
{ | ||
name: 'AdGuard Base List', | ||
url: 'https://raw.githubusercontent.com/AdguardTeam/FiltersRegistry/master/filters/filter_2_Base/filter.txt', | ||
adblock: true, | ||
}, | ||
{ | ||
name: 'uBlock Base List', | ||
url: 'https://raw.githubusercontent.com/uBlockOrigin/uAssets/master/filters/filters.txt', | ||
adblock: true, | ||
}, | ||
]; | ||
|
||
/** | ||
* Tokenizers to benchmark | ||
* | ||
* Each entry wraps one library behind a common shape: a display `name` and a `tokenize(css)` function that | ||
* runs the library on the given CSS string and returns a number (a token count derived from the library's | ||
* callback invocations, iteration steps, or result length, depending on the library's API). | ||
*/ | ||
export const tokenizers: Tokenizer[] = [ | ||
{ | ||
// https://github.com/AdguardTeam/tsurlfilter/tree/master/packages/css-tokenizer | ||
name: '@adguard/css-tokenizer', | ||
tokenize: (css: string) => { | ||
// Callback-based API: count how many times the token callback is invoked | ||
let count = 0; | ||
AdGuardCssTokenizer.tokenize(css, () => { count += 1; }); | ||
return count; | ||
}, | ||
}, | ||
{ | ||
// https://github.com/AdguardTeam/tsurlfilter/tree/master/packages/css-tokenizer | ||
name: '@adguard/css-tokenizer (extended)', | ||
tokenize: (css: string) => { | ||
// Same counting approach as above, but using the Extended CSS entry point | ||
let count = 0; | ||
AdGuardCssTokenizer.tokenizeExtended(css, () => { count += 1; }); | ||
return count; | ||
}, | ||
}, | ||
{ | ||
// https://github.com/csstree/csstree | ||
name: 'css-tree', | ||
tokenize: (css: string) => { | ||
// Callback-based API as well; the callback arguments are ignored, only invocations are counted | ||
let count = 0; | ||
CssTree.tokenize(css, () => { count += 1; }); | ||
return count; | ||
}, | ||
}, | ||
{ | ||
// https://github.com/csstools/tokenizer | ||
name: '@csstools/tokenizer', | ||
tokenize: (css: string) => { | ||
// Pull-based API: `tokenize` returns a function that is called repeatedly until its result is `done` | ||
// NOTE(review): the counter starts at 1 and is also incremented for every call that is not `done`; | ||
// confirm against the library's iteration protocol that this does not over-count by one. | ||
let count = 1; // first token | ||
const tokenizer = CssToolsTokenizer.tokenize(css); | ||
while (!tokenizer().done) { | ||
count += 1; | ||
} | ||
return count; | ||
}, | ||
}, | ||
{ | ||
// https://github.com/csstools/postcss-plugins/tree/main/packages/css-tokenizer | ||
name: '@csstools/css-tokenizer', | ||
tokenize: (css: string) => { | ||
// The result exposes `length`, which is used as the token count | ||
return CssToolsCssTokenizer.tokenize({ css }).length; | ||
}, | ||
}, | ||
{ | ||
// https://github.com/tabatkins/parse-css | ||
name: 'parse-css', | ||
tokenize: (css: string) => { | ||
// This tokenizer uses console.log while consuming numbers | ||
// so `console.log` is swapped for a no-op while it runs and the original is kept for restoring | ||
const { log } = console; | ||
console.log = () => {}; | ||
|
||
let tokens = 0; | ||
|
||
try { | ||
// Run the tokenizer | ||
tokens = parseCss.tokenize(css).length; | ||
} finally { | ||
// Restore console.log | ||
// (done in `finally` so logging is restored even if the tokenizer throws) | ||
console.log = log; | ||
} | ||
|
||
return tokens; | ||
}, | ||
}, | ||
{ | ||
// https://github.com/keithamus/csslex | ||
name: 'csslex', | ||
tokenize: (css: string) => { | ||
// The result of `lex` is materialized with `Array.from` so that its length can be read | ||
return Array.from(cssLex.lex(css)).length; | ||
}, | ||
}, | ||
]; | ||
Oops, something went wrong.