diff --git a/packages/parse5-html-rewriting-stream/lib/index.ts b/packages/parse5-html-rewriting-stream/lib/index.ts index 24d6e6205..1d0694062 100644 --- a/packages/parse5-html-rewriting-stream/lib/index.ts +++ b/packages/parse5-html-rewriting-stream/lib/index.ts @@ -1,4 +1,4 @@ -import type { Token, Location } from 'parse5/dist/common/token.js'; +import type { Location } from 'parse5/dist/common/token.js'; import { SAXParser, EndTag, StartTag, Doctype, Text, Comment, SaxToken } from 'parse5-sax-parser'; import { escapeString } from 'parse5/dist/serializer/index.js'; @@ -73,9 +73,9 @@ export class RewritingStream extends SAXParser { } // Events - protected override _handleToken(token: Token): boolean { - if (!super._handleToken(token)) { - this.emitRaw(this._getRawHtml(token.location!)); + protected override emitIfListenerExists(eventName: string, token: SaxToken): boolean { + if (!super.emitIfListenerExists(eventName, token)) { + this.emitRaw(this._getRawHtml(token.sourceCodeLocation!)); } // NOTE: don't skip new lines after <pre>
and other tags, diff --git a/packages/parse5-sax-parser/lib/index.ts b/packages/parse5-sax-parser/lib/index.ts index 304047870..53e492599 100644 --- a/packages/parse5-sax-parser/lib/index.ts +++ b/packages/parse5-sax-parser/lib/index.ts @@ -1,6 +1,13 @@ import { Transform } from 'node:stream'; -import { Tokenizer } from 'parse5/dist/tokenizer/index.js'; -import { TokenType, Token, CharacterToken, Attribute, Location } from 'parse5/dist/common/token.js'; +import type { Tokenizer, TokenHandler } from 'parse5/dist/tokenizer/index.js'; +import type { + Attribute, + Location, + TagToken, + CommentToken, + DoctypeToken, + CharacterToken, +} from 'parse5/dist/common/token.js'; import { DevNullStream } from './dev-null-stream.js'; import { ParserFeedbackSimulator } from './parser-feedback-simulator.js'; @@ -39,13 +46,13 @@ export interface SAXParserOptions { * }); * ``` */ -export class SAXParser extends Transform { +export class SAXParser extends Transform implements TokenHandler { protected options: SAXParserOptions; - protected tokenizer: Tokenizer; protected parserFeedbackSimulator: ParserFeedbackSimulator; - private pendingText: CharacterToken | null = null; + private pendingText: Text | null = null; private lastChunkWritten = false; private stopped = false; + protected tokenizer: Tokenizer; /** * @param options Parsing options. @@ -58,8 +65,8 @@ export class SAXParser extends Transform { ...options, }; - this.tokenizer = new Tokenizer(this.options); - this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.tokenizer); + this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.options, this); + this.tokenizer = this.parserFeedbackSimulator.tokenizer; // NOTE: always pipe stream to the /dev/null stream to avoid // `highWaterMark` hit even if we don't have consumers. 
@@ -127,96 +134,95 @@ export class SAXParser extends Transform { } private _runParsingLoop(): void { - let token = null; + while (!this.stopped && this.tokenizer.active) { + this.tokenizer.getNextToken(); + } + } - do { - token = this.parserFeedbackSimulator.getNextToken(); + /** @internal */ + onCharacter({ chars, location }: CharacterToken): void { + if (this.pendingText === null) { + this.pendingText = { text: chars, sourceCodeLocation: location }; + } else { + this.pendingText.text += chars; - if (token.type === TokenType.HIBERNATION) { - break; + if (location && this.pendingText.sourceCodeLocation) { + const { endLine, endCol, endOffset } = location; + this.pendingText.sourceCodeLocation = { + ...this.pendingText.sourceCodeLocation, + endLine, + endCol, + endOffset, + }; } + } + } - if ( - token.type === TokenType.CHARACTER || - token.type === TokenType.WHITESPACE_CHARACTER || - token.type === TokenType.NULL_CHARACTER - ) { - if (this.pendingText === null) { - token.type = TokenType.CHARACTER; - this.pendingText = token; - } else { - this.pendingText.chars += token.chars; - - if (token.location && this.pendingText.location) { - const { endLine, endCol, endOffset } = token.location; - this.pendingText.location = { - ...this.pendingText.location, - endLine, - endCol, - endOffset, - }; - } - } - } else { - this._emitPendingText(); - this._handleToken(token); - } - } while (!this.stopped && token.type !== TokenType.EOF); + /** @internal */ + onWhitespaceCharacter(token: CharacterToken): void { + this.onCharacter(token); } - protected _handleToken(token: Token): boolean { - switch (token.type) { - case TokenType.EOF: { - return true; - } - case TokenType.START_TAG: { - const startTag: StartTag = { - tagName: token.tagName, - attrs: token.attrs, - selfClosing: token.selfClosing, - sourceCodeLocation: token.location, - }; - return this._emitIfListenerExists('startTag', startTag); - } - case TokenType.END_TAG: { - const endTag: EndTag = { - tagName: token.tagName, - 
sourceCodeLocation: token.location, - }; - return this._emitIfListenerExists('endTag', endTag); - } - case TokenType.COMMENT: { - const comment: Comment = { - text: token.data, - sourceCodeLocation: token.location, - }; - return this._emitIfListenerExists('comment', comment); - } - case TokenType.DOCTYPE: { - const doctype: Doctype = { - name: token.name, - publicId: token.publicId, - systemId: token.systemId, - sourceCodeLocation: token.location, - }; - return this._emitIfListenerExists('doctype', doctype); - } - case TokenType.CHARACTER: - case TokenType.NULL_CHARACTER: - case TokenType.WHITESPACE_CHARACTER: { - const text: Text = { - text: token.chars, - sourceCodeLocation: token.location, - }; - return this._emitIfListenerExists('text', text); - } - case TokenType.HIBERNATION: { - return this._emitIfListenerExists('hibernation', {}); - } - } + /** @internal */ + onNullCharacter(token: CharacterToken): void { + this.onCharacter(token); + } + + /** @internal */ + onEof(): void { + this._emitPendingText(); + this.stopped = true; + } + + /** @internal */ + onStartTag(token: TagToken): void { + this._emitPendingText(); + + const startTag: StartTag = { + tagName: token.tagName, + attrs: token.attrs, + selfClosing: token.selfClosing, + sourceCodeLocation: token.location, + }; + this.emitIfListenerExists('startTag', startTag); + } + + /** @internal */ + onEndTag(token: TagToken): void { + this._emitPendingText(); + + const endTag: EndTag = { + tagName: token.tagName, + sourceCodeLocation: token.location, + }; + this.emitIfListenerExists('endTag', endTag); + } + + /** @internal */ + onDoctype(token: DoctypeToken): void { + this._emitPendingText(); + + const doctype: Doctype = { + name: token.name, + publicId: token.publicId, + systemId: token.systemId, + sourceCodeLocation: token.location, + }; + this.emitIfListenerExists('doctype', doctype); + } + + /** @internal */ + onComment(token: CommentToken): void { + this._emitPendingText(); + + const comment: Comment = { + 
text: token.data, + sourceCodeLocation: token.location, + }; + this.emitIfListenerExists('comment', comment); } - private _emitIfListenerExists(eventName: string, token: SaxToken): boolean { + protected emitIfListenerExists(eventName: string, token: SaxToken): boolean { if (this.listenerCount(eventName) === 0) { return false; } @@ -232,7 +238,7 @@ export class SAXParser extends Transform { private _emitPendingText(): void { if (this.pendingText !== null) { - this._handleToken(this.pendingText); + this.emitIfListenerExists('text', this.pendingText); this.pendingText = null; } } diff --git a/packages/parse5-sax-parser/lib/parser-feedback-simulator.ts b/packages/parse5-sax-parser/lib/parser-feedback-simulator.ts index 73b4221e0..e51c6ee31 100644 --- a/packages/parse5-sax-parser/lib/parser-feedback-simulator.ts +++ b/packages/parse5-sax-parser/lib/parser-feedback-simulator.ts @@ -1,63 +1,74 @@ -import { Tokenizer, TokenizerMode } from 'parse5/dist/tokenizer/index.js'; -import { TokenType, Token, TagToken } from 'parse5/dist/common/token.js'; +import { Tokenizer, TokenizerOptions, TokenizerMode, TokenHandler } from 'parse5/dist/tokenizer/index.js'; +import { TokenType, TagToken, CommentToken, DoctypeToken, CharacterToken, EOFToken } from 'parse5/dist/common/token.js'; import * as foreignContent from 'parse5/dist/common/foreign-content.js'; import * as unicode from 'parse5/dist/common/unicode.js'; import { TAG_ID as $, TAG_NAMES as TN, NAMESPACES as NS, getTagID } from 'parse5/dist/common/html.js'; //ParserFeedbackSimulator //Simulates adjustment of the Tokenizer which performed by standard parser during tree construction. 
-export class ParserFeedbackSimulator { +export class ParserFeedbackSimulator implements TokenHandler { private namespaceStack: NS[] = []; private inForeignContent = false; public skipNextNewLine = false; + public tokenizer: Tokenizer; - constructor(private tokenizer: Tokenizer) { + constructor(options: TokenizerOptions, private handler: TokenHandler) { + this.tokenizer = new Tokenizer(options, this); this._enterNamespace(NS.HTML); } - public getNextToken(): Token { - const token = this.tokenizer.getNextToken(); + /** @internal */ + onNullCharacter(token: CharacterToken): void { + this.skipNextNewLine = false; - switch (token.type) { - case TokenType.START_TAG: { - this._handleStartTagToken(token); - break; - } - case TokenType.END_TAG: { - this._handleEndTagToken(token); - break; - } - - case TokenType.NULL_CHARACTER: { - this.skipNextNewLine = false; - if (this.inForeignContent) { - token.type = TokenType.CHARACTER; - token.chars = unicode.REPLACEMENT_CHARACTER; - } - break; - } - case TokenType.WHITESPACE_CHARACTER: { - if (this.skipNextNewLine && token.chars.charCodeAt(0) === unicode.CODE_POINTS.LINE_FEED) { - this.skipNextNewLine = false; + if (this.inForeignContent) { + this.handler.onCharacter({ + type: TokenType.CHARACTER, + chars: unicode.REPLACEMENT_CHARACTER, + location: token.location, + }); + } else { + this.handler.onNullCharacter(token); + } + } - if (token.chars.length === 1) { - return this.getNextToken(); - } + /** @internal */ + onWhitespaceCharacter(token: CharacterToken): void { + if (this.skipNextNewLine && token.chars.charCodeAt(0) === unicode.CODE_POINTS.LINE_FEED) { + this.skipNextNewLine = false; - token.chars = token.chars.substr(1); - } - break; - } - case TokenType.HIBERNATION: { - // Ignore - break; - } - default: { - this.skipNextNewLine = false; + if (token.chars.length === 1) { + return; } + + token.chars = token.chars.substr(1); } - return token; + this.handler.onWhitespaceCharacter(token); + } + + /** @internal */ + 
onCharacter(token: CharacterToken): void { + this.skipNextNewLine = false; + this.handler.onCharacter(token); + } + + /** @internal */ + onComment(token: CommentToken): void { + this.skipNextNewLine = false; + this.handler.onComment(token); + } + + /** @internal */ + onDoctype(token: DoctypeToken): void { + this.skipNextNewLine = false; + this.handler.onDoctype(token); + } + + /** @internal */ + onEof(token: EOFToken): void { + this.skipNextNewLine = false; + this.handler.onEof(token); } //Namespace stack mutations @@ -103,7 +114,8 @@ export class ParserFeedbackSimulator { } } - private _handleStartTagToken(token: TagToken): void { + /** @internal */ + onStartTag(token: TagToken): void { let tn = token.tagID; switch (tn) { @@ -122,24 +134,23 @@ export class ParserFeedbackSimulator { if (this.inForeignContent) { if (foreignContent.causesExit(token)) { this._leaveCurrentNamespace(); - return; - } - - const currentNs = this.namespaceStack[0]; - - if (currentNs === NS.MATHML) { - foreignContent.adjustTokenMathMLAttrs(token); - } else if (currentNs === NS.SVG) { - foreignContent.adjustTokenSVGTagName(token); - foreignContent.adjustTokenSVGAttrs(token); - } + } else { + const currentNs = this.namespaceStack[0]; + + if (currentNs === NS.MATHML) { + foreignContent.adjustTokenMathMLAttrs(token); + } else if (currentNs === NS.SVG) { + foreignContent.adjustTokenSVGTagName(token); + foreignContent.adjustTokenSVGAttrs(token); + } - foreignContent.adjustTokenXMLAttrs(token); + foreignContent.adjustTokenXMLAttrs(token); - tn = token.tagID; + tn = token.tagID; - if (!token.selfClosing && foreignContent.isIntegrationPoint(tn, currentNs, token.attrs)) { - this._enterNamespace(NS.HTML); + if (!token.selfClosing && foreignContent.isIntegrationPoint(tn, currentNs, token.attrs)) { + this._enterNamespace(NS.HTML); + } } } else { switch (tn) { @@ -160,9 +171,12 @@ export class ParserFeedbackSimulator { this._ensureTokenizerMode(tn); } + + this.handler.onStartTag(token); } - private 
_handleEndTagToken(token: TagToken): void { + /** @internal */ + onEndTag(token: TagToken): void { let tn = token.tagID; if (!this.inForeignContent) { @@ -191,5 +205,7 @@ export class ParserFeedbackSimulator { if (this.namespaceStack[0] === NS.SVG) { foreignContent.adjustTokenSVGTagName(token); } + + this.handler.onEndTag(token); } } diff --git a/packages/parse5-sax-parser/test/parser-feedback-simulator.test.ts b/packages/parse5-sax-parser/test/parser-feedback-simulator.test.ts index 1015398a6..858769a6f 100644 --- a/packages/parse5-sax-parser/test/parser-feedback-simulator.test.ts +++ b/packages/parse5-sax-parser/test/parser-feedback-simulator.test.ts @@ -1,13 +1,11 @@ -import { Tokenizer } from 'parse5/dist/tokenizer/index.js'; -import type { Token } from 'parse5/dist/common/token.js'; import { generateTokenizationTests } from 'parse5-test-utils/utils/generate-tokenization-tests.js'; import { ParserFeedbackSimulator } from '../lib/parser-feedback-simulator.js'; const feedbackPath = new URL('../../../test/data/parser-feedback', import.meta.url); -generateTokenizationTests('ParserFeedbackSimulator', 'ParserFeedbackSimulator', feedbackPath.pathname, () => { - const tokenizer = new Tokenizer({}); - const feedbackSimulator = new ParserFeedbackSimulator(tokenizer); - - return { tokenizer, getNextToken: (): Token => feedbackSimulator.getNextToken() }; -}); +generateTokenizationTests( + 'ParserFeedbackSimulator', + 'ParserFeedbackSimulator', + feedbackPath.pathname, + (handler) => new ParserFeedbackSimulator({}, handler).tokenizer +); diff --git a/packages/parse5/lib/parser/index.ts b/packages/parse5/lib/parser/index.ts index b3045620a..bc60640be 100644 --- a/packages/parse5/lib/parser/index.ts +++ b/packages/parse5/lib/parser/index.ts @@ -1,4 +1,5 @@ -import { Tokenizer, TokenizerMode } from '../tokenizer/index.js'; +import { TokenizerMode } from '../tokenizer/index.js'; +import { QueuedTokenizer } from '../tokenizer/queued.js'; import { OpenElementStack } from 
'./open-element-stack.js'; import { FormattingElementList, ElementEntry, EntryType } from './formatting-element-list.js'; import * as defaultTreeAdapter from '../tree-adapters/default.js'; @@ -146,7 +147,7 @@ export class Parser{ this.document = document ?? this.treeAdapter.createDocument(); - this.tokenizer = new Tokenizer(this.options); + this.tokenizer = new QueuedTokenizer(this.options); this.activeFormattingElements = new FormattingElementList(this.treeAdapter); this.fragmentContextID = fragmentContext ? getTagID(this.treeAdapter.getTagName(fragmentContext)) : $.UNKNOWN; @@ -210,8 +211,7 @@ export class Parser { return fragment; } - tokenizer: Tokenizer; - + tokenizer: QueuedTokenizer; stopped = false; insertionMode = InsertionMode.INITIAL; originalInsertionMode = InsertionMode.INITIAL; diff --git a/packages/parse5/lib/tokenizer/index.test.ts b/packages/parse5/lib/tokenizer/index.test.ts index 1c1a0c67a..f34733077 100644 --- a/packages/parse5/lib/tokenizer/index.test.ts +++ b/packages/parse5/lib/tokenizer/index.test.ts @@ -1,24 +1,18 @@ import * as parse5 from 'parse5'; import { Tokenizer } from 'parse5/dist/tokenizer/index.js'; -import type { Token } from 'parse5/dist/common/token'; import { generateTokenizationTests } from 'parse5-test-utils/utils/generate-tokenization-tests.js'; const dataPath = new URL('../../../../test/data/html5lib-tests/tokenizer', import.meta.url); +const tokenizerOpts = { + sourceCodeLocationInfo: true, +}; -generateTokenizationTests('tokenizer', 'Tokenizer', dataPath.pathname, ({ errors }) => { - const tokenizer = new Tokenizer({ - sourceCodeLocationInfo: true, - onParseError(err): void { - errors.push({ - code: err.code, - line: err.startLine, - col: err.startCol, - }); - }, - }); - - return { tokenizer, getNextToken: (): Token => tokenizer.getNextToken() }; -}); +generateTokenizationTests( + 'tokenizer', + 'Tokenizer', + dataPath.pathname, + (handler) => new Tokenizer(tokenizerOpts, handler) +); describe('tokenizer', () => { 
it('Regression - `<<` in comment parses correctly (GH-325)', () => { diff --git a/packages/parse5/lib/tokenizer/index.ts b/packages/parse5/lib/tokenizer/index.ts index eead9ec90..8968bb9fa 100644 --- a/packages/parse5/lib/tokenizer/index.ts +++ b/packages/parse5/lib/tokenizer/index.ts @@ -13,6 +13,7 @@ import { CharacterToken, DoctypeToken, TagToken, + EOFToken, getTokenAttr, CommentToken, Attribute, @@ -53,8 +54,6 @@ const C1_CONTROLS_REFERENCE_REPLACEMENTS = new Map([ [0x9f, 0x01_78], ]); -const HIBERNATION_TOKEN: Token = { type: TokenType.HIBERNATION, location: null }; - //States const enum State { DATA, @@ -203,11 +202,29 @@ function isScriptDataDoubleEscapeSequenceEnd(cp: number): boolean { return isWhitespace(cp) || cp === $.SOLIDUS || cp === $.GREATER_THAN_SIGN; } +export interface TokenizerOptions { + sourceCodeLocationInfo?: boolean; +} + +export interface TokenHandler { + onComment(token: CommentToken): void; + onDoctype(token: DoctypeToken): void; + onStartTag(token: TagToken): void; + onEndTag(token: TagToken): void; + onEof(token: EOFToken): void; + onCharacter(token: CharacterToken): void; + onNullCharacter(token: CharacterToken): void; + onWhitespaceCharacter(token: CharacterToken): void; + + onParseError?: ParserErrorHandler | null; +} + //Tokenizer export class Tokenizer { public preprocessor: Preprocessor; - private tokenQueue: Token[] = []; + /** Indicates that the next token has been emitted, and `getNextToken` should return. */ + private hasEmitted = false; public allowCDATA = false; public lastStartTagName = ''; @@ -225,24 +242,19 @@ export class Tokenizer { private currentToken: Token | null = null; private currentAttr: Attribute = { name: '', value: '' }; - private addLocationInfo; - private onParseError; - - constructor(options: { sourceCodeLocationInfo?: boolean; onParseError?: ParserErrorHandler | null }) { - this.addLocationInfo = !!options.sourceCodeLocationInfo; - this.onParseError = options.onParseError ?? 
null; - this.preprocessor = new Preprocessor(options); + constructor(private options: TokenizerOptions, private handler: TokenHandler) { + this.preprocessor = new Preprocessor(handler); this.currentLocation = this.getCurrentLocation(-1); } //Errors private _err(code: ERR): void { - this.onParseError?.(this.preprocessor.getError(code)); + this.handler.onParseError?.(this.preprocessor.getError(code)); } // NOTE: `offset` may never run across line boundaries. private getCurrentLocation(offset: number): Location | null { - if (!this.addLocationInfo) { + if (!this.options.sourceCodeLocationInfo) { return null; } @@ -257,8 +269,9 @@ export class Tokenizer { } //API - public getNextToken(): Token { - while (this.tokenQueue.length === 0 && this.active) { + public getNextToken(): void { + this.hasEmitted = false; + while (!this.hasEmitted && this.active) { this.consumedAfterSnapshot = 0; const cp = this._consume(); @@ -267,8 +280,6 @@ export class Tokenizer { this._callState(cp); } } - - return this.tokenQueue.shift()!; } public write(chunk: string, isLastChunk: boolean): void { @@ -286,7 +297,6 @@ export class Tokenizer { if (this.preprocessor.endOfChunkHit) { this._unconsume(this.consumedAfterSnapshot); this.active = false; - this.tokenQueue.push(HIBERNATION_TOKEN); return true; } @@ -378,18 +388,6 @@ export class Tokenizer { }; } - private _createEOFToken(): void { - const location = this.getCurrentLocation(0); - - if (location) { - location.endLine = location.startLine; - location.endCol = location.startCol; - location.endOffset = location.startOffset; - } - - this.currentToken = { type: TokenType.EOF, location }; - } - //Tag attributes private _createAttr(attrNameFirstCh: string): void { this.currentAttr = { @@ -426,46 +424,53 @@ export class Tokenizer { } //Token emission - private _emitCurrentToken(): void { - const ct = this.currentToken!; - + private prepareToken(ct: Token): void { this._emitCurrentCharacterToken(ct.location); - this.currentToken = null; - //NOTE: 
store emited start tag's tagName to determine is the following end tag token is appropriate. - switch (ct.type) { - case TokenType.START_TAG: { - ct.tagID = getTagID(ct.tagName); - this.lastStartTagName = ct.tagName; - break; - } - case TokenType.END_TAG: { - ct.tagID = getTagID(ct.tagName); - - if (ct.attrs.length > 0) { - this._err(ERR.endTagWithAttributes); - } - - if (ct.selfClosing) { - this._err(ERR.endTagWithTrailingSolidus); - } - break; - } - default: - // Do nothing - } - - if (ct.location && ct.type !== TokenType.EOF) { + if (ct.location) { ct.location.endLine = this.preprocessor.line; ct.location.endCol = this.preprocessor.col + 1; ct.location.endOffset = this.preprocessor.offset + 1; } - this.tokenQueue.push(ct); + this.hasEmitted = true; this.currentLocation = this.getCurrentLocation(-1); } + private emitCurrentTagToken(): void { + const ct = this.currentToken as TagToken; + + this.prepareToken(ct); + + ct.tagID = getTagID(ct.tagName); + + if (ct.type === TokenType.START_TAG) { + this.lastStartTagName = ct.tagName; + this.handler.onStartTag(ct); + } else { + if (ct.attrs.length > 0) { + this._err(ERR.endTagWithAttributes); + } + + if (ct.selfClosing) { + this._err(ERR.endTagWithTrailingSolidus); + } + + this.handler.onEndTag(ct); + } + } + + private emitCurrentComment(ct: CommentToken): void { + this.prepareToken(ct); + this.handler.onComment(ct); + } + + private emitCurrentDoctype(ct: DoctypeToken): void { + this.prepareToken(ct); + this.handler.onDoctype(ct); + } + private _emitCurrentCharacterToken(nextLocation: Location | null): void { if (this.currentCharacterToken) { //NOTE: if we have pending character token make it's end location equal to the @@ -476,14 +481,38 @@ export class Tokenizer { this.currentCharacterToken.location.endOffset = nextLocation.startOffset; } - this.tokenQueue.push(this.currentCharacterToken); + switch (this.currentCharacterToken.type) { + case TokenType.CHARACTER: { + this.handler.onCharacter(this.currentCharacterToken); 
+ break; + } + case TokenType.NULL_CHARACTER: { + this.handler.onNullCharacter(this.currentCharacterToken); + break; + } + case TokenType.WHITESPACE_CHARACTER: { + this.handler.onWhitespaceCharacter(this.currentCharacterToken); + break; + } + } + + this.hasEmitted = true; this.currentCharacterToken = null; } } private _emitEOFToken(): void { - this._createEOFToken(); - this._emitCurrentToken(); + const location = this.getCurrentLocation(0); + + if (location) { + location.endLine = location.startLine; + location.endCol = location.startCol; + location.endOffset = location.startOffset; + } + + this._emitCurrentCharacterToken(location); + this.handler.onEof({ type: TokenType.EOF, location }); + this.hasEmitted = true; } //Characters emission @@ -497,16 +526,17 @@ export class Tokenizer { //2)TokenType.WHITESPACE_CHARACTER - any whitespace/new-line character sequences (e.g. '\n \r\t \f') //3)TokenType.CHARACTER - any character sequence which don't belong to groups 1 and 2 (e.g. 'abcdef1234@@#$%^') private _appendCharToCurrentCharacterToken(type: CharacterToken['type'], ch: string): void { - if (this.currentCharacterToken && this.currentCharacterToken.type !== type) { - this.currentLocation = this.getCurrentLocation(0); - this._emitCurrentCharacterToken(this.currentLocation); - } - if (this.currentCharacterToken) { - this.currentCharacterToken.chars += ch; - } else { - this._createCharacterToken(type, ch); + if (this.currentCharacterToken.type !== type) { + this.currentLocation = this.getCurrentLocation(0); + this._emitCurrentCharacterToken(this.currentLocation); + } else { + this.currentCharacterToken.chars += ch; + return; + } } + + this._createCharacterToken(type, ch); } private _emitCodePoint(cp: number): void { @@ -1142,7 +1172,7 @@ export class Tokenizer { } case $.GREATER_THAN_SIGN: { this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentTagToken(); break; } case $.NULL: { @@ -1213,7 +1243,7 @@ export class Tokenizer { } case $.GREATER_THAN_SIGN: 
{ this._advanceBy(this.lastStartTagName.length); - this._emitCurrentToken(); + this.emitCurrentTagToken(); this.state = State.DATA; return false; } @@ -1710,7 +1740,7 @@ export class Tokenizer { } case $.GREATER_THAN_SIGN: { this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentTagToken(); break; } case $.EOF: { @@ -1748,7 +1778,7 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._err(ERR.missingAttributeValue); this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentTagToken(); break; } default: { @@ -1836,7 +1866,7 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._leaveAttrValue(); this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentTagToken(); break; } case $.NULL: { @@ -1884,7 +1914,7 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._leaveAttrValue(); this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentTagToken(); break; } case $.EOF: { @@ -1905,9 +1935,10 @@ export class Tokenizer { private _stateSelfClosingStartTag(cp: number): void { switch (cp) { case $.GREATER_THAN_SIGN: { - (this.currentToken as TagToken).selfClosing = true; + const token = this.currentToken as TagToken; + token.selfClosing = true; this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentTagToken(); break; } case $.EOF: { @@ -1931,11 +1962,11 @@ export class Tokenizer { switch (cp) { case $.GREATER_THAN_SIGN: { this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentComment(token); break; } case $.EOF: { - this._emitCurrentToken(); + this.emitCurrentComment(token); this._emitEOFToken(); break; } @@ -1992,7 +2023,8 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._err(ERR.abruptClosingOfEmptyComment); this.state = State.DATA; - this._emitCurrentToken(); + const token = this.currentToken as CommentToken; + this.emitCurrentComment(token); break; } default: { @@ -2005,6 +2037,7 @@ export class Tokenizer { // Comment start dash state 
//------------------------------------------------------------------ private _stateCommentStartDash(cp: number): void { + const token = this.currentToken as CommentToken; switch (cp) { case $.HYPHEN_MINUS: { this.state = State.COMMENT_END; @@ -2013,17 +2046,17 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._err(ERR.abruptClosingOfEmptyComment); this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentComment(token); break; } case $.EOF: { this._err(ERR.eofInComment); - this._emitCurrentToken(); + this.emitCurrentComment(token); this._emitEOFToken(); break; } default: { - (this.currentToken as CommentToken).data += '-'; + token.data += '-'; this.state = State.COMMENT; this._stateComment(cp); } @@ -2052,7 +2085,7 @@ export class Tokenizer { } case $.EOF: { this._err(ERR.eofInComment); - this._emitCurrentToken(); + this.emitCurrentComment(token); this._emitEOFToken(); break; } @@ -2120,6 +2153,7 @@ export class Tokenizer { // Comment end dash state //------------------------------------------------------------------ private _stateCommentEndDash(cp: number): void { + const token = this.currentToken as CommentToken; switch (cp) { case $.HYPHEN_MINUS: { this.state = State.COMMENT_END; @@ -2127,12 +2161,12 @@ export class Tokenizer { } case $.EOF: { this._err(ERR.eofInComment); - this._emitCurrentToken(); + this.emitCurrentComment(token); this._emitEOFToken(); break; } default: { - (this.currentToken as CommentToken).data += '-'; + token.data += '-'; this.state = State.COMMENT; this._stateComment(cp); } @@ -2147,7 +2181,7 @@ export class Tokenizer { switch (cp) { case $.GREATER_THAN_SIGN: { this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentComment(token); break; } case $.EXCLAMATION_MARK: { @@ -2160,7 +2194,7 @@ export class Tokenizer { } case $.EOF: { this._err(ERR.eofInComment); - this._emitCurrentToken(); + this.emitCurrentComment(token); this._emitEOFToken(); break; } @@ -2186,12 +2220,12 @@ export class Tokenizer { 
case $.GREATER_THAN_SIGN: { this._err(ERR.incorrectlyClosedComment); this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentComment(token); break; } case $.EOF: { this._err(ERR.eofInComment); - this._emitCurrentToken(); + this.emitCurrentComment(token); this._emitEOFToken(); break; } @@ -2222,8 +2256,9 @@ export class Tokenizer { case $.EOF: { this._err(ERR.eofInDoctype); this._createDoctypeToken(null); - (this.currentToken as DoctypeToken).forceQuirks = true; - this._emitCurrentToken(); + const token = this.currentToken as DoctypeToken; + token.forceQuirks = true; + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2259,16 +2294,18 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._err(ERR.missingDoctypeName); this._createDoctypeToken(null); - (this.currentToken as DoctypeToken).forceQuirks = true; - this._emitCurrentToken(); + const token = this.currentToken as DoctypeToken; + token.forceQuirks = true; + this.emitCurrentDoctype(token); this.state = State.DATA; break; } case $.EOF: { this._err(ERR.eofInDoctype); this._createDoctypeToken(null); - (this.currentToken as DoctypeToken).forceQuirks = true; - this._emitCurrentToken(); + const token = this.currentToken as DoctypeToken; + token.forceQuirks = true; + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2294,7 +2331,7 @@ export class Tokenizer { } case $.GREATER_THAN_SIGN: { this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); break; } case $.NULL: { @@ -2305,7 +2342,7 @@ export class Tokenizer { case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2330,13 +2367,13 @@ export class Tokenizer { } case $.GREATER_THAN_SIGN: { this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - 
this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2386,13 +2423,13 @@ export class Tokenizer { this._err(ERR.missingDoctypePublicIdentifier); token.forceQuirks = true; this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2432,13 +2469,13 @@ export class Tokenizer { this._err(ERR.missingDoctypePublicIdentifier); token.forceQuirks = true; this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2469,14 +2506,14 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._err(ERR.abruptDoctypePublicIdentifier); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this.state = State.DATA; break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2504,14 +2541,14 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._err(ERR.abruptDoctypePublicIdentifier); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this.state = State.DATA; break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2536,7 +2573,7 @@ export class Tokenizer { } case $.GREATER_THAN_SIGN: { this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); break; } case $.QUOTATION_MARK: { @@ -2554,7 +2591,7 @@ export class Tokenizer { case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - 
this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2581,7 +2618,7 @@ export class Tokenizer { break; } case $.GREATER_THAN_SIGN: { - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this.state = State.DATA; break; } @@ -2598,7 +2635,7 @@ export class Tokenizer { case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2640,13 +2677,13 @@ export class Tokenizer { this._err(ERR.missingDoctypeSystemIdentifier); token.forceQuirks = true; this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2686,13 +2723,13 @@ export class Tokenizer { this._err(ERR.missingDoctypeSystemIdentifier); token.forceQuirks = true; this.state = State.DATA; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2723,14 +2760,14 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._err(ERR.abruptDoctypeSystemIdentifier); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this.state = State.DATA; break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2758,14 +2795,14 @@ export class Tokenizer { case $.GREATER_THAN_SIGN: { this._err(ERR.abruptDoctypeSystemIdentifier); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this.state = State.DATA; break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + 
this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2789,14 +2826,14 @@ export class Tokenizer { break; } case $.GREATER_THAN_SIGN: { - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this.state = State.DATA; break; } case $.EOF: { this._err(ERR.eofInDoctype); token.forceQuirks = true; - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } @@ -2811,9 +2848,11 @@ export class Tokenizer { // Bogus DOCTYPE state //------------------------------------------------------------------ private _stateBogusDoctype(cp: number): void { + const token = this.currentToken as DoctypeToken; + switch (cp) { case $.GREATER_THAN_SIGN: { - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this.state = State.DATA; break; } @@ -2822,7 +2861,7 @@ export class Tokenizer { break; } case $.EOF: { - this._emitCurrentToken(); + this.emitCurrentDoctype(token); this._emitEOFToken(); break; } diff --git a/packages/parse5/lib/tokenizer/preprocessor.ts b/packages/parse5/lib/tokenizer/preprocessor.ts index 7cb6d1ec3..7dbc82193 100644 --- a/packages/parse5/lib/tokenizer/preprocessor.ts +++ b/packages/parse5/lib/tokenizer/preprocessor.ts @@ -30,11 +30,7 @@ export class Preprocessor { public droppedBufferSize = 0; public line = 1; - onParseError: ParserErrorHandler | null; - - constructor(options: { onParseError?: ParserErrorHandler | null }) { - this.onParseError = options.onParseError ?? null; - } + constructor(private handler: { onParseError?: ParserErrorHandler | null }) {} /** The column on the current line. If we just saw a gap (eg. a surrogate pair), return the index before. 
*/ public get col(): number { @@ -62,9 +58,9 @@ export class Preprocessor { //NOTE: avoid reporting error twice on advance/retreat private lastErrOffset = -1; private _err(code: ERR): void { - if (this.onParseError && this.lastErrOffset !== this.offset) { + if (this.handler.onParseError && this.lastErrOffset !== this.offset) { this.lastErrOffset = this.offset; - this.onParseError(this.getError(code)); + this.handler.onParseError(this.getError(code)); } } @@ -210,7 +206,7 @@ export class Preprocessor { //range (ASCII alphanumeric, whitespaces, big chunk of BMP) //before going into detailed performance cost validation. const isCommonValidRange = - this.onParseError === null || + this.handler.onParseError === null || (cp > 0x1f && cp < 0x7f) || cp === $.LINE_FEED || cp === $.CARRIAGE_RETURN || diff --git a/packages/parse5/lib/tokenizer/queued.ts b/packages/parse5/lib/tokenizer/queued.ts new file mode 100644 index 000000000..6ac8178a2 --- /dev/null +++ b/packages/parse5/lib/tokenizer/queued.ts @@ -0,0 +1,105 @@ +import { TokenType, Token, CharacterToken, DoctypeToken, TagToken, EOFToken, CommentToken } from '../common/token.js'; +import { TokenHandler, Tokenizer, TokenizerOptions, TokenizerMode } from './index.js'; +import type { ParserErrorHandler } from '../common/error-codes.js'; +import type { Preprocessor } from './preprocessor.js'; + +const HIBERNATION_TOKEN: Token = { type: TokenType.HIBERNATION, location: null }; + +/** A token handler implemnetation that calls the same function for all tokens. 
*/ +export abstract class SinglePathHandler implements TokenHandler { + protected abstract handleToken(token: Token): void; + + onComment(token: CommentToken): void { + this.handleToken(token); + } + onDoctype(token: DoctypeToken): void { + this.handleToken(token); + } + onStartTag(token: TagToken): void { + this.handleToken(token); + } + onEndTag(token: TagToken): void { + this.handleToken(token); + } + onEof(token: EOFToken): void { + this.handleToken(token); + } + onCharacter(token: CharacterToken): void { + this.handleToken(token); + } + onNullCharacter(token: CharacterToken): void { + this.handleToken(token); + } + onWhitespaceCharacter(token: CharacterToken): void { + this.handleToken(token); + } +} + +class QueuedHandler extends SinglePathHandler { + private tokenQueue: Token[] = []; + + protected handleToken(token: Token): void { + this.tokenQueue.push(token); + } + + constructor(public onParseError: ParserErrorHandler | null) { + super(); + } + + public getNextToken(tokenizer: Tokenizer): Token { + while (this.tokenQueue.length === 0 && tokenizer.active) { + tokenizer.getNextToken(); + } + + if (this.tokenQueue.length === 0 && !tokenizer.active) { + this.tokenQueue.push(HIBERNATION_TOKEN); + } + + return this.tokenQueue.shift()!; + } +} + +export interface QueuedTokenizerOptions extends TokenizerOptions { + onParseError?: ParserErrorHandler | null; +} + +/** + * Provides the same interface as the old tokenizer, while allowing users to + * read data one token at a time. + */ +export class QueuedTokenizer { + private tokenizer: Tokenizer; + private handler: QueuedHandler; + + constructor(options: QueuedTokenizerOptions) { + this.handler = new QueuedHandler(options.onParseError ?? 
null); + this.tokenizer = new Tokenizer(options, this.handler); + } + + set allowCDATA(val: boolean) { + this.tokenizer.allowCDATA = val; + } + + get preprocessor(): Preprocessor { + return this.tokenizer.preprocessor; + } + get active(): boolean { + return this.tokenizer.active; + } + + set state(val: typeof TokenizerMode[keyof typeof TokenizerMode]) { + this.tokenizer.state = val; + } + + public write(chunk: string, isLastChunk: boolean): void { + this.tokenizer.write(chunk, isLastChunk); + } + + public insertHtmlAtCurrentPos(str: string): void { + this.tokenizer.insertHtmlAtCurrentPos(str); + } + + public getNextToken(): Token { + return this.handler.getNextToken(this.tokenizer); + } +} diff --git a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts index e45a464ce..fe486a0c8 100644 --- a/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts +++ b/packages/parse5/lib/tokenizer/tokenizer-location-info.test.ts @@ -1,8 +1,61 @@ import * as assert from 'node:assert'; import { Tokenizer, TokenizerMode } from './index.js'; -import { TokenType } from '../common/token.js'; +import { SinglePathHandler } from './queued.js'; +import { Location, EOFToken, Token } from '../common/token.js'; import { getSubstringByLineCol, normalizeNewLine } from 'parse5-test-utils/utils/common.js'; +interface LocationInfoTestCase { + initialMode: typeof TokenizerMode[keyof typeof TokenizerMode]; + lastStartTagName: string; + htmlChunks: string[]; +} + +/** Receives events and immediately compares them against the expected values. */ +class LocationInfoHandler extends SinglePathHandler { + public sawEof = false; + /** The index of the last html chunk. */ + private idx = 0; + /** All of the lines in the input. 
*/ + private lines: string[]; + + constructor(private testCase: LocationInfoTestCase, private html: string) { + super(); + this.lines = html.split(/\r?\n/g); + } + + protected handleToken(token: Token): void { + this.validateLocation(token.location); + } + + private validateLocation(location: Location | null): void { + assert.ok(location); + + //Offsets + const actual = this.html.substring(location.startOffset, location.endOffset); + const chunk = this.testCase.htmlChunks[this.idx]; + + assert.strictEqual(actual, chunk); + + //Line/col + const line = getSubstringByLineCol(this.lines, location); + const expected = normalizeNewLine(chunk); + + assert.strictEqual(line, expected); + + this.idx += 1; + } + + override onEof({ location }: EOFToken): void { + assert.ok(location); + assert.strictEqual(location.endOffset, location.startOffset); + assert.strictEqual(location.endOffset, this.html.length); + + assert.strictEqual(this.idx, this.testCase.htmlChunks.length); + + this.sawEof = true; + } +} + it('Location Info (Tokenizer)', () => { const testCases = [ { @@ -99,8 +152,8 @@ it('Location Info (Tokenizer)', () => { for (const testCase of testCases) { const html = testCase.htmlChunks.join(''); - const lines = html.split(/\r?\n/g); - const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }); + const handler = new LocationInfoHandler(testCase, html); + const tokenizer = new Tokenizer({ sourceCodeLocationInfo: true }, handler); const lastChunkIdx = testCase.htmlChunks.length - 1; for (let i = 0; i < testCase.htmlChunks.length; i++) { @@ -113,27 +166,8 @@ it('Location Info (Tokenizer)', () => { tokenizer.lastStartTagName = testCase.lastStartTagName; tokenizer.allowCDATA = !!testCase.allowCDATA; - for (let token = tokenizer.getNextToken(), j = 0; token.type !== TokenType.EOF; ) { - if (token.type === TokenType.HIBERNATION) { - continue; - } - - assert.ok(token.location); - - //Offsets - let actual = html.substring(token.location.startOffset, token.location.endOffset); 
- - assert.strictEqual(actual, testCase.htmlChunks[j]); - - //Line/col - actual = getSubstringByLineCol(lines, token.location); - - const expected = normalizeNewLine(testCase.htmlChunks[j]); - - assert.strictEqual(actual, expected); - - token = tokenizer.getNextToken(); - j++; + while (!handler.sawEof) { + tokenizer.getNextToken(); } } }); diff --git a/scripts/generate-parser-feedback-test/index.ts b/scripts/generate-parser-feedback-test/index.ts index f673749f7..bb4cde455 100644 --- a/scripts/generate-parser-feedback-test/index.ts +++ b/scripts/generate-parser-feedback-test/index.ts @@ -1,12 +1,12 @@ import { readFile, writeFile } from 'node:fs/promises'; import { basename } from 'node:path'; -import { Parser } from '../../packages/parse5/dist/parser/index.js'; -import * as defaultTreeAdapter from '../../packages/parse5/dist/tree-adapters/default.js'; -import { convertTokenToHtml5Lib } from 'parse5-test-utils/utils/generate-tokenization-tests.js'; +import { Parser } from 'parse5/dist/parser/index.js'; +import * as defaultTreeAdapter from 'parse5/dist/tree-adapters/default.js'; +import { HtmlLibToken } from 'parse5-test-utils/utils/generate-tokenization-tests.js'; import { parseDatFile } from 'parse5-test-utils/utils/parse-dat-file.js'; import { addSlashes } from 'parse5-test-utils/utils/common.js'; -import { TokenType, Token } from '../../packages/parse5/dist/common/token.js'; -import type { TreeAdapterTypeMap } from '../../packages/parse5/dist/tree-adapters/interface.js'; +import { TokenType, Token } from 'parse5/dist/common/token.js'; +import type { TreeAdapterTypeMap } from 'parse5/dist/tree-adapters/interface.js'; // eslint-disable-next-line no-console main().catch(console.error); @@ -41,7 +41,41 @@ function appendToken(dest: Token[], token: Token): void { dest.push(token); } -function collectParserTokens(html: string): ReturnType [] { +function convertTokenToHtml5Lib(token: Token): HtmlLibToken { + switch (token.type) { + case TokenType.CHARACTER: + case 
TokenType.NULL_CHARACTER: + case TokenType.WHITESPACE_CHARACTER: + return ['Character', token.chars]; + + case TokenType.START_TAG: { + const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value])); + const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs]; + + if (token.selfClosing) { + startTagEntry.push(true); + } + + return startTagEntry; + } + + case TokenType.END_TAG: + // NOTE: parser feedback simulator can produce adjusted SVG + // tag names for end tag tokens so we need to lower case it + return ['EndTag', token.tagName.toLowerCase()]; + + case TokenType.COMMENT: + return ['Comment', token.data]; + + case TokenType.DOCTYPE: + return ['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]; + + default: + throw new TypeError(`Unrecognized token type: ${token.type}`); + } +} + +function collectParserTokens(html: string): HtmlLibToken[] { const tokens: Token[] = []; class ExtendedParser extends Parser { diff --git a/test/utils/generate-tokenization-tests.ts b/test/utils/generate-tokenization-tests.ts index fb3dc8481..5b9f4c59e 100644 --- a/test/utils/generate-tokenization-tests.ts +++ b/test/utils/generate-tokenization-tests.ts @@ -1,45 +1,12 @@ import * as assert from 'node:assert'; import * as fs from 'node:fs'; import * as path from 'node:path'; -import { Tokenizer, TokenizerMode } from 'parse5/dist/tokenizer/index.js'; +import { type Tokenizer, TokenizerMode, type TokenHandler } from 'parse5/dist/tokenizer/index.js'; import { makeChunks } from './common.js'; -import { TokenType, Token } from 'parse5/dist/common/token.js'; +import type { CommentToken, DoctypeToken, TagToken, CharacterToken } from 'parse5/dist/common/token.js'; +import type { ParserError } from 'parse5/dist/common/error-codes.js'; -type HtmlLibToken = [string, string | null, ...unknown[]]; - -export function convertTokenToHtml5Lib(token: Token): HtmlLibToken { - switch (token.type) { - case TokenType.CHARACTER: - 
case TokenType.NULL_CHARACTER: - case TokenType.WHITESPACE_CHARACTER: - return ['Character', token.chars]; - - case TokenType.START_TAG: { - const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value])); - const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs]; - - if (token.selfClosing) { - startTagEntry.push(true); - } - - return startTagEntry; - } - - case TokenType.END_TAG: - // NOTE: parser feedback simulator can produce adjusted SVG - // tag names for end tag tokens so we need to lower case it - return ['EndTag', token.tagName.toLowerCase()]; - - case TokenType.COMMENT: - return ['Comment', token.data]; - - case TokenType.DOCTYPE: - return ['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]; - - default: - throw new TypeError(`Unrecognized token type: ${token.type}`); - } -} +export type HtmlLibToken = [string, string | null, ...unknown[]]; interface TokenError { code: string; @@ -47,44 +14,122 @@ interface TokenError { col: number; } +const TestsWithBrokenErrors: Record = { + /* + * 57.entities has an error that is not part of the test data. + * + * TODO: Move this to the test data. + */ + 'Undefined named entity in attribute value ending in semicolon and whose name starts with a known entity name.': [ + { code: 'unknown-named-character-reference', col: 12, line: 1 }, + ], +}; + interface TokenSourceData { tokens: HtmlLibToken[]; errors: TokenError[]; } -type TokenSourceCreator = (data: TokenSourceData) => { - tokenizer: Tokenizer; - getNextToken: () => Token; -}; +type TokenSourceCreator = (data: TokenizeHandler) => Tokenizer; + +/** Receives events and immediately compares them against the expected values. We check the entire output again at the end. 
*/ +class TokenizeHandler implements TokenSourceData, TokenHandler { + constructor(private testData: LoadedTest) {} + + private addToken(token: HtmlLibToken): void { + assert.deepStrictEqual(token, this.testData.expected[this.tokens.length]); + + this.tokens.push(token); + } + + onComment(token: CommentToken): void { + this.addToken(['Comment', token.data]); + } + onDoctype(token: DoctypeToken): void { + this.addToken(['DOCTYPE', token.name, token.publicId, token.systemId, !token.forceQuirks]); + } + onStartTag(token: TagToken): void { + const reformatedAttrs = Object.fromEntries(token.attrs.map(({ name, value }) => [name, value])); + const startTagEntry: HtmlLibToken = ['StartTag', token.tagName, reformatedAttrs]; + + if (token.selfClosing) { + startTagEntry.push(true); + } + + this.addToken(startTagEntry); + } + onEndTag(token: TagToken): void { + // NOTE: parser feedback simulator can produce adjusted SVG + // tag names for end tag tokens so we need to lower case it + this.addToken(['EndTag', token.tagName.toLowerCase()]); + } + onEof(): void { + this.sawEof = true; + } + onCharacter(token: CharacterToken): void { + const lastEntry = this.tokens[this.tokens.length - 1]; + + if (lastEntry && lastEntry[0] === 'Character' && lastEntry[1] != null) { + lastEntry[1] += token.chars; + } else { + this.tokens.push(['Character', token.chars]); + } + + const actual = this.tokens[this.tokens.length - 1]; + const expected = this.testData.expected[this.tokens.length - 1]; + assert.strictEqual('Character', expected[0]); + assert.ok(typeof actual[1] === 'string'); + assert.ok(expected[1]?.startsWith(actual[1])); + } + onNullCharacter(token: CharacterToken): void { + this.onCharacter(token); + } + onWhitespaceCharacter(token: CharacterToken): void { + this.onCharacter(token); + } + onParseError(err: ParserError): void { + assert.ok( + this.testData.expectedErrors.some( + ({ code, line, col }) => code === err.code && line === err.startLine && col === err.startCol + ) + ); + + 
this.errors.push({ + code: err.code, + line: err.startLine, + col: err.startCol, + }); + } + + public sawEof = false; + public tokens: HtmlLibToken[] = []; + public errors: TokenError[] = []; +} function tokenize( createTokenSource: TokenSourceCreator, chunks: string | string[], - initialState: Tokenizer['state'], - lastStartTag: string | null + testData: LoadedTest ): TokenSourceData { - const result: TokenSourceData = { tokens: [], errors: [] }; - const { tokenizer, getNextToken } = createTokenSource(result); - let token: Token = { type: TokenType.HIBERNATION, location: null }; + const result = new TokenizeHandler(testData); + const tokenizer = createTokenSource(result); let chunkIdx = 0; // NOTE: set small waterline for testing purposes tokenizer.preprocessor.bufferWaterline = 8; - tokenizer.state = initialState; + tokenizer.state = testData.initialState; - if (lastStartTag) { - tokenizer.lastStartTagName = lastStartTag; + if (testData.lastStartTag) { + tokenizer.lastStartTagName = testData.lastStartTag; } - do { - if (token.type === TokenType.HIBERNATION) { - tokenizer.write(chunks[chunkIdx], ++chunkIdx === chunks.length); + while (!result.sawEof) { + if (tokenizer.active) { + tokenizer.getNextToken(); } else { - appendTokenEntry(result.tokens, convertTokenToHtml5Lib(token)); + tokenizer.write(chunks[chunkIdx], ++chunkIdx === chunks.length); } - - token = getNextToken(); - } while (token.type !== TokenType.EOF); + } // Sort errors by line and column result.errors.sort((err1, err2) => err1.line - err2.line || err1.col - err2.col); @@ -108,27 +153,6 @@ function unescapeDescrIO(testDescr: TestDescription): void { } } -function appendTokenEntry(result: HtmlLibToken[], tokenEntry: HtmlLibToken): void { - if (tokenEntry[0] === 'Character') { - const lastEntry = result[result.length - 1]; - - if (lastEntry && lastEntry[0] === 'Character' && lastEntry[1] != null) { - lastEntry[1] += tokenEntry[1]; - return; - } - } - - result.push(tokenEntry); -} - -function 
concatCharacterTokens(tokenEntries: HtmlLibToken[]): HtmlLibToken[] { - const result: HtmlLibToken[] = []; - - for (const tokenEntry of tokenEntries) appendTokenEntry(result, tokenEntry); - - return result; -} - function getTokenizerSuitableStateName(testDataStateName: string): Tokenizer['state'] { const name = testDataStateName.slice(0, -6).replace(' ', '_').toUpperCase(); return TokenizerMode[name as keyof typeof TokenizerMode]; @@ -141,7 +165,7 @@ interface TestDescription { description: string; input: string; lastStartTag: string; - errors?: string[]; + errors?: TokenError[]; } interface LoadedTest { @@ -153,7 +177,7 @@ interface LoadedTest { initialState: Tokenizer['state']; initialStateName: string; lastStartTag: string; - expectedErrors: string[]; + expectedErrors: TokenError[]; } function loadTests(dataDirPath: string): LoadedTest[] { @@ -194,11 +218,13 @@ function loadTests(dataDirPath: string): LoadedTest[] { setName, name: descr.description, input: descr.input, - expected: concatCharacterTokens(expected), + expected, initialState: getTokenizerSuitableStateName(initialStateName), initialStateName, lastStartTag: descr.lastStartTag, - expectedErrors: descr.errors || [], + expectedErrors: TestsWithBrokenErrors[descr.description] + ? TestsWithBrokenErrors[descr.description] + : descr.errors || [], }); } } @@ -218,28 +244,10 @@ export function generateTokenizationTests( it(testName, () => { const chunks = makeChunks(testData.input); - const result = tokenize( - createTokenSource, - chunks, - testData.initialState as Tokenizer['state'], - testData.lastStartTag - ); + const result = tokenize(createTokenSource, chunks, testData); assert.deepEqual(result.tokens, testData.expected, `Chunks: ${JSON.stringify(chunks)}`); - - /* - * 57.entities has an error that is not part of the test data. - * - * TODO: Move this to the test data. 
- */ - if ( - testName === - 'Tokenizer - 57.entities - Undefined named entity in attribute value ending in semicolon and whose name starts with a known entity name. - Initial state: Data state' - ) { - assert.deepEqual(result.errors, [{ code: 'unknown-named-character-reference', col: 12, line: 1 }]); - } else { - assert.deepEqual(result.errors, testData.expectedErrors || []); - } + assert.deepEqual(result.errors, testData.expectedErrors || []); }); } }