-
-
Notifications
You must be signed in to change notification settings - Fork 232
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
refactor(tokenizer): Introduce events #404
Merged
Merged
Changes from 16 commits
Commits
Show all changes
19 commits
Select commit
Hold shift + click to select a range
aa8bf73
refactor(tokenizer): Introduce events
fb55 6efaa6d
Move EOF creation & emission to `_emitEOFToken`
fb55 9249638
Port SAX parser to use tokenizer events
fb55 9b59041
Rename tokenizer classes
fb55 764498e
Flatten text & eof events
fb55 6c9e816
Shorten callback names
fb55 5b7b487
Simplify `_emitCurrentCharacterToken`
fb55 4a16907
refactor(tokenizer): Move `onParseError` to handler
fb55 a85f632
Use tokenizer events for `generateTokenizationTests`
fb55 f212403
Use tokenizer events for `location-info` tests
fb55 3f36ff8
Directly emit by token type
fb55 a82d4c4
Reintroduce character & eof tokens
fb55 ff0efa0
Add abstract class `SinglePathHandler`
fb55 a28d45d
Remove unused queued tokenizer properties
fb55 936a523
Move queued handler to its own file
fb55 be2ab48
Revert unwrapping of `currentCharacterToken`
fb55 8631aa5
Add some comments
fb55 2c411d3
Merge remote-tracking branch 'upstream/master' into refactor/tokenize…
fb55 d65af65
Fix comments
fb55 File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,13 @@ | ||
import { Transform } from 'node:stream'; | ||
import { Tokenizer } from 'parse5/dist/tokenizer/index.js'; | ||
import { TokenType, Token, CharacterToken, Attribute, Location } from 'parse5/dist/common/token.js'; | ||
import type { Tokenizer, TokenHandler } from 'parse5/dist/tokenizer/index.js'; | ||
import type { | ||
Attribute, | ||
Location, | ||
TagToken, | ||
CommentToken, | ||
DoctypeToken, | ||
CharacterToken, | ||
} from 'parse5/dist/common/token.js'; | ||
import { DevNullStream } from './dev-null-stream.js'; | ||
import { ParserFeedbackSimulator } from './parser-feedback-simulator.js'; | ||
|
||
|
@@ -39,13 +46,13 @@ export interface SAXParserOptions { | |
* }); | ||
* ``` | ||
*/ | ||
export class SAXParser extends Transform { | ||
export class SAXParser extends Transform implements TokenHandler { | ||
protected options: SAXParserOptions; | ||
protected tokenizer: Tokenizer; | ||
protected parserFeedbackSimulator: ParserFeedbackSimulator; | ||
private pendingText: CharacterToken | null = null; | ||
private pendingText: Text | null = null; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With |
||
private lastChunkWritten = false; | ||
private stopped = false; | ||
protected tokenizer: Tokenizer; | ||
|
||
/** | ||
* @param options Parsing options. | ||
|
@@ -58,8 +65,8 @@ export class SAXParser extends Transform { | |
...options, | ||
}; | ||
|
||
this.tokenizer = new Tokenizer(this.options); | ||
this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.tokenizer); | ||
this.parserFeedbackSimulator = new ParserFeedbackSimulator(this.options, this); | ||
this.tokenizer = this.parserFeedbackSimulator.tokenizer; | ||
|
||
// NOTE: always pipe stream to the /dev/null stream to avoid | ||
// `highWaterMark` hit even if we don't have consumers. | ||
|
@@ -127,96 +134,95 @@ export class SAXParser extends Transform { | |
} | ||
|
||
private _runParsingLoop(): void { | ||
let token = null; | ||
while (!this.stopped && this.tokenizer.active) { | ||
this.tokenizer.getNextToken(); | ||
} | ||
} | ||
|
||
do { | ||
token = this.parserFeedbackSimulator.getNextToken(); | ||
/** @internal */ | ||
onCharacter({ chars, location }: CharacterToken): void { | ||
if (this.pendingText === null) { | ||
this.pendingText = { text: chars, sourceCodeLocation: location }; | ||
} else { | ||
this.pendingText.text += chars; | ||
|
||
if (token.type === TokenType.HIBERNATION) { | ||
break; | ||
if (location && this.pendingText.sourceCodeLocation) { | ||
const { endLine, endCol, endOffset } = location; | ||
this.pendingText.sourceCodeLocation = { | ||
...this.pendingText.sourceCodeLocation, | ||
endLine, | ||
endCol, | ||
endOffset, | ||
}; | ||
} | ||
} | ||
} | ||
|
||
if ( | ||
token.type === TokenType.CHARACTER || | ||
token.type === TokenType.WHITESPACE_CHARACTER || | ||
token.type === TokenType.NULL_CHARACTER | ||
) { | ||
if (this.pendingText === null) { | ||
token.type = TokenType.CHARACTER; | ||
this.pendingText = token; | ||
} else { | ||
this.pendingText.chars += token.chars; | ||
|
||
if (token.location && this.pendingText.location) { | ||
const { endLine, endCol, endOffset } = token.location; | ||
this.pendingText.location = { | ||
...this.pendingText.location, | ||
endLine, | ||
endCol, | ||
endOffset, | ||
}; | ||
} | ||
} | ||
} else { | ||
this._emitPendingText(); | ||
this._handleToken(token); | ||
} | ||
} while (!this.stopped && token.type !== TokenType.EOF); | ||
/** @internal */ | ||
onWhitespaceCharacter(token: CharacterToken): void { | ||
this.onCharacter(token); | ||
} | ||
|
||
protected _handleToken(token: Token): boolean { | ||
switch (token.type) { | ||
case TokenType.EOF: { | ||
return true; | ||
} | ||
case TokenType.START_TAG: { | ||
const startTag: StartTag = { | ||
tagName: token.tagName, | ||
attrs: token.attrs, | ||
selfClosing: token.selfClosing, | ||
sourceCodeLocation: token.location, | ||
}; | ||
return this._emitIfListenerExists('startTag', startTag); | ||
} | ||
case TokenType.END_TAG: { | ||
const endTag: EndTag = { | ||
tagName: token.tagName, | ||
sourceCodeLocation: token.location, | ||
}; | ||
return this._emitIfListenerExists('endTag', endTag); | ||
} | ||
case TokenType.COMMENT: { | ||
const comment: Comment = { | ||
text: token.data, | ||
sourceCodeLocation: token.location, | ||
}; | ||
return this._emitIfListenerExists('comment', comment); | ||
} | ||
case TokenType.DOCTYPE: { | ||
const doctype: Doctype = { | ||
name: token.name, | ||
publicId: token.publicId, | ||
systemId: token.systemId, | ||
sourceCodeLocation: token.location, | ||
}; | ||
return this._emitIfListenerExists('doctype', doctype); | ||
} | ||
case TokenType.CHARACTER: | ||
case TokenType.NULL_CHARACTER: | ||
case TokenType.WHITESPACE_CHARACTER: { | ||
const text: Text = { | ||
text: token.chars, | ||
sourceCodeLocation: token.location, | ||
}; | ||
return this._emitIfListenerExists('text', text); | ||
} | ||
case TokenType.HIBERNATION: { | ||
return this._emitIfListenerExists('hibernation', {}); | ||
} | ||
} | ||
/** @internal */ | ||
onNullCharacter(token: CharacterToken): void { | ||
this.onCharacter(token); | ||
} | ||
|
||
/** @internal */ | ||
onEof(): void { | ||
this._emitPendingText(); | ||
this.stopped = true; | ||
} | ||
|
||
/** @internal */ | ||
onStartTag(token: TagToken): void { | ||
this._emitPendingText(); | ||
|
||
const startTag: StartTag = { | ||
tagName: token.tagName, | ||
attrs: token.attrs, | ||
selfClosing: token.selfClosing, | ||
sourceCodeLocation: token.location, | ||
}; | ||
this.emitIfListenerExists('startTag', startTag); | ||
} | ||
|
||
/** @internal */ | ||
onEndTag(token: TagToken): void { | ||
this._emitPendingText(); | ||
|
||
const endTag: EndTag = { | ||
tagName: token.tagName, | ||
sourceCodeLocation: token.location, | ||
}; | ||
this.emitIfListenerExists('endTag', endTag); | ||
} | ||
|
||
/** @internal */ | ||
onDoctype(token: DoctypeToken): void { | ||
this._emitPendingText(); | ||
|
||
const doctype: Doctype = { | ||
name: token.name, | ||
publicId: token.publicId, | ||
systemId: token.systemId, | ||
sourceCodeLocation: token.location, | ||
}; | ||
this.emitIfListenerExists('doctype', doctype); | ||
} | ||
|
||
/** @internal */ | ||
onComment(token: CommentToken): void { | ||
this._emitPendingText(); | ||
|
||
const comment: Comment = { | ||
text: token.data, | ||
sourceCodeLocation: token.location, | ||
}; | ||
this.emitIfListenerExists('comment', comment); | ||
} | ||
|
||
private _emitIfListenerExists(eventName: string, token: SaxToken): boolean { | ||
protected emitIfListenerExists(eventName: string, token: SaxToken): boolean { | ||
if (this.listenerCount(eventName) === 0) { | ||
return false; | ||
} | ||
|
@@ -232,7 +238,7 @@ export class SAXParser extends Transform { | |
|
||
private _emitPendingText(): void { | ||
if (this.pendingText !== null) { | ||
this._handleToken(this.pendingText); | ||
this.emitIfListenerExists('text', this.pendingText); | ||
this.pendingText = null; | ||
} | ||
} | ||
|
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The SAX parser now handles events, so
_handleToken
is gone. We can override `emitIfListenerExists`
instead for the same effect.