Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add optional byte offsets to nodes #25

Merged
merged 5 commits into from
Jan 28, 2023
Merged
Show file tree
Hide file tree
Changes from 4 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
90 changes: 62 additions & 28 deletions src/lib/Parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import { XmlText } from './XmlText.js';

import type { XmlNode } from './XmlNode.js';


const emptyString = '';

/**
Expand All @@ -29,11 +28,18 @@ export class Parser {
* @param options Parser options.
*/
constructor(xml: string, options: ParserOptions = {}) {
this.document = new XmlDocument();
this.currentNode = this.document;
let doc = this.document = new XmlDocument();
let scanner = this.scanner = new StringScanner(xml);

this.currentNode = doc;
this.options = options;
this.scanner = new StringScanner(normalizeXmlString(xml));

if (this.options.includeOffsets) {
doc.start = 0;
doc.end = xml.length;
}

scanner.consumeStringFast('\uFEFF'); // byte order mark
this.consumeProlog();

if (!this.consumeElement()) {
Expand All @@ -42,17 +48,22 @@ export class Parser {

while (this.consumeMisc()) {} // eslint-disable-line no-empty

if (!this.scanner.isEnd) {
if (!scanner.isEnd) {
throw this.error('Extra content at the end of the document');
}
}

/**
* Adds the given `XmlNode` as a child of `this.currentNode`.
*/
addNode(node: XmlNode) {
addNode(node: XmlNode, charIndex: number) {
node.parent = this.currentNode;

if (this.options.includeOffsets) {
node.start = this.scanner.charIndexToByteIndex(charIndex);
node.end = this.scanner.charIndexToByteIndex();
}

// @ts-expect-error: XmlDocument has a more limited set of possible children
// than XmlElement so TypeScript is unhappy, but we always do the right
// thing.
Expand All @@ -63,22 +74,29 @@ export class Parser {
* Adds the given _text_ to the document, either by appending it to a
* preceding `XmlText` node (if possible) or by creating a new `XmlText` node.
*/
addText(text: string) {
addText(text: string, charIndex: number) {
// Only the last child of the current node is a candidate for merging.
let { children } = this.currentNode;
let { length } = children;

// Replace CRLF pairs and lone CRs with LF before the text is stored.
// NOTE(review): this also runs on text handed over by
// `consumeContentReference()`. If `consumeReference()` can return a CR (e.g.
// from `&#13;`), that CR would be rewritten here as well — confirm this is
// intended.
text = normalizeLineBreaks(text);

if (length > 0) {
let prevNode = children[length - 1];

if (prevNode instanceof XmlText) {
// The previous node is a text node, so we can append to it and avoid
// creating another node.
prevNode.text += text;

// The merged node keeps its original `start`; only its `end` moves forward
// to the scanner's current position.
if (this.options.includeOffsets) {
prevNode.end = this.scanner.charIndexToByteIndex();
}

return;
}
}

// No text node to merge into: add a new `XmlText` node, passing `charIndex`
// through to `addNode()` as the node's starting character index.
this.addNode(new XmlText(text));
this.addNode(new XmlText(text), charIndex);
}

/**
Expand Down Expand Up @@ -199,6 +217,7 @@ export class Parser {
*/
consumeCdataSection(): boolean {
let { scanner } = this;
let startIndex = scanner.charIndex;

if (!scanner.consumeStringFast('<![CDATA[')) {
return false;
Expand All @@ -212,9 +231,9 @@ export class Parser {
}

if (this.options.preserveCdata) {
this.addNode(new XmlCdata(text));
this.addNode(new XmlCdata(normalizeLineBreaks(text)), startIndex);
} else {
this.addText(text);
this.addText(text, startIndex);
}

return true;
Expand All @@ -228,6 +247,7 @@ export class Parser {
*/
consumeCharData(): boolean {
let { scanner } = this;
let startIndex = scanner.charIndex;
let charData = scanner.consumeUntilMatch(syntax.endCharData);

if (!charData) {
Expand All @@ -240,7 +260,7 @@ export class Parser {
throw this.error('Element content may not contain the CDATA section close delimiter `]]>`');
}

this.addText(charData);
this.addText(charData, startIndex);
return true;
}

Expand All @@ -252,6 +272,7 @@ export class Parser {
*/
consumeComment(): boolean {
let { scanner } = this;
let startIndex = scanner.charIndex;

if (!scanner.consumeStringFast('<!--')) {
return false;
Expand All @@ -269,7 +290,7 @@ export class Parser {
}

if (this.options.preserveComments) {
this.addNode(new XmlComment(content.trim()));
this.addNode(new XmlComment(normalizeLineBreaks(content.trim())), startIndex);
}

return true;
Expand All @@ -285,10 +306,11 @@ export class Parser {
* @see https://www.w3.org/TR/2008/REC-xml-20081126/#entproc
*/
consumeContentReference(): boolean {
let startIndex = this.scanner.charIndex;
let ref = this.consumeReference();

if (ref) {
this.addText(ref);
this.addText(ref, startIndex);
return true;
}

Expand Down Expand Up @@ -334,7 +356,7 @@ export class Parser {
*/
consumeElement(): boolean {
let { scanner } = this;
let mark = scanner.charIndex;
let startIndex = scanner.charIndex;

if (!scanner.consumeStringFast('<')) {
return false;
Expand All @@ -343,7 +365,7 @@ export class Parser {
let name = this.consumeName();

if (!name) {
scanner.reset(mark);
scanner.reset(startIndex);
return false;
}

Expand Down Expand Up @@ -390,7 +412,7 @@ export class Parser {
this.currentNode = element.parent;
}

this.addNode(element);
this.addNode(element, startIndex);
return true;
}

Expand Down Expand Up @@ -443,7 +465,7 @@ export class Parser {
*/
consumeProcessingInstruction(): boolean {
let { scanner } = this;
let mark = scanner.charIndex;
let startIndex = scanner.charIndex;

if (!scanner.consumeStringFast('<?')) {
return false;
Expand All @@ -453,7 +475,7 @@ export class Parser {

if (name) {
if (name.toLowerCase() === 'xml') {
scanner.reset(mark);
scanner.reset(startIndex);
throw this.error("XML declaration isn't allowed here");
}
} else {
Expand All @@ -462,7 +484,7 @@ export class Parser {

if (!this.consumeWhitespace()) {
if (scanner.consumeStringFast('?>')) {
this.addNode(new XmlProcessingInstruction(name));
this.addNode(new XmlProcessingInstruction(name), startIndex);
return true;
}

Expand All @@ -476,7 +498,7 @@ export class Parser {
throw this.error('Unterminated processing instruction');
}

this.addNode(new XmlProcessingInstruction(name, content));
this.addNode(new XmlProcessingInstruction(name, normalizeLineBreaks(content)), startIndex);
return true;
}

Expand All @@ -488,7 +510,7 @@ export class Parser {
*/
consumeProlog(): boolean {
let { scanner } = this;
let mark = scanner.charIndex;
let startIndex = scanner.charIndex;

this.consumeXmlDeclaration();

Expand All @@ -498,7 +520,7 @@ export class Parser {
while (this.consumeMisc()) {} // eslint-disable-line no-empty
}

return mark < scanner.charIndex;
return startIndex < scanner.charIndex;
}

/**
Expand Down Expand Up @@ -761,15 +783,19 @@ export class Parser {
// -- Private Functions --------------------------------------------------------

/**
* Normalizes the given XML string by stripping a byte order mark (if present)
* and replacing CRLF sequences and lone CR characters with LF characters.
* Normalizes line breaks in the given text by replacing CRLF sequences and lone
* CR characters with LF characters.
*/
function normalizeXmlString(xml: string): string {
if (xml[0] === '\uFEFF') {
xml = xml.slice(1);
function normalizeLineBreaks(text: string): string {
  // Fast path: text without any carriage return needs no work, so hand back
  // the very same string without touching the regex engine.
  if (!text.includes('\r')) {
    return text;
  }

  // Rewrite every CRLF pair and every lone CR as a single LF. A single
  // `replace` keeps this linear in the length of the text, whereas slicing the
  // string around each CR would copy the whole string once per line break.
  return text.replace(/\r\n?/g, '\n');
}

// -- Types --------------------------------------------------------------------
Expand All @@ -782,6 +808,14 @@ export type ParserOptions = {
*/
ignoreUndefinedEntities?: boolean;

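/**
* When `true`, the starting and ending byte offsets of each node in the input
* string will be made available via `start` and `end` properties on the node.
* Offsets are indexes into the input string as JavaScript indexes it (UTF-16
* code units), so a character outside the Basic Multilingual Plane counts as
* two.
*
* @default false
*/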

/**
* When `true`, CDATA sections will be preserved in the document as `XmlCdata`
* nodes. Otherwise CDATA sections will be represented as `XmlText` nodes,
Expand Down
20 changes: 10 additions & 10 deletions src/lib/StringScanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,6 @@ export class StringScanner {

// -- Protected Methods ------------------------------------------------------

/**
* Returns the byte index of the given character index in the string. The two
* may differ in strings that contain multibyte characters.
*/
protected charIndexToByteIndex(charIndex: number = this.charIndex): number {
return this.multiByteMode
? (this.charsToBytes as number[])[charIndex] ?? Infinity
: charIndex;
}

/**
* Returns the number of characters in the given string, which may differ from
* the byte length if the string contains multibyte characters.
Expand All @@ -75,6 +65,16 @@ export class StringScanner {
this.charIndex = Math.min(this.charCount, this.charIndex + count);
}

/**
* Returns the byte index of the given character index in the string. The two
* may differ in strings that contain multibyte characters.
*/
charIndexToByteIndex(charIndex: number = this.charIndex): number {
  // Outside multibyte mode the character index is returned unchanged.
  if (!this.multiByteMode) {
    return charIndex;
  }

  // Look the character index up in the mapping table; an index with no entry
  // maps to `Infinity`.
  let byteIndex = (this.charsToBytes as number[])[charIndex];
  return byteIndex ?? Infinity;
}

/**
* Consumes and returns the given number of characters if possible, advancing
* the scanner and stopping if the end of the string is reached.
Expand Down
17 changes: 17 additions & 0 deletions src/lib/XmlNode.ts
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,12 @@ export class XmlNode {
return this.parent?.document ?? null;
}

/**
* Ending byte offset of this node in the original XML string, or `-1` if the
* offset is unknown.
*/
end = -1;

/**
* Whether this node is the root node of the document.
*/
Expand All @@ -71,6 +77,12 @@ export class XmlNode {
return Boolean(this.parent?.preserveWhitespace);
}

/**
* Starting byte offset of this node in the original XML string, or `-1` if
* the offset is unknown.
*/
start = -1;

/**
* Type of this node.
*
Expand Down Expand Up @@ -102,6 +114,11 @@ export class XmlNode {
json.preserveWhitespace = true;
}

if (this.start !== -1) {
json.start = this.start;
json.end = this.end;
}

return json;
}
}
2 changes: 1 addition & 1 deletion src/lib/syntax.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export const attValueCharSingleQuote = /[^'&<]+/y;
*
* @see https://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize
*/
export const attValueNormalizedWhitespace = /[\t\n]/g;
export const attValueNormalizedWhitespace = /\r\n|[\n\r\t]/g;

/**
* Regular expression that matches one or more characters that signal the end of
Expand Down
27 changes: 27 additions & 0 deletions tests/lib/Parser.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ describe('Parser', () => {
assert.strictEqual(doc.root.name, 'root');
});

it('normalizes whitespace in attribute values', () => {
let { root } = parseXml('<root attr=" one two\tthree\nfour\r\nfive\rsix " />');
assert.strictEqual(root.attributes.attr, ' one two three four five six ');
});

describe('when `options.ignoreUndefinedEntities` is `true`', () => {
beforeEach(() => {
options.ignoreUndefinedEntities = true;
Expand All @@ -52,6 +57,28 @@ describe('Parser', () => {
});
});

describe('when `options.includeOffsets` is `true`', () => {
it('the start offset is a byte offset, not a character offset', () => {
let { root } = parseXml('<root><cat>🐈</cat><dog>🐕</dog></root>', { includeOffsets: true });
assert.strictEqual(root.children[1].start, 19);
});

it('the end offset is a byte offset, not a character offset', () => {
let { root } = parseXml('<root><cat>🐈</cat><dog>🐕</dog></root>', { includeOffsets: true });
assert.strictEqual(root.children[1].end, 32);
});

it('a byte order mark character is counted in the offset', () => {
let { root } = parseXml('\uFEFF<root>foo</root>', { includeOffsets: true });
assert.strictEqual(root.children[0].start, 7);
});

it('a carriage return character is not counted in the offset', () => {
let { root } = parseXml('<root>\rfoo</root>', { includeOffsets: true });
assert.strictEqual(root.children[0].start, 6);
});
});

describe('when `options.resolveUndefinedEntity` is set', () => {
beforeEach(() => {
options.resolveUndefinedEntity = (ref) => {
Expand Down
Loading