Skip to content

Commit

Permalink
Merge branch 'offsets' into next
Browse files Browse the repository at this point in the history
  • Loading branch information
rgrove committed Jan 28, 2023
2 parents 18e5e98 + 87b6213 commit dd7c8b1
Show file tree
Hide file tree
Showing 12 changed files with 340 additions and 39 deletions.
90 changes: 62 additions & 28 deletions src/lib/Parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,6 @@ import { XmlText } from './XmlText.js';

import type { XmlNode } from './XmlNode.js';


const emptyString = '';

/**
Expand All @@ -29,11 +28,18 @@ export class Parser {
* @param options Parser options.
*/
constructor(xml: string, options: ParserOptions = {}) {
this.document = new XmlDocument();
this.currentNode = this.document;
let doc = this.document = new XmlDocument();
let scanner = this.scanner = new StringScanner(xml);

this.currentNode = doc;
this.options = options;
this.scanner = new StringScanner(normalizeXmlString(xml));

if (this.options.includeOffsets) {
doc.start = 0;
doc.end = xml.length;
}

scanner.consumeStringFast('\uFEFF'); // byte order mark
this.consumeProlog();

if (!this.consumeElement()) {
Expand All @@ -42,17 +48,22 @@ export class Parser {

while (this.consumeMisc()) {} // eslint-disable-line no-empty

if (!this.scanner.isEnd) {
if (!scanner.isEnd) {
throw this.error('Extra content at the end of the document');
}
}

/**
* Adds the given `XmlNode` as a child of `this.currentNode`.
*/
addNode(node: XmlNode) {
addNode(node: XmlNode, charIndex: number) {
node.parent = this.currentNode;

if (this.options.includeOffsets) {
node.start = this.scanner.charIndexToByteIndex(charIndex);
node.end = this.scanner.charIndexToByteIndex();
}

// @ts-expect-error: XmlDocument has a more limited set of possible children
// than XmlElement so TypeScript is unhappy, but we always do the right
// thing.
Expand All @@ -63,22 +74,29 @@ export class Parser {
* Adds the given _text_ to the document, either by appending it to a
* preceding `XmlText` node (if possible) or by creating a new `XmlText` node.
*/
addText(text: string, charIndex: number) {
  // `text` is the raw text to add; `charIndex` is the scanner character index
  // at which that text started. `charIndex` is only forwarded to `addNode()`
  // when a brand new `XmlText` node has to be created.
  let { children } = this.currentNode;
  let { length } = children;

  // Line breaks are normalized here, per chunk of text, before the text is
  // stored on a node.
  text = normalizeLineBreaks(text);

  if (length > 0) {
    let prevNode = children[length - 1];

    if (prevNode instanceof XmlText) {
      // The previous node is a text node, so we can append to it and avoid
      // creating another node.
      prevNode.text += text;

      if (this.options.includeOffsets) {
        // The merged node keeps its original start; only its end moves forward
        // to the scanner's current position.
        prevNode.end = this.scanner.charIndexToByteIndex();
      }

      return;
    }
  }

  this.addNode(new XmlText(text), charIndex);
}

/**
Expand Down Expand Up @@ -199,6 +217,7 @@ export class Parser {
*/
consumeCdataSection(): boolean {
let { scanner } = this;
let startIndex = scanner.charIndex;

if (!scanner.consumeStringFast('<![CDATA[')) {
return false;
Expand All @@ -212,9 +231,9 @@ export class Parser {
}

if (this.options.preserveCdata) {
this.addNode(new XmlCdata(text));
this.addNode(new XmlCdata(normalizeLineBreaks(text)), startIndex);
} else {
this.addText(text);
this.addText(text, startIndex);
}

return true;
Expand All @@ -228,6 +247,7 @@ export class Parser {
*/
consumeCharData(): boolean {
let { scanner } = this;
let startIndex = scanner.charIndex;
let charData = scanner.consumeUntilMatch(syntax.endCharData);

if (!charData) {
Expand All @@ -240,7 +260,7 @@ export class Parser {
throw this.error('Element content may not contain the CDATA section close delimiter `]]>`');
}

this.addText(charData);
this.addText(charData, startIndex);
return true;
}

Expand All @@ -252,6 +272,7 @@ export class Parser {
*/
consumeComment(): boolean {
let { scanner } = this;
let startIndex = scanner.charIndex;

if (!scanner.consumeStringFast('<!--')) {
return false;
Expand All @@ -269,7 +290,7 @@ export class Parser {
}

if (this.options.preserveComments) {
this.addNode(new XmlComment(content.trim()));
this.addNode(new XmlComment(normalizeLineBreaks(content.trim())), startIndex);
}

return true;
Expand All @@ -285,10 +306,11 @@ export class Parser {
* @see https://www.w3.org/TR/2008/REC-xml-20081126/#entproc
*/
consumeContentReference(): boolean {
let startIndex = this.scanner.charIndex;
let ref = this.consumeReference();

if (ref) {
this.addText(ref);
this.addText(ref, startIndex);
return true;
}

Expand Down Expand Up @@ -334,7 +356,7 @@ export class Parser {
*/
consumeElement(): boolean {
let { scanner } = this;
let mark = scanner.charIndex;
let startIndex = scanner.charIndex;

if (!scanner.consumeStringFast('<')) {
return false;
Expand All @@ -343,7 +365,7 @@ export class Parser {
let name = this.consumeName();

if (!name) {
scanner.reset(mark);
scanner.reset(startIndex);
return false;
}

Expand Down Expand Up @@ -390,7 +412,7 @@ export class Parser {
this.currentNode = element.parent;
}

this.addNode(element);
this.addNode(element, startIndex);
return true;
}

Expand Down Expand Up @@ -443,7 +465,7 @@ export class Parser {
*/
consumeProcessingInstruction(): boolean {
let { scanner } = this;
let mark = scanner.charIndex;
let startIndex = scanner.charIndex;

if (!scanner.consumeStringFast('<?')) {
return false;
Expand All @@ -453,7 +475,7 @@ export class Parser {

if (name) {
if (name.toLowerCase() === 'xml') {
scanner.reset(mark);
scanner.reset(startIndex);
throw this.error("XML declaration isn't allowed here");
}
} else {
Expand All @@ -462,7 +484,7 @@ export class Parser {

if (!this.consumeWhitespace()) {
if (scanner.consumeStringFast('?>')) {
this.addNode(new XmlProcessingInstruction(name));
this.addNode(new XmlProcessingInstruction(name), startIndex);
return true;
}

Expand All @@ -476,7 +498,7 @@ export class Parser {
throw this.error('Unterminated processing instruction');
}

this.addNode(new XmlProcessingInstruction(name, content));
this.addNode(new XmlProcessingInstruction(name, normalizeLineBreaks(content)), startIndex);
return true;
}

Expand All @@ -488,7 +510,7 @@ export class Parser {
*/
consumeProlog(): boolean {
let { scanner } = this;
let mark = scanner.charIndex;
let startIndex = scanner.charIndex;

this.consumeXmlDeclaration();

Expand All @@ -498,7 +520,7 @@ export class Parser {
while (this.consumeMisc()) {} // eslint-disable-line no-empty
}

return mark < scanner.charIndex;
return startIndex < scanner.charIndex;
}

/**
Expand Down Expand Up @@ -761,15 +783,19 @@ export class Parser {
// -- Private Functions --------------------------------------------------------

/**
* Normalizes the given XML string by stripping a byte order mark (if present)
* and replacing CRLF sequences and lone CR characters with LF characters.
* Normalizes line breaks in the given text by replacing CRLF sequences and lone
* CR characters with LF characters.
*/
function normalizeXmlString(xml: string): string {
if (xml[0] === '\uFEFF') {
xml = xml.slice(1);
/**
 * Normalizes line breaks in the given text by replacing CRLF sequences and lone
 * CR characters with LF characters.
 *
 * @param text Text whose line breaks should be normalized.
 * @returns The text with every `\r\n` pair and every remaining `\r` replaced
 *   by a single `\n`. Text that contains no `\r` is returned as-is.
 */
function normalizeLineBreaks(text: string): string {
  // Fast path: text without any carriage return needs no work and no regex.
  if (!text.includes('\r')) {
    return text;
  }

  // Single pass over the string. `\r\n?` consumes a CRLF pair as one match, so
  // the pair collapses to one LF instead of two.
  return text.replace(/\r\n?/g, '\n');
}

// -- Types --------------------------------------------------------------------
Expand All @@ -782,6 +808,14 @@ export type ParserOptions = {
*/
ignoreUndefinedEntities?: boolean;

/**
* When `true`, the starting and ending byte offsets of each node in the input
* string will be made available via `start` and `end` properties on the node.
*
* @default false
*/
includeOffsets?: boolean;

/**
* When `true`, CDATA sections will be preserved in the document as `XmlCdata`
* nodes. Otherwise CDATA sections will be represented as `XmlText` nodes,
Expand Down
20 changes: 10 additions & 10 deletions src/lib/StringScanner.ts
Original file line number Diff line number Diff line change
Expand Up @@ -42,16 +42,6 @@ export class StringScanner {

// -- Protected Methods ------------------------------------------------------

/**
* Returns the byte index of the given character index in the string. The two
* may differ in strings that contain multibyte characters.
*
* @param charIndex Character index to convert. Defaults to the scanner's
*   current `charIndex`.
* @returns `charIndex` unchanged when the scanner is not in multibyte mode;
*   otherwise the entry for `charIndex` in the `charsToBytes` lookup, or
*   `Infinity` when that lookup has no entry for the index.
*/
protected charIndexToByteIndex(charIndex: number = this.charIndex): number {
return this.multiByteMode
? (this.charsToBytes as number[])[charIndex] ?? Infinity
: charIndex;
}

/**
* Returns the number of characters in the given string, which may differ from
* the byte length if the string contains multibyte characters.
Expand All @@ -75,6 +65,16 @@ export class StringScanner {
this.charIndex = Math.min(this.charCount, this.charIndex + count);
}

/**
 * Converts a character index into the matching byte index of the scanned
 * string. The two differ only when the string contains multibyte characters.
 *
 * @param charIndex Character index to convert. Defaults to the scanner's
 *   current `charIndex`.
 * @returns The byte index for `charIndex`, or `Infinity` when the scanner is
 *   in multibyte mode and has no mapping for that index.
 */
charIndexToByteIndex(charIndex: number = this.charIndex): number {
  // Without multibyte characters the two index spaces are identical.
  if (!this.multiByteMode) {
    return charIndex;
  }

  let byteIndexes = this.charsToBytes as number[];
  return byteIndexes[charIndex] ?? Infinity;
}

/**
* Consumes and returns the given number of characters if possible, advancing
* the scanner and stopping if the end of the string is reached.
Expand Down
17 changes: 17 additions & 0 deletions src/lib/XmlNode.ts
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,18 @@ export class XmlNode {
*/
parent: XmlDocument | XmlElement | null = null;

/**
* Starting byte offset of this node in the original XML string, or `-1` if
* the offset is unknown.
*
* The parser only assigns this when its `includeOffsets` option is enabled;
* otherwise the default of `-1` is left in place.
*
* NOTE(review): the parser tests expect a character such as 🐈 to advance the
* offset by 2, which matches JavaScript string indexes (UTF-16 code units)
* rather than UTF-8 bytes — confirm the intended meaning of "byte" here.
*/
start = -1;

/**
* Ending byte offset of this node in the original XML string, or `-1` if the
* offset is unknown.
*
* Assigned together with `start` when the parser's `includeOffsets` option is
* enabled. Per the parser tests the value is exclusive: it is the offset just
* past the node's last character.
*/
end = -1;

/**
* Document that contains this node, or `null` if this node is not associated
* with a document.
Expand Down Expand Up @@ -102,6 +114,11 @@ export class XmlNode {
json.preserveWhitespace = true;
}

if (this.start !== -1) {
json.start = this.start;
json.end = this.end;
}

return json;
}
}
2 changes: 1 addition & 1 deletion src/lib/syntax.ts
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,7 @@ export const attValueCharSingleQuote = /[^'&<]+/y;
*
* @see https://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize
*/
// `\r\n` is listed first so that a CRLF pair is matched (and therefore
// replaced) as one unit rather than as two separate whitespace characters.
// Lone LF, CR, and tab characters are matched individually.
export const attValueNormalizedWhitespace = /\r\n|[\n\r\t]/g;

/**
* Regular expression that matches one or more characters that signal the end of
Expand Down
27 changes: 27 additions & 0 deletions tests/lib/Parser.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,11 @@ describe('Parser', () => {
assert.strictEqual(doc.root.name, 'root');
});

// A tab, an LF, a CRLF pair, and a lone CR inside the attribute value are each
// expected to come out as exactly one space (the CRLF pair must not yield two).
it('normalizes whitespace in attribute values', () => {
let { root } = parseXml('<root attr=" one two\tthree\nfour\r\nfive\rsix " />');
assert.strictEqual(root.attributes.attr, ' one two three four five six ');
});

describe('when `options.ignoreUndefinedEntities` is `true`', () => {
beforeEach(() => {
options.ignoreUndefinedEntities = true;
Expand All @@ -52,6 +57,28 @@ describe('Parser', () => {
});
});

// Offsets are indexes into the original input string. Each emoji below
// advances the expected offset by 2, not 1.
describe('when `options.includeOffsets` is `true`', () => {
it('the start offset is a byte offset, not a character offset', () => {
let { root } = parseXml('<root><cat>🐈</cat><dog>🐕</dog></root>', { includeOffsets: true });
// `<root>` (6) + `<cat>` (5) + 🐈 (2) + `</cat>` (6) = 19, where `<dog>` begins.
assert.strictEqual(root.children[1].start, 19);
});

it('the end offset is a byte offset, not a character offset', () => {
let { root } = parseXml('<root><cat>🐈</cat><dog>🐕</dog></root>', { includeOffsets: true });
// 19 + `<dog>` (5) + 🐕 (2) + `</dog>` (6) = 32, just past the end of `</dog>`.
assert.strictEqual(root.children[1].end, 32);
});

it('a byte order mark character is counted in the offset', () => {
let { root } = parseXml('\uFEFF<root>foo</root>', { includeOffsets: true });
// BOM (1) + `<root>` (6) = 7: the BOM still occupies a position in the input.
assert.strictEqual(root.children[0].start, 7);
});

it('a carriage return character is not counted in the offset', () => {
let { root } = parseXml('<root>\rfoo</root>', { includeOffsets: true });
// The text node starts right after `<root>` (6), at the position of the raw `\r`.
assert.strictEqual(root.children[0].start, 6);
});
});

describe('when `options.resolveUndefinedEntity` is set', () => {
beforeEach(() => {
options.resolveUndefinedEntity = (ref) => {
Expand Down
Loading

0 comments on commit dd7c8b1

Please sign in to comment.