diff --git a/src/lib/Parser.ts b/src/lib/Parser.ts index 7e884e9..ab7a103 100644 --- a/src/lib/Parser.ts +++ b/src/lib/Parser.ts @@ -9,7 +9,6 @@ import { XmlText } from './XmlText.js'; import type { XmlNode } from './XmlNode.js'; - const emptyString = ''; /** @@ -29,11 +28,18 @@ export class Parser { * @param options Parser options. */ constructor(xml: string, options: ParserOptions = {}) { - this.document = new XmlDocument(); - this.currentNode = this.document; + let doc = this.document = new XmlDocument(); + let scanner = this.scanner = new StringScanner(xml); + + this.currentNode = doc; this.options = options; - this.scanner = new StringScanner(normalizeXmlString(xml)); + if (this.options.includeOffsets) { + doc.start = 0; + doc.end = xml.length; + } + + scanner.consumeStringFast('\uFEFF'); // byte order mark this.consumeProlog(); if (!this.consumeElement()) { @@ -42,7 +48,7 @@ export class Parser { while (this.consumeMisc()) {} // eslint-disable-line no-empty - if (!this.scanner.isEnd) { + if (!scanner.isEnd) { throw this.error('Extra content at the end of the document'); } } @@ -50,9 +56,14 @@ export class Parser { /** * Adds the given `XmlNode` as a child of `this.currentNode`. */ - addNode(node: XmlNode) { + addNode(node: XmlNode, charIndex: number) { node.parent = this.currentNode; + if (this.options.includeOffsets) { + node.start = this.scanner.charIndexToByteIndex(charIndex); + node.end = this.scanner.charIndexToByteIndex(); + } + // @ts-expect-error: XmlDocument has a more limited set of possible children // than XmlElement so TypeScript is unhappy, but we always do the right // thing. @@ -63,10 +74,12 @@ export class Parser { * Adds the given _text_ to the document, either by appending it to a * preceding `XmlText` node (if possible) or by creating a new `XmlText` node. */ - addText(text: string) { + addText(text: string, charIndex: number) { let { children } = this.currentNode; let { length } = children; + text = normalizeLineBreaks(text); + if (length > 0) { let prevNode = children[length - 1]; @@ -74,11 +87,16 @@ export class Parser { // The previous node is a text node, so we can append to it and avoid // creating another node. prevNode.text += text; + + if (this.options.includeOffsets) { + prevNode.end = this.scanner.charIndexToByteIndex(); + } + return; } } - this.addNode(new XmlText(text)); + this.addNode(new XmlText(text), charIndex); } /** @@ -199,6 +217,7 @@ export class Parser { */ consumeCdataSection(): boolean { let { scanner } = this; + let startIndex = scanner.charIndex; if (!scanner.consumeStringFast('`'); } - this.addText(charData); + this.addText(charData, startIndex); return true; } @@ -252,6 +272,7 @@ export class Parser { */ consumeComment(): boolean { let { scanner } = this; + let startIndex = scanner.charIndex; if (!scanner.consumeStringFast('`, { preserveComments: true }); + assert.strictEqual(root.children[0].start, -1); + }); + }); + + describe('end', () => { + it('is `-1`', () => { + let { root } = parseXml(``, { preserveComments: true }); + assert.strictEqual(root.children[0].end, -1); + }); + }); + }); + + describe('when `options.includeOffsets` is `true`', () => { + describe('start', () => { + it('is the starting byte offset of the comment', () => { + let { root } = parseXml(``, { includeOffsets: true, preserveComments: true }); + assert.strictEqual(root.children[0].start, 6); + }); + }); + + describe('end', () => { + it('is the ending byte offset of the comment', () => { + let { root } = parseXml(``, { includeOffsets: true, preserveComments: true }); + assert.strictEqual(root.children[0].end, 29); + }); + }); + }); + describe('parent', () => { it('is the parent node', () => { let { root } = parseXml(``, { preserveComments: true }); diff --git a/tests/lib/XmlDocument.test.js b/tests/lib/XmlDocument.test.js index 21eafc7..86bb755 100644 --- a/tests/lib/XmlDocument.test.js +++ b/tests/lib/XmlDocument.test.js @@ -39,6 +39,38 @@ describe('XmlDocument', () => { }); }); + describe('when `options.includeOffsets` is `false`', () => { + describe('start', () => { + it('is `-1`', () => { + let doc = parseXml(''); + assert.strictEqual(doc.start, -1); + }); + }); + + describe('end', () => { + it('is `-1`', () => { + let doc = parseXml(''); + assert.strictEqual(doc.end, -1); + }); + }); + }); + + describe('when `options.includeOffsets` is `true`', () => { + describe('start', () => { + it('is the starting byte offset of the document', () => { + let doc = parseXml('', { includeOffsets: true }); + assert.strictEqual(doc.start, 0); + }); + }); + + describe('end', () => { + it('is the ending byte offset of the document', () => { + let doc = parseXml('', { includeOffsets: true }); + assert.strictEqual(doc.end, 8); + }); + }); + }); + describe('parent', () => { it('is `null`', () => { assert.strictEqual(parseXml('').parent, null); diff --git a/tests/lib/XmlElement.test.js b/tests/lib/XmlElement.test.js index 05eb751..7443e7c 100644 --- a/tests/lib/XmlElement.test.js +++ b/tests/lib/XmlElement.test.js @@ -129,6 +129,42 @@ describe('XmlElement', () => { }); }); + describe('when `options.includeOffsets` is `false`', () => { + describe('start', () => { + it('is `-1`', () => { + let { root } = parseXml(''); + assert.strictEqual(root.start, -1); + }); + }); + + describe('end', () => { + it('is `-1`', () => { + let { root } = parseXml(''); + assert.strictEqual(root.end, -1); + }); + }); + }); + + describe('when `options.includeOffsets` is `true`', () => { + describe('start', () => { + it('is the starting byte offset of the element', () => { + let { root } = parseXml('', { includeOffsets: true }); + assert.strictEqual(root.start, 0); + assert.strictEqual(root.children[0].start, 3); + assert.strictEqual(root.children[0].children[0].start, 6); + }); + }); + + describe('end', () => { + it('is the ending byte offset of the element', () => { + let { root } = parseXml('', { includeOffsets: true }); + assert.strictEqual(root.end, 19); + assert.strictEqual(root.children[0].end, 15); + assert.strictEqual(root.children[0].children[0].end, 11); + }); + }); + }); + describe('parent', () => { describe('when the element is the root element', () => { it('is the document', () => { diff --git a/tests/lib/XmlNode.test.js b/tests/lib/XmlNode.test.js index c79ff99..39a6a1f 100644 --- a/tests/lib/XmlNode.test.js +++ b/tests/lib/XmlNode.test.js @@ -15,6 +15,28 @@ describe('XmlNode', () => { }); }); + describe('toJSON()', () => { + describe('when `start` is `-1`', () => { + it('doesn\'t include the `start` or `end` properties', () => { + let json = new XmlNode().toJSON(); + assert.strictEqual(json.start, undefined); + assert.strictEqual(json.end, undefined); + }); + }); + + describe('when `start` is greater than -1', () => { + it('includes the `start` and `end` properties', () => { + let node = new XmlNode(); + node.start = 0; + node.end = 3; + + let json = node.toJSON(); + assert.strictEqual(json.start, 0); + assert.strictEqual(json.end, 3); + }); + }); + }); + describe('type', () => { it('is an empty string', () => { let node = new XmlNode(); diff --git a/tests/lib/XmlProcessingInstruction.test.js b/tests/lib/XmlProcessingInstruction.test.js index 681605c..ab8a24a 100644 --- a/tests/lib/XmlProcessingInstruction.test.js +++ b/tests/lib/XmlProcessingInstruction.test.js @@ -30,6 +30,38 @@ describe('XmlProcessingInstruction', () => { }); }); + describe('when `options.includeOffsets` is `false`', () => { + describe('start', () => { + it('is `-1`', () => { + let { root } = parseXml(''); + assert.strictEqual(root.children[0].start, -1); + }); + }); + + describe('end', () => { + it('is `-1`', () => { + let { root } = parseXml(''); + assert.strictEqual(root.children[0].end, -1); + }); + }); + }); + + describe('when `options.includeOffsets` is `true`', () => { + describe('start', () => { + it('is the starting byte offset of the processing instruction', () => { + let { root } = parseXml('', { includeOffsets: true }); + assert.strictEqual(root.children[0].start, 6); + }); + }); + + describe('end', () => { + it('is the ending byte offset of the processing instruction', () => { + let { root } = parseXml('', { includeOffsets: true }); + assert.strictEqual(root.children[0].end, 13); + }); + }); + }); + describe('parent', () => { it('is the parent element', () => { let { root } = parseXml(''); diff --git a/tests/lib/XmlText.test.js b/tests/lib/XmlText.test.js index 4a5cb7a..ef86860 100644 --- a/tests/lib/XmlText.test.js +++ b/tests/lib/XmlText.test.js @@ -23,6 +23,43 @@ describe('XmlText', () => { }); }); + describe('when `options.includeOffsets` is `false`', () => { + describe('start', () => { + it('is `-1`', () => { + let { root } = parseXml(' foo & bar\r\nbaz '); + assert.strictEqual(root.children[0].start, -1); + }); + }); + + describe('end', () => { + it('is `-1`', () => { + let { root } = parseXml(' foo & bar\r\nbaz '); + assert.strictEqual(root.children[0].end, -1); + }); + }); + }); + + describe('when `options.includeOffsets` is `true`', () => { + describe('start', () => { + it('is the starting byte offset of the text node', () => { + let { root } = parseXml(' foo ', { includeOffsets: true }); + assert.strictEqual(root.children[0].start, 6); + }); + }); + + describe('end', () => { + it('is the ending byte offset of the text node', () => { + let { root } = parseXml(' foo ', { includeOffsets: true }); + assert.strictEqual(root.children[0].end, 11); + }); + + it('is correct after multiple text nodes have been merged', () => { + let { root } = parseXml('one&twothree', { includeOffsets: true }); + assert.strictEqual(root.children[0].end, 38); + }); + }); + }); + describe('text', () => { it('is the text content of the text node', () => { let { root } = parseXml(' foo & bar\r\nbaz ');