From 657cc11806e6dfb0872e0365b3c9b9b2c3831963 Mon Sep 17 00:00:00 2001
From: Ryan Grove <ryan@wonko.com>
Date: Mon, 16 Jan 2023 16:10:48 -0800
Subject: [PATCH 1/5] Add optional byte offsets to nodes

This adds a new parser option named `includeOffsets`, which defaults to
`false`. When `includeOffsets` is `true`, the byte offset of each node
in the input string will be made available via an `offset` property on
the node.

Note that this offset doesn't take into account any carriage return
(`\r`) characters in the input string because carriage returns are
removed during a normalization step before parsing begins.

Closes #24
---
 src/lib/Parser.ts                          | 65 ++++++++++++++--------
 src/lib/StringScanner.ts                   | 20 +++----
 src/lib/XmlNode.ts                         | 10 ++++
 tests/lib/Parser.test.js                   | 17 ++++++
 tests/lib/XmlCdata.test.js                 | 16 ++++++
 tests/lib/XmlComment.test.js               | 16 ++++++
 tests/lib/XmlDocument.test.js              | 16 ++++++
 tests/lib/XmlElement.test.js               | 18 ++++++
 tests/lib/XmlNode.test.js                  | 17 ++++++
 tests/lib/XmlProcessingInstruction.test.js | 16 ++++++
 tests/lib/XmlText.test.js                  | 16 ++++++
 11 files changed, 195 insertions(+), 32 deletions(-)

diff --git a/src/lib/Parser.ts b/src/lib/Parser.ts
index 7e884e9..ac9787f 100644
--- a/src/lib/Parser.ts
+++ b/src/lib/Parser.ts
@@ -9,7 +9,6 @@ import { XmlText } from './XmlText.js';
 
 import type { XmlNode } from './XmlNode.js';
 
-
 const emptyString = '';
 
 /**
@@ -34,6 +33,11 @@ export class Parser {
     this.options = options;
     this.scanner = new StringScanner(normalizeXmlString(xml));
 
+    if (this.options.includeOffsets) {
+      this.document.offset = 0;
+    }
+
+    this.scanner.consumeStringFast('\uFEFF'); // byte order mark
     this.consumeProlog();
 
     if (!this.consumeElement()) {
@@ -50,9 +54,13 @@ export class Parser {
   /**
    * Adds the given `XmlNode` as a child of `this.currentNode`.
    */
-  addNode(node: XmlNode) {
+  addNode(node: XmlNode, charIndex: number) {
     node.parent = this.currentNode;
 
+    if (this.options.includeOffsets) {
+      node.offset = this.scanner.charIndexToByteIndex(charIndex);
+    }
+
     // @ts-expect-error: XmlDocument has a more limited set of possible children
     // than XmlElement so TypeScript is unhappy, but we always do the right
     // thing.
@@ -63,7 +71,7 @@ export class Parser {
    * Adds the given _text_ to the document, either by appending it to a
    * preceding `XmlText` node (if possible) or by creating a new `XmlText` node.
    */
-  addText(text: string) {
+  addText(text: string, charIndex: number) {
     let { children } = this.currentNode;
     let { length } = children;
 
@@ -78,7 +86,7 @@ export class Parser {
       }
     }
 
-    this.addNode(new XmlText(text));
+    this.addNode(new XmlText(text), charIndex);
   }
 
   /**
@@ -199,6 +207,7 @@ export class Parser {
    */
   consumeCdataSection(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
 
     if (!scanner.consumeStringFast('<![CDATA[')) {
       return false;
@@ -212,9 +221,9 @@ export class Parser {
     }
 
     if (this.options.preserveCdata) {
-      this.addNode(new XmlCdata(text));
+      this.addNode(new XmlCdata(text), startIndex);
     } else {
-      this.addText(text);
+      this.addText(text, startIndex);
     }
 
     return true;
@@ -228,6 +237,7 @@ export class Parser {
    */
   consumeCharData(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
     let charData = scanner.consumeUntilMatch(syntax.endCharData);
 
     if (!charData) {
@@ -240,7 +250,7 @@ export class Parser {
       throw this.error('Element content may not contain the CDATA section close delimiter `]]>`');
     }
 
-    this.addText(charData);
+    this.addText(charData, startIndex);
     return true;
   }
 
@@ -252,6 +262,7 @@ export class Parser {
    */
   consumeComment(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
 
     if (!scanner.consumeStringFast('<!--')) {
       return false;
@@ -269,7 +280,7 @@ export class Parser {
     }
 
     if (this.options.preserveComments) {
-      this.addNode(new XmlComment(content.trim()));
+      this.addNode(new XmlComment(content.trim()), startIndex);
     }
 
     return true;
@@ -285,10 +296,11 @@ export class Parser {
    * @see https://www.w3.org/TR/2008/REC-xml-20081126/#entproc
    */
   consumeContentReference(): boolean {
+    let startIndex = this.scanner.charIndex;
     let ref = this.consumeReference();
 
     if (ref) {
-      this.addText(ref);
+      this.addText(ref, startIndex);
       return true;
     }
 
@@ -334,7 +346,7 @@ export class Parser {
    */
   consumeElement(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
 
     if (!scanner.consumeStringFast('<')) {
       return false;
@@ -343,7 +355,7 @@ export class Parser {
     let name = this.consumeName();
 
     if (!name) {
-      scanner.reset(mark);
+      scanner.reset(startIndex);
       return false;
     }
 
@@ -390,7 +402,7 @@ export class Parser {
       this.currentNode = element.parent;
     }
 
-    this.addNode(element);
+    this.addNode(element, startIndex);
     return true;
   }
 
@@ -443,7 +455,7 @@ export class Parser {
    */
   consumeProcessingInstruction(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
 
     if (!scanner.consumeStringFast('<?')) {
       return false;
@@ -453,7 +465,7 @@ export class Parser {
 
     if (name) {
       if (name.toLowerCase() === 'xml') {
-        scanner.reset(mark);
+        scanner.reset(startIndex);
         throw this.error("XML declaration isn't allowed here");
       }
     } else {
@@ -462,7 +474,7 @@ export class Parser {
 
     if (!this.consumeWhitespace()) {
       if (scanner.consumeStringFast('?>')) {
-        this.addNode(new XmlProcessingInstruction(name));
+        this.addNode(new XmlProcessingInstruction(name), startIndex);
         return true;
       }
 
@@ -476,7 +488,7 @@ export class Parser {
       throw this.error('Unterminated processing instruction');
     }
 
-    this.addNode(new XmlProcessingInstruction(name, content));
+    this.addNode(new XmlProcessingInstruction(name, content), startIndex);
     return true;
   }
 
@@ -488,7 +500,7 @@ export class Parser {
    */
   consumeProlog(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
 
     this.consumeXmlDeclaration();
 
@@ -498,7 +510,7 @@ export class Parser {
       while (this.consumeMisc()) {} // eslint-disable-line no-empty
     }
 
-    return mark < scanner.charIndex;
+    return startIndex < scanner.charIndex;
   }
 
   /**
@@ -765,10 +777,6 @@ export class Parser {
  * and replacing CRLF sequences and lone CR characters with LF characters.
  */
 function normalizeXmlString(xml: string): string {
-  if (xml[0] === '\uFEFF') {
-    xml = xml.slice(1);
-  }
-
   return xml.replace(/\r\n?/g, '\n');
 }
 
@@ -782,6 +790,19 @@ export type ParserOptions = {
    */
   ignoreUndefinedEntities?: boolean;
 
+  /**
+   * When `true`, the byte offset of each node in the input string will be
+   * made available via an `offset` property on the node.
+   *
+   * Note that this offset doesn't take into account any carriage return (`\r`)
+   * characters in the input string because carriage returns are removed during
+   * a normalization step before parsing begins.
+   *
+   * @default false
+   * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-line-ends
+   */
+  includeOffsets?: boolean;
+
   /**
    * When `true`, CDATA sections will be preserved in the document as `XmlCdata`
    * nodes. Otherwise CDATA sections will be represented as `XmlText` nodes,
diff --git a/src/lib/StringScanner.ts b/src/lib/StringScanner.ts
index 5c48b17..5fe9f9e 100644
--- a/src/lib/StringScanner.ts
+++ b/src/lib/StringScanner.ts
@@ -42,16 +42,6 @@ export class StringScanner {
 
   // -- Protected Methods ------------------------------------------------------
 
-  /**
-   * Returns the byte index of the given character index in the string. The two
-   * may differ in strings that contain multibyte characters.
-   */
-  protected charIndexToByteIndex(charIndex: number = this.charIndex): number {
-    return this.multiByteMode
-      ? (this.charsToBytes as number[])[charIndex] ?? Infinity
-      : charIndex;
-  }
-
   /**
    * Returns the number of characters in the given string, which may differ from
    * the byte length if the string contains multibyte characters.
@@ -75,6 +65,16 @@ export class StringScanner {
     this.charIndex = Math.min(this.charCount, this.charIndex + count);
   }
 
+  /**
+   * Returns the byte index of the given character index in the string. The two
+   * may differ in strings that contain multibyte characters.
+   */
+  charIndexToByteIndex(charIndex: number = this.charIndex): number {
+    return this.multiByteMode
+      ? (this.charsToBytes as number[])[charIndex] ?? Infinity
+      : charIndex;
+  }
+
   /**
    * Consumes and returns the given number of characters if possible, advancing
    * the scanner and stopping if the end of the string is reached.
diff --git a/src/lib/XmlNode.ts b/src/lib/XmlNode.ts
index 656328a..888283a 100644
--- a/src/lib/XmlNode.ts
+++ b/src/lib/XmlNode.ts
@@ -36,6 +36,12 @@ export class XmlNode {
    */
   static readonly TYPE_TEXT = 'text';
 
+  /**
+   * Byte offset of this node in the original XML string, or `-1` if the offset
+   * is unknown.
+   */
+  offset = -1;
+
   /**
    * Parent node of this node, or `null` if this node has no parent.
    */
@@ -102,6 +108,10 @@ export class XmlNode {
       json.preserveWhitespace = true;
     }
 
+    if (this.offset !== -1) {
+      json.offset = this.offset;
+    }
+
     return json;
   }
 }
diff --git a/tests/lib/Parser.test.js b/tests/lib/Parser.test.js
index ce1bf43..0cc4ab3 100644
--- a/tests/lib/Parser.test.js
+++ b/tests/lib/Parser.test.js
@@ -52,6 +52,23 @@ describe('Parser', () => {
     });
   });
 
+  describe('when `options.includeOffsets` is `true`', () => {
+    it('the offset is a byte offset, not a character offset', () => {
+      let { root } = parseXml('<root><cat>🐈</cat><dog>🐕</dog></root>', { includeOffsets: true });
+      assert.strictEqual(root.children[1].offset, 19);
+    });
+
+    it('a byte order mark character is counted in the offset', () => {
+      let { root } = parseXml('\uFEFF<root>foo</root>', { includeOffsets: true });
+      assert.strictEqual(root.children[0].offset, 7);
+    });
+
+    it('a carriage return character is not counted in the offset', () => {
+      let { root } = parseXml('<root>\rfoo</root>', { includeOffsets: true });
+      assert.strictEqual(root.children[0].offset, 6);
+    });
+  });
+
   describe('when `options.resolveUndefinedEntity` is set', () => {
     beforeEach(() => {
       options.resolveUndefinedEntity = (ref) => {
diff --git a/tests/lib/XmlCdata.test.js b/tests/lib/XmlCdata.test.js
index 0b7d8fd..cf0c4d4 100644
--- a/tests/lib/XmlCdata.test.js
+++ b/tests/lib/XmlCdata.test.js
@@ -26,6 +26,22 @@ describe('XmlCdata', () => {
     assert.strictEqual(node.parent, root);
   });
 
+  describe('offset', () => {
+    describe('when `options.includeOffsets` is `false`', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml(xml, { preserveCdata: true });
+        assert.strictEqual(root.children[0].offset, -1);
+      });
+    });
+
+    describe('when `options.includeOffsets` is `true`', () => {
+      it('is the byte offset of the CDATA section', () => {
+        let { root } = parseXml(xml, { includeOffsets: true, preserveCdata: true });
+        assert.strictEqual(root.children[0].offset, 6);
+      });
+    });
+  });
+
   describe('type', () => {
     it('is `XmlNode.TYPE_CDATA`', () => {
       let { root } = parseXml(xml, { preserveCdata: true });
diff --git a/tests/lib/XmlComment.test.js b/tests/lib/XmlComment.test.js
index 2ea400c..9f275ab 100644
--- a/tests/lib/XmlComment.test.js
+++ b/tests/lib/XmlComment.test.js
@@ -35,6 +35,22 @@ describe('XmlComment', () => {
     });
   });
 
+  describe('offset', () => {
+    describe('when `options.includeOffsets` is `false`', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml(`<root><!-- I'm a comment! --></root>`, { preserveComments: true });
+        assert.strictEqual(root.children[0].offset, -1);
+      });
+    });
+
+    describe('when `options.includeOffsets` is `true`', () => {
+      it('is the byte offset of the comment', () => {
+        let { root } = parseXml(`<root><!-- I'm a comment! --></root>`, { includeOffsets: true, preserveComments: true });
+        assert.strictEqual(root.children[0].offset, 6);
+      });
+    });
+  });
+
   describe('parent', () => {
     it('is the parent node', () => {
       let { root } = parseXml(`<root><!-- I'm a comment! --></root>`, { preserveComments: true });
diff --git a/tests/lib/XmlDocument.test.js b/tests/lib/XmlDocument.test.js
index 21eafc7..195bb45 100644
--- a/tests/lib/XmlDocument.test.js
+++ b/tests/lib/XmlDocument.test.js
@@ -39,6 +39,22 @@ describe('XmlDocument', () => {
     });
   });
 
+  describe('offset', () => {
+    describe('when `options.includeOffsets` is `false`', () => {
+      it('is `-1`', () => {
+        let doc = parseXml('<root />');
+        assert.strictEqual(doc.offset, -1);
+      });
+    });
+
+    describe('when `options.includeOffsets` is `true`', () => {
+      it('is the byte offset of the document (which is always `0`)', () => {
+        let doc = parseXml('<root />', { includeOffsets: true });
+        assert.strictEqual(doc.offset, 0);
+      });
+    });
+  });
+
   describe('parent', () => {
     it('is `null`', () => {
       assert.strictEqual(parseXml('<root />').parent, null);
diff --git a/tests/lib/XmlElement.test.js b/tests/lib/XmlElement.test.js
index 05eb751..25a8c78 100644
--- a/tests/lib/XmlElement.test.js
+++ b/tests/lib/XmlElement.test.js
@@ -129,6 +129,24 @@ describe('XmlElement', () => {
     });
   });
 
+  describe('offset', () => {
+    describe('when `options.includeOffsets` is `false`', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml('<root />');
+        assert.strictEqual(root.offset, -1);
+      });
+    });
+
+    describe('when `options.includeOffsets` is `true`', () => {
+      it('is the offset of the element in the document', () => {
+        let { root } = parseXml('<a><b><c /></b></a>', { includeOffsets: true });
+        assert.strictEqual(root.offset, 0);
+        assert.strictEqual(root.children[0].offset, 3);
+        assert.strictEqual(root.children[0].children[0].offset, 6);
+      });
+    });
+  });
+
   describe('parent', () => {
     describe('when the element is the root element', () => {
       it('is the document', () => {
diff --git a/tests/lib/XmlNode.test.js b/tests/lib/XmlNode.test.js
index c79ff99..fd24ae5 100644
--- a/tests/lib/XmlNode.test.js
+++ b/tests/lib/XmlNode.test.js
@@ -15,6 +15,23 @@ describe('XmlNode', () => {
     });
   });
 
+  describe('toJSON()', () => {
+    describe('when `offset` is `-1`', () => {
+      it('doesn\'t include an `offset` property', () => {
+        let node = new XmlNode();
+        assert.strictEqual(node.toJSON().offset, undefined);
+      });
+    });
+
+    describe('when `offset` is greater than -1', () => {
+      it('includes an `offset` property', () => {
+        let node = new XmlNode();
+        node.offset = 0;
+        assert.strictEqual(node.toJSON().offset, 0);
+      });
+    });
+  });
+
   describe('type', () => {
     it('is an empty string', () => {
       let node = new XmlNode();
diff --git a/tests/lib/XmlProcessingInstruction.test.js b/tests/lib/XmlProcessingInstruction.test.js
index 681605c..a507ee9 100644
--- a/tests/lib/XmlProcessingInstruction.test.js
+++ b/tests/lib/XmlProcessingInstruction.test.js
@@ -30,6 +30,22 @@ describe('XmlProcessingInstruction', () => {
     });
   });
 
+  describe('offset', () => {
+    describe('when `options.includeOffsets` is `false`', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml('<root><?foo?></root>');
+        assert.strictEqual(root.children[0].offset, -1);
+      });
+    });
+
+    describe('when `options.includeOffsets` is `true`', () => {
+      it('is the byte offset of the processing instruction', () => {
+        let { root } = parseXml('<root><?foo?></root>', { includeOffsets: true });
+        assert.strictEqual(root.children[0].offset, 6);
+      });
+    });
+  });
+
   describe('parent', () => {
     it('is the parent element', () => {
       let { root } = parseXml('<root><?foo?></root>');
diff --git a/tests/lib/XmlText.test.js b/tests/lib/XmlText.test.js
index 4a5cb7a..1a608bc 100644
--- a/tests/lib/XmlText.test.js
+++ b/tests/lib/XmlText.test.js
@@ -23,6 +23,22 @@ describe('XmlText', () => {
     });
   });
 
+  describe('offset', () => {
+    describe('when `options.includeOffsets` is `false`', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml('<root> foo &amp; bar\r\nbaz </root>');
+        assert.strictEqual(root.children[0].offset, -1);
+      });
+    });
+
+    describe('when `options.includeOffsets` is `true`', () => {
+      it('is the byte offset of the text node', () => {
+        let { root } = parseXml('<root> foo </root>', { includeOffsets: true });
+        assert.strictEqual(root.children[0].offset, 6);
+      });
+    });
+  });
+
   describe('text', () => {
     it('is the text content of the text node', () => {
       let { root } = parseXml('<root> foo &amp;  bar\r\nbaz </root>');

From 3a0275025e366a4affe994a30fca87b825d00e10 Mon Sep 17 00:00:00 2001
From: Ryan Grove <ryan@wonko.com>
Date: Mon, 16 Jan 2023 16:16:22 -0800
Subject: [PATCH 2/5] Update comment

---
 src/lib/Parser.ts | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/lib/Parser.ts b/src/lib/Parser.ts
index ac9787f..39783c9 100644
--- a/src/lib/Parser.ts
+++ b/src/lib/Parser.ts
@@ -773,8 +773,8 @@ export class Parser {
 // -- Private Functions --------------------------------------------------------
 
 /**
- * Normalizes the given XML string by stripping a byte order mark (if present)
- * and replacing CRLF sequences and lone CR characters with LF characters.
+ * Normalizes the given XML string by replacing CRLF sequences and lone CR
+ * characters with LF characters.
  */
 function normalizeXmlString(xml: string): string {
   return xml.replace(/\r\n?/g, '\n');

From 7bb471849572f9f0a120d0d1dfc3809313d9570e Mon Sep 17 00:00:00 2001
From: Ryan Grove <ryan@wonko.com>
Date: Sat, 21 Jan 2023 17:06:03 -0800
Subject: [PATCH 3/5] Normalize line breaks during parsing instead of at input
 time

This will allow us to accurately report byte offsets in input strings
that contain carriage return characters.

While the additional calls to `normalizeLineBreaks()` may seem like a
potential performance problem, rewriting the normalization to avoid a
regex has actually resulted in a very small performance improvement
overall.
---
 src/lib/Parser.ts        | 26 ++++++++++++++++++--------
 src/lib/syntax.ts        |  2 +-
 tests/lib/Parser.test.js |  5 +++++
 3 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/src/lib/Parser.ts b/src/lib/Parser.ts
index 39783c9..909d64b 100644
--- a/src/lib/Parser.ts
+++ b/src/lib/Parser.ts
@@ -31,7 +31,7 @@ export class Parser {
     this.document = new XmlDocument();
     this.currentNode = this.document;
     this.options = options;
-    this.scanner = new StringScanner(normalizeXmlString(xml));
+    this.scanner = new StringScanner(xml);
 
     if (this.options.includeOffsets) {
       this.document.offset = 0;
@@ -75,6 +75,8 @@ export class Parser {
     let { children } = this.currentNode;
     let { length } = children;
 
+    text = normalizeLineBreaks(text);
+
     if (length > 0) {
       let prevNode = children[length - 1];
 
@@ -221,7 +223,7 @@ export class Parser {
     }
 
     if (this.options.preserveCdata) {
-      this.addNode(new XmlCdata(text), startIndex);
+      this.addNode(new XmlCdata(normalizeLineBreaks(text)), startIndex);
     } else {
       this.addText(text, startIndex);
     }
@@ -280,7 +282,7 @@ export class Parser {
     }
 
     if (this.options.preserveComments) {
-      this.addNode(new XmlComment(content.trim()), startIndex);
+      this.addNode(new XmlComment(normalizeLineBreaks(content.trim())), startIndex);
     }
 
     return true;
@@ -488,7 +490,7 @@ export class Parser {
       throw this.error('Unterminated processing instruction');
     }
 
-    this.addNode(new XmlProcessingInstruction(name, content), startIndex);
+    this.addNode(new XmlProcessingInstruction(name, normalizeLineBreaks(content)), startIndex);
     return true;
   }
 
@@ -773,11 +775,19 @@ export class Parser {
 // -- Private Functions --------------------------------------------------------
 
 /**
- * Normalizes the given XML string by replacing CRLF sequences and lone CR
- * characters with LF characters.
+ * Normalizes line breaks in the given text by replacing CRLF sequences and lone
+ * CR characters with LF characters.
  */
-function normalizeXmlString(xml: string): string {
-  return xml.replace(/\r\n?/g, '\n');
+function normalizeLineBreaks(text: string): string {
+  let i = 0;
+
+  while ((i = text.indexOf('\r', i)) !== -1) {
+    text = text[i + 1] === '\n'
+      ? text.slice(0, i) + text.slice(i + 1)
+      : text.slice(0, i) + '\n' + text.slice(i + 1);
+  }
+
+  return text;
 }
 
 // -- Types --------------------------------------------------------------------
diff --git a/src/lib/syntax.ts b/src/lib/syntax.ts
index 79da80f..6cb4b97 100644
--- a/src/lib/syntax.ts
+++ b/src/lib/syntax.ts
@@ -20,7 +20,7 @@ export const attValueCharSingleQuote = /[^'&<]+/y;
  *
  * @see https://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize
  */
-export const attValueNormalizedWhitespace = /[\t\n]/g;
+export const attValueNormalizedWhitespace = /\r\n|[\n\r\t]/g;
 
 /**
  * Regular expression that matches one or more characters that signal the end of
diff --git a/tests/lib/Parser.test.js b/tests/lib/Parser.test.js
index 0cc4ab3..8571811 100644
--- a/tests/lib/Parser.test.js
+++ b/tests/lib/Parser.test.js
@@ -40,6 +40,11 @@ describe('Parser', () => {
     assert.strictEqual(doc.root.name, 'root');
   });
 
+  it('normalizes whitespace in attribute values', () => {
+    let { root } = parseXml('<root attr=" one two\tthree\nfour\r\nfive\rsix " />');
+    assert.strictEqual(root.attributes.attr, ' one two three four five six ');
+  });
+
   describe('when `options.ignoreUndefinedEntities` is `true`', () => {
     beforeEach(() => {
       options.ignoreUndefinedEntities = true;

From 1b4ca3866378ac3c1cc209eb3337b17177e22aab Mon Sep 17 00:00:00 2001
From: Ryan Grove <ryan@wonko.com>
Date: Sat, 21 Jan 2023 17:57:16 -0800
Subject: [PATCH 4/5] Include both start and end offsets when `includeOffsets`
 is true

---
 src/lib/Parser.ts                          | 31 +++++++++++---------
 src/lib/XmlNode.ts                         | 23 ++++++++++-----
 tests/lib/Parser.test.js                   | 13 ++++++---
 tests/lib/XmlCdata.test.js                 | 28 ++++++++++++++----
 tests/lib/XmlComment.test.js               | 28 ++++++++++++++----
 tests/lib/XmlDocument.test.js              | 28 ++++++++++++++----
 tests/lib/XmlElement.test.js               | 34 +++++++++++++++++-----
 tests/lib/XmlNode.test.js                  | 21 ++++++++-----
 tests/lib/XmlProcessingInstruction.test.js | 28 ++++++++++++++----
 tests/lib/XmlText.test.js                  | 33 +++++++++++++++++----
 10 files changed, 195 insertions(+), 72 deletions(-)

diff --git a/src/lib/Parser.ts b/src/lib/Parser.ts
index 909d64b..ab7a103 100644
--- a/src/lib/Parser.ts
+++ b/src/lib/Parser.ts
@@ -28,16 +28,18 @@ export class Parser {
    * @param options Parser options.
    */
   constructor(xml: string, options: ParserOptions = {}) {
-    this.document = new XmlDocument();
-    this.currentNode = this.document;
+    let doc = this.document = new XmlDocument();
+    let scanner = this.scanner = new StringScanner(xml);
+
+    this.currentNode = doc;
     this.options = options;
-    this.scanner = new StringScanner(xml);
 
     if (this.options.includeOffsets) {
-      this.document.offset = 0;
+      doc.start = 0;
+      doc.end = xml.length;
     }
 
-    this.scanner.consumeStringFast('\uFEFF'); // byte order mark
+    scanner.consumeStringFast('\uFEFF'); // byte order mark
     this.consumeProlog();
 
     if (!this.consumeElement()) {
@@ -46,7 +48,7 @@ export class Parser {
 
     while (this.consumeMisc()) {} // eslint-disable-line no-empty
 
-    if (!this.scanner.isEnd) {
+    if (!scanner.isEnd) {
       throw this.error('Extra content at the end of the document');
     }
   }
@@ -58,7 +60,8 @@ export class Parser {
     node.parent = this.currentNode;
 
     if (this.options.includeOffsets) {
-      node.offset = this.scanner.charIndexToByteIndex(charIndex);
+      node.start = this.scanner.charIndexToByteIndex(charIndex);
+      node.end = this.scanner.charIndexToByteIndex();
     }
 
     // @ts-expect-error: XmlDocument has a more limited set of possible children
@@ -84,6 +87,11 @@ export class Parser {
         // The previous node is a text node, so we can append to it and avoid
         // creating another node.
         prevNode.text += text;
+
+        if (this.options.includeOffsets) {
+          prevNode.end = this.scanner.charIndexToByteIndex();
+        }
+
         return;
       }
     }
@@ -801,15 +809,10 @@ export type ParserOptions = {
   ignoreUndefinedEntities?: boolean;
 
   /**
-   * When `true`, the byte offset of each node in the input string will be
-   * made available via an `offset` property on the node.
-   *
-   * Note that this offset doesn't take into account any carriage return (`\r`)
-   * characters in the input string because carriage returns are removed during
-   * a normalization step before parsing begins.
+   * When `true`, the starting and ending byte offsets of each node in the input
+   * string will be made available via `start` and `end` properties on the node.
    *
    * @default false
-   * @see https://www.w3.org/TR/2008/REC-xml-20081126/#sec-line-ends
    */
   includeOffsets?: boolean;
 
diff --git a/src/lib/XmlNode.ts b/src/lib/XmlNode.ts
index 888283a..ddefae6 100644
--- a/src/lib/XmlNode.ts
+++ b/src/lib/XmlNode.ts
@@ -36,12 +36,6 @@ export class XmlNode {
    */
   static readonly TYPE_TEXT = 'text';
 
-  /**
-   * Byte offset of this node in the original XML string, or `-1` if the offset
-   * is unknown.
-   */
-  offset = -1;
-
   /**
    * Parent node of this node, or `null` if this node has no parent.
    */
@@ -55,6 +49,12 @@ export class XmlNode {
     return this.parent?.document ?? null;
   }
 
+  /**
+   * Ending byte offset of this node in the original XML string, or `-1` if the
+   * offset is unknown.
+   */
+  end = -1;
+
   /**
    * Whether this node is the root node of the document.
    */
@@ -77,6 +77,12 @@ export class XmlNode {
     return Boolean(this.parent?.preserveWhitespace);
   }
 
+  /**
+   * Starting byte offset of this node in the original XML string, or `-1` if
+   * the offset is unknown.
+   */
+  start = -1;
+
   /**
    * Type of this node.
    *
@@ -108,8 +114,9 @@ export class XmlNode {
       json.preserveWhitespace = true;
     }
 
-    if (this.offset !== -1) {
-      json.offset = this.offset;
+    if (this.start !== -1) {
+      json.start = this.start;
+      json.end = this.end;
     }
 
     return json;
diff --git a/tests/lib/Parser.test.js b/tests/lib/Parser.test.js
index 8571811..c360403 100644
--- a/tests/lib/Parser.test.js
+++ b/tests/lib/Parser.test.js
@@ -58,19 +58,24 @@ describe('Parser', () => {
   });
 
   describe('when `options.includeOffsets` is `true`', () => {
-    it('the offset is a byte offset, not a character offset', () => {
+    it('the start offset is a byte offset, not a character offset', () => {
       let { root } = parseXml('<root><cat>🐈</cat><dog>🐕</dog></root>', { includeOffsets: true });
-      assert.strictEqual(root.children[1].offset, 19);
+      assert.strictEqual(root.children[1].start, 19);
+    });
+
+    it('the end offset is a byte offset, not a character offset', () => {
+      let { root } = parseXml('<root><cat>🐈</cat><dog>🐕</dog></root>', { includeOffsets: true });
+      assert.strictEqual(root.children[1].end, 32);
     });
 
     it('a byte order mark character is counted in the offset', () => {
       let { root } = parseXml('\uFEFF<root>foo</root>', { includeOffsets: true });
-      assert.strictEqual(root.children[0].offset, 7);
+      assert.strictEqual(root.children[0].start, 7);
     });
 
     it('a carriage return character is not counted in the offset', () => {
       let { root } = parseXml('<root>\rfoo</root>', { includeOffsets: true });
-      assert.strictEqual(root.children[0].offset, 6);
+      assert.strictEqual(root.children[0].start, 6);
     });
   });
 
diff --git a/tests/lib/XmlCdata.test.js b/tests/lib/XmlCdata.test.js
index cf0c4d4..2a1ab96 100644
--- a/tests/lib/XmlCdata.test.js
+++ b/tests/lib/XmlCdata.test.js
@@ -26,18 +26,34 @@ describe('XmlCdata', () => {
     assert.strictEqual(node.parent, root);
   });
 
-  describe('offset', () => {
-    describe('when `options.includeOffsets` is `false`', () => {
+  describe('when `options.includeOffsets` is `false`', () => {
+    describe('start', () => {
       it('is `-1`', () => {
         let { root } = parseXml(xml, { preserveCdata: true });
-        assert.strictEqual(root.children[0].offset, -1);
+        assert.strictEqual(root.children[0].start, -1);
       });
     });
 
-    describe('when `options.includeOffsets` is `true`', () => {
-      it('is the byte offset of the CDATA section', () => {
+    describe('end', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml(xml, { preserveCdata: true });
+        assert.strictEqual(root.children[0].end, -1);
+      });
+    });
+  });
+
+  describe('when `options.includeOffsets` is `true`', () => {
+    describe('start', () => {
+      it('is the starting byte offset of the CDATA section', () => {
+        let { root } = parseXml(xml, { includeOffsets: true, preserveCdata: true });
+        assert.strictEqual(root.children[0].start, 6);
+      });
+    });
+
+    describe('end', () => {
+      it('is the ending byte offset of the CDATA section', () => {
         let { root } = parseXml(xml, { includeOffsets: true, preserveCdata: true });
-        assert.strictEqual(root.children[0].offset, 6);
+        assert.strictEqual(root.children[0].end, 33);
       });
     });
   });
diff --git a/tests/lib/XmlComment.test.js b/tests/lib/XmlComment.test.js
index 9f275ab..dc4d571 100644
--- a/tests/lib/XmlComment.test.js
+++ b/tests/lib/XmlComment.test.js
@@ -35,18 +35,34 @@ describe('XmlComment', () => {
     });
   });
 
-  describe('offset', () => {
-    describe('when `options.includeOffsets` is `false`', () => {
+  describe('when `options.includeOffsets` is `false`', () => {
+    describe('start', () => {
       it('is `-1`', () => {
         let { root } = parseXml(`<root><!-- I'm a comment! --></root>`, { preserveComments: true });
-        assert.strictEqual(root.children[0].offset, -1);
+        assert.strictEqual(root.children[0].start, -1);
       });
     });
 
-    describe('when `options.includeOffsets` is `true`', () => {
-      it('is the byte offset of the comment', () => {
+    describe('end', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml(`<root><!-- I'm a comment! --></root>`, { preserveComments: true });
+        assert.strictEqual(root.children[0].end, -1);
+      });
+    });
+  });
+
+  describe('when `options.includeOffsets` is `true`', () => {
+    describe('start', () => {
+      it('is the starting byte offset of the comment', () => {
+        let { root } = parseXml(`<root><!-- I'm a comment! --></root>`, { includeOffsets: true, preserveComments: true });
+        assert.strictEqual(root.children[0].start, 6);
+      });
+    });
+
+    describe('end', () => {
+      it('is the ending byte offset of the comment', () => {
         let { root } = parseXml(`<root><!-- I'm a comment! --></root>`, { includeOffsets: true, preserveComments: true });
-        assert.strictEqual(root.children[0].offset, 6);
+        assert.strictEqual(root.children[0].end, 29);
       });
     });
   });
diff --git a/tests/lib/XmlDocument.test.js b/tests/lib/XmlDocument.test.js
index 195bb45..86bb755 100644
--- a/tests/lib/XmlDocument.test.js
+++ b/tests/lib/XmlDocument.test.js
@@ -39,18 +39,34 @@ describe('XmlDocument', () => {
     });
   });
 
-  describe('offset', () => {
-    describe('when `options.includeOffsets` is `false`', () => {
+  describe('when `options.includeOffsets` is `false`', () => {
+    describe('start', () => {
       it('is `-1`', () => {
         let doc = parseXml('<root />');
-        assert.strictEqual(doc.offset, -1);
+        assert.strictEqual(doc.start, -1);
       });
     });
 
-    describe('when `options.includeOffsets` is `true`', () => {
-      it('is the byte offset of the document (which is always `0`)', () => {
+    describe('end', () => {
+      it('is `-1`', () => {
+        let doc = parseXml('<root />');
+        assert.strictEqual(doc.end, -1);
+      });
+    });
+  });
+
+  describe('when `options.includeOffsets` is `true`', () => {
+    describe('start', () => {
+      it('is the starting byte offset of the document', () => {
+        let doc = parseXml('<root />', { includeOffsets: true });
+        assert.strictEqual(doc.start, 0);
+      });
+    });
+
+    describe('end', () => {
+      it('is the ending byte offset of the document', () => {
         let doc = parseXml('<root />', { includeOffsets: true });
-        assert.strictEqual(doc.offset, 0);
+        assert.strictEqual(doc.end, 8);
       });
     });
   });
diff --git a/tests/lib/XmlElement.test.js b/tests/lib/XmlElement.test.js
index 25a8c78..7443e7c 100644
--- a/tests/lib/XmlElement.test.js
+++ b/tests/lib/XmlElement.test.js
@@ -129,20 +129,38 @@ describe('XmlElement', () => {
     });
   });
 
-  describe('offset', () => {
-    describe('when `options.includeOffsets` is `false`', () => {
+  describe('when `options.includeOffsets` is `false`', () => {
+    describe('start', () => {
       it('is `-1`', () => {
         let { root } = parseXml('<root />');
-        assert.strictEqual(root.offset, -1);
+        assert.strictEqual(root.start, -1);
       });
     });
 
-    describe('when `options.includeOffsets` is `true`', () => {
-      it('is the offset of the element in the document', () => {
+    describe('end', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml('<root />');
+        assert.strictEqual(root.end, -1);
+      });
+    });
+  });
+
+  describe('when `options.includeOffsets` is `true`', () => {
+    describe('start', () => {
+      it('is the starting byte offset of the element', () => {
+        let { root } = parseXml('<a><b><c /></b></a>', { includeOffsets: true });
+        assert.strictEqual(root.start, 0);
+        assert.strictEqual(root.children[0].start, 3);
+        assert.strictEqual(root.children[0].children[0].start, 6);
+      });
+    });
+
+    describe('end', () => {
+      it('is the ending byte offset of the element', () => {
         let { root } = parseXml('<a><b><c /></b></a>', { includeOffsets: true });
-        assert.strictEqual(root.offset, 0);
-        assert.strictEqual(root.children[0].offset, 3);
-        assert.strictEqual(root.children[0].children[0].offset, 6);
+        assert.strictEqual(root.end, 19);
+        assert.strictEqual(root.children[0].end, 15);
+        assert.strictEqual(root.children[0].children[0].end, 11);
       });
     });
   });
diff --git a/tests/lib/XmlNode.test.js b/tests/lib/XmlNode.test.js
index fd24ae5..39a6a1f 100644
--- a/tests/lib/XmlNode.test.js
+++ b/tests/lib/XmlNode.test.js
@@ -16,18 +16,23 @@ describe('XmlNode', () => {
   });
 
   describe('toJSON()', () => {
-    describe('when `offset` is `-1`', () => {
-      it('doesn\'t include an `offset` property', () => {
-        let node = new XmlNode();
-        assert.strictEqual(node.toJSON().offset, undefined);
+    describe('when `start` is `-1`', () => {
+      it('doesn\'t include the `start` or `end` properties', () => {
+        let json = new XmlNode().toJSON();
+        assert.strictEqual(json.start, undefined);
+        assert.strictEqual(json.end, undefined);
       });
     });
 
-    describe('when `offset` is greater than -1', () => {
-      it('includes an `offset` property', () => {
+    describe('when `start` is greater than -1', () => {
+      it('includes the `start` and `end` properties', () => {
         let node = new XmlNode();
-        node.offset = 0;
-        assert.strictEqual(node.toJSON().offset, 0);
+        node.start = 0;
+        node.end = 3;
+
+        let json = node.toJSON();
+        assert.strictEqual(json.start, 0);
+        assert.strictEqual(json.end, 3);
       });
     });
   });
diff --git a/tests/lib/XmlProcessingInstruction.test.js b/tests/lib/XmlProcessingInstruction.test.js
index a507ee9..ab8a24a 100644
--- a/tests/lib/XmlProcessingInstruction.test.js
+++ b/tests/lib/XmlProcessingInstruction.test.js
@@ -30,18 +30,34 @@ describe('XmlProcessingInstruction', () => {
     });
   });
 
-  describe('offset', () => {
-    describe('when `options.includeOffsets` is `false`', () => {
+  describe('when `options.includeOffsets` is `false`', () => {
+    describe('start', () => {
       it('is `-1`', () => {
         let { root } = parseXml('<root><?foo?></root>');
-        assert.strictEqual(root.children[0].offset, -1);
+        assert.strictEqual(root.children[0].start, -1);
       });
     });
 
-    describe('when `options.includeOffsets` is `true`', () => {
-      it('is the byte offset of the processing instruction', () => {
+    describe('end', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml('<root><?foo?></root>');
+        assert.strictEqual(root.children[0].end, -1);
+      });
+    });
+  });
+
+  describe('when `options.includeOffsets` is `true`', () => {
+    describe('start', () => {
+      it('is the starting byte offset of the processing instruction', () => {
+        let { root } = parseXml('<root><?foo?></root>', { includeOffsets: true });
+        assert.strictEqual(root.children[0].start, 6);
+      });
+    });
+
+    describe('end', () => {
+      it('is the ending byte offset of the processing instruction', () => {
         let { root } = parseXml('<root><?foo?></root>', { includeOffsets: true });
-        assert.strictEqual(root.children[0].offset, 6);
+        assert.strictEqual(root.children[0].end, 13);
       });
     });
   });
diff --git a/tests/lib/XmlText.test.js b/tests/lib/XmlText.test.js
index 1a608bc..ef86860 100644
--- a/tests/lib/XmlText.test.js
+++ b/tests/lib/XmlText.test.js
@@ -23,18 +23,39 @@ describe('XmlText', () => {
     });
   });
 
-  describe('offset', () => {
-    describe('when `options.includeOffsets` is `false`', () => {
+  describe('when `options.includeOffsets` is `false`', () => {
+    describe('start', () => {
       it('is `-1`', () => {
         let { root } = parseXml('<root> foo &amp; bar\r\nbaz </root>');
-        assert.strictEqual(root.children[0].offset, -1);
+        assert.strictEqual(root.children[0].start, -1);
       });
     });
 
-    describe('when `options.includeOffsets` is `true`', () => {
-      it('is the byte offset of the text node', () => {
+    describe('end', () => {
+      it('is `-1`', () => {
+        let { root } = parseXml('<root> foo &amp; bar\r\nbaz </root>');
+        assert.strictEqual(root.children[0].end, -1);
+      });
+    });
+  });
+
+  describe('when `options.includeOffsets` is `true`', () => {
+    describe('start', () => {
+      it('is the starting byte offset of the text node', () => {
         let { root } = parseXml('<root> foo </root>', { includeOffsets: true });
-        assert.strictEqual(root.children[0].offset, 6);
+        assert.strictEqual(root.children[0].start, 6);
+      });
+    });
+
+    describe('end', () => {
+      it('is the ending byte offset of the text node', () => {
+        let { root } = parseXml('<root> foo </root>', { includeOffsets: true });
+        assert.strictEqual(root.children[0].end, 11);
+      });
+
+      it('is correct after multiple text nodes have been merged', () => {
+        let { root } = parseXml('<root>one&amp;two<!-- comment -->three</root>', { includeOffsets: true });
+        assert.strictEqual(root.children[0].end, 38);
       });
     });
   });

From 87b6213aabc28b36f6e080ac6cea3a2f06e0e34f Mon Sep 17 00:00:00 2001
From: Ryan Grove <ryan@wonko.com>
Date: Sun, 22 Jan 2023 17:46:24 -0800
Subject: [PATCH 5/5] Swap the order of the `start` and `end` properties

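Move the `start` field declaration in `XmlNode` up so that it sits
immediately before `end`, and move the `document` getter below both
fields.

See https://github.com/rgrove/parse-xml/pull/25#issuecomment-1399544480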
---
 src/lib/XmlNode.ts | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/lib/XmlNode.ts b/src/lib/XmlNode.ts
index ddefae6..085b15a 100644
--- a/src/lib/XmlNode.ts
+++ b/src/lib/XmlNode.ts
@@ -42,12 +42,10 @@ export class XmlNode {
   parent: XmlDocument | XmlElement | null = null;
 
   /**
-   * Document that contains this node, or `null` if this node is not associated
-   * with a document.
+   * Starting byte offset of this node in the original XML string, or `-1` if
+   * the offset is unknown.
    */
-  get document(): XmlDocument | null {
-    return this.parent?.document ?? null;
-  }
+  start = -1;
 
   /**
    * Ending byte offset of this node in the original XML string, or `-1` if the
@@ -55,6 +53,14 @@ export class XmlNode {
    */
   end = -1;
 
+  /**
+   * Document that contains this node, or `null` if this node is not associated
+   * with a document.
+   */
+  get document(): XmlDocument | null {
+    return this.parent?.document ?? null;
+  }
+
   /**
    * Whether this node is the root node of the document.
    */
@@ -77,12 +83,6 @@ export class XmlNode {
     return Boolean(this.parent?.preserveWhitespace);
   }
 
-  /**
-   * Starting byte offset of this node in the original XML string, or `-1` if
-   * the offset is unknown.
-   */
-  start = -1;
-
   /**
    * Type of this node.
    *