rgrove · rgrove · Jan 28, 2023 · Jan 17, 2023 · Jan 17, 2023 · Jan 22, 2023
diff --git a/src/lib/Parser.ts b/src/lib/Parser.ts
@@ -9,7 +9,6 @@ import { XmlText } from './XmlText.js';
 
 import type { XmlNode } from './XmlNode.js';
 
-
 const emptyString = '';
 
 /**
@@ -29,11 +28,18 @@ export class Parser {
    * @param options Parser options.
    */
   constructor(xml: string, options: ParserOptions = {}) {
-    this.document = new XmlDocument();
-    this.currentNode = this.document;
+    let doc = this.document = new XmlDocument();
+    let scanner = this.scanner = new StringScanner(xml);
+
+    this.currentNode = doc;
     this.options = options;
-    this.scanner = new StringScanner(normalizeXmlString(xml));
 
+    if (this.options.includeOffsets) {
+      doc.start = 0;
+      doc.end = xml.length;
+    }
+
+    scanner.consumeStringFast('\uFEFF'); // byte order mark
     this.consumeProlog();
 
     if (!this.consumeElement()) {
@@ -42,17 +48,22 @@ export class Parser {
 
     while (this.consumeMisc()) {} // eslint-disable-line no-empty
 
-    if (!this.scanner.isEnd) {
+    if (!scanner.isEnd) {
       throw this.error('Extra content at the end of the document');
     }
   }
 
   /**
    * Adds the given `XmlNode` as a child of `this.currentNode`.
    */
-  addNode(node: XmlNode) {
+  addNode(node: XmlNode, charIndex: number) {
     node.parent = this.currentNode;
 
+    if (this.options.includeOffsets) {
+      node.start = this.scanner.charIndexToByteIndex(charIndex);
+      node.end = this.scanner.charIndexToByteIndex();
+    }
+
     // @ts-expect-error: XmlDocument has a more limited set of possible children
     // than XmlElement so TypeScript is unhappy, but we always do the right
     // thing.
@@ -63,22 +74,29 @@ export class Parser {
    * Adds the given _text_ to the document, either by appending it to a
    * preceding `XmlText` node (if possible) or by creating a new `XmlText` node.
    */
-  addText(text: string) {
+  addText(text: string, charIndex: number) {
     let { children } = this.currentNode;
     let { length } = children;
 
+    text = normalizeLineBreaks(text); // NOTE(review): also applies to text from consumeContentReference — confirm
+
     if (length > 0) {
       let prevNode = children[length - 1];
 
       if (prevNode instanceof XmlText) {
         // The previous node is a text node, so we can append to it and avoid
         // creating another node.
         prevNode.text += text;
+        // The merged node keeps its `start`; only `end` is updated, to the scanner's current position.
+        if (this.options.includeOffsets) {
+          prevNode.end = this.scanner.charIndexToByteIndex();
+        }
+
         return;
       }
     }
 
-    this.addNode(new XmlText(text));
+    this.addNode(new XmlText(text), charIndex);
   }
 
   /**
@@ -199,6 +217,7 @@ export class Parser {
    */
   consumeCdataSection(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
 
     if (!scanner.consumeStringFast('<![CDATA[')) {
       return false;
@@ -212,9 +231,9 @@ export class Parser {
     }
 
     if (this.options.preserveCdata) {
-      this.addNode(new XmlCdata(text));
+      this.addNode(new XmlCdata(normalizeLineBreaks(text)), startIndex);
     } else {
-      this.addText(text);
+      this.addText(text, startIndex);
     }
 
     return true;
@@ -228,6 +247,7 @@ export class Parser {
    */
   consumeCharData(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
     let charData = scanner.consumeUntilMatch(syntax.endCharData);
 
     if (!charData) {
@@ -240,7 +260,7 @@ export class Parser {
       throw this.error('Element content may not contain the CDATA section close delimiter `]]>`');
     }
 
-    this.addText(charData);
+    this.addText(charData, startIndex);
     return true;
   }
 
@@ -252,6 +272,7 @@ export class Parser {
    */
   consumeComment(): boolean {
     let { scanner } = this;
+    let startIndex = scanner.charIndex;
 
     if (!scanner.consumeStringFast('<!--')) {
       return false;
@@ -269,7 +290,7 @@ export class Parser {
     }
 
     if (this.options.preserveComments) {
-      this.addNode(new XmlComment(content.trim()));
+      this.addNode(new XmlComment(normalizeLineBreaks(content.trim())), startIndex);
     }
 
     return true;
@@ -285,10 +306,11 @@ export class Parser {
    * @see https://www.w3.org/TR/2008/REC-xml-20081126/#entproc
    */
   consumeContentReference(): boolean {
+    let startIndex = this.scanner.charIndex;
     let ref = this.consumeReference();
 
     if (ref) {
-      this.addText(ref);
+      this.addText(ref, startIndex);
       return true;
     }
 
@@ -334,7 +356,7 @@ export class Parser {
    */
   consumeElement(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
 
     if (!scanner.consumeStringFast('<')) {
       return false;
@@ -343,7 +365,7 @@ export class Parser {
     let name = this.consumeName();
 
     if (!name) {
-      scanner.reset(mark);
+      scanner.reset(startIndex);
       return false;
     }
 
@@ -390,7 +412,7 @@ export class Parser {
       this.currentNode = element.parent;
     }
 
-    this.addNode(element);
+    this.addNode(element, startIndex);
     return true;
   }
 
@@ -443,7 +465,7 @@ export class Parser {
    */
   consumeProcessingInstruction(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
 
     if (!scanner.consumeStringFast('<?')) {
       return false;
@@ -453,7 +475,7 @@ export class Parser {
 
     if (name) {
       if (name.toLowerCase() === 'xml') {
-        scanner.reset(mark);
+        scanner.reset(startIndex);
         throw this.error("XML declaration isn't allowed here");
       }
     } else {
@@ -462,7 +484,7 @@ export class Parser {
 
     if (!this.consumeWhitespace()) {
       if (scanner.consumeStringFast('?>')) {
-        this.addNode(new XmlProcessingInstruction(name));
+        this.addNode(new XmlProcessingInstruction(name), startIndex);
         return true;
       }
 
@@ -476,7 +498,7 @@ export class Parser {
       throw this.error('Unterminated processing instruction');
     }
 
-    this.addNode(new XmlProcessingInstruction(name, content));
+    this.addNode(new XmlProcessingInstruction(name, normalizeLineBreaks(content)), startIndex);
     return true;
   }
 
@@ -488,7 +510,7 @@ export class Parser {
    */
   consumeProlog(): boolean {
     let { scanner } = this;
-    let mark = scanner.charIndex;
+    let startIndex = scanner.charIndex;
 
     this.consumeXmlDeclaration();
 
@@ -498,7 +520,7 @@ export class Parser {
       while (this.consumeMisc()) {} // eslint-disable-line no-empty
     }
 
-    return mark < scanner.charIndex;
+    return startIndex < scanner.charIndex;
   }
 
   /**
@@ -761,15 +783,19 @@ export class Parser {
 // -- Private Functions --------------------------------------------------------
 
 /**
- * Normalizes the given XML string by stripping a byte order mark (if present)
- * and replacing CRLF sequences and lone CR characters with LF characters.
+ * Normalizes line breaks in the given text by replacing CRLF sequences and lone
+ * CR characters with LF characters.
  */
-function normalizeXmlString(xml: string): string {
-  if (xml[0] === '\uFEFF') {
-    xml = xml.slice(1);
+function normalizeLineBreaks(text: string): string {
+  // Fast path: text without any CR needs no work, so return it untouched.
+  // Otherwise a single `replace()` pass rewrites every CRLF pair and every
+  // lone CR to LF in linear time, instead of re-slicing the entire string
+  // once per CR found.
+  if (text.indexOf('\r') === -1) {
+    return text;
   }
 
-  return xml.replace(/\r\n?/g, '\n');
+  return text.replace(/\r\n?/g, '\n');
 }
 
 // -- Types --------------------------------------------------------------------
@@ -782,6 +808,14 @@ export type ParserOptions = {
    */
   ignoreUndefinedEntities?: boolean;
 
+  /**
+   * When `true`, each node's `start` and `end` properties hold its byte offsets
+   * in the input string, i.e. JavaScript string indexes (`end` is exclusive).
+   *
+   * @default false
+   */
+  includeOffsets?: boolean;
+
   /**
    * When `true`, CDATA sections will be preserved in the document as `XmlCdata`
    * nodes. Otherwise CDATA sections will be represented as `XmlText` nodes,

diff --git a/src/lib/StringScanner.ts b/src/lib/StringScanner.ts
@@ -42,16 +42,6 @@ export class StringScanner {
 
   // -- Protected Methods ------------------------------------------------------
 
-  /**
-   * Returns the byte index of the given character index in the string. The two
-   * may differ in strings that contain multibyte characters.
-   */
-  protected charIndexToByteIndex(charIndex: number = this.charIndex): number {
-    return this.multiByteMode
-      ? (this.charsToBytes as number[])[charIndex] ?? Infinity
-      : charIndex;
-  }
-
   /**
    * Returns the number of characters in the given string, which may differ from
    * the byte length if the string contains multibyte characters.
@@ -75,6 +65,16 @@ export class StringScanner {
     this.charIndex = Math.min(this.charCount, this.charIndex + count);
   }
 
+  /**
+   * Maps a character index (the scanner's current one by default) to its byte
+   * index in the string; in multibyte mode an unmapped index yields `Infinity`.
+   */
+  charIndexToByteIndex(charIndex: number = this.charIndex): number {
+    if (!this.multiByteMode) { return charIndex; }
+
+    return (this.charsToBytes as number[])[charIndex] ?? Infinity;
+  }
+
   /**
    * Consumes and returns the given number of characters if possible, advancing
    * the scanner and stopping if the end of the string is reached.

diff --git a/src/lib/XmlNode.ts b/src/lib/XmlNode.ts
@@ -49,6 +49,12 @@ export class XmlNode {
     return this.parent?.document ?? null;
   }
 
+  /**
+   * Ending byte offset of this node in the original XML string, or `-1` if the
+   * offset is unknown.
+   */
+  end = -1;
+
   /**
    * Whether this node is the root node of the document.
    */
@@ -71,6 +77,12 @@ export class XmlNode {
     return Boolean(this.parent?.preserveWhitespace);
   }
 
+  /**
+   * Starting byte offset of this node in the original XML string, or `-1` if
+   * the offset is unknown.
+   */
+  start = -1;
+
   /**
    * Type of this node.
    *
@@ -102,6 +114,11 @@ export class XmlNode {
       json.preserveWhitespace = true;
     }
 
+    if (this.start !== -1) {
+      json.start = this.start;
+      json.end = this.end;
+    }
+
     return json;
   }
 }
diff --git a/src/lib/syntax.ts b/src/lib/syntax.ts
@@ -20,7 +20,7 @@ export const attValueCharSingleQuote = /[^'&<]+/y;
  *
  * @see https://www.w3.org/TR/2008/REC-xml-20081126/#AVNormalize
  */
-export const attValueNormalizedWhitespace = /[\t\n]/g;
+export const attValueNormalizedWhitespace = /\r\n|[\n\r\t]/g;
 
 /**
  * Regular expression that matches one or more characters that signal the end of

diff --git a/tests/lib/Parser.test.js b/tests/lib/Parser.test.js
@@ -40,6 +40,11 @@ describe('Parser', () => {
     assert.strictEqual(doc.root.name, 'root');
   });
 
+  it('normalizes whitespace in attribute values', () => {
+    let { root } = parseXml('<root attr=" one two\tthree\nfour\r\nfive\rsix " />');
+    assert.strictEqual(root.attributes.attr, ' one two three four five six '); // tab, LF, CRLF and lone CR each become one space
+  });
+
   describe('when `options.ignoreUndefinedEntities` is `true`', () => {
     beforeEach(() => {
       options.ignoreUndefinedEntities = true;
@@ -52,6 +57,28 @@ describe('Parser', () => {
     });
   });
 
+  describe('when `options.includeOffsets` is `true`', () => {
+    it('the start offset is a byte offset, not a character offset', () => {
+      let { root } = parseXml('<root><cat>🐈</cat><dog>🐕</dog></root>', { includeOffsets: true });
+      assert.strictEqual(root.children[1].start, 19); // 6 (<root>) + 5 (<cat>) + 2 (🐈) + 6 (</cat>)
+    });
+
+    it('the end offset is a byte offset, not a character offset', () => {
+      let { root } = parseXml('<root><cat>🐈</cat><dog>🐕</dog></root>', { includeOffsets: true });
+      assert.strictEqual(root.children[1].end, 32); // 19 + 5 (<dog>) + 2 (🐕) + 6 (</dog>); the end is exclusive
+    });
+
+    it('a byte order mark character is counted in the offset', () => {
+      let { root } = parseXml('\uFEFF<root>foo</root>', { includeOffsets: true });
+      assert.strictEqual(root.children[0].start, 7); // 1 (BOM) + 6 (<root>)
+    });
+    // NOTE(review): a lone CR is replaced 1:1, so this input cannot expose an offset shift; a CRLF would — confirm intent.
+    it('a carriage return character is not counted in the offset', () => {
+      let { root } = parseXml('<root>\rfoo</root>', { includeOffsets: true });
+      assert.strictEqual(root.children[0].start, 6); // 6 (<root>); the text node begins at the CR itself
+    });
+  });
+
   describe('when `options.resolveUndefinedEntity` is set', () => {
     beforeEach(() => {
       options.resolveUndefinedEntity = (ref) => {