Skip to content

Commit

Permalink
Fix whitespace handling for various parsers. This fixes #128.
Browse files Browse the repository at this point in the history
  • Loading branch information
pdvrieze committed Mar 25, 2023
1 parent 6293ef3 commit 1ed0cd6
Show file tree
Hide file tree
Showing 11 changed files with 93 additions and 45 deletions.
1 change: 1 addition & 0 deletions Changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ Fixes:
- Support document fragments in DomReader
- Make the StAXReader not skip the StartDocument event initially.
- Make XmlBufferedReader.nextTagEvent process/ignore StartDocument.
- Make ignorable whitespace handling more consistent across parsers. #128

# 0.85.0 – Tying things up
*(Feb 19, 2023)<br />*
Expand Down
35 changes: 22 additions & 13 deletions core/src/commonMain/kotlin/nl/adaptivity/xmlutil/DomReader.kt
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@

package nl.adaptivity.xmlutil

import nl.adaptivity.xmlutil.core.impl.isXmlWhitespace
import nl.adaptivity.xmlutil.dom.*
import nl.adaptivity.xmlutil.util.*

Expand Down Expand Up @@ -58,6 +59,7 @@ public class DomReader(public val delegate: Node) : XmlReader {
NodeConsts.TEXT_NODE,
NodeConsts.PROCESSING_INSTRUCTION_NODE,
NodeConsts.CDATA_SECTION_NODE -> (current as CharacterData).data

else -> throw XmlException("Node is not a text node")
}

Expand All @@ -67,7 +69,7 @@ public class DomReader(public val delegate: Node) : XmlReader {
override val eventType: EventType
get() = when (val c = current) {
null -> EventType.END_DOCUMENT
else -> c.nodeType.toEventType(atEndOfElement)
else -> c.toEventType(atEndOfElement)
}

private var _namespaceAttrs: List<Attr>? = null
Expand Down Expand Up @@ -203,30 +205,33 @@ public class DomReader(public val delegate: Node) : XmlReader {
// This falls back all the way to the bottom to return the current event type (starting the sibling)
} else { // no more siblings, go back to parent
current = c.parentNode
return current?.nodeType?.toEventType(true) ?: EventType.END_DOCUMENT
return current?.toEventType(true) ?: EventType.END_DOCUMENT
}
}

c.firstChild != null -> { // If we have a child, the next element is the first child
current = c.firstChild
}

else -> {
// We have no children, but we have a sibling. We are at the end of this element, next we will return
// the sibling, or close the parent if there is no sibling
atEndOfElement = true
return EventType.END_ELEMENT
}
/*
else -> {
atEndOfElement = true // We are the last item in the parent, so the parent needs to be end of an element as well
return EventType.END_ELEMENT
}
*/
/*
else -> {
atEndOfElement = true // We are the last item in the parent, so the parent needs to be end of an element as well
return EventType.END_ELEMENT
}
*/
}
val nodeType = current!!.nodeType
val c = current!!
val nodeType = c.nodeType
if (nodeType != NodeConsts.ELEMENT_NODE && nodeType != NodeConsts.DOCUMENT_NODE) {
atEndOfElement = true // No child elements for things like text
}
return nodeType.toEventType(atEndOfElement)
return c.toEventType(atEndOfElement)
}
}

Expand Down Expand Up @@ -268,9 +273,9 @@ public class DomReader(public val delegate: Node) : XmlReader {
}


private fun Short.toEventType(endOfElement: Boolean): EventType {
private fun Node.toEventType(endOfElement: Boolean): EventType {
@Suppress("DEPRECATION")
return when (this) {
return when (nodeType) {
NodeConsts.ATTRIBUTE_NODE -> EventType.ATTRIBUTE
NodeConsts.CDATA_SECTION_NODE -> EventType.CDSECT
NodeConsts.COMMENT_NODE -> EventType.COMMENT
Expand All @@ -280,7 +285,11 @@ private fun Short.toEventType(endOfElement: Boolean): EventType {
NodeConsts.DOCUMENT_NODE -> if (endOfElement) EventType.END_DOCUMENT else EventType.START_DOCUMENT
// Node.DOCUMENT_NODE -> EventType.END_DOCUMENT
NodeConsts.PROCESSING_INSTRUCTION_NODE -> EventType.PROCESSING_INSTRUCTION
NodeConsts.TEXT_NODE -> EventType.TEXT
NodeConsts.TEXT_NODE -> when {
textContent!!.isXmlWhitespace() -> EventType.IGNORABLE_WHITESPACE
else -> EventType.TEXT
}

NodeConsts.ELEMENT_NODE -> if (endOfElement) EventType.END_ELEMENT else EventType.START_ELEMENT
// Node.ELEMENT_NODE -> EventType.END_ELEMENT
else -> throw XmlException("Unsupported event type ($this)")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ public enum class EventType {
},
IGNORABLE_WHITESPACE {
override val isIgnorable: Boolean get() = true
override val isTextElement: Boolean get() = true

override fun createEvent(reader: XmlReader): TextEvent = reader.run {
TextEvent(locationInfo, IGNORABLE_WHITESPACE, text)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,8 +23,10 @@ package nl.adaptivity.xmlutil.core
import nl.adaptivity.xmlutil.*
import nl.adaptivity.xmlutil.EventType.*
import nl.adaptivity.xmlutil.core.impl.NamespaceHolder
import nl.adaptivity.xmlutil.core.impl.isXmlWhitespace
import nl.adaptivity.xmlutil.core.impl.multiplatform.Reader
import kotlin.jvm.JvmInline
import kotlin.jvm.JvmStatic

@ExperimentalXmlUtilApi
public class KtXmlReader internal constructor(
Expand Down Expand Up @@ -273,12 +275,14 @@ public class KtXmlReader internal constructor(
END_DOCUMENT -> return
TEXT -> {
pushText('<'.code, !token)
if (isWhitespace) _eventType = IGNORABLE_WHITESPACE
/*
if (depth == 0) {
if (isWhitespace) _eventType = IGNORABLE_WHITESPACE
// make exception switchable for instances.chg... !!!!
// else
// exception ("text '"+getText ()+"' not allowed outside root element");
}
*/
return
}
else -> {
Expand Down Expand Up @@ -453,7 +457,7 @@ public class KtXmlReader internal constructor(
}

private fun push(c: Int) {
isWhitespace = isWhitespace and (c <= ' '.code)
isWhitespace = isWhitespace and c.isXmlWhitespace()
if (txtBufPos + 1 >= txtBuf.size) { // +1 to have enough space for 2 surrogates, if needed
txtBuf = txtBuf.copyOf(txtBufPos * 4 / 3 + 4)
}
Expand Down Expand Up @@ -600,7 +604,7 @@ public class KtXmlReader internal constructor(
var next = peek(0)
var cbrCount = 0
while (next != -1 && next != delimiter) { // covers eof, '<', '"'
if (delimiter == ' '.code) if (next <= ' '.code || next == '>'.code) break
if (delimiter == ' '.code) if (next.isXmlWhitespace() || next == '>'.code) break
if (next == '&'.code) {
if (!resolveEntities) break
pushEntity()
Expand Down Expand Up @@ -965,4 +969,6 @@ public class KtXmlReader internal constructor(
set(value) {
attributes.data[index * 4 + 3] = value
}


}
Original file line number Diff line number Diff line change
Expand Up @@ -22,15 +22,6 @@ package nl.adaptivity.xmlutil.core.impl

import nl.adaptivity.xmlutil.*

internal val CharSequence.isIgnorableWhitespace: Boolean
get() = all(Char::isIgnorableWhitespace)

internal val Char.isIgnorableWhitespace: Boolean
get() = when (this) {
' ', '\t', '\r', '\n' -> true
else -> false
}

/**
* Base class for platform xml writers. It contains common code. */
@XmlUtilInternal
Expand Down Expand Up @@ -73,7 +64,7 @@ public abstract class PlatformXmlWriterBase(indentSequence: Iterable<XmlEvent.Te
fun sbToTextEvent() {
if (sb.isNotEmpty()) {
val text = sb.toString()
if (!text.isIgnorableWhitespace) {
if (!text.isXmlWhitespace()) {
throw XmlException("Indents can only be whitespace or comments: ${text}")
}
result.add(XmlEvent.TextEvent(null, EventType.IGNORABLE_WHITESPACE, text))
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
/*
* Copyright (c) 2023.
*
* This file is part of xmlutil.
*
* This file is licenced to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You should have received a copy of the license with the source distribution.
* Alternatively, you may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package nl.adaptivity.xmlutil.core.impl


/**
 * Lookup table, indexed by character code, marking the four XML whitespace characters:
 * tab, line feed, carriage return and space. It spans codes 0 through 32 (' ') inclusive;
 * any code outside that range is not whitespace.
 */
private val WHITESPACE = BooleanArray(33).also {
    it['\t'.code] = true
    it['\n'.code] = true
    it['\r'.code] = true
    it[' '.code] = true
}

/** Returns `true` when this character is a tab, line feed, carriage return or space. */
internal fun Char.isXmlWhitespace(): Boolean =
    code.isXmlWhitespace()

/**
 * Returns `true` when every character in this sequence is XML whitespace.
 * An empty sequence therefore also yields `true`.
 */
internal fun CharSequence.isXmlWhitespace(): Boolean = all { it.code.isXmlWhitespace() }

/**
 * Returns `true` when this character code is a tab, line feed, carriage return or space.
 *
 * Codes outside the lookup table are never whitespace. That includes negative values
 * (for example a -1 end-of-input marker), which are rejected here rather than being used
 * as an array index.
 */
internal fun Int.isXmlWhitespace(): Boolean =
    this in WHITESPACE.indices && WHITESPACE[this]
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,6 @@ class TestBufferedXmlReader : TestCommonReader() {
testReadUnknownEntity { XmlBufferedReader(KtXmlReader(StringReader(it))) }
}

@Ignore
@Test
fun testIgnorableWhitespace() {
testIgnorableWhitespace(::createReader)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

package nl.adaptivity.xmlutil

import nl.adaptivity.xmlutil.core.KtXmlReader
import nl.adaptivity.xmlutil.core.impl.multiplatform.use
import kotlin.test.assertEquals
import kotlin.test.assertFalse
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,10 +20,7 @@

package nl.adaptivity.xmlutil

import nl.adaptivity.xmlutil.core.impl.multiplatform.use
import kotlin.test.Ignore
import kotlin.test.Test
import kotlin.test.assertEquals

class TestKtXmlReader : TestCommonReader() {

Expand Down Expand Up @@ -57,7 +54,6 @@ class TestKtXmlReader : TestCommonReader() {
testReadUnknownEntity(XmlStreaming::newGenericReader)
}

@Ignore
@Test
fun testIgnorableWhitespace() {
testIgnorableWhitespace(XmlStreaming::newGenericReader)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@

package nl.adaptivity.xmlutil

import kotlin.test.Ignore
import kotlin.test.Test

class TestXmlReader : TestCommonReader() {
Expand Down Expand Up @@ -51,7 +50,6 @@ class TestXmlReader : TestCommonReader() {
testReadEntity(XmlStreaming::newReader)
}

@Ignore
@Test
fun testIgnorableWhitespace() {
testIgnorableWhitespace(XmlStreaming::newReader)
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
package nl.adaptivity.xmlutil.core

import nl.adaptivity.xmlutil.XmlException
import nl.adaptivity.xmlutil.core.impl.isIgnorableWhitespace
import nl.adaptivity.xmlutil.core.impl.isXmlWhitespace
import java.io.InputStream
import java.io.BufferedInputStream
import java.io.InputStreamReader
Expand Down Expand Up @@ -51,27 +51,33 @@ public fun KtXmlReader(inputStream: InputStream, encoding: String?, relaxed: Boo
0x00000FEFF -> {
enc = "UTF-32BE"
}

-0x20000 -> {
enc = "UTF-32LE"
}

0x03c -> {
enc = "UTF-32BE"
srcBuf[0] = '<'
}

0x03c000000 -> {
enc = "UTF-32LE"
srcBuf[0] = '<'
}

0x0003c003f -> {
enc = "UTF-16BE"
srcBuf[0] = '<'
srcBuf[1] = '?'
}

0x03c003f00 -> {
enc = "UTF-16LE"
srcBuf[0] = '<'
srcBuf[1] = '?'
}

0x03c3f786d -> {
while (true) {
val i: Int = bufferedInput.read()
Expand All @@ -81,21 +87,23 @@ public fun KtXmlReader(inputStream: InputStream, encoding: String?, relaxed: Boo
val xmlDeclContent = String(srcBuf, 0, srcBufCount)
var encAttrOffset = -1
do {
encAttrOffset = xmlDeclContent.indexOf("encoding", encAttrOffset +1)
encAttrOffset = xmlDeclContent.indexOf("encoding", encAttrOffset + 1)
// TODO handle xml 1.1 whitespace
} while (!(encAttrOffset == 0 || xmlDeclContent[encAttrOffset - 1].isIgnorableWhitespace))
} while (!(encAttrOffset == 0 || xmlDeclContent[encAttrOffset - 1].isXmlWhitespace()))

if (encAttrOffset >= 0) {
var eqPos = encAttrOffset+8
var eqPos = encAttrOffset + 8
if (relaxed) {
while (eqPos<xmlDeclContent.length && xmlDeclContent[eqPos].isIgnorableWhitespace) { eqPos++ }
while (eqPos < xmlDeclContent.length && xmlDeclContent[eqPos].isXmlWhitespace()) {
eqPos++
}
}
if (eqPos >= xmlDeclContent.length || xmlDeclContent[eqPos] != '=') {
error("Missing equality character in encoding attribute")
}
var openQuotPos=eqPos+1
var openQuotPos = eqPos + 1
if (relaxed) {
while (openQuotPos < xmlDeclContent.length && xmlDeclContent[openQuotPos].isIgnorableWhitespace) {
while (openQuotPos < xmlDeclContent.length && xmlDeclContent[openQuotPos].isXmlWhitespace()) {
openQuotPos++
}

Expand All @@ -105,13 +113,13 @@ public fun KtXmlReader(inputStream: InputStream, encoding: String?, relaxed: Boo
}
val delim = xmlDeclContent[openQuotPos]
if (delim == '"' || delim == '\'') {
var endQuotPos = openQuotPos+1
var endQuotPos = openQuotPos + 1
while (endQuotPos < xmlDeclContent.length && xmlDeclContent[endQuotPos] != delim) {
endQuotPos++
}
if (endQuotPos<xmlDeclContent.length) {
enc = xmlDeclContent.substring(openQuotPos+1, endQuotPos)
} else {
if (endQuotPos < xmlDeclContent.length) {
enc = xmlDeclContent.substring(openQuotPos + 1, endQuotPos)
} else {
error("Missing closing quote in encoding")
}
} else {
Expand All @@ -132,6 +140,7 @@ public fun KtXmlReader(inputStream: InputStream, encoding: String?, relaxed: Boo
srcBuf[0] = srcBuf[3]
}
}

else -> if (chk and -0x10000 == -0x1010000) {
enc = "UTF-16BE"
srcBuf[0] = (srcBuf[2].code shl 8 or srcBuf[3].code).toChar()
Expand Down

0 comments on commit 1ed0cd6

Please sign in to comment.