Skip to content

Commit

Permalink
DRILL-7979: Self-Closing XML Tags Cause Schema Change Exceptions (#2283)
Browse files Browse the repository at this point in the history
* Initial commit

* WIP

* Unit test working but attribute not popping off stack

* Everything working

* Removed Logback.xml

* Fixed Unit Test

* Fixed corrupt file

* Addressed Review Comments
  • Loading branch information
cgivre committed Aug 3, 2021
1 parent e73bf9a commit 129d740
Show file tree
Hide file tree
Showing 5 changed files with 125 additions and 12 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -72,9 +72,10 @@ public class XMLReader {
private InputStream fsStream;
private XMLEventReader reader;
private ImplicitColumns metadata;
private boolean isSelfClosingEvent;

/**
* This field indicates the various states in which the reader operates. The names should be self explanatory,
* This field indicates the various states in which the reader operates. The names should be self-explanatory,
* but they are used as the reader iterates over the XML tags to know what to do.
*/
private enum xmlState {
Expand All @@ -96,6 +97,7 @@ public XMLReader(InputStream fsStream, int dataLevel, int maxRecords) throws XML
nestedMapCollection = new HashMap<>();
this.dataLevel = dataLevel;
this.maxRecords = maxRecords;
isSelfClosingEvent = false;
}

public void open(RowSetLoader rootRowWriter, CustomErrorContext errorContext ) {
Expand Down Expand Up @@ -161,12 +163,18 @@ private boolean processElements() {
try {
nextEvent = reader.nextEvent();

// If the next event is whitespace, newlines, or other cruft that we don't need
// ignore and move to the next event
// If the next event is whitespace, newlines, or other cruft that we don't need,
// ignore the event and move to the next event
if (XMLUtils.isEmptyWhiteSpace(nextEvent)) {
continue;
}

// Reset the self-closing tag flag.
isSelfClosingEvent = isSelfClosingEvent(currentEvent, nextEvent);
if (isSelfClosingEvent) {
logger.debug("Found self closing event!!");
}

// Capture the previous and current event
XMLEvent lastEvent = currentEvent;
currentEvent = nextEvent;
Expand All @@ -184,6 +192,35 @@ private boolean processElements() {
return true;
}

/**
* One of the challenges with XML parsing are self-closing events. The streaming XML parser
* treats self-closing events as two events: a start event and an ending event. The issue is that
* the self-closing events can cause schema issues with Drill specifically, if a self-closing event
* is detected prior to a non-self-closing event, and that populated event contains a map or other nested data
* Drill will throw a schema change exception.
*
* Since Drill uses Java's streaming XML parser, unfortunately, it does not provide a means of identifying
* self-closing tags. This function does that by comparing the event with the previous event and looking for
* a condition where one event is a start and the other is an ending event. Additionally, the column number and
* character offsets must be the same, indicating that the two events are the same.
*
* @param e1 The first XMLEvent
* @param e2 The second XMLEvent
* @return True if the events represent a self-closing event, false if not.
*/
private boolean isSelfClosingEvent(XMLEvent e1, XMLEvent e2) {
// If either event is null return false.
if (e1 == null || e2 == null) {
return false;
} else if (XMLUtils.hasAttributes(e1) || XMLUtils.hasAttributes(e2)) {
return false;
}

return (e1.getLocation().getCharacterOffset() == e2.getLocation().getCharacterOffset()) &&
(e1.getLocation().getColumnNumber() == e2.getLocation().getColumnNumber()) &&
e1.isStartElement() && e2.isEndElement();
}

/**
* This function processes an actual XMLEvent. There are three possibilities:
* 1. The event is a start event
Expand Down Expand Up @@ -233,14 +270,13 @@ private void processEvent(XMLEvent currentEvent,
if (!rowStarted) {
currentTupleWriter = startRow(rootRowWriter);
} else {
if (lastEvent!= null &&
if (lastEvent != null &&
lastEvent.getEventType() == XMLStreamConstants.START_ELEMENT) {
/*
* Check the flag in the next section. If the next element is a character AND the flag is set,
* start a map. If not... ignore it all.
*/
changeState(xmlState.POSSIBLE_MAP);

rowWriterStack.push(currentTupleWriter);
}

Expand Down Expand Up @@ -292,6 +328,13 @@ private void processEvent(XMLEvent currentEvent,
case XMLStreamConstants.END_ELEMENT:
currentNestingLevel--;

if (isSelfClosingEvent) {
logger.debug("Closing self-closing event {}. ", fieldName);
isSelfClosingEvent = false;
attributePrefix = XMLUtils.removeField(attributePrefix,fieldName);
break;
}

if (currentNestingLevel < dataLevel - 1) {
break;
} else if (currentEvent.asEndElement().getName().toString().compareTo(rootDataFieldName) == 0) {
Expand All @@ -316,8 +359,10 @@ private void processEvent(XMLEvent currentEvent,

attributePrefix = XMLUtils.removeField(attributePrefix,fieldName);

} else if (currentState != xmlState.ROW_ENDED){
writeFieldData(fieldName, fieldValue, currentTupleWriter);
} else if (currentState != xmlState.ROW_ENDED) {
if ( !isSelfClosingEvent) {
writeFieldData(fieldName, fieldValue, currentTupleWriter);
}
// Clear out field name and value
attributePrefix = XMLUtils.removeField(attributePrefix, fieldName);

Expand Down Expand Up @@ -438,6 +483,7 @@ private TupleWriter getMapWriter(String mapName, TupleWriter rowWriter) {
// Add map to map collection for future use
nestedMapCollection.put(tempFieldName, new XMLMap(mapName, rowWriter.tuple(index)));
}
logger.debug("Index: {}, Fieldname: {}", index, mapName);
return rowWriter.tuple(index);
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -82,12 +82,25 @@ public static String removeField(String prefix, String fieldName) {
}

int index = prefix.lastIndexOf(fieldName);
if (index == 0) {
if (index <= 0) {
return "";
} else if (index < 0) {
return prefix;
} else {
return prefix.substring(0, index - 1);
}
}

return prefix.substring(0, index-1);
/**
* Returns true if a given XMLEvent has attributes, false if not. Since only
* start elements can by definition have attributes, returns false if the event
* is not a start element.
* @param event The XMLEvent in question
* @return True if the XMLEvent has attributes, false if not.
*/
public static boolean hasAttributes(XMLEvent event) {
if (! event.isStartElement()) {
return false;
} else {
return event.asStartElement().getAttributes().hasNext();
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -543,4 +543,26 @@ public void testLimitPushdown() throws Exception {
.include("Limit", "maxRecords=2")
.match();
}

@Test
public void testMapError() throws Exception {
String sql = "SELECT * FROM cp.`xml/schemaChange.xml`";
RowSet results = client.queryBuilder().sql(sql).rowSet();

TupleMetadata expectedSchema = new SchemaBuilder()
.addMap("attributes")
.resumeSchema()
.addMap("parent")
.addNullable("link", MinorType.VARCHAR)
.addNullable("value", MinorType.VARCHAR)
.resumeSchema()
.build();

RowSet expected = client.rowSetBuilder(expectedSchema)
.addRow(mapArray(), mapArray(null, null))
.addRow(mapArray(), strArray("https://dev57595.service-now.com/api/now/table/task/46eaa7c9a9fe198100bbe282da0d4b7d", "46eaa7c9a9fe198100bbe282da0d4b7d"))
.build();

new RowSetComparison(expected).verifyAndClearAll(results);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ public void testRemoveField() {

// Test with missing field
String test3 = "field_1_field_2_field_3";
assertEquals(XMLUtils.removeField(test3, "field_4"), "field_1_field_2_field_3");
assertEquals(XMLUtils.removeField(test3, "field_4"), "");

// Test with empty string
String test4 = "";
Expand Down
32 changes: 32 additions & 0 deletions contrib/format-xml/src/test/resources/xml/schemaChange.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
<!--
Licensed to the Apache Software Foundation (ASF) under one
or more contributor license agreements. See the NOTICE file
distributed with this work for additional information
regarding copyright ownership. The ASF licenses this file
to you under the Apache License, Version 2.0 (the
"License"); you may not use this file except in compliance
with the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<response>
<result>
<parent/>
<reason/>
</result>
<result>
<parent>
<link>https://dev57595.service-now.com/api/now/table/task/46eaa7c9a9fe198100bbe282da0d4b7d</link>
<value>46eaa7c9a9fe198100bbe282da0d4b7d</value>
</parent>
<reason/>
</result>
</response>

0 comments on commit 129d740

Please sign in to comment.