Skip to content

Commit

Permalink
JNI JSON read with DataSource and inferred schema, along with basic ja…
Browse files Browse the repository at this point in the history
…va nested Schema JSON reads (#14954)

This adds in support for some more JSON reading functionality. It allows us to infer the JSON schema using a DataSource as the input. It also adds in support for using a nested Schema when parsing JSON.

Authors:
  - Robert (Bobby) Evans (https://github.com/revans2)

Approvers:
  - Jason Lowe (https://github.com/jlowe)

URL: #14954
  • Loading branch information
revans2 authored Feb 8, 2024
1 parent 8503b31 commit 306c47c
Show file tree
Hide file tree
Showing 5 changed files with 845 additions and 147 deletions.
269 changes: 238 additions & 31 deletions java/src/main/java/ai/rapids/cudf/Schema.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -26,78 +26,285 @@
*/
public class Schema {
public static final Schema INFERRED = new Schema();
private final List<String> names;
private final List<DType> types;

private Schema(List<String> names, List<DType> types) {
this.names = new ArrayList<>(names);
this.types = new ArrayList<>(types);
private final DType topLevelType;
private final List<String> childNames;
private final List<Schema> childSchemas;
private boolean flattened = false;
private String[] flattenedNames;
private DType[] flattenedTypes;
private int[] flattenedCounts;

/**
 * Creates a schema node.
 * The incoming lists are copied so that later changes made by the caller (for example a
 * Builder that keeps adding columns after build() has been called) cannot leave the names
 * and the child schemas of this instance out of step with each other.
 * @param topLevelType the type of this node.
 * @param childNames the names of the direct children, or null if there are none.
 * @param childSchemas the schemas of the direct children, or null if there are none.
 */
private Schema(DType topLevelType,
    List<String> childNames,
    List<Schema> childSchemas) {
  this.topLevelType = topLevelType;
  // null is preserved on purpose: the rest of the class uses null to mean "no children".
  this.childNames = childNames == null ? null : new ArrayList<>(childNames);
  this.childSchemas = childSchemas == null ? null : new ArrayList<>(childSchemas);
}

/**
 * Creates the inferred schema, which records no top level type and no children: every
 * field assigned here is set to null.
 * NOTE(review): the names/types assignments appear to be the removed side of the diff and
 * the topLevelType/childNames/childSchemas assignments the added side - confirm against
 * the final file.
 */
private Schema() {
names = null;
types = null;
topLevelType = null;
childNames = null;
childSchemas = null;
}

/**
 * Look up the schema of one of the direct children. A schema without a child list, such
 * as the inferred schema, has no children.
 * @param i the index of the child to read.
 * @return the Schema stored for that child.
 * @throws IndexOutOfBoundsException if the index is not in the range of children.
 */
public Schema getChild(int i) {
  if (childSchemas != null) {
    return childSchemas.get(i);
  }
  throw new IndexOutOfBoundsException("There are 0 children in this schema");
}

/**
 * Renders the type of this schema followed, for a STRUCT, by its "name: child" pairs
 * inside braces or, for a LIST, by its children inside square brackets. Any other type is
 * rendered as the type alone.
 */
@Override
public String toString() {
  StringBuilder out = new StringBuilder();
  out.append(topLevelType);
  final boolean isStruct = topLevelType == DType.STRUCT;
  if (!isStruct && topLevelType != DType.LIST) {
    // Not a nested type, so there is nothing more to print.
    return out.toString();
  }
  out.append(isStruct ? "{" : "[");
  if (childNames != null) {
    final int numChildren = childNames.size();
    for (int idx = 0; idx < numChildren; idx++) {
      if (idx > 0) {
        out.append(", ");
      }
      if (isStruct) {
        // Only struct children are printed with their names.
        out.append(childNames.get(idx));
        out.append(": ");
      }
      out.append(childSchemas.get(idx));
    }
  }
  out.append(isStruct ? "}" : "]");
  return out.toString();
}

/**
 * Computes the flattened names, types and child counts the first time it is called and
 * remembers that this was done. When the schema has no descendants the three flattened
 * arrays are left as null.
 */
private void flattenIfNeeded() {
  if (flattened) {
    return;
  }
  final int total = flattenedLength(0);
  if (total != 0) {
    String[] flatNames = new String[total];
    DType[] flatTypes = new DType[total];
    int[] flatCounts = new int[total];
    collectFlattened(flatNames, flatTypes, flatCounts, 0);
    flattenedNames = flatNames;
    flattenedTypes = flatTypes;
    flattenedCounts = flatCounts;
  } else {
    flattenedNames = null;
    flattenedTypes = null;
    flattenedCounts = null;
  }
  flattened = true;
}

/**
 * Counts every descendant of this schema on top of a running total.
 * @param startingLength the number of entries counted so far.
 * @return startingLength plus one for each child and for each of the child's descendants.
 */
private int flattenedLength(int startingLength) {
  if (childSchemas == null) {
    return startingLength;
  }
  int total = startingLength;
  for (Schema child : childSchemas) {
    // One entry for the child itself, then whatever its own descendants add.
    total = child.flattenedLength(total + 1);
  }
  return total;
}

/**
 * Writes one entry per descendant into the output arrays, each child being written before
 * its own descendants.
 * @param names receives the name of each descendant.
 * @param types receives the top level type of each descendant.
 * @param counts receives the number of child names of each descendant, 0 if it has none.
 * @param offset the index of the first free slot in the output arrays.
 * @return the index of the first free slot after everything was written.
 */
private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) {
  if (childSchemas == null) {
    return offset;
  }
  int next = offset;
  for (int idx = 0; idx < childSchemas.size(); idx++) {
    Schema child = childSchemas.get(idx);
    names[next] = childNames.get(idx);
    types[next] = child.topLevelType;
    counts[next] = (child.childNames == null) ? 0 : child.childNames.size();
    // The child's descendants start in the slot right after the child itself.
    next = child.collectFlattened(names, types, counts, next + 1);
  }
  return next;
}

/**
 * Starts building a new schema.
 * NOTE(review): the two return statements appear to be the removed and the added side of
 * the diff; the second one creates the builder with DType.STRUCT as its top level type.
 */
public static Builder builder() {
return new Builder();
return new Builder(DType.STRUCT);
}

/**
 * Get the names of every descendant column, each child being listed before its own
 * descendants.
 * @return a new array holding the flattened names, or null if this schema has no
 * children.
 */
public String[] getFlattenedColumnNames() {
  flattenIfNeeded();
  // Hand out a copy so that callers cannot modify the cached array used by later calls.
  return flattenedNames == null ? null : flattenedNames.clone();
}

/**
 * Get the column names as a newly allocated array, or null when no list of names is held.
 * NOTE(review): the lines using names appear to be the removed side of the diff and the
 * lines using childNames the added side.
 */
public String[] getColumnNames() {
if (names == null) {
if (childNames == null) {
return null;
}
return names.toArray(new String[names.size()]);
return childNames.toArray(new String[childNames.size()]);
}

/**
 * Check whether this schema has at least one child.
 * @return true if there is a child list and it is not empty, else false.
 */
public boolean isNested() {
  if (childSchemas == null) {
    return false;
  }
  return !childSchemas.isEmpty();
}

/**
 * Check whether any direct child of this schema is itself nested. This is meant for a top
 * level struct schema, which is nested by definition, where readers such as CSV care that
 * none of its children have children of their own.
 * @return true if at least one direct child is nested, else false.
 */
public boolean hasNestedChildren() {
  return childSchemas != null && childSchemas.stream().anyMatch(Schema::isNested);
}

/**
 * Get the native type id of each column as a new int array, or null when there are no
 * types to report.
 * NOTE(review): this span appears to interleave the removed getTypeIds, which reads the
 * types list, with the added getFlattenedTypeIds, which flattens the schema first and
 * reads flattenedTypes.
 */
int[] getTypeIds() {
if (types == null) {
int[] getFlattenedTypeIds() {
flattenIfNeeded();
if (flattenedTypes == null) {
return null;
}
int[] ret = new int[types.size()];
for (int i = 0; i < types.size(); i++) {
ret[i] = types.get(i).getTypeId().nativeId;
int[] ret = new int[flattenedTypes.length];
for (int i = 0; i < flattenedTypes.length; i++) {
ret[i] = flattenedTypes[i].getTypeId().nativeId;
}
return ret;
}

/**
 * Get the scale of each column's type as a new int array, or null when there are no types
 * to report.
 * NOTE(review): this span appears to interleave the removed getTypeScales, which reads the
 * types list, with the added getFlattenedTypeScales, which flattens the schema first and
 * reads flattenedTypes.
 */
int[] getTypeScales() {
if (types == null) {
int[] getFlattenedTypeScales() {
flattenIfNeeded();
if (flattenedTypes == null) {
return null;
}
int[] ret = new int[types.size()];
for (int i = 0; i < types.size(); i++) {
ret[i] = types.get(i).getScale();
int[] ret = new int[flattenedTypes.length];
for (int i = 0; i < flattenedTypes.length; i++) {
ret[i] = flattenedTypes[i].getScale();
}
return ret;
}

/**
 * Get the flattened types, flattening the schema first if that has not been done yet.
 * The cached flattenedTypes array itself is returned.
 * NOTE(review): the getTypes header and its null check appear to be the removed side of
 * the diff; getFlattenedTypes is the added method.
 */
DType[] getTypes() {
if (types == null) {
DType[] getFlattenedTypes() {
flattenIfNeeded();
return flattenedTypes;
}

/**
 * Get the top level type of each direct child as a new array, or null when there is no
 * child list.
 * NOTE(review): the three lines that read the types list appear to be the removed side of
 * the diff; the lines that read childSchemas are the added side.
 */
public DType[] getChildTypes() {
if (childSchemas == null) {
return null;
}
DType[] ret = new DType[types.size()];
for (int i = 0; i < types.size(); i++) {
ret[i] = types.get(i);
DType[] ret = new DType[childSchemas.size()];
for (int i = 0; i < ret.length; i++) {
ret[i] = childSchemas.get(i).topLevelType;
}
return ret;
}

/**
 * Get, for every entry of the flattened schema, the number of child names recorded for it.
 * @return the cached counts, or null if this schema has no children.
 */
int[] getFlattenedNumChildren() {
  this.flattenIfNeeded();
  final int[] counts = this.flattenedCounts;
  return counts;
}

/**
 * Get the type at the top level of this schema.
 * @return the top level type, which is null for the inferred schema.
 */
public DType getType() {
  return this.topLevelType;
}

/**
 * Check to see if the schema includes a struct at all. Only LIST children are searched,
 * because a STRUCT answers true immediately and other types are not searched.
 * @return true if this or any one of its descendants contains a struct, else false.
 */
public boolean isStructOrHasStructDescendant() {
  if (DType.STRUCT == topLevelType) {
    return true;
  }
  if (DType.LIST != topLevelType) {
    return false;
  }
  for (Schema child : childSchemas) {
    if (child.isStructOrHasStructDescendant()) {
      return true;
    }
  }
  return false;
}

public static class Builder {
private final List<String> names = new ArrayList<>();
private final List<DType> types = new ArrayList<>();
private final DType topLevelType;
private final List<String> names;
private final List<Builder> types;

public Builder column(DType type, String name) {
types.add(type);
/**
 * Creates a builder for a column of the given type. Only STRUCT and LIST builders get
 * lists to hold children; for every other type both lists stay null.
 * @param topLevelType the type of the column being built.
 */
private Builder(DType topLevelType) {
  this.topLevelType = topLevelType;
  if (topLevelType != DType.STRUCT && topLevelType != DType.LIST) {
    // This type cannot have children
    names = null;
    types = null;
  } else {
    names = new ArrayList<>();
    types = new ArrayList<>();
  }
}

/**
 * Add a new child column to the column being built.
 * @param type the type of column to add
 * @param name the name of the column to add (Ignored for list types)
 * @return the builder for the newly added column, which is mainly useful when the type
 * passed in is a LIST or a STRUCT and needs children of its own.
 * @throws IllegalStateException if this builder's type cannot have children, if it is a
 * LIST that already has a child, or if the name was already added.
 */
public Builder addColumn(DType type, String name) {
  if (names == null) {
    throw new IllegalStateException("A column of type " + topLevelType +
        " cannot have children");
  }
  final boolean listAlreadyHasChild = topLevelType == DType.LIST && !names.isEmpty();
  if (listAlreadyHasChild) {
    throw new IllegalStateException("A LIST column can only have one child");
  }
  final boolean duplicateName = names.contains(name);
  if (duplicateName) {
    throw new IllegalStateException("Cannot add duplicate names to a schema");
  }
  final Builder child = new Builder(type);
  types.add(child);
  names.add(name);
  return child;
}

/**
 * Adds a single column to the schema being built and hands back this builder, not the
 * builder of the new column. Prefer addColumn when the new column needs children, because
 * it returns the child's builder.
 * @param type the type of the column.
 * @param name the name of the column.
 * @return this for chaining.
 */
public Builder column(DType type, String name) {
  this.addColumn(type, name);
  return this;
}

/**
 * Builds the Schema described by this builder. Each child builder is built into its own
 * Schema first; when this builder holds no child list the children passed on are null.
 * NOTE(review): the first return statement appears to be the removed side of the diff;
 * the statements after it are the added implementation.
 * @return the new Schema.
 */
public Schema build() {
return new Schema(names, types);
List<Schema> children = null;
if (types != null) {
children = new ArrayList<>(types.size());
for (Builder b: types) {
children.add(b.build());
}
}
return new Schema(topLevelType, names, children);
}
}
}
Loading

0 comments on commit 306c47c

Please sign in to comment.