Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JNI JSON read with DataSource and inferred schema, along with basic java nested Schema JSON reads #14954

Merged
merged 6 commits into from
Feb 8, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
269 changes: 238 additions & 31 deletions java/src/main/java/ai/rapids/cudf/Schema.java
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
/*
*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -26,78 +26,285 @@
*/
public class Schema {
public static final Schema INFERRED = new Schema();
private final List<String> names;
private final List<DType> types;

private Schema(List<String> names, List<DType> types) {
this.names = new ArrayList<>(names);
this.types = new ArrayList<>(types);
private final DType topLevelType;
private final List<String> childNames;
private final List<Schema> childSchemas;
private boolean flattened = false;
private String[] flattenedNames;
private DType[] flattenedTypes;
private int[] flattenedCounts;

/**
 * Create a schema node.
 * <p>
 * The child lists are copied so that the schema does not share state with its creator.
 * Without the copy, a {@code Builder} that keeps adding columns after {@code build()}
 * would grow this schema's name list while its child schema list stays the same size.
 *
 * @param topLevelType the type of this node.
 * @param childNames the names of the children, or null if this node cannot have children.
 * @param childSchemas the schemas of the children, or null if this node cannot have children.
 */
private Schema(DType topLevelType,
               List<String> childNames,
               List<Schema> childSchemas) {
  this.topLevelType = topLevelType;
  // null is kept as null: other methods treat a null list as "no children".
  this.childNames = childNames == null ? null : new ArrayList<>(childNames);
  this.childSchemas = childSchemas == null ? null : new ArrayList<>(childSchemas);
}

/**
* Create the inferred schema. No type, child names, or child schemas are recorded, so
* each of those fields is left null.
*/
private Schema() {
names = null;
types = null;
topLevelType = null;
childNames = null;
childSchemas = null;
}

/**
* Get the schema of a child element. Note that an inferred schema will have no children.
* @param i the index of the child to read.
* @return the new Schema
* @throws IndexOutOfBoundsException if the index is not in the range of children.
*/
public Schema getChild(int i) {
jlowe marked this conversation as resolved.
Show resolved Hide resolved
if (childSchemas == null) {
throw new IndexOutOfBoundsException("There are 0 children in this schema");
}
return childSchemas.get(i);
}

/**
 * Render this schema as text: the top level type, followed by the children in braces
 * ({@code name: child}) for a STRUCT or in square brackets (child only) for a LIST.
 * Other types are rendered as just the type.
 */
@Override
public String toString() {
  StringBuilder out = new StringBuilder();
  out.append(topLevelType);
  boolean isStruct = topLevelType == DType.STRUCT;
  boolean isList = !isStruct && topLevelType == DType.LIST;
  if (isStruct || isList) {
    out.append(isStruct ? "{" : "[");
    int numChildren = (childNames == null) ? 0 : childNames.size();
    for (int idx = 0; idx < numChildren; idx++) {
      if (idx > 0) {
        out.append(", ");
      }
      if (isStruct) {
        // Only struct children are labeled with their name.
        out.append(childNames.get(idx));
        out.append(": ");
      }
      out.append(childSchemas.get(idx));
    }
    out.append(isStruct ? "}" : "]");
  }
  return out.toString();
}

/**
 * Compute and cache the flattened names, types, and child counts the first time they are
 * needed. When the schema has no descendants all three cached arrays are set to null.
 */
private void flattenIfNeeded() {
  if (flattened) {
    return;
  }
  int total = flattenedLength(0);
  if (total != 0) {
    String[] allNames = new String[total];
    DType[] allTypes = new DType[total];
    int[] allCounts = new int[total];
    collectFlattened(allNames, allTypes, allCounts, 0);
    flattenedNames = allNames;
    flattenedTypes = allTypes;
    flattenedCounts = allCounts;
  } else {
    flattenedNames = null;
    flattenedTypes = null;
    flattenedCounts = null;
  }
  flattened = true;
}

/**
 * Count every descendant of this schema (children, their children, and so on) and add the
 * count to a running total.
 * @param startingLength the total accumulated before this schema is visited.
 * @return the starting total plus the number of descendants of this schema.
 */
private int flattenedLength(int startingLength) {
  if (childSchemas == null) {
    return startingLength;
  }
  int total = startingLength;
  for (Schema kid : childSchemas) {
    // One slot for the child itself, then whatever its own descendants need.
    total = kid.flattenedLength(total + 1);
  }
  return total;
}

/**
 * Write the name, type, and direct child count of every descendant into the output arrays.
 * Each child is written before its own descendants.
 * @param names receives the name of each descendant.
 * @param types receives the top level type of each descendant.
 * @param counts receives the number of direct children of each descendant.
 * @param offset the first index to write to.
 * @return the index just past the last entry written.
 */
private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) {
  if (childSchemas == null) {
    return offset;
  }
  int next = offset;
  for (int idx = 0; idx < childSchemas.size(); idx++) {
    Schema kid = childSchemas.get(idx);
    names[next] = childNames.get(idx);
    types[next] = kid.topLevelType;
    counts[next] = (kid.childNames == null) ? 0 : kid.childNames.size();
    // The descendants of this child go directly after the child's own slot.
    next = kid.collectFlattened(names, types, counts, next + 1);
  }
  return next;
}

/**
* Start building a new schema.
* @return a new {@link Builder}.
*/
public static Builder builder() {
return new Builder();
return new Builder(DType.STRUCT);
}

/**
 * Get the names of every descendant column of this schema in flattened order, where each
 * column is followed by its own descendants.
 * @return a newly allocated array of names, or null if the schema has no columns. The
 * array is a copy, so changes to it do not affect this schema.
 */
public String[] getFlattenedColumnNames() {
  flattenIfNeeded();
  // Return a copy so callers cannot alter the cached array, as getColumnNames also does.
  return flattenedNames == null ? null : flattenedNames.clone();
}

/**
* Get the names of the direct children of this schema.
* @return a newly allocated array of the child names, or null if no names are recorded.
*/
public String[] getColumnNames() {
if (names == null) {
if (childNames == null) {
return null;
}
return names.toArray(new String[names.size()]);
return childNames.toArray(new String[childNames.size()]);
}

/**
 * Check if this schema has any children.
 * @return true if there is at least one child schema, else false.
 */
public boolean isNested() {
  return childSchemas != null && !childSchemas.isEmpty();
}

/**
 * Check if any direct child of this schema is itself nested. This is meant for a top
 * level struct schema, which is nested by definition, where formats like CSV need to
 * know that none of the columns has children of its own.
 * @return true if at least one child has children, else false.
 */
public boolean hasNestedChildren() {
  return childSchemas != null && childSchemas.stream().anyMatch(Schema::isNested);
}

/**
* Get the native type id of every entry in the flattened schema.
* @return one native id per flattened entry, or null if there are no flattened entries.
*/
int[] getTypeIds() {
if (types == null) {
int[] getFlattenedTypeIds() {
flattenIfNeeded();
if (flattenedTypes == null) {
return null;
}
int[] ret = new int[types.size()];
for (int i = 0; i < types.size(); i++) {
ret[i] = types.get(i).getTypeId().nativeId;
int[] ret = new int[flattenedTypes.length];
for (int i = 0; i < flattenedTypes.length; i++) {
ret[i] = flattenedTypes[i].getTypeId().nativeId;
}
return ret;
}

/**
* Get the scale of every entry in the flattened schema, as reported by DType.getScale().
* @return one scale per flattened entry, or null if there are no flattened entries.
*/
int[] getTypeScales() {
if (types == null) {
int[] getFlattenedTypeScales() {
flattenIfNeeded();
if (flattenedTypes == null) {
return null;
}
int[] ret = new int[types.size()];
for (int i = 0; i < types.size(); i++) {
ret[i] = types.get(i).getScale();
int[] ret = new int[flattenedTypes.length];
for (int i = 0; i < flattenedTypes.length; i++) {
ret[i] = flattenedTypes[i].getScale();
}
return ret;
}

/**
* Get the type of every entry in the flattened schema.
* @return the cached array of flattened types (not a copy), or null if there are no
* flattened entries.
*/
DType[] getTypes() {
if (types == null) {
DType[] getFlattenedTypes() {
flattenIfNeeded();
return flattenedTypes;
}

/**
* Get the top level type of each direct child of this schema.
* @return a newly allocated array with one type per child, or null if this schema has no
* child list.
*/
public DType[] getChildTypes() {
if (childSchemas == null) {
return null;
}
DType[] ret = new DType[types.size()];
for (int i = 0; i < types.size(); i++) {
ret[i] = types.get(i);
DType[] ret = new DType[childSchemas.size()];
for (int i = 0; i < ret.length; i++) {
ret[i] = childSchemas.get(i).topLevelType;
}
return ret;
}

/**
 * Get the number of direct children of every entry in the flattened schema.
 * @return the cached array of child counts (not a copy), or null if there are no
 * flattened entries.
 */
int[] getFlattenedNumChildren() {
  flattenIfNeeded();
  return this.flattenedCounts;
}

/**
 * Get the type at the top level of this schema.
 * @return the top level type, which is null for the inferred schema.
 */
public DType getType() {
  return this.topLevelType;
}

/**
 * Check to see if the schema includes a struct at all. Only LIST children are searched;
 * any other non-struct type is reported as not containing a struct.
 * @return true if this or any one of its descendants contains a struct, else false.
 */
public boolean isStructOrHasStructDescendant() {
  if (DType.STRUCT == topLevelType) {
    return true;
  }
  if (DType.LIST != topLevelType) {
    return false;
  }
  for (Schema kid : childSchemas) {
    if (kid.isStructOrHasStructDescendant()) {
      return true;
    }
  }
  return false;
}

public static class Builder {
private final List<String> names = new ArrayList<>();
private final List<DType> types = new ArrayList<>();
private final DType topLevelType;
private final List<String> names;
private final List<Builder> types;

public Builder column(DType type, String name) {
types.add(type);
private Builder(DType topLevelType) {
this.topLevelType = topLevelType;
if (topLevelType == DType.STRUCT || topLevelType == DType.LIST) {
// There can be children
names = new ArrayList<>();
types = new ArrayList<>();
} else {
names = null;
types = null;
}
}

/**
 * Add a new child column to the column being built.
 * @param type the type of column to add
 * @param name the name of the column to add (Ignored for list types)
 * @return the builder for the new column. This is mainly useful when the type passed in
 *         is a LIST or a STRUCT, so that children can be added to it.
 * @throws IllegalStateException if this column cannot have children, if it is a LIST that
 *         already has a child, or if the name is already in use.
 */
public Builder addColumn(DType type, String name) {
  if (names == null) {
    throw new IllegalStateException("A column of type " + topLevelType +
        " cannot have children");
  }
  boolean listAlreadyHasChild = topLevelType == DType.LIST && !names.isEmpty();
  if (listAlreadyHasChild) {
    throw new IllegalStateException("A LIST column can only have one child");
  }
  if (names.contains(name)) {
    throw new IllegalStateException("Cannot add duplicate names to a schema");
  }
  Builder childBuilder = new Builder(type);
  types.add(childBuilder);
  names.add(name);
  return childBuilder;
}

/**
 * Add a single column to the current schema and keep working on the current builder.
 * addColumn is preferred as it returns the child builder, which is needed to support
 * nested types.
 * @param type the type of the column.
 * @param name the name of the column.
 * @return this for chaining.
 */
public Builder column(DType type, String name) {
  // The child builder returned by addColumn is deliberately dropped.
  this.addColumn(type, name);
  return this;
}

/**
* Build the schema described by this builder. Every child builder is built first, so
* the whole tree of children is converted.
* @return the new Schema.
*/
public Schema build() {
return new Schema(names, types);
List<Schema> children = null;
if (types != null) {
// Build each child builder, in the order the columns were added.
children = new ArrayList<>(types.size());
for (Builder b: types) {
children.add(b.build());
}
}
return new Schema(topLevelType, names, children);
}
}
}
Loading
Loading