JNI JSON read with DataSource and inferred schema, along with basic Java nested Schema JSON reads (#14954)

This adds in support for some more JSON reading functionality. It allows us to infer the JSON schema using a DataSource as the input. It also adds in support for using a nested Schema when parsing JSON. Authors: - Robert (Bobby) Evans (https://github.com/revans2) Approvers: - Jason Lowe (https://github.com/jlowe) URL: #14954
rapidsai · Feb 8, 2024 · 306c47c · 306c47c
1 parent 8503b31
commit 306c47c
Show file tree

Hide file tree

Showing 5 changed files with 845 additions and 147 deletions.
diff --git a/java/src/main/java/ai/rapids/cudf/Schema.java b/java/src/main/java/ai/rapids/cudf/Schema.java
@@ -1,6 +1,6 @@
 /*
  *
- *  Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ *  Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  *  Licensed under the Apache License, Version 2.0 (the "License");
  *  you may not use this file except in compliance with the License.
@@ -26,78 +26,285 @@
  */
 public class Schema {
   public static final Schema INFERRED = new Schema();
-  private final List<String> names;
-  private final List<DType> types;
 
-  private Schema(List<String> names, List<DType> types) {
-    this.names = new ArrayList<>(names);
-    this.types = new ArrayList<>(types);
+  private final DType topLevelType;
+  private final List<String> childNames;
+  private final List<Schema> childSchemas;
+  private boolean flattened = false;
+  private String[] flattenedNames;
+  private DType[] flattenedTypes;
+  private int[] flattenedCounts;
+
+  private Schema(DType topLevelType,
+                 List<String> childNames,
+                 List<Schema> childSchemas) {
+    this.topLevelType = topLevelType;
+    this.childNames = childNames;
+    this.childSchemas = childSchemas;
   }
 
   /**
    * Inferred schema.
    */
   private Schema() {
-    names = null;
-    types = null;
+    topLevelType = null;
+    childNames = null;
+    childSchemas = null;
+  }
+
+  /**
+   * Get the schema of a child element. Note that an inferred schema will have no children.
+   * @param i the index of the child to read.
+   * @return the new Schema
+   * @throws IndexOutOfBoundsException if the index is not in the range of children.
+   */
+  public Schema getChild(int i) {
+    if (childSchemas == null) {
+      throw new IndexOutOfBoundsException("There are 0 children in this schema");
+    }
+    return childSchemas.get(i);
+  }
+
+  @Override
+  public String toString() {
+    StringBuilder sb = new StringBuilder();
+    sb.append(topLevelType);
+    if (topLevelType == DType.STRUCT) {
+      sb.append("{");
+      if (childNames != null) {
+        for (int i = 0; i < childNames.size(); i++) {
+          if (i != 0) {
+            sb.append(", ");
+          }
+          sb.append(childNames.get(i));
+          sb.append(": ");
+          sb.append(childSchemas.get(i));
+        }
+      }
+      sb.append("}");
+    } else if (topLevelType == DType.LIST) {
+      sb.append("[");
+      if (childNames != null) {
+        for (int i = 0; i < childNames.size(); i++) {
+          if (i != 0) {
+            sb.append(", ");
+          }
+          sb.append(childSchemas.get(i));
+        }
+      }
+      sb.append("]");
+    }
+    return sb.toString();
+  }
+
+  private void flattenIfNeeded() {
+    if (!flattened) {
+      int flatLen = flattenedLength(0);
+      if (flatLen == 0) {
+        flattenedNames = null;
+        flattenedTypes = null;
+        flattenedCounts = null;
+      } else {
+        String[] names = new String[flatLen];
+        DType[] types = new DType[flatLen];
+        int[] counts = new int[flatLen];
+        collectFlattened(names, types, counts, 0);
+        flattenedNames = names;
+        flattenedTypes = types;
+        flattenedCounts = counts;
+      }
+      flattened = true;
+    }
+  }
+
+  private int flattenedLength(int startingLength) {
+    if (childSchemas != null) {
+      for (Schema child: childSchemas) {
+        startingLength++;
+        startingLength = child.flattenedLength(startingLength);
+      }
+    }
+    return startingLength;
+  }
+
+  private int collectFlattened(String[] names, DType[] types, int[] counts, int offset) {
+    if (childSchemas != null) {
+      for (int i = 0; i < childSchemas.size(); i++) {
+        Schema child = childSchemas.get(i);
+        names[offset] = childNames.get(i);
+        types[offset] = child.topLevelType;
+        if (child.childNames != null) {
+          counts[offset] = child.childNames.size();
+        } else {
+          counts[offset] = 0;
+        }
+        offset++;
+        offset = this.childSchemas.get(i).collectFlattened(names, types, counts, offset);
+      }
+    }
+    return offset;
   }
 
   public static Builder builder() {
-    return new Builder();
+    return new Builder(DType.STRUCT);
+  }
+
+  public String[] getFlattenedColumnNames() {
+    flattenIfNeeded();
+    return flattenedNames;
   }
 
   public String[] getColumnNames() {
-    if (names == null) {
+    if (childNames == null) {
       return null;
     }
-    return names.toArray(new String[names.size()]);
+    return childNames.toArray(new String[childNames.size()]);
+  }
+
+  public boolean isNested() {
+    return childSchemas != null && childSchemas.size() > 0;
+  }
+
+  /**
+   * This is really for a top level struct schema where it is nested, but
+   * for things like CSV we care that it does not have any children that are also
+   * nested.
+   */
+  public boolean hasNestedChildren() {
+    if (childSchemas != null) {
+      for (Schema child: childSchemas) {
+        if (child.isNested()) {
+          return true;
+        }
+      }
+    }
+    return false;
   }
 
-  int[] getTypeIds() {
-    if (types == null) {
+  int[] getFlattenedTypeIds() {
+    flattenIfNeeded();
+    if (flattenedTypes == null) {
       return null;
     }
-    int[] ret = new int[types.size()];
-    for (int i = 0; i < types.size(); i++) {
-      ret[i] = types.get(i).getTypeId().nativeId;
+    int[] ret = new int[flattenedTypes.length];
+    for (int i = 0; i < flattenedTypes.length; i++) {
+      ret[i] = flattenedTypes[i].getTypeId().nativeId;
     }
     return ret;
   }
 
-  int[] getTypeScales() {
-    if (types == null) {
+  int[] getFlattenedTypeScales() {
+    flattenIfNeeded();
+    if (flattenedTypes == null) {
       return null;
     }
-    int[] ret = new int[types.size()];
-    for (int i = 0; i < types.size(); i++) {
-      ret[i] = types.get(i).getScale();
+    int[] ret = new int[flattenedTypes.length];
+    for (int i = 0; i < flattenedTypes.length; i++) {
+      ret[i] = flattenedTypes[i].getScale();
     }
     return ret;
   }
 
-  DType[] getTypes() {
-    if (types == null) {
+  DType[] getFlattenedTypes() {
+    flattenIfNeeded();
+    return flattenedTypes;
+  }
+
+  public DType[] getChildTypes() {
+    if (childSchemas == null) {
       return null;
     }
-    DType[] ret = new DType[types.size()];
-    for (int i = 0; i < types.size(); i++) {
-      ret[i] = types.get(i);
+    DType[] ret = new DType[childSchemas.size()];
+    for (int i = 0; i < ret.length; i++) {
+      ret[i] = childSchemas.get(i).topLevelType;
     }
     return ret;
   }
 
+  int[] getFlattenedNumChildren() {
+    flattenIfNeeded();
+    return flattenedCounts;
+  }
+
+  public DType getType() {
+    return topLevelType;
+  }
+
+  /**
+   * Check to see if the schema includes a struct at all.
+   * @return true if this or any one of its descendants contains a struct, else false.
+   */
+  public boolean isStructOrHasStructDescendant() {
+    if (DType.STRUCT == topLevelType) {
+      return true;
+    } else if (DType.LIST == topLevelType) {
+      return childSchemas.stream().anyMatch(Schema::isStructOrHasStructDescendant);
+    }
+    return false;
+  }
+
   public static class Builder {
-    private final List<String> names = new ArrayList<>();
-    private final List<DType> types = new ArrayList<>();
+    private final DType topLevelType;
+    private final List<String> names;
+    private final List<Builder> types;
 
-    public Builder column(DType type, String name) {
-      types.add(type);
+    private Builder(DType topLevelType) {
+      this.topLevelType = topLevelType;
+      if (topLevelType == DType.STRUCT || topLevelType == DType.LIST) {
+        // There can be children
+        names = new ArrayList<>();
+        types = new ArrayList<>();
+      } else {
+        names = null;
+        types = null;
+      }
+    }
+
+    /**
+     * Add a new column
+     * @param type the type of column to add
+     * @param name the name of the column to add (Ignored for list types)
+     * @return the builder for the new column. This should really only be used when the type
+     * passed in is a LIST or a STRUCT.
+     */
+    public Builder addColumn(DType type, String name) {
+      if (names == null) {
+        throw new IllegalStateException("A column of type " + topLevelType +
+            " cannot have children");
+      }
+      if (topLevelType == DType.LIST && names.size() > 0) {
+        throw new IllegalStateException("A LIST column can only have one child");
+      }
+      if (names.contains(name)) {
+        throw new IllegalStateException("Cannot add duplicate names to a schema");
+      }
+      Builder ret = new Builder(type);
+      types.add(ret);
       names.add(name);
+      return ret;
+    }
+
+    /**
+     * Adds a single column to the current schema. addColumn is preferred as it can be used
+     * to support nested types.
+     * @param type the type of the column.
+     * @param name the name of the column.
+     * @return this for chaining.
+     */
+    public Builder column(DType type, String name) {
+      addColumn(type, name);
       return this;
     }
 
     public Schema build() {
-      return new Schema(names, types);
+      List<Schema> children = null;
+      if (types != null) {
+        children = new ArrayList<>(types.size());
+        for (Builder b: types) {
+          children.add(b.build());
+        }
+      }
+      return new Schema(topLevelType, names, children);
     }
   }
 }