Merge pull request #15 from rapidsai/branch-0.13

Update local fork 0.13
rapidsai · Jan 30, 2020 · 395518b · 395518b
2 parents e51abbf + e3732e3
commit 395518b
Show file tree

Hide file tree

Showing 109 changed files with 11,430 additions and 5,735 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -3,16 +3,37 @@
 ## New Features
 
 ## Improvements
+- PR #3698 Add count_(un)set_bits functions taking multiple ranges and updated slice to compute null counts at once.
+- PR #3909 Move java backend to libcudf++
+- PR #3971 Adding `as_table` to convert Column to Table in python
+- PR #3910 Adding sinh, cosh, tanh, asinh, acosh, atanh, cube root and rint unary support.
+- PR #3972 Add Java bindings for left_semi_join and left_anti_join
+- PR #3975 Simplify and generalize data handling in `Buffer`
+- PR #3911 Adding null boolean handling for copy_if_else
+- PR #4002 Adding to_frame and fix for categorical column issue
+
+- PR #3897 Port cuIO JSON reader to cudf::column types
 
 ## Bug Fixes
 
 - PR #3888 Drop `ptr=None` from `DeviceBuffer` call
+- PR #3976 Fix string serialization and memory_usage method to be consistent
+- PR #3902 Fix conversion of large size GPU array to dataframe
+- PR #3953 Fix overflow in column_buffer when computing the device buffer size
+- PR #3959 Add missing hash-dispatch function for cudf.Series
+- PR #3970 Fix for Series Pickle
+- PR #3964 Restore legacy NVStrings and NVCategory dependencies in Java jar
+- PR #3982 Fix java unary op enum and add missing ops
+- PR #3979 Add `name` to Series serialize and deserialize
+- PR #4005 Fix null mask allocation bug in gather_bitmask
+- PR #4000 Fix dask_cudf sort_values performance for single partitions
 
 
 # cuDF 0.12.0 (Date TBD)
 
 ## New Features
 
+- PR #3759 Updated 10 Minutes with clarification on how `dask_cudf` uses `cudf` API
 - PR #3224 Define and implement new join APIs.
 - PR #3284 Add gpu-accelerated parquet writer
 - PR #3254 Python redesign for libcudf++
@@ -68,7 +89,7 @@
 - PR #3639 Define and implement `nans_to_nulls`
 - PR #3561 Rework contains implementation in search
 - PR #3616 Add aggregation infrastructure for argmax/argmin.
-- PR #3673 Parquet reader: improve rounding of timestamp conversion to seconds 
+- PR #3673 Parquet reader: improve rounding of timestamp conversion to seconds
 - PR #3699 Stringify libcudacxx headers for binary op JIT
 - PR #3697 Improve column insert performance for wide frames
 - PR #3616 Add aggregation infrastructure for argmax/argmin.
@@ -85,6 +106,7 @@
 - PR #2438 Build GBench Benchmarks in CI
 - PR #3713 Adding aggregation support to rolling_window
 - PR #3875 Add abstract sink for IO writers, used by ORC and Parquet writers for now
+- PR #3916 Refactor gather bindings
 
 ## Bug Fixes
 
@@ -96,7 +118,7 @@
 - PR #3563 Use `__cuda_array_interface__` for serialization
 - PR #3564 Fix cuda memory access error in gather_bitmask_kernel
 - PR #3548 Replaced CUDA_RT_CALL with CUDA_TRY
-- PR #3486 Pandas > 0.25 compatability 
+- PR #3486 Pandas > 0.25 compatibility
 - PR #3622 Fix new warnings and errors when building with gcc-8
 - PR #3588 Remove avro reader column order reversal
 - PR #3629 Fix hash map test failure
@@ -124,7 +146,7 @@
 - PR #3769 Don't look for a `name` attribute in column
 - PR #3783 Bind cuDF operators to Dask Dataframe
 - PR #3775 Fix segfault when reading compressed CSV files larger than 4GB
-- PR #3799 Align indices of Series inputs when adding as columns to DataFrame 
+- PR #3799 Align indices of Series inputs when adding as columns to DataFrame
 - PR #3803 Keep name when unpickling Index objects
 - PR #3804 Fix cuda crash in AVRO reader
 - PR #3766 Remove references to cudf::type_id::CATEGORY from IO code
@@ -139,10 +161,11 @@
 - PR #3869 Fixed size calculation in NVStrings::byte_count()
 - PR #3868 Fix apply_grouped moving average example
 - PR #3900 Properly link `NVStrings` and `NVCategory` into tests
-- PR #3868 Fix apply_grouped moving average example 
+- PR #3868 Fix apply_grouped moving average example
 - PR #3871 Fix `split_out` error
 - PR #3886 Fix string column materialization from column view
 - PR #3893 Parquet reader: fix segfault reading empty parquet file
+- PR #3931 Dask-cudf groupby `.agg` multicolumn handling fix
 
 
 # cuDF 0.11.0 (11 Dec 2019)

diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt
@@ -497,6 +497,8 @@ add_library(cudf
             src/io/csv/csv_gpu.cu
             src/io/csv/reader_impl.cu
             src/io/json/legacy/json_reader_impl.cu
+            src/io/json/reader_impl.cu
+            src/io/json/json_gpu.cu
             src/io/orc/legacy/orc_reader_impl.cu
             src/io/orc/legacy/orc_writer_impl.cu
             src/io/orc/orc.cpp
@@ -525,6 +527,8 @@ add_library(cudf
             src/io/functions.cpp
             src/io/statistics/column_stats.cu
             src/io/utilities/datasource.cpp
+            src/io/utilities/parsing_utils.cu
+            src/io/utilities/type_conversion.cu
             src/io/utilities/data_sink.cpp
             src/io/utilities/legacy/parsing_utils.cu
             src/utilities/legacy/cuda_utils.cu

diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp
@@ -482,16 +482,16 @@ std::vector<contiguous_split_result> contiguous_split(cudf::table_view const& in
  *          @p rhs based on the value of the corresponding element in @p boolean_mask
  *
  * Selects each element i in the output column from either @p rhs or @p lhs using the following rule:
- *          output[i] = (boolean_mask[i]) ? lhs[i] : rhs[i]
+ *          output[i] = (boolean_mask.valid(i) and boolean_mask[i]) ? lhs[i] : rhs[i]
  *          
  * @throws cudf::logic_error if lhs and rhs are not of the same type
  * @throws cudf::logic_error if lhs and rhs are not of the same length 
- * @throws cudf::logic_error if boolean_mask contains nulls
  * @throws cudf::logic_error if boolean mask is not of type bool8
  * @throws cudf::logic_error if boolean mask is not of the same length as lhs and rhs  
  * @param[in] left-hand column_view
  * @param[in] right-hand column_view
- * @param[in] Non-nullable column of `BOOL8` elements that control selection from `lhs` or `rhs`
+ * @param[in] column of `BOOL8` representing "left (true) / right (false)" boolean for each element;
+ *            a null element is treated as false.
  * @param[in] mr resource for allocating device memory
  *
  * @returns new column with the selected elements
@@ -504,14 +504,15 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs, column_view const&
  *          @p rhs based on the value of the corresponding element in @p boolean_mask
  *
  * Selects each element i in the output column from either @p rhs or @p lhs using the following rule:
- *          output[i] = (boolean_mask[i]) ? lhs : rhs[i]
+ *          output[i] = (boolean_mask.valid(i) and boolean_mask[i]) ? lhs : rhs[i]
  *         
  * @throws cudf::logic_error if lhs and rhs are not of the same type 
  * @throws cudf::logic_error if boolean mask is not of type bool8
  * @throws cudf::logic_error if boolean mask is not of the same length as rhs  
  * @param[in] left-hand scalar
  * @param[in] right-hand column_view
- * @param[in] column_view representing "left (true) / right (false)" boolean for each element
+ * @param[in] column of `BOOL8` representing "left (true) / right (false)" boolean for each element;
+ *            a null element is treated as false.
  * @param[in] mr resource for allocating device memory 
  *
  * @returns new column with the selected elements
@@ -524,14 +525,15 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs, column_view const& rhs,
  *          @p rhs based on the value of the corresponding element in @p boolean_mask
  *
  * Selects each element i in the output column from either @p rhs or @p lhs using the following rule:
- *          output[i] = (boolean_mask[i]) ? lhs[i] : rhs
+ *          output[i] = (boolean_mask.valid(i) and boolean_mask[i]) ? lhs[i] : rhs
  *         
  * @throws cudf::logic_error if lhs and rhs are not of the same type 
  * @throws cudf::logic_error if boolean mask is not of type bool8
  * @throws cudf::logic_error if boolean mask is not of the same length as lhs  
  * @param[in] left-hand column_view
  * @param[in] right-hand scalar
- * @param[in] column_view representing "left (true) / right (false)" boolean for each element
+ * @param[in] column of `BOOL8` representing "left (true) / right (false)" boolean for each element;
+ *            a null element is treated as false.
  * @param[in] mr resource for allocating device memory 
  *
  * @returns new column with the selected elements
@@ -544,12 +546,13 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs, scalar const& rhs,
  *          @p rhs based on the value of the corresponding element in @p boolean_mask
  *
  * Selects each element i in the output column from either @p rhs or @p lhs using the following rule:
- *          output[i] = (boolean_mask[i]) ? lhs : rhs
+ *          output[i] = (boolean_mask.valid(i) and boolean_mask[i]) ? lhs : rhs
  *          
  * @throws cudf::logic_error if boolean mask is not of type bool8 
  * @param[in] left-hand scalar
  * @param[in] right-hand scalar
- * @param[in] column_view representing "left (true) / right (false)" boolean for each element
+ * @param[in] column of `BOOL8` representing "left (true) / right (false)" boolean for each element;
+ *            a null element is treated as false.
  * @param[in] mr resource for allocating device memory 
  *
  * @returns new column with the selected elements

diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp
@@ -31,12 +31,12 @@ namespace detail {
  * @note It is the caller's responsibility to ensure that the returned view
  * does not outlive the viewed device memory.
  *
- * @throws `cudf::logic_error` if `begin < 0`, `end < begin` or
+ * @throws cudf::logic_error if `begin < 0`, `end < begin` or
  * `end > input.size()`.
  *
- * @param input View of input column to slice
- * @param begin Index of the first desired element in the slice (inclusive).
- * @param end Index of the last desired element in the slice (exclusive).
+ * @param[in] input View of input column to slice
+ * @param[in] begin Index of the first desired element in the slice (inclusive).
+ * @param[in] end Index of the last desired element in the slice (exclusive).
  *
  * @return ColumnView View of the elements `[begin,end)` from `input`.
  *---------------------------------------------------------------------------**/
@@ -63,6 +63,15 @@ ColumnView slice(ColumnView const& input,
                      input.offset() + begin, children);
 }
 
+/**
+ * @copydoc cudf::experimental::slice(column_view const&,std::vector<size_type> const&)
+ *
+ * @param stream Optional CUDA stream on which to execute kernels
+ */
+std::vector<column_view> slice(column_view const& input,
+                               std::vector<size_type> const& indices,
+                               cudaStream_t stream = 0);
+
 /**
  * @copydoc cudf::experimental::contiguous_split
  *

diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp
@@ -15,10 +15,34 @@
  */
 #pragma once
 
+#include <cudf/types.hpp>
+
+#include <vector>
+
 namespace cudf {
 
 namespace detail {
 
+/**
+ * @copydoc cudf::segmented_count_set_bits
+ *
+ * @param[in] stream Optional CUDA stream on which to execute kernels
+ */
+std::vector<size_type>
+segmented_count_set_bits(bitmask_type const* bitmask,
+                         std::vector<size_type> const& indices,
+                         cudaStream_t stream = 0);
+
+/**
+ * @copydoc cudf::segmented_count_unset_bits
+ *
+ * @param[in] stream Optional CUDA stream on which to execute kernels
+ */
+std::vector<size_type>
+segmented_count_unset_bits(bitmask_type const* bitmask,
+                           std::vector<size_type> const& indices,
+                           cudaStream_t stream = 0);
+
 /**---------------------------------------------------------------------------*
  * @brief Concatenates `views[i]`'s bitmask from the bits
  * `[views[i].offset(), views[i].offset() + views[i].size())` for all elements

diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh
@@ -60,7 +60,7 @@ void gather_bitmask(table_view const& source, MapIterator gather_map,
   // Create null mask if source is nullable but target is not
   for (size_t i = 0; i < target.size(); ++i) {
     if (source.column(i).nullable() and not target[i]->nullable()) {
-      auto mask = create_null_mask(target.size(), mask_state::ALL_VALID, stream, mr);
+      auto mask = create_null_mask(target[i]->size(), mask_state::ALL_VALID, stream, mr);
       target[i]->set_null_mask(std::move(mask), 0);
     }
   }

diff --git a/cpp/include/cudf/io/functions.hpp b/cpp/include/cudf/io/functions.hpp
@@ -77,6 +77,52 @@ table_with_metadata read_avro(
     read_avro_args const& args,
     rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
 
+/**---------------------------------------------------------------------------*
+ * @brief Input arguments to the `read_json` interface
+ *
+ * Available parameters are closely patterned after PANDAS' `read_json` API.
+ * Not all parameters are supported. If the matching PANDAS' parameter
+ * has a default value of `None`, then a default value of `-1` or `0` may be
+ * used as the equivalent.
+ *
+ * Parameters in PANDAS that are unavailable in cudf:
+ *  `orient`                - currently fixed-format
+ *  `typ`                   - data is always returned as a cudf::table
+ *  `convert_axes`          - use column functions for axes operations instead
+ *  `convert_dates`         - dates are detected automatically
+ *  `keep_default_dates`    - dates are detected automatically
+ *  `numpy`                 - data is always returned as a cudf::table
+ *  `precise_float`         - there is only one converter
+ *  `date_unit`             - only millisecond units are supported
+ *  `encoding`              - only ASCII-encoded data is supported
+ *  `chunksize`             - use `byte_range_xxx` for chunking instead
+ *---------------------------------------------------------------------------**/
+struct read_json_args {
+  source_info source;
+
+  /// Data types of the column; empty to infer dtypes
+  std::vector<std::string> dtype;
+  /// Specify the compression format of the source or infer from file extension
+  compression_type compression = compression_type::AUTO;
+
+  /// Read the file as a json object per line
+  bool lines = false;
+
+  /// Bytes to skip from the start
+  size_t byte_range_offset = 0;             
+  /// Bytes to read; always reads complete rows
+  size_t byte_range_size = 0;
+
+   /// Whether to parse dates as DD/MM versus MM/DD
+  bool dayfirst = false;
+
+  explicit read_json_args(const source_info& src) : source(src) {}
+};
+
+// Freeform API wraps the detail reader class API
+table_with_metadata read_json(read_json_args const& args,
+                                rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource()); 
+
 /**
  * @brief Settings to use for `read_csv()`
  */