
Commit

Add DMatrix and Annotator
hcho3 committed Apr 28, 2023
1 parent 7b1c039 commit 6062a13
Showing 7 changed files with 293 additions and 13 deletions.
3 changes: 3 additions & 0 deletions python/pyproject.toml
@@ -34,4 +34,7 @@ dependencies = [
[project.optional-dependencies]
scikit-learn = ["scikit-learn"]

[tool.mypy]
plugins = "numpy.typing.mypy_plugin"

[tool.hatch.build.targets.wheel.hooks.custom]
5 changes: 3 additions & 2 deletions python/tl2cgen/__init__.py
@@ -2,8 +2,9 @@
"""
TL2cgen: Model compiler for decision tree ensembles
"""
from .core import _py_version, generate_c_code
from .core import _py_version, annotate_branch, generate_c_code
from .data import DMatrix

__version__ = _py_version()

__all__ = ["generate_c_code"]
__all__ = ["annotate_branch", "generate_c_code", "DMatrix"]
33 changes: 32 additions & 1 deletion python/tl2cgen/core.py
Expand Up @@ -4,7 +4,8 @@

import treelite

from .handle_class import _Compiler, _convert_treelite_model
from .data import DMatrix
from .handle_class import _Annotator, _Compiler, _convert_treelite_model


def _py_version() -> str:
@@ -59,3 +60,33 @@ def generate_c_code(
_model = _convert_treelite_model(model)
compiler_obj = _Compiler(params, compiler, verbose)
compiler_obj.compile(_model, dirpath)


def annotate_branch(
path: Union[str, pathlib.Path],
model: treelite.Model,
dmat: DMatrix,
nthread: Optional[int],
verbose: bool = False,
) -> None:
"""
Annotate branches in a given model using frequency patterns in the training data, and
save the annotation data to a JSON file. Each node is annotated with the number of
data instances that belong to it.

Parameters
----------
path :
Location of JSON file
model :
Model to annotate
dmat :
Data matrix representing the training data
nthread :
Number of threads to use while annotating. If set to ``None``, all physical cores in the system are used.
verbose :
Whether to print extra messages
"""
_model = _convert_treelite_model(model)
nthread = nthread if nthread is not None else 0
annotator = _Annotator(_model, dmat, nthread, verbose)
annotator.save(path)
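For orientation, here is a minimal usage sketch of the new API added in this commit. The model and data file names are placeholders, and the `annotate_in` compiler parameter name follows Treelite's compiler convention (assumed to carry over to TL2cgen); it is shown only to illustrate how the annotation JSON is meant to be consumed.

```python
import numpy as np
import treelite

import tl2cgen

# Load a previously saved Treelite model and the matching training data
# (file names here are hypothetical).
model = treelite.Model.deserialize("model.bin")
X_train = np.load("X_train.npy")  # 2-D array of shape (num_row, num_col)

# Wrap the training data in the new DMatrix class.
dmat = tl2cgen.DMatrix(X_train, dtype="float32")

# Count how many training rows pass through each branch and save the counts
# to a JSON file. nthread=None maps to 0 internally, i.e. use all physical cores.
tl2cgen.annotate_branch(
    path="annotation.json",
    model=model,
    dmat=dmat,
    nthread=None,
    verbose=True,
)

# The annotation file is meant to be fed back into the compiler, e.g. via the
# "annotate_in" parameter (name inherited from Treelite's compiler).
tl2cgen.generate_c_code(
    model,
    dirpath="./out",
    params={"annotate_in": "annotation.json"},
)
```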
181 changes: 181 additions & 0 deletions python/tl2cgen/data.py
@@ -0,0 +1,181 @@
"""Data matrix"""
import ctypes
from typing import Optional, Tuple, Union

import numpy as np
import numpy.typing as npt
import scipy.sparse  # type: ignore

from .dtypes import (
numpy_type_to_type_info,
type_info_to_ctypes_type,
type_info_to_numpy_type,
)
from .exception import TL2cgenError
from .libloader import _LIB, _check_call
from .util import c_str


class DMatrix:
"""Data matrix used in TL2cgen.

Parameters
----------
data :
Data source
dtype :
If specified, the data will be cast to the given data type.
missing :
Value in the data that represents a missing entry. If set to ``None``,
``numpy.nan`` will be used.
"""

# pylint: disable=R0902,R0903,R0913

def __init__(
self,
data: Union[str, npt.NDArray, scipy.sparse.csr_matrix],
*,
dtype: Optional[str] = None,
missing: Optional[float] = None,
):
if data is None:
raise TL2cgenError("'data' argument cannot be None")

self.handle = ctypes.c_void_p()

if isinstance(data, (str,)):
raise TL2cgenError(
"'data' argument cannot be a string. Did you mean to load data from a text file? "
"Please use the following packages to load the text file:\n"
" * CSV file: Use pandas.read_csv() or numpy.loadtxt()\n"
" * LIBSVM file: Use sklearn.datasets.load_svmlight_file()"
)
if isinstance(data, scipy.sparse.csr_matrix):
self._init_from_csr(data, dtype=dtype)
elif isinstance(data, scipy.sparse.csc_matrix):
self._init_from_csr(data.tocsr(), dtype=dtype)
elif isinstance(data, np.ndarray):
self._init_from_npy2d(data, missing=missing, dtype=dtype)
else: # any type that's convertible to CSR matrix is O.K.
try:
csr = scipy.sparse.csr_matrix(data)
self._init_from_csr(csr, dtype=dtype)
except Exception as e:
raise TypeError(
f"Cannot initialize DMatrix from {type(data).__name__}"
) from e
num_row, num_col, nelem = self._get_dims()
self.shape = (num_row, num_col)
self.size = nelem

def _init_from_csr(
self, csr: scipy.sparse.csr_matrix, *, dtype: Optional[str] = None
) -> None:
"""Initialize data from a CSR (Compressed Sparse Row) matrix"""
if len(csr.indices) != len(csr.data):
raise ValueError(
f"indices and data not of same length: {len(csr.indices)} vs {len(csr.data)}"
)
if len(csr.indptr) != csr.shape[0] + 1:
raise ValueError(
"len(indptr) must be equal to 1 + [number of rows]. "
f"len(indptr) = {len(csr.indptr)} vs 1 + [number of rows] = {1 + csr.shape[0]}"
)
if csr.indptr[-1] != len(csr.data):
raise ValueError(
"last entry of indptr must be equal to len(data). "
f"indptr[-1] = {csr.indptr[-1]} vs len(data) = {len(csr.data)}"
)
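# For reference, the invariants checked above, illustrated on the 2x3 matrix
# [[1.0, 0.0, 2.5], [0.0, 3.0, 0.0]] stored in CSR form:
#   data    = [1.0, 2.5, 3.0]   # stored values, row by row
#   indices = [0, 2, 1]         # column index of each stored value
#   indptr  = [0, 2, 3]         # row i occupies data[indptr[i]:indptr[i+1]]
# so len(indices) == len(data), len(indptr) == num_rows + 1, and
# indptr[-1] == len(data).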

if dtype is None:
data_type = csr.data.dtype
else:
data_type = type_info_to_numpy_type(dtype)
data_type_code = numpy_type_to_type_info(data_type)
data_ptr_type = ctypes.POINTER(type_info_to_ctypes_type(data_type_code))
if data_type_code not in ["float32", "float64"]:
raise ValueError("data should be either float32 or float64 type")

data = np.array(csr.data, copy=False, dtype=data_type, order="C")
indices = np.array(csr.indices, copy=False, dtype=np.uintc, order="C")
indptr = np.array(csr.indptr, copy=False, dtype=np.uintp, order="C")
_check_call(
_LIB.TL2cgenDMatrixCreateFromCSR(
data.ctypes.data_as(data_ptr_type),
c_str(data_type_code),
indices.ctypes.data_as(ctypes.POINTER(ctypes.c_uint)),
indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_size_t)),
ctypes.c_size_t(csr.shape[0]),
ctypes.c_size_t(csr.shape[1]),
ctypes.byref(self.handle),
)
)

def _init_from_npy2d(
self,
mat: npt.NDArray,
*,
missing: Optional[float] = None,
dtype: Optional[str] = None,
) -> None:
"""
Initialize data from a 2-D NumPy matrix.

If ``mat`` does not have ``order='C'`` (also known as row-major) or is not
contiguous, a temporary copy will be made. Likewise, if ``mat`` does not have the
requested ``dtype``, a temporary copy will also be made. Thus, as many as two
temporary copies of the data can be made; choose the input layout and dtype
judiciously to conserve memory.
"""
if len(mat.shape) != 2:
raise ValueError("Input numpy.ndarray must be two-dimensional")
data_type: npt.DTypeLike = (
mat.dtype if dtype is None else type_info_to_numpy_type(dtype)
)
data_type_code = numpy_type_to_type_info(data_type)
data_ptr_type = ctypes.POINTER(type_info_to_ctypes_type(data_type_code))
if data_type_code not in ["float32", "float64"]:
raise ValueError("data should be either float32 or float64 type")
# Flatten the array by rows and ensure it has the requested dtype.
# We try to avoid data copies where possible
# (reshape returns a view when possible, and we explicitly tell np.array to
# avoid copying).
data = np.array(mat.reshape(mat.size), copy=False, dtype=data_type)
missing = missing if missing is not None else np.nan
missing_ar = np.array([missing], dtype=data_type, order="C")
_check_call(
_LIB.TL2cgenDMatrixCreateFromMat(
data.ctypes.data_as(data_ptr_type),
c_str(data_type_code),
ctypes.c_size_t(mat.shape[0]),
ctypes.c_size_t(mat.shape[1]),
missing_ar.ctypes.data_as(data_ptr_type),
ctypes.byref(self.handle),
)
)

def _get_dims(self) -> Tuple[int, int, int]:
num_row = ctypes.c_size_t()
num_col = ctypes.c_size_t()
nelem = ctypes.c_size_t()
_check_call(
_LIB.TL2cgenDMatrixGetDimension(
self.handle,
ctypes.byref(num_row),
ctypes.byref(num_col),
ctypes.byref(nelem),
)
)
return num_row.value, num_col.value, nelem.value

def __del__(self):
if self.handle:
_check_call(_LIB.TL2cgenDMatrixFree(self.handle))
self.handle = None

def __repr__(self):
return (
f"<{self.shape[0]}x{self.shape[1]} sparse matrix of type tl2cgen.DMatrix\n"
f" with {self.size} stored elements in Compressed Sparse Row format>"
)
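A short sketch of the two construction paths implemented above (dense NumPy input and SciPy CSR input); the arrays are made up for illustration and the expected values are shown in comments.

```python
import numpy as np
import scipy.sparse

import tl2cgen

X = np.array([[1.0, 0.0, 2.5],
              [0.0, 3.0, 0.0]], order="C")

# Dense path: the 2-D array is passed to TL2cgenDMatrixCreateFromMat.
# Here 0.0 is declared as the missing-value marker.
dmat_dense = tl2cgen.DMatrix(X, missing=0.0)
print(dmat_dense.shape)        # (2, 3)

# Sparse path: a CSR matrix keeps only the nonzero entries; CSC input or
# anything convertible by scipy.sparse.csr_matrix() is converted first.
dmat_sparse = tl2cgen.DMatrix(scipy.sparse.csr_matrix(X), dtype="float32")
print(dmat_sparse.shape, dmat_sparse.size)  # (2, 3) 3
print(dmat_sparse)
# <2x3 sparse matrix of type tl2cgen.DMatrix
#  with 3 stored elements in Compressed Sparse Row format>
```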
40 changes: 40 additions & 0 deletions python/tl2cgen/dtypes.py
@@ -0,0 +1,40 @@
"""Utility functions to handle types"""

import ctypes
from typing import Any, Dict

import numpy as np
import numpy.typing as npt

_CTYPES_TYPE_TABLE: Dict[str, Any] = {
"uint32": ctypes.c_uint32,
"float32": ctypes.c_float,
"float64": ctypes.c_double,
}

_NUMPY_TYPE_TABLE: Dict[str, npt.DTypeLike] = {
"uint32": np.uint32,
"float32": np.float32,
"float64": np.float64,
}


def type_info_to_ctypes_type(type_info: str) -> Any:
"""Obtain ctypes type corresponding to a given TypeInfo"""
return _CTYPES_TYPE_TABLE[type_info]


def type_info_to_numpy_type(type_info: str) -> npt.DTypeLike:
"""Obtain NumPy type corresponding to a given TypeInfo"""
return _NUMPY_TYPE_TABLE[type_info]


def numpy_type_to_type_info(type_info: npt.DTypeLike) -> str:
"""Obtain TypeInfo corresponding to a given NumPy type"""
if type_info == np.uint32:
return "uint32"
if type_info == np.float32:
return "float32"
if type_info == np.float64:
return "float64"
raise ValueError(f"Unrecognized NumPy type: {type_info}")
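The three helpers form a small round trip between TL2cgen type strings, NumPy dtypes, and ctypes types; the sketch below mirrors how `DMatrix` uses them when preparing buffers for the C API.

```python
import ctypes

import numpy as np

from tl2cgen.dtypes import (
    numpy_type_to_type_info,
    type_info_to_ctypes_type,
    type_info_to_numpy_type,
)

# Type string -> NumPy dtype -> back to the type string.
np_type = type_info_to_numpy_type("float32")   # numpy.float32
assert numpy_type_to_type_info(np_type) == "float32"

# The type string also selects the ctypes element type used to build the
# pointer passed to the C API, as in DMatrix._init_from_npy2d().
c_type = type_info_to_ctypes_type("float32")   # ctypes.c_float
data = np.zeros(4, dtype=np_type)
data_ptr = data.ctypes.data_as(ctypes.POINTER(c_type))

# Unsupported dtypes raise ValueError.
try:
    numpy_type_to_type_info(np.int64)
except ValueError as err:
    print(err)  # Unrecognized NumPy type: <class 'numpy.int64'>
```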
43 changes: 33 additions & 10 deletions python/tl2cgen/handle_class.py
Expand Up @@ -6,6 +6,7 @@

import treelite

from .data import DMatrix
from .libloader import _LIB, _check_call
from .util import c_str

@@ -35,6 +36,37 @@ def __del__(self):
_check_call(_LIB.TL2cgenFreeTreeliteModel(self.handle))


class _Annotator:
"""Annotator object"""

def __init__(
self,
model: _TreeliteModel,
dmat: DMatrix,
nthread: int,
verbose: bool = False,
):
self.handle = ctypes.c_void_p()
_check_call(
_LIB.TL2cgenAnnotateBranch(
model.handle,
dmat.handle,
ctypes.c_int(nthread),
ctypes.c_int(1 if verbose else 0),
ctypes.byref(self.handle),
)
)

def save(self, path: Union[str, pathlib.Path]):
"""Save annotation data to a JSON file"""
path = pathlib.Path(path).expanduser().resolve()
_check_call(_LIB.TL2cgenAnnotationSave(self.handle, c_str(str(path))))

def __del__(self):
if self.handle:
_check_call(_LIB.TL2cgenAnnotationFree(self.handle))


class _Compiler:
"""Compiler object"""

@@ -57,16 +89,7 @@ def __init__(
)

def compile(self, model: _TreeliteModel, dirpath: Union[str, pathlib.Path]) -> None:
"""
Generate prediction code
Parameters
----------
model :
Model to convert to C code
dirpath :
Directory to store header and source files
"""
"""Generate prediction code"""
dirpath = pathlib.Path(dirpath).expanduser().resolve()
_check_call(
_LIB.TL2cgenCompilerGenerateCode(
1 change: 1 addition & 0 deletions src/c_api/c_api_error.cc
@@ -4,6 +4,7 @@
* \author Hyunsu Cho
* \brief C error handling
*/
#include <tl2cgen/c_api.h>
#include <tl2cgen/c_api_error.h>
#include <tl2cgen/thread_local.h>
#include <tl2cgen/version.h>
