
Commit

Add DMatrix and Annotator
hcho3 committed Apr 28, 2023
1 parent 7b1c039 commit 6062a13
Showing 7 changed files with 293 additions and 13 deletions.
3 changes: 3 additions & 0 deletions python/pyproject.toml
@@ -34,4 +34,7 @@ dependencies = [
[project.optional-dependencies]
scikit-learn = ["scikit-learn"]

[tool.mypy]
plugins = "numpy.typing.mypy_plugin"

[tool.hatch.build.targets.wheel.hooks.custom]
5 changes: 3 additions & 2 deletions python/tl2cgen/__init__.py
@@ -2,8 +2,9 @@
"""
TL2cgen: Model compiler for decision tree ensembles
"""
from .core import _py_version, generate_c_code
from .core import _py_version, annotate_branch, generate_c_code
from .data import DMatrix

__version__ = _py_version()

__all__ = ["generate_c_code"]
__all__ = ["annotate_branch", "generate_c_code", "DMatrix"]
33 changes: 32 additions & 1 deletion python/tl2cgen/core.py
Expand Up @@ -4,7 +4,8 @@

import treelite

from .handle_class import _Compiler, _convert_treelite_model
from .data import DMatrix
from .handle_class import _Annotator, _Compiler, _convert_treelite_model


def _py_version() -> str:
@@ -59,3 +60,33 @@ def generate_c_code(
_model = _convert_treelite_model(model)
compiler_obj = _Compiler(params, compiler, verbose)
compiler_obj.compile(_model, dirpath)


def annotate_branch(
path: Union[str, pathlib.Path],
model: treelite.Model,
dmat: DMatrix,
nthread: Optional[int],
verbose: bool = False,
) -> None:
"""
Annotate branches in a given model using frequency patterns in the training data, and
save the annotation data to a JSON file. Each node is annotated with the number of
data instances that belong to it.

Parameters
----------
path :
Location of JSON file
model :
Model to annotate
dmat :
Data matrix representing the training data
nthread :
Number of threads to use while annotating. If set to ``None``, all physical cores in the system are used.
verbose :
Whether to print extra messages
"""
_model = _convert_treelite_model(model)
nthread = nthread if nthread is not None else 0
annotator = _Annotator(_model, dmat, nthread, verbose)
annotator.save(path)
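For orientation, here is a minimal usage sketch of the new API added in this commit. The model and data file names are placeholders, and the `annotate_in` compiler parameter name follows Treelite's compiler convention (assumed to carry over to TL2cgen); it is shown only to illustrate how the annotation JSON is meant to be consumed.

```python
import numpy as np
import treelite

import tl2cgen

# Load a previously saved Treelite model and the matching training data
# (file names here are hypothetical).
model = treelite.Model.deserialize("model.bin")
X_train = np.load("X_train.npy")  # 2-D array of shape (num_row, num_col)

# Wrap the training data in the new DMatrix class.
dmat = tl2cgen.DMatrix(X_train, dtype="float32")

# Count how many training rows pass through each branch and save the counts
# to a JSON file. nthread=None maps to 0 internally, i.e. use all physical cores.
tl2cgen.annotate_branch(
    path="annotation.json",
    model=model,
    dmat=dmat,
    nthread=None,
    verbose=True,
)

# The annotation file is meant to be fed back into the compiler, e.g. via the
# "annotate_in" parameter (name inherited from Treelite's compiler).
tl2cgen.generate_c_code(
    model,
    dirpath="./out",
    params={"annotate_in": "annotation.json"},
)
```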
181 changes: 181 additions & 0 deletions python/tl2cgen/data.py
@@ -0,0 +1,181 @@
"""Data matrix"""
import ctypes
from typing import Optional, Tuple, Union

import numpy as np
import numpy.typing as npt
import scipy.sparse  # type: ignore

from .dtypes import (
numpy_type_to_type_info,
type_info_to_ctypes_type,
type_info_to_numpy_type,
)
from .exception import TL2cgenError
from .libloader import _LIB, _check_call
from .util import c_str


class DMatrix:
"""Data matrix used in TL2cgen.

Parameters
----------
data :
Data source
dtype :
If specified, the data will be cast to the given data type.
missing :
Value in the data that represents a missing entry. If set to ``None``,
``numpy.nan`` will be used.
"""

# pylint: disable=R0902,R0903,R0913

def __init__(
self,
data: Union[str, npt.NDArray, scipy.sparse.csr_matrix],
*,
dtype: Optional[str] = None,
missing: Optional[float] = None,
):
if data is None:
raise TL2cgenError("'data' argument cannot be None")

self.handle = ctypes.c_void_p()

if isinstance(data, (str,)):
raise TL2cgenError(
"'data' argument cannot be a string. Did you mean to load data from a text file? "
"Please use the following packages to load the text file:\n"
" * CSV file: Use pandas.read_csv() or numpy.loadtxt()\n"
" * LIBSVM file: Use sklearn.datasets.load_svmlight_file()"
)
if isinstance(data, scipy.sparse.csr_matrix):
self._init_from_csr(data, dtype=dtype)
elif isinstance(data, scipy.sparse.csc_matrix):
self._init_from_csr(data.tocsr(), dtype=dtype)
elif isinstance(data, np.ndarray):
self._init_from_npy2d(data, missing=missing, dtype=dtype)
else: # any type that's convertible to CSR matrix is O.K.
try:
csr = scipy.sparse.csr_matrix(data)
self._init_from_csr(csr, dtype=dtype)
except Exception as e:
raise TypeError(
f"Cannot initialize DMatrix from {type(data).__name__}"
) from e
num_row, num_col, nelem = self._get_dims()
self.shape = (num_row, num_col)
self.size = nelem

def _init_from_csr(
self, csr: scipy.sparse.csr_matrix, *, dtype: Optional[str] = None
) -> None:
"""Initialize data from a CSR (Compressed Sparse Row) matrix"""
if len(csr.indices) != len(csr.data):
raise ValueError(
f"indices and data not of same length: {len(csr.indices)} vs {len(csr.data)}"
)
if len(csr.indptr) != csr.shape[0] + 1:
raise ValueError(
"len(indptr) must be equal to 1 + [number of rows]. "
f"len(indptr) = {len(csr.indptr)} vs 1 + [number of rows] = {1 + csr.shape[0]}"
)
if csr.indptr[-1] != len(csr.data):
raise ValueError(
"last entry of indptr must be equal to len(data). "
f"indptr[-1] = {csr.indptr[-1]} vs len(data) = {len(csr.data)}"
)
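# For reference, the invariants checked above, illustrated on the 2x3 matrix
# [[1.0, 0.0, 2.5], [0.0, 3.0, 0.0]] stored in CSR form:
#   data    = [1.0, 2.5, 3.0]   # stored values, row by row
#   indices = [0, 2, 1]         # column index of each stored value
#   indptr  = [0, 2, 3]         # row i occupies data[indptr[i]:indptr[i+1]]
# so len(indices) == len(data), len(indptr) == num_rows + 1, and
# indptr[-1] == len(data).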

if dtype is None:
data_type = csr.data.dtype
else:
data_type = type_info_to_numpy_type(dtype)
data_type_code = numpy_type_to_type_info(data_type)
data_ptr_type = ctypes.POINTER(type_info_to_ctypes_type(data_type_code))
if data_type_code not in ["float32", "float64"]:
raise ValueError("data should be either float32 or float64 type")

data = np.array(csr.data, copy=False, dtype=data_type, order="C")
indices = np.array(csr.indices, copy=False, dtype=np.uintc, order="C")
indptr = np.array(csr.indptr, copy=False, dtype=np.uintp, order="C")
_check_call(
_LIB.TL2cgenDMatrixCreateFromCSR(
data.ctypes.data_as(data_ptr_type),
c_str(data_type_code),
indices.ctypes.data_as(ctypes.POINTER(ctypes.c_uint)),
indptr.ctypes.data_as(ctypes.POINTER(ctypes.c_size_t)),
ctypes.c_size_t(csr.shape[0]),
ctypes.c_size_t(csr.shape[1]),
ctypes.byref(self.handle),
)
)

def _init_from_npy2d(
self,
mat: npt.NDArray,
*,
missing: Optional[float] = None,
dtype: Optional[str] = None,
) -> None:
"""
Initialize data from a 2-D NumPy matrix.

If ``mat`` does not have ``order='C'`` (also known as row-major) or is not
contiguous, a temporary copy will be made. Likewise, if ``mat`` does not have the
requested ``dtype``, a temporary copy will also be made. Thus, as many as two
temporary copies of the data can be made; choose the input layout and dtype
judiciously to conserve memory.
"""
if len(mat.shape) != 2:
raise ValueError("Input numpy.ndarray must be two-dimensional")
data_type: npt.DTypeLike = (
mat.dtype if dtype is None else type_info_to_numpy_type(dtype)
)
data_type_code = numpy_type_to_type_info(data_type)
data_ptr_type = ctypes.POINTER(type_info_to_ctypes_type(data_type_code))
if data_type_code not in ["float32", "float64"]:
raise ValueError("data should be either float32 or float64 type")
# Flatten the array by rows and ensure it has the requested dtype.
# We try to avoid data copies where possible
# (reshape returns a view when possible, and we explicitly tell np.array to
# avoid copying).
data = np.array(mat.reshape(mat.size), copy=False, dtype=data_type)
missing = missing if missing is not None else np.nan
missing_ar = np.array([missing], dtype=data_type, order="C")
_check_call(
_LIB.TL2cgenDMatrixCreateFromMat(
data.ctypes.data_as(data_ptr_type),
c_str(data_type_code),
ctypes.c_size_t(mat.shape[0]),
ctypes.c_size_t(mat.shape[1]),
missing_ar.ctypes.data_as(data_ptr_type),
ctypes.byref(self.handle),
)
)

def _get_dims(self) -> Tuple[int, int, int]:
num_row = ctypes.c_size_t()
num_col = ctypes.c_size_t()
nelem = ctypes.c_size_t()
_check_call(
_LIB.TL2cgenDMatrixGetDimension(
self.handle,
ctypes.byref(num_row),
ctypes.byref(num_col),
ctypes.byref(nelem),
)
)
return num_row.value, num_col.value, nelem.value

def __del__(self):
if self.handle:
_check_call(_LIB.TL2cgenDMatrixFree(self.handle))
self.handle = None

def __repr__(self):
return (
f"<{self.shape[0]}x{self.shape[1]} sparse matrix of type tl2cgen.DMatrix\n"
f" with {self.size} stored elements in Compressed Sparse Row format>"
)
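A short sketch of the two construction paths implemented above (dense NumPy input and SciPy CSR input); the arrays are made up for illustration and the expected values are shown in comments.

```python
import numpy as np
import scipy.sparse

import tl2cgen

X = np.array([[1.0, 0.0, 2.5],
              [0.0, 3.0, 0.0]], order="C")

# Dense path: the 2-D array is passed to TL2cgenDMatrixCreateFromMat.
# Here 0.0 is declared as the missing-value marker.
dmat_dense = tl2cgen.DMatrix(X, missing=0.0)
print(dmat_dense.shape)        # (2, 3)

# Sparse path: a CSR matrix keeps only the nonzero entries; CSC input or
# anything convertible by scipy.sparse.csr_matrix() is converted first.
dmat_sparse = tl2cgen.DMatrix(scipy.sparse.csr_matrix(X), dtype="float32")
print(dmat_sparse.shape, dmat_sparse.size)  # (2, 3) 3
print(dmat_sparse)
# <2x3 sparse matrix of type tl2cgen.DMatrix
#  with 3 stored elements in Compressed Sparse Row format>
```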
40 changes: 40 additions & 0 deletions python/tl2cgen/dtypes.py
@@ -0,0 +1,40 @@
"""Utility functions to handle types"""

import ctypes
from typing import Any, Dict

import numpy as np
import numpy.typing as npt

_CTYPES_TYPE_TABLE: Dict[str, Any] = {
"uint32": ctypes.c_uint32,
"float32": ctypes.c_float,
"float64": ctypes.c_double,
}

_NUMPY_TYPE_TABLE: Dict[str, npt.DTypeLike] = {
"uint32": np.uint32,
"float32": np.float32,
"float64": np.float64,
}


def type_info_to_ctypes_type(type_info: str) -> Any:
"""Obtain ctypes type corresponding to a given TypeInfo"""
return _CTYPES_TYPE_TABLE[type_info]


def type_info_to_numpy_type(type_info: str) -> npt.DTypeLike:
"""Obtain NumPy type corresponding to a given TypeInfo"""
return _NUMPY_TYPE_TABLE[type_info]


def numpy_type_to_type_info(type_info: npt.DTypeLike) -> str:
"""Obtain TypeInfo corresponding to a given NumPy type"""
if type_info == np.uint32:
return "uint32"
if type_info == np.float32:
return "float32"
if type_info == np.float64:
return "float64"
raise ValueError(f"Unrecognized NumPy type: {type_info}")
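The three helpers form a small round trip between TL2cgen type strings, NumPy dtypes, and ctypes types; the sketch below mirrors how `DMatrix` uses them when preparing buffers for the C API.

```python
import ctypes

import numpy as np

from tl2cgen.dtypes import (
    numpy_type_to_type_info,
    type_info_to_ctypes_type,
    type_info_to_numpy_type,
)

# Type string -> NumPy dtype -> back to the type string.
np_type = type_info_to_numpy_type("float32")   # numpy.float32
assert numpy_type_to_type_info(np_type) == "float32"

# The type string also selects the ctypes element type used to build the
# pointer passed to the C API, as in DMatrix._init_from_npy2d().
c_type = type_info_to_ctypes_type("float32")   # ctypes.c_float
data = np.zeros(4, dtype=np_type)
data_ptr = data.ctypes.data_as(ctypes.POINTER(c_type))

# Unsupported dtypes raise ValueError.
try:
    numpy_type_to_type_info(np.int64)
except ValueError as err:
    print(err)  # Unrecognized NumPy type: <class 'numpy.int64'>
```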
43 changes: 33 additions & 10 deletions python/tl2cgen/handle_class.py
Expand Up @@ -6,6 +6,7 @@

import treelite

from .data import DMatrix
from .libloader import _LIB, _check_call
from .util import c_str

@@ -35,6 +36,37 @@ def __del__(self):
_check_call(_LIB.TL2cgenFreeTreeliteModel(self.handle))


class _Annotator:
"""Annotator object"""

def __init__(
self,
model: _TreeliteModel,
dmat: DMatrix,
nthread: int,
verbose: bool = False,
):
self.handle = ctypes.c_void_p()
_check_call(
_LIB.TL2cgenAnnotateBranch(
model.handle,
dmat.handle,
ctypes.c_int(nthread),
ctypes.c_int(1 if verbose else 0),
ctypes.byref(self.handle),
)
)

def save(self, path: Union[str, pathlib.Path]):
"""Save annotation data to a JSON file"""
path = pathlib.Path(path).expanduser().resolve()
_check_call(_LIB.TL2cgenAnnotationSave(self.handle, c_str(str(path))))

def __del__(self):
if self.handle:
_check_call(_LIB.TL2cgenAnnotationFree(self.handle))


class _Compiler:
"""Compiler object"""

@@ -57,16 +89,7 @@ def __init__(
)

def compile(self, model: _TreeliteModel, dirpath: Union[str, pathlib.Path]) -> None:
"""
Generate prediction code
Parameters
----------
model :
Model to convert to C code
dirpath :
Directory to store header and source files
"""
"""Generate prediction code"""
dirpath = pathlib.Path(dirpath).expanduser().resolve()
_check_call(
_LIB.TL2cgenCompilerGenerateCode(
1 change: 1 addition & 0 deletions src/c_api/c_api_error.cc
@@ -4,6 +4,7 @@
* \author Hyunsu Cho
* \brief C error handling
*/
#include <tl2cgen/c_api.h>
#include <tl2cgen/c_api_error.h>
#include <tl2cgen/thread_local.h>
#include <tl2cgen/version.h>
