Skip to content

Commit

Permalink
Support multi-runs and seeds in KMeans clustering (#321)
Browse files Browse the repository at this point in the history
* Add multi-run K-Means with different seeds

Signed-off-by: FedyuninV <valery.fedyunin@abbyy.com>

* Use double for inertia

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Use inertia measured during clustering (instead of separate external measuring after clustering)

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Add RunCount and Seed to Python wrapper

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Update docs and comments

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Reduce diff size

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

* Remove debug code

Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com>

Co-authored-by: Stanislav Angeliuk <59917951+SAngeliuk@users.noreply.github.com>
  • Loading branch information
Valeriy Fedyunin and SAngeliuk authored May 25, 2021
1 parent 396135f commit 93ad8d7
Show file tree
Hide file tree
Showing 6 changed files with 173 additions and 68 deletions.
20 changes: 17 additions & 3 deletions NeoML/Python/neoml/Clustering.py
Original file line number Diff line number Diff line change
Expand Up @@ -242,10 +242,19 @@ class KMeans(PythonWrapper.KMeans) :
:param distance: the distance function.
:type distance: str, {'euclid', 'machalanobis', 'cosine'}, default='euclid'
:param thread_count: number of threads
:type thread_count: int, > 0, default=1
:param run_count: number of runs, the result is the best of the runs (based on inertia)
:type run_count: int, > 0, default=1
:param seed: the initial seed for random
:type seed: int, default=3306
"""

def __init__(self, max_iteration_count, cluster_count, algo='lloyd', init='default', distance='euclid',
thread_count=1):
thread_count=1, run_count=1, seed=3306):
if algo != 'elkan' and algo != 'lloyd':
raise ValueError('The `algo` must be one of {`elkan`, `lloyd`}.')
if init != 'k++' and init != 'default':
Expand All @@ -257,8 +266,13 @@ def __init__(self, max_iteration_count, cluster_count, algo='lloyd', init='defau
if cluster_count <= 0:
raise ValueError('The `cluster_count` must be > 0.')
if thread_count <= 0:
raise ValueError('The `thread_count` must be < 0')
super().__init__(algo, init, distance, int(max_iteration_count), int(cluster_count), int(thread_count))
raise ValueError('The `thread_count` must be > 0')
if run_count <= 0:
raise ValueError('The `run_count` must be > 0')
if not isinstance(seed, int):
raise ValueError('The `seed` must be integer')
super().__init__(algo, init, distance, int(max_iteration_count), int(cluster_count), int(thread_count),
int(run_count), int(seed))

def clusterize(self, X, weight=None):
"""Performs clustering of the given data.
Expand Down
4 changes: 3 additions & 1 deletion NeoML/Python/src/PyClustering.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,7 @@ void InitializeClustering(py::module& m)
py::class_<CPyKMeans>(m, "KMeans")
.def( py::init(
[]( const std::string& algo, const std::string& init, const std::string& distance,
int max_iteration_count, int cluster_count, int thread_count )
int max_iteration_count, int cluster_count, int thread_count, int run_count, int seed )
{
CKMeansClustering::CParam p;

Expand Down Expand Up @@ -227,6 +227,8 @@ void InitializeClustering(py::module& m)
p.InitialClustersCount = cluster_count;
p.MaxIterations = max_iteration_count;
p.ThreadCount = thread_count;
p.RunCount = run_count;
p.Seed = seed;
return new CPyKMeans( p );
})
)
Expand Down
3 changes: 3 additions & 0 deletions NeoML/docs/en/API/Clustering/kMeans.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ The clustering parameters are described by the `CKMeansClustering::CParam` struc
- *Initialization* - the initialization algorithm
- *MaxIterations* — the maximum number of algorithm iterations
- *Tolerance* - tolerance for stop criteria of Elkan algorithm
- *ThreadCount* - number of threads used during calculations
- *RunCount* - number of runs of the algorithm (the result with the least inertia will be returned)
- *Seed* - the initial seed for the random number generator

## Sample

Expand Down
5 changes: 4 additions & 1 deletion NeoML/docs/ru/API/Clustering/kMeans.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,10 @@
- *InitialClustersCount* — начальное количество кластеров: при создании кластеризатора вы можете передать в конструктор массив длины *InitialClustersCount* с центрами кластеров, которые должны использоваться на первой итерации алгоритма; в противном случае на первой итерации в качестве центров будут взяты случайные элементы входных данных;
- *Initialization* - используемый алгоритм инициализации;
- *MaxIterations* — максимальное количество итераций алгоритма;
- *Tolerance* - критерий остановки для алгоритма Elkan
- *Tolerance* - критерий остановки для алгоритма Elkan;
- *ThreadCount* - количество потоков, используемых во время работы алгоритма;
- *RunCount* - количество запусков алгоритма, в итоге будет возвращен результат с наименьшей инерцией кластеров;
- *Seed* - `seed` для генерации случайных чисел.

## Пример

Expand Down
36 changes: 22 additions & 14 deletions NeoML/include/NeoML/TraditionalML/KMeansClustering.h
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,14 @@ class NEOML_API CKMeansClustering : public IClustering {
double Tolerance;
// Number of threads used in KMeans
int ThreadCount;
// Number of runs of algorithm
// If more than one, the best variant (least inertia) will be returned
int RunCount;
// Initial seed for random
int Seed;

CParam() : Algo( KMA_Lloyd ), DistanceFunc( DF_Euclid ), InitialClustersCount( 1 ), Initialization( KMI_Default ),
MaxIterations( 1 ), Tolerance( 1e-5f ), ThreadCount( 1 )
MaxIterations( 1 ), Tolerance( 1e-5f ), ThreadCount( 1 ), RunCount( 1 ), Seed( 0xCEA )
{
}
};
Expand Down Expand Up @@ -99,24 +104,27 @@ class NEOML_API CKMeansClustering : public IClustering {
CObjectArray<CCommonCluster> clusters; // the current clusters
CArray<CClusterCenter> initialClusterCenters; // the initial cluster centers

// Single run of clusterization with given seed
bool runClusterization( IClusteringData* input, int seed, CClusteringResult& result, double& inertia );

// Initial cluster selection for sparse data
void selectInitialClusters( const CFloatMatrixDesc& matrix );
void defaultInitialization( const CFloatMatrixDesc& matrix );
void kMeansPlusPlusInitialization( const CFloatMatrixDesc& matrix );
void selectInitialClusters( const CFloatMatrixDesc& matrix, int seed );
void defaultInitialization( const CFloatMatrixDesc& matrix, int seed );
void kMeansPlusPlusInitialization( const CFloatMatrixDesc& matrix, int seed );

// Sparse data clusterization
bool clusterize( const CFloatMatrixDesc& matrix, const CArray<double>& weights );
bool clusterize( const CFloatMatrixDesc& matrix, const CArray<double>& weights, double& inertia );

// Lloyd algorithm implementation for sparse data
bool lloydClusterization( const CFloatMatrixDesc& matrix, const CArray<double>& weights );
void classifyAllData( const CFloatMatrixDesc& matrix, CArray<int>& dataCluster );
int findNearestCluster( const CFloatMatrixDesc& matrix, int dataIndex ) const;
bool lloydClusterization( const CFloatMatrixDesc& matrix, const CArray<double>& weights, double& inertia );
void classifyAllData( const CFloatMatrixDesc& matrix, CArray<int>& dataCluster, double& inertia );
int findNearestCluster( const CFloatMatrixDesc& matrix, int dataIndex, double& inertia ) const;
void storeClusterCenters( CArray<CClusterCenter>& result );
bool updateClusters( const CFloatMatrixDesc& matrix, const CArray<double>& weights,
const CArray<int>& dataCluster, const CArray<CClusterCenter>& oldCenters );

// Elkan algorithm implementation for sparse data
bool elkanClusterization( const CFloatMatrixDesc& matrix, const CArray<double>& weights );
bool elkanClusterization( const CFloatMatrixDesc& matrix, const CArray<double>& weights, double& inertia );
void initializeElkanStatistics( const CFloatMatrixDesc& matrix, CArray<int>& assignments,
CArray<float>& upperBounds, CVariableMatrix<float>& lowerBounds, CVariableMatrix<float>& clusterDists,
CArray<float>& closestClusterDist, CArray<float>& moveDistance );
Expand All @@ -132,14 +140,14 @@ class NEOML_API CKMeansClustering : public IClustering {
const CVariableMatrix<float>& clusterDists, int currentCluster, int clusterToProcess, int id) const;

// Specific case for dense data with Euclidean metrics and Lloyd algorithm
bool denseLloydL2Clusterize( IClusteringData* rawData, CClusteringResult& result );
bool denseLloydL2Clusterize( IClusteringData* rawData, int seed, CClusteringResult& result, double& inertia );
// Initial cluster selection
void selectInitialClusters( const CDnnBlob& data, CDnnBlob& centers );
void defaultInitialization( const CDnnBlob& data, CDnnBlob& centers );
void kMeansPlusPlusInitialization( const CDnnBlob& data, CDnnBlob& centers );
void selectInitialClusters( const CDnnBlob& data, int seed, CDnnBlob& centers );
void defaultInitialization( const CDnnBlob& data, int seed, CDnnBlob& centers );
void kMeansPlusPlusInitialization( const CDnnBlob& data, int seed, CDnnBlob& centers );
// Lloyd algorithm implementation
bool lloydBlobClusterization( const CDnnBlob& data, const CDnnBlob& weight,
CDnnBlob& centers, CDnnBlob& sizes, CDnnBlob& labels );
CDnnBlob& centers, CDnnBlob& sizes, CDnnBlob& labels, double& inertia );
double assignClosest( const CDnnBlob& data, const CDnnBlob& squaredData, const CDnnBlob& weight,
const CDnnBlob& centers, CDnnBlob& labels );
void recalcCenters( const CDnnBlob& data, const CDnnBlob& weight, const CDnnBlob& labels,
Expand Down
Loading

0 comments on commit 93ad8d7

Please sign in to comment.