Support multi-runs and seeds in KMeans clustering (#321)

* Add multi-run K-Means with different seeds Signed-off-by: FedyuninV <valery.fedyunin@abbyy.com> * Use double for inertia Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com> * Use inertia measured during clustering (instead of separate external measuring after clustering) Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com> * Add RunCount and Seed to Python wrapper Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com> * Update docs and comments Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com> * Reduce diff size Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com> * Remove debug code Signed-off-by: Valeriy Fedyunin <valery.fedyunin@abbyy.com> Co-authored-by: Stanislav Angeliuk <59917951+SAngeliuk@users.noreply.github.com>
neoml-lib · May 25, 2021 · 93ad8d7 · 93ad8d7
1 parent 396135f
commit 93ad8d7
Show file tree

Hide file tree

Showing 6 changed files with 173 additions and 68 deletions.
diff --git a/NeoML/Python/neoml/Clustering.py b/NeoML/Python/neoml/Clustering.py
@@ -242,10 +242,19 @@ class KMeans(PythonWrapper.KMeans) :
 
     :param distance: the distance function.
     :type distance: str, {'euclid', 'machalanobis', 'cosine'}, default='euclid'
+
+    :param thread_count: number of threads
+    :type thread_count: int, > 0, default=1
+
+    :param run_count: number of runs, the result is the best of the runs (based on inertia)
+    :type run_count: int, > 0, default=1
+
+    :param seed: the initial seed for random
+    :type seed: int, default=3306
     """
 
     def __init__(self, max_iteration_count, cluster_count, algo='lloyd', init='default', distance='euclid',
-                 thread_count=1):
+                 thread_count=1, run_count=1, seed=3306):
         if algo != 'elkan' and algo != 'lloyd':
             raise ValueError('The `algo` must be one of {`elkan`, `lloyd`}.')
         if init != 'k++' and init != 'default':
@@ -257,8 +266,13 @@ def __init__(self, max_iteration_count, cluster_count, algo='lloyd', init='defau
         if cluster_count <= 0:
             raise ValueError('The `cluster_count` must be > 0.')
         if thread_count <= 0:
-            raise ValueError('The `thread_count` must be < 0')
-        super().__init__(algo, init, distance, int(max_iteration_count), int(cluster_count), int(thread_count))
+            raise ValueError('The `thread_count` must be > 0')
+        if run_count <= 0:
+            raise ValueError('The `run_count` must be > 0')
+        if not isinstance(seed, int):
+            raise ValueError('The `seed` must be integer')
+        super().__init__(algo, init, distance, int(max_iteration_count), int(cluster_count), int(thread_count),
+            int(run_count), int(seed))
 
     def clusterize(self, X, weight=None):
         """Performs clustering of the given data.

diff --git a/NeoML/Python/src/PyClustering.cpp b/NeoML/Python/src/PyClustering.cpp
@@ -199,7 +199,7 @@ void InitializeClustering(py::module& m)
 	py::class_<CPyKMeans>(m, "KMeans")
 		.def( py::init(
 			[]( const std::string& algo, const std::string& init, const std::string& distance,
-				int max_iteration_count, int cluster_count, int thread_count )
+				int max_iteration_count, int cluster_count, int thread_count, int run_count, int seed )
 			{
 				CKMeansClustering::CParam p;
 
@@ -227,6 +227,8 @@ void InitializeClustering(py::module& m)
 				p.InitialClustersCount = cluster_count;
 				p.MaxIterations = max_iteration_count;
 				p.ThreadCount = thread_count;
+				p.RunCount = run_count;
+				p.Seed = seed;
 				return new CPyKMeans( p );
 			})
 		)

diff --git a/NeoML/docs/en/API/Clustering/kMeans.md b/NeoML/docs/en/API/Clustering/kMeans.md
@@ -26,6 +26,9 @@ The clustering parameters are described by the `CKMeansClustering::CParam` struc
 - *Initialization* - the initialization algorithm
 - *MaxIterations* — the maximum number of algorithm iterations
 - *Tolerance* - tolerance for stop criteria of Elkan algorithm
+- *ThreadCount* - number of threads used during calculations
+- *RunCount* - number of runs of the algorithm (the result with the least inertia will be returned)
+- *Seed* - the initial seed for the random number generator
 
 ## Sample
 

diff --git a/NeoML/docs/ru/API/Clustering/kMeans.md b/NeoML/docs/ru/API/Clustering/kMeans.md
@@ -26,7 +26,10 @@
 - *InitialClustersCount* — начальное количество кластеров: при создании кластеризатора вы можете передать в конструктор массив длины *InitialClustersCount* с центрами кластеров, которые должны использоваться на первой итерации алгоритма; в противном случае на первой итерации в качестве центров будут взяты случайные элементы входных данных;
 - *Initialization* - используемый алгоритм инициализации;
 - *MaxIterations* — максимальное количество итераций алгоритма;
-- *Tolerance* - критерий остановки для алгоритма Elkan
+- *Tolerance* - критерий остановки для алгоритма Elkan;
+- *ThreadCount* - количество потоков, используемых во время работы алгоритма;
+- *RunCount* - количество запусков алгоритма, в итоге будет возвращен результат с наименьшей инерцией кластеров;
+- *Seed* - `seed` для генерации случайных чисел.
 
 ## Пример
 

diff --git a/NeoML/include/NeoML/TraditionalML/KMeansClustering.h b/NeoML/include/NeoML/TraditionalML/KMeansClustering.h
@@ -69,9 +69,14 @@ class NEOML_API CKMeansClustering : public IClustering {
 		double Tolerance;
 		// Number of threads used in KMeans
 		int ThreadCount;
+		// Number of runs of algorithm
+		// If more than one then the best variant (least inertia) will be returned
+		int RunCount;
+		// Initial seed for random
+		int Seed;
 
 		CParam() : Algo( KMA_Lloyd ), DistanceFunc( DF_Euclid ), InitialClustersCount( 1 ), Initialization( KMI_Default ),
-			MaxIterations( 1 ), Tolerance( 1e-5f ), ThreadCount( 1 )
+			MaxIterations( 1 ), Tolerance( 1e-5f ), ThreadCount( 1 ), RunCount( 1 ), Seed( 0xCEA )
 		{
 		}
 	};
@@ -99,24 +104,27 @@ class NEOML_API CKMeansClustering : public IClustering {
 	CObjectArray<CCommonCluster> clusters; // the current clusters
 	CArray<CClusterCenter> initialClusterCenters; // the initial cluster centers
 
+	// Single run of clusterization with given seed
+	bool runClusterization( IClusteringData* input, int seed, CClusteringResult& result, double& inertia );
+
 	// Initial cluster selection for sparse data
-	void selectInitialClusters( const CFloatMatrixDesc& matrix );
-	void defaultInitialization( const CFloatMatrixDesc& matrix );
-	void kMeansPlusPlusInitialization( const CFloatMatrixDesc& matrix );
+	void selectInitialClusters( const CFloatMatrixDesc& matrix, int seed );
+	void defaultInitialization( const CFloatMatrixDesc& matrix, int seed );
+	void kMeansPlusPlusInitialization( const CFloatMatrixDesc& matrix, int seed );
 
 	// Sparse data clusterization
-	bool clusterize( const CFloatMatrixDesc& matrix, const CArray<double>& weights );
+	bool clusterize( const CFloatMatrixDesc& matrix, const CArray<double>& weights, double& inertia );
 
 	// Lloyd algorithm implementation for sparse data
-	bool lloydClusterization( const CFloatMatrixDesc& matrix, const CArray<double>& weights );
-	void classifyAllData( const CFloatMatrixDesc& matrix, CArray<int>& dataCluster );
-	int findNearestCluster( const CFloatMatrixDesc& matrix, int dataIndex ) const;
+	bool lloydClusterization( const CFloatMatrixDesc& matrix, const CArray<double>& weights, double& inertia );
+	void classifyAllData( const CFloatMatrixDesc& matrix, CArray<int>& dataCluster, double& inertia );
+	int findNearestCluster( const CFloatMatrixDesc& matrix, int dataIndex, double& inertia ) const;
 	void storeClusterCenters( CArray<CClusterCenter>& result );
 	bool updateClusters( const CFloatMatrixDesc& matrix, const CArray<double>& weights,
 		const CArray<int>& dataCluster, const CArray<CClusterCenter>& oldCenters );
 
 	// Elkan algorithm implementation for sparse data
-	bool elkanClusterization( const CFloatMatrixDesc& matrix, const CArray<double>& weights );
+	bool elkanClusterization( const CFloatMatrixDesc& matrix, const CArray<double>& weights, double& inertia );
 	void initializeElkanStatistics( const CFloatMatrixDesc& matrix, CArray<int>& assignments,
 		CArray<float>& upperBounds, CVariableMatrix<float>& lowerBounds, CVariableMatrix<float>& clusterDists,
 		CArray<float>& closestClusterDist, CArray<float>& moveDistance );
@@ -132,14 +140,14 @@ class NEOML_API CKMeansClustering : public IClustering {
 		const CVariableMatrix<float>& clusterDists, int currentCluster, int clusterToProcess, int id) const;
 
 	// Specific case for dense data with Euclidean metrics and Lloyd algorithm
-	bool denseLloydL2Clusterize( IClusteringData* rawData, CClusteringResult& result );
+	bool denseLloydL2Clusterize( IClusteringData* rawData, int seed, CClusteringResult& result, double& inertia );
 	// Initial cluster selection
-	void selectInitialClusters( const CDnnBlob& data, CDnnBlob& centers );
-	void defaultInitialization( const CDnnBlob& data, CDnnBlob& centers );
-	void kMeansPlusPlusInitialization( const CDnnBlob& data, CDnnBlob& centers );
+	void selectInitialClusters( const CDnnBlob& data, int seed, CDnnBlob& centers );
+	void defaultInitialization( const CDnnBlob& data, int seed, CDnnBlob& centers );
+	void kMeansPlusPlusInitialization( const CDnnBlob& data, int seed, CDnnBlob& centers );
 	// Lloyd algorithm implementation
 	bool lloydBlobClusterization( const CDnnBlob& data, const CDnnBlob& weight,
-		CDnnBlob& centers, CDnnBlob& sizes, CDnnBlob& labels );
+		CDnnBlob& centers, CDnnBlob& sizes, CDnnBlob& labels, double& inertia );
 	double assignClosest( const CDnnBlob& data, const CDnnBlob& squaredData, const CDnnBlob& weight,
 		const CDnnBlob& centers, CDnnBlob& labels );
 	void recalcCenters( const CDnnBlob& data, const CDnnBlob& weight, const CDnnBlob& labels,