Python* API Reference for Intel® Data Analytics Acceleration Library 2020 Update 1

kmeans_init_dense_batch.py

1 # file: kmeans_init_dense_batch.py
2 #===============================================================================
3 # Copyright 2014-2020 Intel Corporation
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #===============================================================================
17 
18 #
19 # ! Content:
20 # ! Python example of dense K-Means clustering with different initialization methods
21 # ! in the batch processing mode
22 # !*****************************************************************************
23 
24 #
25 
26 
27 #
28 
29 import os
30 import numpy as np
31 from daal.algorithms import kmeans
32 import daal.algorithms.kmeans.init
33 from daal.data_management import HomogenNumericTable, FileDataSource, DataSource, BlockDescriptor, readOnly
34 
# Directory holding the example data sets, relative to the working directory
DAAL_PREFIX = os.path.join('..', 'data')

# CSV file with the dense input data set used by this example
datasetFileName = os.path.join(DAAL_PREFIX, 'batch', 'kmeans_init_dense.csv')

# Parameters of the K-Means algorithm
nMaxIterations = 1000       # passed to kmeans.Batch as the iteration limit
cAccuracyThreshold = 0.01   # assigned to the algorithm's accuracyThreshold parameter
nClusters = 20              # number of clusters requested
43 
def getSingleValue(pTbl, ntype):
    """Return the first element of the first row of a numeric table.

    Args:
        pTbl: Table object exposing ``getBlockOfRows`` and
            ``releaseBlockOfRows``.
        ntype: Element type handed to ``BlockDescriptor`` for the read.

    Returns:
        The element at flat index 0 of the one-row block read from ``pTbl``.
    """
    block = BlockDescriptor(ntype=ntype)
    # Request exactly one row, starting at row 0, in read-only mode.
    pTbl.getBlockOfRows(0, 1, readOnly, block)
    try:
        # The value is extracted before the block is handed back; flatten()
        # yields a copy (assuming getArray() returns a NumPy array -- confirm).
        return block.getArray().flatten()[0]
    finally:
        # Always give the block back, even if reading the array failed.
        pTbl.releaseBlockOfRows(block)
50 
51 
def runKmeans(inputData, nClusters, method, methodName, oversamplingFactor=-1.0):
    """Initialize centroids with ``method``, run K-Means, and print a report.

    Args:
        inputData: Table passed as the data input of both the initialization
            and the clustering algorithm.
        nClusters: Number of clusters given to both algorithm objects.
        method: Initialization method forwarded to ``kmeans.init.Batch``.
        methodName: Text used for the method in the printed report.
        oversamplingFactor: Applied to the initialization parameters only
            when it is greater than zero.

    The iteration limit and accuracy threshold are read from the module-level
    ``nMaxIterations`` and ``cAccuracyThreshold``. Nothing is returned; the
    parameters and results are written with ``print``.
    """
    # Step 1: compute the initial centroids
    initAlgorithm = kmeans.init.Batch(nClusters, fptype=np.float32, method=method)
    initAlgorithm.input.set(kmeans.init.data, inputData)
    if oversamplingFactor > 0:
        initAlgorithm.parameter.oversamplingFactor = oversamplingFactor

    initReport = ["K-means init parameters: method = ", methodName]
    if method == kmeans.init.parallelPlusDense:
        # Only this method gets the extra parameter details in its report
        initReport.extend([
            ", oversamplingFactor = ",
            str(initAlgorithm.parameter.oversamplingFactor),
            ", nRounds = ",
            str(initAlgorithm.parameter.nRounds),
        ])
    print("".join(initReport))

    initialCentroids = initAlgorithm.compute().get(kmeans.init.centroids)

    # Step 2: run the clustering itself, seeded with those centroids
    clustering = kmeans.Batch(nClusters, nMaxIterations)
    clustering.input.set(kmeans.data, inputData)
    clustering.input.set(kmeans.inputCentroids, initialCentroids)
    clustering.parameter.accuracyThreshold = cAccuracyThreshold
    print("".join([
        "K-means algorithm parameters: maxIterations = ",
        str(clustering.parameter.maxIterations),
        ", accuracyThreshold = ",
        str(clustering.parameter.accuracyThreshold),
    ]))
    result = clustering.compute()

    # Step 3: report the objective function value and the iteration count
    objectiveValue = getSingleValue(result.get(kmeans.objectiveFunction), ntype=np.float32)
    iterationCount = getSingleValue(result.get(kmeans.nIterations), ntype=np.intc)
    print("".join([
        "K-means algorithm results: Objective function value = ",
        str(objectiveValue*1e-6),
        "*1E+6, number of iterations = ",
        str(iterationCount),
        "\n",
    ]))
81 
82 
if __name__ == "__main__":
    # Table that receives the data, and the CSV-backed source that fills it
    inputData = HomogenNumericTable(ntype=np.float32)
    dataSource = FileDataSource(
        datasetFileName,
        DataSource.notAllocateNumericTable,
        DataSource.doDictionaryFromContext,
    )

    # Read the contents of the input file into the table
    dataSource.loadDataBlock(inputData)

    # One entry per run: (method, printed name[, oversampling factor]).
    # Entries without a factor rely on runKmeans' default.
    initVariants = (
        (kmeans.init.deterministicDense, "deterministicDense"),
        (kmeans.init.randomDense, "randomDense"),
        (kmeans.init.plusPlusDense, "plusPlusDense"),
        (kmeans.init.parallelPlusDense, "parallelPlusDense", 0.5),
        (kmeans.init.parallelPlusDense, "parallelPlusDense", 2.0),
    )
    for variantArgs in initVariants:
        runKmeans(inputData, nClusters, *variantArgs)

For more complete information about compiler optimizations, see our Optimization Notice.