Python* API Reference for Intel® Data Analytics Acceleration Library 2020 Update 1

df_cls_traverse_model.py

1 # file: df_cls_traverse_model.py
2 #===============================================================================
3 # Copyright 2014-2020 Intel Corporation
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #===============================================================================
17 
18 #
19 # ! Content:
20 # ! Python example of decision forest classification model traversal.
21 # !
22 # ! The program trains the decision forest classification model on a training
23 # ! data set and prints the trained model by traversing it depth-first.
24 # !*****************************************************************************
25 
26 #
27 
28 
29 #
30 from __future__ import print_function
31 
32 from daal.algorithms import classifier
33 from daal.algorithms import decision_forest
34 import daal.algorithms.decision_forest.classification
35 import daal.algorithms.decision_forest.classification.training
36 
37 from daal.data_management import (
38  FileDataSource, HomogenNumericTable, MergedNumericTable, NumericTableIface, DataSourceIface, features
39 )
40 
# Training data set: CSV location, feature count and categorical columns
trainDatasetFileName = "../data/batch/df_classification_train.csv"
nFeatures = 3  # Number of features in training and testing data sets
categoricalFeaturesIndices = [2]  # Features flagged as categorical after loading

# Number of classes
nClasses = 5

# Decision forest parameters
nTrees = 2
maxTreeDepth = 15
minObservationsInLeafNode = 8
52 
53 
def trainModel():
    """Train the decision forest classification model and return the result.

    Reads the training set named by ``trainDatasetFileName``, configures a
    batch training algorithm from the module-level parameters and returns
    whatever the algorithm's ``compute()`` call produces.
    """
    # Numeric tables with the training features and the dependent variable
    xTrain, yTrain = loadData(trainDatasetFileName)

    # Batch trainer for a classification problem with nClasses classes
    trainer = decision_forest.classification.training.Batch(nClasses)

    # Forest configuration, taken from the module-level constants
    trainer.parameter.nTrees = nTrees
    trainer.parameter.featuresPerNode = nFeatures
    trainer.parameter.minObservationsInLeafNode = minObservationsInLeafNode
    trainer.parameter.maxTreeDepth = maxTreeDepth

    # Hand the training data set and its labels to the algorithm
    trainer.input.set(classifier.training.data, xTrain)
    trainer.input.set(classifier.training.labels, yTrain)

    # Build the model; the caller extracts it from the returned result
    return trainer.compute()
73 
74 
def loadData(fileName):
    """Load a CSV file into a feature table and a dependent-variable table.

    Args:
        fileName: Path of the .csv file to read.

    Returns:
        A ``(data, dependentVar)`` tuple of numeric tables: ``data`` is created
        with ``nFeatures`` columns and ``dependentVar`` with one column. Both
        are filled through a merged table that combines them.
        NOTE(review): presumably the first ``nFeatures`` CSV columns land in
        ``data`` and the last column in ``dependentVar`` -- confirm against the
        MergedNumericTable documentation.
    """
    # Initialize FileDataSource<CSVFeatureManager> to retrieve the input data from a .csv file
    dataSource = FileDataSource(
        fileName, DataSourceIface.notAllocateNumericTable, DataSourceIface.doDictionaryFromContext
    )

    # Create unallocated numeric tables for the features and the dependent
    # variable, and a merged view so one load call fills both of them
    data = HomogenNumericTable(nFeatures, 0, NumericTableIface.notAllocate)
    dependentVar = HomogenNumericTable(1, 0, NumericTableIface.notAllocate)
    mergedData = MergedNumericTable(data, dependentVar)

    # Retrieve the data from the input file
    dataSource.loadDataBlock(mergedData)

    # Flag the configured feature columns as categorical in the data dictionary
    dictionary = data.getDictionary()
    for featureIndex in categoricalFeaturesIndices:
        dictionary[featureIndex].featureType = features.DAAL_CATEGORICAL

    return data, dependentVar
95 
96 
# Visitor implementing the tree node visitor interface: the model traversal
# method calls back into it, and every callback prints the visited node.
class PrintNodeVisitor(classifier.TreeNodeVisitor):

    def __init__(self):
        super(PrintNodeVisitor, self).__init__()

    def onLeafNode(self, level, response):
        """Print a leaf node, indented by one space per tree level.

        Always returns True (presumably telling the traversal to continue --
        confirm against the DAAL visitor documentation).
        """
        indent = " " * level
        description = "Level {}, leaf node. Response value = {}".format(level, response)
        print(indent + description)
        return True

    def onSplitNode(self, level, featureIndex, featureValue):
        """Print a split node, indented by one space per tree level.

        Always returns True (presumably telling the traversal to continue --
        confirm against the DAAL visitor documentation).
        """
        indent = " " * level
        description = "Level {}, split node. Feature index = {}, feature value = {:.6g}".format(
            level, featureIndex, featureValue
        )
        print(indent + description)
        return True
116 
117 
def printModel(m):
    """Print the number of trees in model ``m`` and every tree's nodes.

    Each tree is walked with the model's ``traverseDF`` method, which reports
    the nodes to a ``PrintNodeVisitor`` that prints them.
    """
    treeCount = m.getNumberOfTrees()
    print("Number of trees: {}".format(treeCount))

    # A single visitor instance is shared by all trees
    nodePrinter = PrintNodeVisitor()
    for treeIndex in range(treeCount):
        print("Tree #{}".format(treeIndex))
        m.traverseDF(treeIndex, nodePrinter)
124 
125 
if __name__ == "__main__":
    # Train the forest, pull the model out of the training result and print it
    trainingResult = trainModel()
    trainedModel = trainingResult.get(classifier.training.model)
    printModel(trainedModel)

For more complete information about compiler optimizations, see our Optimization Notice.