Python* API Reference for Intel® Data Analytics Acceleration Library 2020 Update 1

datasource_featureextraction.py

1 # file: datasource_featureextraction.py
2 #===============================================================================
3 # Copyright 2014-2020 Intel Corporation
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #===============================================================================
17 
18 #
19 # ! Content:
20 # ! Python example of using data source feature extraction
21 # !*****************************************************************************
22 
23 #
24 
25 
26 #
27 import os
28 import sys
29 
30 from daal.data_management import FileDataSource, DataSourceIface, ColumnFilter, OneHotEncoder
31 
32 utils_folder = os.path.realpath(os.path.abspath(os.path.dirname(os.path.dirname(__file__))))
33 if utils_folder not in sys.path:
34  sys.path.insert(0, utils_folder)
35 from utils import printNumericTable
36 
37 
38 # Input data set parameters
39 datasetFileName = "../data/batch/kmeans_dense.csv"  # NOTE(review): relative path - presumably resolved against the current working directory, not this script's folder; confirm
40 
41 if __name__ == "__main__":
42 
43  # Initialize a FileDataSource to retrieve the input data from the .csv file (the doAllocateNumericTable flag presumably makes it allocate its own numeric table - confirm)
44  dataSource = FileDataSource(datasetFileName, DataSourceIface.doAllocateNumericTable)
45 
46  # Create the data source dictionary from the .csv file before any data is loaded
47  dataSource.createDictionaryFromContext()
48 
49  # Keep only the 3 chosen columns of the .csv file (indices 1, 2 and 5)
50  validList = [1, 2, 5]
51 
52  colFilter = ColumnFilter()
53  filterList = colFilter.list(validList)  # turn the index list into a filter object that can be registered as a modifier
54  dataSource.getFeatureManager().addModifier(filterList)
55 
56  # Treat the column with index 1 as categorical and convert it into 3 binary categorical features
57  dataSource.getFeatureManager().addModifier(OneHotEncoder(1, 3))  # NOTE(review): not visible here whether index 1 refers to the original or the filtered columns - confirm
58 
59  # Load the data from the .csv file; the modifiers registered above are added before this call
60  dataSource.loadDataBlock()
61 
62  # Print the loaded table
63  table = dataSource.getNumericTable()
64  printNumericTable(table, "Loaded data", 4, 20)  # 4 and 20 are presumably row/column print limits - confirm against utils.printNumericTable

For more complete information about compiler optimizations, see our Optimization Notice.