Python* API Reference for Intel® Data Analytics Acceleration Library 2020 Update 1

simple_csv_feature_modifiers.py

1 # file: simple_csv_feature_modifiers.py
2 #===============================================================================
3 # Copyright 2014-2020 Intel Corporation
4 #
5 # Licensed under the Apache License, Version 2.0 (the "License");
6 # you may not use this file except in compliance with the License.
7 # You may obtain a copy of the License at
8 #
9 # http://www.apache.org/licenses/LICENSE-2.0
10 #
11 # Unless required by applicable law or agreed to in writing, software
12 # distributed under the License is distributed on an "AS IS" BASIS,
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 # See the License for the specific language governing permissions and
15 # limitations under the License.
16 #===============================================================================
17 
18 # ! Content:
19 # ! Python example of modifiers usage with file data source
20 # !*****************************************************************************
21 
22 #
23 
24 
25 #
26 
27 from daal.data_management import FileDataSource, CsvDataSourceOptions, modifiers, features
28 from daal.data_management.modifiers import csv
29 
30 import os, sys
31 utils_folder = os.path.realpath(os.path.abspath(os.path.dirname(os.path.dirname(__file__))))
32 if utils_folder not in sys.path:
33  sys.path.insert(0, utils_folder)
34 from utils import printNumericTable
35 
# Relative path of the CSV file that every example below reads
csvFileName = "../data/batch/mixed_text_and_numbers.csv"

# Options shared by all data sources in this example; the three
# CsvDataSourceOptions flags are OR-ed together into a single options value
csvOptions = CsvDataSourceOptions(
    CsvDataSourceOptions.allocateNumericTable
    | CsvDataSourceOptions.createDictionaryFromContext
    | CsvDataSourceOptions.parseHeader
)
43 
44 # Read CSV using default data source behavior
def readDefault():
    """Load the CSV without any feature modifiers and print the resulting table.

    The data source is built from the module-level ``csvFileName`` and
    ``csvOptions``; nothing is registered on its feature manager.
    """
    source = FileDataSource(csvFileName, csvOptions)
    # With no modifiers registered, numeric columns are parsed as continuous
    # features and all remaining columns as categorical ones
    source.loadDataBlock()
    table = source.getNumericTable()
    printNumericTable(table, "readDefault function result:")
51 
52 
53 # Read CSV and do basic filtering using columns indices
def readOnlySpecifiedColumnIndices():
    """Load only the CSV columns selected by index and print the resulting table.

    The data source is built from the module-level ``csvFileName`` and
    ``csvOptions``. A single modifier is registered for the columns with
    indices 0, 1 and 5 before the data block is loaded.
    """
    ds = FileDataSource(csvFileName, csvOptions)
    # Columns with indices 0, 1, 5 will be included in the output numeric table
    # and the other columns will be ignored. The first argument of 'addModifier'
    # specifies the set of columns and the second one specifies the modifier.
    # Here we use the predefined automatic modifier, which decides by itself
    # how to parse each selected column in the best way.
    ds.getFeatureManager().addModifier([0, 1, 5], modifiers.csv.automatic())
    ds.loadDataBlock()
    printNumericTable(ds.getNumericTable(), "readOnlySpecifiedColumnIndices function result:")
64 
65 
66 # Read CSV and do basic filtering using columns names
def readOnlySpecifiedColumnNames():
    """Load only the CSV columns selected by header name and print the table.

    Works like the index-based variant, except that the columns handed to the
    automatic modifier are identified by name ("Numeric1" and "Categorical0").
    """
    source = FileDataSource(csvFileName, csvOptions)
    feature_manager = source.getFeatureManager()
    # Select columns by their names instead of their positions
    feature_manager.addModifier(
        ["Numeric1", "Categorical0"],
        modifiers.csv.automatic(),
    )
    source.loadDataBlock()
    table = source.getNumericTable()
    printNumericTable(table, "readOnlySpecifiedColumnNames function result:")
73 
74 
75 # Read CSV using multiple modifiers
def readUsingMultipleModifiers():
    """Load the CSV with two different modifiers registered and print the table.

    "Numeric1" is given the continuous modifier, while column index 6 and the
    column named "Categorical1" are given the categorical modifier.
    """
    source = FileDataSource(csvFileName, csvOptions)
    feature_manager = source.getFeatureManager()

    # First modifier: a single column, addressed by name
    feature_manager.addModifier(
        ["Numeric1"],
        modifiers.csv.continuous(),
    )
    # Second modifier: one column set may mix positions and names
    feature_manager.addModifier(
        [6, "Categorical1"],
        modifiers.csv.categorical(),
    )

    source.loadDataBlock()
    table = source.getNumericTable()
    printNumericTable(table, "readUsingMultipleModifiers function result:")
86 
87 
if __name__ == "__main__":
    # Run the examples in order: default behavior, filtering by column
    # indices, filtering by column names, and finally multiple modifiers
    for example in (
        readDefault,
        readOnlySpecifiedColumnIndices,
        readOnlySpecifiedColumnNames,
        readUsingMultipleModifiers,
    ):
        example()

For more complete information about compiler optimizations, see our Optimization Notice.