-
Notifications
You must be signed in to change notification settings - Fork 3
/
ta1-pipeline
56 lines (47 loc) · 2.14 KB
/
ta1-pipeline
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
#!/usr/bin/env python3
from os import path
from dsbox.datapreprocessing.profiler import Profiler, MyEncoder
import pandas as pd
import numpy as np
import json
import pprint
# Example for the documentation of the TA1 pipeline submission process
#
# It executes a TA1 pipeline using a ta1-pipeline-config.json file that follows this structure:
# {
# "problem_schema":"path/to/problem_schema.json",
# "dataset_schema":"path/to/dataset_schema.json",
# "data_root":"path/to/data/root/folder/",
# "output_file":"path/to/output/file"
# }
# Load the JSON configuration file. The ``with`` statement closes the file on
# exit, so no explicit close() call is needed.
with open("ta1-pipeline-config.json", 'r') as inputFile:
    jsonCall = json.load(inputFile)

# Locate the input files relative to the configured data_root folder
dataRoot = jsonCall['data_root']

# Read the training data and targets with dtype=object so the profiler sees
# the raw values instead of dtypes inferred by pandas. The test data is read
# with pandas' default dtype inference.
trainData = pd.read_csv(path.join(dataRoot, 'trainData.csv.gz'), dtype=object)
trainTargets = pd.read_csv(path.join(dataRoot, 'trainTargets.csv.gz'), dtype=object)
testData = pd.read_csv(path.join(dataRoot, 'testData.csv.gz'))

print(trainData.head())
print(trainTargets.head())

# Initialize the DSBox Profiler
prof = Profiler()

# Profile the first two columns of trainData and all of trainTargets, in both
# cases excluding d3mIndex. ``axis`` is passed by keyword: the positional form
# ``drop('d3mIndex', 1)`` is rejected by pandas >= 2.0.
trainData_profile = prof.produce(
    inputs=trainData.drop('d3mIndex', axis=1).iloc[:, :2])
trainTargets_profile = prof.produce(
    inputs=trainTargets.drop('d3mIndex', axis=1))

print("=== Profile for the first two columns in trainData, excluding d3mIndex ===")
print(json.dumps(trainData_profile, indent=4, cls=MyEncoder))
print("=== Profile for trainTargets, excluding d3mIndex ===")
print(json.dumps(trainTargets_profile, indent=4, cls=MyEncoder))

# Create a dummy prediction: every test row gets the first non-null training
# label. ``.iloc[0]`` selects by position; the label-based ``[0]`` raises
# KeyError whenever the first training target is null, because dropna()
# removes index label 0.
nonNullTargets = trainTargets['Class'].dropna()
if nonNullTargets.empty:
    raise ValueError("trainTargets.csv.gz contains no non-null 'Class' values")
random_sample = nonNullTargets.iloc[0]
testData['Class'] = random_sample
predictedTargets = testData['Class']

# Write the predicted targets to the location specified in the JSON
# configuration file. newline='' lets pandas control line endings, so no
# extra blank rows appear on platforms that translate '\n'.
# NOTE(review): the 'd3mIndex' column written here is testData's default row
# index (0..n-1), not a d3mIndex column read from testData.csv.gz -- confirm
# this is what the submission format expects.
with open(jsonCall['output_file'], 'w', newline='') as outputFile:
    pd.DataFrame(predictedTargets).to_csv(
        outputFile, index_label='d3mIndex', header=['Class'])