-
Notifications
You must be signed in to change notification settings - Fork 141
/
annotate.py
67 lines (59 loc) · 2.47 KB
/
annotate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os
import settings
import pandas as pd
def read():
acquisition = pd.read_csv(os.path.join(settings.PROCESSED_DIR, "Acquisition.txt"), sep="|")
return acquisition
def count_performance_rows():
counts = {}
with open(os.path.join(settings.PROCESSED_DIR, "Performance.txt"), 'r') as f:
for i, line in enumerate(f):
if i == 0:
# Skip header row
continue
loan_id, date = line.split("|")
loan_id = int(loan_id)
if loan_id not in counts:
counts[loan_id] = {
"foreclosure_status": False,
"performance_count": 0
}
counts[loan_id]["performance_count"] += 1
if len(date.strip()) > 0:
counts[loan_id]["foreclosure_status"] = True
return counts
def get_performance_summary_value(loan_id, key, performance_summary):
value = performance_summary.get(loan_id, {
"foreclosure_status": False,
"performance_count": 0
})
return value[key]
def annotate(acquisition, performance_summary):
acquisition["foreclosure_status"] = acquisition["id"].apply(lambda x: get_performance_summary_value(x, "foreclosure_status", performance_summary))
acquisition["performance_count"] = acquisition["id"].apply(lambda x: get_performance_summary_value(x, "performance_count", performance_summary))
for column in [
"channel",
"seller",
"first_time_homebuyer",
"loan_purpose",
"property_type",
"occupancy_status",
"property_state",
"product_type"
]:
acquisition[column] = acquisition[column].astype('category').cat.codes
for start in ["first_payment", "origination"]:
column = "{}_date".format(start)
acquisition["{}_year".format(start)] = pd.to_numeric(acquisition[column].str.split('/').str.get(1))
acquisition["{}_month".format(start)] = pd.to_numeric(acquisition[column].str.split('/').str.get(0))
del acquisition[column]
acquisition = acquisition.fillna(-1)
acquisition = acquisition[acquisition["performance_count"] > settings.MINIMUM_TRACKING_QUARTERS]
return acquisition
def write(acquisition):
acquisition.to_csv(os.path.join(settings.PROCESSED_DIR, "train.csv"), index=False)
if __name__ == "__main__":
acquisition = read()
performance_summary = count_performance_rows()
acquisition = annotate(acquisition, performance_summary)
write(acquisition)