-
Notifications
You must be signed in to change notification settings - Fork 0
/
data_transformation.go
87 lines (73 loc) · 2.23 KB
/
data_transformation.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
package dataframe
import (
"math"
)
// TransformData applies a type-appropriate transformation to every column of
// the DataFrame in place. The dynamic type of a column's first element selects
// the transformation:
//
//   - int, int64, float64: the column's minimum and maximum are computed with
//     getMinMaxValues and the column is handed to scaleColumn.
//   - string: the column is handed to encodeColumn.
//
// Columns that are empty, missing from df.columns, or whose first element has
// any other type (including nil) are left untouched.
func (df *DataFrame) TransformData() {
	// Iterate over a snapshot of the header: encodeColumn calls AddColumn and
	// RemoveColumn, which presumably edit df.header, and ranging over a slice
	// whose backing array is being rewritten can skip or repeat columns.
	columnNames := append([]string(nil), df.header...)

	for _, columnName := range columnNames {
		columnData := df.columns[columnName]
		if len(columnData) == 0 {
			// Nothing to inspect; indexing element 0 would panic.
			continue
		}

		switch columnData[0].(type) {
		case int, int64, float64:
			minVal, maxVal := df.getMinMaxValues(columnData)
			df.scaleColumn(columnName, minVal, maxVal)
		case string:
			df.encodeColumn(columnName)
		}
	}
}
// getMinMaxValues returns the smallest and largest numeric values in column.
//
// Elements of dynamic type int, int64, and float64 are considered (integers are
// converted to float64); every other element, including nil, is ignored. NaN
// values never compare as smaller or larger and so do not affect the result.
//
// If column contains no numeric element the result is (+Inf, -Inf), so callers
// can detect the "no data" case by checking that the minimum exceeds the
// maximum.
func (df *DataFrame) getMinMaxValues(column []interface{}) (float64, float64) {
	minVal, maxVal := math.Inf(1), math.Inf(-1)
	for _, value := range column {
		var val float64
		switch v := value.(type) {
		case float64:
			val = v
		case int:
			val = float64(v)
		case int64:
			val = float64(v)
		default:
			// Not a supported numeric type; skip it.
			continue
		}
		if val < minVal {
			minVal = val
		}
		if val > maxVal {
			maxVal = val
		}
	}
	return minVal, maxVal
}
// scaleColumn rescales the numeric values of the named column to [0, 1] in
// place using min-max scaling: (value - minVal) / (maxVal - minVal).
//
// Elements of dynamic type int, int64, and float64 are replaced by their scaled
// float64 value; all other elements are left as they are. When minVal equals
// maxVal there is no spread to scale by, so numeric elements are set to 0
// instead of dividing by zero. If the range is unusable (NaN, infinite, or
// minVal > maxVal, as happens when no bounds could be determined) the column is
// left untouched.
func (df *DataFrame) scaleColumn(columnName string, minVal, maxVal float64) {
	span := maxVal - minVal
	if math.IsNaN(span) || math.IsInf(span, 0) || span < 0 {
		return
	}

	column := df.columns[columnName]
	for i, value := range column {
		var val float64
		switch v := value.(type) {
		case float64:
			val = v
		case int:
			val = float64(v)
		case int64:
			val = float64(v)
		default:
			// Not a supported numeric type; leave the element unchanged.
			continue
		}
		if span == 0 {
			// Constant column: avoid 0/0 producing NaN.
			column[i] = 0.0
			continue
		}
		column[i] = (val - minVal) / span
	}
}
// encodeColumn replaces the named categorical column with one-hot encoded
// columns.
//
// For every distinct string value v in the column, a new column named
// columnName + "_" + v is added through AddColumn; it holds the int 1 in rows
// where the original value equals v and the int 0 elsewhere. New columns are
// added in order of each value's first appearance, so the result is
// deterministic. Non-string elements (for example nil) do not form a category
// and therefore get 0 in every new column. Finally the original column is
// removed through RemoveColumn.
func (df *DataFrame) encodeColumn(columnName string) {
	column := df.columns[columnName]

	// Collect distinct categories in first-appearance order. Only strings are
	// considered: the new column name is built by string concatenation, and
	// using arbitrary interface values as map keys can panic on unhashable
	// types.
	seen := make(map[string]struct{})
	var categories []string
	for _, value := range column {
		s, ok := value.(string)
		if !ok {
			continue
		}
		if _, dup := seen[s]; dup {
			continue
		}
		seen[s] = struct{}{}
		categories = append(categories, s)
	}

	// Build one indicator column per category.
	for _, category := range categories {
		encodedValues := make([]interface{}, len(column))
		for i, value := range column {
			if s, ok := value.(string); ok && s == category {
				encodedValues[i] = 1
			} else {
				encodedValues[i] = 0
			}
		}
		df.AddColumn(columnName+"_"+category, encodedValues)
	}

	// The indicator columns supersede the original categorical column.
	df.RemoveColumn(columnName)
}