-
Notifications
You must be signed in to change notification settings - Fork 16
/
clara.py
163 lines (140 loc) · 4.28 KB
/
clara.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
# clara.py
# CLARA implemenation for pyCluster
# May 27, 2013
# http://daveti.blog.com
from util import importData, euclidean_distance, manhattan_distance, pearson_distance
from pam import kmedoids
import random
import sys
import time
# Global variables
debugEnabled = False # Debug
claraLoopNum = 5
distances_cache = {} # Save the tmp distance for acceleration (idxMedoid, idxData) -> distance
def averageCost(data, costF_idx, medoids_idx, cacheOn=False):
'''
Compute the average cost of medoids based on certain cost function and do the clustering
'''
# Init the cluster
size = len(data)
total_cost = {}
medoids = {}
for idx in medoids_idx:
medoids[idx] = []
total_cost[idx] = 0.0
# Compute the distance and do the clustering
for i in range(size):
choice = -1
# Make a big number
min_cost = float('inf')
for m in medoids:
if cacheOn == True:
# Check for cache
tmp = distances_cache.get((m,i), None)
if cacheOn == False or tmp == None:
if costF_idx == 0:
# euclidean_distance
tmp = euclidean_distance(data[m], data[i])
elif costF_idx == 1:
# manhattan_distance
tmp = manhattan_distance(data[m], data[i])
elif costF_idx == 2:
# pearson_distance
tmp = pearson_distance(data[m], data[i])
else:
print('Error: unknown cost function idx: ' % (costF_idx))
if cacheOn == True:
# Save the distance for acceleration
distances_cache[(m,i)] = tmp
# Clustering
if tmp < min_cost:
choice = m
min_cost = tmp
# Done the clustering
medoids[choice].append(i)
total_cost[choice] += min_cost
# Compute the average cost
avg_cost = 0.0
for idx in medoids_idx:
avg_cost += total_cost[idx] / len(medoids[idx])
# Return the average cost and clustering
return(avg_cost, medoids)
def clara(data, k):
'''
CLARA implemenation
1. For i = 1 to 5, repeat the following steps:
2. Draw a sample of 40 + 2k objects randomly from the
entire data set,2 and call Algorithm PAM to find
k medoids of the sample.
3. For each object Oj in the entire data set, determine
which of the k medoids is the most similar to Oj.
4. Calculate the average dissimilarity of the clustering
obtained in the previous step. If this value is less
than the current minimum, use this value as the
current minimum, and retain the k medoids found in
Step 2 as the best set of medoids obtained so far.
5. Return to Step 1 to start the next iteration.
'''
size = len(data)
min_avg_cost = float('inf')
best_choice = []
best_res = {}
for i in range(claraLoopNum):
# Construct the sampling subset
sampling_idx = random.sample([i for i in range(size)], (40+k*2))
sampling_data = []
for idx in sampling_idx:
sampling_data.append(data[idx])
# Run kmedoids for the sampling
pre_cost, pre_choice, pre_medoids = kmedoids(sampling_data, k)
if debugEnabled == True:
print('pre_cost: ', pre_cost)
print('pre_choice: ', pre_choice)
print('pre_medioids: ', pre_medoids)
# Convert the pre_choice from sampling_data to the whole data
pre_choice2 = []
for idx in pre_choice:
idx2 = data.index(sampling_data[idx])
pre_choice2.append(idx2)
if debugEnabled == True:
print('pre_choice2: ', pre_choice2)
# Clustering for all data set
tmp_avg_cost, tmp_medoids = averageCost(data, 0, pre_choice2)
if debugEnabled == True:
print('tmp_avg_cost: ', tmp_avg_cost)
print('tmp_medoids: ', tmp_medoids)
# Update the best
if tmp_avg_cost <= min_avg_cost:
min_avg_cost = tmp_avg_cost
best_choice = list(pre_choice2)
best_res = dict(tmp_medoids)
return(min_avg_cost, best_choice, best_res)
def main():
'''
Main function for PAM
'''
if len(sys.argv) != 3:
print('Error: invalid number of parameters')
return(1)
# Get the parameters
filePath = sys.argv[1]
k = int(sys.argv[2])
if debugEnabled == True:
print('filePath: ', filePath)
print('k: ', k)
# Run PAM for europe.txt
data = importData(filePath)
if debugEnabled == True:
for i in range(10):
print('data=', data[i])
# Add timeing
startTime = time.time()
best_cost, best_choice, best_medoids = clara(data, k)
endTime = time.time()
print('best_time: ', endTime - startTime)
print('best_cost: ', best_cost)
print('best_choice: ', best_choice)
print('best_medoids: ', best_medoids)
if __name__ == '__main__':
main()