-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathsimpleSVC.py
More file actions
237 lines (198 loc) · 7.55 KB
/
Copy pathsimpleSVC.py
File metadata and controls
237 lines (198 loc) · 7.55 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/usr/bin/env python3
"""
@author:Harold
@file: simpleSVC.py
@time: 02/09/2019
"""
# Reference: https://github.com/josiahw/SimpleSVClustering/blob/master/SimpleSVC.py
# -*- coding: utf-8 -*-
"""
Created on Mon Jul 11 20:40:56 2016
@author: josiahw
"""
import numpy as numpy
def polyKernel(a, b, pwr):
return numpy.dot(a, b) ** pwr # numpy.dot(a,a) - numpy.dot(b,b) # -1 #
def rbfKernel(a, b, gamma):
return numpy.exp(-gamma * numpy.linalg.norm(a - b))
class SimpleSVClustering:
w = None
a = None
b = None
C = None
sv = None
kernel = None
kargs = ()
tolerance = None
verbose = False
def __init__(self,
C,
tolerance=0.001,
kernel=numpy.dot,
kargs=()
):
"""
The parameters are:
- C: SVC cost
- tolerance: gradient descent solution accuracy
- kernel: the kernel function do use as k(a, b, *kargs)
- kargs: extra parameters for the kernel
"""
self.C = C
self.kernel = kernel
self.tolerance = tolerance
self.kargs = kargs
def _checkClass(self, a, b, n_checks=5):
"""
This does a straight line interpolation between a and b, using n_checks number of segments.
It returns True if a and b are connected by a high probability region, false otherwise.
NOTE: authors originally suggested 20 segments but that is SLOOOOOW, so we use 5. In practice it is pretty good.
"""
for i in numpy.arange(1.0 / n_checks, 1.0, 1.0 / n_checks):
if self._predict(i * a + (1 - i) * b) > self.b:
return False
return True
# test = [bool(self._predict(i*a + (1-i)*b) <= self.b) for i in numpy.arange(1.0/n_checks,1.0,1.0/n_checks)]
# return not False in test
def _getAllClasses(self, X):
"""
Assign class labels to each vector based on connected graph components.
TODO: The outputs of this should really be saved in order to embed new points into the clusters.
"""
# 1: build the connected clusters
unvisited = [range(len(X))]
clusters = []
while len(unvisited):
# create a new cluster with the first unvisited node
c = [unvisited[0]]
unvisited.pop(0)
i = 0
while i < len(c) and len(unvisited):
# for all nodes in the cluster, add all connected unvisited nodes and remove them fromt he unvisited list
unvisitedNew = []
for j in unvisited:
(c if self._checkClass(X[c[i], :], X[j, :]) else unvisitedNew).append(j)
unvisited = unvisitedNew
i += 1
clusters.append(c)
# 3: group components by classification
classifications = numpy.zeros(len(X)) - 1
for i in range(len(clusters)):
for c in clusters[i]:
classifications[c] = i
return classifications
def fit(self, X):
"""
Fit to data X with labels y.
"""
"""
Construct the Q matrix for solving
"""
Q = numpy.zeros((len(data), len(data)))
for i in range(len(data)):
for j in range(i, len(data)):
Qval = 1.
Qval *= self.kernel(*(
(data[i, :], data[j, :])
+ self.kargs
))
Q[i, j] = Q[j, i] = Qval
"""
Solve for a and w simultaneously by coordinate descent.
This means no quadratic solver is needed!
The support vectors correspond to non-zero values in a.
"""
self.w = numpy.zeros(X.shape[1])
self.a = numpy.zeros(X.shape[0])
delta = 10000000000.0
while delta > self.tolerance:
delta = 0.
for i in range(len(data)):
g = numpy.dot(Q[i, :], self.a) - Q[i, i]
adelta = self.a[i] - min(max(self.a[i] - g / Q[i, i], 0.0), self.C)
self.w += adelta * X[i, :]
delta += abs(adelta)
self.a[i] -= adelta
if self.verbose:
print("Descent step magnitude:", delta)
# get the data for support vectors
print(Q.shape)
Qshrunk = Q[self.a >= self.C / 100., :][:, self.a >= self.C / 100.]
print(Qshrunk.shape)
self.sv = X[self.a >= self.C / 100., :]
print(self.sv.shape)
self.a = (self.a)[self.a >= self.C / 100.]
print(self.a.shape)
# Do an all-pairs contour check
# calculate the contribution of all SVs
for i in range(len(self.a)):
for j in range(len(self.a)):
Qshrunk[i, j] *= self.a[i] * self.a[j]
# this is needed for radius calculation apparently
self.bOffset = numpy.sum(numpy.sum(Qshrunk))
if self.verbose:
print("Number of support vectors:", len(self.a))
"""
Select support vectors and solve for b to get the final classifier
"""
self.b = numpy.mean(self._predict(self.sv))
if self.verbose:
print("Bias value:", self.b)
def _predict(self, X):
"""
For SVClustering, we need to calculate radius rather than bias.
"""
if (len(X.shape) < 2):
X = X.reshape((1, -1))
clss = numpy.zeros(len(X))
for i in range(len(X)):
clss[i] += self.kernel(*((X[i, :], X[i, :]) + self.kargs))
for j in range(len(self.sv)):
clss[i] -= 2 * self.a[j] * self.kernel(*((self.sv[j, :], X[i, :]) + self.kargs))
return (clss + self.bOffset) ** 0.5
def predict(self, X):
"""
Predict classes for data X.
NOTE: this should really be done with either the fitting data or a superset of the fitting data.
"""
return self._getAllClasses(X)
if __name__ == '__main__':
import sklearn.datasets
data, labels = sklearn.datasets.make_moons(400, noise=0.01, random_state=0)
data -= numpy.mean(data, axis=0)
dl = numpy.concatenate((data.reshape(400, 2), labels.reshape(400, 1)), axis=1)
numpy.savetxt("data.txt", dl, delimiter=' ', fmt="%f")
# parameters can be sensitive, these ones work for two moons
C = 0.1
clss = SimpleSVClustering(C, 1e-10, rbfKernel, (3.5,))
clss.fit(data)
# check assigned classes for the two moons as a classification error
t = clss.predict(data)
# print(t)
print("Error", numpy.sum((labels - t) ** 2) / float(len(data)))
from matplotlib import pyplot
# generate a heatmap and display classified clusters.
a = numpy.zeros((100, 100))
for i in range(100):
for j in range(100):
a[j, i] = clss._predict(numpy.array([i * 4 / 100. - 2, j * 4 / 100. - 2]))
pyplot.imshow(a, cmap='hot', interpolation='nearest')
data *= 25.
data += 50.
numpy.savetxt("res.txt", data, delimiter=' ', fmt="%f")
# print(data[t == 1, 0], data[t == 1, 1])
pyplot.scatter(data[t == 0, 0], data[t == 0, 1], c='r')
pyplot.scatter(data[t == 1, 0], data[t == 1, 1], c='b')
# sklearn
from sklearn.svm import SVC
data, labels = sklearn.datasets.make_moons(400, noise=0.01, random_state=0)
clf = SVC(gamma='auto')
clf.fit(data, labels)
ct = clf.predict(data)
# print(ct)
print("Error sklearn", numpy.sum((labels - ct) ** 2) / float(len(data)))
data *= 25.
data += 50.
pyplot.scatter(data[ct == 0, 0], data[ct == 0, 1], c='r')
pyplot.scatter(data[ct == 1, 0], data[ct == 1, 1], c='b')
pyplot.show()