tests/simpleSVC.py at master · RoomOfAnalysis/tests · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
#!/usr/bin/env python3
"""
@author:Harold
@file: simpleSVC.py
@time: 02/09/2019
"""

# Reference: https://github.com/josiahw/SimpleSVClustering/blob/master/SimpleSVC.py

# -*- coding: utf-8 -*-
"""
Created on Mon Jul 11 20:40:56 2016

@author: josiahw
"""
import numpy as numpy


def polyKernel(a, b, pwr):
    return numpy.dot(a, b) ** pwr  # numpy.dot(a,a) - numpy.dot(b,b) # -1 #


def rbfKernel(a, b, gamma):
    return numpy.exp(-gamma * numpy.linalg.norm(a - b))


class SimpleSVClustering:
    w = None
    a = None
    b = None
    C = None
    sv = None
    kernel = None
    kargs = ()
    tolerance = None
    verbose = False

    def __init__(self,
                 C,
                 tolerance=0.001,
                 kernel=numpy.dot,
                 kargs=()
                 ):
        """
        The parameters are:
         - C: SVC cost
         - tolerance: gradient descent solution accuracy
         - kernel: the kernel function do use as k(a, b, *kargs)
         - kargs: extra parameters for the kernel
        """
        self.C = C
        self.kernel = kernel
        self.tolerance = tolerance
        self.kargs = kargs

    def _checkClass(self, a, b, n_checks=5):
        """
        This does a straight line interpolation between a and b, using n_checks number of segments.
        It returns True if a and b are connected by a high probability region, false otherwise.
        NOTE: authors originally suggested 20 segments but that is SLOOOOOW, so we use 5. In practice it is pretty good.
        """
        for i in numpy.arange(1.0 / n_checks, 1.0, 1.0 / n_checks):
            if self._predict(i * a + (1 - i) * b) > self.b:
                return False
        return True
        # test = [bool(self._predict(i*a + (1-i)*b) <= self.b) for i in numpy.arange(1.0/n_checks,1.0,1.0/n_checks)]
        # return not False in test

    def _getAllClasses(self, X):
        """
        Assign class labels to each vector based on connected graph components.
        TODO: The outputs of this should really be saved in order to embed new points into the clusters.
        """

        # 1: build the connected clusters
        unvisited = [range(len(X))]
        clusters = []
        while len(unvisited):
            # create a new cluster with the first unvisited node
            c = [unvisited[0]]
            unvisited.pop(0)
            i = 0
            while i < len(c) and len(unvisited):
                # for all nodes in the cluster, add all connected unvisited nodes and remove them fromt he unvisited list
                unvisitedNew = []
                for j in unvisited:
                    (c if self._checkClass(X[c[i], :], X[j, :]) else unvisitedNew).append(j)
                unvisited = unvisitedNew
                i += 1
            clusters.append(c)

        # 3: group components by classification
        classifications = numpy.zeros(len(X)) - 1
        for i in range(len(clusters)):
            for c in clusters[i]:
                classifications[c] = i
        return classifications

    def fit(self, X):
        """
        Fit to data X with labels y.
        """

        """
        Construct the Q matrix for solving
        """
        Q = numpy.zeros((len(data), len(data)))
        for i in range(len(data)):
            for j in range(i, len(data)):
                Qval = 1.
                Qval *= self.kernel(*(
                        (data[i, :], data[j, :])
                        + self.kargs
                ))
                Q[i, j] = Q[j, i] = Qval

        """
        Solve for a and w simultaneously by coordinate descent.
        This means no quadratic solver is needed!
        The support vectors correspond to non-zero values in a.
        """
        self.w = numpy.zeros(X.shape[1])
        self.a = numpy.zeros(X.shape[0])
        delta = 10000000000.0
        while delta > self.tolerance:
            delta = 0.
            for i in range(len(data)):
                g = numpy.dot(Q[i, :], self.a) - Q[i, i]
                adelta = self.a[i] - min(max(self.a[i] - g / Q[i, i], 0.0), self.C)
                self.w += adelta * X[i, :]
                delta += abs(adelta)
                self.a[i] -= adelta
            if self.verbose:
                print("Descent step magnitude:", delta)

        # get the data for support vectors
        print(Q.shape)
        Qshrunk = Q[self.a >= self.C / 100., :][:, self.a >= self.C / 100.]
        print(Qshrunk.shape)
        self.sv = X[self.a >= self.C / 100., :]
        print(self.sv.shape)
        self.a = (self.a)[self.a >= self.C / 100.]
        print(self.a.shape)

        # Do an all-pairs contour check

        # calculate the contribution of all SVs
        for i in range(len(self.a)):
            for j in range(len(self.a)):
                Qshrunk[i, j] *= self.a[i] * self.a[j]

        # this is needed for radius calculation apparently
        self.bOffset = numpy.sum(numpy.sum(Qshrunk))
        if self.verbose:
            print("Number of support vectors:", len(self.a))

        """
        Select support vectors and solve for b to get the final classifier
        """
        self.b = numpy.mean(self._predict(self.sv))

        if self.verbose:
            print("Bias value:", self.b)

    def _predict(self, X):
        """
        For SVClustering, we need to calculate radius rather than bias.
        """
        if (len(X.shape) < 2):
            X = X.reshape((1, -1))
        clss = numpy.zeros(len(X))
        for i in range(len(X)):
            clss[i] += self.kernel(*((X[i, :], X[i, :]) + self.kargs))
            for j in range(len(self.sv)):
                clss[i] -= 2 * self.a[j] * self.kernel(*((self.sv[j, :], X[i, :]) + self.kargs))
        return (clss + self.bOffset) ** 0.5

    def predict(self, X):
        """
        Predict classes for data X.
        NOTE: this should really be done with either the fitting data or a superset of the fitting data.
        """

        return self._getAllClasses(X)


if __name__ == '__main__':
    import sklearn.datasets

    data, labels = sklearn.datasets.make_moons(400, noise=0.01, random_state=0)
    data -= numpy.mean(data, axis=0)

    dl = numpy.concatenate((data.reshape(400, 2), labels.reshape(400, 1)), axis=1)
    numpy.savetxt("data.txt", dl, delimiter=' ', fmt="%f")

    # parameters can be sensitive, these ones work for two moons
    C = 0.1
    clss = SimpleSVClustering(C, 1e-10, rbfKernel, (3.5,))
    clss.fit(data)

    # check assigned classes for the two moons as a classification error
    t = clss.predict(data)
    # print(t)
    print("Error", numpy.sum((labels - t) ** 2) / float(len(data)))

    from matplotlib import pyplot

    # generate a heatmap and display classified clusters.
    a = numpy.zeros((100, 100))
    for i in range(100):
        for j in range(100):
            a[j, i] = clss._predict(numpy.array([i * 4 / 100. - 2, j * 4 / 100. - 2]))
    pyplot.imshow(a, cmap='hot', interpolation='nearest')
    data *= 25.
    data += 50.

    numpy.savetxt("res.txt", data, delimiter=' ', fmt="%f")
    # print(data[t == 1, 0], data[t == 1, 1])

    pyplot.scatter(data[t == 0, 0], data[t == 0, 1], c='r')
    pyplot.scatter(data[t == 1, 0], data[t == 1, 1], c='b')

    # sklearn
    from sklearn.svm import SVC

    data, labels = sklearn.datasets.make_moons(400, noise=0.01, random_state=0)
    clf = SVC(gamma='auto')
    clf.fit(data, labels)
    ct = clf.predict(data)
    # print(ct)
    print("Error sklearn", numpy.sum((labels - ct) ** 2) / float(len(data)))
    data *= 25.
    data += 50.
    pyplot.scatter(data[ct == 0, 0], data[ct == 0, 1], c='r')
    pyplot.scatter(data[ct == 1, 0], data[ct == 1, 1], c='b')

    pyplot.show()