Task. We use the Wine dataset, taking the first two principal components as features.

  • Train an SVC classifier for each kernel type (linear, rbf, poly).
  • Compute the accuracy (the fraction of correct predictions) on the training set.
  • Plot the classes for each classifier in the plane spanned by the two principal components.
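The cells below read the UCI wine.data file directly. If that file is not available, the same data ships with scikit-learn; a minimal sketch, assuming a version that provides sklearn.datasets.load_wine:

    from sklearn.datasets import load_wine

    # load_wine() returns the 13 numeric features and the class labels;
    # note that it encodes the classes as 0, 1, 2, while the UCI file uses 1, 2, 3.
    wine = load_wine()
    X_alt, y_alt = wine.data, wine.target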
In [1]:
import numpy as np
import pandas as pd
from sklearn import svm, metrics
import matplotlib.pyplot as plt
%matplotlib inline
In [2]:
attrs = """class_id
Alcohol
Malic acid
Ash
Alcalinity of ash
Magnesium
Total phenols
Flavanoids
Nonflavanoid phenols
Proanthocyanins
Color intensity
Hue
OD280/OD315 of diluted wines
Proline""".split('\n')
print(attrs)
['class_id', 'Alcohol', 'Malic acid', 'Ash', 'Alcalinity of ash', 'Magnesium', 'Total phenols', 'Flavanoids', 'Nonflavanoid phenols', 'Proanthocyanins', 'Color intensity', 'Hue', 'OD280/OD315 of diluted wines', 'Proline']
In [3]:
# wine.data has no header row, so the column names are supplied explicitly
wine_df = pd.read_csv("wine.data", sep=",", names=attrs)
wine_df
Out[3]:
class_id Alcohol Malic acid Ash Alcalinity of ash Magnesium Total phenols Flavanoids Nonflavanoid phenols Proanthocyanins Color intensity Hue OD280/OD315 of diluted wines Proline
0 1 14.23 1.71 2.43 15.6 127 2.80 3.06 0.28 2.29 5.640000 1.04 3.92 1065
1 1 13.20 1.78 2.14 11.2 100 2.65 2.76 0.26 1.28 4.380000 1.05 3.40 1050
2 1 13.16 2.36 2.67 18.6 101 2.80 3.24 0.30 2.81 5.680000 1.03 3.17 1185
3 1 14.37 1.95 2.50 16.8 113 3.85 3.49 0.24 2.18 7.800000 0.86 3.45 1480
4 1 13.24 2.59 2.87 21.0 118 2.80 2.69 0.39 1.82 4.320000 1.04 2.93 735
5 1 14.20 1.76 2.45 15.2 112 3.27 3.39 0.34 1.97 6.750000 1.05 2.85 1450
6 1 14.39 1.87 2.45 14.6 96 2.50 2.52 0.30 1.98 5.250000 1.02 3.58 1290
7 1 14.06 2.15 2.61 17.6 121 2.60 2.51 0.31 1.25 5.050000 1.06 3.58 1295
8 1 14.83 1.64 2.17 14.0 97 2.80 2.98 0.29 1.98 5.200000 1.08 2.85 1045
9 1 13.86 1.35 2.27 16.0 98 2.98 3.15 0.22 1.85 7.220000 1.01 3.55 1045
10 1 14.10 2.16 2.30 18.0 105 2.95 3.32 0.22 2.38 5.750000 1.25 3.17 1510
11 1 14.12 1.48 2.32 16.8 95 2.20 2.43 0.26 1.57 5.000000 1.17 2.82 1280
12 1 13.75 1.73 2.41 16.0 89 2.60 2.76 0.29 1.81 5.600000 1.15 2.90 1320
13 1 14.75 1.73 2.39 11.4 91 3.10 3.69 0.43 2.81 5.400000 1.25 2.73 1150
14 1 14.38 1.87 2.38 12.0 102 3.30 3.64 0.29 2.96 7.500000 1.20 3.00 1547
15 1 13.63 1.81 2.70 17.2 112 2.85 2.91 0.30 1.46 7.300000 1.28 2.88 1310
16 1 14.30 1.92 2.72 20.0 120 2.80 3.14 0.33 1.97 6.200000 1.07 2.65 1280
17 1 13.83 1.57 2.62 20.0 115 2.95 3.40 0.40 1.72 6.600000 1.13 2.57 1130
18 1 14.19 1.59 2.48 16.5 108 3.30 3.93 0.32 1.86 8.700000 1.23 2.82 1680
19 1 13.64 3.10 2.56 15.2 116 2.70 3.03 0.17 1.66 5.100000 0.96 3.36 845
20 1 14.06 1.63 2.28 16.0 126 3.00 3.17 0.24 2.10 5.650000 1.09 3.71 780
21 1 12.93 3.80 2.65 18.6 102 2.41 2.41 0.25 1.98 4.500000 1.03 3.52 770
22 1 13.71 1.86 2.36 16.6 101 2.61 2.88 0.27 1.69 3.800000 1.11 4.00 1035
23 1 12.85 1.60 2.52 17.8 95 2.48 2.37 0.26 1.46 3.930000 1.09 3.63 1015
24 1 13.50 1.81 2.61 20.0 96 2.53 2.61 0.28 1.66 3.520000 1.12 3.82 845
25 1 13.05 2.05 3.22 25.0 124 2.63 2.68 0.47 1.92 3.580000 1.13 3.20 830
26 1 13.39 1.77 2.62 16.1 93 2.85 2.94 0.34 1.45 4.800000 0.92 3.22 1195
27 1 13.30 1.72 2.14 17.0 94 2.40 2.19 0.27 1.35 3.950000 1.02 2.77 1285
28 1 13.87 1.90 2.80 19.4 107 2.95 2.97 0.37 1.76 4.500000 1.25 3.40 915
29 1 14.02 1.68 2.21 16.0 96 2.65 2.33 0.26 1.98 4.700000 1.04 3.59 1035
... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
148 3 13.32 3.24 2.38 21.5 92 1.93 0.76 0.45 1.25 8.420000 0.55 1.62 650
149 3 13.08 3.90 2.36 21.5 113 1.41 1.39 0.34 1.14 9.400000 0.57 1.33 550
150 3 13.50 3.12 2.62 24.0 123 1.40 1.57 0.22 1.25 8.600000 0.59 1.30 500
151 3 12.79 2.67 2.48 22.0 112 1.48 1.36 0.24 1.26 10.800000 0.48 1.47 480
152 3 13.11 1.90 2.75 25.5 116 2.20 1.28 0.26 1.56 7.100000 0.61 1.33 425
153 3 13.23 3.30 2.28 18.5 98 1.80 0.83 0.61 1.87 10.520000 0.56 1.51 675
154 3 12.58 1.29 2.10 20.0 103 1.48 0.58 0.53 1.40 7.600000 0.58 1.55 640
155 3 13.17 5.19 2.32 22.0 93 1.74 0.63 0.61 1.55 7.900000 0.60 1.48 725
156 3 13.84 4.12 2.38 19.5 89 1.80 0.83 0.48 1.56 9.010000 0.57 1.64 480
157 3 12.45 3.03 2.64 27.0 97 1.90 0.58 0.63 1.14 7.500000 0.67 1.73 880
158 3 14.34 1.68 2.70 25.0 98 2.80 1.31 0.53 2.70 13.000000 0.57 1.96 660
159 3 13.48 1.67 2.64 22.5 89 2.60 1.10 0.52 2.29 11.750000 0.57 1.78 620
160 3 12.36 3.83 2.38 21.0 88 2.30 0.92 0.50 1.04 7.650000 0.56 1.58 520
161 3 13.69 3.26 2.54 20.0 107 1.83 0.56 0.50 0.80 5.880000 0.96 1.82 680
162 3 12.85 3.27 2.58 22.0 106 1.65 0.60 0.60 0.96 5.580000 0.87 2.11 570
163 3 12.96 3.45 2.35 18.5 106 1.39 0.70 0.40 0.94 5.280000 0.68 1.75 675
164 3 13.78 2.76 2.30 22.0 90 1.35 0.68 0.41 1.03 9.580000 0.70 1.68 615
165 3 13.73 4.36 2.26 22.5 88 1.28 0.47 0.52 1.15 6.620000 0.78 1.75 520
166 3 13.45 3.70 2.60 23.0 111 1.70 0.92 0.43 1.46 10.680000 0.85 1.56 695
167 3 12.82 3.37 2.30 19.5 88 1.48 0.66 0.40 0.97 10.260000 0.72 1.75 685
168 3 13.58 2.58 2.69 24.5 105 1.55 0.84 0.39 1.54 8.660000 0.74 1.80 750
169 3 13.40 4.60 2.86 25.0 112 1.98 0.96 0.27 1.11 8.500000 0.67 1.92 630
170 3 12.20 3.03 2.32 19.0 96 1.25 0.49 0.40 0.73 5.500000 0.66 1.83 510
171 3 12.77 2.39 2.28 19.5 86 1.39 0.51 0.48 0.64 9.899999 0.57 1.63 470
172 3 14.16 2.51 2.48 20.0 91 1.68 0.70 0.44 1.24 9.700000 0.62 1.71 660
173 3 13.71 5.65 2.45 20.5 95 1.68 0.61 0.52 1.06 7.700000 0.64 1.74 740
174 3 13.40 3.91 2.48 23.0 102 1.80 0.75 0.43 1.41 7.300000 0.70 1.56 750
175 3 13.27 4.28 2.26 20.0 120 1.59 0.69 0.43 1.35 10.200000 0.59 1.56 835
176 3 13.17 2.59 2.37 20.0 120 1.65 0.68 0.53 1.46 9.300000 0.60 1.62 840
177 3 14.13 4.10 2.74 24.5 96 2.05 0.76 0.56 1.35 9.200000 0.61 1.60 560

178 rows × 14 columns
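Before extracting features it is worth checking the class balance; the UCI description lists 59, 71 and 48 samples for classes 1, 2 and 3. A quick check:

    # Number of samples per class (expected: 59 / 71 / 48)
    wine_df['class_id'].value_counts().sort_index()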

In [4]:
# Feature matrix: all 13 numeric attributes (everything except class_id)
cols = tuple(wine_df[attr].values for attr in attrs[1:])
x = np.c_[cols]
# Target: the wine class labels (1, 2, 3)
y = wine_df['class_id']
print(x, y)
[[  1.42300000e+01   1.71000000e+00   2.43000000e+00 ...,   1.04000000e+00
    3.92000000e+00   1.06500000e+03]
 [  1.32000000e+01   1.78000000e+00   2.14000000e+00 ...,   1.05000000e+00
    3.40000000e+00   1.05000000e+03]
 [  1.31600000e+01   2.36000000e+00   2.67000000e+00 ...,   1.03000000e+00
    3.17000000e+00   1.18500000e+03]
 ..., 
 [  1.32700000e+01   4.28000000e+00   2.26000000e+00 ...,   5.90000000e-01
    1.56000000e+00   8.35000000e+02]
 [  1.31700000e+01   2.59000000e+00   2.37000000e+00 ...,   6.00000000e-01
    1.62000000e+00   8.40000000e+02]
 [  1.41300000e+01   4.10000000e+00   2.74000000e+00 ...,   6.10000000e-01
    1.60000000e+00   5.60000000e+02]] 0      1
1      1
2      1
3      1
4      1
5      1
6      1
7      1
8      1
9      1
10     1
11     1
12     1
13     1
14     1
15     1
16     1
17     1
18     1
19     1
20     1
21     1
22     1
23     1
24     1
25     1
26     1
27     1
28     1
29     1
      ..
148    3
149    3
150    3
151    3
152    3
153    3
154    3
155    3
156    3
157    3
158    3
159    3
160    3
161    3
162    3
163    3
164    3
165    3
166    3
167    3
168    3
169    3
170    3
171    3
172    3
173    3
174    3
175    3
176    3
177    3
Name: class_id, dtype: int64
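The same feature matrix can be built more directly from the DataFrame; an equivalent sketch:

    # All columns except class_id, as a (178, 13) NumPy array
    x = wine_df[attrs[1:]].values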
In [5]:
import sklearn.decomposition as decomposition
In [6]:
pca = decomposition.PCA()
In [7]:
pca.fit(x)
Out[7]:
PCA(copy=True, iterated_power='auto', n_components=None, random_state=None,
  svd_solver='auto', tol=0.0, whiten=False)
In [8]:
pca.explained_variance_ratio_
Out[8]:
array([  9.98091230e-01,   1.73591562e-03,   9.49589576e-05,
         5.02173562e-05,   1.23636847e-05,   8.46213034e-06,
         2.80681456e-06,   1.52308053e-06,   1.12783044e-06,
         7.21415811e-07,   3.78060267e-07,   2.12013755e-07,
         8.25392788e-08])
In [9]:
plt.figure(figsize=(15,10))
plt.bar(range(1,14),pca.explained_variance_ratio_)
plt.grid(1)
plt.minorticks_on()
plt.show()
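The ratios and the bar chart show that the first component explains about 99.8% of the variance. This is driven largely by feature scale: Proline is in the hundreds or thousands while most other attributes are below 10, and PCA on unstandardized data is dominated by the largest-scale columns. A variant with standardization, not used in this notebook, would look roughly like:

    from sklearn.preprocessing import StandardScaler

    # Standardize each feature to zero mean and unit variance before PCA,
    # so no single column dominates the principal components.
    x_std = StandardScaler().fit_transform(x)
    pca_std = decomposition.PCA()
    pca_std.fit(x_std)
    print(pca_std.explained_variance_ratio_[:3])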
In [10]:
pca.n_components = 2
X12 = pca.fit_transform(x)
print(pca.noise_variance_)
1.55306269038
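After refitting with n_components=2, one can check how much of the total variance the two retained components capture (from the ratios above, roughly 0.9998):

    # Fraction of total variance explained by the first two components
    print(pca.explained_variance_ratio_.sum())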
In [11]:
plt.figure(figsize=(8.0,10.0))
plt.scatter(X12[:, 0], X12[:,1], s=25, c=y)
plt.xlabel("Component 1")
plt.ylabel("Component 2")
plt.grid(1)
plt.show()
In [12]:
def plot_map2d(clf, XX):
    # Build a dense grid covering the data range in the PC1/PC2 plane
    x_min, x_max = XX[:, 0].min(), XX[:, 0].max()
    y_min, y_max = XX[:, 1].min(), XX[:, 1].max()
    x_range = np.linspace(x_min, x_max, 200)
    y_range = np.linspace(y_min, y_max, 200)
    xx, yy = np.meshgrid(x_range, y_range)
    # Predict a class for every grid point and draw the decision regions
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    plt.imshow(Z, extent=(x_min, x_max, y_min, y_max), aspect="auto",
               interpolation="bilinear", origin="lower")
In [13]:
def myPlot(kname, X, **kw):
    plt.figure(figsize=(8.0, 10.0))
    clf = svm.SVC(kernel=kname, **kw)
    clf.fit(X, y)
    # Training-set accuracy: predictions on the same data the model was fitted on
    print('Accuracy', kname, metrics.accuracy_score(y, clf.predict(X)))
    plt.title(kname)
    plot_map2d(clf, X)
    # Support vectors are marked with large white points beneath the data
    plt.scatter(clf.support_vectors_[:, 0], clf.support_vectors_[:, 1], s=144, c='w')
    plt.scatter(X[:, 0], X[:, 1], s=25, c=y)
    plt.xlabel('Component 1')
    plt.ylabel('Component 2')
    plt.show()
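Note that the accuracy is computed on the training data, as the task requires, so it is an optimistic estimate. For comparison, a cross-validated estimate could be obtained with a sketch like the following (assuming scikit-learn >= 0.18, where cross_val_score lives in sklearn.model_selection):

    from sklearn.model_selection import cross_val_score

    # 5-fold cross-validated accuracy for an RBF SVC on the two components
    scores = cross_val_score(svm.SVC(kernel='rbf', gamma=0.001), X12, y, cv=5)
    print(scores.mean())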
In [14]:
myPlot('linear',X12)
Accuracy linear 0.73595505618
In [15]:
myPlot('rbf',X12, gamma=0.001)
Accuracy rbf 0.842696629213
In [ ]:
myPlot('poly',X12)
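This cell was left unexecuted. With default parameters, a polynomial kernel on these unscaled components (PC1 spans hundreds of units because of the Proline scale) can be very slow to converge. If running it, passing a small gamma, as was done for the RBF kernel, is a reasonable hypothetical starting point:

    # Hypothetical parameter choice; degree=3 is the SVC default
    myPlot('poly', X12, gamma=0.001, degree=3)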