使用SMO算法编写SVM对CIFAR-10数据分类
公式太難打了,弄成圖片,可能不太美觀,但知識沒變味
3:實驗內容
3.1 提取hog特征
本實驗的核心在于設計svm算法,因此提取特征使用庫函數實現,最主要代碼如下
# HOG feature extraction is delegated to scikit-image.
from skimage import feature as ft
# L2-Hys block normalisation + sqrt gamma correction, flattened to one vector per image.
ft.hog(data[i],feature_vector=True,block_norm='L2-Hys',transform_sqrt=True)
3.2 使用SVM庫驗證特征提取后的分類效果
使用庫的核心代碼如下
# Baseline: scikit-learn's SVC on the HOG features, to validate the
# feature-extraction pipeline before using the hand-written SMO.
trainmatrix=data2image(trainImg['Data'])
hogtrain=meanlie(feature_hog(trainmatrix))
testmatrix=data2image(testImg['Data'])
hogtest=meanlie(feature_hog(testmatrix))
from sklearn import svm
from skimage import feature as ft
clf=svm.SVC()
clf.fit(hogtrain,trainImg['Label'])
pre=clf.predict(hogtest)
分類結果如下
其中,count為正確分類的樣本數,為4075,總的測試集樣本數為5000。可以看到,分類準確性很高,達到0.815。這還是沒有仔細調參的結果,這樣的結果是很理想的,證明了hog特征+svm的思路切實可行,下面將其運用到自己編寫的svm算法上。
3.3 驗證自編二分類算法的正確性
本實驗是個多分類問題,因此自己編寫svm算法分兩步,第一步編寫二分類算法,第二步結合前面所選定的多分類策略基于此二分類算法實現多分類。
因此先來第一步:驗證自編二分類算法
這里我分別抽取訓練集和測試集中的6和9兩類,訓練集中每類分別選500個樣本(只是為了運行快一些),二分類的類命名為PlattSMO,單獨保存成自定義模塊plattSMO,方便導入
完整smo代碼太長,在附錄給出,調用部分主要代碼如下
# Binary-classification check: classes 6 vs 9, at most 500 training
# samples per class, using the hand-written PlattSMO module.
trainmatrix=data2image(trainImg['Data'])
hogtrain=meanlie(feature_hog(trainmatrix))
testmatrix=data2image(testImg['Data'])
hogtest=meanlie(feature_hog(testmatrix))
# Keep only classes 6 and 9 and relabel them +1 / -1 for the binary SVM.
hogtraindata,hogtrainlabel=extractClass(hogtrain,trainImg['Label'],6,9)
hogtraindata,hogtrainlabel=extractPart(hogtraindata,hogtrainlabel,500)
hogtestdata,hogtestlabel=extractClass(hogtest,testImg['Label'],6,9)
# C=0.05, toler=1e-4, maxIter=200, RBF kernel with bandwidth theta=20.
smo = plattSMO.PlattSMO(hogtraindata, hogtrainlabel, 0.05, 0.0001, 200, name='rbf', theta=20)
smo.smoP()
testResult = smo.predict(hogtestdata)
count=0
# Count correctly classified test samples.
for i in range(len(testResult)):
    if testResult[i]==hogtestlabel[i]:
        count+=1
print('right rate:%f'%(float(count)/len(hogtestlabel)))
smoP函數就是完整的線性SMO算法
結果如下
其中,count是正確分類的樣本數,1724個,而測試集中這兩類樣本一共2000個,二分類準確率0.862。驗證了自編二分類代碼是正確的。
3.4 驗證自編多分類算法的正確性
多分類算法中,我們需要構建n(n-1)/2個二分類模型,只需調用PlattSMO類,實例化即可。構建好10個模型后,對于每個測試樣本,使用模型進行分類,調整權重,最后投票表決得出結果即可。多分類的類命名為LibSVM,保存成模塊libsvm,在主函數中調用即可。多分類代碼主要有訓練和預測函數,train和predict,train函數訓練模型,保存到self.classfy變量,predict函數多分類策略,值得一提的是,10個模型的分類結果可能會使得某幾個類別的權重相同,這種情況下我將這幾個權重最大(相同權重)的類別取出來,再對該樣本繼續分類,調整權重,投票。相當于實行兩次多分類策略,只是第二次的類別數較少一些(因為剔除了第一次權重小的類別)
完整代碼在附件給出,主要代碼如下
def __init__(self,data=[],label=[],C=0,toler=0,maxIter=0,**kernelargs):
    """Group the samples by class and record the SMO hyper-parameters."""
    self.classlabel = unique(label)            # distinct class labels
    self.classNum = len(self.classlabel)
    # number of one-vs-one models for n classes: n*(n-1)/2
    self.classfyNum = (self.classNum * (self.classNum-1))/2
    self.classfy = []        # trained binary models, in (i,j) i<j order
    self.dataSet={}          # class label -> list of sample rows
    self.kernelargs = kernelargs
    self.C = C               # slack-variable penalty
    self.toler = toler       # KKT tolerance
    self.maxIter = maxIter   # outer-loop iteration cap
    m = shape(data)[0]
    for i in range(m):
        label[i]=int(label[i])
        # NOTE(review): both branches append; only the first also creates
        # the empty list — dict.setdefault would express this in one line.
        if label[i] not in self.dataSet.keys():
            self.dataSet[int(label[i])] = []
            self.dataSet[int(label[i])].append(data[i][:])
        else:
            self.dataSet[int(label[i])].append(data[i][:])
def train(self):
    """Train one PlattSMO binary model per unordered class pair."""
    num = self.classNum
    for i in range(num):
        for j in range(i+1,num):
            data = []
            # samples of class i are labelled +1, class j labelled -1
            label = [1.0]*shape(self.dataSet[self.classlabel[i]])[0]
            label.extend([-1.0]*shape(self.dataSet[self.classlabel[j]])[0])
            data.extend(self.dataSet[self.classlabel[i]])
            data.extend(self.dataSet[self.classlabel[j]])
            svm = PlattSMO(array(data),array(label),self.C,self.toler,self.maxIter,**self.kernelargs)
            svm.smoP()
            self.classfy.append(svm)
    # free the grouped training data once all models are trained
    self.dataSet = None
def predict(self,data,label):
    """One-vs-one voting prediction with a second vote among tied classes.

    Prints per-sample results and the overall accuracy, and returns the
    list of predicted class labels.
    """
    m = shape(data)[0]
    num = self.classNum
    classlabel = []
    count = 0.0          # number of misclassified samples
    for n in range(m):
        result = [0] * num     # votes per class, round 1
        index = -1             # walks self.classfy in (i,j) i<j order
        for i in range(num):
            for j in range(i + 1, num):
                index += 1
                s = self.classfy[index]
                t = s.predict([data[n]])[0]
                # positive output votes for class i, negative for class j
                if t > 0.0:
                    result[i] +=1
                else:
                    result[j] +=1
        #classlabel.append(self.classlabel[result.index(max(result))])
        resultmax=max(result)
        maxindex=result.index(resultmax)
        index1=[maxindex]      # indices of all classes tied at the maximum
        # NOTE(review): range(...,5) scans only class indices < 5 — ties
        # among classes 5..9 are silently missed; likely a 5-class leftover.
        for i in range(maxindex+1,5):
            if result[i]==resultmax:
                index1.append(i)
        index2 = [0 for _ in range(len(index1))]
        if len(index1) > 1:
            # Round 2: re-vote restricted to the tied classes.
            for i in range(len(index1)):
                for j in range(i+1,len(index1)):
                    # NOTE(review): this classifier-index arithmetic is
                    # hard-coded for exactly 5 classes — verify before
                    # reusing with a different class count.
                    if index1[i]==0:
                        s = self.classfy[index1[j-1]]
                    elif index1[i]==3:
                        s=self.classfy[9]
                    else:
                        s=self.classfy[2*index1[i]+index1[j]]
                    t = s.predict([data[n]])[0]
                    if t > 0.0:
                        index2[i]+=1
                    else:
                        index2[j]+=1
        classlabel.append(self.classlabel[index1[index2.index(max(index2))]])
        if classlabel[-1] != label[n]:
            count +=1
        # NOTE(review): Python-2 print statement (syntax error on Python 3).
        print label[n],classlabel[n]
    #print classlabel
    countright=m-count       # correctly classified = total - wrong
    print "right rate:",countright / m
    return classlabel
主函數調用核心代碼,libSVM.LibSVM參數很重要,這里選擇的松弛變量C為10,容錯率toler為0.0001,最大迭代次數maxIter為200,核函數為高斯核’rbf’,對應的帶寬theta為20。可調優
# Full multi-class run: subsample the training set (400 per class),
# extract HOG features, then train the one-vs-one LibSVM wrapper.
trainImg = loadData(file)
testImg=loadData(file1)
traindata,trainlabel=extractData(trainImg['Data'],trainImg['Label'],400)
trainmatrix=data2image(traindata)
hogtrain=meanlie(feature_hog(trainmatrix))
testmatrix=data2image(testImg['Data'])
hogtest=meanlie(feature_hog(testmatrix))
# C=10 worked best experimentally (accuracy 0.678)
# C=10, toler=1e-4, maxIter=200, RBF kernel, bandwidth theta=20.
svm = libSVM.LibSVM(hogtrain, trainlabel, 10, 0.0001, 200, name='rbf', theta=20)
svm.train()
svm.predict(hogtest,testImg['Label'])
訓練模型時,只取訓練集中一部分樣本,每個類別取200或者400個樣本,總共1000或者2000個樣本進行訓練,運行時間約5min以內,是很快的。但是若取全部樣本訓練,則時間難以忍受,當然只選擇這樣少的樣本訓練模型,一定會使得分類準確度下降,但是即便只選擇400個樣本,準確率已經可以達到0.678了,這是很不錯的結果,可以想見,當使用全樣本訓練時,結果應該可以達到前面使用svm庫的0.815的準確率。
實驗結果如下
每類抽取200個樣本
其中countright為正確分類的樣本數,3282個,分類正確率為0.6564
每類抽取400個樣本
正確分類的樣本數countright為3473,分類正確率為0.6946。比用每類200個樣本時高了約4%,而全部訓練樣本為14968個,如果用上全部樣本數據,準確率還會有不小的提升
附錄:
代碼中各函數功能說明
trialsvm.py
def meanlie(data):
data中每個元素除以對應列的均值,這一步驟替代歸一化,在提取hog特征后因為要進行1后續的分類,所以需要歸一化數據。而實驗發現歸一化后分類準確性不是很高,發現是由于每個特征對應的data中的列總是有個別的數極大,其他很小,因此采用每個元素除以所在列的均值的方式代替歸一化
def loadData(file):
導入數據集,很簡單
def data2image(data):
將每個樣本對應的行向量轉化成圖片并灰度化,因為hog特征是在圖片上提取,而數據集中圖片是以一個1x3072的行向量表示的
def feature_hog(data):
對傳進來的data提取hog特征,返回提取特征后的數據
def extractData(data,label,num):
從大的數據集data中抽取部分樣本,num是每類需要抽取的樣本數,返回抽取后的樣本數據和對應的標簽
plattSMO.py
class PlattSMO:
二分類的類
def init(self,dataMat,classlabels,C,toler,maxIter,**kernelargs):
初始化函數,初始化一些變量dataMat-數據矩陣,C - 松弛變量,classLabels - 數據標簽,toler - 容錯率,maxIter-最大迭代次數,**kernelargs-核函數有關的參數
def kernelTrans(self,x,z):
通過核函數將數據轉換更高維的空間
def calcEK(self,k):
計算誤差
def updateEK(self,k):
計算Ek,并更新誤差緩存
def selectJ(self,i,Ei):
內循環啟發方式2
def innerL(self,i):
優化的SMO算法
def smoP(self):
完整的線性SMO算法
def calcw(self):
計算權重W
def predict(self,testData):
預測函數,預測樣本類別
libsvm.py
class LibSVM:
多分類類
def train(self):
訓練函數,訓練10個分類模型
def predict(self,data,label):
預測函數,實現多分類策略,使用訓練模型對測試數據data進行預測,給出分類結果
trialsvm.py
# -*- coding: utf-8 -*-
"""
Created on Fri Dec 14 19:15:13 2018
@author: Administrator
"""
from mysvm import plattSMO,libSVM
import matplotlib.pyplot as plt
import numpy as np
import random
from numpy import *
import scipy.io as sio
from sklearn.decomposition import PCA
from sklearn import preprocessing
from skimage import feature as ft
def meanlie(data):
    """Divide every element of *data* by the mean of its column.

    Used instead of min-max normalisation: some feature columns contain a
    few extreme values, so scaling by the column mean works better here.
    NOTE: mutates *data* in place and also returns it.  Expects a float
    numpy array; a column whose mean is 0 would produce inf/nan.
    """
    # Broadcasting replaces the original per-column Python loop with one
    # C-level pass; behaviour (in-place division) is unchanged.
    data /= np.mean(data, axis=0)
    return data
def loadData(file):
    """Load a MATLAB .mat file and return its dict of variables."""
    return sio.loadmat(file)
def data2image(data):
    """Turn each 1x3072 CIFAR row vector into a 32x32 grayscale image.

    Each row is reshaped into three (32, 32) RGB planes which are then
    collapsed to grayscale with the ITU-R BT.601 luma weights.
    Returns a list of 32x32 arrays, one per input row.
    """
    images = []
    for row in data:
        r, g, b = row.reshape((3, 32, 32))
        images.append(r * 0.2990 + g * 0.5870 + b * 0.1140)
    return images
def feature_hog(data):
    """Extract a flattened HOG feature vector for every grayscale image.

    Returns a 2-D numpy array, one feature row per input image.
    """
    return np.array([
        ft.hog(img, feature_vector=True, block_norm='L2-Hys', transform_sqrt=True)
        for img in data
    ])
# The two helpers below were superseded by extractData() and are kept
# commented out (as a module-level string) for reference only.
'''
def extractClass(data,label,class1,class2):
    # Keep only class1/class2 samples and relabel them +1 / -1 for the SVM.
    m,n=data.shape
    index=[]
    for i in range(m):
        if label[i][0]!=class1 and label[i][0]!=class2:
            index.append(i)
    data=np.delete(data,index,0)
    label=np.delete(label,index,0)
    min_max_scaler=preprocessing.MinMaxScaler()
    data=min_max_scaler.fit_transform(data)
    Y=[]
    for i in label:
        if i[0]==class1:        # class1 -> +1
            Y.append(1)
        else:
            Y.append(-1)        # class2 -> -1
    Y=np.array(Y)
    return data,Y
def extractPart(data,label,nums):
    # Take at most `nums` samples of each of the two classes (+1 / -1).
    m,n=data.shape
    index=[]
    a=0;b=0
    for i in range(m):
        # NOTE(review): this loop body was garbled in the source; the
        # per-class counting below is a reconstruction.
        if label[i]==1 and a<nums:
            index.append(i)
            a+=1
        elif label[i]==-1 and b<nums:
            index.append(i)
            b+=1
        if a>=nums and b>=nums:
            break
    data=data[index]
    label=label[index]
    return data,label
'''
def extractData(data, label, num):
    """Take the first *num* samples of each wanted class from the data set.

    Only the classes listed in ``cla`` are kept — the experiment subsamples
    the training set to keep SMO training time manageable.
    Returns the selected rows of *data* and the matching labels.
    NOTE(review): the tail of this function was truncated in the source;
    the selection and return code below is a reconstruction.
    """
    m, n = data.shape
    cla = [0, 6, 7, 8, 9]          # class labels to keep (as in the source)
    count = [0] * len(cla)         # samples taken so far, per class
    index = []
    for i in range(m):
        for j in range(len(cla)):
            if label[i] == cla[j] and count[j] < num:
                index.append(i)
                count[j] += 1
    return data[index], label[index]
plattSMO.py
import sys
from numpy import *
from svm import *
from os import listdir
class PlattSMO:
    """Binary soft-margin SVM trained with Platt's SMO algorithm.

    Labels must be +1 / -1.  Supported kernels (via **kernelargs):
    name='linear', or name='rbf' with bandwidth 'theta'.
    After smoP() only the support vectors are kept; the training data,
    kernel matrix and error cache are released to save memory.
    """

    def __init__(self, dataMat, classlabels, C, toler, maxIter, **kernelargs):
        """dataMat - (m, n) sample matrix; classlabels - +1/-1 labels;
        C - slack penalty; toler - KKT tolerance; maxIter - iteration cap;
        kernelargs - kernel specification (see class docstring)."""
        self.x = array(dataMat)
        self.label = array(classlabels).transpose()
        self.C = C
        self.toler = toler
        self.maxIter = maxIter
        self.m = shape(dataMat)[0]
        self.n = shape(dataMat)[1]
        self.alpha = array(zeros(self.m), dtype='float64')
        self.b = 0.0
        self.eCache = array(zeros((self.m, 2)))  # col 0: valid flag, col 1: cached Ek
        self.K = zeros((self.m, self.m), dtype='float64')
        self.kwargs = kernelargs
        self.SV = ()
        self.SVIndex = None
        # Precompute the full m x m kernel matrix once (O(m^2) kernel calls).
        for i in range(self.m):
            for j in range(self.m):
                self.K[i, j] = self.kernelTrans(self.x[i, :], self.x[j, :])

    def calcEK(self, k):
        """Return the prediction error E_k = f(x_k) - y_k for sample k."""
        fxk = dot(self.alpha * self.label, self.K[:, k]) + self.b
        return fxk - float(self.label[k])

    def updateEK(self, k):
        """Recompute E_k and mark it valid in the error cache."""
        Ek = self.calcEK(k)
        self.eCache[k] = [1, Ek]

    def selectJ(self, i, Ei):
        """Heuristic choice of the second alpha: maximize |Ei - Ej|."""
        maxE = 0.0
        selectJ = 0
        Ej = 0.0
        validECacheList = nonzero(self.eCache[:, 0])[0]
        if len(validECacheList) > 1:
            for k in validECacheList:
                if k == i:
                    continue
                Ek = self.calcEK(k)
                deltaE = abs(Ei - Ek)
                if deltaE > maxE:
                    selectJ = k
                    maxE = deltaE
                    Ej = Ek
            return selectJ, Ej
        else:
            # No cached errors yet: fall back to a random j != i.
            selectJ = selectJrand(i, self.m)
            Ej = self.calcEK(selectJ)
            return selectJ, Ej

    def innerL(self, i):
        """Attempt to optimize the alpha pair seeded by sample i.

        Returns 1 when a pair of alphas actually changed, else 0.
        """
        Ei = self.calcEK(i)
        # Only optimize samples that violate the KKT conditions.
        if (self.label[i] * Ei < -self.toler and self.alpha[i] < self.C) or \
                (self.label[i] * Ei > self.toler and self.alpha[i] > 0):
            self.updateEK(i)
            j, Ej = self.selectJ(i, Ei)
            alphaIOld = self.alpha[i].copy()
            alphaJOld = self.alpha[j].copy()
            # Box constraints [L, H] for alpha[j].
            if self.label[i] != self.label[j]:
                L = max(0, self.alpha[j] - self.alpha[i])
                H = min(self.C, self.C + self.alpha[j] - self.alpha[i])
            else:
                L = max(0, self.alpha[j] + self.alpha[i] - self.C)
                H = min(self.C, self.alpha[i] + self.alpha[j])
            if L == H:
                return 0
            eta = 2 * self.K[i, j] - self.K[i, i] - self.K[j, j]
            if eta >= 0:
                return 0
            self.alpha[j] -= self.label[j] * (Ei - Ej) / eta
            self.alpha[j] = clipAlpha(self.alpha[j], H, L)
            self.updateEK(j)
            if abs(alphaJOld - self.alpha[j]) < 0.00001:
                return 0
            self.alpha[i] += self.label[i] * self.label[j] * (alphaJOld - self.alpha[j])
            self.updateEK(i)
            b1 = self.b - Ei - self.label[i] * self.K[i, i] * (self.alpha[i] - alphaIOld) - \
                self.label[j] * self.K[i, j] * (self.alpha[j] - alphaJOld)
            b2 = self.b - Ej - self.label[i] * self.K[i, j] * (self.alpha[i] - alphaIOld) - \
                self.label[j] * self.K[j, j] * (self.alpha[j] - alphaJOld)
            # NOTE(review): the source was garbled between the b2 update and
            # the smoP loop; the standard Platt bias update is reconstructed.
            if 0 < self.alpha[i] < self.C:
                self.b = b1
            elif 0 < self.alpha[j] < self.C:
                self.b = b2
            else:
                self.b = (b1 + b2) / 2.0
            return 1
        return 0

    def smoP(self):
        """Full SMO outer loop: alternate full passes over all samples with
        passes over the non-bound alphas, until maxIter or convergence."""
        iter = 0
        entrySet = True
        alphaPairChanged = 1
        while iter < self.maxIter and (alphaPairChanged > 0 or entrySet):
            alphaPairChanged = 0
            if entrySet:
                for i in range(self.m):
                    alphaPairChanged += self.innerL(i)
                iter += 1
            else:
                nonBounds = nonzero((self.alpha > 0) * (self.alpha < self.C))[0]
                for i in nonBounds:
                    alphaPairChanged += self.innerL(i)
                iter += 1
            if entrySet:
                entrySet = False
            elif alphaPairChanged == 0:
                entrySet = True
        # Keep only the support vectors; drop training data to save memory.
        self.SVIndex = nonzero(self.alpha)[0]
        self.SV = self.x[self.SVIndex]
        self.SVAlpha = self.alpha[self.SVIndex]
        self.SVLabel = self.label[self.SVIndex]
        self.x = None
        self.K = None
        self.label = None
        self.alpha = None
        self.eCache = None

    def kernelTrans(self, x, z):
        """Kernel value k(x, z) for two 1-D sample vectors."""
        # BUG FIX: the original tested x's ndim twice and never checked z.
        if array(x).ndim != 1 or array(z).ndim != 1:
            raise Exception("input vector is not 1 dim")
        if self.kwargs['name'] == 'linear':
            return sum(x * z)
        elif self.kwargs['name'] == 'rbf':
            theta = self.kwargs['theta']
            return exp(sum((x - z) * (x - z)) / (-1 * theta ** 2))
        # BUG FIX: fail loudly instead of silently returning None.
        raise Exception("unsupported kernel: %r" % self.kwargs.get('name'))

    def calcw(self):
        """Explicit weight vector (linear kernel only); stored in self.w."""
        # BUG FIX: self.w was accumulated without ever being initialised.
        self.w = zeros(self.n)
        for i in range(self.m):
            self.w += self.alpha[i] * self.label[i] * self.x[i, :]

    def predict(self, testData):
        """Classify each row of testData; returns a list of +1 / -1."""
        test = array(testData)
        result = []
        m = shape(test)[0]
        for i in range(m):
            tmp = self.b
            for j in range(len(self.SVIndex)):
                tmp += self.SVAlpha[j] * self.SVLabel[j] * self.kernelTrans(self.SV[j], test[i, :])
            # Break an exact tie randomly so a sign is always assigned.
            while tmp == 0:
                tmp = random.uniform(-1, 1)
            result.append(1 if tmp > 0 else -1)
        return result
def plotBestfit(data,label,w,b):
    """Scatter-plot the two classes and overlay the separating line
    w[0]*x + w[1]*y + b = 0 (only meaningful for a 2-D linear SVM)."""
    import matplotlib.pyplot as plt
    n = shape(data)[0]
    fig = plt.figure()
    ax = fig.add_subplot(111)
    x1 = []
    x2 = []
    y1 = []
    y2 = []
    # Split the points by label so the two classes get different colours.
    for i in range(n):
        if int(label[i]) == 1:
            x1.append(data[i][0])
            y1.append(data[i][1])
        else:
            x2.append(data[i][0])
            y2.append(data[i][1])
    ax.scatter(x1,y1,s=10,c='red',marker='s')
    ax.scatter(x2,y2, s=10, c='green', marker='s')
    # Decision boundary: y = (-b - w0*x) / w1.
    x = arange(-2,10,0.1)
    y = ((-b-w[0]*x)/w[1])
    plt.plot(x,y)
    plt.xlabel('X')
    plt.ylabel('y')
    plt.show()
def loadImage(dir, maps=None):
    """Load digit text images from *dir* into numpy (data, label) arrays.

    Each file encodes one sample as rows of '0'/'1' characters and is
    named '<label>_<index>'.  When *maps* is given it translates the
    file-name prefix to a numeric label; otherwise the prefix itself is
    converted with float().
    """
    data = []
    label = []
    for file in listdir(dir):
        prefix = file.split('_')[0]
        # BUG FIX: context manager closes the file handle (the original
        # leaked one open handle per file).
        with open(dir + '/' + file) as fh:
            lines = fh.readlines()
        row = len(lines)
        col = len(lines[0].strip())
        line = []
        for i in range(row):
            for j in range(col):
                line.append(float(lines[i][j]))
        data.append(line)
        # 'is not None' is the idiomatic identity test for None.
        if maps is not None:
            label.append(float(maps[prefix]))
        else:
            label.append(float(prefix))
    return array(data), array(label)
def main():
    """Train a 1-vs-9 digit PlattSMO classifier and print its error rate."""
    '''
    data,label = loadDataSet('testSetRBF.txt')
    smo = PlattSMO(data,label,200,0.0001,10000,name = 'rbf',theta = 1.3)
    smo.smoP()
    smo.calcw()
    print smo.predict(data)
    '''
    maps = {'1': 1.0, '9': -1.0}
    data, label = loadImage("digits/trainingDigits", maps)
    smo = PlattSMO(data, label, 200, 0.0001, 10000, name='rbf', theta=20)
    smo.smoP()
    # BUG FIX: Python-3 print() calls — the project's trialsvm.py already
    # uses them, while the original Python-2 print statements are a
    # SyntaxError under Python 3.
    print(len(smo.SVIndex))
    test, testLabel = loadImage("digits/testDigits", maps)
    testResult = smo.predict(test)
    m = shape(test)[0]
    count = 0.0
    for i in range(m):
        if testLabel[i] != testResult[i]:
            count += 1
    print("classfied error rate is:", count / m)
    #smo.kernelTrans(data,smo.SV[0])

if __name__ == "__main__":
    sys.exit(main())
libsvm.py
import sys
from numpy import *
from svm import *
from os import listdir
from plattSMO import PlattSMO
import pickle
class LibSVM:
    """One-vs-one multi-class SVM built on PlattSMO binary classifiers.

    For n classes, n*(n-1)/2 pairwise models are trained; prediction uses
    majority voting, with a second restricted vote among tied classes.
    """

    def __init__(self, data=[], label=[], C=0, toler=0, maxIter=0, **kernelargs):
        """Group the samples by class and record the SMO hyper-parameters.

        NOTE: the mutable default arguments are kept for interface
        compatibility; callers always pass data/label explicitly.
        """
        self.classlabel = unique(label)
        self.classNum = len(self.classlabel)
        # BUG FIX: floor division — the number of pairwise models is an
        # integer (true division yields a float on Python 3).
        self.classfyNum = self.classNum * (self.classNum - 1) // 2
        self.classfy = []    # trained PlattSMO models in (i,j), i<j, order
        self.dataSet = {}    # int class label -> list of sample rows
        self.kernelargs = kernelargs
        self.C = C
        self.toler = toler
        self.maxIter = maxIter
        m = shape(data)[0]
        for i in range(m):
            label[i] = int(label[i])
            # setdefault replaces the original duplicated if/else branches.
            self.dataSet.setdefault(label[i], []).append(data[i][:])

    def _pairIndex(self, a, b):
        """Position in self.classfy of the (a, b) model, 0 <= a < b < classNum.

        Models are appended in the order (0,1),(0,2),...,(1,2),..., so the
        pair (a, b) lives at a*(2n-a-1)/2 + (b-a-1).
        BUG FIX: the original tie-break lookup hard-coded index arithmetic
        that was only valid for (some pairs of) exactly 5 classes.
        """
        n = self.classNum
        return a * (2 * n - a - 1) // 2 + (b - a - 1)

    def train(self):
        """Train one PlattSMO model per unordered class pair."""
        num = self.classNum
        for i in range(num):
            for j in range(i + 1, num):
                data = []
                # Class i samples are labelled +1, class j samples -1.
                label = [1.0] * shape(self.dataSet[self.classlabel[i]])[0]
                label.extend([-1.0] * shape(self.dataSet[self.classlabel[j]])[0])
                data.extend(self.dataSet[self.classlabel[i]])
                data.extend(self.dataSet[self.classlabel[j]])
                svm = PlattSMO(array(data), array(label), self.C, self.toler,
                               self.maxIter, **self.kernelargs)
                svm.smoP()
                self.classfy.append(svm)
        self.dataSet = None   # free the grouped training data

    def predict(self, data, label):
        """Classify each row of *data*, print the accuracy against *label*,
        and return the list of predicted class labels."""
        m = shape(data)[0]
        num = self.classNum
        classlabel = []
        wrong = 0.0
        for n in range(m):
            sample = [data[n]]
            # Round 1: full one-vs-one vote.
            votes = [0] * num
            for i in range(num):
                for j in range(i + 1, num):
                    t = self.classfy[self._pairIndex(i, j)].predict(sample)[0]
                    if t > 0.0:
                        votes[i] += 1
                    else:
                        votes[j] += 1
            best = max(votes)
            # BUG FIX: scan every class for ties — the original only
            # scanned indices < 5, silently missing ties among classes 5..9.
            tied = [i for i in range(num) if votes[i] == best]
            winner = tied[0]
            if len(tied) > 1:
                # Round 2: re-vote restricted to the tied classes.
                revotes = [0] * len(tied)
                for i in range(len(tied)):
                    for j in range(i + 1, len(tied)):
                        t = self.classfy[self._pairIndex(tied[i], tied[j])].predict(sample)[0]
                        if t > 0.0:
                            revotes[i] += 1
                        else:
                            revotes[j] += 1
                winner = tied[revotes.index(max(revotes))]
            classlabel.append(self.classlabel[winner])
            if classlabel[-1] != label[n]:
                wrong += 1
        # BUG FIX: print() call instead of the Python-2 print statement.
        print("right rate:", (m - wrong) / m)
        return classlabel

    def save(self, filename):
        """Pickle this trained model to *filename*."""
        # Context manager guarantees the handle is closed.
        with open(filename, 'wb') as fw:
            pickle.dump(self, fw, 2)

    @staticmethod
    def load(filename):
        """Load a model previously written by save()."""
        with open(filename, 'rb') as fr:
            return pickle.load(fr)
def loadImage(dir, maps=None):
    """Read digit text images from *dir* into plain-list (data, label).

    Each file holds one sample as rows of '0'/'1' characters and is named
    '<label>_<index>'; *maps*, when given, translates the name prefix to a
    numeric label.  (This module's variant returns lists, not arrays.)
    """
    data = []
    label = []
    for file in listdir(dir):
        prefix = file.split('_')[0]
        # BUG FIX: context manager closes the file handle (the original
        # leaked one open handle per file).
        with open(dir + '/' + file) as fh:
            lines = fh.readlines()
        row = len(lines)
        col = len(lines[0].strip())
        line = []
        for i in range(row):
            for j in range(col):
                line.append(float(lines[i][j]))
        data.append(line)
        # 'is not None' is the idiomatic identity test for None.
        if maps is not None:
            label.append(float(maps[prefix]))
        else:
            label.append(float(prefix))
    return data, label
def main():
    """Load a pickled LibSVM model and evaluate it on the digit test set."""
    '''
    data,label = loadImage('trainingDigits')
    svm = LibSVM(data, label, 200, 0.0001, 10000, name='rbf', theta=20)
    svm.train()
    svm.save("svm.txt")
    '''
    # NOTE(review): assumes "svm.txt" exists from a previous training run
    # (see the commented-out block above) — fails with IOError otherwise.
    svm = LibSVM.load("svm.txt")
    test,testlabel = loadImage('testDigits')
    svm.predict(test,testlabel)
if __name__ == "__main__":
    sys.exit(main())
總結
以上是生活随笔為你收集整理的python svm算法smo cifar_使用smo算法编写svm对CIFAR-10数据分类的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 判断丑数python_LintCode
- 下一篇: python第三方库文件传输助手_pyt