KNN简单实现
最近開始學習《機器學習實戰》,第一個算法就是KNN。由于K-近鄰算法比較簡單,這里不再介紹理論知識,直接看代碼實現:
KNN的簡單實現
需要用到的一些語法:
tile()
sum(axis=1)
argsort,sort 和 sorted,operator.itemgetter函數
get(),items(),iteritems()方法
運行結果:
('training data set:', array([[ 1. , 1.1],[ 1. , 1. ],[ 0. , 0. ],[ 0. , 0.1]])) ('labels of training data set:', ['A', 'A', 'B', 'B']) ('classCount:', {'A': 1, 'B': 2}) [('B', 2), ('A', 1)] ('Classification results:', 'B')
至此,一個最簡單的KNN分類就實現了。
KNN算法改進約會網站的配對效果
數據的處理
會用到的語法:
matplotlib
min(iterable, *[, key, default])
其中file2matrix得到的是數組矩陣,也即是可以處理的數據格式,如下:
[[ 4.09200000e+04 8.32697600e+00 9.53952000e-01][ 1.44880000e+04 7.15346900e+00 1.67390400e+00][ 2.60520000e+04 1.44187100e+00 8.05124000e-01]..., [ 2.65750000e+04 1.06501020e+01 8.66627000e-01][ 4.81110000e+04 9.13452800e+00 7.28045000e-01][ 4.37570000e+04 7.88260100e+00 1.33244600e+00]][3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 2, 3, 3, 3, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 3, 2, 2, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 1, 2, 2, 1, 1, 3, 3, 1, 2, 1, 3, 3, 2, 1, 1, 3, 1, 2, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 3, 3, 1, 2, 3, 2, 2, 3, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 1, 2, 1, 3, 1, 2, 3, 2, 3, 1, 1, 1, 3, 2, 3, 1, 3, 2, 1, 3, 2, 2, 3, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 3, 3, 3, 3, 2, 1, 1, 1, 2, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 3, 1, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 1, 1, 1, 3, 3, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 3, 1, 3, 2, 2, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 3, 2, 3, 3, 2, 3, 1, 3, 1, 3, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 3, 2, 2, 3, 1, 2, 3, 3, 2, 2, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 1, 3, 3, 3, 3, 2, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 1, 2, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 1, 1, 1, 2, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 3, 3, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 1, 2, 1, 1, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 
3, 2, 2, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 2, 2, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 3, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 3, 3, 1, 1, 3, 1, 2, 3, 3, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1, 3, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 3, 3, 1, 3, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 3, 1, 2, 2, 2, 3, 2, 1, 2, 1, 2, 3, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3]下圖是數據的散點圖:
歸一化后的數據:
[[ 0.44832535 0.39805139 0.56233353][ 0.15873259 0.34195467 0.98724416][ 0.28542943 0.06892523 0.47449629]..., [ 0.29115949 0.50910294 0.51079493][ 0.52711097 0.43665451 0.4290048 ][ 0.47940793 0.3768091 0.78571804]]測試算法
# coding=utf-8
from numpy import *
import operator  # operator module, used when sorting vote counts

# matplotlib is only needed for the scatter plot shown in the article,
# not for the classification code itself, so treat it as optional.
try:
    import matplotlib.pyplot as plt
except ImportError:
    plt = None


def createDataset():
    """Build a tiny training set of four 2-D points and their labels.

    Returns:
        (group, labels): a 4x2 numpy array of samples and a list with
        one class label ('A' or 'B') per row of ``group``.
    """
    # Note the double brackets: an array of rows.
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return (group, labels)


def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbours.

    Args:
        inX: input vector to classify (same length as a dataSet row).
        dataSet: 2-D array of training samples, one per row.
        labels: class label for each training row.
        k: number of nearest neighbours that vote.

    Returns:
        The label with the most votes among the k nearest neighbours.
    """
    dataSetSize = dataSet.shape[0]  # shape[0] is the row count
    # tile() repeats inX to dataSet's shape so the subtraction is
    # element-wise; square and sum per row (axis=1), then take the
    # square root to get the Euclidean distance to every sample.
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)
    distances = sqDistances ** 0.5
    # Indices that sort the distances ascending (nearest first).
    sortedDistIndicies = distances.argsort()
    classCount = {}  # label -> number of votes
    for i in range(k):
        # Label of the i-th nearest sample; one vote per neighbour.
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # BUGFIX: dict.iteritems() is Python 2 only; items() works on both.
    # itemgetter(1) sorts the (label, votes) pairs by vote count,
    # descending, so index [0][0] is the winning label.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]


def file2matrix(filename):
    """Read tab-separated training data into a matrix and a label list.

    Each line of the file holds three numeric features followed by an
    integer class label.

    Returns:
        (returnMat, classLabelVector): an N x 3 feature matrix and a
        list of N integer labels.
    """
    # BUGFIX: the original called fr.close() AFTER the return statement,
    # so the handle was never closed; ``with`` guarantees it is.
    with open(filename, 'r') as fr:
        arrayOfLines = fr.readlines()  # readlines() can only be consumed once
    numberOfLines = len(arrayOfLines)  # number of samples
    returnMat = zeros((numberOfLines, 3))  # N rows, 3 feature columns
    print('row:%s and column:%s' % (returnMat.shape[0], returnMat.shape[1]))
    classLabelVector = []  # one integer label per sample
    index = 0
    for line in arrayOfLines:
        # strip() removes surrounding whitespace (spaces, \n, \t, ...).
        line = line.strip()
        # Split on tabs: 3 feature columns + 1 trailing label.
        listFromLine = line.split('\t')
        returnMat[index, :] = listFromLine[0:3]
        classLabelVector.append(int(listFromLine[-1]))
        index += 1
    return (returnMat, classLabelVector)


def autoNorm(dataSet):
    """Scale every feature column of ``dataSet`` into [0, 1].

    Returns:
        (normDataSet, ranges, minVals): the normalised copy, the
        per-column range (max - min), and the per-column minimum.
    """
    minVals = dataSet.min(0)  # 0 => column-wise minimum
    maxVals = dataSet.max(0)
    ranges = maxVals - minVals
    m = dataSet.shape[0]  # number of rows
    # tile() stacks the per-column minima m times so that both the
    # subtraction and the division are element-wise:
    # (x - min) / (max - min).
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))  # element-wise divide
    return normDataSet, ranges, minVals


def datingClassTest():
    """Hold out 10% of the dating data set and report the error rate."""
    hoRatio = 0.10  # hold out 10% of the rows for testing
    # BUGFIX: the original plain string 'C:\Users\...' is a SyntaxError
    # under Python 3 (\U begins a unicode escape); use a raw string.
    datingDataMat, datingLabels = file2matrix(
        r'C:\Users\LiLong\Desktop\datingTestSet2.txt')
    normMat, ranges, minVals = autoNorm(datingDataMat)
    m = normMat.shape[0]
    numTestVecs = int(m * hoRatio)  # number of test rows
    print('the test number:', numTestVecs)
    errorCount = 0.0
    for i in range(numTestVecs):
        # Rows [0, numTestVecs) are the test set; rows [numTestVecs, m)
        # and the matching slice of labels form the training set.
        classifierResult = classify0(normMat[i, :],
                                     normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with: %d, the real answer is: %d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    # BUGFIX: these were Python 2 print statements; use the function form.
    print("the total error rate is: %f" % (errorCount / float(numTestVecs)))
    print(errorCount)


# Reads datingTestSet2.txt, not datingTestSet.txt.
if __name__ == "__main__":
    datingClassTest()
# Results:
row:1000 and column:3 ('the test number:', 100) the classifier came back with: 3, the real answer is: 3 the classifier came back with: 2, the real answer is: 2 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 1, the real answer is: 1 ..., the classifier came back with: 2, the real answer is: 2 the classifier came back with: 3, the real answer is: 3 the classifier came back with: 2, the real answer is: 2 the classifier came back with: 3, the real answer is: 3 the classifier came back with: 2, the real answer is: 2 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 2, the real answer is: 2 the classifier came back with: 3, the real answer is: 3 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 2, the real answer is: 2 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 2, the real answer is: 2 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 2, the real answer is: 2 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 3, the real answer is: 3 the classifier came back with: 3, the real answer is: 3 the classifier came back with: 2, the real answer is: 2 the classifier came back with: 1, the real answer is: 1 the classifier came back with: 3, the real answer is: 1 the total error rate is: 0.050000 5.0結果顯示錯誤率5.0%
總結
- 上一篇: Python内置函数min(iterable)
- 下一篇: 雅阁如何选购 从品牌到配置全方位解析雅阁