机器学习实战之KNN算法

1.Python导入数据

knn.py

1
2
3
4
5
6
7
8
from numpy import *  # import scientific computing package numpy
import operator  # import operator module (itemgetter is used by classify0)

# createDataSet: build a tiny toy data set and its class labels
def createDateSet():
    """Return a 4x2 sample matrix and the matching list of class labels."""
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

knnTest.py

1
2
3
import knn

# NOTE: the module file is knn.py, so the prefix must be `knn`, not `kNN`
datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
print(datingDataMat); print(datingLabels)

结果

1
2
3
4
5
6
7
8
[[  4.09200000e+04   8.32697600e+00   9.53952000e-01]
[ 1.44880000e+04 7.15346900e+00 1.67390400e+00]
[ 2.60520000e+04 1.44187100e+00 8.05124000e-01]
...,
[ 2.65750000e+04 1.06501020e+01 8.66627000e-01]
[ 4.81110000e+04 9.13452800e+00 7.28045000e-01]
[ 4.37570000e+04 7.88260100e+00 1.33244600e+00]]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 2, 3, 3, 3, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 3, 2, 2, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 1, 2, 2, 1, 1, 3, 3, 1, 2, 1, 3, 3, 2, 1, 1, 3, 1, 2, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 3, 3, 1, 2, 3, 2, 2, 3, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 1, 2, 1, 3, 1, 2, 3, 2, 3, 1, 1, 1, 3, 2, 3, 1, 3, 2, 1, 3, 2, 2, 3, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 3, 3, 3, 3, 2, 1, 1, 1, 2, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 3, 1, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 1, 1, 1, 3, 3, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 3, 1, 3, 2, 2, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 3, 2, 3, 3, 2, 3, 1, 3, 1, 3, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 3, 2, 2, 3, 1, 2, 3, 3, 2, 2, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 1, 3, 3, 3, 3, 2, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 1, 2, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 1, 1, 1, 2, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 3, 3, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 1, 2, 1, 1, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 2, 2, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 
3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 2, 2, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 3, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 3, 3, 1, 1, 3, 1, 2, 3, 3, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1, 3, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 3, 3, 1, 3, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 3, 1, 2, 2, 2, 3, 2, 1, 2, 1, 2, 3, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3]

2.KNN算法实现

knn.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#knn classification
def classify0(inX, dataSet, labels, k):
    """Classify inX with k-nearest neighbours over dataSet/labels.

    inX     -- the input vector to classify
    dataSet -- training samples, one row per sample
    labels  -- class label of each training row
    k       -- number of neighbours that vote
    Returns the majority label among the k closest training samples.
    """
    dataSetSize = dataSet.shape[0]  # number of training rows
    # tile repeats inX dataSetSize times so the subtraction is one array op
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)  # sum over coordinates -> squared distances
    distances = sqDistances ** 0.5  # Euclidean distance to every sample
    sortedDistIndices = distances.argsort()  # indices sorted by ascending distance
    classCount = {}  # label -> number of votes among the k nearest
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]  # label of the i-th closest sample
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # sort (label, votes) pairs by vote count, largest first
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]  # the label with the most votes

knnTest.py

1
2
import knn

# the data set must be created first, and the prefix must match `import knn`
group, labels = knn.createDateSet()
print(knn.classify0([0, 0], group, labels, 3))

结果

1
B

3.Example:使用KNN改进约会网站的配对效果

3.1.从文本文件中读取数据并解析

knn.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#kNN readfile
def file2matrix(filename):
    """Parse a tab-separated data file into features and labels.

    Each line holds 3 numeric features followed by an integer class label.
    Returns (returnMat, classLabelVector): an n x 3 numpy matrix and the
    list of n integer labels.
    """
    with open(filename) as fr:  # close the handle even if parsing fails
        arrayOLines = fr.readlines()  # read the whole file as a list of lines
    numberOfLines = len(arrayOLines)  # number of samples n
    returnMat = zeros((numberOfLines, 3))  # n x 3 matrix, initially all zeros
    classLabelVector = []
    index = 0
    for line in arrayOLines:
        line = line.strip()  # drop the trailing newline / surrounding whitespace
        listFromLine = line.split('\t')  # tab-separated fields
        returnMat[index, :] = listFromLine[0:3]  # first 3 fields are the features
        classLabelVector.append(int(listFromLine[-1]))  # last field is the label
        index += 1
    return returnMat, classLabelVector

knnTest.py

1
2
3
import knn

# prefix kept consistent with the module file name knn.py
datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
print(datingDataMat); print(datingLabels[0:20])

结果:

1
2
3
4
5
6
7
8
[[  4.09200000e+04   8.32697600e+00   9.53952000e-01]
[ 1.44880000e+04 7.15346900e+00 1.67390400e+00]
[ 2.60520000e+04 1.44187100e+00 8.05124000e-01]
...,
[ 2.65750000e+04 1.06501020e+01 8.66627000e-01]
[ 4.81110000e+04 9.13452800e+00 7.28045000e-01]
[ 4.37570000e+04 7.88260100e+00 1.33244600e+00]]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]

3.2.使用matplotlib创建散点图

knnTest.py

1
2
3
4
5
6
7
8
9
10
# Scatter plots of the dating data; marker size/colour scale with the label.
# NOTE(review): assumes datingDataMat/datingLabels are already loaded via file2matrix.
import matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()

ax = fig.add_subplot(211) # ax in the first slot of a 2-row, 1-column grid
ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*array(datingLabels),15.0*array(datingLabels))

bx = fig.add_subplot(212) # bx in the second slot of a 2-row, 1-column grid
bx.scatter(datingDataMat[:,0],datingDataMat[:,1],15.0*array(datingLabels),15.0*array(datingLabels))
plt.show()

散点图使用datingDataMat矩阵的第2和第3列数据,分别表示玩视频游戏所耗时间百分比和每周所消费的冰淇淋公升数

结果

1.png

3.3.归一化数值

knn.py

1
2
3
4
5
6
7
8
9
10
#kNN normalization
def autoNorm(dataSet):
    """Scale every column of dataSet linearly into [0, 1].

    Returns (normDataSet, ranges, minVals) so callers can apply the same
    transform to new samples: (x - minVals) / ranges.
    """
    minVals = dataSet.min(0)  # column-wise minima
    maxVals = dataSet.max(0)  # column-wise maxima
    ranges = maxVals - minVals  # per-column value range
    m = dataSet.shape[0]  # number of rows
    # tile repeats the 1 x ncols vectors m times so the ops are element-wise;
    # the original dead `zeros(shape(dataSet))` pre-allocation was removed —
    # it was overwritten immediately.
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals

knnTest.py

1
2
# the module file is knn.py, so use the `knn` prefix (requires `import knn`)
normMat, ranges, minVals = knn.autoNorm(datingDataMat)
print(normMat); print(ranges); print(minVals)

结果

1
2
3
4
5
6
7
8
9
[[ 0.44832535  0.39805139  0.56233353]
[ 0.15873259 0.34195467 0.98724416]
[ 0.28542943 0.06892523 0.47449629]
...,
[ 0.29115949 0.50910294 0.51079493]
[ 0.52711097 0.43665451 0.4290048 ]
[ 0.47940793 0.3768091 0.78571804]]
[ 9.12730000e+04 2.09193490e+01 1.69436100e+00]
[ 0. 0. 0.001156]

3.4.预测分类器效果

knn.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#kNN verification
def datingClassTest():
    """Estimate the classifier's error rate on the dating data.

    Holds out the first 10% of the (already shuffled) samples as a test
    set, trains on the rest, and prints the per-sample results and the
    overall error rate.
    """
    hoRatio = 0.10  # hold out 10% as testing data
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')  # load data from file
    normMat, ranges, minVals = autoNorm(datingDataMat)  # normalize to [0, 1]
    m = normMat.shape[0]  # total number of samples
    numTestVecs = int(m * hoRatio)  # size of the held-out test set
    errorCount = 0
    for i in range(numTestVecs):
        # train on rows [numTestVecs, m), classify held-out row i
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with:%d,the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f " % (errorCount / float(numTestVecs)))

knnTest.py

1
knn.datingClassTest()  # prefix matches the module file knn.py

结果

1
2
3
4
5
6
7
8
9
10
the classifier came back with:3,the real answer is:3
the classifier came back with:2,the real answer is:2
the classifier came back with:1,the real answer is:1
the classifier came back with:1,the real answer is:1
the classifier came back with:1,the real answer is:1
...,
the classifier came back with:2,the real answer is:2
the classifier came back with:1,the real answer is:1
the classifier came back with:3,the real answer is:1
the total error rate is: 0.050000

3.5.约会网站结果预测

knn.py

1
2
3
4
5
6
7
8
9
10
11
#kNN prediction
def classifyPerson():
    """Interactively classify one person from three typed-in features.

    Prompts for the three features, normalizes them with the training
    data's ranges, and prints the predicted liking level.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']  # result labels
    percentTats = float(input("percentage of time spent playing video games?"))  # read input
    ffMiles = float(input("frequent flier miles earned per year:"))
    iceCream = float(input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')  # load training data
    normMat, ranges, minVals = autoNorm(datingDataMat)  # normalize training data
    inArr = array([ffMiles, percentTats, iceCream])  # same column order as the data file
    # apply the same normalization to the query before classifying
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # labels are 1..3 while list indices are 0..2, hence the -1
    print('you will probably like this person:', resultList[classifierResult - 1])

knnTest.py

1
knn.classifyPerson()  # prefix matches the module file knn.py

结果

1
2
3
4
percentage of time spent playing video games?3
frequent flier miles earned per year:10000
liters of ice cream consumed per year?5
you will probably like this person: in small doses

4.手写识别系统

4.1图形转为向量

knn.py

1
2
3
4
5
6
7
8
9
10
11
#convert img to vector
# Flatten a 32*32 text "image" into a 1*1024 vector.
from os import listdir  # list file names in a directory (used by handwritingClassTest)

def img2vector(filename):
    """Read a 32x32 grid of 0/1 characters and return a 1x1024 numpy vector."""
    returnVect = zeros((1, 1024))  # 1*1024 numpy vector
    with open(filename) as fr:  # close the handle deterministically
        for i in range(32):  # read 32 rows
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])  # store each digit character
    return returnVect

knnTest.py

1
2
3
# prefix kept consistent with the module file name knn.py
testVector = knn.img2vector('testDigits/0_13.txt')
print(testVector[0, 0:31])
print(testVector[0, 32:63])

结果

1
2
3
4
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.
1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

4.2.k-近邻算法识别手写数字

knn.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#test numbers using kNN classify0
def handwritingClassTest():
    """Classify the handwritten-digit test images with kNN and print the error rate.

    File names encode the true digit before the underscore, e.g. '0_13.txt'
    is a sample of the digit 0.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)  # number of training images
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]  # i-th image file name
        fileStr = fileNameStr.split('.')[0]  # strip the extension: '0_13.txt' -> '0_13'
        classNumStr = int(fileStr.split('_')[0])  # digit label before the underscore
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]  # i-th test file name
        fileStr = fileNameStr.split('.')[0]  # strip the extension
        classNumStr = int(fileStr.split('_')[0])  # true digit of this test image
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)  # image -> vector
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print('the classifier came back with:%d,the real answer is:%d'
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print('\nthe total number of errors is :%d' % errorCount)
    print('\nthe total error rate is :%f' % (errorCount / float(mTest)))

knnTest.py

1
knn.handwritingClassTest()  # prefix matches the module file knn.py

结果

1
2
3
4
5
6
7
8
9
10
the classifier came back with:9,the real answer is:9
the classifier came back with:7,the real answer is:7
the classifier came back with:7,the real answer is:7
...,
the classifier came back with:4,the real answer is:4
the classifier came back with:5,the real answer is:5

the total number of errors is :11

the total error rate is :0.011628

这个算法并不高效,有900个测试图片;
对于每一个测试图片,首先要通过一步复杂度为1024的运算转化为测试向量;
每个测试向量都要运行900次距离计算;
每次距离计算都是1024个浮点数的计算。。。
有没有一种算法更加节省空间和时间呢?
跟着《机器学习实战》这本书的步伐,很快我们就知道,有一种叫做k决策树的大佬,
据说是k-近邻算法的优化版,可以大大地节省计算开销。