机器学习实战之KNN算法

1.Python导入数据

knn.py

1
2
3
4
5
6
7
8
from numpy import *  # import scientific computing package numpy
import operator  # import operator module (itemgetter is used by classify0)

# createDataSet: build a tiny toy data set and its class labels
def createDateSet():
    """Return a 4x2 sample matrix and the matching list of class labels."""
    group = array([[1.0, 1.1], [1.0, 1.0], [0, 0], [0, 0.1]])
    labels = ['A', 'A', 'B', 'B']
    return group, labels

knnTest.py

1
2
3
import knn

# NOTE: the module file is knn.py, so the prefix must be `knn`, not `kNN`
datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
print(datingDataMat); print(datingLabels)

结果

1
2
3
4
5
6
7
8
[[  4.09200000e+04   8.32697600e+00   9.53952000e-01]
[ 1.44880000e+04 7.15346900e+00 1.67390400e+00]
[ 2.60520000e+04 1.44187100e+00 8.05124000e-01]
...,
[ 2.65750000e+04 1.06501020e+01 8.66627000e-01]
[ 4.81110000e+04 9.13452800e+00 7.28045000e-01]
[ 4.37570000e+04 7.88260100e+00 1.33244600e+00]]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 1, 3, 1, 2, 1, 1, 2, 3, 3, 1, 2, 3, 3, 3, 1, 1, 1, 1, 2, 2, 1, 3, 2, 2, 2, 2, 3, 1, 2, 1, 2, 2, 2, 2, 2, 3, 2, 3, 1, 2, 3, 2, 2, 1, 3, 1, 1, 3, 3, 1, 2, 3, 1, 3, 1, 2, 2, 1, 1, 3, 3, 1, 2, 1, 3, 3, 2, 1, 1, 3, 1, 2, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 3, 1, 2, 1, 1, 2, 3, 2, 3, 2, 3, 2, 1, 3, 3, 3, 1, 3, 2, 2, 3, 1, 3, 3, 3, 1, 3, 1, 1, 3, 3, 2, 3, 3, 1, 2, 3, 2, 2, 3, 3, 3, 1, 2, 2, 1, 1, 3, 2, 3, 3, 1, 2, 1, 3, 1, 2, 3, 2, 3, 1, 1, 1, 3, 2, 3, 1, 3, 2, 1, 3, 2, 2, 3, 2, 3, 2, 1, 1, 3, 1, 3, 2, 2, 2, 3, 2, 2, 1, 2, 2, 3, 1, 3, 3, 2, 1, 1, 1, 2, 1, 3, 3, 3, 3, 2, 1, 1, 1, 2, 3, 2, 1, 3, 1, 3, 2, 2, 3, 1, 3, 1, 1, 2, 1, 2, 2, 1, 3, 1, 3, 2, 3, 1, 2, 3, 1, 1, 1, 1, 2, 3, 2, 2, 3, 1, 2, 1, 1, 1, 3, 3, 2, 1, 1, 1, 2, 2, 3, 1, 1, 1, 2, 1, 1, 2, 1, 1, 1, 2, 2, 3, 2, 3, 3, 3, 3, 1, 2, 3, 1, 1, 1, 3, 1, 3, 2, 2, 1, 3, 1, 3, 2, 2, 1, 2, 2, 3, 1, 3, 2, 1, 1, 3, 3, 2, 3, 3, 2, 3, 1, 3, 1, 3, 3, 1, 3, 2, 1, 3, 1, 3, 2, 1, 2, 2, 1, 3, 1, 1, 3, 3, 2, 2, 3, 1, 2, 3, 3, 2, 2, 1, 1, 1, 1, 3, 2, 1, 1, 3, 2, 1, 1, 3, 3, 3, 2, 3, 2, 1, 1, 1, 1, 1, 3, 2, 2, 1, 2, 1, 3, 2, 1, 3, 2, 1, 3, 1, 1, 3, 3, 3, 3, 2, 1, 1, 2, 1, 3, 3, 2, 1, 2, 3, 2, 1, 2, 2, 2, 1, 1, 3, 1, 1, 2, 3, 1, 1, 2, 3, 1, 3, 1, 1, 2, 2, 1, 2, 2, 2, 3, 1, 1, 1, 3, 1, 3, 1, 3, 3, 1, 1, 1, 3, 2, 3, 3, 2, 2, 1, 1, 1, 2, 1, 2, 2, 3, 3, 3, 1, 1, 3, 3, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 2, 3, 2, 1, 1, 1, 1, 3, 3, 3, 3, 2, 1, 1, 1, 1, 3, 1, 1, 2, 1, 1, 2, 3, 2, 1, 2, 2, 2, 3, 2, 1, 3, 2, 3, 2, 3, 2, 1, 1, 2, 3, 1, 3, 3, 3, 1, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 1, 2, 1, 1, 3, 2, 3, 2, 3, 2, 3, 3, 2, 2, 1, 3, 1, 2, 1, 3, 1, 1, 1, 3, 1, 1, 3, 3, 2, 2, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 1, 3, 3, 1, 2, 3, 1, 3, 1, 1, 2, 1, 3, 1, 1, 1, 1, 2, 1, 3, 1, 2, 1, 3, 1, 3, 1, 1, 2, 2, 2, 3, 2, 2, 1, 2, 3, 3, 2, 3, 3, 3, 2, 3, 3, 1, 3, 2, 3, 2, 1, 2, 1, 1, 1, 2, 3, 2, 2, 1, 2, 2, 1, 3, 1, 3, 3, 3, 2, 2, 3, 3, 1, 2, 2, 2, 3, 1, 2, 1, 3, 1, 2, 
3, 1, 1, 1, 2, 2, 3, 1, 3, 1, 1, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3, 2, 2, 2, 3, 1, 3, 1, 2, 3, 2, 2, 3, 1, 2, 3, 2, 3, 1, 2, 2, 3, 1, 1, 1, 2, 2, 1, 1, 2, 1, 2, 1, 2, 3, 2, 1, 3, 3, 3, 1, 1, 3, 1, 2, 3, 3, 2, 2, 2, 1, 2, 3, 2, 2, 3, 2, 2, 2, 3, 3, 2, 1, 3, 2, 1, 3, 3, 1, 2, 3, 2, 1, 3, 3, 3, 1, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 1, 2, 1, 3, 1, 2, 2, 1, 3, 2, 1, 3, 3, 2, 2, 2, 1, 2, 2, 1, 3, 1, 3, 1, 3, 3, 1, 1, 2, 3, 2, 2, 3, 1, 1, 1, 1, 3, 2, 2, 1, 3, 1, 2, 3, 1, 3, 1, 3, 1, 1, 3, 2, 3, 1, 1, 3, 3, 3, 3, 1, 3, 2, 2, 1, 1, 3, 3, 2, 2, 2, 1, 2, 1, 2, 1, 3, 2, 1, 2, 2, 3, 1, 2, 2, 2, 3, 2, 1, 2, 1, 2, 3, 3, 2, 3, 1, 1, 3, 3, 1, 2, 2, 2, 2, 2, 2, 1, 3, 3, 3, 3, 3, 1, 1, 3, 2, 1, 2, 1, 2, 2, 3, 2, 2, 2, 3, 1, 2, 1, 2, 2, 1, 1, 2, 3, 3, 1, 1, 1, 1, 3, 3, 3, 3, 3, 3, 1, 3, 3, 2, 3, 2, 3, 3, 2, 2, 1, 1, 1, 3, 3, 1, 1, 1, 3, 3, 2, 1, 2, 1, 1, 2, 2, 1, 1, 1, 3, 1, 1, 2, 3, 2, 2, 1, 3, 1, 2, 3, 1, 2, 2, 2, 2, 3, 2, 3, 3, 1, 2, 1, 2, 3, 1, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2, 2, 1, 3, 3, 3]

2.KNN算法实现

knn.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#knn classification
def classify0(inX, dataSet, labels, k):
    """Classify inX with k-nearest neighbours over dataSet/labels.

    inX     -- the input vector to classify
    dataSet -- training samples, one row per sample
    labels  -- class label of each training row
    k       -- number of neighbours that vote
    Returns the majority label among the k closest training samples.
    """
    dataSetSize = dataSet.shape[0]  # number of training rows
    # tile repeats inX dataSetSize times so the subtraction is one array op
    diffMat = tile(inX, (dataSetSize, 1)) - dataSet
    sqDiffMat = diffMat ** 2
    sqDistances = sqDiffMat.sum(axis=1)  # sum over coordinates -> squared distances
    distances = sqDistances ** 0.5  # Euclidean distance to every sample
    sortedDistIndices = distances.argsort()  # indices sorted by ascending distance
    classCount = {}  # label -> number of votes among the k nearest
    for i in range(k):
        voteIlabel = labels[sortedDistIndices[i]]  # label of the i-th closest sample
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # sort (label, votes) pairs by vote count, largest first
    sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]  # the label with the most votes

knnTest.py

1
2
import knn

# the data set must be created first, and the prefix must match `import knn`
group, labels = knn.createDateSet()
print(knn.classify0([0, 0], group, labels, 3))

结果

1
B

3.Example:使用KNN改进约会网站的配对效果

3.1.从文本文件中读取数据并解析

knn.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#kNN readfile
def file2matrix(filename):
    """Parse a tab-separated data file into features and labels.

    Each line holds 3 numeric features followed by an integer class label.
    Returns (returnMat, classLabelVector): an n x 3 numpy matrix and the
    list of n integer labels.
    """
    with open(filename) as fr:  # close the handle even if parsing fails
        arrayOLines = fr.readlines()  # read the whole file as a list of lines
    numberOfLines = len(arrayOLines)  # number of samples n
    returnMat = zeros((numberOfLines, 3))  # n x 3 matrix, initially all zeros
    classLabelVector = []
    index = 0
    for line in arrayOLines:
        line = line.strip()  # drop the trailing newline / surrounding whitespace
        listFromLine = line.split('\t')  # tab-separated fields
        returnMat[index, :] = listFromLine[0:3]  # first 3 fields are the features
        classLabelVector.append(int(listFromLine[-1]))  # last field is the label
        index += 1
    return returnMat, classLabelVector

knnTest.py

1
2
3
import knn

# prefix kept consistent with the module file name knn.py
datingDataMat, datingLabels = knn.file2matrix('datingTestSet2.txt')
print(datingDataMat); print(datingLabels[0:20])

结果:

1
2
3
4
5
6
7
8
[[  4.09200000e+04   8.32697600e+00   9.53952000e-01]
[ 1.44880000e+04 7.15346900e+00 1.67390400e+00]
[ 2.60520000e+04 1.44187100e+00 8.05124000e-01]
...,
[ 2.65750000e+04 1.06501020e+01 8.66627000e-01]
[ 4.81110000e+04 9.13452800e+00 7.28045000e-01]
[ 4.37570000e+04 7.88260100e+00 1.33244600e+00]]
[3, 2, 1, 1, 1, 1, 3, 3, 1, 3, 1, 1, 2, 1, 1, 1, 1, 1, 2, 3]

3.2.使用matplotlib创建散点图

knnTest.py

1
2
3
4
5
6
7
8
9
10
# Scatter plots of the dating data; marker size/colour scale with the label.
# NOTE(review): assumes datingDataMat/datingLabels are already loaded via file2matrix.
import matplotlib
import matplotlib.pyplot as plt
fig = plt.figure()

ax = fig.add_subplot(211) # ax in the first slot of a 2-row, 1-column grid
ax.scatter(datingDataMat[:,1],datingDataMat[:,2],15.0*array(datingLabels),15.0*array(datingLabels))

bx = fig.add_subplot(212) # bx in the second slot of a 2-row, 1-column grid
bx.scatter(datingDataMat[:,0],datingDataMat[:,1],15.0*array(datingLabels),15.0*array(datingLabels))
plt.show()

散点图使用datingDataMat矩阵的第2和第3列数据,分别表示玩视频游戏所耗时间百分比和每周所消费的冰淇淋公升数

结果

1.png

3.3.归一化数值

knn.py

1
2
3
4
5
6
7
8
9
10
#kNN normalization
def autoNorm(dataSet):
    """Scale every column of dataSet linearly into [0, 1].

    Returns (normDataSet, ranges, minVals) so callers can apply the same
    transform to new samples: (x - minVals) / ranges.
    """
    minVals = dataSet.min(0)  # column-wise minima
    maxVals = dataSet.max(0)  # column-wise maxima
    ranges = maxVals - minVals  # per-column value range
    m = dataSet.shape[0]  # number of rows
    # tile repeats the 1 x ncols vectors m times so the ops are element-wise;
    # the original dead `zeros(shape(dataSet))` pre-allocation was removed —
    # it was overwritten immediately.
    normDataSet = dataSet - tile(minVals, (m, 1))
    normDataSet = normDataSet / tile(ranges, (m, 1))
    return normDataSet, ranges, minVals

knnTest.py

1
2
# the module file is knn.py, so use the `knn` prefix (requires `import knn`)
normMat, ranges, minVals = knn.autoNorm(datingDataMat)
print(normMat); print(ranges); print(minVals)

结果

1
2
3
4
5
6
7
8
9
[[ 0.44832535  0.39805139  0.56233353]
[ 0.15873259 0.34195467 0.98724416]
[ 0.28542943 0.06892523 0.47449629]
...,
[ 0.29115949 0.50910294 0.51079493]
[ 0.52711097 0.43665451 0.4290048 ]
[ 0.47940793 0.3768091 0.78571804]]
[ 9.12730000e+04 2.09193490e+01 1.69436100e+00]
[ 0. 0. 0.001156]

3.4.预测分类器效果

knn.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
#kNN verification
def datingClassTest():
    """Estimate the classifier's error rate on the dating data.

    Holds out the first 10% of the (already shuffled) samples as a test
    set, trains on the rest, and prints the per-sample results and the
    overall error rate.
    """
    hoRatio = 0.10  # hold out 10% as testing data
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')  # load data from file
    normMat, ranges, minVals = autoNorm(datingDataMat)  # normalize to [0, 1]
    m = normMat.shape[0]  # total number of samples
    numTestVecs = int(m * hoRatio)  # size of the held-out test set
    errorCount = 0
    for i in range(numTestVecs):
        # train on rows [numTestVecs, m), classify held-out row i
        classifierResult = classify0(normMat[i, :], normMat[numTestVecs:m, :],
                                     datingLabels[numTestVecs:m], 3)
        print("the classifier came back with:%d,the real answer is:%d"
              % (classifierResult, datingLabels[i]))
        if classifierResult != datingLabels[i]:
            errorCount += 1.0
    print("the total error rate is: %f " % (errorCount / float(numTestVecs)))

knnTest.py

1
knn.datingClassTest()  # prefix matches the module file knn.py

结果

1
2
3
4
5
6
7
8
9
10
the classifier came back with:3,the real answer is:3
the classifier came back with:2,the real answer is:2
the classifier came back with:1,the real answer is:1
the classifier came back with:1,the real answer is:1
the classifier came back with:1,the real answer is:1
...,
the classifier came back with:2,the real answer is:2
the classifier came back with:1,the real answer is:1
the classifier came back with:3,the real answer is:1
the total error rate is: 0.050000

3.5.约会网站结果预测

knn.py

1
2
3
4
5
6
7
8
9
10
11
#kNN prediction
def classifyPerson():
    """Interactively classify one person from three typed-in features.

    Prompts for the three features, normalizes them with the training
    data's ranges, and prints the predicted liking level.
    """
    resultList = ['not at all', 'in small doses', 'in large doses']  # result labels
    percentTats = float(input("percentage of time spent playing video games?"))  # read input
    ffMiles = float(input("frequent flier miles earned per year:"))
    iceCream = float(input("liters of ice cream consumed per year?"))
    datingDataMat, datingLabels = file2matrix('datingTestSet2.txt')  # load training data
    normMat, ranges, minVals = autoNorm(datingDataMat)  # normalize training data
    inArr = array([ffMiles, percentTats, iceCream])  # same column order as the data file
    # apply the same normalization to the query before classifying
    classifierResult = classify0((inArr - minVals) / ranges, normMat, datingLabels, 3)
    # labels are 1..3 while list indices are 0..2, hence the -1
    print('you will probably like this person:', resultList[classifierResult - 1])

knnTest.py

1
knn.classifyPerson()  # prefix matches the module file knn.py

结果

1
2
3
4
percentage of time spent playing video games?3
frequent flier miles earned per year:10000
liters of ice cream consumed per year?5
you will probably like this person: in small doses

4.手写识别系统

4.1图形转为向量

knn.py

1
2
3
4
5
6
7
8
9
10
11
#convert img to vector
# Flatten a 32*32 text "image" into a 1*1024 vector.
from os import listdir  # list file names in a directory (used by handwritingClassTest)

def img2vector(filename):
    """Read a 32x32 grid of 0/1 characters and return a 1x1024 numpy vector."""
    returnVect = zeros((1, 1024))  # 1*1024 numpy vector
    with open(filename) as fr:  # close the handle deterministically
        for i in range(32):  # read 32 rows
            lineStr = fr.readline()
            for j in range(32):
                returnVect[0, 32 * i + j] = int(lineStr[j])  # store each digit character
    return returnVect

knnTest.py

1
2
3
# prefix kept consistent with the module file name knn.py
testVector = knn.img2vector('testDigits/0_13.txt')
print(testVector[0, 0:31])
print(testVector[0, 32:63])

结果

1
2
3
4
[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  1.  1.  1.
0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[ 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 1. 1. 1. 1. 1.
1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

4.2.k-近邻算法识别手写数字

knn.py

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
#test numbers using kNN classify0
def handwritingClassTest():
    """Classify the handwritten-digit test images with kNN and print the error rate.

    File names encode the true digit before the underscore, e.g. '0_13.txt'
    is a sample of the digit 0.
    """
    hwLabels = []
    trainingFileList = listdir('trainingDigits')
    m = len(trainingFileList)  # number of training images
    trainingMat = zeros((m, 1024))
    for i in range(m):
        fileNameStr = trainingFileList[i]  # i-th image file name
        fileStr = fileNameStr.split('.')[0]  # strip the extension: '0_13.txt' -> '0_13'
        classNumStr = int(fileStr.split('_')[0])  # digit label before the underscore
        hwLabels.append(classNumStr)
        trainingMat[i, :] = img2vector('trainingDigits/%s' % fileNameStr)

    testFileList = listdir('testDigits')
    errorCount = 0.0
    mTest = len(testFileList)
    for i in range(mTest):
        fileNameStr = testFileList[i]  # i-th test file name
        fileStr = fileNameStr.split('.')[0]  # strip the extension
        classNumStr = int(fileStr.split('_')[0])  # true digit of this test image
        vectorUnderTest = img2vector('testDigits/%s' % fileNameStr)  # image -> vector
        classifierResult = classify0(vectorUnderTest, trainingMat, hwLabels, 3)
        print('the classifier came back with:%d,the real answer is:%d'
              % (classifierResult, classNumStr))
        if classifierResult != classNumStr:
            errorCount += 1.0
    print('\nthe total number of errors is :%d' % errorCount)
    print('\nthe total error rate is :%f' % (errorCount / float(mTest)))

knnTest.py

1
knn.handwritingClassTest()  # prefix matches the module file knn.py

结果

1
2
3
4
5
6
7
8
9
10
the classifier came back with:9,the real answer is:9
the classifier came back with:7,the real answer is:7
the classifier came back with:7,the real answer is:7
...,
the classifier came back with:4,the real answer is:4
the classifier came back with:5,the real answer is:5

the total number of errors is :11

the total error rate is :0.011628

这个算法并不高效,有900个测试图片;
对于每一个测试图片,首先要通过一步复杂度为1024的运算转化为测试向量;
每个测试向量都要运行900次距离计算;
每次距离计算都是1024个浮点数的计算。。。
有没有一种算法更加节省空间和时间呢?
跟着《机器学习实战》这本书的步伐,很快我们就知道,有一种叫做k决策树的大佬,
据说是k-近邻算法的优化版,可以大大地节省计算开销。