1、sklearn库
# --- Approach 1: scikit-learn KNN on the raw (unscaled) features ---
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
# NOTE(review): '\d' happens not to be an escape sequence so this path works,
# but a raw string r'D:\datingTestSet2.txt' would be safer for Windows paths.
df = pd.read_table('D:\datingTestSet2.txt',header = None)
data=df.iloc[:,0:3]  # first three columns are the features
target=df.iloc[:,3]  # fourth column is the class label
x_train,x_test, y_train, y_test =train_test_split(data,target,test_size=0.1, random_state=0)
# Distance-weighted vote among the 10 nearest neighbors.
knn=KNeighborsClassifier(weights='distance',n_neighbors=10)
knn.fit(x_train,y_train)
y_predict=knn.predict(x_test)
score=knn.score(x_test,y_test,sample_weight=None)  # mean accuracy on the 10% test split
print(score)
输出0.75,继续学习。
2、自定义函数,up!
# --- Approach 2: same split, but classification done by a hand-rolled KNN ---
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_table('D:\datingTestSet2.txt',header = None)
data=df.iloc[:,0:3]  # features
target=df.iloc[:,3]  # labels
x_train,x_test, y_train, y_test =train_test_split(data,target,test_size=0.1, random_state=0)
results=[]  # NOTE(review): overwritten with a scalar in the loop below; list is never used
# Convert pandas objects to plain ndarrays for positional indexing in classify0.
x_test_array=np.array(x_test)
y_test_array=np.array(y_test)
x_train_array=np.array(x_train)
y_train_array=np.array(y_train)
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbors.

    Parameters
    ----------
    inX : array-like, shape (n_features,)
        The sample to classify.
    dataSet : ndarray, shape (n_samples, n_features)
        Training feature matrix.
    labels : sequence of length n_samples
        Training labels aligned with the rows of ``dataSet``.
    k : int
        Number of neighbors that vote.

    Returns
    -------
    The label occurring most often among the k nearest neighbors
    (ties go to the label counted first, i.e. the nearer neighbor).
    """
    # Euclidean distance from inX to every training row.
    # Broadcasting replaces the original bare `tile` call, which was a
    # NameError here (numpy is only imported as `np`).
    diffMat = dataSet - np.asarray(inX)
    distances = np.sqrt((diffMat ** 2).sum(axis=1))
    # Row indices sorted by increasing distance.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # `operator` was never imported in this script; builtin max() with a key
    # reproduces the stable descending sort's tie-breaking (first max wins).
    return max(classCount.items(), key=lambda kv: kv[1])[0]
# Evaluate the hand-rolled classifier on the held-out test set.
# FIX: the original iterated range(99) — skipping the last test sample —
# while dividing by a hard-coded 100. Iterate the whole test set and
# divide by its actual length.
count = 0
for j in range(len(x_test_array)):
    # Predict with k=10, same as the sklearn run above.
    predicted = classify0(x_test_array[j], x_train_array, y_train_array, 10)
    if predicted == y_test_array[j]:
        count = count + 1
print(count / len(x_test_array))  # accuracy over the full test split
输出0.73,怎么还下降了,悲桑
3、分析原因
| 样本 | 玩游戏所耗时间 | 飞行常客里程数 | 冰淇淋 | 样本分类 |
|------|----------------|----------------|----------|----------|
| 1 | 0.953952 | 40920 | 8.326976 | 3 |
| 2 | 1.673904 | 14488 | 7.1534691 | 2 |
| 3 | 0.805124 | 26052 | 1.441871 | 1 |
| 4 | 0.428964 | 75136 | 13.147394 | 1 |
计算一下两个样本间的距离(欧式),公里数太大了,对计算结果的影响远远超过了其余两项,归一化吧那就!
4、sklearn+归一化
# --- Approach 4: scikit-learn KNN with min-max normalized features ---
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
df = pd.read_table('D:\datingTestSet2.txt',header = None)
data=df.iloc[:,0:3]  # features
target=df.iloc[:,3]  # labels
x_train,x_test, y_train, y_test =train_test_split(data,target,test_size=0.1, random_state=0)
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` into [0, 1].

    Parameters
    ----------
    dataSet : ndarray or DataFrame, shape (n_samples, n_features)

    Returns
    -------
    Object of the same shape with each feature rescaled as
    ``(x - min) / (max - min)``.

    NOTE(review): a constant column divides by zero; also, callers
    normalize the test set with its OWN min/max instead of the training
    set's statistics — confirm that is intended.
    """
    minVals = dataSet.min(0)  # per-column minimum
    ranges = dataSet.max(0) - minVals  # per-column span
    # Broadcasting replaces the np.tile copies; the original also
    # pre-allocated np.zeros(...) only to overwrite it immediately.
    return (dataSet - minVals) / ranges
results=[]  # NOTE(review): unused leftover from the hand-rolled version
# Scale features to [0, 1]; note train and test are normalized independently.
x_test_array=np.array(autoNorm(x_test))
y_test_array=np.array(y_test)
x_train_array=np.array(autoNorm(x_train))
y_train_array=np.array(y_train)
# Same distance-weighted 10-NN as approach 1, now on normalized features.
knn=KNeighborsClassifier(weights='distance',n_neighbors=10)
knn.fit(x_train_array,y_train_array)
y_predict=knn.predict(x_test_array)
score=knn.score(x_test_array,y_test_array,sample_weight=None)  # mean test accuracy
print(score)
输出0.9,相较于0.75提升不小,非常满意。
5、自定义+归一化
# --- Approach 5: hand-rolled KNN with min-max normalized features ---
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
df = pd.read_table('D:\datingTestSet2.txt',header = None)
data=df.iloc[:,0:3]  # features
target=df.iloc[:,3]  # labels
x_train,x_test, y_train, y_test =train_test_split(data,target,test_size=0.1, random_state=0)
def autoNorm(dataSet):
    """Min-max scale every column of ``dataSet`` into [0, 1].

    Parameters
    ----------
    dataSet : ndarray or DataFrame, shape (n_samples, n_features)

    Returns
    -------
    Object of the same shape with each feature rescaled as
    ``(x - min) / (max - min)``.

    NOTE(review): a constant column divides by zero; also, callers
    normalize the test set with its OWN min/max instead of the training
    set's statistics — confirm that is intended.
    """
    minVals = dataSet.min(0)  # per-column minimum
    ranges = dataSet.max(0) - minVals  # per-column span
    # FIX: the original called bare `zeros`, `shape` and `tile`, which are
    # NameErrors here (numpy is only imported as `np`). Broadcasting does
    # the same subtraction/division without the tiled copies.
    return (dataSet - minVals) / ranges
results=[]  # NOTE(review): overwritten with a scalar in the loop below; list is never used
# Normalize then convert to plain ndarrays for positional indexing in classify0.
x_test_array=np.array(autoNorm(x_test))
y_test_array=np.array(y_test)
x_train_array=np.array(autoNorm(x_train))
y_train_array=np.array(y_train)
def classify0(inX, dataSet, labels, k):
    """Classify ``inX`` by majority vote among its k nearest neighbors.

    Parameters
    ----------
    inX : array-like, shape (n_features,)
        The sample to classify.
    dataSet : ndarray, shape (n_samples, n_features)
        Training feature matrix.
    labels : sequence of length n_samples
        Training labels aligned with the rows of ``dataSet``.
    k : int
        Number of neighbors that vote.

    Returns
    -------
    The label occurring most often among the k nearest neighbors
    (ties go to the label counted first, i.e. the nearer neighbor).
    """
    # Euclidean distance from inX to every training row.
    # Broadcasting replaces the original bare `tile` call, which was a
    # NameError here (numpy is only imported as `np`).
    diffMat = dataSet - np.asarray(inX)
    distances = np.sqrt((diffMat ** 2).sum(axis=1))
    # Row indices sorted by increasing distance.
    sortedDistIndicies = distances.argsort()
    classCount = {}
    for i in range(k):
        voteIlabel = labels[sortedDistIndicies[i]]
        classCount[voteIlabel] = classCount.get(voteIlabel, 0) + 1
    # `operator` was never imported in this script; builtin max() with a key
    # reproduces the stable descending sort's tie-breaking (first max wins).
    return max(classCount.items(), key=lambda kv: kv[1])[0]
# Evaluate the hand-rolled classifier on the normalized test set.
# FIX: the original iterated range(99) — skipping the last test sample —
# while dividing by a hard-coded 100. Iterate the whole test set and
# divide by its actual length. (A block of commented-out, never-run
# error-counting code was removed.)
count = 0
for j in range(len(x_test_array)):
    # Predict with k=10, same as the sklearn run above.
    predicted = classify0(x_test_array[j], x_train_array, y_train_array, 10)
    if predicted == y_test_array[j]:
        count = count + 1
print(count / len(x_test_array))  # accuracy over the full test split
输出结果0.89,just so so,复制半天不如sklearn,一把辛酸泪。
6、《机器学习实战》中分类器的错误率只有2.4%(好羡慕),莫非是k值的选取?改成3试一下。sklearn方法没有变化,依旧是0.9,自定义竟然下降到了0.88。不得不说这真是一门神奇的学科,至于10%和2.4%之间的差距,小白还要慢慢探索(能得到0.9非常开心),希望大神们指引一下~非常感谢。
蜗牛蜗牛你等等我
原创文章 3获赞 0访问量 40
关注
私信
展开阅读全文
作者:蜗牛蜗牛你等等我