"""Use PCA and LDA together in a machine-learning pipeline.

The script compares, on the Iris dataset, a plain KNN classifier against
pipelines that first transform the features with PCA, LDA, or SelectKBest,
and then grid-searches a combined scale -> PCA -> LDA -> KNN pipeline.
"""
import matplotlib.pyplot as plt  # plotting module (imported by the original script, unused here)
from sklearn.datasets import load_iris
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.feature_selection import SelectKBest
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler


def get_best_model_and_accuracy(model, params, X, y):
    """Grid-search ``params`` for ``model`` on ``(X, y)`` and print a summary.

    Args:
        model: The estimator (or pipeline) handed to ``GridSearchCV``.
        params: The parameter grid handed to ``GridSearchCV``.
        X: Feature data passed to ``grid.fit``.
        y: Response data passed to ``grid.fit``.

    Returns:
        None. The best score, the best parameters, and the mean fit and
        score times (rounded to 3 decimals) are printed to stdout.
    """
    grid = GridSearchCV(
        model,  # the model to grid search
        params,  # the parameter set to try
        # If a parameter set raises an error, continue and record its
        # performance as 0 instead of aborting the whole search.
        error_score=0.,
    )
    grid.fit(X, y)  # fit the model and parameters

    # our classical metric for performance
    print("Best Accuracy: {}".format(grid.best_score_))
    # the best parameters that caused the best accuracy
    print("Best Parameters: {}".format(grid.best_params_))
    # the average time it took a model to fit to the data (in seconds)
    avg_time_fit = round(grid.cv_results_['mean_fit_time'].mean(), 3)
    print("Average Time to Fit (s): {}".format(avg_time_fit))
    # the average time it took a model to predict out of sample data (in seconds)
    # this metric gives us insight into how this model will perform in real-time analysis
    avg_time_score = round(grid.cv_results_['mean_score_time'].mean(), 3)
    print("Average Time to Score (s):{}".format(avg_time_score))


# load the Iris dataset
iris = load_iris()

# create X and y variables to hold the features and the response column
iris_X, iris_y = iris.data, iris.target

# Create a PCA module to keep a single component
single_pca = PCA(n_components=1)

# Create a LDA module to keep a single component
single_lda = LinearDiscriminantAnalysis(n_components=1)

# Instantiate a KNN model
knn = KNeighborsClassifier(n_neighbors=3)

# run a cross validation on the KNN without any feature transformation
knn_average = cross_val_score(knn, iris_X, iris_y).mean()

# This is a baseline accuracy. If we did nothing, KNN on its own achieves a 98% accuracy
# (bare expression below: only displayed when run interactively)
knn_average

# Let's use our LDA, which keeps only the most powerful component
lda_pipeline = Pipeline([('lda', single_lda), ('knn', knn)])
lda_average = cross_val_score(lda_pipeline, iris_X, iris_y).mean()
lda_average

# create a pipeline that performs PCA
pca_pipeline = Pipeline([('pca', single_pca), ('knn', knn)])
pca_average = cross_val_score(pca_pipeline, iris_X, iris_y).mean()
pca_average

# try LDA with 2 components
lda_pipeline = Pipeline([
    ('lda', LinearDiscriminantAnalysis(n_components=2)),
    ('knn', knn),
])
lda_average = cross_val_score(lda_pipeline, iris_X, iris_y).mean()

# Just as good as using original data
lda_average

# compare our feature transformation tools to a feature selection tool
# try all possible values for k, excluding keeping all columns
for k in [1, 2, 3]:
    # make the pipeline
    select_pipeline = Pipeline([('select', SelectKBest(k=k)), ('knn', knn)])
    # cross validate the pipeline
    select_average = cross_val_score(select_pipeline, iris_X, iris_y).mean()
    print(k, "best feature has accuracy:", select_average)

# Use the GridSearch module to find the best combination of:
#   - scaling the data (with or without mean/std)
#   - PCA components
#   - LDA components
#   - KNN neighbors
iris_params = {
    'preprocessing__scale__with_std': [True, False],
    'preprocessing__scale__with_mean': [True, False],
    'preprocessing__pca__n_components': [1, 2, 3, 4],
    # according to scikit-learn docs, max allowed n_components for LDA is
    # number of classes - 1
    'preprocessing__lda__n_components': [1, 2],
    'clf__n_neighbors': range(1, 9),
}

# make a larger pipeline
preprocessing = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA()),
    ('lda', LinearDiscriminantAnalysis()),
])
iris_pipeline = Pipeline(steps=[
    ('preprocessing', preprocessing),
    ('clf', KNeighborsClassifier()),
])

get_best_model_and_accuracy(iris_pipeline, iris_params, iris_X, iris_y)

# Output recorded by the original author:
#   1 best feature has accuracy: 0.9538398692810457
#   2 best feature has accuracy: 0.9607843137254902
#   3 best feature has accuracy: 0.9738562091503268
#   Best Accuracy: 0.9866666666666667
#   Best Parameters: {'clf__n_neighbors': 3, 'preprocessing__lda__n_components': 2, 'preprocessing__pca__n_components': 3, 'preprocessing__scale__with_mean': True, 'preprocessing__scale__with_std': False}
#   Average Time to Fit (s): 0.003
#   Average Time to Score (s):0.003

