Skr-Eric's Machine Learning Class (9) -- Speech Recognition, Image Recognition and Face Recognition


Speech Recognition

1. The Mel-Frequency Cepstral Coefficient (MFCC) Matrix

The audio input is first split into segments in chronological order, and a Fourier transform is applied to each segment to obtain its frequency distribution. From this distribution, the energy at the 13 characteristic frequencies most strongly correlated with the content of human speech is extracted to form one sample. The samples obtained from all segments are stacked row by row into a matrix, the Mel-Frequency Cepstral Coefficient (MFCC) matrix. The MFCC matrix captures the content features of the audio input and can be used to recognize what was said.
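A minimal sketch of the shape of such a matrix, using a synthetic one-second signal in place of a real recording (the 440 Hz tone and the 16 kHz sample rate are illustrative assumptions, not values from the course data); the full example below then computes and visualizes the MFCC matrix of an actual recording:

import numpy as np
import python_speech_features as sf

sample_rate = 16000                       # 16 kHz sampling rate (illustrative)
t = np.arange(sample_rate) / sample_rate  # one second of audio
sigs = np.sin(2 * np.pi * 440 * t)        # a 440 Hz tone stands in for real speech

mfcc = sf.mfcc(sigs, sample_rate)
# One row per ~25 ms frame, 13 cepstral coefficients per row by default.
print(mfcc.shape)                         # e.g. (99, 13)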

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import scipy.io.wavfile as wf
import python_speech_features as sf
import matplotlib.pyplot as mp

sample_rate, sigs = wf.read(
    '../../data/speeches/training/banana/banana01.wav')
mfcc = sf.mfcc(sigs, sample_rate)
mp.matshow(mfcc.T, cmap='jet', fignum='MFCC')
mp.title('MFCC', fontsize=20)
mp.xlabel('Sample', fontsize=14)
mp.ylabel('Feature', fontsize=14)
mp.tick_params(which='both', top=False, labeltop=False,
               labelbottom=True, labelsize=10)
mp.show()

2. Speech Recognition

For each word in the training set (for example apple and lime), the MFCC matrices of all its recordings are stacked and used to train one hidden Markov model (HMM). To recognize an unknown recording, its MFCC matrix is scored against every word's HMM, and the word whose model gives the highest score is the prediction; a minimal sketch of this decision rule follows the diagram.

    apple
      xxx.wav -> MFCC \
      xxx.wav -> MFCC  > MFCC -> HMM --- score 0.8
      ...             /                       \
                                               MFCC <- xxx.wav
    lime                                      /
      xxx.wav -> MFCC \                      /
      xxx.wav -> MFCC  > MFCC -> HMM --- score 0.9  (max -> predicted label: lime)
      ...             /
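A minimal sketch of this decision rule (the random arrays stand in for real MFCC matrices and are purely illustrative): one GaussianHMM is fitted per word, and the test sample is assigned to the best-scoring model. The complete training and testing script below applies the same pattern to real recordings.

import numpy as np
import hmmlearn.hmm as hl

models = {}
for label in ('apple', 'lime'):
    mfccs = np.random.randn(200, 13)      # stand-in for stacked training MFCCs
    model = hl.GaussianHMM(n_components=4,
                           covariance_type='diag', n_iter=1000)
    models[label] = model.fit(mfccs)

test_mfccs = np.random.randn(50, 13)      # stand-in for one test recording
# model.score returns the log-likelihood; pick the best-scoring label.
pred_label = max(models, key=lambda label: models[label].score(test_mfccs))
print(pred_label)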

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import warnings
import numpy as np
import scipy.io.wavfile as wf
import python_speech_features as sf
import hmmlearn.hmm as hl

warnings.filterwarnings('ignore', category=DeprecationWarning)
np.seterr(all='ignore')


def search_speeches(directory, speeches):
    # Recursively collect .wav files, keyed by the name of their parent directory.
    directory = os.path.normpath(directory)
    if not os.path.isdir(directory):
        raise IOError("The directory '" + directory + "' doesn't exist!")
    for entry in os.listdir(directory):
        label = directory[directory.rfind(os.path.sep) + 1:]
        path = os.path.join(directory, entry)
        if os.path.isdir(path):
            search_speeches(path, speeches)
        elif os.path.isfile(path) and path.endswith('.wav'):
            if label not in speeches:
                speeches[label] = []
            speeches[label].append(path)


train_speeches = {}
search_speeches('../../data/speeches/training', train_speeches)
train_x, train_y = [], []
for label, filenames in train_speeches.items():
    # Stack the MFCC matrices of all recordings of one word into a single array.
    mfccs = np.array([])
    for filename in filenames:
        sample_rate, sigs = wf.read(filename)
        mfcc = sf.mfcc(sigs, sample_rate)
        if len(mfccs) == 0:
            mfccs = mfcc
        else:
            mfccs = np.append(mfccs, mfcc, axis=0)
    train_x.append(mfccs)
    train_y.append(label)
# Train one Gaussian HMM per word.
models = {}
for mfccs, label in zip(train_x, train_y):
    model = hl.GaussianHMM(n_components=4, covariance_type='diag',
                           n_iter=1000)
    models[label] = model.fit(mfccs)
test_speeches = {}
search_speeches('../../data/speeches/testing', test_speeches)
test_x, test_y = [], []
for label, filenames in test_speeches.items():
    mfccs = np.array([])
    for filename in filenames:
        sample_rate, sigs = wf.read(filename)
        mfcc = sf.mfcc(sigs, sample_rate)
        if len(mfccs) == 0:
            mfccs = mfcc
        else:
            mfccs = np.append(mfccs, mfcc, axis=0)
    test_x.append(mfccs)
    test_y.append(label)
# Predict the word whose HMM gives the highest score for each test sample.
pred_test_y = []
for mfccs in test_x:
    best_score, best_label = None, None
    for label, model in models.items():
        score = model.score(mfccs)
        if (best_score is None) or (best_score < score):
            best_score, best_label = score, label
    pred_test_y.append(best_label)
print(test_y)
print(pred_test_y)

 

Image Recognition

1. OpenCV, the Open Source Computer Vision Library

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import numpy as np
import cv2 as cv

original = cv.imread('../../data/forest.jpg')
print(original.shape)
cv.imshow('Original', original)
# Keep one BGR channel at a time.
blue = np.zeros_like(original)
blue[..., 0] = original[..., 0]
cv.imshow('Blue', blue)
green = np.zeros_like(original)
green[..., 1] = original[..., 1]
cv.imshow('Green', green)
red = np.zeros_like(original)
red[..., 2] = original[..., 2]
cv.imshow('Red', red)
# Crop the central quarter of the image.
h, w = original.shape[:2]
l, t = int(w / 4), int(h / 4)
r, b = int(w * 3 / 4), int(h * 3 / 4)
cropped = original[t:b, l:r]
cv.imshow('Cropped', cropped)
'''
scaled = cv.resize(
    original, (int(w / 2), int(h / 2)))
'''
scaled = cv.resize(original, None, fx=0.5, fy=0.5)
cv.imshow('Scaled', scaled)
cv.waitKey()
cv.imwrite('../../data/blue.jpg', blue)
cv.imwrite('../../data/green.jpg', green)
cv.imwrite('../../data/red.jpg', red)
cv.imwrite('../../data/cropped.jpg', cropped)
cv.imwrite('../../data/scaled.jpg', scaled)

2. Edge Detection

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import cv2 as cv

original = cv.imread('../../data/chair.jpg')
cv.imshow('Original', original)
# Canny edge detection with hysteresis thresholds 50 and 240.
canny = cv.Canny(original, 50, 240)
cv.imshow('Canny', canny)
cv.waitKey()

3. Histogram Equalization

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import cv2 as cv

original = cv.imread('../../data/sunrise.jpg')
cv.imshow('Original', original)
gray = cv.cvtColor(original, cv.COLOR_BGR2GRAY)
cv.imshow('Gray', gray)
equalized_gray = cv.equalizeHist(gray)
cv.imshow('Equalized Gray', equalized_gray)
# For a color image, equalize only the luminance (Y) channel in YUV space.
yuv = cv.cvtColor(original, cv.COLOR_BGR2YUV)
yuv[..., 0] = cv.equalizeHist(yuv[..., 0])
equalized_color = cv.cvtColor(yuv, cv.COLOR_YUV2BGR)
cv.imshow('Equalized Color', equalized_color)
cv.waitKey()

4. Structural Features

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import cv2 as cv

original = cv.imread('../../data/table.jpg')
cv.imshow('Original', original)
gray = cv.cvtColor(original, cv.COLOR_BGR2GRAY)
cv.imshow('Gray', gray)
# Create a STAR keypoint detector
star = cv.xfeatures2d.StarDetector_create()
keypoints = star.detect(gray)
mixture = original.copy()
cv.drawKeypoints(original, keypoints, mixture,
                 flags=cv.DRAW_MATCHES_FLAGS_DRAW_RICH_KEYPOINTS)
cv.imshow('Mixture', mixture)
cv.waitKey()

5. The Feature Description Matrix of an Image

The feature description matrix of an image is derived from spatial histograms computed around the keypoints that capture the structural features of the objects in the image.
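Before the full example below, a minimal sketch of what this matrix looks like, assuming the opencv-contrib (xfeatures2d) build of OpenCV and reusing the table.jpg image from above: each row describes one keypoint, and its 128 values can be viewed as a 4x4 grid of spatial cells with an 8-bin gradient-orientation histogram per cell.

import cv2 as cv

gray = cv.cvtColor(cv.imread('../../data/table.jpg'), cv.COLOR_BGR2GRAY)
keypoints = cv.xfeatures2d.StarDetector_create().detect(gray)
_, desc = cv.xfeatures2d.SIFT_create().compute(gray, keypoints)
print(desc.shape)                  # (number of keypoints, 128)
# Roughly: one keypoint's descriptor viewed as 4x4 cells x 8 orientation bins.
print(desc[0].reshape(4, 4, 8))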

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import cv2 as cv
import matplotlib.pyplot as mp

original = cv.imread('../../data/table.jpg')
cv.imshow('Original', original)
gray = cv.cvtColor(original, cv.COLOR_BGR2GRAY)
cv.imshow('Gray', gray)
# Create a STAR keypoint detector
star = cv.xfeatures2d.StarDetector_create()
keypoints = star.detect(gray)
# Create a SIFT descriptor extractor
sift = cv.xfeatures2d.SIFT_create()
_, desc = sift.compute(gray, keypoints)
mp.matshow(desc, cmap='gray', fignum='Description')
mp.title('Description', fontsize=20)
mp.xlabel('Feature', fontsize=14)
mp.ylabel('Sample', fontsize=14)
mp.tick_params(which='both', top=False, labeltop=False,
               labelbottom=True, labelsize=10)
mp.show()

6. Object Recognition Based on the Feature Description Matrix

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import warnings
import numpy as np
import cv2 as cv
import hmmlearn.hmm as hl

warnings.filterwarnings('ignore', category=DeprecationWarning)
np.seterr(all='ignore')


def search_objects(directory):
    # Collect .jpg files, keyed by the name of their parent directory.
    directory = os.path.normpath(directory)
    if not os.path.isdir(directory):
        raise IOError("The directory '" + directory + "' doesn't exist!")
    objects = {}
    for curdir, subdirs, files in os.walk(directory):
        for jpeg in (file for file in files if file.endswith('.jpg')):
            path = os.path.join(curdir, jpeg)
            label = path.split(os.path.sep)[-2]
            if label not in objects:
                objects[label] = []
            objects[label].append(path)
    return objects


train_objects = search_objects('../../data/objects/training/')
train_x, train_y = [], []
for label, filenames in train_objects.items():
    # Stack the SIFT description matrices of all images of one object class.
    descs = np.array([])
    for filename in filenames:
        image = cv.imread(filename)
        gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
        h, w = gray.shape[:2]
        f = 200 / min(h, w)
        gray = cv.resize(gray, None, fx=f, fy=f)
        star = cv.xfeatures2d.StarDetector_create()
        keypoints = star.detect(gray)
        sift = cv.xfeatures2d.SIFT_create()
        _, desc = sift.compute(gray, keypoints)
        if len(descs) == 0:
            descs = desc
        else:
            descs = np.append(descs, desc, axis=0)
    train_x.append(descs)
    train_y.append(label)
# Train one Gaussian HMM per object class.
models = {}
for descs, label in zip(train_x, train_y):
    model = hl.GaussianHMM(n_components=4, covariance_type='diag',
                           n_iter=1000)
    models[label] = model.fit(descs)
test_objects = search_objects('../../data/objects/testing/')
test_x, test_y, test_z = [], [], []
for label, filenames in test_objects.items():
    test_z.append([])
    descs = np.array([])
    for filename in filenames:
        image = cv.imread(filename)
        test_z[-1].append(image)
        gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
        h, w = gray.shape[:2]
        f = 200 / min(h, w)
        gray = cv.resize(gray, None, fx=f, fy=f)
        star = cv.xfeatures2d.StarDetector_create()
        keypoints = star.detect(gray)
        sift = cv.xfeatures2d.SIFT_create()
        _, desc = sift.compute(gray, keypoints)
        if len(descs) == 0:
            descs = desc
        else:
            descs = np.append(descs, desc, axis=0)
    test_x.append(descs)
    test_y.append(label)
# Predict the class whose HMM gives the highest score for each test sample.
pred_test_y = []
for descs in test_x:
    best_score, best_label = None, None
    for label, model in models.items():
        score = model.score(descs)
        if (best_score is None) or (best_score < score):
            best_score, best_label = score, label
    pred_test_y.append(best_label)
i = 0
for label, pred_label, images in zip(test_y, pred_test_y, test_z):
    for image in images:
        i += 1
        cv.imshow('{} - {} {} {}'.format(
            i, label, '==' if label == pred_label else '!=', pred_label),
            image)
cv.waitKey()

 

Face Recognition

1. Video Capture

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import cv2 as cv

vc = cv.VideoCapture(0)
while True:
    frame = vc.read()[1]
    cv.imshow('VideoCapture', frame)
    # Exit when the Esc key (27) is pressed.
    if cv.waitKey(33) == 27:
        break
vc.release()
cv.destroyAllWindows()

2. Face Localization

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import cv2 as cv

# Build cascade classifiers from Haar description files
fd = cv.CascadeClassifier('../../data/haar/face.xml')
ed = cv.CascadeClassifier('../../data/haar/eye.xml')
nd = cv.CascadeClassifier('../../data/haar/nose.xml')
vc = cv.VideoCapture(0)
while True:
    frame = vc.read()[1]
    faces = fd.detectMultiScale(frame, 1.3, 5)
    for l, t, w, h in faces:
        a, b = int(w / 2), int(h / 2)
        cv.ellipse(frame, (l + a, t + b), (a, b),
                   0, 0, 360, (255, 0, 255), 2)
        face = frame[t:t + h, l:l + w]
        eyes = ed.detectMultiScale(face, 1.3, 5)
        for l, t, w, h in eyes:
            a, b = int(w / 2), int(h / 2)
            cv.ellipse(face, (l + a, t + b), (a, b),
                       0, 0, 360, (0, 255, 0), 2)
        noses = nd.detectMultiScale(face, 1.3, 5)
        for l, t, w, h in noses:
            a, b = int(w / 2), int(h / 2)
            cv.ellipse(face, (l + a, t + b), (a, b),
                       0, 0, 360, (0, 255, 255), 2)
    cv.imshow('VideoCapture', frame)
    if cv.waitKey(33) == 27:
        break
vc.release()
cv.destroyAllWindows()

3. Face Recognition

# -*- coding: utf-8 -*-
from __future__ import unicode_literals
import os
import numpy as np
import cv2 as cv
import sklearn.preprocessing as sp

fd = cv.CascadeClassifier('../../data/haar/face.xml')


def search_faces(directory):
    # Collect .jpg files, keyed by the name of their parent directory.
    directory = os.path.normpath(directory)
    if not os.path.isdir(directory):
        raise IOError("The directory '" + directory + "' doesn't exist!")
    faces = {}
    for curdir, subdirs, files in os.walk(directory):
        for jpeg in (file for file in files if file.endswith('.jpg')):
            path = os.path.join(curdir, jpeg)
            label = path.split(os.path.sep)[-2]
            if label not in faces:
                faces[label] = []
            faces[label].append(path)
    return faces


train_faces = search_faces('../../data/faces/training')
# Encode the person names as integer codes.
codec = sp.LabelEncoder()
codec.fit(list(train_faces.keys()))
train_x, train_y = [], []
for label, filenames in train_faces.items():
    for filename in filenames:
        image = cv.imread(filename)
        gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
        faces = fd.detectMultiScale(gray, 1.1, 2, minSize=(100, 100))
        for l, t, w, h in faces:
            train_x.append(gray[t:t + h, l:l + w])
            train_y.append(int(codec.transform([label])[0]))
train_y = np.array(train_y)
# Create a Local Binary Patterns Histograms (LBPH) face recognizer
model = cv.face.LBPHFaceRecognizer_create()
model.train(train_x, train_y)
test_faces = search_faces('../../data/faces/testing')
test_x, test_y, test_z = [], [], []
for label, filenames in test_faces.items():
    for filename in filenames:
        image = cv.imread(filename)
        gray = cv.cvtColor(image, cv.COLOR_BGR2GRAY)
        faces = fd.detectMultiScale(gray, 1.1, 2, minSize=(100, 100))
        for l, t, w, h in faces:
            test_x.append(gray[t:t + h, l:l + w])
            test_y.append(int(codec.transform([label])[0]))
            a, b = int(w / 2), int(h / 2)
            cv.ellipse(image, (l + a, t + b), (a, b),
                       0, 0, 360, (255, 0, 255), 2)
        test_z.append(image)
test_y = np.array(test_y)
pred_test_y = []
for face in test_x:
    pred_code = model.predict(face)[0]
    pred_test_y.append(pred_code)
escape = False
while not escape:
    for code, pred_code, image in zip(test_y, pred_test_y, test_z):
        label, pred_label = codec.inverse_transform([code, pred_code])
        text = '{} {} {}'.format(
            label, '==' if code == pred_code else '!=', pred_label)
        cv.putText(image, text, (10, 60),
                   cv.FONT_HERSHEY_SIMPLEX, 2, (255, 255, 255), 6)
        cv.imshow('Recognizing Face...', image)
        if cv.waitKey(1000) == 27:
            escape = True
            break

For more lessons, follow SkrEric's Programming Class on WeChat.


Author: Skr-Eric



