基于python的爬虫爬取百度图片

Wanda ·

更新时间:2024-11-13

· 971 次阅读

   前段时间参加学校安排的实训，跟着学了几天，用 Python 写了一个人脸颜值分析程序：先用爬虫从百度图片下载图片，再调用百度人脸识别接口给每张图片打分，最后按颜值等级把图片分到多个文件夹里，每个文件夹里的图片颜值相近。代码如下：

from icrawler.builtin import BaiduImageCrawler  # 爬虫
from aip import AipFace  # 人脸识别
import base64  # 转码
import os  # 文件读写：分类移动文件
import time  # 时间模块：控制访问频率
# Replace the following three values with your own credentials.
APP_ID = '########'
API_KEY = '###########'
SECRET_KEY = '#######################'
# Options passed along with every detect request; choose the fields you need.
options = {'face_field': 'age,beauty', 'max_face_num': 4}
# Shared face-recognition client used by detect_face().
client = AipFace(APP_ID, API_KEY, SECRET_KEY)
def baidu_image_crawler(keyword, maxnum):
    """Crawl Baidu image search and save the results locally.

    Downloads are stored under the relative directory ``baidu``.

    :param keyword: search keyword to crawl for.
    :param maxnum: maximum number of images to download.
    :return: None
    """
    crawler = BaiduImageCrawler(
        parser_threads=2,
        downloader_threads=4,
        storage={'root_dir': 'baidu'},
    )
    crawler.crawl(keyword=keyword, max_num=maxnum)
def get_file_content(file_path):
    """Read a local image file and return its content as Base64 text.

    :param file_path: path of the image file to read.
    :return: the file's bytes, Base64-encoded and decoded to a ``str``.
    """
    # The context manager guarantees the handle is closed even when
    # reading or encoding raises.
    with open(file_path, 'rb') as image_file:
        encoded = base64.b64encode(image_file.read()).decode('utf-8')
    print('转换后的base64：', encoded)
    return encoded
def detect_face(base64):
    """Submit a Base64-encoded image to the face-detection client.

    Note that the parameter name shadows the ``base64`` module inside this
    function; the module is not needed here.

    :param base64: Base64 text of the image to analyse.
    :return: the value returned by ``client.detect`` (the detection result).
    """
    image_type = "BASE64"
    return client.detect(base64, image_type, options)
def parse_result(data, delay=0.5):
    """Map a face-detection response to a beauty level from 1 to 10.

    :param data: response dict returned by the face-detection service.
    :param delay: seconds to sleep before parsing, used to throttle
        successive requests; pass 0 to skip the pause.
    :return: a level from 1 to 10 derived from the ``beauty`` score of the
        first detected face, or -1 when the response reports an error or
        contains no usable score.
    """
    if delay > 0:
        time.sleep(delay)

    # A non-zero (or missing) error code means the analysis failed.
    if data.get('error_code') != 0:
        print("解析出错")
        return -1

    # Drill down defensively: a response without faces or without a
    # beauty score is treated like a failed analysis instead of crashing.
    faces = (data.get('result') or {}).get('face_list') or []
    score = faces[0].get('beauty') if faces else None
    if score is None or score < 0:
        print("解析出错")
        return -1

    # Grading thresholds, highest first; adjust to taste.
    # Levels 10..2 pair with the lower bounds 95..15; anything below is 1.
    thresholds = (95, 85, 75, 65, 55, 45, 35, 25, 15)
    for level, floor in zip(range(10, 1, -1), thresholds):
        if score >= floor:
            return level
    return 1
def classify():
    """Sort the downloaded images into sub-folders by beauty level.

    Every regular file directly inside ``baidu`` is run through face
    detection and moved to ``baidu/<level>/``. Files whose analysis fails
    (level -1) are moved to ``baidu/未识别/`` instead.
    """
    rootdir = 'baidu'  # root directory holding the downloaded images
    for name in os.listdir(rootdir):
        path = os.path.join(rootdir, name)
        # Only plain files are images; sub-folders are earlier results.
        if not os.path.isfile(path):
            continue

        level = parse_result(detect_face(get_file_content(path)))
        print(level)

        # Pick the destination folder: failures go to a dedicated folder,
        # everything else to a folder named after its level.
        folder = '未识别' if level == -1 else str(level)
        target_dir = os.path.join(rootdir, folder)
        os.makedirs(target_dir, exist_ok=True)
        os.rename(path, os.path.join(target_dir, name))
if __name__ == '__main__':
    # Collect the search keyword and the number of images from the user.
    search_term = input('请输入要爬取的关键字:')
    raw_count = input('请输入要爬取的数量：')
    # Crawl first, then sort the downloaded images into level folders.
    baidu_image_crawler(search_term, int(raw_count))
    classify()

结果如下：
最后会生成这种以数字命名的文件夹，数字代表不同的颜值等级，点开里边都是蔡徐坤~

本人小白一枚，希望可以向大家多多学习哦~~