爬取豆瓣读书的图书信息和评论信息

Tamara ·

更新时间:2024-11-13

· 966 次阅读

最近在做毕业设计，需要收集用户的评分数据做协同过滤算法，同时收集评论数据做情感分析

注意一个比较坑的地方：豆瓣图书可以没有评分，用户也可能只写了评论却没有打分。另外豆瓣图书ID的编排方式也很让人无奈，热门书籍的ID附近总是冷门书籍（无评分、无评论的那种），所以程序经常输出failed。

fake_useragent的用法

在这次爬虫中使用了fake_useragent来伪造请求头，因为听说豆瓣的反爬机制比较好
fake_useragent的基本用法如下，ua.random每次会随机返回一个User-Agent字符串，把它放进请求头即可

from fake_useragent import UserAgent
import requests
# Minimal fake_useragent demo: send one GET request with a randomly chosen
# User-Agent, then show what was sent and what came back.
ua = UserAgent()
url = "https://www.baidu.com"  # page to request
headers = {"User-Agent": ua.random}  # request headers with a random User-Agent
response = requests.get(url=url, headers=headers)  # perform the request

print(headers)
print(response.status_code)  # HTTP status code of the response
# Dump every response header as a (name, value) tuple.
for header_pair in response.headers.items():
    print(header_pair)

爬取豆瓣读书的图书信息和评论信息

首先需要观察的是下面这些链接
https://book.douban.com/subject/26953606/ 图书信息页面
https://book.douban.com/subject/26953606/comments/ 第一页评论页面
https://book.douban.com/subject/26953606/comments/hot?p=2 第二页评论页面
可以看到前面都是相同的 https://book.douban.com/subject/ 再加一个图书id；评论页面在后面接一个 /comments/，第二页评论再接一个 hot?p=2，由此类推第3页是 hot?p=3
其中一些写入文本的操作省略了或者注释掉了，成了输出操作，因为大部分人玩爬虫不会去搞机器学习或者深度学习
第二天又修改了一下，热门图书的分布实在是太稀疏了，所以在程序里先判断评论总数是否超过一千：超过一千条才爬取这本书，否则直接跳过（continue）

#coding=utf-8
# Download ratings and comments for Douban books; four tables are needed:
# Table 1: book ID, book title, average score
# Table 2: user ID, user name
# Table 3: book ID, hot comments
# Table 4: book ID, user ID, rating, rating time
import requests
import time
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
# Request headers passed to requests.get in get_comments. ua.random is read
# once here, so the same User-Agent string is reused for the whole run.
ua = UserAgent()
header = {
    'User-Agent': ua.random
}
def get_score(book_id, text):
    """Parse a book page and print the book's title and average score.

    Args:
        book_id: Douban book ID; only used in the printed messages.
        text: HTML source of the book's subject page.

    Returns:
        None. The title and score are printed; if they cannot be extracted
        (for example the title or score element is missing from the page),
        a "failed" line is printed instead.
    """
    soup = BeautifulSoup(text, 'lxml')
    try:
        # select() returns a list; an empty list means the element is missing,
        # and the [0] lookup then raises and is reported as a failure below.
        name = soup.select("#wrapper > h1 > span")[0].string
        score = soup.select(
            "#interest_sectl > div > div.rating_self.clearfix > strong"
        )[0].string
        print("book name is " + str(name) + " and score is " + str(score))
        # Persisting the row is intentionally left out. To store it, append
        # "<book_id>,<name>,<score>\n" to BookInfo.txt here (open the file in
        # append mode so rows of earlier books are not overwritten).
    except Exception:
        # Best effort: keep crawling on any parsing/printing problem, but
        # unlike a bare "except:" let KeyboardInterrupt/SystemExit through
        # so the crawler can still be stopped with Ctrl+C.
        print("book " + str(book_id) + " is  failed!")
def get_user_score(score_str):
    """Convert a Douban rating label into a numeric score.

    The labels for 5, 4, 3 and 2 stars map to 10, 8, 6 and 4 respectively.
    Any other value is treated as a 1-star rating and yields 2.
    """
    rating_table = (
        ("力荐", 10),   # 5 stars
        ("推荐", 8),    # 4 stars
        ("还行", 6),    # 3 stars
        ("较差", 4),    # 2 stars
    )
    for label, points in rating_table:
        if score_str == label:
            return points
    return 2            # 1 star (and anything unrecognised)
def write_txt(soup, book_id):
    """Print the short comments and the per-user ratings found on one page.

    Args:
        soup: Parsed comments page (a BeautifulSoup object).
        book_id: Douban book ID, as an int or a str.

    Returns:
        None. Everything is printed; writing to BookComments.txt is omitted.
    """
    try:
        for comment in soup.find_all("span", "short"):   # comment texts
            print(comment.string)
        user_list = soup.find_all("span", "comment-info")   # user + rating areas
    except Exception:
        print("cannot find!")
        return

    for user_info in user_list:
        # Handle each user separately: a commenter may not have left a rating,
        # and that must not drop the remaining users on the page.
        try:
            profile_link = user_info.find("a")
            user_name = profile_link.string
            # The user id is the second-to-last "/"-separated piece of the
            # profile link.
            user_id = profile_link.attrs["href"].split("/")[-2]
            spans = user_info.find_all("span")
            score_info = spans[0].attrs["title"]    # rating label
            # Numeric rating; not printed, only needed once rows are persisted.
            score = get_user_score(score_info)
            time_info = spans[1].string             # rating time
            # str(book_id): the caller passes the id as an int, and plain
            # concatenation would raise TypeError.
            print("book_id is " + str(book_id) + " user name is " + user_name
                  + ",id is " + user_id + ",score is " + score_info
                  + " " + time_info)
        except Exception:
            # Best effort per user; KeyboardInterrupt/SystemExit still propagate.
            print("cannot find!")
def get_comments(soup, comment_url, book_id, page, max_page=2):
    """Print the comments of a book, page by page.

    Args:
        soup: Already-parsed first comments page; used when ``page`` is 1.
        comment_url: Base comments URL; "hot?p=<n>" is appended to it for
            every page after the first.
        book_id: Douban book ID, forwarded to write_txt.
        page: First page number to process (1-based).
        max_page: Last page number to process, inclusive. Defaults to 2.

    Returns:
        None. Output is produced by write_txt.
    """
    for current_page in range(page, max_page + 1):
        if current_page == 1:
            # The first page was downloaded by the caller.
            write_txt(soup, book_id)
            continue

        # Build each page URL from the unchanged base so the "hot?p=" suffix
        # does not pile up from one page to the next.
        page_url = comment_url + "hot?p=" + str(current_page)
        try:
            html = requests.get(url=page_url, headers=header, timeout=10)
        except requests.RequestException:
            print(page_url + " is failed!")
            continue
        if html.status_code != 200:
            # Skip the page instead of re-requesting it forever.
            print(page_url + " is failed!")
            continue
        write_txt(BeautifulSoup(html.text, "lxml"), book_id)
#https://book.douban.com/subject/26953606/
if __name__ == '__main__':
    url="https://book.douban.com/subject/"
    startID=26953605  # ID of the first book to try
    st = 0   # loop counter start
    lens=3   # number of books to crawl (the original note mentions 30000)
    # NOTE(review): the next line is not valid Python, and the statements that
    # define html, soup, score_url, comment_url and comment_num are missing.
    # Text between "while st" and "= 1000:" appears to have been lost when the
    # article was extracted; presumably a loop over the books plus a check of
    # comment_num against 1000 -- TODO restore from the author's original.
    while st = 1000:
                st +=1
                print(str(startID)+" is success!" + score_url + " comment_num is " + comment_num)
                text = html.text
                get_score(startID,text)
                time.sleep(random.uniform(1,3))   # pause for a random 1-3 seconds
                get_comments(soup,comment_url,startID,1)       # fetch the comment pages
                time.sleep(random.uniform(1,3))   # pause for a random 1-3 seconds
            else:
                print(score_url + " is failed!" + " comment_num is " + comment_num)
        else:
            print(str(startID)+" is failed!")

输出如下
在这里插入图片描述
中间一堆数据省略了
这是爬取到了一些冷门书籍，评论数少得可怜，所以直接忽略了