# 具体数据:电影链接、电影名称、电影评分、评分人数、电影概括
import pymysql
import xlwt
from bs4 import BeautifulSoup
from urllib import request
import re
# Scraper configuration: target URL, request headers, and the output workbook.
baseurl = 'https://movie.douban.com/top250?start='
headers = {
    'User-Agent': 'XXXXX',
    'Referer': 'https://movie.douban.com/top250?start=25&filter='
}
data_list = []  # accumulates one row (list of 5 fields) per movie
book = xlwt.Workbook(encoding='utf-8', style_compression=0)  # output .xls workbook
sheet = book.add_sheet('豆瓣Top')  # single worksheet for all rows
col = ("电影链接", "电影名称", "评分", "人数", "概括")  # header-row labels
for col_idx, title in enumerate(col):  # write the header row (row 0)
    sheet.write(0, col_idx, title)
# Regex patterns for pulling each movie's fields out of the item HTML.
# NOTE: the originals were empty/broken patterns (the HTML tags had been
# stripped), so nothing could ever match; reconstructed from the Douban
# Top-250 markup. Compiled once here instead of once per page.
findLink = re.compile(r'<a href="(.*?)">')  # movie detail-page URL
findName = re.compile(r'<span class="title">(.*?)</span>', re.S)  # title (CN title matches first)
findGoal = re.compile(r'<span class="rating_num" property="v:average">(.*?)</span>')  # rating score
findNum = re.compile(r'<span>(\d*)人评价</span>')  # number of raters
findInq = re.compile(r'<span class="inq">(.*?)</span>')  # one-line summary (may be absent)

for k in range(0, 10):  # 10 pages x 25 movies = Top 250
    url = baseurl + f'{k*25}'
    req = request.Request(url, headers=headers)  # attach the request headers
    response = request.urlopen(req)
    html = response.read().decode('utf-8')  # page source
    soup = BeautifulSoup(html, 'html.parser')
    for item in soup.find_all('div', class_="info"):  # one div per movie
        item = str(item)
        link = findLink.findall(item)
        name = findName.findall(item)
        goal = findGoal.findall(item)
        num = findNum.findall(item)
        inq = findInq.findall(item)
        # Append scalar strings, not findall() lists: both sheet.write and the
        # later SQL insert expect plain values. Guard against missing matches
        # (e.g. some movies have no summary line) instead of indexing blindly.
        data = [
            link[0] if link else '',
            name[0].strip() if name else '',  # strip surrounding whitespace
            goal[0] if goal else '',
            num[0] if num else '',
            inq[0] if inq else '',
        ]
        data_list.append(data)
        print(f'第{len(data_list)}条')
# Write the scraped rows into the worksheet, starting below the header row.
for row_idx, row in enumerate(data_list, start=1):
    for col_idx in range(len(col)):
        sheet.write(row_idx, col_idx, row[col_idx])
book.save('douban_top.xls')  # persist the workbook to disk
# Persist the scraped rows into MySQL.
conn = pymysql.connect(host='localhost', user='root', password='XXXX',
                       database='XX', cursorclass=pymysql.cursors.DictCursor)
cursor = conn.cursor()
# Parameterized INSERT: the driver escapes the values. The original built the
# statement by string concatenation, which breaks on any quote character in a
# title/summary and is SQL-injectable.
sql = ('insert into 豆瓣top250_1 (电影链接,电影名称,评分,人数,概括) '
       'values (%s, %s, %s, %s, %s)')
for i, row in enumerate(data_list):
    print(f'正在写入第{i+1}条!')
    try:
        cursor.execute(sql, [str(v) for v in row[:5]])  # coerce fields to str, as before
        conn.commit()
    except Exception as err:
        print(err)  # best-effort: log the failure and continue with the next row
conn.close()  # release the database connection