'''
1.访问 https://movie.douban.com/top250
2.用代码实现访问排行榜5-10页。(30分)
3.提取出电影的名字,导演,评分 (40分)
4.将数据保存到top_title.txt中。(10分)
第五页 https://movie.douban.com/top250?start=100&filter=
第六页 https://movie.douban.com/top250?start=125&filter=
'''
import requests
from lxml import etree
import re
# Value substituted into the `start=` query parameter of the list URL.
# 100 is the offset of page 5 (see the example URLs in the module docstring);
# the scraping code further down increases it in steps of 25.
mindex = 100
def getres(mstr):
    """Return the first item of *mstr*, or None when *mstr* is empty/falsy.

    Intended for XPath / ``re.findall`` result lists, where "no match" is an
    empty list and indexing it directly would raise IndexError.
    """
    return mstr[0] if mstr else None
# HTTP request headers sent with the requests.get() call below.
headers = {
# Browser-like User-Agent string (identifies as desktop Chrome 81 on Windows 10).
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36',
# Content types advertised to the server as acceptable.
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
# NOTE(review): hard-coded cookie values, presumably copied from a browser
# session -- confirm they are not sensitive and whether they can expire.
'Cookie': 'bid=f0UsWskEg4E; douban-fav-remind=1; ll="108302"; push_doumail_num=0; push_noty_num=0; douban-profile-remind=1; ap_v=0,6.0'
}
# Scrape six consecutive pages of the Douban Top 250 list (pages 5-10 when
# mindex starts at 100) and append every movie's title, director and rating
# to a per-page text file named "<page number>页.txt".
#
# NOTE(review): the task statement at the top of the file names
# top_title.txt as the output file; the per-page file names used by the
# script are kept unchanged here -- confirm which one is wanted.

# Director text runs from "导演:" up to the three non-breaking spaces that
# precede the cast, or to the end of the line when the cast part is
# truncated or absent. The captured text keeps its leading space so the
# written record looks the same as before.
_DIRECTOR_RE = re.compile(r'导演:(.*?)(?:\xa0\xa0\xa0|\n|$)')
# Separator characters (non-breaking space and '/') found in alternate titles.
_TITLE_SEP_RE = re.compile(r'[\xa0/]')

for i in range(6):
    # Take this page's offset and advance the module-level counter right
    # away, so that skipping a page below can never refetch the same offset.
    start = mindex
    mindex += 25

    res = requests.get(
        f'https://movie.douban.com/top250?start={start}&filter=',
        headers=headers,
        timeout=10,  # without a timeout a stalled connection blocks forever
    )
    if not res.ok:
        # An error page contains no movie list; report it instead of
        # silently producing nothing.
        print(f'page {i + 5}: HTTP {res.status_code}, skipped')
        continue

    html = etree.HTML(res.text)
    if html is None:
        # Defensive: nothing parseable came back for this page.
        continue

    records = []
    # Iterate per <li> so that a missing field in one movie cannot shift the
    # fields of the following movies.
    for li in html.xpath('//ol[@class="grid_view"]/li'):
        # --- title ---
        name = li.xpath('.//div[@class="hd"]/a//span/text()')
        if not name:
            continue  # no title: nothing meaningful to record
        # Strip the separators from the alternate titles; the primary title
        # at index 0 is left untouched.
        name[1:] = [_TITLE_SEP_RE.sub('', alias) for alias in name[1:]]
        print(name)

        # --- director ---
        info = li.xpath('.//div[@class="bd"]/p/text()')
        found = _DIRECTOR_RE.search(info[0]) if info else None
        daoyan = found.group(1) if found else None

        # --- rating ---
        # Pass the whole result list to getres(): indexing first and then
        # calling getres() on the string kept only its first character.
        fen = getres(li.xpath('.//div[@class="star"]/span[2]/text()'))

        records.append(f'{name[0]}\n导演:{daoyan}\n评分:{fen}\n\n')

    # Open the page's file once instead of once per movie; append mode is
    # kept, and no file is created when the page yielded no movies.
    if records:
        with open(f'{i + 5}页.txt', 'a', encoding='utf-8') as fw:
            fw.writelines(records)