前一篇博客内写的代码在运行过程中还出了很多错误。而且洛谷网页也有一些奇怪的地方,会使程序报错,因此花两天修改了代码,十分感谢大佬的指点。
写代码就是不断找错和不断改错的过程,还得继续虚心学习。
换关键词后网页源代码元素有变化,导致无法用原来的语句循环定位。
出现了无法换关键词的问题。
洛谷网页跳转慢,经常出现爬取到上一页的内容或者无法获取下一页元素的问题,需要在爬取前用 time.sleep() 等待几秒。
爬取效率太低,需要优化程序甚至换思路。
发现用 xlsxwriter 逐列写入 Excel 表格时最多只能写入两列,超过两列即报错。排查后发现是有的列表没有获取完整,需要完善爬虫代码。
出现了全局变量和局部变量重名的错误,在函数内用 global 声明该变量即可,因为我本来的思路就是要在函数内修改全局变量。
发现每次在题库末页切换题库的时候总会出错。上网查了一下,貌似是因为网页元素尚未加载完,导致无法定位元素;但是我已经加了隐式等待,甚至在切换题库的位置加了强制等待4秒的指令,仍然报错。故尝试不爬取末页资料,发现可行。
为了能爬取最后一页,改用 try/except 语句,使程序不论报什么错都能继续运行,验证可行。
上述错误都已解决。
现附上修改后的代码:
from selenium import webdriver
import xlsxwriter
import time
# Page total of the problem set most recently opened by start_crawler();
# try_0() compares it against known totals to decide what to do next.
count = 0
# Configure Firefox not to load images, to speed things up
options = webdriver.FirefoxProfile()
options.set_preference('permissions.default.image', 2)
# Create the Firefox driver, passing the profile configured above
wd = webdriver.Firefox(options)
# Open the problem list page
wd.get("https://www.luogu.com.cn/problem/list")
# Set a 10-second implicit wait on the driver.
# NOTE(review): the original note described this as a page-load timeout; in
# Selenium the implicit wait appears to govern element lookups instead —
# confirm against the WebDriver documentation.
wd.implicitly_wait(10)
def start_crawler():
    """Scrape every page of the currently selected problem set.

    Reads the page total from the pagination widget, stores it in the
    module-level ``count``, and then, for each page, appends exactly one
    entry per problem row to each module-level result list (``numbers``,
    ``names``, ``levels``, ``passing_rates``, ``sources``, ``hands``,
    ``methods``) so that all columns stay aligned.

    Raises:
        ValueError: if the page total text is not an integer.
        IndexError: if a pass-rate cell contains no ``/``.
        Any exception the WebDriver raises when an element cannot be
        located or clicked.
    """
    # Toggle that shows/hides the algorithm tags; clicked twice per page.
    toggle_xpath = '//*[@id="app"]/div[2]/main/div/div/div/div[1]/div[1]/div/div[4]/span/a'
    next_page_xpath = '//*[@id="app"]/div[2]/main/div/div/div/div[2]/div/div/div/button[10]'

    time.sleep(2)
    # Read the number of pages
    page_number = wd.find_element_by_xpath('//*[@id="app"]/div[2]/main/div/div/div/div[2]/div/div/span/strong')
    total_number = int(page_number.get_attribute('textContent'))
    global count
    count = total_number
    for i in range(total_number):
        # Locate the rows to scrape on the current page
        exercises = wd.find_elements_by_class_name('row')
        for exercise in exercises:
            number = exercise.find_element_by_class_name('pid')
            numbers.append(number.get_attribute('textContent'))
            # The fixed slices below trim padding around the cell text —
            # presumably template whitespace; verify if the markup changes.
            name = exercise.find_element_by_class_name('color-default')
            names.append(name.get_attribute('textContent')[9:-7])
            level = exercise.find_element_by_class_name('difficulty')
            levels.append(level.get_attribute('textContent')[11:-9])
            # One lookup serves both the pass-rate text and the value taken
            # from after its '/'.
            rate_cell = exercise.find_element_by_class_name('rate-popup')
            rate_text = rate_cell.get_attribute('textContent')[9:-7]
            passing_rates.append(rate_text)
            hands.append(rate_text.split('/')[1])
            source = exercise.find_element_by_class_name('tags')
            sources.append(source.get_attribute('textContent'))
        # The algorithm tags only appear after the toggle is clicked
        wd.find_element_by_xpath(toggle_xpath).click()
        for exercise in exercises:
            tag_texts = [
                tag.get_attribute('textContent')
                for tag in exercise.find_elements_by_class_name('tags-wrap')
            ]
            # Always one entry per row (possibly empty) so this column
            # cannot drift out of step with the others.
            methods.append(' '.join(tag_texts))
        wd.find_element_by_xpath(toggle_xpath).click()
        # Go to the next page; there is none after the final page
        if i < total_number - 1:
            wd.find_element_by_xpath(next_page_xpath).click()
        print(f'已成功获取第{i + 1}页')
# XPath template for the problem-set tabs; the placeholder is the <li> index.
_SET_TAB_XPATH = '/html/body/div/div[2]/main/div/section/div/section[2]/div/div/div/ul/li[{}]/a'
# Page total of the set just scraped -> <li> index of the tab to open next.
_NEXT_TAB_BY_PAGE_TOTAL = {109: 2, 118: 3, 74: 4, 56: 5}
# Page total of the final set; once it has been scraped the data is saved.
_LAST_SET_PAGE_TOTAL = 99


def _write_workbook():
    """Write the collected columns to 洛谷.xlsx, one list per column."""
    workbook = xlsxwriter.Workbook('洛谷.xlsx')
    sheet = workbook.add_worksheet('资料')
    head = ['题号', '题目', '算法', '来源', '难度', '通过率', '提交量']
    # Header row
    for col, title in enumerate(head):
        sheet.write(0, col, title)
    sheet.write_column('A2', numbers)
    sheet.write_column('B2', names)
    sheet.write_column('C2', methods)
    sheet.write_column('D2', sources)
    sheet.write_column('E2', levels)
    sheet.write_column('F2', passing_rates)
    sheet.write_column('G2', hands)
    workbook.close()


def try_0():
    """Scrape each problem set in turn, then save everything to Excel.

    Runs ``start_crawler()`` for the set currently shown and uses the
    module-level ``count`` it leaves behind to pick the next step: open the
    next problem-set tab, or write the workbook once the final set
    (99 pages) has been scraped. A failure inside ``start_crawler()`` is
    reported and otherwise tolerated, so a page that fails to scrape does
    not abort the whole run.

    The workbook is written exactly once. If ``count`` matches no known
    page total the run stops with a message and nothing is written.
    """
    while True:
        try:
            start_crawler()
        except Exception as exc:
            # Best effort: keep going, but make the failure visible.
            # KeyboardInterrupt/SystemExit are deliberately not caught.
            print(f'爬取中断: {exc!r}')

        if count == _LAST_SET_PAGE_TOTAL:
            _write_workbook()
            return

        next_tab = _NEXT_TAB_BY_PAGE_TOTAL.get(count)
        if next_tab is None:
            print(f'未知的题库页数 {count},已停止')
            return
        wd.find_element_by_xpath(_SET_TAB_XPATH.format(next_tab)).click()
# Module-level lists that accumulate the scraped data; start_crawler()
# fills them and try_0() writes each one out as a spreadsheet column.
numbers = []  # problem ids (column A)
names = []  # problem titles (column B)
levels = []  # difficulty (column E)
passing_rates = []  # pass rate (column F)
sources = []  # source tags (column D)
hands = []  # submission counts (column G)
methods = []  # algorithm tags (column C)
# Click to expand the advanced search options
wd.find_element_by_xpath('/html/body/div/div[2]/main/div/section/div/div[2]/span').click()
# Start the crawl
try_0()