This article uses DXY (丁香医生) as an example.
The crawl is organized by medical subject: each subject's Q&A content is scraped and saved to its own text file in question-and-answer form.
The code is as follows:
from bs4 import BeautifulSoup
import json
import requests
import time
import random


def get_static_url_content(url):
    # Fetch a page with a browser-like User-Agent and return the parsed soup.
    headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.120 Safari/537.36'}
    req = requests.get(url, headers=headers)
    print(req.status_code)
    content = req.text
    bsObj = BeautifulSoup(content, 'lxml')
    return bsObj
def start_crawler(subject):
    url = 'https://ask.dxy.com/questions/%s/' % subject
    res = get_static_url_content(url)
    title = res.find('title').text
    print(title)
    bs = res.find_all('div', {'class': 'question-matching-questions-item'})
    with open('%s.txt' % subject, 'w', encoding='utf-8') as f:
        # First page: the Q&A pairs are embedded directly in the HTML.
        for i in bs:
            question = i.find('div', {'class': 'question-dialog dialog-of-patient'})
            answer = i.find('div', {'class': 'question-dialog dialog-of-doctor'})
            if question is None or answer is None:
                continue
            f.write('patient:' + question.text + ' doctor:' + answer.text + '\n')
        # Remaining pages: a JSON endpoint serves the paginated question list.
        for i in range(2, 40):
            ask_url = 'https://ask.dxy.com/view/i/question/list/section?section_group_name=%s&page_index=%s' % (subject, str(i))  # page_index takes the page number
            time.sleep(random.randint(3, 10))
            res = get_static_url_content(ask_url)
            downloads_questions = res.find('body').text
            try:
                json_str = json.loads(downloads_questions)
                items_list = json_str['data']['items']
                dialogue = {}  # reset per page so earlier pages are not written twice
                for item in items_list:
                    dialogs = item['dialogs']
                    all_answer = []
                    questions = []
                    for j in dialogs:
                        if j['user_id'] == 0:  # user_id 0 marks the patient
                            questions.append(j['content'])
                        else:
                            all_answer.append(j['content'])
                    dialogue[str(questions)] = all_answer
                for key, value in dialogue.items():
                    f.write('patient:' + str(key) + ' doctor:' + str(value) + '\n')
            except (json.JSONDecodeError, KeyError):
                print('page:', i)
                print('page error:', ask_url)
if __name__ == '__main__':
    # Subject slugs as they appear in the site's URLs.
    # 'xiaohuaneike',
    subjects = [
        'huxineike',
        'erbiyanhoutoujing',
        'erke',
        'fengshimianyi',
        'neike',
        'xinxueguannei',
        'ganranchuanran',
        'putongwaike',
        'guke',
        'yanke',
        'neifenmi',
        'kouqiangke',
        'shenjingneike',
        'fuchanshengzhi',
        'xinxiongwaike',
        'gandanyiwaike',
        'naoliuke',
        'jiazhuangxianruxian',
        'miniaowaike',
        'xingbing',
        'jingshenxinli',
        'xueyeke',
        'shenzangnei',
        'shenjingwaike',
        'zhengxingmeirong',
        'tengtongmazui']
    for subject in subjects:
        start_crawler(subject)
        print(subject, 'is done!')
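For reference, here is a minimal sketch of the JSON shape the paginated parser above assumes. The field names (data, items, dialogs, user_id, content) are taken from the crawler code itself; the sample payload is invented for illustration only and is not an actual response from ask.dxy.com.

import json

# Invented sample payload matching the field names the crawler reads.
sample = json.loads('''
{
  "data": {
    "items": [
      {
        "dialogs": [
          {"user_id": 0, "content": "I have had a cough for a week."},
          {"user_id": 12345, "content": "Please check for a fever first."}
        ]
      }
    ]
  }
}
''')

for item in sample['data']['items']:
    # user_id 0 is treated as the patient, everything else as the doctor.
    questions = [d['content'] for d in item['dialogs'] if d['user_id'] == 0]
    answers = [d['content'] for d in item['dialogs'] if d['user_id'] != 0]
    print('patient:', questions, 'doctor:', answers)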
The crawler may throw errors because of network speed issues!
Adjust it as needed!
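If the errors come from timeouts or flaky connections, one option is to wrap requests.get in a simple retry helper. This is only a minimal sketch: get_with_retry and its retry count and timeout values are illustrative choices, not part of the original code.

import time
import random
import requests

def get_with_retry(url, headers=None, retries=3, timeout=10):
    # Retry a GET request a few times with a randomized pause between attempts.
    # retries and timeout are illustrative defaults, not values from the original code.
    for attempt in range(retries):
        try:
            req = requests.get(url, headers=headers, timeout=timeout)
            req.raise_for_status()
            return req
        except requests.RequestException as e:
            print('attempt', attempt + 1, 'failed:', e)
            time.sleep(random.randint(3, 10))
    raise RuntimeError('all %d attempts failed for %s' % (retries, url))

You could swap this into get_static_url_content in place of the bare requests.get call so that a single dropped connection does not abort the whole subject.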