爬取【医疗问答】数据

Bea ·
更新时间:2024-11-10
· 543 次阅读

本文以丁香医生(ask.dxy.com)为例,演示如何爬取医疗问答数据。
爬取按科目分类进行,每个科目的内容存储到一个独立的文本文档中,格式为"患者提问-医生回答"的问答式。

代码如下:

"""Scrape patient/doctor Q&A dialogues from ask.dxy.com, one text file per subject."""

from bs4 import BeautifulSoup
import pandas as pd
import json
import requests
import time
import random

# Browser-like User-Agent so the site serves the normal page.
# NOTE: string value is byte-identical to the original.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/77.0.3865.120 Safari/537.36'
}


def get_static_url_content(url):
    """Fetch *url* and return its body parsed as a BeautifulSoup tree.

    Prints the HTTP status code for progress feedback.
    Raises requests.RequestException on network failure/timeout.
    """
    # timeout added: the original could hang forever on a stalled connection.
    req = requests.get(url, headers=HEADERS, timeout=30)
    print(req.status_code)
    return BeautifulSoup(req.text, 'lxml')


def start_crawler(subject, max_page=40):
    """Crawl all Q&A dialogues for one subject and write them to '<subject>.txt'.

    subject: the section_group_name slug used by ask.dxy.com (pinyin).
    max_page: upper bound (exclusive) for the JSON page index; pages 2..max_page-1
        are fetched. Default 40 matches the original hard-coded range.

    Output format, one dialogue per line:
        patient:<question text> doctor:<answer text>
    """
    url = 'https://ask.dxy.com/questions/%s/' % subject
    res = get_static_url_content(url)
    title = res.find('title').text
    print(title)
    items = res.find_all('div', {'class': 'question-matching-questions-item'})
    with open('%s.txt' % subject, 'w', encoding='utf-8') as f:
        # Page 1 is server-rendered HTML: scrape the patient/doctor dialog divs.
        for item in items:
            question = item.find('div', {'class': 'question-dialog dialog-of-patient'})
            answer = item.find('div', {'class': 'question-dialog dialog-of-doctor'})
            if question is None or answer is None:
                # Malformed/partial item: the original crashed with AttributeError here.
                continue
            f.write('patient:' + str(question.text) + ' doctor:' + str(answer.text) + '\n')
        # Pages 2+ are served as JSON by an internal list endpoint.
        for page in range(2, max_page):
            ask_url = ('https://ask.dxy.com/view/i/question/list/section'
                       '?section_group_name=%s&page_index=%s' % (subject, str(page)))
            # Polite random delay so we do not hammer the server.
            time.sleep(random.randint(3, 10))
            res = get_static_url_content(ask_url)
            body_text = res.find('body').text
            try:
                payload = json.loads(body_text)
                for item in payload['data']['items']:
                    questions = []
                    answers = []
                    for dialog in item['dialogs']:
                        # user_id == 0 marks the patient; any other id is a doctor.
                        if dialog['user_id'] == 0:
                            questions.append(dialog['content'])
                        else:
                            answers.append(dialog['content'])
                    # BUG FIX: the original stored entries in a dict that was kept
                    # across pages and re-wrote the ENTIRE dict after every page,
                    # duplicating earlier dialogues many times. Write each
                    # dialogue exactly once, in the same textual format.
                    f.write('patient:' + str(questions) + ' doctor:' + str(answers) + '\n')
            except (json.JSONDecodeError, KeyError, TypeError):
                # Narrowed from a bare `except:`; report the failing page and move on.
                print('page:', page)
                print('page error:', ask_url)
    # No explicit f.close(): the `with` block already closed the file.


# Backward-compatible alias for the original (misspelled) public name.
start_cralwer = start_crawler


if __name__ == '__main__':
    # Section slugs (pinyin) for each medical department on ask.dxy.com.
    # 'xiaohuaneike' was already commented out in the original.
    subjects = [
        'huxineike', 'erbiyanhoutoujing', 'erke', 'fengshimianyi', 'neike',
        'xinxueguannei', 'ganranchuanran', 'putongwaike', 'guke', 'yanke',
        'neifenmi', 'kouqiangke', 'shenjingneike', 'fuchanshengzhi',
        'xinxiongwaike', 'gandanyiwaike', 'naoliuke', 'jiazhuangxianruxian',
        'miniaowaike', 'xingbing', 'jingshenxinli', 'xueyeke', 'shenzangnei',
        'shenjingwaike', 'zhengxingmeirong', 'tengtongmazui',
    ]
    for subject in subjects:
        start_crawler(subject)
        print(subject, 'is done!')

运行时可能因【网速问题】(请求超时、连接中断)而报错!
请根据自身网络情况自行调整请求间隔(sleep 时长)和爬取的页数范围。


作者:Robin C



数据 医疗

需要 登录 后方可回复, 如果你还没有账号请 注册新账号