医脉通数据爬取 http://disease.medlive.cn
生活随笔
收集整理的這篇文章主要介紹了
医脉通数据爬取 http://disease.medlive.cn
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
import requests
import time
from lxml import etree
import re
import xlwt
import random
import xlrd
from multiprocessing import Process


class Yimaitong():
    """Scraper for the disease wiki on http://disease.medlive.cn.

    Crawls the category index, then each disease entry's "key points" /
    "tests" / "treatment" detail pages, and flattens the result into
    .xls workbooks via xlwt.

    NOTE(review): reconstructed from a whitespace-mangled source (the
    original file was collapsed onto a few lines), so statement grouping
    in ``parser_data`` is a best-effort reconstruction -- verify against
    the intended sheet layout.
    """

    def __init__(self):
        # Category-index landing page.
        self.url = 'http://disease.medlive.cn/wiki/list/171'
        f = time.time()
        nd = '%s' % int(round(f))  # current unix timestamp (never read afterwards)
        # Default request headers (not actually passed to requests.get below).
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
            'Accept-Encoding': 'gzip, deflate',
            'Accept-Language': 'zh-CN,zh;q=0.9',
            'Connection': 'keep-alive',
            'Cookie': 'eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9',
            'Host': 'www.medlive.cn',
            'Referer': 'http://disease.medlive.cn/wiki/list/178',
            'User-Agent': 'Mozilla/5.0 (Linux; Android 6.0; Nexus 5 Build/MRA58N) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/81.0.4044.138 Mobile Safari/537.36',
        }
        # FIX: the original assigned this dict to a *local* variable
        # `header`, but parser_url reads `self.header` when fetching the
        # treatment-detail pages -- that raised AttributeError at runtime.
        self.header = {
            'Referer': 'http://disease.medlive.cn/wiki/entry/0_0_37977?row=1',
            'Cookie': 'ymt_pk_id=b7470997e9241352; _pk_ses.3.a971=*; sess=3ve23k417je2d1hhipuku66fg2; ymtinfo=eyJ1aWQiOiIwIiwicmVzb3VyY2UiOiIiLCJhcHBfbmFtZSI6IiIsImV4dF92ZXJzaW9uIjoiMSJ9; Hm_lvt_62d92d99f7c1e7a31a11759de376479f=1588835592,1589005542,1589006182; Hm_lpvt_62d92d99f7c1e7a31a11759de376479f=1589013816; _pk_id.3.a971=b7470997e9241352.1588835592.7.1589013816.1588925929.',
        }

    def parser_url(self, start, end, pathtxt):
        """Crawl disease categories ``start``..``end`` (index range into the
        category list) and return the scraped data.

        Returns a list of ``{category_name: [{disease_name: list_data}, ...]}``
        dicts, where ``list_data`` is a list of ``[label, values]`` pairs.
        ``pathtxt`` is only used by a commented-out debug dump.
        """
        # Fetch and parse the category index page.
        response = requests.get(self.url)
        html = etree.HTML(response.text)
        href = html.xpath('//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li/a/@href')
        fina_dic = {}
        fina_data = []
        for url in range(start, end):
            # `url` is the index of the category link to visit.
            f = []
            time.sleep(0.2)  # throttle requests
            new_url = 'http://disease.medlive.cn' + href[url]
            print(url, new_url)
            first_response = requests.get(new_url)
            new_html = etree.HTML(first_response.text)
            first_href = new_html.xpath('//*[@id="wiki_list_box"]/div[2]/ul/li/dl/dd/a/@href')
            # Category (disease class) name taken from the sidebar entry.
            jibing_name = new_html.xpath(
                '//*[@id="wiki_list_box"]/div[1]/div[2]/ul/li[' + str(url + 1) + ']/a/text()')[0]
            # Visit every disease entry inside this category.
            for second in first_href:
                list_data = []
                time.sleep(1)
                try:
                    second_url = 'http://disease.medlive.cn' + second
                    second_response = requests.get(second_url)
                    second_html = etree.HTML(second_response.text)
                    second_name = second_html.xpath(
                        '/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/label/text()')[0]
                    second_href = second_html.xpath(
                        '/html/body/div[5]/div[1]/div[1]/div[1]/div[1]/a/@href')[0]
                    three_response = requests.get(second_href)
                    # "Essentials" page of the entry.
                    three_html = etree.HTML(three_response.text)
                    three_url = three_html.xpath(
                        '//*[@id="content"]/div/div[1]/div[1]/div[2]/div[1]/dl/dd[3]/a/@href')[0]
                    four_url = 'http://disease.medlive.cn' + three_url
                    four_response = requests.get(four_url)
                    four_html = etree.HTML(four_response.text)
                    time.sleep(0.2)
                    # --- detail page 1: key/other/risk factors ---
                    one_detail = \
                        four_html.xpath('//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[1]/p/a/@href')[0]
                    one_detail_url = 'http://disease.medlive.cn' + one_detail
                    one_detail_response = requests.get(one_detail_url)
                    one_detail_html = etree.HTML(one_detail_response.text)
                    keyword = ['關鍵因素', ]
                    keyword.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div/h5/span[1]/text()'))
                    key_data = ['關鍵內(nèi)容', ]
                    key_data.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/div/p/text()'))
                    otherword = ['其它診斷因素', ]
                    otherword.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div/h5/span[1]/text()'))
                    other_data = ['其他診斷內(nèi)容', ]
                    other_data.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/div/p/text()'))
                    dengerword = ['危險因素', ]
                    dengerword.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div/h5/span[1]/text()'))
                    dengerdata = ['危險內(nèi)容', ]
                    dengerdata.append(one_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/div/p/text()'))
                    list_data.append(keyword)
                    list_data.append(key_data)
                    list_data.append(dengerword)
                    list_data.append(dengerdata)
                    list_data.append(otherword)
                    list_data.append(other_data)
                    # --- detail page 2: priority / optional / new tests ---
                    two_detail = \
                        four_html.xpath('//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[2]/p/a/@href')[0]
                    two_detail_url = 'http://disease.medlive.cn' + two_detail
                    two_detail_response = requests.get(two_detail_url)
                    two_detail_html = etree.HTML(two_detail_response.text)
                    precedence = ['優(yōu)先檢測', ]
                    precedence.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[1]/h5/span/text()'))
                    precedencedata = ['優(yōu)先檢測內(nèi)容', ]
                    precedencedata.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/div[2]/table/tbody//text()'))
                    select = ['可選檢測', ]
                    select.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[1]/h5/text()'))
                    selectdata = ['可選檢測內(nèi)容', ]
                    selectdata.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[2]/ul/li/div[2]/table/tbody//text()'))
                    new = ['新的檢測', ]
                    new.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[1]/h5/span/text()'))
                    newdata = ['新的檢測內(nèi)容', ]
                    newdata.append(two_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[3]/ul/li/div[2]/table/tbody/tr//text()'))
                    list_data.append(precedence)
                    list_data.append(precedencedata)
                    list_data.append(select)
                    list_data.append(selectdata)
                    list_data.append(new)
                    list_data.append(newdata)
                    # --- detail page 3: treatment details ---
                    three_detail = \
                        four_html.xpath('//*[@id="content"]/div/div[1]/div[2]/div/div[2]/table/tbody/tr[2]/td[3]/p/a/@href')[0]
                    three_detail_url = 'http://disease.medlive.cn' + three_detail
                    three_detail_response = requests.get(three_detail_url)
                    three_detail_html = etree.HTML(three_detail_response.text)
                    Treatment_conditions = three_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a//text()')
                    Treatment_conditions_url = three_detail_html.xpath(
                        '//*[@id="wiki_view_frm"]/div/div[1]/ul/li/a/@href')
                    # Keep only the first non-whitespace run of each label.
                    p = r'\S+'
                    three_re = ['治療細則', ]
                    t_re = []
                    for i in Treatment_conditions:
                        t_re.append(re.findall(p, i)[0])
                    three_re.append(t_re)
                    three_data = ['治療細則內(nèi)容', ]
                    for Treatment_url in Treatment_conditions_url:
                        new_Treatment_url = 'http://disease.medlive.cn' + Treatment_url
                        new_Treatment_urlresponse = requests.get(new_Treatment_url, headers=self.header)
                        Treatment_urlresponse = etree.HTML(new_Treatment_urlresponse.text)
                        three_data.append(Treatment_urlresponse.xpath(
                            '//*[@id="wiki_view_frm"]/div/div[1]/div[2]//text()'))
                    list_data.append(three_re)
                    list_data.append(three_data)
                    detail_data = {second_name: list_data}
                    f.append(detail_data)
                    fina_dic = {jibing_name: f}
                except Exception as err:
                    # Best-effort scrape: a missing node on any page skips
                    # this one disease instead of aborting the whole run.
                    # (Original silently swallowed everything with a bare
                    # `except: pass`; at least report what was skipped.)
                    print('skip', second, err)
            fina_data.append(fina_dic)
            # with open(pathtxt, 'w+') as f:
            #     f.write(str(fina_data))
        print(fina_data)
        return fina_data

    def first_parser_data(self, data, path):
        """Flatten crawled ``data`` into an .xls workbook saved at ``path``.

        Columns: 0 = category, 1 = disease, 2 = field label, 3 = value.
        """
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('Sheet1')
        row = 2
        for i in data:  # each i is {category: [...]}
            for jibingkey, jibingvalue in i.items():
                print(0, jibingkey)
                worksheet.write(row, 0, jibingkey)
                for bingzhong in jibingvalue:  # each is {disease: [...]}
                    for bingzhongkey, bingzhongvalue in bingzhong.items():
                        print(1, bingzhongkey)
                        worksheet.write(row, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:
                            # FIX: the '0' placeholder used to be appended
                            # *inside* `for write_num in detail_data[1]`,
                            # which never executes when the list is empty,
                            # so empty fields were dropped entirely.
                            if len(detail_data[1]) == 0:
                                detail_data[1].append('0')
                            for write_num in detail_data[1]:
                                for write_data in write_num:
                                    print(2, detail_data[0])
                                    worksheet.write(row, 2, detail_data[0])
                                    print(3, write_data)
                                    worksheet.write(row, 3, write_data)
                                    row += 1
        workbook.save(path)

    def parser_data(self, data):
        """Write crawled ``data`` into a fixed-name workbook, one value per
        column starting at column 3.

        NOTE(review): the nesting of the row-counter updates below was
        reconstructed from a collapsed one-line source -- confirm against
        the intended sheet layout before relying on it.
        """
        workbook = xlwt.Workbook(encoding='utf-8')
        worksheet = workbook.add_sheet('Sheet1')
        row1 = 0  # row cursor for column 1 (disease names)
        row3 = 0  # row cursor for column 0 (category names)
        row = 0   # row cursor for the value columns
        a = 0     # row cursor for column 2 (field labels)
        for i in data:  # each i is {category: [...]}
            for jibingkey, jibingvalue in i.items():
                worksheet.write(row3, 0, jibingkey)
                for bingzhong in jibingvalue:  # {disease: [[label, values], ...]}
                    for bingzhongkey, bingzhongvalue in bingzhong.items():
                        worksheet.write(row1, 1, bingzhongkey)
                        for detail_data in bingzhongvalue:
                            worksheet.write(a, 2, detail_data[0])
                            a += 1
                            col = 3
                            for write_num in detail_data[1]:
                                worksheet.write(row, col, write_num)
                                col += 1
                            row += 1
                        row3 += 1
                        row1 += 7
                row3 += row1
        workbook.save('醫(yī)脈通—數(shù)據(jù)11-18.xlsx')

    def main(self):
        """Crawl five category slices sequentially, then write each slice's
        workbook in its own worker process and wait for all of them."""
        process_list = []
        yimaitong = Yimaitong()
        pathtxt1 = 'yimaitong 1-5.txt'
        data1 = yimaitong.parser_url(0, 5, pathtxt1)
        path1 = 'yimaitong 1-5.xlsx'
        p1 = Process(target=self.first_parser_data, args=(data1, path1))
        p1.start()
        pathtxt2 = 'yimaitong 5-10.txt'
        data2 = yimaitong.parser_url(5, 10, pathtxt2)
        path2 = 'yimaitong 5-10.xlsx'
        p2 = Process(target=self.first_parser_data, args=(data2, path2))
        p2.start()
        pathtxt3 = 'yimaitong 10-15.txt'
        data3 = yimaitong.parser_url(10, 15, pathtxt3)
        path3 = 'yimaitong 10-15.xlsx'
        p3 = Process(target=self.first_parser_data, args=(data3, path3))
        p3.start()
        pathtxt4 = 'yimaitong 15-20.txt'
        data4 = yimaitong.parser_url(15, 20, pathtxt4)
        path4 = 'yimaitong 15-20.xlsx'
        p4 = Process(target=self.first_parser_data, args=(data4, path4))
        p4.start()
        pathtxt5 = 'yimaitong 20-25.txt'
        data5 = yimaitong.parser_url(20, 25, pathtxt5)
        path5 = 'yimaitong 20-25.xlsx'
        p5 = Process(target=self.first_parser_data, args=(data5, path5))
        p5.start()
        process_list.append(p1)
        process_list.append(p2)
        process_list.append(p3)
        process_list.append(p4)
        process_list.append(p5)
        for t in process_list:
            t.join()


# Script entry: crawl only the first category as a smoke test.
# NOTE(review): runs on import; consider an `if __name__ == '__main__':`
# guard, especially since main() spawns multiprocessing workers.
yimaitong = Yimaitong()
# yimaitong.main()
pathtxt1 = '內(nèi)容.txt'
data = yimaitong.parser_url(0, 1, pathtxt1)
# yimaitong.parser_data(data)
總結(jié)
以上是生活随笔為你收集整理的医脉通数据爬取 http://disease.medlive.cn的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 微信直播:视频号如何开通直播?功能有哪些
- 下一篇: 消息队列MQ 之 Kafka