Python3--爬取海词信息
生活随笔
收集整理的這篇文章主要介紹了
Python3--爬取海词信息
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
上代碼:
#!/usr/bin/python3
"""Multithreaded scraper for English-name pages on ename.dict.cn.

Reads names from the 'EnName' column of A-Z.csv, fetches each name's page
through a random proxy from ip.txt, parses the header box (English name,
Chinese name, meaning, source, gender) and the celebrity list, and appends
one '|'-delimited row per name to haici_infor.csv.
"""
import queue
import threading
import requests
import csv
import time
import random
from bs4 import BeautifulSoup
from fake_useragent import UserAgent
import pandas as pd

# Polled by worker threads; the main thread sets it to 1 once the work
# queue has been drained, telling the workers to exit their loop.
exitFlag = 0


# Read the name list from a CSV file with pandas.
def getNames(csvfile):
    """Return the 'EnName' column of a '|'-delimited CSV file.

    :param csvfile: path to the CSV file to read
    :return: pandas Series of English names
    """
    # NOTE(review): the original flagged the file encoding as unresolved
    # ("讀取的文件編碼問題有待考慮") -- confirm the CSV is UTF-8.
    data = pd.read_csv(csvfile, delimiter='|')
    names = data['EnName']
    return names


# Load the proxy IP list.
def get_ip_list():
    """Return the proxy list, one 'host:port' string per line of ip.txt."""
    # BUG FIX: use a context manager so the file is closed even if
    # readlines() raises (the original used open/readlines/close).
    with open('ip.txt', 'r') as f:
        return f.readlines()


# Pick a random proxy from the list.
def get_random_ip(ip_list):
    """Choose one random proxy from ip_list, formatted for requests.

    :param ip_list: list of proxy strings (possibly newline-terminated)
    :return: proxies dict usable as requests' ``proxies=`` argument
    """
    proxy_ip = random.choice(ip_list).strip('\n')
    return {'https': proxy_ip}


# Append one row of scraped data to a CSV file.
def write_file(filePath, row):
    """Append ``row`` as one '|'-delimited line to ``filePath`` (UTF-8)."""
    with open(filePath, 'a+', encoding='utf-8', newline='') as csvfile:
        spanreader = csv.writer(csvfile, delimiter='|', quoting=csv.QUOTE_MINIMAL)
        spanreader.writerow(row)


def get_content(url, ip_list):
    """Fetch ``url`` with up to three attempts and parse the result.

    Each attempt sleeps first (1s / 10s / 15s), then uses a fresh random
    proxy and User-Agent. The first two attempts use timeouts of 20s and
    40s; the last has no timeout and, as in the original, lets its
    exception propagate to the caller.

    :param url: page URL to fetch
    :param ip_list: proxy pool for get_random_ip()
    :return: (HTTP status code, list of <div class="mbox"> tags)
    :raises requests.RequestException: if all three attempts fail
    """
    delays = (1, 10, 15)
    timeouts = (20, 40, None)
    req = None
    for attempt in range(3):
        time.sleep(delays[attempt])
        proxies = get_random_ip(ip_list)
        headers = {'User-Agent': str(UserAgent().random)}
        try:
            # BUG FIX: the original used bare ``except:``, which also
            # swallows KeyboardInterrupt/SystemExit; catch only network
            # errors from requests.
            req = requests.get(url=url, proxies=proxies,
                               headers=headers, timeout=timeouts[attempt])
            break
        except requests.RequestException:
            if attempt == 2:
                raise
            # Runtime messages kept byte-for-byte from the original.
            print("重新運行" if attempt == 0 else "第二次重新運行")
    req.encoding = 'utf-8'
    soup = BeautifulSoup(req.text, 'lxml')
    content = soup.find_all('div', class_='mbox')
    return req.status_code, content


# Parse the header box: accurate English name, Chinese name, meaning,
# source and gender.
def get_infor_header(content):
    """Extract the header fields from the first mbox div.

    :param content: bs4 Tag for the header <div class="mbox">
    :return: list laid out as EnName|CnName|Gender|Source|Meaning
    """
    spans = content.find_all('span')
    EnName = [spans[0].get_text()]
    if len(spans) != 1:
        CnName = [spans[1].get_text()]
        Meaning = [spans[2].get_text()]
        Source = [spans[3].get_text()]
        # The gender lives in the <em> tag's title attribute.
        Gender = [spans[4].em.get('title')]
    else:
        # Sparse entries only carry the English name; pad the row so the
        # CSV keeps a fixed column count.
        CnName, Meaning, Source, Gender = [''], [''], [''], ['']
    # Row layout: EnName|CnName|Gender|Source|Meaning
    return EnName + CnName + Gender + Source + Meaning


# Collect the celebrities associated with an English name.
def get_infor_celebrity(content):
    """Join all <li> texts of the celebrity mbox with '@'.

    :param content: bs4 Tag for the celebrity <div class="mbox">
    :return: single-element list containing the '@'-joined names
    """
    items = content.find_all('li')
    joined = '@'.join(each.get_text() for each in items)
    return [joined]


class myThread(threading.Thread):
    """Worker thread that drains the shared work queue of names."""

    def __init__(self, threadID, name, q, ip_list):
        threading.Thread.__init__(self)
        self.threadID = threadID
        self.name = name
        self.q = q
        self.ip_list = ip_list

    def run(self):
        print("開啟線程:" + self.name)
        # BUG FIX: the original passed the module-level global ``ip_list``
        # here instead of the list handed to the constructor.
        process_data(self.name, self.q, self.ip_list)
        print("退出線程:" + self.name)


def process_data(threadName, q, ip_list):
    """Worker loop: pop names off the queue and scrape them until exitFlag.

    :param threadName: label used in progress output
    :param q: shared queue.Queue of names to process
    :param ip_list: proxy pool forwarded to get_content()
    """
    while not exitFlag:
        queueLock.acquire()
        if not workQueue.empty():
            data = q.get()
            queueLock.release()
            print("%s processing %s" % (threadName, data))
            url = 'http://ename.dict.cn/{}'.format(data)
            try:
                status_code, content = get_content(url, ip_list)
                if status_code == 200:
                    # Header fields (Chinese name, meaning, source, gender).
                    list_header = get_infor_header(content[0])
                    # Celebrity list for this name.
                    list_celebrity = get_infor_celebrity(content[1])
                    row = list_header + list_celebrity
                    # Serialise file writes across worker threads.
                    queueLock.acquire()
                    write_file('haici_infor.csv', row)
                    queueLock.release()
            except requests.RequestException:
                # ROBUSTNESS FIX: in the original an exhausted retry killed
                # the worker thread for good; log and move on instead.
                print("%s failed %s" % (threadName, data))
        else:
            queueLock.release()
        time.sleep(1)


# Worker labels; ten threads, as in the original hand-written list.
threadList = ["Thread-%d" % i for i in range(1, 11)]
nameList = getNames('A-Z.csv')
queueLock = threading.Lock()
workQueue = queue.Queue(100000)
threads = []
threadID = 1

# Create the worker threads.
ip_list = get_ip_list()
for tName in threadList:
    thread = myThread(threadID, tName, workQueue, ip_list)
    thread.start()
    threads.append(thread)
    threadID += 1

# Fill the queue.
queueLock.acquire()
for word in nameList:
    workQueue.put(word)
queueLock.release()

# Wait for the queue to drain. BUG FIX: the original spun here in a tight
# ``while ... : pass`` loop at 100% CPU; sleep between polls instead.
while not workQueue.empty():
    time.sleep(0.5)

# Tell the workers it is time to exit.
exitFlag = 1

# Wait for every worker to finish.
for t in threads:
    t.join()
print("退出主線程")
以上是生活随笔為你收集整理的Python3--爬取海词信息的全部內(nèi)容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Python---获取div标签中的文字
- 下一篇: 我的微信公众号