37. 爬取前程无忧
前程无忧每一页面显示的工作条数很有限，而且排名靠前的工作也并非是最好的，而且详细信息还得点进去看，以至于一般求职者根本看不了多少工作信息，也就无法筛选出满意的工作，这也是导致求职者海投简历的原因。在我爬取了大量工作信息后，发现在前程无忧里面搜索关键词会有相当多根本不相干的工作，垃圾信息相当多，如果只是随便看看，那么很容易只看到垃圾信息。
import random
import re
import time

import pandas as pd
import requests
from lxml import etree

# Browser User-Agent so the target sites do not reject the scraper as a bot.
HEADERS = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) '
                  'AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/73.0.3683.103 Safari/537.36'
}


def _strip_list_brackets(text):
    """Drop the "['" / "']" wrappers that str(list) leaves around a value."""
    text = re.sub(r"\['", '', text)
    return re.sub(r"'\]", '', text)


def get_detail(detail_url):
    """Fetch one 51job detail page and return its job-description text."""
    resp = requests.get(detail_url, headers=HEADERS)
    # 51job serves GBK-encoded pages; ignore undecodable bytes.
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    # xpath() returns a list; stringify it and strip the list wrapper.
    ask = str(html.xpath(
        "/html/body/div[3]/div[2]/div[3]/div[1]/div/p/text()")).replace(' ', '')
    return _strip_list_brackets(ask)


def get_detail_urls(url, HEADERS):
    """Legacy helper for a ygdy8.net movie-list crawler (unrelated to 51job).

    Appends (film_name, detail_url) rows to film.csv. Kept only because the
    commented-out example below still references it.
    """
    resp = requests.get(url, headers=HEADERS)
    text = resp.content.decode('gbk', errors='ignore')
    html = etree.HTML(text)
    hrefs = html.xpath("//table[@class='tbspan']//a/@href")
    names = html.xpath("//table[@class='tbspan']//a/text()")
    full_urls = ['http://www.ygdy8.net' + href for href in hrefs]
    data = pd.DataFrame({'film_name': names, 'detail_url': full_urls})
    data.to_csv('film.csv', index=False, sep=';', mode='a', header=False)


# Example use of the movie crawler (disabled):
# for i in range(1, 100):
#     url = 'http://www.ygdy8.net/html/gndy/dyzz/list_23_{}.html'.format(i)
#     time.sleep(random.random())
#     print('第{}頁'.format(i))
#     get_detail_urls(url, HEADERS)


def get_jobs(HEADERS):
    """Scrape up to 199 result pages of 51job listings (area 090200).

    Returns six parallel lists: job titles, salaries, areas, companies,
    detail-page URLs and job-description texts.
    """
    job_, salary_, area_, company_, detail_url_, ask_ = [], [], [], [], [], []
    for page in range(1, 200):
        # BUGFIX: the original URL contained '°reefrom=99' — the '&deg' of
        # '&degreefrom' had been HTML-entity-decoded to '°'; restored here.
        url = ('https://search.51job.com/list/090200,000000,0000,00,9,99,%2B,2,'
               '{}.html?lang=c&stype=1&postchannel=0000&workyear=99&cotype=99'
               '&degreefrom=99&jobterm=99&companysize=99&lonlat=0%2C0&radius=-1'
               '&ord_field=0&confirmdate=9&fromType=&dibiaoid=0&address=&line='
               '&specialarea=00&from=&welfare=').format(page)
        try:
            resp = requests.get(url, headers=HEADERS)
        except requests.RequestException as exc:
            # Stop paging on a network failure instead of silently swallowing
            # everything (the original wrapped all pages in a bare `except: pass`).
            print('page %s failed: %s' % (page, exc))
            break
        text = resp.content.decode('gbk', errors='ignore')
        html = etree.HTML(text)
        print('第%s頁' % page)
        # Result rows live in div[4]..div[53] of #resultList.
        for i in range(4, 54):
            try:
                job = html.xpath(
                    "//*[@id='resultList']/div[{}]/p/span/a/text()"
                    .format(i))[0].replace(' ', '')
                job = re.sub(r"\r\n", "", job)
                salary_nodes = html.xpath(
                    "//*[@id='resultList']/div[{}]/span[3]/text()".format(i))
                # BUGFIX: the original stored the raw xpath list; take the text
                # itself, and tolerate listings without a salary.
                salary = salary_nodes[0] if salary_nodes else ''
                area = html.xpath(
                    "//*[@id='resultList']/div[{}]/span[2]/text()"
                    .format(i))[0]
                company = html.xpath(
                    "//*[@id='resultList']/div[{}]/span[1]/a/text()"
                    .format(i))[0].replace(' ', '')
                detail_url = _strip_list_brackets(str(html.xpath(
                    "//*[@id='resultList']/div[{}]/p/span/a/@href".format(i))))
                ask = get_detail(detail_url)
            except (IndexError, requests.RequestException) as exc:
                # Skip just this row; the original's bare except aborted the
                # entire scrape on the first short page or bad row.
                print('row %s on page %s skipped: %s' % (i, page, exc))
                continue
            job_.append(job)
            salary_.append(salary)
            area_.append(area)
            company_.append(company)
            # BUGFIX: the original appended the list-page `url` here, so the
            # 'detail_url' column never held the actual job link.
            detail_url_.append(detail_url)
            ask_.append(ask)
    print(job_, '\n', salary_, '\n', company_, '\n', detail_url_)
    print(ask_)
    return job_, salary_, area_, company_, detail_url_, ask_


if __name__ == '__main__':
    job_, salary_, area_, company_, detail_url_, ask_ = get_jobs(HEADERS)
    df1 = pd.DataFrame({
        'job': job_,
        'salary': salary_,
        'company': company_,
        'ask': ask_,
        'detail_url': detail_url_,
    })
    # Context manager replaces the deprecated writer.save()/close() pair.
    with pd.ExcelWriter('test.xlsx') as writer:
        df1.to_excel(writer, sheet_name='Sheet1')
- 上一篇: Python 当前时间是那一年第几周的周
- 下一篇: 2012年2月份第3周51Aspx源码发