Scraping Huxiu with requests
生活随笔

This article, collected and organized here, mainly introduces scraping Huxiu (huxiu.com) with requests. The editor found it quite good and is sharing it with everyone as a reference.
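Before the full spider, here is a minimal sketch (not part of the original post) of the homepage request everything below builds on; the URL and the User-Agent string are the same ones the spider uses.

import requests

headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}
response = requests.get('https://www.huxiu.com/', headers=headers)
print(response.status_code, len(response.text))  # the HTML is parsed with lxml in the full script

The complete script adds keyword classification, sensitive-word filtering, PhantomJS rendering of article pages, and persistence to MongoDB and MySQL: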
import re
import datetime
import time, redis
import requests
from bs4 import BeautifulSoup
from lxml import etree
from pymongo import MongoClient
from pymysql import connect
from selenium import webdriver
from selenium.webdriver import DesiredCapabilities


class ArticleFilter(object):
    def __init__(self, title, content):
        # decode_responses=True so Redis values come back as str instead of bytes
        self.redis_client = redis.StrictRedis(host='127.0.0.1', port=6379, db=9, decode_responses=True)
        self.first_keywords = str(self.redis_client.get('first_keywords')).split(',')
        self.second_keywords = str(self.redis_client.get('second_keywords')).split(',')
        self.title = title
        self.content = content
        self.group_id_list = list()

    # Classify by the frequency of first-level keywords in the content
    def article_content_filter(self):
        first_keyword_dict = dict()
        second_keyword_dict = dict()
        # Content lookup
        if isinstance(self.content, list):
            text = ''.join([item.get('text') for item in self.content if item.get('text')])
            # Find the first-level keyword that appears most often in the article body
            for first_keyword in self.first_keywords:
                num = 0
                num += text.count(first_keyword)
                if num > 0:
                    first_keyword_dict[first_keyword] = num
            first_res = self.select_high(first_keyword_dict)
            if len(first_res) == 1:
                keyword, num = first_res[0][0], first_res[0][1]
                keyword = {'first_keywords': keyword}
            else:
                # No single top first-level keyword (none, or several tied), so fall back to second-level attribute words
                for second_keyword in self.second_keywords:
                    num = 0
                    num += text.count(second_keyword)
                    if num > 0:
                        second_keyword_dict[second_keyword] = num
                second_res = self.select_high(second_keyword_dict)
                if len(second_res) == 1:
                    keyword, num = second_res[0][0], second_res[0][1]
                    keyword = {'second_keywords': keyword}
                elif len(second_res) > 1:
                    # Several second-level words tie for first place: publish the article under each matching category
                    keyword = [x[0] for x in second_res]
                    keyword = {'second_keywords': keyword}
                else:
                    # No second-level match, but several first-level keywords tie: publish under each of their categories
                    if len(first_res) > 1:
                        keyword = [x[0] for x in first_res]
                        keyword = {'first_keywords': keyword}
                    else:
                        return False
            return keyword
        return False

    # Title lookup
    def article_title_filter(self):
        first_keyword_dict = dict()
        for first_keyword in self.first_keywords:
            num = 0
            num += self.title.count(first_keyword)
            if num > 0:
                first_keyword_dict[first_keyword] = num
        first_res = self.select_high(first_keyword_dict)
        if len(first_res) == 1:
            keyword, num = first_res[0][0], first_res[0][1]
            first_keywords = {'first_keywords': keyword}
            return first_keywords
        return False

    # Keyword lookup -- main entry point, returns the category IDs matching the article's keywords
    def article_filter(self):
        # 1. Title lookup
        title_keyword = self.article_title_filter()
        if title_keyword:
            first_keywords = title_keyword.get('first_keywords')
            group_id = self.get_keyword_group_id(first_keywords)
            self.group_id_list.append(group_id)
        else:
            # 2. Content lookup
            content_keyword = self.article_content_filter()
            if content_keyword:
                first_keywords = content_keyword.get('first_keywords')
                if isinstance(first_keywords, str):
                    group_id = self.get_keyword_group_id(first_keywords)
                    self.group_id_list.append(group_id)
                elif isinstance(first_keywords, list):
                    for first_keyword in first_keywords:
                        group_id = self.get_keyword_group_id(first_keyword)
                        self.group_id_list.append(group_id)
                else:
                    second_keywords = content_keyword.get('second_keywords')
                    if isinstance(second_keywords, str):
                        group_id = self.get_keyword_group_id(second_keywords)
                        self.group_id_list.append(group_id)
                    elif isinstance(second_keywords, list):
                        for second_keyword in second_keywords:
                            group_id = self.get_keyword_group_id(second_keyword)
                            self.group_id_list.append(group_id)
                    else:
                        self.group_id_list = None
            else:
                self.group_id_list = None
        return self.group_id_list

    # Pick the keyword(s) with the highest frequency
    @staticmethod
    def select_high(keyword_dict):
        ls = sorted(list(keyword_dict.items()), key=lambda a: a[1], reverse=True)
        index = 0
        for i, x in enumerate(ls):
            if x[1] == ls[0][1]:
                index = i + 1
            else:
                break
        print(ls[:index])
        return ls[:index]

    # Fetch the article category ID for a keyword from Redis
    def get_keyword_group_id(self, keyword):
        article_group_id = self.redis_client.hget('group_id_of_keyword', keyword)
        return article_group_id

    # Sensitive-word filtering for title and body
    def sensitive_words_filter(self):
        try:
            sensitive_words = self.redis_client.get('sensitive_words')
            if sensitive_words:
                sensitive_words = sensitive_words.split(',')
                text = ''.join([item.get('text') for item in self.content if item.get('text')])
                for sensitive_word in sensitive_words:
                    resp_title = self.title.find(sensitive_word)
                    resp_content = text.find(sensitive_word)
                    if resp_title != -1 or resp_content != -1:
                        return True
                else:
                    return False
            else:
                return False
        except Exception as e:
            return False
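ArticleFilter reads everything it needs from Redis db 9: 'first_keywords' and 'second_keywords' as comma-separated strings, a 'group_id_of_keyword' hash mapping each keyword to a category ID, and 'sensitive_words' as another comma-separated string. The following sketch, with made-up keywords and IDs, shows how that data might be seeded and how the class is then called; it is an illustration, not part of the original script.

import redis

r = redis.StrictRedis(host='127.0.0.1', port=6379, db=9, decode_responses=True)
r.set('first_keywords', 'AI,5G')            # first-level keywords, comma separated
r.set('second_keywords', 'chip,algorithm')  # second-level attribute words
r.hset('group_id_of_keyword', 'AI', 3)      # keyword -> article category ID
r.hset('group_id_of_keyword', 'chip', 7)
r.set('sensitive_words', 'foo,bar')         # words that block an article

# With the data in place, classification and filtering would be called like this:
article_filter = ArticleFilter('An AI industry overview', [{'text': 'Body text mentioning AI and chip news'}])
print(article_filter.article_filter())          # -> list of category IDs, or None if nothing matched
print(article_filter.sensitive_words_filter())  # -> True when a sensitive word appears in title or body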
class huxiu_spider(object):
    def __init__(self):
        self.base_url = 'https://www.huxiu.com/'
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.3; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}

    def send_request(self, url):
        response = requests.get(url, headers=self.headers)
        text = response.text
        return text

    # Article list page
    def first_analysis(self, text):
        selector = etree.HTML(text)
        results = selector.xpath('//*[@id="index"]/div[2]/div[2]/div')
        # //*[@id="index"]/div[1]/div[2]/div[9]/div[1]/a/div/@style
        new_list = []
        i = 1
        for res in results:
            res_dict = {}
            web_name = '虎嗅網'  # Huxiu
            res_dict['web_name'] = web_name
            # Article title
            title = res.xpath('div[1]/h2/a/text()')[0]
            print('Crawling article %s, title: %s' % (i, title))
            num = self.get_title(title, web_name)
            print('Checking whether the article already exists =====')
            if num == 0:
                print('Article not stored yet~~~')
                url = res.xpath('div/h2/a[starts-with(@href, "/article")]/@href')[0]
                article_link = 'https://www.huxiu.com' + url
                article_content, article_time = self.second_analysis(article_link)
                if article_content != 1:
                    print('Running the sensitive-word filter')
                    # Local sensitive-keyword filtering
                    article_filter_obj = ArticleFilter(title, article_content)
                    resp = article_filter_obj.sensitive_words_filter()
                    if resp:
                        print('Article contains sensitive words')
                    else:
                        # Article body
                        res_dict['content'] = article_content
                        # Publish time
                        res_dict['date'] = article_time
                        # Article link
                        res_dict['article_link'] = article_link
                        # Article title
                        res_dict['title'] = title
                        # Summary
                        summary = res.xpath('div/div[2]/text()')[0]
                        res_dict['summary'] = summary
                        # Author
                        name = res.xpath('div/div/a/span/text()')[0]
                        res_dict["name"] = name
                        # Author link
                        # res_dict["author_link"] = 'https://www.huxiu.com' + res.xpath('div/div/a/@href')[0]
                        # Cover image of the list entry
                        if res.xpath('div/a/img/@data-original'):
                            min_pic = res.xpath('div/a/img/@data-original')[0]
                            oss_url = self.upload_oss(min_pic)
                            # oss_url = oss_url.replace('http', 'https')
                            res_dict["min_pic"] = oss_url
                        elif res.xpath('a/div/img/@data-original'):
                            min_pic = res.xpath('a/div/img/@data-original')[0]
                            oss_url = self.upload_oss(min_pic)
                            # oss_url = oss_url.replace('http', 'https')
                            res_dict["min_pic"] = oss_url
                        elif res.xpath('div/a/div/@style'):
                            # Entries styled as videos carry the cover image in the style attribute
                            mystr = res.xpath('div/a/div/@style')[0]
                            print(111, mystr)
                            start_index = mystr.find('(', 0, len(mystr))
                            end_index = mystr.find('?', 0, len(mystr))
                            min_pic = mystr[start_index + 2:end_index]
                            print(123, min_pic)
                            oss_url = self.upload_oss(min_pic)
                            print(321, oss_url)
                            # oss_url = oss_url.replace('http', 'https')
                            res_dict["min_pic"] = oss_url
                        else:
                            oss_url = ''
                            res_dict["min_pic"] = oss_url
                        self.upload_mongo(res_dict)
                        self.upload_mysql(title, name, article_time, oss_url, summary, web_name, article_link)
                        print('Fetched and saved article %s' % i)
                        i += 1
                        new_list.append(res_dict)
                else:
                    i += 1
                    continue
            else:
                i += 1
                continue
        print('Fetched %s articles in total' % (i - 1))
    # Article detail page
    def second_analysis(self, url):
        try:
            # Customize the PhantomJS request headers
            cap = DesiredCapabilities.PHANTOMJS.copy()
            for key, value in self.headers.items():
                cap['phantomjs.page.customHeaders.{}'.format(key)] = value
            # Pass the customized capabilities so the headers actually take effect
            browser = webdriver.PhantomJS(
                executable_path='/usr/local/lib/phantomjs-2.1.1-linux-x86_64/bin/phantomjs',
                desired_capabilities=cap)
            browser.get(url)
            time.sleep(3)
            html = browser.page_source
            # Publish time
            selector = etree.HTML(html)
            if selector.xpath('//div[@class="column-link-box"]/span[1]/text()'):
                article_time = selector.xpath('//div[@class="column-link-box"]/span[1]/text()')[0]
                print(article_time)
            # //*[@id="article_content301428"]/p[138]/span[2]/text() ---new
            # //*[@class="article-author"]/span[2]/text() ---old
            elif selector.xpath('//*[@id="article_content301428"]/p[138]/span[2]/text()'):
                article_time = selector.xpath('//*[@id="article_content301428"]/p[138]/span[2]/text()')[0]
            else:
                article_time = ''
            # Head image inside the article
            if selector.xpath('//div[@class="article-img-box"]/img/@src'):
                article_min_pic = selector.xpath('//div[@class="article-img-box"]/img/@src')[0]
            else:
                article_min_pic = ""
            # Article body
            content = selector.xpath('//*[@class="article-content-wrap"]')[0]
            result = etree.tostring(content, method='html')
            print('Got the article body')
            # Build the BeautifulSoup object
            soup = BeautifulSoup(result, 'html.parser', from_encoding='utf-8')
            new_list = []
            # Collect content tag by tag
            ls = soup.find_all(["p", "img"])
            for table in ls:
                res = {}
                data = table.get_text()
                if data:
                    # Strip whitespace and special characters
                    new_data = "".join(data.split())
                    new_data = new_data.replace(u'\ufeff', '')
                    if new_data != "":
                        res["text"] = new_data
                        new_list.append(res)
                link = table.get('src')
                if link:
                    oss_url = self.upload_oss(link)
                    res["img"] = oss_url
                    new_list.append(res)
            if article_min_pic != '':
                article_min_pic = self.upload_oss(article_min_pic)
                # article_min_pic = article_min_pic.replace('http', 'https')
                new_list.insert(0, {'img': article_min_pic})
            browser.quit()
            return new_list, article_time
        except Exception as e:
            print('The article no longer exists', e)
            return 1, 1

    # Upload an image to OSS
    def upload_oss(self, url):
        kw = {
            'fileurl': url,
            'filepath': 'gander_goose/dev/test2'
        }
        result = requests.post(url='http://api.max-digital.cn/Api/oss/uploadByUrl', data=kw)
        result = result.json()
        oss_url = result.get('oss_file_url')
        oss_url = oss_url.replace('maxpr.oss-cn-shanghai.aliyuncs.com', 'cdn.max-digital.cn')
        oss_url = oss_url.replace('http', 'https')
        return oss_url

    # Save the article to MongoDB
    def upload_mongo(self, article_dict):
        try:
            client = MongoClient('127.0.0.1', 27017)
            my_db = client.wechat
            my_db.articles.insert_one(article_dict)
            print('Saved to MongoDB')
        except Exception as e:
            print('Failed to save to MongoDB:', e)

    # Insert the article into MySQL
    def upload_mysql(self, title, name, date, oss_url, summary, web_name, link):
        try:
            # Create the connection
            conn = connect(host='localhost', port=3306, database='wechat',
                           user='root', password='mysql', charset='utf8')
            # Get a cursor
            cs1 = conn.cursor()
            # Insert the article record
            now = datetime.datetime.now()
            imgurl = "https://cdn.max-digital.cn/gander_goose/dev/test2/15368082362561.jpg"
            sql1 = "insert into article_info (title,author,wechat_art_date,min_pic,summary,web_name,is_show,is_big,link,round_head_img,create_time) values ('%s','%s','%s','%s','%s','%s','%s','%s','%s','%s','%s')" % (
                title, name, date, oss_url, summary, web_name, 0, 0, link, imgurl, now)
            cs1.execute(sql1)
            # ID of the newly inserted article
            new_article_id = int(conn.insert_id())
            # Bump the custom sort value of the "24 hours" category
            # sql2 = 'update article_group set sort_num = sort_num + 1 where group_id=1'
            # cs1.execute(sql2)
            # Publish the article under the "24 hours" category
            sql3 = 'insert into article_group (article_id,group_id,sort_num,create_time) values ("%s", "%s", "%s", "%s")' % (
                new_article_id, 1, 1, now)
            cs1.execute(sql3)
            # Mark the article as online
            sql4 = "update article_info set is_show = 1, zj_art_date='%s' where id='%s'" % (now, new_article_id)
            cs1.execute(sql4)
            conn.commit()
            cs1.close()
            conn.close()
            print('Saved to MySQL')
        except Exception as e:
            print('Failed to save to MySQL:', e)
    def get_title(self, title, query):
        # Query MySQL to see whether an article with this title from this site already exists
        conn = connect(host='127.0.0.1', port=3306, database='zj',
                       user='root', password='mysql', charset='utf8')
        # Get a cursor
        cs1 = conn.cursor()
        res = 'select * from article_info where title = "%s" and web_name = "%s" ' % (title, query)
        num = cs1.execute(res)
        return num

    def run(self):
        text = self.send_request(self.base_url)
        self.first_analysis(text)


if __name__ == '__main__':
    huxiu = huxiu_spider()
    while True:
        start_time = time.time()
        print('Start time:', time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(start_time)))
        huxiu.run()
        time.sleep(3600)
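The script assumes that article_info and article_group tables already exist in MySQL, but the post never shows their schema. The sketch below is a reconstruction guessed from the INSERT and UPDATE statements above: the column names match the code, while every type and length is an assumption to adjust for your own setup. Note also that get_title reads from a database named zj while upload_mysql writes to wechat; the sketch assumes a single wechat database.

from pymysql import connect

# Hypothetical schema derived from the columns used in upload_mysql(); types are assumptions.
DDL_ARTICLE_INFO = """
CREATE TABLE IF NOT EXISTS article_info (
    id INT PRIMARY KEY AUTO_INCREMENT,
    title VARCHAR(255),
    author VARCHAR(64),
    wechat_art_date VARCHAR(64),
    zj_art_date DATETIME,
    min_pic VARCHAR(512),
    summary TEXT,
    web_name VARCHAR(64),
    is_show TINYINT,
    is_big TINYINT,
    link VARCHAR(512),
    round_head_img VARCHAR(512),
    create_time DATETIME
)
"""

DDL_ARTICLE_GROUP = """
CREATE TABLE IF NOT EXISTS article_group (
    article_id INT,
    group_id INT,
    sort_num INT,
    create_time DATETIME
)
"""

conn = connect(host='localhost', port=3306, database='wechat',
               user='root', password='mysql', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute(DDL_ARTICLE_INFO)
    cursor.execute(DDL_ARTICLE_GROUP)
conn.commit()
conn.close()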
Summary
The above is the full content of "Scraping Huxiu with requests" collected and organized for you by 生活随笔; we hope the article helps you solve the problems you ran into.

Reposted from: https://my.oschina.net/u/3892643/blog/3055002