1. We received a requirement to crawl the post content of a set of specified Weibo accounts.
This is a fairly common kind of task.
Given the requirement, the first step is to open the Weibo site and look for the interfaces it calls; if the data can be pulled straight from an API endpoint, that is by far the easiest and most reliable approach.
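Watching the network panel while browsing m.weibo.cn shows that both the profile and the timeline are served by https://m.weibo.cn/api/container/getIndex, addressed through a containerid parameter ('100505' + user id for the profile, '107603' + user id for the posts, the same values the full script below uses). Here is a minimal probe to confirm the endpoint returns JSON, assuming it still answers anonymous requests; the user id is just one of the accounts used later in this post:

import requests

# Probe the container API of the mobile site for one user's posts.
# '107603' + user id addresses the posts container.
user_id = '3208926445'
url = 'https://m.weibo.cn/api/container/getIndex'
params = {'containerid': '107603' + user_id, 'page': 1}
js = requests.get(url, params=params).json()
if js.get('ok'):
    # each post sits in a card of card_type 9 under data -> cards
    for card in js['data']['cards']:
        if card.get('card_type') == 9:
            mblog = card['mblog']
            print(mblog['id'], mblog['created_at'])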
2. There is a very usable example of exactly this on GitHub, so in the open-source spirit I studied the code and adapted it to the task.
Without further ado, here is the code:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import codecs
import csv
import json
import math
import os
import random
import sys
import traceback
from collections import OrderedDict
from datetime import datetime, timedelta
from time import sleep
import requests
from lxml import etree
from requests.adapters import HTTPAdapter
from tqdm import tqdm


class Weibo(object):
    def __init__(self, user_id, filter=0, since_date='1900-01-01',
                 pic_download=0, video_download=0):
        """Initialize the Weibo crawler"""
        if not isinstance(user_id, int):
            sys.exit(u'user_id值應為一串數字形式,請重新輸入')
        if filter != 0 and filter != 1:
            sys.exit(u'filter值應為數字0或1,請重新輸入')
        if not self.is_date(since_date):
            sys.exit(u'since_date值應為yyyy-mm-dd形式,請重新輸入')
        if pic_download != 0 and pic_download != 1:
            sys.exit(u'pic_download值應為數字0或1,請重新輸入')
        if video_download != 0 and video_download != 1:
            sys.exit(u'video_download值應為0或1,請重新輸入')
        self.user_id = user_id  # numeric user id, e.g. the account "Dear-迪麗熱巴" has id 1669879400
        self.filter = filter  # 0 (default): crawl all posts; 1: only original posts
        self.since_date = since_date  # crawl posts published from this date on, yyyy-mm-dd
        self.pic_download = pic_download  # 0 (default): skip original images; 1: download them
        self.video_download = video_download  # 0 (default): skip videos; 1: download them
        self.weibo = []  # all crawled posts
        self.user = {}  # profile info of the target user
        self.got_count = 0  # number of posts crawled so far

    def is_date(self, since_date):
        """Check whether the date string is valid yyyy-mm-dd"""
        try:
            datetime.strptime(since_date, "%Y-%m-%d")
            return True
        except ValueError:
            return False

    def get_json(self, params):
        """Fetch json data from the container API"""
        url = 'https://m.weibo.cn/api/container/getIndex?'
        r = requests.get(url, params=params)
        return r.json()

    def get_weibo_json(self, page):
        """Fetch one page of posts as json"""
        params = {'containerid': '107603' + str(self.user_id), 'page': page}
        js = self.get_json(params)
        return js

    def get_user_info(self):
        """Fetch the target user's profile info"""
        params = {'containerid': '100505' + str(self.user_id)}
        js = self.get_json(params)
        if js['ok']:
            info = js['data']['userInfo']
            if info.get('toolbar_menus'):
                del info['toolbar_menus']
            user_info = self.standardize_info(info)
            self.user = user_info
            return user_info

    def get_long_weibo(self, id):
        """Fetch the full text of a long post from its detail page"""
        url = 'https://m.weibo.cn/detail/%s' % id
        html = requests.get(url).text
        # cut the embedded json literal out of the page source
        html = html[html.find('"status":'):]
        html = html[:html.rfind('"hotScheme"')]
        html = html[:html.rfind(',')]
        html = '{' + html + '}'
        js = json.loads(html, strict=False)
        weibo_info = js.get('status')
        if weibo_info:
            weibo = self.parse_weibo(weibo_info)
            return weibo

    def get_pics(self, weibo_info):
        """Collect a post's original image urls"""
        if weibo_info.get('pics'):
            pic_info = weibo_info['pics']
            pic_list = [pic['large']['url'] for pic in pic_info]
            pics = ','.join(pic_list)
        else:
            pics = ''
        return pics

    def get_video_url(self, weibo_info):
        """Pick a post's best available video url"""
        video_url = ''
        if weibo_info.get('page_info'):
            if weibo_info['page_info'].get('media_info'):
                media_info = weibo_info['page_info']['media_info']
                video_url = media_info.get('mp4_720p_mp4')
                if not video_url:
                    video_url = media_info.get('mp4_hd_url')
                if not video_url:
                    video_url = media_info.get('mp4_sd_url')
                if not video_url:
                    video_url = ''
        return video_url

    def download_one_file(self, url, file_path, type, weibo_id):
        """Download a single file (image/video)"""
        try:
            if not os.path.isfile(file_path):
                s = requests.Session()
                s.mount(url, HTTPAdapter(max_retries=5))
                downloaded = s.get(url, timeout=(5, 10))
                with open(file_path, 'wb') as f:
                    f.write(downloaded.content)
        except Exception as e:
            error_file = self.get_filepath(type) + os.sep + 'not_downloaded.txt'
            with open(error_file, 'ab') as f:
                url = str(weibo_id) + ':' + url + '\n'
                f.write(url.encode(sys.stdout.encoding))
            print('Error: ', e)
            traceback.print_exc()

    def download_files(self, type):
        """Download all files of one kind (images/videos)"""
        try:
            if type == 'img':
                describe = u'圖片'
                key = 'pics'
            else:
                describe = u'視頻'
                key = 'video_url'
            print(u'即將進行%s下載' % describe)
            file_dir = self.get_filepath(type)
            for w in tqdm(self.weibo, desc=u'%s下載進度' % describe):
                if w[key]:
                    file_prefix = w['created_at'][:11].replace('-', '') + '_' + str(w['id'])
                    if type == 'img' and ',' in w[key]:
                        w[key] = w[key].split(',')
                        for j, url in enumerate(w[key]):
                            file_suffix = url[url.rfind('.'):]
                            file_name = file_prefix + '_' + str(j + 1) + file_suffix
                            file_path = file_dir + os.sep + file_name
                            self.download_one_file(url, file_path, type, w['id'])
                    else:
                        if type == 'video':
                            file_suffix = '.mp4'
                        else:
                            file_suffix = w[key][w[key].rfind('.'):]
                        file_name = file_prefix + file_suffix
                        file_path = file_dir + os.sep + file_name
                        self.download_one_file(w[key], file_path, type, w['id'])
            print(u'%s下載完畢,保存路徑:' % describe)
            print(file_dir)
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_location(self, selector):
        """Extract the location a post was published from"""
        location_icon = 'timeline_card_small_location_default.png'
        span_list = selector.xpath('//span')
        location = ''
        for i, span in enumerate(span_list):
            if span.xpath('img/@src'):
                if location_icon in span.xpath('img/@src')[0]:
                    location = span_list[i + 1].xpath('string(.)')
                    break
        return location

    def get_topics(self, selector):
        """Extract the #topics# a post takes part in"""
        span_list = selector.xpath("//span[@class='surl-text']")
        topics = ''
        topic_list = []
        for span in span_list:
            text = span.xpath('string(.)')
            if len(text) > 2 and text[0] == '#' and text[-1] == '#':
                topic_list.append(text[1:-1])
        if topic_list:
            topics = ','.join(topic_list)
        return topics

    def get_at_users(self, selector):
        """Extract the users @-mentioned in a post"""
        a_list = selector.xpath('//a')
        at_users = ''
        at_list = []
        for a in a_list:
            if '@' + a.xpath('@href')[0][3:] == a.xpath('string(.)'):
                at_list.append(a.xpath('string(.)')[1:])
        if at_list:
            at_users = ','.join(at_list)
        return at_users

    def string_to_int(self, string):
        """Convert count strings such as u'28萬' (u'萬' = 10,000) to integers"""
        if isinstance(string, int):
            return string
        elif string.endswith(u'萬+'):
            string = int(string[:-2] + '0000')
        elif string.endswith(u'萬'):
            string = int(string[:-1] + '0000')
        return int(string)

    def standardize_date(self, created_at):
        """Normalize relative publish times to yyyy-mm-dd"""
        # Weibo returns Chinese relative times: u'剛剛' (just now),
        # u'x分鐘' (minutes ago), u'x小時' (hours ago), u'昨天' (yesterday)
        if u"剛剛" in created_at:
            created_at = datetime.now().strftime("%Y-%m-%d")
        elif u"分鐘" in created_at:
            minute = created_at[:created_at.find(u"分鐘")]
            minute = timedelta(minutes=int(minute))
            created_at = (datetime.now() - minute).strftime("%Y-%m-%d")
        elif u"小時" in created_at:
            hour = created_at[:created_at.find(u"小時")]
            hour = timedelta(hours=int(hour))
            created_at = (datetime.now() - hour).strftime("%Y-%m-%d")
        elif u"昨天" in created_at:
            day = timedelta(days=1)
            created_at = (datetime.now() - day).strftime("%Y-%m-%d")
        elif created_at.count('-') == 1:
            # dates within the current year come back as mm-dd
            year = datetime.now().strftime("%Y")
            created_at = year + "-" + created_at
        return created_at

    def standardize_info(self, weibo):
        """Standardize values, stripping characters the console cannot encode"""
        for k, v in weibo.items():
            if ('int' not in str(type(v)) and 'long' not in str(type(v))
                    and 'bool' not in str(type(v))):
                weibo[k] = v.replace(u"\u200b", "").encode(
                    sys.stdout.encoding, "ignore").decode(sys.stdout.encoding)
        return weibo

    def parse_weibo(self, weibo_info):
        """Parse one post's json into an OrderedDict"""
        weibo = OrderedDict()
        if weibo_info['user']:
            weibo['user_id'] = weibo_info['user']['id']
            weibo['screen_name'] = weibo_info['user']['screen_name']
        else:
            weibo['user_id'] = ''
            weibo['screen_name'] = ''
        weibo['id'] = int(weibo_info['id'])
        text_body = weibo_info['text']
        selector = etree.HTML(text_body)
        weibo['text'] = etree.HTML(text_body).xpath('string(.)')
        weibo['pics'] = self.get_pics(weibo_info)
        weibo['video_url'] = self.get_video_url(weibo_info)
        weibo['location'] = self.get_location(selector)
        weibo['created_at'] = weibo_info['created_at']
        weibo['source'] = weibo_info['source']
        weibo['attitudes_count'] = self.string_to_int(
            weibo_info['attitudes_count'])
        weibo['comments_count'] = self.string_to_int(
            weibo_info['comments_count'])
        weibo['reposts_count'] = self.string_to_int(
            weibo_info['reposts_count'])
        weibo['topics'] = self.get_topics(selector)
        weibo['at_users'] = self.get_at_users(selector)
        return self.standardize_info(weibo)

    def print_user_info(self):
        """Print the user's profile info"""
        print('+' * 100)
        print(u'用戶信息')
        print(u'用戶id:%d' % self.user['id'])
        print(u'用戶昵稱:%s' % self.user['screen_name'])
        gender = u'女' if self.user['gender'] == 'f' else u'男'
        print(u'性別:%s' % gender)
        print(u'微博數:%d' % self.user['statuses_count'])
        print(u'粉絲數:%d' % self.user['followers_count'])
        print(u'關注數:%d' % self.user['follow_count'])
        if self.user.get('verified_reason'):
            print(self.user['verified_reason'])
        print(self.user['description'])
        print('+' * 100)

    def print_one_weibo(self, weibo):
        """Print one post"""
        print(u'微博id:%d' % weibo['id'])
        print(u'微博正文:%s' % weibo['text'])
        print(u'原始圖片url:%s' % weibo['pics'])
        print(u'微博位置:%s' % weibo['location'])
        print(u'發布時間:%s' % weibo['created_at'])
        print(u'發布工具:%s' % weibo['source'])
        print(u'點贊數:%d' % weibo['attitudes_count'])
        print(u'評論數:%d' % weibo['comments_count'])
        print(u'轉發數:%d' % weibo['reposts_count'])
        print(u'話題:%s' % weibo['topics'])
        print(u'@用戶:%s' % weibo['at_users'])

    def print_weibo(self, weibo):
        """Print a post; for a repost, print both the repost and the original"""
        if weibo.get('retweet'):
            print('*' * 100)
            print(u'轉發部分:')
            self.print_one_weibo(weibo['retweet'])
            print('*' * 100)
            print(u'原創部分:')
        self.print_one_weibo(weibo)
        print('-' * 120)

    def get_one_weibo(self, info):
        """Get the full info of one post"""
        try:
            weibo_info = info['mblog']
            weibo_id = weibo_info['id']
            retweeted_status = weibo_info.get('retweeted_status')
            is_long = weibo_info['isLongText']
            if retweeted_status:  # repost
                retweet_id = retweeted_status['id']
                is_long_retweet = retweeted_status['isLongText']
                if is_long:
                    weibo = self.get_long_weibo(weibo_id)
                    if not weibo:
                        weibo = self.parse_weibo(weibo_info)
                else:
                    weibo = self.parse_weibo(weibo_info)
                if is_long_retweet:
                    retweet = self.get_long_weibo(retweet_id)
                    if not retweet:
                        retweet = self.parse_weibo(retweeted_status)
                else:
                    retweet = self.parse_weibo(retweeted_status)
                retweet['created_at'] = self.standardize_date(
                    retweeted_status['created_at'])
                weibo['retweet'] = retweet
            else:  # original post
                if is_long:
                    weibo = self.get_long_weibo(weibo_id)
                    if not weibo:
                        weibo = self.parse_weibo(weibo_info)
                else:
                    weibo = self.parse_weibo(weibo_info)
            weibo['created_at'] = self.standardize_date(
                weibo_info['created_at'])
            return weibo
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()

    def get_one_page(self, page):
        """Get all posts on one page; return True once posts older than since_date are reached"""
        try:
            js = self.get_weibo_json(page)
            if js['ok']:
                weibos = js['data']['cards']
                for w in weibos[2:]:
                    if w['card_type'] == 9:
                        wb = self.get_one_weibo(w)
                        if wb:
                            if wb['created_at'] < self.since_date:
                                return True
                            if (not self.filter) or ('retweet' not in wb.keys()):
                                self.weibo.append(wb)
                                self.got_count = self.got_count + 1
                                self.print_weibo(wb)
        except Exception as e:
            print("Error: ", e)
            traceback.print_exc()

    def get_page_count(self):
        """Get the number of pages of posts"""
        weibo_count = self.user['statuses_count']
        page_count = int(math.ceil(weibo_count / 10.0))
        return page_count

    def get_write_info(self, wrote_count):
        """Build the rows of posts not yet written out"""
        write_info = []
        for w in self.weibo[wrote_count:]:
            wb = OrderedDict()
            for k, v in w.items():
                if k not in ['user_id', 'screen_name', 'retweet']:
                    if 'unicode' in str(type(v)):
                        v = v.encode('utf-8')
                    wb[k] = v
            if not self.filter:
                if w.get('retweet'):
                    wb['is_original'] = False
                    for k2, v2 in w['retweet'].items():
                        if 'unicode' in str(type(v2)):
                            v2 = v2.encode('utf-8')
                        wb['retweet_' + k2] = v2
                else:
                    wb['is_original'] = True
            write_info.append(wb)
        return write_info

    def get_filepath(self, type):
        """Get the result file path or media directory"""
        try:
            file_dir = os.path.split(os.path.realpath(__file__))[
                0] + os.sep + 'weibo' + os.sep + self.user['screen_name']
            if type == 'img' or type == 'video':
                file_dir = file_dir + os.sep + type
            if not os.path.isdir(file_dir):
                os.makedirs(file_dir)
            if type == 'img' or type == 'video':
                return file_dir
            file_path = file_dir + os.sep + '%d' % self.user_id + '.' + type
            return file_path
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()

    def get_result_headers(self):
        """Build the csv header row"""
        result_headers = ['id', '正文', '原始圖片url', '視頻url', '位置', '日期',
                          '工具', '點贊數', '評論數', '轉發數', '話題', '@用戶']
        if not self.filter:
            result_headers2 = ['是否原創', '源用戶id', '源用戶昵稱']
            result_headers3 = ['源微博' + r for r in result_headers]
            result_headers = result_headers + result_headers2 + result_headers3
        return result_headers

    def write_csv(self, wrote_count):
        """Append the crawled posts to the csv file"""
        write_info = self.get_write_info(wrote_count)
        result_headers = self.get_result_headers()
        result_data = [w.values() for w in write_info]
        if sys.version < '3':  # python2.x
            with open(self.get_filepath('csv'), 'ab') as f:
                f.write(codecs.BOM_UTF8)
                writer = csv.writer(f)
                if wrote_count == 0:
                    writer.writerows([result_headers])
                writer.writerows(result_data)
        else:  # python3.x
            with open(self.get_filepath('csv'), 'a', encoding='utf-8-sig',
                      newline='') as f:
                writer = csv.writer(f)
                if wrote_count == 0:
                    writer.writerows([result_headers])
                writer.writerows(result_data)
        print(u'%d條微博寫入csv文件完畢,保存路徑:' % self.got_count)
        print(self.get_filepath('csv'))

    def write_file(self, wrote_count):
        """Write newly crawled posts to file"""
        if self.got_count > wrote_count:
            self.write_csv(wrote_count)

    def get_pages(self):
        """Crawl all of the user's posts"""
        self.get_user_info()
        page_count = self.get_page_count()
        wrote_count = 0
        self.print_user_info()
        page1 = 0
        random_pages = random.randint(1, 5)
        for page in tqdm(range(1, page_count + 1), desc=u"進度"):
            print(u'第%d頁' % page)
            is_end = self.get_one_page(page)
            if is_end:
                break
            if page % 20 == 0:  # write to file every 20 pages
                self.write_file(wrote_count)
                wrote_count = self.got_count
            # Random waits make it less likely the crawler gets rate-limited.
            # Crawling too fast gets it blocked for a while (the block lifts
            # automatically after some time); random waits mimic human
            # behavior. By default it sleeps 6-10 seconds after every 1-5
            # pages; if it still gets blocked, increase the sleep time.
            if page - page1 == random_pages and page < page_count:
                sleep(random.randint(6, 10))
                page1 = page
                random_pages = random.randint(1, 5)
        self.write_file(wrote_count)  # write the remaining (<20 pages of) posts
        print(u'微博爬取完成,共爬取%d條微博' % self.got_count)

    def start(self):
        """Run the crawler"""
        try:
            self.get_pages()
            print(u'信息抓取完畢')
            print('*' * 100)
            if self.pic_download == 1:
                self.download_files('img')
            if self.video_download == 1:
                self.download_files('video')
        except Exception as e:
            print('Error: ', e)
            traceback.print_exc()


def main():
    try:
        # candidate accounts (nickname -> id); only user_id below is crawled
        user_id_list = [
            {'我的婚禮化妝師': '534551800'}, {'成都結婚攻略': '2418039085'},
            {'婚禮素材收集者': '2074501384'}, {'中國婚禮匯': '2560464771'},
            {'廣州結婚攻略': '3610896264'}, {'小犀婚禮手賬': '6221152196'},
            {'全球奢侈婚禮': '3481986133'}, {'婚禮日記': '6475967681'},
            {'就是那個焱焱': '1215789145'}, {'時尚新娘COSMOBride': '1678808364'},
            {'婚紗集': '2202994885'}, {'全球潮流婚紗': '2048661167'},
            {'婚禮圈圈圈': '5688689265'}, {'婚禮美圖': '2717755093'},
            {'寧波婚紗攝影精選': '6245644455'}, {'新娘圈wedding ': '5285369641'},
            {'結婚種草君': '6506181316'}, {'結婚課堂': '2371517642'},
            {'我是婚紗控': '2182991010'}, {'潮女婚紗控': '2606938454'},
            {'你的婚禮日記': '5896420557'}, {'玉小姐的婚禮夢': '5725853889'},
            {'婚禮視覺志': '6433807822'}, {'蘇州婚紗攝影': '3301023630'},
            {'唯美婚紗': '3208926445'}]
        user_id = 3208926445  # change to any valid user id
        filter = 1  # 0: crawl all posts (originals + reposts); 1: only originals
        since_date = '2019-07-01'  # crawl posts published from this date on, yyyy-mm-dd
        pic_download = 1  # 0: skip original images; 1: download them
        video_download = 0  # 0: skip videos; 1: download them
        wb = Weibo(user_id, filter, since_date, pic_download, video_download)
        wb.start()
    except Exception as e:
        print('Error: ', e)
        traceback.print_exc()


if __name__ == '__main__':
    main()
That's the whole Weibo scraping example; if anything is unclear, work through the code a method at a time.
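One natural adaptation: main() above only crawls a single user_id even though it defines a whole user_id_list. A sketch of a loop over the list, reusing the Weibo class as-is (note the list stores ids as strings, while the constructor insists on an int):

def crawl_all(user_id_list):
    """Crawl every account in a list of {nickname: id} dicts."""
    for account in user_id_list:
        for name, uid in account.items():
            print(u'--- %s (id %s) ---' % (name, uid))
            # Weibo.__init__ checks isinstance(user_id, int), so the
            # string ids from the list must be converted first
            wb = Weibo(int(uid), filter=1, since_date='2019-07-01',
                       pic_download=0, video_download=0)
            wb.start()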
Summary
The crawler reads a user's posts page by page from the m.weibo.cn container API, normalizes publish dates and the 萬-style counts, writes the results to a csv file under a weibo/<screen name> directory next to the script, and optionally downloads the original images and videos; random sleeps between pages reduce the risk of being rate-limited.