爬取腾讯课堂的课程评论
生活随笔
收集整理的這篇文章主要介紹了
爬取腾讯课堂的课程评论
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
最近想了解一下在線教育的課程該如何選擇、課程的質量如何,所以試著去爬了一下騰訊課堂,只爬了IT互聯網這一項。
通過分析發現,要想爬取到評論需要四個步驟:
解析學習方向,如下圖所示:
通過開發者工具審查元素,發現標簽在<dl class="sort-menu sort-menu1 clearfix">下
然后去寫解析代碼:
解析學習方向下的分類,如下圖所示(發現與第一步相似):
到這里就要解析課程信息了,全部課程都在<ul class="course-card-list" auto-test="">下,如圖所示:
解析代碼如下:
好了,參數都分析好了,接著看代碼:
到這里整個爬蟲就寫完了,全部代碼如下:
import csv
import html
import io
import json
import random
import re
import sys
import time

import requests as req

from utils.spider import Spider


class Ten(Spider):
    """Scraper for course listings and course comments on ke.qq.com.

    Builds on the project-local ``Spider`` base class.  ``round_header()``
    and ``get()`` are inherited from it; presumably they return a random
    user-agent string and perform an HTTP GET respectively -- confirm
    against ``utils.spider``.
    """

    # Container holding the category menu on a listing page.
    _MENU_RE = re.compile(
        r'<dl class="sort-menu sort-menu1 clearfix">(.*?)</dl>', re.S)

    # Container holding every course card on a listing page.
    _CARD_LIST_RE = re.compile(
        r'<ul class="course-card-list.+?">\s+(.+)\s+</ul>', re.S)

    # One course card: (href, course name, enrolment text, provider, price).
    _CARD_RE = re.compile(
        r'<li class="course-card-item.*?">.*?<h4 class="item-tt">\s+'
        r'<a href="(.*?)" target="_blank" class="item-tt-link.*?">(.*?)</a>\s+</h4>.*?<div '
        r'class="item-line.*?middle">\s+<span class="line-cell.*?">\s+(.*?)\s+</span>\s+<span '
        r'class="item-source">.*?class="item-source-link.*?">(.*?)</a>\s+.*?<div '
        r'class="item-line.*?bottom">\s+<span class="line-cell item-price free">(.*?)</span>\s+</div>\s+</li>',
        re.S)

    def __init__(self, url):
        """Remember the start URL and open a shared ``requests`` session."""
        Spider.__init__(self, url)
        self.url = url
        self.s = req.Session()
        # 1 until the CSV header has been written by ``save``; 0 afterwards.
        self.flag = 1

    def get_menu_link(self, url, _pattern):
        """Yield absolute category links found in the sort menu of ``url``.

        Args:
            url: Listing page to download.
            _pattern: Regex source with one capture group for the ``href``
                of each menu entry; it is applied to the menu container only.

        Yields:
            ``https://ke.qq.com``-prefixed links.  Nothing is yielded when
            the page has no sort menu.
        """
        headers = {'user-agent': self.round_header()}
        start = time.perf_counter()
        content = self.s.get(url, headers=headers).text

        menu = self._MENU_RE.findall(content)
        link_pattern = re.compile(_pattern, re.S | re.M)
        if not menu:
            elapsed = time.perf_counter() - start
            print('{0}解析失敗!!!,共耗時:{1:f}s'.format(url, elapsed))
            return

        links = link_pattern.findall(menu[0])
        elapsed = time.perf_counter() - start
        print('{0}解析成功,共耗時:{1:f}s'.format(url, elapsed))
        for item in links:
            # hrefs are scraped out of raw HTML, so "&" arrives entity-encoded.
            yield 'https://ke.qq.com{0}'.format(html.unescape(item))

    def get_course_list(self, url):
        """Yield the first three courses listed on ``url``.

        Yields:
            Dicts with the keys ``url``, ``courseName``, ``num``, ``source``
            and ``fee``.  Nothing is yielded when no course list is found.
        """
        headers = {'user-agent': self.round_header()}
        start = time.perf_counter()
        content = self.s.get(url, headers=headers).text

        card_lists = self._CARD_LIST_RE.findall(content)
        if not card_lists:
            elapsed = time.perf_counter() - start
            print('在該鏈接下沒有找到課程列表,共耗時:{0}s'.format(elapsed))
            return

        # Only the first three cards of each category are sampled.
        courses = self._CARD_RE.findall(card_lists[0])[0:3]
        elapsed = time.perf_counter() - start
        print('解析成功,共耗時:{0}s'.format(elapsed))
        for item in courses:
            yield {
                # The captured href is protocol-relative ("//ke.qq.com/...").
                'url': 'https:{0}'.format(item[0]),
                'courseName': item[1],
                'num': item[2],
                'source': item[3],
                'fee': item[4],
            }

    def get_comment(self, url, params, headers):
        """Fetch one page of comments from the comment-list endpoint.

        Returns:
            A dict with ``total_page``, ``comments`` and ``total_num``, or
            ``None`` when the inherited ``get`` returns ``None``.  Values
            are ``None`` when the response carries no ``result`` object.
        """
        res = self.get(url, params=params, headers=headers)
        if res is None:
            return None
        # A response without "result" must not crash the whole crawl.
        result = json.loads(res.text).get('result') or {}
        return {
            'total_page': result.get('total_page'),
            'comments': result.get('items'),
            'total_num': result.get('total_num'),
        }

    def save(self, data):
        """Append ``data`` as one row to ``mooc.csv``.

        The header row is written once per instance, before the first row.
        """
        fieldnames = ['url', 'courseName', 'num', 'source', 'fee',
                      'total_num', 'total_page', 'comments']
        file_name = 'mooc.csv'
        with open(file_name, 'a+', newline='', encoding='utf-8') as f:
            w = csv.DictWriter(f, fieldnames)
            if self.flag == 1:
                w.writeheader()
                self.flag = 0
            w.writerow(data)


if __name__ == "__main__":
    # Category: IT / Internet.
    # Step 1: parse the category URLs under "Internet".
    # Step 2: parse the sub-categories under each first-level menu entry.
    # Step 3: parse the first three course links under each sub-category.
    # Step 4: fetch the comments of each course.
    url = 'https://ke.qq.com/course/list?mt=1001'
    comment_url = 'https://ke.qq.com/cgi-bin/comment_new/course_comment_list'
    # Session cookie captured from a browser visit; sent with every comment
    # request.
    cookie = (
        'pgv_info=ssid=s6819497920; ts_last=ke.qq.com/course/144558; '
        'pgv_pvid=1821056816; ts_uid=7896600315; '
        '_pathcode=0.9075570219139721; tdw_auin_data=-; '
        'tdw_data={"ver4":"4","ver6":"","refer":"","from_channel":"",'
        '"path":"eh-0.9075570219139721","auin":"-","uin":null,"real_uin":null}; '
        'tdw_first_visited=1; '
        'Hm_lvt_0c196c536f609d373a16d246a117fd44=1543998342; '
        'Hm_lpvt_0c196c536f609d373a16d246a117fd44=1543998342; '
        'tdw_data_new_2={"auin":"-","sourcetype":"","sourcefrom":"","uin":"",'
        '"visitor_id":"53087919"}'
    )
    # Sub-category pages on which no course list could be parsed.
    list_no = []
    t = Ten(url)

    # 1. First-level menu entries.
    link_paternt = r'<dd class="">\s+<\w+></\w+>\s+<a href="(.*?)" title=".*?">.*?</a>\s+</dd>'
    # 2. Second-level menu entries.
    option_pattern = r'<dd class="">\s+<a href="(.*?)" title=".*?">.*?</a>\s+</dd>'

    print('--------開始爬取--------')
    for item in t.get_menu_link(url, link_paternt):
        options = t.get_menu_link(item, option_pattern)
        time.sleep(2)
        for option in options:
            print('開始解析{}'.format(option), end=' ====>> ')
            # 3. The generator is materialised so that an empty result can
            #    actually be detected and recorded.
            course_list = list(t.get_course_list(option))
            time.sleep(2)
            if not course_list:
                list_no.append(option)
                continue

            for coures in course_list:
                _url = coures.get('url')
                # 4. The course id is the first numeric path segment.
                matched = re.search(r'/(\d+)', _url)
                if matched is None:
                    continue
                cid = matched.group(1)
                # Random float truncated to 17 decimals, sent as the "r"
                # request parameter.
                r = float('{0:.18f}'.format(random.random())[0:19])
                params = {
                    'cid': cid,
                    'count': 10,
                    'page': 0,
                    'filter_rating': 0,
                    'bkn': '',
                    'r': r,
                }
                headers = {
                    'user-agent': t.round_header(),
                    'referer': _url,
                    'cookie': cookie,
                }
                print('獲取cid:{0}的評論'.format(cid), end='\t')
                comments = t.get_comment(comment_url, params, headers=headers)
                if comments:
                    coures.update(comments)
                t.save(coures)

# 總結  (article section heading, "Summary"; it was fused onto the last code line)
以上是生活随笔為你收集整理的爬取腾讯课堂的课程评论的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 【数据科学】使用Python建立你的数据
- 下一篇: 如何在微信公众号正文中添加附件?