python爬虫取腾讯视频评论
生活随笔
收集整理的這篇文章主要介紹了
python爬虫取腾讯视频评论
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
文章目錄
- 1. 爬取評論內容代碼
- 2.爬取評論時間代碼
- 3.數據處理部分
- 3.1 評論的時間戳轉換為正常時間
- 3.2 評論內容讀入csv
- 3.3 統計一天各個時間段內的評論數
- 3.4 統計最近評論數
- 4.數據分析
- 4.1 制作詞云圖
- 4.2 制作最近評論數條形圖與折線圖
- 4.3 制作每小時評論條形圖與折線圖
- 4.4 制作最近評論數餅圖
- 4.5 制作每小時評論餅圖
- 4.6 制作觀看時間區間評論統計餅圖
- 4.7 制作主演提及占比餅圖
- 4.8 評論內容情感分析
本文以騰訊視頻網頁上電視劇《喬家的兒女》的后臺評論為對象,進行數據爬取與數據分析。由于完全爬取耗時巨大,本文只選取30頁后臺評論作為數據爬取范圍,并對評論內容進行情緒文本分析處理。
騰訊的評論數據在json文件里面,需要找到Json文件中的comment_id進行數據提取
1. 爬取評論內容代碼
爬取數據時注意要找到comment_id。《喬家的兒女》的評論id是:7359549499;page_num是爬取的頁數,本文選擇爬取30頁。
2.爬取評論時間代碼
將爬取到的評論時間(時間戳)保存為time.txt
3.數據處理部分
3.1 評論的時間戳轉換為正常時間
import csv
import time


def convert_timestamps(src_path="time.txt", dst_path="data.csv"):
    """Convert Unix timestamps (one per line) into date/time CSV rows.

    Each non-blank line of ``src_path`` is parsed as an integer timestamp,
    formatted in local time as ``YYYY-MM-DD HH:MM:SS``, printed, and written
    to ``dst_path`` as a two-column row: date, time.

    Raises:
        ValueError: if a non-blank line is not an integer.
    """
    # Context managers close both files even when a line fails to parse.
    with open(src_path, "r", encoding="utf-8") as src, \
            open(dst_path, "w", newline="", encoding="utf-8") as dst:
        writer = csv.writer(dst)
        for line in src:
            line = line.strip()
            if not line:
                # A blank (e.g. trailing) line would make int() raise.
                continue
            formatted = time.strftime(
                "%Y-%m-%d %H:%M:%S", time.localtime(int(line))
            )
            print(formatted)
            # Splitting on the space yields separate date and time columns.
            writer.writerow(formatted.split())


if __name__ == "__main__":
    convert_timestamps()
# 3.2 評論內容讀入csv
在文件夾中能找到csv文件
3.3 統計一天各個時間段內的評論數
import csv
from collections import Counter

# NOTE(review): the three third-party imports below are not used by this
# snippet; they are kept only because the original imported them.
from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud

# Count comments per hour of day. Column 1 of data.csv is expected to hold a
# "HH:MM:SS" time, so its first two characters are the hour.
with open('data.csv', newline='', encoding='utf-8') as csvfile:
    # Rows with fewer than two columns (e.g. blank lines) are skipped instead
    # of raising IndexError.
    data1 = [str(row[1])[0:2] for row in csv.reader(csvfile) if len(row) > 1]
print(data1)
print(type(data1))

# Counter tallies every hour in a single pass (the original called
# list.count once per distinct value). Sorting the items orders them by hour.
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

# One (hour, count) pair per CSV row.
with open("time2.csv", "w", newline='', encoding='utf-8') as f:
    csv.writer(f, delimiter=',').writerows(rst)

# Read the result back once and split it into chart-ready X / Y lists.
with open('time2.csv', newline='', encoding='utf-8') as csvfile:
    rows = list(csv.reader(csvfile))
x = [str(row[0]) for row in rows]
print(x)
y1 = [float(row[1]) for row in rows]
print(y1)
# 3.4 統計最近評論數
import csv
from collections import Counter

# NOTE(review): the three third-party imports below are not used by this
# snippet; they are kept only because the original imported them.
from pyecharts import options as opts
from sympy.combinatorics import Subset
from wordcloud import WordCloud

# Count comments per distinct value of column 0 of data.csv (expected to be
# the comment date).
with open('data.csv', newline='', encoding='utf-8') as csvfile:
    # Empty rows (blank lines) are skipped instead of raising IndexError.
    data1 = [str(row[0]) for row in csv.reader(csvfile) if row]
print(type(data1))

# Counter tallies every value in a single pass (the original called
# list.count once per distinct value). Sorting the items orders them by key.
rst = sorted(Counter(data1).items())
print(type(rst))
print(rst)

# One (value, count) pair per CSV row.
with open("time1.csv", "w", newline='', encoding='utf-8') as f:
    csv.writer(f, delimiter=',').writerows(rst)

# Read the result back once and split it into chart-ready X / Y lists.
with open('time1.csv', newline='', encoding='utf-8') as csvfile:
    rows = list(csv.reader(csvfile))
x = [str(row[0]) for row in rows]
print(x)
y1 = [float(row[1]) for row in rows]
print(y1)
# 4.數據分析
數據分析方面:涉及到了詞云圖,條形,折線,餅圖,后三者是對評論時間與主演占比的分析,然而騰訊的評論時間是以時間戳的形式顯示,所以要進行轉換,再去統計出現次數,最后,新加了對評論內容的情感分析。
4.1 制作詞云圖
import numpy as np
import re
import jieba
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image

# The packages above must be installed before running this snippet.

# content.txt is the data source: the text the word cloud is generated from.
with open('content.txt', 'r', encoding='utf-8') as f:
    txt = f.read()

# Remove ASCII letters, digits and a few punctuation marks. A raw string
# without the redundant backslashes matches the same characters and avoids
# invalid-escape-sequence warnings.
newtxt = re.sub(r"[A-Za-z0-9!%\[\],。]", "", txt)
print(newtxt)

# Chinese text has no spaces between words, so segment it with jieba and join
# the tokens with spaces. The original computed `words` but then generated the
# cloud from the unsegmented text.
words = jieba.lcut(newtxt)
segmented = " ".join(words)

img = Image.open(r'wc.jpg')  # image whose shape is used as the mask
img_array = np.array(img)

wordcloud = WordCloud(
    background_color="white",
    width=1080,
    height=960,
    font_path="../文悅新青年.otf",
    max_words=150,
    scale=10,  # rendering sharpness, per the original note
    max_font_size=100,
    mask=img_array,
    collocations=False,  # per the original note, avoids duplicated entries
).generate(segmented)

plt.imshow(wordcloud)
plt.axis('off')
plt.show()
wordcloud.to_file('wc.png')
# 4.2 制作最近評論數條形圖與折線圖
# encoding: utf-8
import csv
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType


class DrawBar(object):
    """Bar chart built from the two columns of time1.csv."""

    def __init__(self):
        """Create the Bar instance with its width, height and theme."""
        init_options = opts.InitOpts(
            width='1500px', height='700px', theme=ThemeType.LIGHT
        )
        self.bar = Bar(init_opts=init_options)

    def add_x(self):
        """Use column 0 of time1.csv as the X-axis data."""
        labels = []
        with open('time1.csv') as csvfile:
            for record in csv.reader(csvfile):
                labels.append(str(record[0]))
            print(labels)
        self.bar.add_xaxis(xaxis_data=labels)

    def add_y(self):
        """Use column 1 of time1.csv, converted to float, as the Y series."""
        values = []
        with open('time1.csv') as csvfile:
            for record in csv.reader(csvfile):
                values.append(float(record[1]))
            print(values)
        label_options = opts.LabelOpts(is_show=True, color="black")
        self.bar.add_yaxis(
            series_name="評論數",       # series name
            y_axis=values,             # series data
            label_opts=label_options,  # value labels shown, in black
            bar_max_width='100px',     # maximum bar width
        )

    def set_global(self):
        """Set the chart-wide options: title, tooltip and toolbox."""
        title = opts.TitleOpts(
            title='喬家的兒女近日評論統計',
            title_textstyle_opts=opts.TextStyleOpts(font_size=35),
        )
        # Tooltip shown on hover. Per the original notes, the "axis" trigger
        # follows the pointer along the X axis and the "cross" pointer draws
        # guide lines perpendicular to both axes.
        tooltip = opts.TooltipOpts(
            is_show=True,
            trigger="axis",
            axis_pointer_type="cross",
        )
        # Called with no arguments; the original note says this enables all
        # toolbox tools.
        toolbox = opts.ToolboxOpts()
        self.bar.set_global_opts(
            title_opts=title,
            tooltip_opts=tooltip,
            toolbox_opts=toolbox,
        )

    def draw(self):
        """Populate the chart and render it to DrawBar.html."""
        self.add_x()
        self.add_y()
        self.set_global()
        # The output is an HTML file that can be opened in a browser.
        self.bar.render('DrawBar.html')

    def run(self):
        """Entry point: draw the chart."""
        self.draw()


if __name__ == '__main__':
    DrawBar().run()
# DrawBar
# 4.3 制作每小時評論條形圖與折線圖
# encoding: utf-8
import csv
import pyecharts.options as opts
from pyecharts.charts import Bar
from pyecharts.globals import ThemeType


class DrawBar(object):
    """Bar chart of the per-hour comment counts stored in time2.csv."""

    def __init__(self):
        """Create the Bar instance with its width, height and theme."""
        self.bar = Bar(
            init_opts=opts.InitOpts(
                width='1500px', height='700px', theme=ThemeType.MACARONS
            )
        )

    def add_x(self):
        """Use column 0 of time2.csv, suffixed with '點', as X-axis data."""
        str_name1 = '點'
        with open('time2.csv') as csvfile:
            x = [str(row[0] + str_name1) for row in csv.reader(csvfile)]
        print(x)
        self.bar.add_xaxis(xaxis_data=x)

    def add_y(self):
        """Use column 1 of time2.csv, converted to int, as the Y series."""
        with open('time2.csv') as csvfile:
            y1 = [int(row[1]) for row in csv.reader(csvfile)]
        print(y1)
        self.bar.add_yaxis(
            series_name="評論數",                         # series name
            y_axis=y1,                                   # series data
            label_opts=opts.LabelOpts(is_show=False),    # hide value labels
            bar_max_width='50px',                        # maximum bar width
        )

    def set_global(self):
        """Set the chart-wide options: title, tooltip and toolbox."""
        self.bar.set_global_opts(
            title_opts=opts.TitleOpts(
                # The original title named a different show (掃黑風暴), a
                # copy-paste leftover; every other chart in this article is
                # titled after 喬家的兒女.
                title='喬家的兒女各時間段評論統計',
                title_textstyle_opts=opts.TextStyleOpts(font_size=35),
            ),
            # Tooltip shown on hover. Per the original notes, the "axis"
            # trigger follows the pointer along the X axis and the "cross"
            # pointer draws guide lines perpendicular to both axes.
            tooltip_opts=opts.TooltipOpts(
                is_show=True,
                trigger="axis",
                axis_pointer_type="cross",
            ),
            # Called with no arguments; the original note says this enables
            # all toolbox tools.
            toolbox_opts=opts.ToolboxOpts(),
        )

    def draw(self):
        """Populate the chart and render it to DrawBar2.html."""
        self.add_x()
        self.add_y()
        self.set_global()
        # The output is an HTML file that can be opened in a browser.
        self.bar.render('DrawBar2.html')

    def run(self):
        """Entry point: draw the chart."""
        self.draw()


if __name__ == '__main__':
    app = DrawBar()
    app.run()
# 4.4 制作最近評論數餅圖
import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint
from pyecharts.globals import ThemeType

# Load time1.csv: column 0 gives the slice labels, column 1 the slice values.
with open('time1.csv') as csvfile:
    records = list(csv.reader(csvfile))

x = [str(record[0]) for record in records]
print(x)
y1 = [float(record[1]) for record in records]
print(y1)

num = y1
lab = x
# (label, value) pairs used by all three series below.
pairs = list(zip(lab, num))

# Explicit canvas size; the original note gives 900 x 600 as the default.
pie = Pie(
    init_opts=opts.InitOpts(width='1700px', height='450px', theme=ThemeType.LIGHT)
)
pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title="喬家的兒女近日評論統計",
        title_textstyle_opts=opts.TextStyleOpts(font_size=27),
    ),
    # Legend position.
    legend_opts=opts.LegendOpts(pos_top="10%", pos_left="1%"),
)
# Three renderings of the same data, side by side on one canvas.
pie.add(series_name='', center=[280, 270], data_pair=pairs)  # plain pie
pie.add(series_name='', center=[845, 270], data_pair=pairs,
        radius=['40%', '75%'])  # ring chart
pie.add(series_name='', center=[1380, 270], data_pair=pairs,
        rosetype='radius')  # Nightingale rose chart
pie.render('pie_pyecharts.html')
# 4.5 制作每小時評論餅圖
import csv
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint
from pyecharts.globals import ThemeType

str_name1 = '點'

# Load time2.csv: column 0 (plus the '點' suffix) gives the slice labels,
# column 1 the slice values.
with open('time2.csv') as csvfile:
    records = list(csv.reader(csvfile))

x = [str(record[0] + str_name1) for record in records]
print(x)
y1 = [int(record[1]) for record in records]
print(y1)

num = y1
lab = x
# (label, value) pairs used by all three series below.
pairs = list(zip(lab, num))

# Explicit canvas size; the original note gives 900 x 600 as the default.
pie = Pie(
    init_opts=opts.InitOpts(width='1650px', height='500px', theme=ThemeType.LIGHT)
)
pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title="喬家的兒女每小時評論統計",
        title_textstyle_opts=opts.TextStyleOpts(font_size=27),
    ),
    # Legend position.
    legend_opts=opts.LegendOpts(pos_top="8%", pos_left="4%"),
)
# Three renderings of the same data, side by side on one canvas.
pie.add(series_name='', center=[250, 300], data_pair=pairs)  # plain pie
pie.add(series_name='', center=[810, 300], data_pair=pairs,
        radius=['40%', '75%'])  # ring chart
pie.add(series_name='', center=[1350, 300], data_pair=pairs,
        rosetype='radius')  # Nightingale rose chart
pie.render('pie_pyecharts2.html')
# 4.6 制作觀看時間區間評論統計餅圖
import csv
from collections import Counter
from random import randint

# NOTE(review): Subset, WordCloud and randint are not used by this snippet;
# the imports are kept only because the original had them.
from pyecharts import options as opts
from pyecharts.charts import Pie
from pyecharts.globals import ThemeType
from sympy.combinatorics import Subset
from wordcloud import WordCloud

# Hour of day of every comment: the first two characters of column 1 of
# data.csv (expected to be a "HH:MM:SS" time), as an int.
with open('data.csv', newline='', encoding='utf-8') as csvfile:
    # Rows with fewer than two columns (e.g. blank lines) are skipped.
    data2 = [int(row[1].strip()[0:2]) for row in csv.reader(csvfile) if len(row) > 1]
print(type(data2))

# (hour, count) pairs sorted by hour; Counter tallies them in one pass.
hour_counts = sorted(Counter(data2).items())
print(type(hour_counts))

# NOTE: this overwrites time2.csv, and the hour keys here are plain ints
# (e.g. 8, not "08").
with open("time2.csv", "w", newline='', encoding='utf-8') as f:
    csv.writer(f, delimiter=',').writerows(hour_counts)

# Four six-hour periods of the day, in label order.
x = ['凌晨', '上午', '下午', '晚上']
hours_per_period = 24 // len(x)

# Bucket by the hour VALUE rather than by list position, so an hour with no
# comments cannot shift later hours into the wrong period.
list2 = [[] for _ in x]
for hour, count in hour_counts:
    list2[hour // hours_per_period].append((hour, count))
print("凌晨 : ", list2[0])
print("上午 : ", list2[1])
print("下午 : ", list2[2])
print("晚上 : ", list2[3])

print([count for _, count in hour_counts])

groups = [[count for _, count in bucket] for bucket in list2]
print(groups)

# Total comments per period. The original loop rebound y1 to the last group
# and threw the sums away, so the pie showed four single-hour counts.
y1 = [sum(group) for group in groups]
print(x)
print(y1)

# (label, value) pairs used by all three series below.
pairs = list(zip(x, y1))

# Explicit canvas size; the original note gives 900 x 600 as the default.
pie = Pie(
    init_opts=opts.InitOpts(width='1500px', height='450px', theme=ThemeType.LIGHT)
)
pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title="喬家的兒女觀看時間區間評論統計",
        title_textstyle_opts=opts.TextStyleOpts(font_size=30),
    ),
    # Legend position.
    legend_opts=opts.LegendOpts(pos_top="8%"),
)
# Three renderings of the same data, side by side on one canvas.
pie.add(series_name='', center=[260, 270], data_pair=pairs)  # plain pie
pie.add(series_name='', center=[1230, 270], data_pair=pairs,
        radius=['40%', '75%'])  # ring chart
pie.add(series_name='', center=[750, 270], data_pair=pairs,
        rosetype='radius')  # Nightingale rose chart
pie.render('pie_pyecharts3.html')
# 4.7 制作主演提及占比餅圖
import csv
import numpy as np
import re
import jieba
from matplotlib.pyplot import scatter
from wordcloud import WordCloud
from matplotlib import pyplot as plt
from PIL import Image
from pyecharts import options as opts
from pyecharts.charts import Pie
from random import randint
from pyecharts.globals import ThemeType

# The packages above must be installed before running this snippet.

# content.txt is the data source: the full comment text to search.
with open('content.txt', 'r', encoding='utf-8') as f:
    words = f.read()

name = ["白宇", "宋祖兒", "毛曉彤", "張晚意", "周翊然", "劉鈞", "李佳航", '朱珠']
print(name)

# One count per entry of `name`, derived from the list itself so labels and
# counts cannot drift apart.
# NOTE(review): str.count matches substrings, so "白宇" is also counted inside
# longer names such as "白宇帆"; confirm the intended spelling.
count = [float(words.count(actor)) for actor in name]
print(count)

num = count
lab = name
# (label, value) pairs used by all three series below.
pairs = list(zip(lab, num))

# Explicit canvas size; the original note gives 900 x 600 as the default.
pie = Pie(
    init_opts=opts.InitOpts(width='1650px', height='450px', theme=ThemeType.LIGHT)
)
pie.set_global_opts(
    title_opts=opts.TitleOpts(
        title="喬家的兒女主演提及占比",
        title_textstyle_opts=opts.TextStyleOpts(font_size=27),
    ),
    # Legend position.
    legend_opts=opts.LegendOpts(pos_top="3%", pos_left="33%"),
)
# Three renderings of the same data, side by side on one canvas.
pie.add(series_name='', center=[280, 270], data_pair=pairs)  # plain pie
pie.add(series_name='', center=[800, 270], data_pair=pairs,
        radius=['40%', '75%'])  # ring chart
pie.add(series_name='', center=[1300, 270], data_pair=pairs,
        rosetype='radius')  # Nightingale rose chart
pie.render('pie_pyecharts4.html')
# 4.8 評論內容情感分析
import numpy as np
from snownlp import SnowNLP
import matplotlib.pyplot as plt

# Each line of content.txt is scored as one comment. The context manager
# closes the file, which the original never did.
with open('content.txt', 'r', encoding='UTF-8') as f:
    comments = [line.strip() for line in f]

sentimentslist = []
for comment in comments:
    if not comment:
        # A blank line carries no comment text to score.
        continue
    score = SnowNLP(comment).sentiments
    print(score)
    sentimentslist.append(score)

# 101 evenly spaced edges give 100 bins covering the whole [0, 1] range.
# np.arange(0, 1, 0.01) stops at 0.99, so scores above 0.99 fell outside the
# last bin and were left out of the histogram.
plt.hist(sentimentslist, bins=np.linspace(0, 1, 101), facecolor='g')
plt.xlabel('Sentiments Probability')
plt.ylabel('Quantity')
plt.title('Analysis of Sentiments')
plt.show()
# 總結
以上是生活随笔為你收集整理的python爬虫取腾讯视频评论的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: python机器学习---1. K近邻算
- 下一篇: python的scikit-learn算