當前位置：首頁 > 编程语言 > python >内容正文

python

【Python数据分析实战】豆瓣读书分析(含代码和数据集)

發布時間：2024/8/1 python 34 豆豆

生活随笔收集整理的這篇文章主要介紹了【Python数据分析实战】豆瓣读书分析(含代码和数据集)，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

@[TOC]豆瓣

一.導入數據

數據集：
鏈接：douban.csv
提取碼：pmls

# Load the libraries used by the analysis.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Read the dataset, using the first CSV column as the index.
df = pd.read_csv(r'/PythonTest/Data/book_douban.csv', index_col=0)

# Preview the first ten rows.
df.head(10)
# Screenshot of the preview from the original article:
# ![在這里插入圖片描述](https://img-blog.csdnimg.cn/eb27ca3a59a44089a587da9b2774fbf2.png?x-oss-process=image/watermark,type_d3F5LXplbmhlaQ,shadow_50,text_Q1NETiBAQ0hSTuaZqA==,size_20,color_FFFFFF,t_70,g_se,x_16)

# Print a summary of the frame (df.info()).
df.info()

二.數據清洗

# Rename the '數(shù)' column to '頁數(shù)' and renumber the rows from zero,
# discarding the previous index.
df = df.rename(columns={'數(shù)': '頁數(shù)'}).reset_index(drop=True)

# Inspect the frame's dimensions.
df.shape

# Show descriptive statistics for the frame.
df.describe()

2.1清理null值

# Treat the literal string 'None' as a missing value.
df.replace('None', np.nan, inplace=True)

# Count the missing values in each column.
df.isnull().sum()

# Remove the 'ISBM' column.
df.drop(columns='ISBM', inplace=True)

# Drop every row that lacks a value in any of the columns listed below.
required_columns = ['作者', '出版社', '出版時間', '頁數(shù)', '價格', '評分', '評論數(shù)量']
df.dropna(axis=0, subset=required_columns, how='any', inplace=True)

# Renumber the rows after the removals.
df.reset_index(drop=True, inplace=True)

# Confirm whether any missing values remain.
df.isna().sum()

2.2清洗出版時間列

從數據集中可以發現出版時間的數據格式多樣，有 1999、2012/12、1923-4、2019年六月 等，因此需要提取出其年份。

# Extract the publication year so the values can be aggregated.
import re  # imported by the original snippet; kept in case other cells use it

# Remove spaces, then concatenate every run of digits and keep the first four
# characters (e.g. '2012/12' -> '2012', '1923-4' -> '1923').
# This is done column-wise by name instead of looping over rows and addressing
# the column by position, so it does not depend on the column order or on the
# index being a fresh 0..n-1 range.
publish_year = df['出版時間'].str.replace(' ', '')
publish_year = publish_year.str.findall(r'\d+').str.join('').str[:4]
df['出版時間'] = publish_year

# Rows that did not yield exactly four characters cannot be read as a year.
df.drop(df[df['出版時間'].str.len() != 4].index, axis=0, inplace=True)

# Convert the year to an integer type.
df['出版時間'] = df['出版時間'].astype(np.int32)

# Remove rows whose year is later than 2019.
df.drop(df[df['出版時間'] > 2019].index, inplace=True)

2.3轉換評分及平均數量的數據類型

# Convert the rating column to float and the review-count column to int32.
df = df.astype({'評分': float, '評論數(shù)量': np.int32})

2.4清洗頁數列

# Count how many page values contain a decimal point. regex=False matches the
# dot as plain text, which avoids the invalid '\.' escape in a non-raw string.
df['頁數(shù)'].str.contains('.', regex=False).value_counts()

結果：
False 46173
True 7
Name: 頁數(shù), dtype: int64

# Normalise the page counts: strip commas and spaces first.
df['頁數(shù)'] = df['頁數(shù)'].apply(lambda pages: pages.replace(',', '').replace(' ', ''))

# Discard rows whose page count is not made up purely of decimal digits
# (for example values that contain '.').
not_digits = ~(df['頁數(shù)'].str.isdecimal())
df.drop(df.index[not_digits], axis=0, inplace=True)

# Convert the page counts to an integer type.
df['頁數(shù)'] = df['頁數(shù)'].astype(np.int32)

# Remove rows whose page count is 0.
zero_pages = df['頁數(shù)'] == 0
df.drop(df.index[zero_pages], inplace=True)

2.5清洗價格列

# Normalise the prices: strip commas and spaces first.
df['價格'] = df['價格'].apply(lambda price: price.replace(',', '').replace(' ', ''))

# Keep only values that are plain numbers: optional integer digits, at most one
# '.', and a digit as the last character. The mask is computed once for the
# whole column, so rows are not dropped from the frame while it is being
# iterated, and the column is addressed by name rather than by position.
# Values with several dots (e.g. '1.2.3') are rejected here instead of
# failing later in astype(float).
is_plain_number = df['價格'].str.match(r'\d*\.?\d+$')
df.drop(df.index[~is_plain_number], axis=0, inplace=True)

# Convert the prices to float.
df['價格'] = df['價格'].astype(float)

# Remove books priced below 1.
df.drop(df[df['價格'] < 1].index, inplace=True)

2.6去除書名重復的數據

# Show how often each title currently occurs.
title_column = df['書名']
title_column.value_counts()

# Count the rows whose title repeats an earlier row (True) versus the rest (False).
repeated_title = title_column.duplicated()
repeated_title.value_counts()

結果：
False 42813
True 2073
Name: 書名, dtype: int64

# Order the rows by review count, highest first, and renumber them. The
# duplicate removal that follows keeps the first row per title, so this order
# decides which copy survives.
df = df.sort_values(by='評論數(shù)量', ascending=False).reset_index(drop=True)

# Keep only the first row for each title (the frame is expected to be sorted
# already, so "first" is the preferred copy), then renumber the rows.
df.drop_duplicates(subset='書名', keep='first', inplace=True)
df.reset_index(drop=True, inplace=True)

# Check whether any title still occurs more than once.
df['書名'].value_counts()

# Save the cleaned data. The `encoding` keyword is no longer passed: pandas
# deprecated it for to_excel in 1.5 and removed it in 2.0, where passing it
# raises TypeError.
# NOTE(review): pandas >= 2.0 also has no writer for legacy '.xls' files —
# confirm the installed version, or switch the target to '.xlsx'.
df.to_excel(r'/PythonTest/Data/douban_book.xls')

df

2.7哪個出版社的書籍評分較高？

# Count the works per publisher as a two-column frame. rename_axis and
# reset_index(name=...) name both columns explicitly, so the result does not
# depend on how value_counts labels its output (that labelling changed in
# pandas 2.0 and broke the former rename of 'index' / '出版社').
press = (
    df['出版社']
    .value_counts()
    .rename_axis('出版集團')
    .reset_index(name='出版數(shù)量')
)
press

# Names of the publishers with more than 200 works.
lst = press[press['出版數(shù)量'] > 200]['出版集團'].tolist()

# Mean rating of those publishers, highest first. The aggregation is given as
# the string 'mean' because passing np.mean to agg is deprecated in recent
# pandas.
press_rank = (
    df[df['出版社'].isin(lst)]
    .groupby(by='出版社', as_index=False)
    .agg({'評分': 'mean'})
    .sort_values(by='評分', ascending=False)
)

# Save as Excel. `encoding` is omitted because pandas 2.0 removed it from
# to_excel.
# NOTE(review): pandas >= 2.0 cannot write '.xls' — confirm the installed
# version, or switch the target to '.xlsx'.
press_rank.to_excel(r'/PythonTest/Data/press_rank.xls')
press_rank

2.8哪些書值得一讀？

# Works with more than 50000 reviews, best rated first.
sor = df[df['評論數(shù)量'] > 50000].sort_values(by='評分', ascending=False)
sor

# Weighted score = (v / (v + m)) * R + (m / (v + m)) * C
#   R: the work's own average rating
#   v: the number of reviews the work received
#   m: the minimum number of reviews needed to enter the ranking
#   C: the mean rating over all works
# The original expression left out the "* C" factor, so the mean rating
# computed here was never used and the second term was not a rating at all.
mean_rating = df['評分'].mean()
min_votes = 50000

votes = sor['評論數(shù)量']
own_weight = votes / (votes + min_votes)
prior_weight = min_votes / (votes + min_votes)
sor['加權總分'] = own_weight * sor['評分'] + prior_weight * mean_rating

# The 20 works with the highest weighted score.
book_rank = sor.sort_values(by='加權總分', ascending=False).reset_index(drop=True).head(20)

# Save as Excel. `encoding` is omitted because pandas 2.0 removed it from
# to_excel.
# NOTE(review): pandas >= 2.0 cannot write '.xls' — confirm the installed
# version, or switch the target to '.xlsx'.
book_rank.to_excel(r'/PythonTest/Data/book_rank.xls')
book_rank

2.9作者排名（10部作品及以上）

# Keep works with more than 100 reviews ...
df1 = df[df['評論數(shù)量'] > 100]
# ... and, of those, the ones rated 8 or higher.
df1 = df1[df1['評分'] >= 8]

# Count the remaining works per author as a two-column frame. Both columns are
# named explicitly, so the result does not depend on how value_counts labels
# its output (that labelling changed in pandas 2.0 and broke the former rename
# of 'index' / '作者').
writer = (
    df1['作者']
    .value_counts()
    .rename_axis('作家')
    .reset_index(name='作品數(shù)量')
)
writer

# Authors with at least 10 works in the filtered set.
lst1 = writer[writer['作品數(shù)量'] >= 10]['作家'].tolist()

# Mean rating per such author, highest first, limited to the top 20. The
# aggregation is given as the string 'mean' because passing np.mean to agg is
# deprecated in recent pandas.
writer_rank = (
    df1[df1['作者'].isin(lst1)]
    .groupby(by='作者', as_index=False)
    .agg({'評分': 'mean'})
    .sort_values(by='評分', ascending=False)
    .reset_index(drop=True)
    .head(20)
)

# Save as Excel. `encoding` is omitted because pandas 2.0 removed it from
# to_excel.
# NOTE(review): pandas >= 2.0 cannot write '.xls' — confirm the installed
# version, or switch the target to '.xlsx'.
writer_rank.to_excel(r'/PythonTest/Data/writer_rank.xls')
writer_rank

三.數據分析與可視化

3.1各年作品出版數(shù)量折線圖

3.2各價位作品數(shù)量直方圖

3.3各出版社出版作品數(shù)量條形圖&評分折線圖

3.4作者作品評分條形圖

3.5作品評分樹狀圖

總結

以上是生活随笔為你收集整理的【Python数据分析实战】豆瓣读书分析(含代码和数据集)的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：解决打印机问题的方法
下一篇： linux的vi编辑器的dd命令,lin