爬虫及数据分析--当当网
生活随笔
收集整理的這篇文章主要介紹了
爬虫及数据分析--当当网
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
大數據期末課程設計,設計爬蟲及數據分析,網上學習了很多資料,此份價值最高,作此記錄,侵刪。
"""Dangdang book-search scraper helpers and pyecharts visualizations."""

import os
import pickle

# NOTE(review): the third-party imports this code relies on are not visible in
# this excerpt: BeautifulSoup (bs4), jieba, and the pyecharts names Bar, Pie,
# Funnel, WordCloud, opts, ThemeType and SymbolType. They must be imported
# before any of the functions below are called.

# HTTP request headers; the Host entry pins them to search.dangdang.com.
headers = {
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/72.0.3626.119 Safari/537.36'
    ),
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
    'Host': 'search.dangdang.com',
}

# Delimiters at which a book title is cut to obtain a short chart label.
# NOTE(review): the original chain split on '(' twice; one of the two (and
# possibly ':') may have been a full-width character before the article text
# was re-encoded -- confirm against real titles.
_TITLE_DELIMITERS = ('【', '(', ' ', ':')


def parseHtml(html):
    """Parse one search-result page into a dict of book records.

    Args:
        html: HTML text of a result page.

    Returns:
        A dict mapping each book title to
        ``[img_src, price, detail, stars, num_comments]``.

    Raises:
        IndexError: if the ``con shoplist`` container or one of the expected
            ``<p>`` elements is missing.
        AttributeError / ValueError: if an item does not have the expected
            markup or numeric text.
    """
    data = {}
    soup = BeautifulSoup(html, 'lxml')
    conshoplist = soup.find_all('div', {'class': 'con shoplist'})[0]
    for each in conshoplist.find_all('li'):
        first_link = each.find_all('a')[0]
        # Book title.
        bookname = first_link.get('title').strip(' ')
        # Cover image: prefer 'data-original', fall back to 'src'.
        img_src = first_link.img.get('data-original')
        if img_src is None:
            img_src = first_link.img.get('src')
        img_src = img_src.strip(' ')
        # Price: the first character of the text is dropped
        # (presumably a currency sign -- TODO confirm).
        price = float(each.find_all('p', {'class': 'price'})[0].span.text[1:])
        # Description.
        detail = each.find_all('p', {'class': 'detail'})[0].text
        star_line = each.find_all('p', {'class': 'search_star_line'})[0]
        # Rating: the percentage in the inline style divided by 20.
        stars = float(
            star_line.span.span.get('style').split(': ')[-1].strip('%;')
        ) / 20
        # Comment count: the last three characters of the link text are
        # dropped (presumably a textual suffix -- TODO confirm).
        num_comments = float(star_line.a.text[:-3])
        data[bookname] = [img_src, price, detail, stars, num_comments]
    return data


def drawBar(title, data, savepath='./results'):
    """Render a 2-D bar chart of ``data`` to ``<savepath>/<title>.html``.

    Args:
        title: chart title, also used as the output file name.
        data: mapping of x-axis label to value.
        savepath: output directory; created (with parents) if missing.
    """
    os.makedirs(savepath, exist_ok=True)
    attrs = list(data.keys())
    values = list(data.values())
    c = (
        Bar(
            init_opts=opts.InitOpts(
                animation_opts=opts.AnimationOpts(
                    animation_delay=1000, animation_easing="elasticOut"
                ),
                theme=ThemeType.ROMA,
            )
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title, pos_left='35%'),
            datazoom_opts=[opts.DataZoomOpts(), opts.DataZoomOpts(type_="inside")],
        )
        .add_xaxis(attrs)
        .add_yaxis('', values)
    )
    c.render(os.path.join(savepath, '%s.html' % title))


def drawPie(title, data, savepath='./results'):
    """Render a rose-type pie chart of ``data`` to ``<savepath>/<title>.html``.

    Args:
        title: chart title, also used as the output file name.
        data: mapping of slice label to value.
        savepath: output directory; created (with parents) if missing.
    """
    os.makedirs(savepath, exist_ok=True)
    c = (
        Pie()
        .add(
            "",
            [list(item) for item in data.items()],
            radius=["30%", "65%"],
            center=["50%", "60%"],
            rosetype="radius",
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title, pos_left='35%'),
            legend_opts=opts.LegendOpts(
                orient="vertical", pos_top="15%", pos_left="2%"
            ),
        )
    )
    c.render(os.path.join(savepath, '%s.html' % title))


def drawFunnel(title, data, savepath='./results'):
    """Render a funnel chart of ``data`` to ``<savepath>/<title>.html``.

    Args:
        title: chart title, also used as the output file name.
        data: mapping of stage label to value; drawn in mapping order
            (``sort_="none"``).
        savepath: output directory; created (with parents) if missing.
    """
    os.makedirs(savepath, exist_ok=True)
    c = (
        Funnel()
        .add(
            "",
            [list(item) for item in data.items()],
            label_opts=opts.LabelOpts(position="inside"),
            sort_="none",
        )
        .set_global_opts(
            title_opts=opts.TitleOpts(title=title, pos_left='35%'),
            legend_opts=opts.LegendOpts(
                orient="vertical", pos_top="15%", pos_left="2%"
            ),
        )
    )
    c.render(os.path.join(savepath, '%s.html' % title))


def statistics(texts, stopwords):
    """Count word frequencies over ``texts`` using jieba segmentation.

    Args:
        texts: iterable of strings to segment.
        stopwords: iterable of words to ignore.

    Returns:
        A dict mapping each kept word to its number of occurrences. The
        literal token ``'unknow'`` is always skipped.
    """
    # Build the lookup once so each token test is O(1) instead of a list scan.
    ignored = frozenset(stopwords)
    words_dict = {}
    for text in texts:
        for token in jieba.cut(text):
            if token in ignored or token == 'unknow':
                continue
            words_dict[token] = words_dict.get(token, 0) + 1
    return words_dict


def drawWordCloud(words, title, savepath='./results'):
    """Render a word cloud to ``<savepath>/<title>.html``.

    Args:
        words: sequence of ``(word, weight)`` pairs.
        title: chart title, also used as the output file name.
        savepath: output directory; created (with parents) if missing.
    """
    os.makedirs(savepath, exist_ok=True)
    c = (
        WordCloud()
        .add("", words, word_size_range=[20, 100], shape=SymbolType.DIAMOND)
        .set_global_opts(title_opts=opts.TitleOpts(title=title, pos_left='40%'))
    )
    c.render(os.path.join(savepath, '%s.html' % title))


def _price_distribution(prices):
    """Count prices per fixed price band (lower bound inclusive)."""
    return {
        '小于50元': sum(p < 50 for p in prices),
        '50-100元': sum(50 <= p < 100 for p in prices),
        '100-200元': sum(100 <= p < 200 for p in prices),
        '200-300元': sum(200 <= p < 300 for p in prices),
        '300-400元': sum(300 <= p < 400 for p in prices),
        '400元以上': sum(p >= 400 for p in prices),
    }


def _star_distribution(ratings):
    """Count books per rating label, with labels in sorted string order."""
    counts = {}
    for rating in ratings:
        # Non-positive ratings are grouped under a single "no rating" label.
        label = str(rating if rating > 0 else '暫無評分')
        counts[label] = counts.get(label, 0) + 1
    return {label: counts[label] for label in sorted(counts)}


def _comment_distribution(comment_counts):
    """Count books per comment-count band (upper bound inclusive)."""
    return {
        '0評論': sum(n == 0 for n in comment_counts),
        '0-100評論': sum(0 < n <= 100 for n in comment_counts),
        '100-1000評論': sum(100 < n <= 1000 for n in comment_counts),
        '1000-5000評論': sum(1000 < n <= 5000 for n in comment_counts),
        '5000評論以上': sum(n > 5000 for n in comment_counts),
    }


def _short_title(bookname):
    """Return ``bookname`` cut at the first of the title delimiters."""
    for delimiter in _TITLE_DELIMITERS:
        bookname = bookname.split(delimiter)[0]
    return bookname


def _load_stopwords(path):
    """Read a UTF-8 stop-word file with one entry per line."""
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')
    # Drop only the empty tail produced by a trailing newline, so a final
    # word without a newline after it is not lost.
    if lines and lines[-1] == '':
        lines.pop()
    return lines


def visualization(str_name, num):
    """Build every chart for one saved crawl result.

    Loads ``<str_name>_<num>.pkl`` (a dict in the shape returned by
    ``parseHtml``) and writes five HTML charts into ``./results``: price
    distribution, rating distribution, comment-count distribution, the 20
    most-commented books, and a word cloud of the descriptions. Stop words
    are read from ``./stopwords.txt``.

    Args:
        str_name: search keyword; used in the pickle name and chart titles.
        num: suffix of the pickle file name.

    Raises:
        FileNotFoundError: if the pickle or the stop-word file is missing.
    """
    tmp = str_name + '_' + str(num) + '.pkl'
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # this project produced itself.
    with open(tmp, 'rb') as f:
        data = pickle.load(f)

    # Price distribution.
    prices = [value[1] for value in data.values()]
    tmp = str_name + '相關圖書的價格分布'
    drawPie(tmp, _price_distribution(prices))

    # Rating distribution.
    ratings = [value[3] for value in data.values()]
    tmp = str_name + '相關圖書評分的分布'
    drawBar(tmp, _star_distribution(ratings))

    # Comment counts.
    comments_num = []
    top20 = {}
    for key, value in data.items():
        num = int(value[-1])
        comments_num.append(num)
        # Titles that shorten to the same label overwrite each other.
        top20[_short_title(key)] = num
    tmp = str_name + '相關圖書評論數量分布'
    drawFunnel(tmp, _comment_distribution(comments_num))
    # Keep the 20 largest counts, in ascending order.
    top20 = dict(sorted(top20.items(), key=lambda item: item[1])[-20:])
    tmp = str_name + '相關圖書評論數量TOP20'
    drawBar(tmp, top20)

    # Word cloud of the book descriptions.
    stopwords = _load_stopwords('./stopwords.txt')
    texts = [value[2] for value in data.values()]
    words_dict = list(statistics(texts, stopwords).items())
    tmp = str_name + '相關圖書簡介詞云'
    drawWordCloud(words_dict, tmp, savepath='./results')


# Crawler results are shown below (original caption: 爬蟲結果展示:).
總結
以上是生活随笔為你收集整理的爬虫及数据分析--当当网的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 2022年数维杯国际大学生数学建模挑战赛
- 下一篇: 一沙一世界(10亿光年),科学的图文介绍