python实现北京租房信息计算
案例說明
在本次案例中,我們將通過 Python 爬蟲技術獲取某網數萬條北京租房數據,用北京部分城區真實房價分析真實的房租情況:數據獲取、數據清洗預覽、數據分析可視化,一起了解最近房租的狀況。
本次實驗使用的 Python 第三方庫如下。
實驗步驟
Step1:安裝并引入必要的庫
#1、導入相關庫 import numpy as np import pandas as pd import requests import time import re import seaborn as sns import statsmodels.api as sm import statsmodels.formula.api as smf import matplotlib.pyplot as plt from bs4 import BeautifulSoup? ? ? ? ? ? ??Step2:數據爬取——獲取一個區域的租房鏈接地址
# 2. Fetch the list of district names and their rental-listing URLs.
# Browser-like User-Agent so lianjia.com does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}


def get_areas(url):
    """Scrape the district filter list from the rental index page.

    Parameters:
        url: the city's rental index page, e.g. 'https://bj.lianjia.com/zufang'.

    Returns:
        (areas, areas_links): district display texts and their absolute
        listing URLs.  On any error, prints the exception and returns
        ([], []) so the caller's tuple unpacking still works.
    """
    try:
        print('start grabing areas')
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        # 'filter__item--level2' marks the second-level district filters.
        all_sights = soup.find_all('li', 'filter__item--level2')
        areas = []
        areas_links = []
        for item in all_sights:
            # Skip the "no filter" (不限) entry.
            if not item.get_text() == '\n不限\n':
                areas.append(item.get_text())
                areas_links.append('https://bj.lianjia.com' + item.find('a').get('href'))
        return areas, areas_links
    except Exception as e:
        print('爬取網站出現了一點問題,問題如下:')
        print(e)
        # BUG FIX: was `return ''`, which broke the caller's
        # `areas, area_link = get_areas(url)` two-value unpacking.
        return [], []
# 3. Crawl every listing page of one district.
def get_pages(area, area_link):
    """Collect raw listing rows from all pages of one district.

    Parameters:
        area: district display name (passed through to get_house_info).
        area_link: absolute URL of the district's listing index.

    Returns:
        A list of parsed listing records (one list of text fields each).
    """
    print("開始抓取頁面")
    response = requests.get(area_link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    # The pager widget exposes the total page count as a data attribute.
    pages = int(soup.find_all('div', 'content__pg')[0].get('data-totalpage'))
    print("這個區域有" + str(pages) + "頁")
    info = []
    for page in range(1, pages + 1):
        # BUG FIX: the page URL was hard-coded to the dongcheng district
        # ('https://bj.lianjia.com/zufang/dongcheng/pg...'), ignoring
        # area_link entirely; build it from the requested district instead.
        url = area_link.rstrip('/') + '/pg' + str(page)
        print("\r開始抓取%s頁的信息, 已爬取%s條數據" % (str(page), len(info)), end='')
        info += get_house_info(area, url)
    return info
# 4. Parse the listing cards on a single page.
def get_house_info(area, url, info=None):
    """Parse one listing page into a list of text-field rows.

    Each row is a listing card's text with spaces removed, split on
    newlines, and with empty strings and '/' separators dropped.

    Parameters:
        area: district name (currently unused inside; kept for interface
              compatibility with existing callers).
        url: the listing page URL to fetch.
        info: optional accumulator list; a fresh list is created when None.

    Returns:
        The accumulator list with this page's rows appended.
    """
    # BUG FIX: `info=[]` was a mutable default argument, so the same list
    # was shared across calls — every page's rows accumulated into it and
    # the caller's `info += get_house_info(...)` duplicated earlier rows.
    if info is None:
        info = []
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        all_text = soup.find_all('div', 'content__list--item')
        for item in all_text:
            fields = item.get_text().replace(' ', '').split('\n')
            while "" in fields:
                fields.remove("")
            while "/" in fields:
                fields.remove("/")
            info.append(fields)
        return info
    except Exception as e:
        print(e)
        time.sleep(2)
        # NOTE(review): retries recursively with no depth limit if the URL
        # keeps failing — original behavior, kept as-is.
        return get_house_info(area, url)
#5、開始爬蟲任務 info = [] url = 'https://bj.lianjia.com/zufang' areas, area_link = get_areas(url) info = get_pages(areas[1], area_link[1])????????Step6:數據清洗與數據保存
# 6. Simple data cleaning and persistence to CSV.
def keys(info, key=''):
    """Return one flag per row: `key` when the row contains it, else ''."""
    return [key if key in row else '' for row in info]


def clean_data(info, key=''):
    """Arrange raw listing rows into a labelled DataFrame.

    Positional fields (title, address, ...) are taken by index from each
    row; tag columns (subway, decorate, ...) are flagged via keys().
    """
    columns = {
        'title': [row[0] for row in info],
        'address': [row[1] for row in info],
        'area': [row[2] for row in info],
        'toward': [row[3] for row in info],
        'style': [row[4] for row in info],
        'floor': [row[5] for row in info],
        'source': [row[6] for row in info],
        'time': [row[7] for row in info],
        'price': [row[-1] for row in info],
        'subway': keys(info, '近地鐵'),
        'decorate': keys(info, '精裝'),
        'heating': keys(info, '集中供暖'),
        'new_room': keys(info, '新上'),
        'time_for_look': keys(info, '隨時看房'),
    }
    return pd.DataFrame(columns)


data = clean_data(info)
data.to_csv('data.csv', index=True)
# 7. Load the scraped CSV and derive numeric feature columns.
data = pd.read_csv('data.csv')

# Sitting rooms: the character right before '廳' in the style string.
data['sitting_room_value'] = data['style'].apply(lambda x: x.split('廳')[0][-1])
# Styles with no '廳' leave the '衛' marker here; count those as 0.
data['sitting_room_value'] = data['sitting_room_value'].replace('衛', 0)

# Bedrooms: first character; bathrooms: character before the trailing '衛'.
data['bedroom_value'] = data['style'].apply(lambda x: x[0])
data['bathroom_value'] = data['style'].apply(lambda x: x[-2])

# Strip unit suffixes.  Assumes price ends in a 3-char unit (e.g. '元/月')
# and area in a 1-char unit — TODO confirm against the scraped data.
data['price_value'] = data['price'].apply(lambda x: x[:-3])
data['area_value'] = data['area'].apply(lambda x: x[:-1])
data['floor_value'] = data['floor'].apply(lambda x: x.split('(')[-1][0])


def toward(x, key=''):
    """Return `key` if the orientation string contains it, else 0."""
    return key if key in x else 0


# One indicator column per compass direction: 1 if present, else 0.
data['north'] = data['toward'].apply(lambda x: toward(x, '北')).replace('北', 1)
data['south'] = data['toward'].apply(lambda x: toward(x, '南')).replace('南', 1)
data['east'] = data['toward'].apply(lambda x: toward(x, '東')).replace('東', 1)
data['west'] = data['toward'].apply(lambda x: toward(x, '西')).replace('西', 1)

# Collect the numeric feature matrix.
values_data = data[['sitting_room_value', 'bedroom_value', 'bathroom_value',
                    'price_value', 'area_value', 'floor_value',
                    'north', 'south', 'east', 'west']].astype(float)

# Descriptive statistics.
values_data.describe()
#8、使用seaborn進行簡單數據可視化 sns.displot(values_data['price_value'], kde=True) sns.jointplot(x='area_value', y='price_value', data=values_data) sns.pairplot(values_data)? ? ? ? Step8:分析價格分布
#9、分位數回歸 mod = smf.quantreg('price_value ~ area_value', values_data) res = mod.fit(q=.5)quantiles = np.arange(.05, .96, .1) def fit_model(q):res = mod.fit(q=q)return [q, res.params['Intercept'], res.params['area_value']] + \res.conf_int().loc['area_value'].tolist()models = [fit_model(x) for x in quantiles] models = pd.DataFrame(models, columns=['q', 'a', 'b', 'lb', 'ub'])ols = smf.ols('price_value ~ area_value', values_data).fit() ols_ci = ols.conf_int().loc['area_value'].tolist() ols = dict(a = ols.params['Intercept'],b = ols.params['area_value'],lb = ols_ci[0],ub = ols_ci[1])print(models) print(ols)x = np.arange(values_data.area_value.min(), values_data.area_value.max(), 50) get_y = lambda a, b: a + b * xfig, ax = plt.subplots(figsize=(8, 6))for i in range(models.shape[0]):y = get_y(models.a[i], models.b[i])ax.plot(x, y, linestyle='dotted', color='grey')y = get_y(ols['a'], ols['b'])ax.plot(x, y, color='red', label='OLS') ax.scatter(values_data.area_value, values_data.price_value, alpha=.2)legend = ax.legend() ax.set_xlabel('Area', fontsize=16) ax.set_ylabel('Price', fontsize=16);plt.show()實現效果:
關于所租房屋的面積與價格之間的關係:
面積與價格之間的回歸分析:
根據不同方位,不同價格,不同戶型進行分析,形成散點圖(豎軸從下依次往上分別是:東、西、南、北、樓層數、面積數、出租價格、浴室數、臥室數、客廳數;橫軸依次是:客廳數、臥室數、浴室數、租金價格、面積數、樓層數、北、南、西、東):
實驗總結
????????本實驗只總結了爬取網絡數據的一般步驟,在實驗的數據分析階段,本實驗列舉了部分可用的方法,可以使用機器學習、深度學習、計量經濟學方法的模型,也可以使用其他的數據可視化方法,對數據進行可視化分析
總代碼:
"""Scrape Beijing rental listings from lianjia.com and analyse rent vs. area.

Pipeline: fetch district links -> crawl one district's listing pages ->
clean rows into a DataFrame saved as data.csv -> derive numeric features ->
visualise and run quantile/OLS regressions of price on area.
"""

# 1. Imports.
import numpy as np
import pandas as pd
import requests
import time
import re
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
from bs4 import BeautifulSoup

# 2. Fetch the list of district names and their rental-listing URLs.
# Browser-like User-Agent so lianjia.com does not reject the request.
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                         '(KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36'}


def get_areas(url):
    """Scrape the district filter list from the rental index page.

    Returns (areas, areas_links): district display texts and their
    absolute listing URLs.  On any error, prints the exception and
    returns ([], []) so the caller's tuple unpacking still works.
    """
    try:
        print('start grabing areas')
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        all_sights = soup.find_all('li', 'filter__item--level2')
        areas = []
        areas_links = []
        for item in all_sights:
            # Skip the "no filter" (不限) entry.
            if not item.get_text() == '\n不限\n':
                areas.append(item.get_text())
                areas_links.append('https://bj.lianjia.com' + item.find('a').get('href'))
        return areas, areas_links
    except Exception as e:
        print('爬取網站出現了一點問題,問題如下:')
        print(e)
        # BUG FIX: was `return ''`, which broke two-value unpacking.
        return [], []


# 3. Crawl every listing page of one district.
def get_pages(area, area_link):
    """Collect raw listing rows from all pages of one district."""
    print("開始抓取頁面")
    response = requests.get(area_link, headers=headers)
    soup = BeautifulSoup(response.content, 'html.parser')
    # The pager widget exposes the total page count as a data attribute.
    pages = int(soup.find_all('div', 'content__pg')[0].get('data-totalpage'))
    print("這個區域有" + str(pages) + "頁")
    info = []
    for page in range(1, pages + 1):
        # BUG FIX: the URL was hard-coded to the dongcheng district,
        # ignoring area_link; build it from the requested district instead.
        url = area_link.rstrip('/') + '/pg' + str(page)
        print("\r開始抓取%s頁的信息, 已爬取%s條數據" % (str(page), len(info)), end='')
        info += get_house_info(area, url)
    return info


# 4. Parse the listing cards on a single page.
def get_house_info(area, url, info=None):
    """Parse one listing page into a list of text-field rows.

    BUG FIX: the original `info=[]` mutable default was shared across
    calls, duplicating earlier rows into every later page's result.
    """
    if info is None:
        info = []
    try:
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        all_text = soup.find_all('div', 'content__list--item')
        for item in all_text:
            fields = item.get_text().replace(' ', '').split('\n')
            while "" in fields:
                fields.remove("")
            while "/" in fields:
                fields.remove("/")
            info.append(fields)
        return info
    except Exception as e:
        print(e)
        time.sleep(2)
        # NOTE(review): unbounded recursive retry — original behavior.
        return get_house_info(area, url)


# 5. Start the crawl task: fetch district links, then scrape district #1.
info = []
url = 'https://bj.lianjia.com/zufang'
areas, area_link = get_areas(url)
info = get_pages(areas[1], area_link[1])


# 6. Simple data cleaning and persistence to CSV.
def keys(info, key=''):
    """Return one flag per row: `key` when the row contains it, else ''."""
    return [key if key in row else '' for row in info]


def clean_data(info, key=''):
    """Arrange raw listing rows into a labelled DataFrame."""
    return pd.DataFrame({
        'title': [row[0] for row in info],
        'address': [row[1] for row in info],
        'area': [row[2] for row in info],
        'toward': [row[3] for row in info],
        'style': [row[4] for row in info],
        'floor': [row[5] for row in info],
        'source': [row[6] for row in info],
        'time': [row[7] for row in info],
        'price': [row[-1] for row in info],
        'subway': keys(info, '近地鐵'),
        'decorate': keys(info, '精裝'),
        'heating': keys(info, '集中供暖'),
        'new_room': keys(info, '新上'),
        'time_for_look': keys(info, '隨時看房'),
    })


data = clean_data(info)
data.to_csv('data.csv', index=True)

# 7. Load the scraped CSV and derive numeric feature columns.
data = pd.read_csv('data.csv')

# Sitting rooms: the character right before '廳'; styles without '廳'
# leave the '衛' marker, counted as 0.
data['sitting_room_value'] = data['style'].apply(lambda x: x.split('廳')[0][-1])
data['sitting_room_value'] = data['sitting_room_value'].replace('衛', 0)

# Bedrooms: first character; bathrooms: character before the trailing '衛'.
data['bedroom_value'] = data['style'].apply(lambda x: x[0])
data['bathroom_value'] = data['style'].apply(lambda x: x[-2])

# Strip unit suffixes (assumes price ends with a 3-char unit such as
# '元/月', area with a 1-char unit — TODO confirm against scraped data).
data['price_value'] = data['price'].apply(lambda x: x[:-3])
data['area_value'] = data['area'].apply(lambda x: x[:-1])
data['floor_value'] = data['floor'].apply(lambda x: x.split('(')[-1][0])


def toward(x, key=''):
    """Return `key` if the orientation string contains it, else 0."""
    return key if key in x else 0


# One indicator column per compass direction: 1 if present, else 0.
data['north'] = data['toward'].apply(lambda x: toward(x, '北')).replace('北', 1)
data['south'] = data['toward'].apply(lambda x: toward(x, '南')).replace('南', 1)
data['east'] = data['toward'].apply(lambda x: toward(x, '東')).replace('東', 1)
data['west'] = data['toward'].apply(lambda x: toward(x, '西')).replace('西', 1)

# Collect the numeric feature matrix and describe it.
values_data = data[['sitting_room_value', 'bedroom_value', 'bathroom_value',
                    'price_value', 'area_value', 'floor_value',
                    'north', 'south', 'east', 'west']].astype(float)
values_data.describe()

# 8. Quick visual exploration with seaborn.
sns.displot(values_data['price_value'], kde=True)
sns.jointplot(x='area_value', y='price_value', data=values_data)
sns.pairplot(values_data)

# 9. Quantile regression of price on area, compared with an OLS baseline.
mod = smf.quantreg('price_value ~ area_value', values_data)
res = mod.fit(q=.5)

quantiles = np.arange(.05, .96, .1)


def fit_model(q):
    # Returns [q, intercept, slope, slope_ci_low, slope_ci_high].
    res = mod.fit(q=q)
    return [q, res.params['Intercept'], res.params['area_value']] + \
        res.conf_int().loc['area_value'].tolist()


models = [fit_model(x) for x in quantiles]
models = pd.DataFrame(models, columns=['q', 'a', 'b', 'lb', 'ub'])

ols = smf.ols('price_value ~ area_value', values_data).fit()
ols_ci = ols.conf_int().loc['area_value'].tolist()
ols = dict(a=ols.params['Intercept'], b=ols.params['area_value'],
           lb=ols_ci[0], ub=ols_ci[1])

print(models)
print(ols)

# Plot each quantile fit (dotted grey) plus the OLS fit (red).
# NOTE(review): np.arange with step=50 yields very few x points over
# typical area ranges — np.linspace may have been intended; left as written.
x = np.arange(values_data.area_value.min(), values_data.area_value.max(), 50)
get_y = lambda a, b: a + b * x

fig, ax = plt.subplots(figsize=(8, 6))
for i in range(models.shape[0]):
    y = get_y(models.a[i], models.b[i])
    ax.plot(x, y, linestyle='dotted', color='grey')
y = get_y(ols['a'], ols['b'])
ax.plot(x, y, color='red', label='OLS')
ax.scatter(values_data.area_value, values_data.price_value, alpha=.2)
legend = ax.legend()
ax.set_xlabel('Area', fontsize=16)
ax.set_ylabel('Price', fontsize=16)
plt.show()
以上是生活随笔為你收集整理的python实现北京租房信息计算的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 某同步总线的时钟频率为100MHz,宽度
- 下一篇: 设备管理器里“SM总线控制器”、“其他P