lgfj
數據來源于公司的 MongoDB 數據庫,由于公司保密需要,端口不能給出。大家看下吧。利用四個小區的歷史交易數據訓練模型,給出房價輸出。
# House-price regression, version 1.
#
# Pulls "sell" listings for four Pudong communities from MongoDB, encodes the
# categorical columns as integers, min-max scales every column, trains a
# fully connected Keras network on the first 95% of the rows and reports the
# percentage error of the predicted total price on the remaining 5%.
import pymongo
from pymongo import MongoClient
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from numpy import row_stack, column_stack
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

# The author redacted the connection details: the post shows the host as
# '192.168.xx.xx' and the port as "2xxxx", which is not a valid Python literal.
MONGO_HOST = '192.168.xx.xx'
MONGO_PORT = 27017  # placeholder (MongoDB's default port); set the real port

# Constant added to every column's (max - min) range when scaling.
# NOTE(review): presumably guards against a zero range - confirm.
RANGE_PAD = 0.2

client = MongoClient(MONGO_HOST, MONGO_PORT)
db = client.fangjia
seawater = db.seawater
seawater.find_one()  # result is discarded

query = {
    "city": "上海",
    "cat": "sell",
    "region": "浦東",
    "district_name": {
        "$in": ["康橋半島二期", "康橋半島五期", "綠洲清水灣", "中邦城市"],
    },
    "p_date": {"$gt": 20170508},
}
lt = seawater.count(query)
print(lt)

# NOTE(review): limit(lt - 1) leaves out the last matching document, exactly
# as the original did - confirm this is intended.
pos = list(seawater.find(query).limit(lt - 1))
data = DataFrame(pos)
data.to_excel('data.xls')

# Column order matters: column 0 is the target, columns 1-8 are the features.
choose_class = ['total_price', 'area', 'height', 'room', 'direction',
                'hall', 'toilet', 'fitment', 'district_name', 'p_date']
# Work on a copy so the encodings below never write into a view of `data`.
dc = data[choose_class].copy()
dc.to_excel('dc.xls')

# Community coordinates (lng, lat) noted by the author; unused in this version:
#   康橋半島二期  121.5886,   31.148452
#   康橋半島五期  121.589463, 31.139917
#   綠洲清水灣    121.586066, 31.154501
#   中邦城市      121.58401,  31.157145

# Integer label per community; values outside the mapping are left unchanged.
district_codes = {'康橋半島二期': 0, '康橋半島五期': 1,
                  '綠洲清水灣': 2, '中邦城市': 3}
dc['district_name'] = dc['district_name'].replace(district_codes)

# direction: 1 for the exact values '南' / '南北', 0 for everything else
# (missing values included).
dc['direction'] = dc['direction'].isin(['南', '南北']).astype(int)
# fitment: 1 for '精裝修' / '中裝修', 0 for everything else.
dc['fitment'] = dc['fitment'].isin(['精裝修', '中裝修']).astype(int)

# Fixed fill values for the remaining columns that can contain gaps.
fill_values = {'height': 18, 'room': 3, 'hall': 2, 'toilet': 2}
# Cast to float so that scaling and Keras receive numeric arrays.
frame = dc.fillna(fill_values).astype(float)

# NOTE(review): the first row is discarded, as in the original; the reason is
# not evident from this script - confirm.
data_train = frame.drop([0], axis=0)

data_max = data_train.max()
data_min = data_train.min()
# Min-max scaling with the padded range.
data_train1 = (data_train - data_min) / (data_max - data_min + RANGE_PAD)

# Sequential split: first 95% of the rows for training, last 5% for testing.
knife = int(0.95 * data_train.shape[0])
x_train = data_train1.iloc[:knife, 1:9].values  # features
y_train = data_train1.iloc[:knife, 0:1].values  # target (scaled total price)

# Only the first layer needs input_dim; later layers infer their input size.
model = Sequential()
model.add(Dense(48, input_dim=8))   # input -> hidden
model.add(Activation('tanh'))
model.add(Dense(100))               # hidden -> hidden
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(36))
model.add(Activation('relu'))
model.add(Dense(12))
model.add(Activation('relu'))
model.add(Dense(12))
model.add(Activation('relu'))
model.add(Dense(1))                 # hidden -> output
model.add(Activation('tanh'))

# Mean squared error loss, Adam optimizer.
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, nb_epoch=300, batch_size=5)
model.save_weights('net.model')  # save the trained weights

x_test = data_train1.iloc[knife:, 1:9].values  # features
y_test = data_train1.iloc[knife:, 0:1].values  # target (scaled)

# Undo the scaling of the target column (column 0) only.
price_min = data_min.iloc[0]
price_range = data_max.iloc[0] - price_min + RANGE_PAD
predict = model.predict(x_test) * price_range + price_min

realvalue = data_train.values[knife:, 0:1]
error = abs((predict - realvalue) / realvalue) * 100  # percentage error
geek = column_stack((predict, realvalue, error))
DataFrame(geek).to_excel('geek.xls')
print(geek)
print('平均計算誤差:', '%.2f' % error.mean(), '%')

# Author's note introducing the next version: the output is now the
# community's average price, and the listing date has been turned into a
# number. Records are drawn at random from the data set for validation (the
# training set does not contain them). The author reports very good results,
# with an error of almost 0, and claims that on this point the neural network
# beats classical machine-learning algorithms and xgboost.
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 24 15:14:07 2017

@author: Administrator

House-price regression, version 2: predicts the average price
(total_price / area) for four Songjiang communities, uses the listing date as
a numeric feature, and evaluates on a random 5% hold-out that is excluded
from training.
"""
import random

import pymongo
from pymongo import MongoClient
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from numpy import row_stack, column_stack
from dateutil.parser import parse
from matplotlib.pylab import date2num
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

# The author redacted the connection details: the post shows the host as
# '192.168.xx.xx' and the port as "2xxxx", which is not a valid Python literal.
MONGO_HOST = '192.168.xx.xx'
MONGO_PORT = 27017  # placeholder (MongoDB's default port); set the real port

# Constant added to every column's (max - min) range when scaling.
# NOTE(review): presumably guards against a zero range - confirm.
RANGE_PAD = 0.2

# Load the data from the company database.
client = MongoClient(MONGO_HOST, MONGO_PORT)
db = client.fangjia
seawater = db.seawater
seawater.find_one()  # result is discarded

# The last district name was mis-encoded in the post; '上海康城' is the value
# the encoding step below matches against.
query = {
    "city": "上海",
    "cat": "sell",
    "region": "松江",
    "district_name": {
        "$in": ["綠洲比華利花園", "沿海麗水馨庭", "雅仕軒", "上海康城"],
    },
    "p_date": {"$gt": 20170508},
}
lt = seawater.count(query)
print(lt)

# Collect the documents (dicts) into a list.
# NOTE(review): limit(lt - 1) leaves out the last matching document, exactly
# as the original did - confirm this is intended.
pos = list(seawater.find(query).limit(lt - 1))
data = DataFrame(pos)
data.to_excel('data.xls')

# Column order matters: column 0 is the target, columns 1-9 are the features.
choose_class = ['total_price', 'area', 'height', 'room', 'direction',
                'hall', 'toilet', 'fitment', 'district_name', 'p_date']
# Work on a copy so the edits below never write into a view of `data`.
dc = data[choose_class].copy()

# Replace 'total_price' with the average price (total price / area).
dc['total_price'] = dc['total_price'] / dc['area']

# Turn the yyyymmdd date into a continuous number (matplotlib date number).
# The author notes that whether this helps remains to be seen. The original
# loop started at index 1 and left row 0 unconverted; every row is converted
# here.
dc['p_date'] = dc['p_date'].apply(lambda d: date2num(parse(str(int(d)))))
dc.to_excel('dc.xls')

# Integer label per community; values outside the mapping are left unchanged.
district_codes = {'綠洲比華利花園': 0, '沿海麗水馨庭': 1,
                  '雅仕軒': 2, '上海康城': 3}
dc['district_name'] = dc['district_name'].replace(district_codes)

# direction: 1 for the exact values '南' / '南北', 0 for everything else
# (missing values included).
dc['direction'] = dc['direction'].isin(['南', '南北']).astype(int)
# fitment: 1 for '精裝修' / '中裝修', 0 for everything else.
dc['fitment'] = dc['fitment'].isin(['精裝修', '中裝修']).astype(int)

# Fixed fill values for the remaining columns that can contain gaps.
fill_values = {'height': 18, 'room': 3, 'hall': 2, 'toilet': 2}
# Cast to float so that scaling and Keras receive numeric arrays.
frame = dc.fillna(fill_values).astype(float)

# NOTE(review): the first row is discarded, as in the original - confirm it
# is still wanted now that row 0 has a converted date.
data_all = frame.drop([0], axis=0)

sample_number = data_all.shape[0]
kk = int(0.05 * sample_number)  # size of the random hold-out (5%)
# Draw distinct labels that really exist in the index. The original used
# randint(0, sample_number), which could return the dropped label 0 or the
# same label more than once.
test_label = random.sample(list(data_all.index), kk)
data_train = data_all.drop(test_label, axis=0)

# Scaling statistics come from the training rows only.
data_max = data_train.max()
data_min = data_train.min()
data_train1 = (data_train - data_min) / (data_max - data_min + RANGE_PAD)

x_train = data_train1.iloc[:, 1:10].values  # features
y_train = data_train1.iloc[:, 0:1].values   # target (scaled average price)

# Only the first layer needs input_dim; later layers infer their input size.
model = Sequential()
model.add(Dense(48, input_dim=9))   # input -> hidden
model.add(Activation('tanh'))
model.add(Dense(100))               # hidden -> hidden
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dropout(0.2))
model.add(Dense(36))
model.add(Activation('relu'))
model.add(Dense(12))
model.add(Activation('relu'))
model.add(Dense(12))
model.add(Activation('relu'))
model.add(Dense(1))                 # hidden -> output
model.add(Activation('tanh'))

# Mean squared error loss, Adam optimizer.
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, nb_epoch=200, batch_size=3)
model.save_weights('net.model')  # save the trained weights

# Scale the hold-out rows with the training statistics.
test = data_all.loc[test_label, :]
data_test = (test - data_min) / (data_max - data_min + RANGE_PAD)
x_test = data_test.iloc[:, 1:10].values
y_test = data_test.iloc[:, 0:1].values

# Undo the scaling of the target column (column 0) only.
price_min = data_min.iloc[0]
price_range = data_max.iloc[0] - price_min + RANGE_PAD
predict = model.predict(x_test) * price_range + price_min

realvalue = test.iloc[:, 0:1].values
error = abs((predict - realvalue) / realvalue) * 100  # percentage error
geek = column_stack((predict, realvalue, error))
DataFrame(geek).to_excel('geek.xls')
print(geek)
print('平均計算誤差:', '%.2f' % error.mean(), '%')

# Heading of the next version in the original post:
# "Mean-value filling and taking longitude/latitude into account, 2017-08-30".
# -*- coding: utf-8 -*-
"""
Created on Thu Aug 24 15:14:07 2017

@author: Administrator

House-price regression, version 3: same pipeline as before for four Pudong
communities, but gaps are filled with column means and each listing carries
the longitude/latitude of its community instead of a community label.
"""
import random

import pymongo
from pymongo import MongoClient
import numpy as np
import pandas as pd
from pandas import DataFrame, Series
from numpy import row_stack, column_stack
from dateutil.parser import parse
from matplotlib.pylab import date2num
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Activation

# The author redacted both ports: the post shows "xxx" and "2xxxx", neither
# of which is usable Python.
COORD_HOST = '192.168.0.136'
COORD_PORT = 27017  # placeholder (MongoDB's default port); set the real port
SALES_HOST = '192.168.10.88'
SALES_PORT = 27017  # placeholder (MongoDB's default port); set the real port

# Constant added to every column's (max - min) range when scaling.
# NOTE(review): presumably guards against a zero range - confirm.
RANGE_PAD = 0.2

DISTRICTS = ["康橋半島二期", "康橋半島三期", "綠洲清水灣", "中邦城市"]

# --- Longitude / latitude of each community ---------------------------------
client1 = MongoClient(COORD_HOST, COORD_PORT)
db1 = client1.fangjia
seaweed1 = db1.seaweed

query1 = {"status": 0, "cat": "district", "city": "上海", "region": "浦東",
          "name": {"$in": DISTRICTS}}
fields1 = {"lat2": 1, "lng2": 1, "city": 1, "region": 1, "cat": 1, "name": 1}

# The original iterated over the undefined name `seaweed`; the collection
# handle is `seaweed1`.
lct = list(seaweed1.find(query1, fields1))
lf = DataFrame(lct)
lf.index = lf['name']
lr = lf[['lng2', 'lat2']]
# Keep one coordinate row per community so the lookup below is unambiguous.
lr = lr[~lr.index.duplicated(keep='first')]

# --- Listings ---------------------------------------------------------------
client = MongoClient(SALES_HOST, SALES_PORT)
db = client.fangjia
seawater = db.seawater
seawater.find_one()  # result is discarded

query = {
    "city": "上海",
    "cat": "sell",
    "region": "浦東",
    "district_name": {"$in": DISTRICTS},
    "p_date": {"$gt": 20160508},
}
lt = seawater.count(query)
print(lt)

# Collect the documents (dicts) into a list.
# NOTE(review): limit(lt - 1) leaves out the last matching document, exactly
# as the original did - confirm this is intended.
pos = list(seawater.find(query).limit(lt - 1))
data = DataFrame(pos)
data.to_excel('data.xls')

# Columns to extract; column 0 is the target.
choose_class = ['total_price', 'area', 'height', 'room', 'direction',
                'hall', 'toilet', 'fitment', 'district_name', 'p_date']
# Work on a copy so the edits below never write into a view of `data`.
dc = data[choose_class].copy()

# Attach each community's coordinates. Listings whose community has no
# coordinate row keep the original defaults (lng2 = 0, lat2 = 1).
dc['lng2'] = dc['district_name'].map(lr['lng2']).fillna(0)
dc['lat2'] = dc['district_name'].map(lr['lat2']).fillna(1)

# Replace 'total_price' with the average price (total price / area).
dc['total_price'] = dc['total_price'] / dc['area']

# Turn the yyyymmdd date into a continuous number (matplotlib date number).
# The author notes that whether this helps remains to be seen. The original
# loop started at index 1 and left row 0 unconverted; every row is converted
# here.
dc['p_date'] = dc['p_date'].apply(lambda d: date2num(parse(str(int(d)))))
dc.to_excel('dc.xls')

# direction: 0 when the text contains '南', otherwise 1 (missing included).
faces_south = dc['direction'].astype(str).str.contains('南')
dc['direction'] = (~faces_south).astype(int)

# fitment: 0 when the text contains '豪' or '精', otherwise 1. The original
# condition ('豪' or '精') in s only ever tested '豪'.
well_fitted = dc['fitment'].astype(str).str.contains('豪|精')
dc['fitment'] = (~well_fitted).astype(int)

# Fill the gaps of these columns with their own column mean.
mean_filled = ['height', 'room', 'toilet', 'hall']
dc = dc.fillna({col: dc[col].mean() for col in mean_filled})

# The community label is replaced by the coordinates, so drop it.
# Cast to float so that scaling and Keras receive numeric arrays.
ds = dc.drop('district_name', axis=1).astype(float)

# NOTE(review): the first row is discarded, as in the original - confirm it
# is still wanted now that row 0 has a converted date.
data_all = ds.drop([0], axis=0)

sample_number = data_all.shape[0]
kk = int(0.05 * sample_number)  # size of the random hold-out (5%)
# Draw distinct labels from the real index; randint could repeat a label.
test_label = random.sample(list(data_all.index), kk)
data_train = data_all.drop(test_label, axis=0)

# Scaling statistics come from the training rows only.
data_max = data_train.max()
data_min = data_train.min()
data_train1 = (data_train - data_min) / (data_max - data_min + RANGE_PAD)

x_train = data_train1.iloc[:, 1:11].values  # features
y_train = data_train1.iloc[:, 0:1].values   # target (scaled average price)

# Only the first layer needs input_dim; later layers infer their input size.
model = Sequential()
model.add(Dense(48, input_dim=10))  # input -> hidden
model.add(Activation('relu'))
model.add(Dense(100))               # hidden -> hidden
model.add(Activation('relu'))
model.add(Dense(50))
model.add(Activation('relu'))
model.add(Dense(36))
model.add(Activation('relu'))
model.add(Dense(12))
model.add(Activation('relu'))
model.add(Dense(12))
model.add(Activation('relu'))
model.add(Dense(1))                 # hidden -> output
model.add(Activation('sigmoid'))

# Mean squared error loss, Adam optimizer.
model.compile(loss='mean_squared_error', optimizer='adam')
model.fit(x_train, y_train, nb_epoch=300, batch_size=2)
model.save_weights('net.model')  # save the trained weights

# Scale the hold-out rows with the training statistics.
test = data_all.loc[test_label, :]
data_test = (test - data_min) / (data_max - data_min + RANGE_PAD)
x_test = data_test.iloc[:, 1:11].values
y_test = data_test.iloc[:, 0:1].values

# Undo the scaling of the target column (column 0) only.
price_min = data_min.iloc[0]
price_range = data_max.iloc[0] - price_min + RANGE_PAD
predict = model.predict(x_test) * price_range + price_min

realvalue = test.iloc[:, 0:1].values
error = abs((predict - realvalue) / realvalue) * 100  # percentage error
geek = column_stack((predict, realvalue, error))
DataFrame(geek).to_excel('geek.xls')
print(geek)
print('平均計算誤差:', '%.2f' % error.mean(), '%')

# The original post ends here with a "Summary" heading (no further text).
- 上一篇: Python 时间
- 下一篇: 如何在DataFrame索引某一行