Part2-Chapter8-预测乐高玩具套装价格
生活随笔
收集整理的這篇文章主要介紹了
Part2-Chapter8-预测乐高玩具套装价格
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
目標是爬取 eBay 上的二手樂高數據,並使用嶺回歸交叉驗證的方式給出回歸方程。
from bs4 import BeautifulSoup
import numpy as np
import random


def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Parse one saved eBay results page and collect its sold listings.

    The page is walked through its ``<table r="1">``, ``<table r="2">``, ...
    elements until a number is missing.  For every listing that is marked as
    sold and whose price exceeds half of ``origPrc``, one feature row is
    appended to ``retX`` and the selling price to ``retY``.  Both lists are
    modified in place; nothing is returned.

    Args:
        retX: List receiving ``[yr, numPce, newFlag, origPrc]`` rows.
        retY: List receiving the matching selling prices as floats.
        inFile: Path of the saved HTML page (read as UTF-8).
        yr: Release year of the set.
        numPce: Number of pieces in the set.
        origPrc: Original retail price of the set.
    """
    with open(inFile, encoding='utf-8') as page:
        html = page.read()
    soup = BeautifulSoup(html)

    rowNum = 1
    while True:
        # Listings are tables carrying a running ``r`` attribute.
        tables = soup.find_all('table', r="%d" % rowNum)
        if len(tables) == 0:
            break
        listing = tables[0]

        lwrTitle = listing.find_all('a')[1].text.lower()
        # A title containing "new" or "nisb" flags the set as brand new.
        if 'new' in lwrTitle or 'nisb' in lwrTitle:
            newFlag = 1.0
        else:
            newFlag = 0.0

        # Only sold listings are collected; a sold listing has a <span>
        # inside its fourth cell.
        cells = listing.find_all('td')
        soldMarkers = cells[3].find_all('span')
        if len(soldMarkers) == 0:
            print("商品 #%d 沒有出售" % rowNum)
        else:
            priceCell = cells[4]
            priceStr = priceCell.text.replace('$', '').replace(',', '')
            if len(priceCell) > 1:
                priceStr = priceStr.replace('Free shipping', '')
            sellingPrice = float(priceStr)
            # Prices at or below half of retail are treated as incomplete
            # sets and dropped.
            if sellingPrice > origPrc * 0.5:
                print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        rowNum += 1


# Scrape the data from each saved page.
def setDataCollect(retX, retY):
    """Scrape the six saved LEGO set pages into ``retX`` and ``retY``.

    Each page is passed to ``scrapePage`` together with the release year,
    piece count and original price of the set it describes.

    Args:
        retX: List that receives the feature rows (modified in place).
        retY: List that receives the selling prices (modified in place).
    """
    # (saved page, release year, piece count, original price)
    pages = (
        ('lego8288.html', 2006, 800, 49.99),
        ('lego10030.html', 2002, 3096, 269.99),
        ('lego10179.html', 2007, 5195, 499.99),
        ('lego10181.html', 2007, 3428, 199.99),
        ('lego10189.html', 2008, 5922, 299.99),
        ('lego10196.html', 2009, 3263, 249.99),
    )
    for inFile, yr, numPce, origPrc in pages:
        scrapePage(retX, retY, inFile, yr, numPce, origPrc)


# Standardization.
def regularize(xMat, yMat):
    """Standardize the features and center the targets.

    Every feature column has its mean subtracted and is then divided by its
    variance (not its standard deviation); the targets only have their mean
    subtracted.  The inputs are left unmodified.

    Args:
        xMat: 2-D array or matrix of samples (rows) by features (columns).
        yMat: Array or matrix of target values, one row per sample.

    Returns:
        tuple: ``(inxMat, inyMat)``, the standardized features and the
        centered targets.
    """
    inyMat = yMat - np.mean(yMat, 0)
    inMeans = np.mean(xMat, 0)
    inVar = np.var(xMat, 0)
    print(inMeans)
    inxMat = (xMat - inMeans) / inVar
    # The results used to be computed and then discarded, so callers always
    # received None.
    return inxMat, inyMat


# Compute the squared error.
def rssError(yArr, yHatArr):
    """Return the sum of squared differences between targets and predictions.

    Args:
        yArr: NumPy array of observed values.
        yHatArr: NumPy array of predicted values, broadcastable to ``yArr``.

    Returns:
        The residual sum of squares as a scalar.
    """
    residuals = yArr - yHatArr
    squared = residuals ** 2
    return squared.sum()


# Compute the regression coefficients W.
def standRegres(xArr, yArr):
    """Solve ordinary least squares through the normal equations.

    Computes ``w = (X.T * X)^-1 * X.T * y``.

    Args:
        xArr: 2-D array-like of samples (rows) by features (columns).
        yArr: 1-D array-like of target values, one per sample.

    Returns:
        The weights as an ``(n_features, 1)`` ``numpy.matrix``, or ``None``
        (after printing a message) when ``X.T * X`` has a zero determinant
        and cannot be inverted.
    """
    # np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:
        print("無法求逆")
        return None
    return xTx.I * (xMat.T * yMat)


# Ridge regression with cross-validation.
def crossValidation(xArr, yArr, numVal=10):
    """Choose a ridge penalty by repeated hold-out and print the model.

    The data is shuffled ``numVal`` times.  Each time roughly 90% of the
    samples train ``ridgeTest`` and the rest are scored with ``rssError``
    for each of the 30 penalty values.  The penalty with the lowest mean
    error is selected, its weights are mapped back to the original feature
    scale, and the resulting regression equation is printed.

    Args:
        xArr: List of feature rows ``[year, pieces, is_new, original_price]``.
        yArr: List of selling prices, one per feature row.
        numVal: Number of shuffle/split rounds.

    Returns:
        None.  The equation is written to standard output.
    """
    numLambdas = 30  # must equal the number of rows ridgeTest returns
    m = len(yArr)
    indexList = list(range(m))
    errorMat = np.zeros((numVal, numLambdas))

    for i in range(numVal):
        random.shuffle(indexList)
        trainX, trainY, testX, testY = [], [], [], []
        # The first ~90% of the shuffled indices train, the rest test.
        for j in range(m):
            if j < m * 0.9:
                trainX.append(xArr[indexList[j]])
                trainY.append(yArr[indexList[j]])
            else:
                testX.append(xArr[indexList[j]])
                testY.append(yArr[indexList[j]])
        wMat = ridgeTest(trainX, trainY)

        # Standardize the test rows with the training statistics.  None of
        # this depends on the penalty, so it is done once per round rather
        # than once per penalty value.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        varTrain = np.var(matTrainX, 0)
        matTestX = (np.asmatrix(testX) - meanTrain) / varTrain
        trainYMean = np.mean(trainY)
        testYArr = np.array(testY)

        for k in range(numLambdas):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + trainYMean
            errorMat[i, k] = rssError(yEst.T.A, testYArr)

    meanErrors = np.mean(errorMat, 0)
    minMean = float(min(meanErrors))
    # NOTE: wMat holds the weights fitted in the final round only.
    bestWeights = wMat[np.nonzero(meanErrors == minMean)]

    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr)
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    # Undo the standardization so the weights apply to the raw features.
    unReg = bestWeights / varX
    intercept = -1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat)
    print('%f%+f*年份%+f*部件數量%+f*是否全新%+f*原價' % (
        intercept, unReg[0, 0], unReg[0, 1], unReg[0, 2], unReg[0, 3]))


# Ridge regression test.
def ridgeRegres(xMat, yMat, lam=0.2):
    """Return the ridge-regression weights for a single penalty value.

    Computes ``w = (X.T * X + lam * I)^-1 * X.T * y``.

    Args:
        xMat: ``numpy.matrix`` of samples (rows) by features (columns).
        yMat: ``numpy.matrix`` column vector of targets.
        lam: Ridge penalty added to the diagonal of ``X.T * X``.

    Returns:
        The weights as an ``(n_features, 1)`` ``numpy.matrix``.

    Raises:
        numpy.linalg.LinAlgError: If the penalized matrix is singular.
    """
    xTx = xMat.T * xMat
    denom = xTx + np.eye(np.shape(xMat)[1]) * lam
    return denom.I * (xMat.T * yMat)


def ridgeTest(xArr, yArr):
    """Fit ridge regression for 30 exponentially spaced penalty values.

    The targets are centered and every feature column is centered and then
    divided by its variance before fitting.  The penalties are
    ``exp(i - 10)`` for ``i`` in ``0..29``.

    Args:
        xArr: 2-D array-like of samples (rows) by features (columns).
        yArr: 1-D array-like of target values, one per sample.

    Returns:
        ``numpy.ndarray`` of shape ``(30, n_features)``; row ``i`` holds the
        weights fitted with penalty ``exp(i - 10)``.
    """
    # np.asmatrix is used because the np.mat alias was removed in NumPy 2.0.
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    yMat = yMat - np.mean(yMat, axis=0)
    xMeans = np.mean(xMat, axis=0)
    xVar = np.var(xMat, axis=0)
    xMat = (xMat - xMeans) / xVar

    numTestPts = 30
    wMat = np.zeros((numTestPts, np.shape(xMat)[1]))
    for i in range(numTestPts):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = ws.T
    return wMat


if __name__ == "__main__":
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    crossValidation(lgX, lgY)
總結
以上是生活随笔為你收集整理的Part2-Chapter8-预测乐高玩具套装价格的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: Beta阶段团队项目开发篇章1
- 下一篇: 简易电商宝贝放大镜效果--jQuery