當前位置：首頁 > 编程资源 > 编程问答 >内容正文

编程问答

Part2-Chapter8-预测乐高玩具套装价格

發布時間：2024/3/26 编程问答 39 豆豆

生活随笔收集整理的這篇文章主要介紹了 Part2-Chapter8-预测乐高玩具套装价格，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

"""Predict LEGO set prices from saved eBay listing pages.

The goal is to scrape second-hand LEGO listings from locally saved eBay
result pages and to derive a regression equation for the selling price
using ridge regression with cross-validation.
"""
import random

import numpy as np

try:
    from bs4 import BeautifulSoup
except ImportError:
    # Only scrapePage needs bs4; the regression helpers work without it.
    BeautifulSoup = None

# Number of ridge penalties tried by ridgeTest (lambda = exp(i - 10)).
NUM_LAMBDAS = 30


def scrapePage(retX, retY, inFile, yr, numPce, origPrc):
    """Parse one saved listing page and append its sold items to retX/retY.

    Args:
        retX: list that receives one ``[yr, numPce, newFlag, origPrc]`` row
            per accepted listing (mutated in place).
        retY: list that receives the matching selling prices (mutated in
            place).
        inFile: path of the saved HTML page, read as UTF-8.
        yr: release year recorded for every listing on this page.
        numPce: piece count recorded for every listing on this page.
        origPrc: original retail price; listings that sold for no more than
            half of it are skipped as incomplete sets.

    Raises:
        ImportError: if the ``bs4`` package is not installed.
    """
    if BeautifulSoup is None:
        raise ImportError("scrapePage requires the 'bs4' (beautifulsoup4) package")

    with open(inFile, encoding='utf-8') as f:
        html = f.read()
    # Name the parser explicitly so the result does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(html, 'html.parser')

    # Listings are <table> elements carrying an attribute r="1", r="2", ...
    i = 1
    while True:
        currentRow = soup.find_all('table', r="%d" % i)
        if not currentRow:
            break
        row = currentRow[0]

        # A title mentioning "new" or "nisb" marks the set as brand new.
        lwrTitle = row.find_all('a')[1].text.lower()
        newFlag = 1.0 if ('new' in lwrTitle or 'nisb' in lwrTitle) else 0.0

        cells = row.find_all('td')
        # Only sold items are collected; a <span> in the 4th cell marks a sale.
        if not cells[3].find_all('span'):
            print("商品 #%d 沒有出售" % i)
        else:
            priceStr = cells[4].text
            for junk in ('$', ',', 'Free shipping'):
                priceStr = priceStr.replace(junk, '')
            sellingPrice = float(priceStr)
            # Drop prices that look like incomplete sets.
            if sellingPrice > origPrc * 0.5:
                print("%d\t%d\t%d\t%f\t%f" % (yr, numPce, newFlag, origPrc, sellingPrice))
                retX.append([yr, numPce, newFlag, origPrc])
                retY.append(sellingPrice)
        i += 1


def setDataCollect(retX, retY):
    """Scrape every saved LEGO listing page into retX/retY."""
    scrapePage(retX, retY, 'lego8288.html', 2006, 800, 49.99)
    scrapePage(retX, retY, 'lego10030.html', 2002, 3096, 269.99)
    scrapePage(retX, retY, 'lego10179.html', 2007, 5195, 499.99)
    scrapePage(retX, retY, 'lego10181.html', 2007, 3428, 199.99)
    scrapePage(retX, retY, 'lego10189.html', 2008, 5922, 299.99)
    scrapePage(retX, retY, 'lego10196.html', 2009, 3263, 249.99)


def regularize(xMat, yMat):
    """Return ``(xMat, yMat)`` standardised column-wise.

    ``yMat`` has its column mean subtracted; ``xMat`` has its column means
    subtracted and is then divided by the column variances (variance, not
    standard deviation, matching ridgeTest). The inputs are not modified.
    """
    inyMat = yMat - np.mean(yMat, 0)
    inMeans = np.mean(xMat, 0)
    inVar = np.var(xMat, 0)
    inxMat = (xMat - inMeans) / inVar
    return inxMat, inyMat


def rssError(yArr, yHatArr):
    """Return the residual sum of squares between yArr and yHatArr."""
    return ((yArr - yHatArr) ** 2).sum()


def standRegres(xArr, yArr):
    """Compute ordinary least-squares weights.

    Args:
        xArr: 2-D sequence of samples (rows) by features (columns).
        yArr: 1-D sequence of target values.

    Returns:
        The weights as an (n_features, 1) ``np.matrix``, or ``None`` (after
        printing a message) when ``X^T X`` has a zero determinant.
    """
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    xTx = xMat.T * xMat
    if np.linalg.det(xTx) == 0.0:
        print("無法求逆")
        return None
    return xTx.I * (xMat.T * yMat)


def ridgeRegres(xMat, yMat, lam=0.2):
    """Compute ridge-regression weights ``(X^T X + lam * I)^-1 X^T y``.

    Args:
        xMat: (n_samples, n_features) ``np.matrix`` of inputs.
        yMat: (n_samples, 1) ``np.matrix`` of targets.
        lam: ridge penalty added to the diagonal of ``X^T X``.

    Returns:
        The weights as an (n_features, 1) ``np.matrix``.

    Raises:
        numpy.linalg.LinAlgError: if the penalised matrix is singular.
    """
    denom = xMat.T * xMat + np.eye(np.shape(xMat)[1]) * lam
    if np.linalg.det(denom) == 0.0:
        raise np.linalg.LinAlgError("penalised X^T X is singular")
    return denom.I * (xMat.T * yMat)


def crossValidation(xArr, yArr, numVal=10):
    """Pick a ridge penalty by repeated 90/10 hold-out and print the model.

    For each of ``numVal`` rounds the samples are shuffled, roughly 90% are
    used to fit ridgeTest's NUM_LAMBDAS candidate models and the rest are
    used to score them. The penalty with the lowest mean error is selected
    and its weights, converted back to the original feature scale, are
    printed as a regression equation. The printed equation assumes exactly
    four features (year, piece count, new flag, original price).

    Args:
        xArr: list of feature rows.
        yArr: list of target values, same length as xArr.
        numVal: number of shuffle/split rounds.

    Raises:
        ValueError: if ``numVal`` is less than 1 or fewer than two samples
            are supplied.
    """
    m = len(yArr)
    if numVal < 1:
        raise ValueError("numVal must be at least 1")
    if m < 2:
        raise ValueError("crossValidation needs at least two samples")

    # Same split as ``j < m * 0.9``, but always hold out at least one sample.
    numTrain = min(int(np.ceil(m * 0.9)), m - 1)
    indexList = list(range(m))
    errorMat = np.zeros((numVal, NUM_LAMBDAS))

    for i in range(numVal):
        random.shuffle(indexList)
        trainIdx = indexList[:numTrain]
        testIdx = indexList[numTrain:]
        trainX = [xArr[j] for j in trainIdx]
        trainY = [yArr[j] for j in trainIdx]
        testX = [xArr[j] for j in testIdx]
        testY = np.array([yArr[j] for j in testIdx])

        wMat = ridgeTest(trainX, trainY)

        # Standardise the test set with the training statistics; this does
        # not depend on the penalty, so do it once per round.
        matTrainX = np.asmatrix(trainX)
        meanTrain = np.mean(matTrainX, 0)
        varTrain = np.var(matTrainX, 0)
        matTestX = (np.asmatrix(testX) - meanTrain) / varTrain
        yMeanTrain = np.mean(trainY)

        for k in range(NUM_LAMBDAS):
            yEst = matTestX * np.asmatrix(wMat[k, :]).T + yMeanTrain
            errorMat[i, k] = rssError(yEst.T.A, testY)

    # argmin yields exactly one row even when several penalties tie.
    meanErrors = np.mean(errorMat, 0)
    best = int(np.argmin(meanErrors))
    # NOTE(review): wMat holds the weights fitted in the last round only,
    # while the scaling below uses statistics of the full data set.
    bestWeights = wMat[best:best + 1, :]

    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr)
    meanX = np.mean(xMat, 0)
    varX = np.var(xMat, 0)
    # Undo the standardisation so the weights apply to raw feature values.
    unReg = bestWeights / varX
    intercept = -1 * np.sum(np.multiply(meanX, unReg)) + np.mean(yMat)
    print('%f%+f*年份%+f*部件數量%+f*是否全新%+f*原價'
          % (intercept, unReg[0, 0], unReg[0, 1], unReg[0, 2], unReg[0, 3]))


def ridgeTest(xArr, yArr):
    """Fit ridge regression for NUM_LAMBDAS penalties on standardised data.

    The targets are mean-centred and the features are mean-centred and
    divided by their variances before fitting with ``lambda = exp(i - 10)``
    for ``i`` in ``range(NUM_LAMBDAS)``.

    Returns:
        An ndarray of shape (NUM_LAMBDAS, n_features); row ``i`` holds the
        weights for the i-th penalty.
    """
    xMat = np.asmatrix(xArr)
    yMat = np.asmatrix(yArr).T
    yMat = yMat - np.mean(yMat, axis=0)
    xMeans = np.mean(xMat, axis=0)
    xVar = np.var(xMat, axis=0)
    xMat = (xMat - xMeans) / xVar

    wMat = np.zeros((NUM_LAMBDAS, np.shape(xMat)[1]))
    for i in range(NUM_LAMBDAS):
        ws = ridgeRegres(xMat, yMat, np.exp(i - 10))
        wMat[i, :] = ws.T
    return wMat


if __name__ == "__main__":
    lgX = []
    lgY = []
    setDataCollect(lgX, lgY)
    crossValidation(lgX, lgY)

總結

以上是生活随笔為你收集整理的Part2-Chapter8-预测乐高玩具套装价格的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網(wǎng)站內(nèi)容還不錯，歡迎將生活随笔推薦給好友。

上一篇： Beta阶段团队项目开发篇章1
下一篇：简易电商宝贝放大镜效果--jQuery