python爬取交通违法记录_python爬虫爬取汽车页面信息,并附带分析(静态爬虫)...
1 importrequests2 from bs4 importBeautifulSoup3 importre4 importrandom5 importtime6
7
# Crawler main function.
def mm(url):
    """Crawl every review page for car model 257 on newcar.xcar.com.cn.

    Starting from *url* (the first review-listing page), discover the total
    page count from the pager, then walk every page and append one
    space-separated record per review -- user, source, useful-vote count,
    review type, comment text -- to 'aika_qc_gn_1_1_1.txt'.

    The site serves GB-encoded pages, so responses are decoded as gb18030
    and the output file is written in the same encoding.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}

    def _soup(target):
        # Fetch *target* and return a parsed BeautifulSoup document.
        # gb18030 decoding avoids mojibake on this GB-encoded site.
        resp = requests.get(url=target, headers=header)
        resp.encoding = "gb18030"
        return BeautifulSoup(resp.text, "html.parser")

    def _clean(text):
        # Strip and collapse the whitespace noise embedded in the markup.
        return text.strip().replace("\n", "").replace("\t", "").replace("\r", "")

    soup0 = _soup(url)

    # The last-but-one pager link holds the final page number.
    total_page = int(soup0.find("div", class_="pagers").findAll("a")[-2].get_text())

    # 'with' guarantees the file is closed even if a page blows up mid-crawl.
    # errors='ignore' drops characters gb18030 cannot encode.
    with open("aika_qc_gn_1_1_1.txt", "a", encoding='gb18030', errors='ignore') as myfile:
        print("user", "來源", "認為有用人數", "類型", "comment")
        # Header row is space-separated so fenxi() can split it like a data row.
        myfile.write("user" + " " + "來源" + " " + "認為有用人數" + " " + "類型" + " " + "comment" + "\n")

        for i in range(1, total_page + 1):
            # Keep the listing-page URL in its own name; the original code
            # rebound the 'url' argument here, which was confusing.
            page_url = "http://newcar.xcar.com.cn/257/review/0/0_" + str(i) + ".htm"
            soup = _soup(page_url)
            contents = soup.find('div', class_="review_comments").findAll("dl")

            for tiaoshu, content in enumerate(contents):
                try:
                    # Progress line: page number, entry number, page URL.
                    print("正在爬取第%d頁的第%d的評論,網址為%s" % (i, tiaoshu + 1, page_url))

                    # Review angle (extracted but not written to the file;
                    # kept to preserve the original behaviour).
                    try:
                        comment_jiaodu = _clean(content.find("dt").find("em").find("a").get_text())
                    except Exception:
                        comment_jiaodu = "sunny"

                    # Review type, e.g. the text between 【 and 】.
                    try:
                        comment_type0 = _clean(content.find("dt").get_text())
                        comment_type = comment_type0.split("【")[1].split("】")[0]
                    except Exception:
                        comment_type = "sunny"

                    # Number of people who found this review useful.
                    try:
                        useful = int(_clean(
                            content.find("dd").find("div", class_="useful").find("i").find("span").get_text()))
                    except Exception:
                        useful = "sunny"

                    # Review source (region).
                    try:
                        comment_region = _clean(content.find("dd").find("p").find("a").get_text())
                    except Exception:
                        comment_region = "sunny"

                    # Reviewer name: the text after the last colon.
                    try:
                        user = _clean(content.find("dd").find("p").get_text()).split(":")[-1]
                    except Exception:
                        user = "sunny"

                    # Full comment text lives on a detail page linked from the
                    # entry; fall back to the inline snippet if that fails.
                    try:
                        comment_url = content.find('dt').findAll('a')[-1]['href']
                        soupc = _soup(comment_url)
                        t_msg = soupc.find('div', id='mainNew').find(
                            'div', class_='maintable').findAll('form')[1].find('table', class_='t_msg')
                        try:
                            comment = _clean(t_msg.findAll('tr')[1].find('font').get_text())
                        except Exception:
                            comment = "sunny"
                        # Post time (extracted but not written; kept as-is).
                        try:
                            comment_time = _clean(t_msg.find(
                                'div', style='padding-top: 4px;float:left').get_text())[4:]
                        except Exception:
                            comment_time = "sunny"
                    except Exception:
                        try:
                            comment = _clean(
                                content.find("dd").get_text().split("\n")[-1].split('\r')[-1]).split(":")[-1]
                        except Exception:
                            comment = "sunny"

                    # Polite random delay between requests.
                    time.sleep(random.uniform(1, 3))
                    print(user, comment_region, useful, comment_type, comment)
                    myfile.write(user + " " + comment_region + " " + str(useful)
                                 + " " + comment_type + " " + comment + "\n")
                except Exception as e:
                    # Log and keep going: one bad entry must not abort the crawl.
                    print(e)
                    print("爬取第%d頁的第%d的評論失敗,網址為%s" % (i, tiaoshu + 1, page_url))
136
# Tally the distribution of review types in the scraped file.
def fenxi():
    """Read 'aika_qc_gn_1_1_1.txt' and print the percentage of good (好評),
    neutral (中評) and bad (差評) reviews, plus the share of un-rated lines.

    Lines without a 4th space-separated field (e.g. the header row) are
    counted as un-rated instead of crashing, and percentages are computed
    only when at least one rated review exists.
    """
    good = 0
    middle = 0
    bad = 0
    nn = 0
    # Match the encoding mm() writes with; reading with the platform default
    # could raise UnicodeDecodeError or silently mis-decode the labels.
    with open("aika_qc_gn_1_1_1.txt", "r", encoding='gb18030', errors='ignore') as myfile:
        for line in myfile:
            fields = line.split(" ")
            # Guard: header / malformed lines may have fewer than 4 fields.
            commit = fields[3] if len(fields) > 3 else ""
            if commit == "好評":
                good = good + 1
            elif commit == "中評":
                middle = middle + 1
            elif commit == "差評":
                bad = bad + 1
            else:
                nn = nn + 1
    # Percentages are relative to the rated reviews only (original formula).
    rated = good + middle + bad
    if rated == 0:
        # Nothing rated: avoid ZeroDivisionError and print nothing.
        return
    g = round(good / rated * 100, 2)
    m = round(middle / rated * 100, 2)
    b = round(bad / rated * 100, 2)
    n = round(nn / rated * 100, 2)
    print("好評占比:", g)
    print("中評占比:", m)
    print("差評占比:", b)
    print("未評論:", n)
164
# Script entry point: crawl the first review-listing page, then summarise
# the results.  The __main__ guard keeps the network crawl from firing if
# this module is ever imported rather than run directly.
if __name__ == "__main__":
    url = "http://newcar.xcar.com.cn/257/review/0.htm"
    mm(url)
    fenxi()
總結
以上是生活随笔為你收集整理的python爬取交通违法记录_python爬虫爬取汽车页面信息,并附带分析(静态爬虫)...的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 数据分析(3): 漏斗观察法
- 下一篇: jquery dwn 开发学习