python爬取交通违法记录_python爬虫爬取汽车页面信息,并附带分析(静态爬虫)...
1 importrequests2 from bs4 importBeautifulSoup3 importre4 importrandom5 importtime6
7
# Crawler main function.
def mm(url):
    """Crawl every review page for car model 257 on newcar.xcar.com.cn.

    Starting from *url* (the first review-listing page), discover the total
    page count from the pager, then walk every page and append one
    space-separated record per review -- user, source, useful-vote count,
    review type, comment text -- to 'aika_qc_gn_1_1_1.txt'.

    The site serves GB-encoded pages, so responses are decoded as gb18030
    and the output file is written in the same encoding.
    """
    header = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"}

    def _soup(target):
        # Fetch *target* and return a parsed BeautifulSoup document.
        # gb18030 decoding avoids mojibake on this GB-encoded site.
        resp = requests.get(url=target, headers=header)
        resp.encoding = "gb18030"
        return BeautifulSoup(resp.text, "html.parser")

    def _clean(text):
        # Strip and collapse the whitespace noise embedded in the markup.
        return text.strip().replace("\n", "").replace("\t", "").replace("\r", "")

    soup0 = _soup(url)

    # The last-but-one pager link holds the final page number.
    total_page = int(soup0.find("div", class_="pagers").findAll("a")[-2].get_text())

    # 'with' guarantees the file is closed even if a page blows up mid-crawl.
    # errors='ignore' drops characters gb18030 cannot encode.
    with open("aika_qc_gn_1_1_1.txt", "a", encoding='gb18030', errors='ignore') as myfile:
        print("user", "來源", "認為有用人數", "類型", "comment")
        # Header row is space-separated so fenxi() can split it like a data row.
        myfile.write("user" + " " + "來源" + " " + "認為有用人數" + " " + "類型" + " " + "comment" + "\n")

        for i in range(1, total_page + 1):
            # Keep the listing-page URL in its own name; the original code
            # rebound the 'url' argument here, which was confusing.
            page_url = "http://newcar.xcar.com.cn/257/review/0/0_" + str(i) + ".htm"
            soup = _soup(page_url)
            contents = soup.find('div', class_="review_comments").findAll("dl")

            for tiaoshu, content in enumerate(contents):
                try:
                    # Progress line: page number, entry number, page URL.
                    print("正在爬取第%d頁的第%d的評論,網址為%s" % (i, tiaoshu + 1, page_url))

                    # Review angle (extracted but not written to the file;
                    # kept to preserve the original behaviour).
                    try:
                        comment_jiaodu = _clean(content.find("dt").find("em").find("a").get_text())
                    except Exception:
                        comment_jiaodu = "sunny"

                    # Review type, e.g. the text between 【 and 】.
                    try:
                        comment_type0 = _clean(content.find("dt").get_text())
                        comment_type = comment_type0.split("【")[1].split("】")[0]
                    except Exception:
                        comment_type = "sunny"

                    # Number of people who found this review useful.
                    try:
                        useful = int(_clean(
                            content.find("dd").find("div", class_="useful").find("i").find("span").get_text()))
                    except Exception:
                        useful = "sunny"

                    # Review source (region).
                    try:
                        comment_region = _clean(content.find("dd").find("p").find("a").get_text())
                    except Exception:
                        comment_region = "sunny"

                    # Reviewer name: the text after the last colon.
                    try:
                        user = _clean(content.find("dd").find("p").get_text()).split(":")[-1]
                    except Exception:
                        user = "sunny"

                    # Full comment text lives on a detail page linked from the
                    # entry; fall back to the inline snippet if that fails.
                    try:
                        comment_url = content.find('dt').findAll('a')[-1]['href']
                        soupc = _soup(comment_url)
                        t_msg = soupc.find('div', id='mainNew').find(
                            'div', class_='maintable').findAll('form')[1].find('table', class_='t_msg')
                        try:
                            comment = _clean(t_msg.findAll('tr')[1].find('font').get_text())
                        except Exception:
                            comment = "sunny"
                        # Post time (extracted but not written; kept as-is).
                        try:
                            comment_time = _clean(t_msg.find(
                                'div', style='padding-top: 4px;float:left').get_text())[4:]
                        except Exception:
                            comment_time = "sunny"
                    except Exception:
                        try:
                            comment = _clean(
                                content.find("dd").get_text().split("\n")[-1].split('\r')[-1]).split(":")[-1]
                        except Exception:
                            comment = "sunny"

                    # Polite random delay between requests.
                    time.sleep(random.uniform(1, 3))
                    print(user, comment_region, useful, comment_type, comment)
                    myfile.write(user + " " + comment_region + " " + str(useful)
                                 + " " + comment_type + " " + comment + "\n")
                except Exception as e:
                    # Log and keep going: one bad entry must not abort the crawl.
                    print(e)
                    print("爬取第%d頁的第%d的評論失敗,網址為%s" % (i, tiaoshu + 1, page_url))
136
# Tally the distribution of review types in the scraped file.
def fenxi():
    """Read 'aika_qc_gn_1_1_1.txt' and print the percentage of good (好評),
    neutral (中評) and bad (差評) reviews, plus the share of un-rated lines.

    Lines without a 4th space-separated field (e.g. the header row) are
    counted as un-rated instead of crashing, and percentages are computed
    only when at least one rated review exists.
    """
    good = 0
    middle = 0
    bad = 0
    nn = 0
    # Match the encoding mm() writes with; reading with the platform default
    # could raise UnicodeDecodeError or silently mis-decode the labels.
    with open("aika_qc_gn_1_1_1.txt", "r", encoding='gb18030', errors='ignore') as myfile:
        for line in myfile:
            fields = line.split(" ")
            # Guard: header / malformed lines may have fewer than 4 fields.
            commit = fields[3] if len(fields) > 3 else ""
            if commit == "好評":
                good = good + 1
            elif commit == "中評":
                middle = middle + 1
            elif commit == "差評":
                bad = bad + 1
            else:
                nn = nn + 1
    # Percentages are relative to the rated reviews only (original formula).
    rated = good + middle + bad
    if rated == 0:
        # Nothing rated: avoid ZeroDivisionError and print nothing.
        return
    g = round(good / rated * 100, 2)
    m = round(middle / rated * 100, 2)
    b = round(bad / rated * 100, 2)
    n = round(nn / rated * 100, 2)
    print("好評占比:", g)
    print("中評占比:", m)
    print("差評占比:", b)
    print("未評論:", n)
164
# Script entry point: crawl the first review-listing page, then summarise
# the results.  The __main__ guard keeps the network crawl from firing if
# this module is ever imported rather than run directly.
if __name__ == "__main__":
    url = "http://newcar.xcar.com.cn/257/review/0.htm"
    mm(url)
    fenxi()
總結
以上是生活随笔為你收集整理的python爬取交通违法记录_python爬虫爬取汽车页面信息,并附带分析(静态爬虫)...的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 数据分析(3): 漏斗观察法
- 下一篇: jquery dwn 开发学习