# 当当网数据爬取 (Scraping data from dangdang.com)
# 尝试爬取当当网上的各种信息 (Try to scrape various information from dangdang.com)
import requests
from bs4 import BeautifulSoup
import time
import json

# Request headers imitating a real browser session so dangdang.com serves
# the normal search page instead of an anti-bot response.
header = {
    "Referer": "http://search.dangdang.com/?key=python&%253Bact=input&%253Bpage_index=%7B%7D&_ddclickunion=P-295132-199857_64_0_ZGljdHNfZ29vZ2xl_1%7Cad_type&page_index=3",
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.112 Safari/537.36 EastBrowser/2.1',
    "Accept-Language": "zh-CN,zh;q=0.9",
    "Accept-Ranges": "bytes",
    "Accept": "*/*",
}


def get_links(url, list):
    """Fetch one search-result page and hand every book-detail link to get_info.

    Args:
        url: URL of a dangdang search-result page.
        list: shared accumulator that get_info appends parsed records to.
            NOTE(review): the name shadows the builtin ``list``; kept only
            for compatibility with the existing call sites.
    """
    wb_data = requests.get(url, headers=header)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    # Each product title on the result page is an <a> inside <p class="name">.
    links = soup.select('p.name > a')
    for link in links:
        href = link.get("href")  # detail-page URL for one book
        get_info(href, list)
def get_info(url, list):
    """Fetch one book detail page and append {title, author, publisher} to list.

    Args:
        url: detail-page URL of a single book.
        list: shared accumulator the parsed record is appended to.
            NOTE(review): shadows the builtin ``list``; kept for caller
            compatibility.
    """
    wb_data = requests.get(url, headers=header)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select('#product_info > div.name_info > h1')
    authors = soup.select('#author > a')  # alt selector: #author > a:nth-child(1)
    publishers = soup.select('#product_info > div.messbox_info > span:nth-of-type(2) > a')
    # zip() stops at the shortest selection, so a page missing one field
    # simply yields no record instead of raising.
    for title, author, publisher in zip(titles, authors, publishers):
        data = {
            "title": title.get_text().strip(),
            "author": author.get_text().strip(),
            "publisher": publisher.get_text().strip(),
        }
        print(data)
        list.append(data)


if __name__ == "__main__":
    url = ["http://search.dangdang.com/?key=python&act=input&page_index={}".format(x)
           for x in range(1, 3)]
    list = []
    for i in url:
        get_links(i, list)
        time.sleep(1)  # be polite: pause between result pages
    print(len(list))
    filename = r"C:\Users\dell\Desktop\大三上學(xué)期\數(shù)據(爬取\爬蟲\程序\pythondata"
    # FIX: ensure_ascii=False plus an explicit utf-8 encoding keeps the Chinese
    # titles human-readable in the output file; the original ensure_ascii=True
    # escaped every non-ASCII character to \uXXXX, defeating the stated intent
    # of avoiding mojibake. indent=2 pretty-prints the JSON.
    with open(filename, "w", encoding="utf-8") as f_name:
        json.dump(list, f_name, ensure_ascii=False, indent=2, sort_keys=False)
# 当前新的反爬代码 (current new anti-crawl code below)
# Demo of fetching the same search page two ways.  NOTE(review): the httpx
# client is created with http2=True — presumably the site now behaves
# differently over plain HTTP/1.1 requests; confirm against current responses.
url = 'http://search.dangdang.com/?key=python&act=input&page_index=1'
wb_data = requests.get(url, headers=header)  # HTTP/1.1 attempt via requests

import httpx

client = httpx.Client(http2=True)
response = client.get(url)  # same URL string, fetched over HTTP/2
print(response.text)
總結(jié)
- 上一篇: 奇文共赏 史记-货殖列传-王石传
- 下一篇: 交易系统订单存在的意义