from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.parse import unquote
# Percent-encoded address of the Baidu Baike entry used as the crawl start page.
url = 'https://baike.baidu.com/item/%E9%87%91%E8%9E%8D/860'


def new_url(url):
    """Return *url* with its percent-escapes decoded as UTF-8 text."""
    return unquote(url, 'utf8')


# Download the page and parse it. The response is used as a context manager
# so the underlying connection is closed as soon as parsing has finished.
with urlopen(url) as html:
    bs = BeautifulSoup(html, 'html.parser')

# Print the target of every anchor tag that actually carries an href.
for link in bs.find_all('a'):
    if 'href' in link.attrs:
        print(link.attrs['href'])
# Both the wanted and the unwanted links are printed at this point, so a
# further filtering step is needed.
進一步篩選合適的詞條鏈接,發現詞條鏈接的共同點:
詞條鏈接都是類似于:/item/%E4%BC%9A%E8%AE%A1/88436 這樣的形式
利用正則表達式,篩選鏈接:
#^(/item/).*?/[0-9]*$
#https://baike.baidu.com/item/%E9%87%91%E8%9E%8D/860
from urllib.request import urlopen
from bs4 import BeautifulSoup
from urllib.parse import unquote
import re
# Percent-encoded address of the Baidu Baike entry used as the crawl start page.
url = 'https://baike.baidu.com/item/%E9%87%91%E8%9E%8D/860'


def new_url(url):
    """Return *url* with its percent-escapes decoded as UTF-8 text."""
    return unquote(url, 'utf8')


# Download the page and parse it. The response is used as a context manager
# so the underlying connection is closed as soon as parsing has finished.
with urlopen(url) as html:
    bs = BeautifulSoup(html, 'html.parser')

# Keep only anchors whose href starts with "/item/" and ends with a slash
# followed by an optional run of digits, e.g. /item/%E4%BC%9A%E8%AE%A1/88436.
entry_href = re.compile(r'^(/item/).*?/[0-9]*$')

# Filtering on href means every returned tag has that attribute, so no
# separate presence check is needed before reading it.
for link in bs.find_all('a', href=entry_href):
    print(link.attrs['href'])