Scraping 美空 (Moko) MM photos with a Python script (re)
I haven't blogged in a while; I've been working on risk-control projects, and once that material is sorted out I'll write it up so we can dig into it together.
Over the past couple of days I found time to write two Python crawler scripts: one based on re and one based on XPath.
Straight to the code. Here is the re-based version:
spider.py
# -*- coding:utf-8 -*-
import urllib.request
import re
import os
import http.cookiejar

import tool


# Spider for MOKO_MM (moko.cc)
class Spider:
    # Initialization
    def __init__(self):
        self.siteURL = 'http://www.moko.cc/focus|list.action'
        self.tool = tool.Tool()

    # Fetch the HTML of one index (listing) page
    def getPage(self, pageIndex):
        url = self.siteURL + "?type=4&curPage=" + str(pageIndex)
        request = urllib.request.Request(url)
        response = urllib.request.urlopen(request)
        return response.read().decode('utf-8')

    # Extract the info of every MM on an index page, returned as a list
    def getContents(self, pageIndex):
        page = self.getPage(pageIndex)
        pattern = re.compile(
            '<div class="subMainContent".*?<a href="(.*?)".*?subFocus-07.*?'
            '<img src="(.*?)".*?subFocus-08.*?<h1>(.*?)</h1>',
            re.S)
        items = re.findall(pattern, page)
        # item[0] detail-page link
        # item[1] thumbnail
        # item[2] title
        contents = []
        for item in items:
            contents.append([item[0], item[1],
                             item[2].replace(" ", "-").replace("|", "-")
                                    .replace(".", "-").replace(":", "-")])
        return contents

    # Fetch an MM's personal detail page
    def getDetailPage(self, infoURL):
        def makeMyOpener(head={
                'accept-encoding': 'deflate, sdch',
                'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
                'Accept-Language': 'zh-CN,zh;q=0.8',
                'Cookie': 'JSESSIONID=58C82905AD36B5DFA8D4F1C98A2559DC; Hm_lvt_8d82e75c6168ba4bc0135a08edae2a2e=1488505496; Hm_lpvt_8d82e75c6168ba4bc0135a08edae2a2e',
                'Referer': 'https://mm.taobao.com/687471686.htm',
                'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:51.0) Gecko/20100101 Firefox/51.0'}):
            cookie = http.cookiejar.CookieJar()
            opener = urllib.request.build_opener(urllib.request.HTTPCookieProcessor(cookie))
            header = []
            for key, value in head.items():
                elem = (key, value)
                header.append(elem)
            opener.addheaders = header
            return opener

        oper = makeMyOpener()
        uop = oper.open(infoURL)
        data = uop.read().decode('utf-8')
        return data
        # response = urllib.request.urlopen(infoURL)
        # return response.read().decode('utf-8')

    # Extract the personal text profile
    def getBrief(self, page):
        pattern = re.compile(
            '<div class="infoShow-12".*?<p align="center".*?<strong>(.*?)</strong>.*?'
            '<strong>(.*?)</strong>.*?<strong>(.*?)</strong>.*?<strong>(.*?)</strong>',
            re.S)
        items = re.findall(pattern, page)
        # item[0] homepage URL
        # item[1] avatar
        # item[2] name
        contents = []
        for item in items:
            contents.append([item[0], item[1], item[2]])
        return contents
        # result = re.search(pattern, page)
        # print(result.group())
        # return self.tool.replace(result.group(1))

    # Extract all photos on a detail page
    def getAllImg(self, page):
        pattern = re.compile('<div class="infoShow-12">(.*?)<div class="infoShow-13">', re.S)
        # The HTML of the personal-info section
        content = re.search(pattern, page)
        # Pull the image URLs out of that HTML
        patternImg = re.compile('<img.*?src="(.*?)"', re.S)
        images = re.findall(patternImg, content.group(1))
        return images

    # Save a set of photos
    def saveImgs(self, images, name):
        number = 1
        print("Found", name, "with", len(images), "photos")
        for imageURL in images:
            splitPath = imageURL.split('.')
            fTail = splitPath.pop()
            if len(fTail) > 3:
                fTail = "jpg"
            fileName = name + "/" + str(number) + "." + fTail
            self.saveImg(imageURL, fileName)
            number += 1

    # Save the avatar
    def saveIcon(self, iconURL, name):
        splitPath = iconURL.split('.')
        fTail = splitPath.pop()
        fileName = name + "/icon." + fTail
        self.saveImg(iconURL, fileName)

    # Save the personal profile
    def saveBrief(self, content, name):
        fileName = name + "/" + name + ".txt"
        f = open(fileName, "w+")
        print("Saving profile to", fileName)
        # f.write(content.decode('utf-8'))
        f.write(content)
        f.close()

    # Save the image-address page into each MM's folder
    def saveToLocal(self, Li, name):
        fileName = name + "/" + "urlPage.txt"
        print("Saving image-address page:", fileName)
        f = open(fileName, "w")
        f.write(Li)
        f.close()
        # Append the current name to url.txt for later reuse
        content = name + " "
        with open('url.txt', 'a') as url:
            url.write(content)
        print(name + " appended!\n")

    # Save a single image given its URL and a file name
    def saveImg(self, imageURL, fileName):
        try:
            u = urllib.request.urlopen(imageURL)
            data = u.read()
            f = open(fileName, 'wb')
            f.write(data)
            print("Saving image", fileName)
            f.close()
        except urllib.request.URLError as e:
            print(e.reason)

    # Create a new directory
    def mkdir(self, path):
        path = path.strip()
        # Check whether the path already exists
        isExists = os.path.exists(path)
        if not isExists:
            # Create the directory if it does not exist
            print("Created a folder named", path)
            os.makedirs(path)
            return True
        else:
            # Skip creation and report that the folder already exists
            print("A folder named", path, "already exists")
            return False

    # Save the info of one listing page of MOKO MMs
    def savePageInfo(self, pageIndex):
        # Get the MM list for this page
        contents = self.getContents(pageIndex)
        for item in contents:
            # item[0] detail URL, item[1] avatar URL, item[2] name
            print("Found an MM named", item[2])
            print("Saving", item[2])
            print("Detail page:", "http://www.moko.cc" + str(item[0]))
            # URL of the personal detail page
            detailURL = "http://www.moko.cc" + str(item[0])
            # Fetch the detail-page HTML
            detailPage = self.getDetailPage(detailURL)
            # Get the profile
            # brief = self.getBrief(detailPage)
            # Get the image list and download it
            self.mkdir(item[2])
            images = self.getAllImg(detailPage)
            self.saveImgs(images, item[2])
            # Save the profile
            # self.saveBrief(brief.encode('utf-8'), item[2])
            # self.saveBrief(brief, item[2])
            # Save the image-address page locally
            # self.saveToLocal(detailPage, item[2])
            # Save the avatar
            self.saveIcon("https:" + str(item[1]), item[2])

    # Delete the old name list, if any
    def deleteOldTxt(self):
        filename = 'url.txt'
        if os.path.exists(filename):
            os.remove(filename)
            print("\nOld name list found and deleted\nCollection starts\n")

    # Crawl and save every listing page from start to end
    def savePagesInfo(self, start, end):
        for i in range(start, end + 1):
            print("Scanning listing page", i)
            self.savePageInfo(i)
            # Save images
            # self.saveImgs(images, item[2])

    # Read the name list
    def openNameList(self):
        with open("url.txt", "r") as f:
            for line in f:
                line = line.strip()
                # \s matches spaces and tabs, \s+ means at least one
                result = re.split(r'\s+', line)
            return result

    # Re-save images from the urlPage.txt stored in each folder
    def saveAll(self):
        names = self.openNameList()
        for name in names:
            print("Currently saving the photos of " + name)
            filepath = name + "/urlPage.txt"
            with open(filepath, "r") as urlContent:
                urlContent = urlContent.read()
            images = self.getAllImg(urlContent)
            self.saveImgs(images, name)


# Pass the start and end page numbers; (1, 10) crawls listing pages 1 through 10
spider = Spider()
spider.deleteOldTxt()
spider.savePagesInfo(1, 10)
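
As a quick sanity check of the listing pattern used in getContents(), the snippet below runs the same regular expression against a small handcrafted HTML fragment. The fragment is invented purely for illustration; the real markup on moko.cc may differ:

# -*- coding:utf-8 -*-
# Minimal, self-contained demo of the getContents() pattern.
# The HTML fragment below is made up for illustration only.
import re

sample = '''
<div class="subMainContent">
  <a href="/post/123.html">
    <div class="subFocus-07"><img src="http://img.example.com/thumb.jpg"></div>
    <div class="subFocus-08"><h1>Example Title</h1></div>
  </a>
</div>
'''

pattern = re.compile(
    '<div class="subMainContent".*?<a href="(.*?)".*?subFocus-07.*?'
    '<img src="(.*?)".*?subFocus-08.*?<h1>(.*?)</h1>',
    re.S)

for link, thumb, title in re.findall(pattern, sample):
    print(link, thumb, title)
# Prints: /post/123.html http://img.example.com/thumb.jpg Example Title

Because the pattern is compiled with re.S, the dot also matches newlines, so each match yields a (detail link, thumbnail, title) tuple, which is exactly what getContents() stores.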
tool.py

# -*- coding:utf-8 -*-
import re


# Helper class for cleaning up page tags
class Tool:
    # Strip <img> tags, runs of 1-7 spaces, and &#160; entities
    removeImg = re.compile(r'<img.*?>| {1,7}|&#160;')
    # Remove hyperlink tags
    removeAddr = re.compile(r'<a.*?>|</a>')
    # Replace line-breaking tags with \n
    replaceLine = re.compile(r'<tr>|<div>|</div>|</p>')
    # Replace table cells <td> with \t
    replaceTD = re.compile(r'<td>')
    # Replace single or double <br> with \n
    replaceBR = re.compile(r'<br><br>|<br>')
    # Strip any remaining tags
    removeExtraTag = re.compile(r'<.*?>')
    # Collapse consecutive blank lines
    removeNoneLine = re.compile(r'\n+')
    # Remove leftover &#160; entities
    removeSpace = re.compile(r'&#160;')

    def replace(self, x):
        x = re.sub(self.removeImg, "", x)
        x = re.sub(self.removeAddr, "", x)
        x = re.sub(self.replaceLine, "\n", x)
        x = re.sub(self.replaceTD, "\t", x)
        x = re.sub(self.replaceBR, "\n", x)
        x = re.sub(self.removeExtraTag, "", x)
        x = re.sub(self.removeNoneLine, "\n", x)
        x = re.sub(self.removeSpace, "", x)
        # strip() trims leading/trailing whitespace
        return x.strip()
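
The XPath-based script mentioned at the start of the post is not included here. As a rough, untested sketch, the same listing extraction could look like this with lxml; the XPath expressions below are guessed from the class names in spider.py's regex, not taken from the actual XPath script:

# -*- coding:utf-8 -*-
# Rough lxml/XPath sketch of the listing extraction (not the original XPath script).
# The expressions are inferred from the class names used in spider.py and may need
# adjusting to the real page structure.
from lxml import html


def get_contents_xpath(page_html):
    tree = html.fromstring(page_html)
    contents = []
    for block in tree.xpath('//div[@class="subMainContent"]'):
        link = block.xpath('.//a/@href')
        thumb = block.xpath('.//div[@class="subFocus-07"]//img/@src')
        title = block.xpath('.//div[@class="subFocus-08"]//h1/text()')
        if link and thumb and title:
            contents.append([link[0], thumb[0], title[0].strip()])
    return contents

Compared with one long regular expression, an element-based query like this tends to be less brittle when attribute order or whitespace in the markup changes.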
Summary
The re-based spider walks the Moko listing pages, uses regular expressions to pull each MM's detail-page URL, thumbnail, and name, then downloads every photo on the detail page into a folder named after the MM, along with the avatar. tool.py holds the tag-stripping helper used for text cleanup.