httpclient爬取性感美图
生活随笔
收集整理的這篇文章主要介紹了
httpclient爬取性感美图
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
依賴 httpclient 4.2 和 Jsoup
SemeiziCrawler.java
package kidbei.learn.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.StringWriter;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;

import org.apache.commons.io.IOUtils;
import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.util.EntityUtils;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawler for http://sejie.wanxun.org/ that walks the paginated index,
 * opens every post ("article") linked from each page, extracts image URLs
 * from the post body and downloads them to {@link #IMGPATH}.
 *
 * Example post: http://sejie.wanxun.org/post/2012-09-25/40039413449
 *
 * Uses the HttpClient 4.2 API (DefaultHttpClient) and Jsoup for HTML parsing.
 */
public class SemeiziCrawler {

    private static final String BASEHOST = "http://sejie.wanxun.org/";
    // Shared pooled client; ConnectionManager configures timeouts and UA.
    private static DefaultHttpClient client = ConnectionManager.getHttpClient();
    static String url = "http://sejie.wanxun.org/post/2012-09-25/40039413449";
    // Target directory: one sub-folder per day (yyyyMMdd).
    private static String IMGPATH = "D:\\sexpicture\\色戒美眉圖" + File.separator + StringUtil.getDate();
    static int STARTPAGE = 1;
    static int PAGECOUNT = 100;

    public static void main(String[] args) {
        File f = new File(IMGPATH);
        if (!f.exists()) {
            f.mkdirs();
        }
        String host = BASEHOST;
        for (int i = STARTPAGE; i < PAGECOUNT; i++) {
            // Page 1 is the bare host; later pages live under /page/<n>.
            if (i != 1) {
                host = BASEHOST + "page/" + i;
            }
            System.out.println("進入第" + i + "頁");
            String pageContext = getResultByUrl(host);
            List<String> articleURLS = getArticleURL(pageContext);
            for (String articleURL : articleURLS) {
                String articleContext = getResultByUrl(articleURL);
                List<String> imgURLS = getImgURLS(articleContext);
                for (String imgURL : imgURLS) {
                    savepic(imgURL);
                }
            }
        }
    }

    /**
     * Fetches a URL and returns the response body decoded as UTF-8.
     *
     * @param url page to fetch
     * @return body text, or null when the request fails or has no entity
     */
    public static String getResultByUrl(String url) {
        System.out.println("打開網頁" + url);
        HttpGet get = new HttpGet(url);
        HttpEntity entity = null;
        HttpResponse response = null;
        try {
            response = client.execute(get);
            entity = response.getEntity();
            if (entity != null) {
                InputStream is = entity.getContent();
                try {
                    StringWriter sw = new StringWriter();
                    IOUtils.copy(is, sw, "UTF-8");
                    return sw.toString();
                } finally {
                    // FIX: close the stream even when copy() throws
                    // (was only closed on the success path).
                    IOUtils.closeQuietly(is);
                }
            }
        } catch (Exception e) {
            System.out.println("網頁打開出錯");
            return null;
        } finally {
            get.abort();
            try {
                // Return the connection to the pool.
                EntityUtils.consume(entity);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Extracts the URL of every post ("全文" link) on an index page.
     *
     * @param pageContext HTML of an index page, may be null
     * @return list of post URLs; empty (never null) when the page is missing
     *         or parsing fails — callers iterate the result directly
     */
    public static List<String> getArticleURL(String pageContext) {
        List<String> articleURLS = new ArrayList<String>();
        if (pageContext == null) {
            // FIX: was "return null", which made main()'s for-each throw NPE
            // whenever a page failed to load.
            return articleURLS;
        }
        System.out.println("尋找帖子...........");
        try {
            Document doc = Jsoup.parseBodyFragment(pageContext);
            Elements es = doc.select("div.post");
            es = es.select("div[class=post-item type-photo]");
            es = es.select("div.meta a:containsOwn(全文)");
            for (Element e : es) {
                articleURLS.add(e.attr("href"));
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
        return articleURLS;
    }

    /**
     * Extracts the image URLs from a post page.
     *
     * @param articleContext HTML of a post, may be null
     * @return list of image src URLs; empty (never null) on missing input
     */
    public static List<String> getImgURLS(String articleContext) {
        List<String> imgURLS = new ArrayList<String>();
        if (articleContext == null) {
            // FIX: was "return null" → NPE in main()'s for-each loop.
            return imgURLS;
        }
        System.out.println("獲取圖片地址-----------");
        Document doc = Jsoup.parse(articleContext);
        Elements es = doc.select("a[target=_blank] img[src]");
        for (Iterator<Element> i = es.iterator(); i.hasNext();) {
            Element e = i.next();
            imgURLS.add(e.attr("src"));
        }
        return imgURLS;
    }

    /**
     * Downloads one image to IMGPATH, keeping the last URL path segment
     * as the file name. Failures are logged and skipped.
     *
     * @param ImgURL image URL, ignored when null
     */
    public static void savepic(String ImgURL) {
        if (ImgURL == null) {
            return;
        }
        HttpGet get = new HttpGet(ImgURL);
        String[] strs = ImgURL.split("/");
        String fileName = strs[strs.length - 1];
        String savePath = IMGPATH + File.separator + fileName;
        HttpEntity entity = null;
        InputStream is = null;
        OutputStream os = null;
        try {
            HttpResponse response = client.execute(get);
            entity = response.getEntity();
            System.out.println("保存圖片>>>>.>>>>>>" + fileName);
            is = entity.getContent();
            os = new FileOutputStream(savePath);
            IOUtils.copy(is, os);
        } catch (Exception e) {
            e.printStackTrace();
            System.out.println("圖片保存失敗");
        } finally {
            // FIX: streams were only closed on the success path (leak on
            // exception) and the entity was never consumed, so the pooled
            // connection was not released on failure.
            IOUtils.closeQuietly(os);
            IOUtils.closeQuietly(is);
            try {
                EntityUtils.consume(entity);
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
// ---- StringUtil.java ----
package kidbei.learn.crawler;import java.io.File; import java.text.SimpleDateFormat; import java.util.Date; import java.util.Random;public class StringUtil {public static String getRandomString(){StringBuffer generateRandStr = new StringBuffer(); Random rand = new Random(); int length = 6; char ch;for(int i=0;i<length;i++) { int randNum = Math.abs(rand.nextInt())%26+97; // 產生97到122的隨機數(a-z的鍵位值) ch = ( char ) randNum;generateRandStr.append( ch );} return generateRandStr.toString(); }public static String getSavePath(String IMGPATH,String fileName){SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");String date = sdf.format(new Date()).toString();if(!(fileName.endsWith(".jpg"))){fileName = fileName + ".jpg";}String randStr = StringUtil.getRandomString();return IMGPATH+File.separator+date+File.separator+randStr+fileName;}public static String getDate(){SimpleDateFormat sdf = new SimpleDateFormat("yyyyMMdd");return sdf.format(new Date()).toString();} }ConnectionManager.java
package kidbei.learn.crawler;import org.apache.http.conn.scheme.PlainSocketFactory; import org.apache.http.conn.scheme.Scheme; import org.apache.http.conn.scheme.SchemeRegistry; import org.apache.http.conn.ssl.SSLSocketFactory; import org.apache.http.impl.client.DefaultHttpClient; import org.apache.http.impl.conn.PoolingClientConnectionManager; import org.apache.http.params.BasicHttpParams; import org.apache.http.params.CoreConnectionPNames; import org.apache.http.params.CoreProtocolPNames; import org.apache.http.params.HttpParams;public class ConnectionManager {static final int TIMEOUT = 20000;//連接超時時間static final int SO_TIMEOUT = 20000;//數據傳輸超時static String UA = "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1" +" (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1";public static DefaultHttpClient getHttpClient(){SchemeRegistry schemeRegistry = new SchemeRegistry();schemeRegistry.register(new Scheme("http",80,PlainSocketFactory.getSocketFactory()));schemeRegistry.register(new Scheme("https", 443, SSLSocketFactory.getSocketFactory()));PoolingClientConnectionManager cm = new PoolingClientConnectionManager(schemeRegistry);cm.setMaxTotal(500);cm.setDefaultMaxPerRoute(200);HttpParams params = new BasicHttpParams();params.setParameter(CoreConnectionPNames.CONNECTION_TIMEOUT,TIMEOUT);params.setParameter(CoreConnectionPNames.SO_TIMEOUT, SO_TIMEOUT);params.setParameter(CoreProtocolPNames.USER_AGENT, UA);DefaultHttpClient client = new DefaultHttpClient(cm,params);return client;} }本文轉自:http://www.oschina.net/code/snippet_257479_14524#23843
轉載于:https://www.cnblogs.com/dreammyle/p/4149687.html
總結
以上是生活随笔為你收集整理的httpclient爬取性感美图的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: SQL Server调优系列基础篇(联合
- 下一篇: BZOJ3738 : [Ontak201