【Java爬虫】爬取淘宝买家秀
生活随笔
收集整理的這篇文章主要介紹了
【Java爬虫】爬取淘宝买家秀
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
爬取目標
https://h5.m.taobao.com/ocean/privatenode/shop.html?&sellerId=50852803
需要sellerId=50852803的50852803
獲取數據地址 https://acs.m.taobao.com/h5/mtop.taobao.social.feed.aggregate/1.0/?appKey=12574478&t=1582778795899&sign=367a770e5a56cfaafc350da1da6b7d76&api=mtop.taobao.social.feed.aggregate&v=1.0&timeout=300000&timer=300000&type=jsonp&dataType=jsonp&callback=mtopjsonp1&data=%7B%22params%22%3A%22%7B%5C%22nodeId%5C%22%3A%5C%22%5C%22%2C%5C%22sellerId%5C%22%3A%5C%2250852803%5C%22%7D%22%2C%22cursor%22%3A%221%22%2C%22pageNum%22%3A%221%22%2C%22pageId%22%3A5703%2C%22env%22%3A%221%22%7D 其中 t 為當前時間戳,sign 為 (token + "&" + t + "&" + appKey + "&" + data) 這幾個參數拼接后轉成 MD5。我們需要獲取的就只有 token,而 token 是服務器傳過來的,所以偽造一次訪問獲取返回的 token,然后再訪問數據。加端端老師免費領取更多編程資料
pom.xml
<dependency><groupId>org.apache.httpcomponents</groupId><artifactId>httpclient</artifactId><version>4.5.11</version></dependency><dependency><groupId>com.alibaba</groupId><artifactId>fastjson</artifactId><version>1.2.62</version></dependency><dependency><groupId>org.projectlombok</groupId><artifactId>lombok</artifactId><optional>true</optional></dependency>TbBuyerShow.java
@Data @NoArgsConstructor public class TbBuyerShow {private String sellerId; //店鋪類別IDprivate String title; //店鋪名稱private String userName; //用戶名稱private String userUrl; //用戶鏈接private String userTitle; //用戶評(píng)論private String imgId; //圖片IDprivate String imgUrl; //圖片銜接private String targetUrl; //圖片來(lái)源private Integer pageNum; }BuyerShowReptile.Java
public class BuyerShowReptile {public static void main(String[] args) {List<TbBuyerShow> reptile = reptile("50852803", 1, 20);reptile.forEach(tbBuyerShow -> System.out.println(tbBuyerShow.getImgUrl()));}//ID,第幾頁(yè),固定參數(shù)public static List<TbBuyerShow> reptile(String sellerId, int index, int num) {String url = "https://acs.m.taobao.com/h5/mtop.taobao.social.feed.aggregate/1.0/?";String appKey = "12574478";String t = String.valueOf(new Date().getTime());String sign = "af1fde903d6e32e57aaf3377e6a68f3a";String data = "{\"params\":" +"\"{\\\"nodeId\\\":" +"\\\"\\\",\\\"sellerId\\\":" +"\\\"" + sellerId + "\\\",\\\"pagination\\\":" +"{\\\"direction\\\":" +"\\\"1\\\",\\\"hasMore\\\":" +"\\\"true\\\",\\\"pageNum\\\":" +"\\\"" + index + "\\\",\\\"pageSize\\\":" +"\\\"" + num + "\\\"}}\",\"cursor\":" +"\"" + index + "\",\"pageNum\":" +"\"" + index + "\",\"pageId\":" +"5703,\"env\":" +"\"1\"}";Params params = newParams(appKey, t, sign, data);String str = htmlUrl(url, params);String mh5tk = "";String mh5tkenc = "";String token = "";String u;CookieStore cookieStore = new BasicCookieStore();CloseableHttpClient httpClient = HttpClientBuilder.create().setDefaultCookieStore(cookieStore).build();HttpGet httpGet = new HttpGet(str);CloseableHttpResponse response = null;try {response = httpClient.execute(httpGet);List<Cookie> cookies = cookieStore.getCookies();for (Cookie cookie : cookies) {if ("_m_h5_tk".equals(cookie.getName())) {mh5tk = cookie.getValue();token = mh5tk.split("_")[0];}if ("_m_h5_tk_enc".equals(cookie.getName())) {mh5tkenc = cookie.getValue();}}u = token + "&" + params.getT() + "&" + appKey + "&" + data;sign = DigestUtils.md5DigestAsHex(u.getBytes());params = newParams(appKey, t, sign, data);str = htmlUrl(url, params);Cookie cookie = new BasicClientCookie("_m_h5_tk", mh5tk);((BasicClientCookie) cookie).setAttribute("_m_h5_tk_enc", mh5tkenc);cookieStore.addCookie(cookie);httpClient = HttpClientBuilder.create().setDefaultCookieStore(cookieStore).build();httpGet = 
new HttpGet(str);response = httpClient.execute(httpGet);HttpEntity entity = response.getEntity();String conResult = EntityUtils.toString(entity, "UTF-8");return newTbBuyerShow(conResult, sellerId, index);} catch (IOException e) {e.printStackTrace();} finally {try {if (httpClient != null) {httpClient.close();}if (response != null) {response.close();}} catch (IOException e) {e.printStackTrace();}}return null;}static List<TbBuyerShow> newTbBuyerShow(String conResult, String sellerId, Integer index) {List<TbBuyerShow> tbBuyerShows = new ArrayList<>();String title = ""; //店鋪名稱String userName = ""; //用戶名稱String userUrl = ""; //用戶鏈接String userTitle = ""; //用戶評(píng)論String imgId; //圖片IDString imgUrl; //圖片銜接String targetUrl = ""; //圖片來(lái)源Integer pageNum = index; //頁(yè)碼if (!StringUtils.isEmpty(conResult)) {conResult = conResult.replace("mtopjsonp(", "");conResult = conResult.replace(")", "");JSONObject jsonObject = JSON.parseObject(conResult);jsonObject = jsonObject.getJSONObject("data");if (!StringUtils.isEmpty(jsonObject)) {JSONObject header = jsonObject.getJSONObject("header");if (!StringUtils.isEmpty(header)) {title = (String) header.get("title");}JSONArray userList = jsonObject.getJSONArray("list");if (!StringUtils.isEmpty(userList)) {for (int i = 0; i < userList.size(); i++) {JSONObject list = userList.getJSONObject(i);JSONObject user = list.getJSONObject("user");if (!StringUtils.isEmpty(user)) {userName = (String) user.get("userNick");userUrl = (String) user.get("userUrl");}if (!StringUtils.isEmpty(list.get("title"))) {userTitle = (String) list.get("title");}if (!StringUtils.isEmpty(list.get("targetUrl"))) {targetUrl = (String) list.get("targetUrl");}JSONArray picsList = list.getJSONArray("pics");if (!StringUtils.isEmpty(picsList)) {for (int j = 0; j < picsList.size(); j++) {TbBuyerShow tbBuyerShow = new TbBuyerShow();JSONObject pics = picsList.getJSONObject(j);imgId = (String) pics.get("id");imgUrl = (String) 
pics.get("path");tbBuyerShow.setSellerId(sellerId);tbBuyerShow.setTitle(title);tbBuyerShow.setUserName(userName);tbBuyerShow.setUserUrl(userUrl);tbBuyerShow.setUserTitle(userTitle);tbBuyerShow.setImgId(imgId);tbBuyerShow.setImgUrl(imgUrl);tbBuyerShow.setTargetUrl(targetUrl);tbBuyerShow.setPageNum(pageNum);tbBuyerShows.add(tbBuyerShow);}}}}}}return tbBuyerShows;}static Params newParams(String appkey, String t, String sign, String data) {Params params = new Params();params.setAppKey(appkey);params.setT(t);params.setSign(sign);params.setApi("mtop.taobao.social.feed.aggregate");params.setV("1.0");params.setTimeout("300000");params.setTimer("300000");params.setType("jsonp");params.setDataType("jsonp");params.setCallback("mtopjsonp");params.setData(data);return params;}/*** * https://acs.m.taobao.com/h5/mtop.taobao.social.feed.aggregate/1.0/* * ?appKey=12574478* * &t=1581927984172* * &sign=e83a3add7b5fc1b70b0601a2ccd133e9* * &api=mtop.taobao.social.feed.aggregate* * &v=1.0* * &timeout=300000* * &timer=300000* * &type=jsonp* * &dataType=jsonp* * &callback=mtopjsonp1* * &data=%7B%22params%22%3A%22%7B%5C%22nodeId%5C%22%3A%5C%22%5C%22%2C%5C%22sellerId%5C%22%3A%5C%22109043255%5C%22%7D%22%2C%22cursor%22%3A%221%22%2C%22pageNum%22%3A%221%22%2C%22pageId%22%3A5703%2C%22env%22%3A%221%22%7D* *** @param url* @return*/static String htmlUrl(String url, Params params) {StringBuffer buffer = new StringBuffer();try {buffer.append(url).append("appkey=" + URLEncoder.encode(params.getAppKey(), "utf-8")).append("&t=" + URLEncoder.encode(params.getT(), "utf-8")).append("&sign=" + URLEncoder.encode(params.getSign(), "utf-8")).append("&api=" + URLEncoder.encode(params.getApi(), "utf-8")).append("&v=" + URLEncoder.encode(params.getV(), "utf-8")).append("&timeout=" + URLEncoder.encode(params.getTimeout(), "utf-8")).append("&timer=" + URLEncoder.encode(params.getTimer(), "utf-8")).append("&type=" + URLEncoder.encode(params.getType(), "utf-8")).append("&dataType=" + 
URLEncoder.encode(params.getDataType(), "utf-8")).append("&callback=" + URLEncoder.encode(params.getCallback(), "utf-8")).append("&data=" + URLEncoder.encode(params.getData(), "utf-8"));} catch (UnsupportedEncodingException e) {e.printStackTrace();}return buffer.toString();} }總結(jié)
以上是生活随笔為你收集整理的【Java爬虫】爬取淘宝买家秀的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: [react] react的书写规范有哪
- 下一篇: mac使用brew update无反应解