C++爬虫项目爬取图片
生活随笔
收集整理的這篇文章主要介紹了
C++爬虫项目爬取图片
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
C++爬蟲項目爬取圖片,
值得注意的是,有些網站設有反爬機制,其圖片無法用這種簡單的方式爬取下來.
主要代碼文件
main.cpp文件里面的代碼(原文此處未附上代碼;CHttp.cpp 以 extern 引用的隊列 p、q 和函數 loadImage 應在該文件中定義)
CHttp.h 文件里面的代碼
#include<iostream> #include<windows.h> #include<string> #include<queue> //#include<WinSock2.h>在windows里邊 using namespace std;#pragma comment(lib,"ws2_32.lib")//網絡的庫class CHttp { private:string m_host;string m_object;SOCKET m_socket;bool AnalyseUrl(string url);//解析URL\httpbool AnalyseUrl2(string url);//\httpsbool init();//初始化套接字bool Connect();//連接web服務器 public:CHttp(void);~CHttp(void);string FetchGet(string url);//通過Get方式獲取網頁void AnalyseHtml(string html);//解析網頁,獲得圖片地址和其他的鏈接 };CHttp.cpp 實現的文件的代碼是
#include "CHttp.h"CHttp::CHttp(void) {}CHttp::~CHttp(void) {closesocket(m_socket);WSACleanup(); }//解析URL\http bool CHttp::AnalyseUrl(string url) {if (string::npos == url.find("http://"))return false;if (url.length() <= 7)return false;int pos = url.find('/', 7);if (pos == string::npos){m_host = url.substr(7);m_object = '/';}else{m_host = url.substr(7, pos - 7);m_object = url.substr(pos);}if (m_host.empty())return false;return true; }//解析URL\https bool CHttp::AnalyseUrl2(string url) {if (string::npos == url.find("https://"))return false;if (url.length() <= 8)return false;int pos = url.find('/', 8);if (pos == string::npos){m_host = url.substr(8);m_object = '/';}else{m_host = url.substr(8, pos - 8);m_object = url.substr(pos);}if (m_host.empty())return false;return true; }bool CHttp::init() {//1 請求協議版本WSADATA wsaData;WSAStartup(MAKEWORD(2, 2), &wsaData);if (LOBYTE(wsaData.wVersion) != 2 ||HIBYTE(wsaData.wVersion) != 2) {printf("請求協議版本失敗!\n");return false;}//printf("請求協議成功!\n");//2 創建socketm_socket = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP);if (SOCKET_ERROR == m_socket) {printf("創建socket失敗!\n");WSACleanup();return false;}//printf("創建socket成功!\n");return true; }//連接web服務器 bool CHttp::Connect() {//DNS服務器:將域名解析成IP地址hostent *p = gethostbyname(m_host.c_str());if (p == NULL)return false;SOCKADDR_IN sa;sa.sin_family = AF_INET;sa.sin_port = htons(80);//http的默認端口,https的默認端口443memcpy(&sa.sin_addr, p->h_addr, 4);if (-1 == connect(m_socket, (SOCKADDR*)&sa, sizeof(sa))){cout << "服務器連接失敗" << endl;return false;}else{//cout<<"服務器連接成功"<<endl;return true;} }string CHttp::FetchGet(string url)//通過Get方式獲取網頁 {string html;//解析urlif (false == AnalyseUrl(url)){if (false == AnalyseUrl2(url)){cout << "Html解析失敗" << endl;return "";}}//cout<<"主機名"<<m_host<<"\t\t"<<"資源名"<<m_object<<endl;if (false == init())//初始化套接字{return "";}if (false == Connect())//連接服務器{return "";}//發送Get請求 Get請求數據string request = "GET " + m_object +" HTTP/1.1\r\nHost:" + m_host +"\r\nConnection: Close\r\n\r\n";if (SOCKET_ERROR == 
send(m_socket, request.c_str(), request.size(), 0)){cout << "send request error" << endl;closesocket(m_socket);return "";}//接收數據char ch;while (recv(m_socket, &ch, 1, 0)){html += ch;}return html; } //判斷是否以什么結尾 bool hasEnding(char *& strFull, char*& strEnd) {char * pFull = strFull;while (*pFull != 0)pFull++;char * pEnd = strEnd;while (*pEnd != 0)pEnd++;while (1){pFull--;pEnd--;if (*pEnd == 0){break;}if (*pFull != *pEnd){return false;}}return true; } void CHttp::AnalyseHtml(string html)//解析網頁,獲得圖片地址和其他的鏈接 {int startIndex = 0;int endIndex = 0;//找到所有的圖片for (int pos = 0;pos < html.length();){startIndex = html.find("src=\"", startIndex);if (startIndex == -1){break;}startIndex += 5;endIndex = html.find("\"", startIndex);//找到資源鏈接string src = html.substr(startIndex, endIndex - startIndex);char *src1 = (char *)src.c_str();//cout<<src<<endl;//判斷連接是否是想要的資源 // char *strend = ".jpg";// char* strend = new char[20];// strcpy(strend, ".jpg");char* strend = new char[20]{ ".jpg" };if (hasEnding(src1, strend) == true){/*if(-1!=src.find("t_s960x600c5"))*/if (-1 != src.find("t_s1920x1080c5")){cout << src << endl;//新建一個線程來下載圖片extern queue<string> p;p.push(src);extern void loadImage();CreateThread(NULL, NULL, (LPTHREAD_START_ROUTINE)loadImage,NULL, NULL, NULL);}/*system("pause");*/}startIndex = endIndex + 1;//system("pause");}startIndex = 0;//找到其他URL地址for (int pos = 0;pos < html.length();){startIndex = html.find("href=\"", startIndex);if (startIndex == -1){break;}startIndex += 6;endIndex = html.find("\"", startIndex);//找到資源鏈接string src = html.substr(startIndex, endIndex - startIndex);char *src1 = (char *)src.c_str();//cout<<src<<endl;//判斷連接是否是想要的資源//char *strend = ".html";//char* strend = new char[20];//strcpy(strend, ".html");char* strend = new char[100]{ ".html" };if (hasEnding(src1, strend) == true){if ((-1 != src.find("bizhi") || -1 != src.find("showpic")) && -1 == src.find("http://")){string url = "http://desk.zol.com.cn" + src;extern queue<string> 
q;q.push(url);//cout<<url<<endl;}}startIndex = endIndex + 1;//system("pause");}}總結
以上是生活随笔為你收集整理的C++爬虫项目爬取图片的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: MFC之实现鼠标自动左击,频率可调,支持
- 下一篇: C++实现黑客帝国流星雨效果