C#winform抓取百度,Google搜索关键词结果
生活随笔
收集整理的這篇文章主要介紹了
C#winform抓取百度,Google搜索关键词结果
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
基于網站SEO的需要,做了一個采集百度和Google搜索關鍵字結果的小工具,在這里與大家分享一下.
先看下效果圖:
代碼附加:
?View Code
/// <summary>
/// Click handler: runs a Baidu search for the keyword typed into
/// <c>txtSearch</c> and binds the parsed results to <c>dataGridView1</c>.
/// The grid is left untouched when the page is empty or reports no hits.
/// </summary>
private void baidu_Click(object sender, EventArgs e)
{
    int num = 100; // number of results requested (sent as the "rn" query parameter)
    string keyword = txtSearch.Text.Trim();

    // Percent-encode the keyword so characters such as '&', '#' or '+'
    // cannot truncate or corrupt the query string.
    string url = "http://www.baidu.com/s?wd=" + Uri.EscapeDataString(keyword) + "&rn=" + num;
    string html = search(url, "gb2312");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    BaiduSearch baidu = new BaiduSearch();
    int count = baidu.GetSearchCount(html); // hit count reported by the page
    if (count > 0)
    {
        List<Keyword> keywords = baidu.GetKeywords(html, keyword);
        dataGridView1.DataSource = keywords;
    }
}
18?
/// <summary>
/// Click handler: runs a Google search for the keyword typed into
/// <c>txtSearch</c> and binds the parsed results to <c>dataGridView1</c>.
/// </summary>
private void google_Click(object sender, EventArgs e)
{
    int num = 100; // number of results requested (sent as the "num" query parameter)
    string keyword = txtSearch.Text.Trim();

    // Percent-encode the keyword so characters such as '&', '#' or '+'
    // cannot truncate or corrupt the query string.
    string url = "http://www.google.com.hk/search?hl=zh-CN&source=hp&q=" + Uri.EscapeDataString(keyword) + "&aq=f&aqi=&aql=&oq=&num=" + num;
    string html = search(url, "utf-8");
    if (string.IsNullOrEmpty(html))
    {
        return;
    }

    googleSearch google = new googleSearch();
    List<Keyword> keywords = google.GetKeywords(html, keyword);
    dataGridView1.DataSource = keywords;
}
/// <summary>
/// Downloads a page with an HTTP GET and returns its decoded, cleaned-up HTML.
/// </summary>
/// <param name="url">Absolute URL of the search page.</param>
/// <param name="Chareset">Name of the encoding used to decode the response body (callers pass "gb2312" or "utf-8").</param>
/// <returns>
/// The HTML produced by <c>readResponseStream</c>. If the request or the
/// decoding throws, the exception text (<c>ex.ToString()</c>) is returned
/// instead, so a non-empty return value does not by itself mean success.
/// </returns>
public string search(string url, string Chareset)
{
    HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(url);
    myHttpWebRequest.UseDefaultCredentials = true;
    myHttpWebRequest.ContentType = "text/html";
    myHttpWebRequest.UserAgent = "Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215;)";
    myHttpWebRequest.Method = "GET";
    myHttpWebRequest.CookieContainer = new CookieContainer();

    try
    {
        // Dispose the response so the underlying connection is released
        // even when reading or decoding the body fails.
        using (HttpWebResponse response = (HttpWebResponse)myHttpWebRequest.GetResponse())
        {
            // Read the HTML source from the response stream and format it.
            return readResponseStream(response, Chareset);
        }
    }
    catch (Exception ex)
    {
        return ex.ToString();
    }
}
/// <summary>
/// Reads the entire response body, decodes it with the named encoding and
/// returns the text after it has been passed through <c>formatHTML</c>.
/// </summary>
/// <param name="response">Response whose body stream is read to the end.</param>
/// <param name="Chareset">Encoding name handed to <c>Encoding.GetEncoding</c>.</param>
/// <returns>The formatted page text.</returns>
public string readResponseStream(HttpWebResponse response, string Chareset)
{
    Stream body = response.GetResponseStream();
    Encoding bodyEncoding = Encoding.GetEncoding(Chareset);

    using (StreamReader reader = new StreamReader(body, bodyEncoding))
    {
        string rawHtml = reader.ReadToEnd();
        return formatHTML(rawHtml);
    }
}
/// <summary>
/// Normalizes downloaded page source before it is pattern-matched:
/// removes the <c>&amp;raquo;</c>, <c>&amp;nbsp;</c> and <c>&amp;copy;</c>
/// entities, strips carriage returns, tabs and line feeds so the markup
/// becomes a single line, and decodes <c>&amp;amp;</c> to <c>&amp;</c>.
/// </summary>
/// <param name="htmlContent">Raw page source; may be null or empty.</param>
/// <returns>The cleaned text, or an empty string for null/empty input.</returns>
public string formatHTML(string htmlContent)
{
    if (string.IsNullOrEmpty(htmlContent))
    {
        return string.Empty;
    }

    return htmlContent
        .Replace("&raquo;", "")
        .Replace("&nbsp;", "")
        .Replace("&copy;", "")
        // Real control characters, not the two-character sequences "/r", "/t",
        // "/n": stripping those would eat parts of URLs such as "/news".
        .Replace("\r", "")
        .Replace("\t", "")
        .Replace("\n", "")
        // Decode &amp; last so a literal "&amp;nbsp;" in the page is not
        // turned into an entity and then removed.
        .Replace("&amp;", "&");
}
把百度和Google兩個類抽取了出來
1.百度Search類
?View Code
/// <summary>
/// Parses a Baidu result page: extracts the hit count reported by the page
/// and the title/link pair of every result heading.
/// </summary>
class BaiduSearch
{
    protected string uri = "http://www.baidu.com/s?wd=";
    protected Encoding queryEncoding = Encoding.GetEncoding("gb2312");
    protected Encoding pageEncoding = Encoding.GetEncoding("gb2312");

    // Matches the digits (with thousands separators) between the phrase
    // "找到相关结果[约]" and "个" in the page's result summary. The literal
    // must be Simplified Chinese because that is what Baidu serves;
    // a Traditional Chinese literal can never match.
    protected string resultPattern = @"(?<=找到相关结果[约]?)[0-9,]*?(?=个)";

    // One result heading: <h3 class="t"><a ... href="URL" ...>TITLE</a>
    private const string ResultItemPattern = "<h3 class=\"t\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>";

    /// <summary>
    /// Returns the number of results the page says it found.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <returns>
    /// The parsed count, or 0 when the summary text is absent or the number
    /// cannot be parsed as an <c>int</c>.
    /// </returns>
    public int GetSearchCount(string html)
    {
        if (string.IsNullOrEmpty(html))
        {
            return 0;
        }

        Match match = Regex.Match(html, resultPattern);
        if (!match.Success)
        {
            return 0;
        }

        // Drop thousands separators before parsing; TryParse leaves 0 on failure.
        int result;
        int.TryParse(match.Value.Replace(",", string.Empty), out result);
        return result;
    }

    /// <summary>
    /// Extracts every result heading from the page as a <c>Keyword</c>.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">The searched keyword. Not used by the parser; kept for callers.</param>
    /// <returns>Results numbered from 1 in page order; empty when nothing matches.</returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        int i = 1;
        foreach (Match mTable in Regex.Matches(html, ResultItemPattern))
        {
            Keyword keyword = new Keyword();
            keyword.ID = i++;
            // The anchor text may contain markup such as <em>; keep only the text.
            keyword.Title = Regex.Replace(mTable.Groups["content"].Value, "<[^>]*>", string.Empty);
            keyword.Link = mTable.Groups["url"].Value;
            keywords.Add(keyword);
        }

        return keywords;
    }
}
2.GoogleSearch類
?View Code
/// <summary>
/// Parses a Google result page into title/link pairs.
/// </summary>
class googleSearch
{
    // One result heading: <h3 class="r"><a ... href="URL" ...>TITLE</a>
    // Built once instead of on every call.
    private static readonly Regex ResultItemRegex = new Regex(
        "<h3 class=\"r\"><a.*?href=\"(?<url>.*?)\".*?>(?<content>.*?)</a>",
        RegexOptions.IgnoreCase);

    /// <summary>
    /// Extracts every result heading from the page as a <c>Keyword</c>.
    /// </summary>
    /// <param name="html">Page source; may be null or empty.</param>
    /// <param name="word">The searched keyword. Not used by the parser; kept for callers.</param>
    /// <returns>Results numbered from 1 in page order; empty when nothing matches.</returns>
    public List<Keyword> GetKeywords(string html, string word)
    {
        List<Keyword> keywords = new List<Keyword>();
        if (string.IsNullOrEmpty(html))
        {
            return keywords;
        }

        int i = 1;
        foreach (Match mTable in ResultItemRegex.Matches(html))
        {
            Keyword keyword = new Keyword();
            keyword.ID = i++;
            // The anchor text may contain markup such as <em>; keep only the text.
            keyword.Title = Regex.Replace(mTable.Groups["content"].Value, "<[^>]*>", string.Empty);
            keyword.Link = mTable.Groups["url"].Value;
            keywords.Add(keyword);
        }

        return keywords;
    }
}
忘了,還有個導出Excel.這個友友們應該都有自己的方法,我這里就簡單寫了一個Excel導出,也貼出來吧.
/// <summary>
/// Asks the user for a target file and writes the grid to it as
/// tab-separated text (one header line, then one line per row), encoded as
/// gb2312 and saved with an .xls extension.
/// </summary>
/// <param name="dataGridview1">Grid whose column headers and cell values are exported.</param>
public void ExportDataGridViewToExcel(DataGridView dataGridview1)
{
    using (SaveFileDialog saveFileDialog = new SaveFileDialog())
    {
        saveFileDialog.Filter = "Execl  files  (*.xls)|*.xls";
        saveFileDialog.FilterIndex = 0;
        saveFileDialog.RestoreDirectory = true;
        saveFileDialog.CreatePrompt = true;
        saveFileDialog.Title = "導出Excel文件";

        // Default file name is a timestamp, e.g. 20111107-153045.
        saveFileDialog.FileName = DateTime.Now.ToString("yyyyMMdd-HHmmss", System.Globalization.CultureInfo.InvariantCulture);

        // Do nothing when the user cancels; otherwise OpenFile would still
        // create a file under the pre-filled default name.
        if (saveFileDialog.ShowDialog() != DialogResult.OK)
        {
            return;
        }

        using (Stream myStream = saveFileDialog.OpenFile())
        using (StreamWriter sw = new StreamWriter(myStream, System.Text.Encoding.GetEncoding("gb2312")))
        {
            try
            {
                // Header line.
                string[] headers = new string[dataGridview1.ColumnCount];
                for (int i = 0; i < headers.Length; i++)
                {
                    headers[i] = dataGridview1.Columns[i].HeaderText;
                }
                sw.WriteLine(string.Join("\t", headers));

                // One line per grid row.
                for (int j = 0; j < dataGridview1.Rows.Count; j++)
                {
                    string[] cells = new string[dataGridview1.Columns.Count];
                    for (int k = 0; k < cells.Length; k++)
                    {
                        // Empty cells have a null Value; export them as empty text.
                        object value = dataGridview1.Rows[j].Cells[k].Value;
                        cells[k] = value == null ? string.Empty : value.ToString();
                    }
                    sw.WriteLine(string.Join("\t", cells));
                }

                // Flush before reporting success so a write failure is
                // surfaced by the catch below rather than during disposal.
                sw.Flush();
                MessageBox.Show("導出成功");
            }
            catch (Exception e)
            {
                MessageBox.Show(e.ToString());
            }
        }
    }
}
我把HttpState類也貼出來.有需要demo的可以發郵件給我,或者留下郵箱.
HttpState.cs
/// <summary>
/// Plain state holder for one HTTP exchange: URLs, cookies, page source,
/// captcha ("validation code") details and a bag of extra parameters.
/// </summary>
class HttpState
{
    private CookieContainer _cookieContainer = new CookieContainer();
    private string _tmpValCodeFileName = "emptyPic.gif";
    private Hashtable _otherParams = new Hashtable();

    /// <summary>
    /// Status description text.
    /// </summary>
    public string StatusDescription { get; set; }

    /// <summary>
    /// Callback URL, used in login tests.
    /// </summary>
    public string CallBackUrl { get; set; }

    /// <summary>
    /// Page URL, in absolute form.
    /// </summary>
    public string Url { get; set; }

    /// <summary>
    /// Cookie information as a string.
    /// </summary>
    public string Cookies { get; set; }

    /// <summary>
    /// Cookie information. Starts out as a new, empty container.
    /// </summary>
    public CookieContainer CookieContainer
    {
        get { return _cookieContainer; }
        set { _cookieContainer = value; }
    }

    /// <summary>
    /// Page source.
    /// </summary>
    public string Html { get; set; }

    /// <summary>
    /// Temporary captcha image file (absolute path).
    /// </summary>
    public string TmpValCodePic { get; set; }

    /// <summary>
    /// Temporary captcha image file name (relative path). Defaults to "emptyPic.gif".
    /// </summary>
    public string TmpValCodeFileName
    {
        get { return _tmpValCodeFileName; }
        set { _tmpValCodeFileName = value; }
    }

    /// <summary>
    /// Whether a captcha is present.
    /// </summary>
    public bool IsValCode { get; set; }

    /// <summary>
    /// Captcha URL.
    /// </summary>
    public string ValCodeURL { get; set; }

    /// <summary>
    /// Captcha value after recognition.
    /// </summary>
    public string ValCodeValue { get; set; }

    /// <summary>
    /// Other parameters. Starts out as a new, empty table.
    /// </summary>
    public Hashtable OtherParams
    {
        get { return _otherParams; }
        set { _otherParams = value; }
    }

    /// <summary>
    /// Stores <paramref name="value"/> under <paramref name="key"/>, replacing
    /// any value already stored for that key.
    /// </summary>
    public void addOtherParam(object key, object value)
    {
        // The indexer setter adds the key when absent and overwrites it when
        // present, so no separate ContainsKey lookup is needed.
        this.OtherParams[key] = value;
    }

    /// <summary>
    /// Removes the entry stored under <paramref name="key"/>, if any.
    /// </summary>
    public void removeOtherParam(object key)
    {
        this.OtherParams.Remove(key);
    }

    /// <summary>
    /// Returns the value stored under <paramref name="key"/>, or null when absent.
    /// </summary>
    public object getOtherParam(object key)
    {
        return this.OtherParams[key];
    }
}
Keyword.cs
?
?
class Keyword{public int ID { get; set; }public string Title { get; set; }public string Link { get; set; }}
鑒于大家都需要demo,今天就整理一下發上來.添加了導出word,導出excel功能.暈...木找到怎么放文件路徑進來....有需要的可以email我.
?
?
?
?
?
?
?
?
?
轉載于:https://www.cnblogs.com/liguanghui/archive/2011/11/07/2239161.html
總結
以上是生活随笔為你收集整理的C#winform抓取百度,Google搜索关键词结果的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: JSON serializing and
- 下一篇: [原]win32 rundll32 应用