对网站的代码采集实例
生活随笔
收集整理的這篇文章主要介紹了
对网站的代码采集实例
小編覺得挺不錯的,現在分享給大家,幫大家做個參考.
1.采集的網站:http://www.abnova.com/support/publication.asp
2. 相關的代碼:列表(規則采集)頁面使用:phpQuery.php,可以參考:PHP curl_setopt函數用法介紹中篇
3.產品詳情頁面:信息(不規則采集),參考代碼如下:
<?php
/**
 * Scrapes a single product-detail page and appends the extracted fields as
 * one row to list-new.csv.
 *
 * The page URL is taken from list.txt: the line whose 0-based index equals
 * the ?id= query parameter (default 1, so line 0 is only processed when
 * ?id=0 is requested explicitly). After the page has been processed, the
 * emitted <script> redirects the browser to ?id=<id + 1>, so the whole list
 * is crawled one request at a time until the index runs past the end of
 * list.txt and 'finished' is printed.
 *
 * Requires the phpQuery library (phpQuery/phpQuery.php).
 */
header('Content-Type:text/html;charset=UTF-8');
include 'phpQuery/phpQuery.php';
set_time_limit(0);

/**
 * Removes markup from an HTML fragment and trims surrounding whitespace.
 *
 * @param mixed  $html        Fragment as returned by ->html(); cast to string
 *                            so an empty selection yields ''.
 * @param string $allowedTags Tags that strip_tags() should keep.
 * @return string
 */
function scrape_clean_text($html, $allowedTags = '')
{
    return trim(strip_tags((string) $html, $allowedTags));
}

/**
 * Reads the value of a "caption / value" box identified by an element id.
 *
 * The box is only trusted when its <b> caption equals $caption; otherwise
 * the page layout differs from what is expected and '' is returned.
 *
 * @param mixed  $li          Node of the current #sub_product_info element.
 * @param string $boxId       Element id of the box (without the leading '#').
 * @param string $caption     Expected caption text, including the colon.
 * @param string $allowedTags Tags to keep in the returned value.
 * @return string
 */
function scrape_box_value($li, $boxId, $caption, $allowedTags = '')
{
    $found = scrape_clean_text(pq($li)->find('#' . $boxId . ' b')->html());
    if ($found !== $caption) {
        return '';
    }

    return scrape_clean_text(pq($li)->find('#' . $boxId . ' li')->eq(1)->html(), $allowedTags);
}

/**
 * Reads the first <li> of the n-th <ul> inside the third ".part" section.
 *
 * @param mixed $li      Node of the current #sub_product_info element.
 * @param int   $ulIndex 0-based index of the <ul> within the section.
 * @return string
 */
function scrape_application_text($li, $ulIndex)
{
    return scrape_clean_text(
        pq($li)->find(".part")->eq(2)->find("ul")->eq($ulIndex)->find("li")->eq(0)->html()
    );
}

/**
 * Reads one "caption / value" row of the fourth ".part" section.
 *
 * @param mixed  $li         Node of the current #sub_product_info element.
 * @param int    $ulIndex    0-based index of the <ul> holding the row.
 * @param string $caption    Expected <b> caption of the row's first <li>.
 * @param bool   $insideLink True when the value is the content of an <a>
 *                           inside the second <li>, false when it is the
 *                           second <li> itself.
 * @return string '' when the caption does not match.
 */
function scrape_gene_value($li, $ulIndex, $caption, $insideLink)
{
    $found = scrape_clean_text(
        pq($li)->find(".part")->eq(3)->find("ul")->eq($ulIndex)->find("li")->eq(0)->find("b")->html()
    );
    if ($found !== $caption) {
        return '';
    }

    $cell = pq($li)->find(".part")->eq(3)->find("ul")->eq($ulIndex)->find("li")->eq(1);
    if ($insideLink) {
        $cell = $cell->find("a");
    }

    return scrape_clean_text($cell->html());
}

$id = isset($_GET['id']) ? intval($_GET['id']) : 1;

$listArr = file('list.txt');
if ($listArr === false) {
    // Without this guard array_key_exists() receives false instead of an array.
    echo 'cannot read list.txt';
    exit;
}
if (!array_key_exists($id, $listArr)) {
    echo 'finished';
    exit;
}

$url = preg_replace('/[\r\n]+/', '', $listArr[$id]);

// The product code is whatever follows the first '=' in the URL; fall back
// to '' instead of reading a missing index when the URL has no query value.
$urlParts = explode('=', $url);
$code = isset($urlParts[1]) ? $urlParts[1] : '';

phpQuery::newDocumentFile($url);
$artList = pq("#sub_product_info");

// CSV column => array(element id, expected caption)
$boxFields = array(
    'Description' => array('10000', 'Product Description:'),
    'Immunogen'   => array('90000', 'Immunogen:'),
    'Host'        => array('110000', 'Host:'),
    'Reactivity'  => array('130000', 'Reactivity:'),
    'Isotype'     => array('240000', 'Isotype:'),
);

// <ul> index inside the third ".part" section => label printed for it
$applicationLabels = array(
    1 => 'Western',
    2 => 'Western Blot',
    3 => 'Immunohistochemistry',
    4 => 'Immunofluorescence',
    5 => 'Sandwich ELISA',
    6 => 'ELISA',
);

// CSV column => array(<ul> index, expected caption, value is inside an <a>)
$geneFields = array(
    'GeneID'             => array(1, 'Entrez GeneID:', true),
    'GeneBank Accession' => array(2, 'GeneBank Accession#:', true),
    'Protein Accession'  => array(3, 'Protein Accession#:', true),
    'Gene Name'          => array(4, 'Gene Name:', false),
    'Gene Alias'         => array(5, 'Gene Alias:', false),
    'Omim ID'            => array(7, 'Omim ID:', false),
    'Gene Ontology'      => array(8, 'Gene Ontology:', true),
);

echo '<pre>';
foreach ($artList as $li) {
    // Insertion order of $data is the column order of the CSV row.
    $data = array();
    $data['code'] = $code;

    // Simple caption/value boxes.
    foreach ($boxFields as $column => $spec) {
        $value = scrape_box_value($li, $spec[0], $spec[1]);
        echo $spec[1] . ' ' . $value;
        $data[$column] = $value;
        echo '<br/>';
    }

    // Quality control: keep <br> so that only the text before the first
    // triple line break is retained.
    $qcRaw = scrape_box_value($li, '290000', 'Quality Control Testing:', '<br>');
    $qcParts = explode("<br><br><br>", $qcRaw);
    $qcText = trim($qcParts[0]);
    echo "Quality Control Testing: " . $qcText;
    $data['Testing'] = $qcText;
    echo '<br/>';
    echo '<hr/>';

    // Applications section: heading first, then one entry per <ul>.
    $appHeading = scrape_clean_text(
        pq($li)->find(".part")->eq(2)->find(".first_title b")->html()
    );
    echo "APP: " . $appHeading;
    echo '<br/>';

    $app = array();
    foreach ($applicationLabels as $ulIndex => $label) {
        $text = scrape_application_text($li, $ulIndex);
        echo $label . ': ' . $text;
        echo '<br/>';
        if ($text !== '') {
            $app['w' . $ulIndex] = $text;
        }
    }
    $appstr = implode(',', $app);
    echo $appstr;
    $data['app'] = $appstr;
    echo '<hr/>';

    echo '<hr/>';
    echo '<br/>';

    // Gene information rows; a <br/> separates the rows but does not follow
    // the last one.
    $separator = '';
    foreach ($geneFields as $column => $spec) {
        $value = scrape_gene_value($li, $spec[0], $spec[1], $spec[2]);
        echo $separator . $spec[1] . ' ' . $value;
        $data[$column] = $value;
        $separator = '<br/>';
    }

    // Collect the publication reference: the text between "Publication
    // Reference" and the next "Applications". Stays '' when the section is
    // absent instead of reading an undefined match index.
    $Reference = '';
    if (preg_match('/Publication Reference/', $li->textContent)) {
        $flatText = preg_replace('/[\r\n]+/', '', $li->textContent);
        $match = array();
        if (preg_match('/Publication Reference(.*?)Applications/', $flatText, $match)) {
            $Reference = $match[1];
        }
    }
    $data['Reference'] = $Reference;

    print_r($data);

    // Write the row to the output file; stop the crawl if it cannot be
    // opened so that no page is skipped silently.
    $handle = fopen('list-new.csv', 'a');
    if ($handle === false) {
        echo 'cannot open list-new.csv';
        exit;
    }
    fputcsv($handle, $data);
    fclose($handle);
}
?>
<script>
function JumpUrl(){
location.href='?id=<?php echo ($id+1);?>';
}
setTimeout('JumpUrl()',0);
</script>
總結
以上是生活随笔為你收集整理的对网站的代码采集实例的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: jenkins调整jdk版本不生效的解决
- 下一篇: 广播接收者