Java正则表达式获取网页所有网址和链接文字
/*獲取網址首頁的所有網址和鏈接文字*/
import java.io.*;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.*;
import java.net.MalformedURLException;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.regex.*;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Downloads a web page, cuts out the region between two marker strings and
 * lists every hyperlink (target URL and link text) found inside that region.
 *
 * <p>Typical call order, as used by {@link #main(String[])}:
 * {@link #getStartUrl(String)}, {@link #getUrlContent()},
 * {@link #getContentArea()}, optionally {@link #getStringInUrl(String)} /
 * {@link #getStringNotInUrl(String)}, then {@link #Urls()}.
 *
 * <p>Despite their {@code get} prefix, most methods here are setters or
 * actions; the names are kept so existing callers continue to compile.
 */
public class Urls {
    /** Source text of the anchor rule; also reported by {@link #getRegex()}. */
    private static final String ANCHOR_REGEX = "<a\\b([^>]*)>(.*?)</a\\s*>";

    /**
     * One anchor element: group 1 is the attribute text, group 2 the inner
     * HTML. The word boundary keeps tags such as {@code <abbr>} or
     * {@code <area>} from being mistaken for anchors.
     */
    private static final Pattern ANCHOR =
            Pattern.compile(ANCHOR_REGEX, Pattern.CASE_INSENSITIVE | Pattern.DOTALL);

    /** The href attribute value: double-quoted, single-quoted or bare. */
    private static final Pattern HREF = Pattern.compile(
            "(?<![\\w-])href\\s*=\\s*(?:\"([^\"]*)\"|'([^']*)'|([^\\s\"'>]+))",
            Pattern.CASE_INSENSITIVE);

    /** Any markup tag; used to reduce an anchor's inner HTML to plain text. */
    private static final Pattern TAG = Pattern.compile("<[^>]*>");

    /** A run of whitespace; collapsed to one space in link titles. */
    private static final Pattern WHITESPACE = Pattern.compile("\\s+");

    /** The charset parameter of a Content-Type header value. */
    private static final Pattern CHARSET =
            Pattern.compile("charset\\s*=\\s*\"?([^\\s;\"]+)", Pattern.CASE_INSENSITIVE);

    private String startUrl;                    // address of the page to download
    String urlContent;                          // full text of the downloaded page
    String ContentArea;                         // part of urlContent between the two markers
    private String strAreaBegin, strAreaEnd;    // markers delimiting the region to scan
    private String stringInUrl, stringNotInUrl; // keyword a URL must / must not contain
    String strContent;                          // collected content (not used by this class)
    String[] allUrls;                           // URLs collected by the last call to Urls()
    private String regex = ANCHOR_REGEX;        // collection rule

    UrlAndTitle urlAndTitle = new UrlAndTitle(); // holder for one URL and its title

    /**
     * Demo entry point: lists the links of a fixed page, skipping URLs that
     * contain "google".
     *
     * @param args ignored
     */
    public static void main(String[] args) {
        Urls myurl = new Urls("<body", "/body>");
        myurl.getStartUrl("http://www.zuzwn.com/");
        myurl.getUrlContent();
        myurl.getContentArea();
        myurl.getStringNotInUrl("google");
        myurl.Urls();
    }

    /**
     * Creates a collector that scans the text between two markers.
     *
     * @param strAreaBegin text that precedes the region to scan
     * @param strAreaEnd   text that follows the region to scan
     */
    public Urls(String strAreaBegin, String strAreaEnd) {
        this.strAreaBegin = strAreaBegin;
        this.strAreaEnd = strAreaEnd;
    }

    /**
     * Scans {@link #ContentArea} for anchors and prints, for each accepted
     * link, the anchor markup, its title and its URL, followed by the total.
     *
     * <p>Anchors without an href attribute, and links rejected by the
     * include/exclude keywords, are skipped and not counted. The accepted
     * URLs are also stored in {@link #allUrls}, in document order. A missing
     * content area is treated as empty.
     */
    public void Urls() {
        String area = (ContentArea == null) ? "" : ContentArea;
        List<String> found = new ArrayList<String>();

        Matcher anchor = ANCHOR.matcher(area);
        while (anchor.find()) {
            String url = extractHref(anchor.group(1));
            if (url == null || !isAccepted(url)) {
                continue;
            }
            // Drop nested markup so "<b>Home</b>" is reported as "Home".
            String text = TAG.matcher(anchor.group(2)).replaceAll("");
            String title = WHITESPACE.matcher(text).replaceAll(" ").trim();

            System.out.println(anchor.group());
            System.out.println("標題:" + title);
            System.out.println("網址:" + url);
            System.out.println();
            found.add(url);
        }

        allUrls = found.toArray(new String[0]);
        System.out.println("共有" + found.size() + "個符合結果");
    }

    /**
     * Sets the address of the page to download.
     *
     * @param startUrl absolute URL of the page
     */
    public void getStartUrl(String startUrl) {
        this.startUrl = startUrl;
    }

    /**
     * Downloads the page at the start URL into {@link #urlContent}.
     *
     * <p>The text is decoded with the charset named in the response's
     * Content-Type header, falling back to UTF-8. Line breaks are kept so
     * that tokens on adjacent lines are not fused together. On failure a
     * message and the stack trace are printed and {@code urlContent} is left
     * unchanged.
     */
    public void getUrlContent() {
        StringBuilder page = new StringBuilder();
        try {
            URLConnection connection = new URL(startUrl).openConnection();
            Charset charset = charsetOf(connection.getContentType());
            // try-with-resources closes the reader and the underlying stream.
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), charset))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    page.append(line).append('\n');
                }
            }
            urlContent = page.toString();
        } catch (IOException | RuntimeException e) {
            // Best effort, as before: report the problem and carry on.
            System.out.println("網址文件未能輸出");
            e.printStackTrace();
        }
    }

    /**
     * Copies the part of {@link #urlContent} that lies between the begin and
     * end markers into {@link #ContentArea}.
     *
     * <p>If the begin marker is absent the region starts at the beginning of
     * the page; if the end marker is absent it runs to the end of the page.
     * If no page has been downloaded the region is the empty string.
     */
    public void getContentArea() {
        if (urlContent == null) {
            ContentArea = "";
            return;
        }
        int start = 0;
        if (strAreaBegin != null) {
            int at = urlContent.indexOf(strAreaBegin);
            if (at >= 0) {
                start = at + strAreaBegin.length();
            }
        }
        int end = (strAreaEnd == null) ? -1 : urlContent.indexOf(strAreaEnd, start);
        if (end < 0) {
            end = urlContent.length();
        }
        ContentArea = urlContent.substring(start, end);
    }

    /**
     * Sets the keyword every reported URL must contain.
     *
     * @param stringInUrl required substring; {@code null} or empty disables the check
     */
    public void getStringInUrl(String stringInUrl) {
        this.stringInUrl = stringInUrl;
    }

    /**
     * Sets the keyword no reported URL may contain.
     *
     * @param stringNotInUrl forbidden substring; {@code null} or empty disables the check
     */
    public void getStringNotInUrl(String stringNotInUrl) {
        this.stringNotInUrl = stringNotInUrl;
    }

    /** Placeholder; does nothing. */
    public void getUrl() {
    }

    /**
     * Returns the collection rule.
     *
     * @return the regular expression used to locate anchors
     */
    public String getRegex() {
        return regex;
    }

    /** Returns the href value found in an anchor's attribute text, or null if there is none. */
    private static String extractHref(String attributes) {
        Matcher href = HREF.matcher(attributes);
        if (!href.find()) {
            return null;
        }
        for (int group = 1; group <= 3; group++) {
            if (href.group(group) != null) {
                return href.group(group).trim();
            }
        }
        return null;
    }

    /** Tells whether a URL passes the include and exclude keyword checks. */
    private boolean isAccepted(String url) {
        if (stringInUrl != null && !stringInUrl.isEmpty() && !url.contains(stringInUrl)) {
            return false;
        }
        return stringNotInUrl == null || stringNotInUrl.isEmpty() || !url.contains(stringNotInUrl);
    }

    /** Picks the charset named in a Content-Type value, defaulting to UTF-8. */
    private static Charset charsetOf(String contentType) {
        if (contentType != null) {
            Matcher label = CHARSET.matcher(contentType);
            if (label.find()) {
                try {
                    return Charset.forName(label.group(1));
                } catch (IllegalArgumentException ignored) {
                    // Malformed or unsupported charset label: use the default below.
                }
            }
        }
        return StandardCharsets.UTF_8;
    }

    /** Simple holder pairing a URL with its link title. */
    class UrlAndTitle {
        String myURL;
        String title;
    }
}
轉載于:https://www.cnblogs.com/zuzwn/p/3614978.html
總結
以上是生活随笔為你收集整理的Java正则表达式获取网页所有网址和链接文字的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 5.5 准备创建bean
- 下一篇: /etc/sudoers中的含义