HTTPCLIENT抓取网页内容

2014-11-24 10:21:31 · 作者: · 浏览: 0

通过httpclient抓取网页信息。


public class SnippetHtml{ 
     
    /**
     * 通过url获取网站html
     * @param url 网站url
     */ 
    public String parseHtml (String url) { 
        // 测试HttpClient用法  
        HttpClient client=new HttpClient(); 
        //设置代理服务器地址和端口  
        HttpMethod method = null; 
        String html = ""; 
        try { 
            method = new GetMethod(url); 
            client.executeMethod(method); 
            html = method.getResponseBodyAsString();//获取网页内容  
        } catch (HttpException e) { 
            // TODO Auto-generated catch block  
            e.printStackTrace(); 
        } catch (IOException e) { 
            // TODO Auto-generated catch block  
            e.printStackTrace(); 
        } finally { 
            //释放连接  
            if (method != null) { 
                method.releaseConnection();   
            } 
        } 
        return html ; 
    } 
     
    /**
     * 解析html获取地震bean
     * @param html 解析网页html
     * @return List 
     */ 
    public void getHtmlEarthBean (String html) { 
        if (html != null && !"".equals(html)) { 
            Document doc = Jsoup.parse(html);    
            Elements linksElements = doc.getElementsByAttributeva lue("class", "news-table");//获取class名字为 news-table  
            for (Element ele : linksElements) { 
                Elements linksElements1 = ele.getElementsByTag("td");//获取网页td的标签元素  
                for (Element ele1 : linksElements1) { 
                    System.out.println(ele1.text()); 
                } 
            }    
        } 
    } 
} 

public class SnippetHtml{
 
 /**
  * 通过url获取网站html
  * @param url 网站url
  */
 public String parseHtml (String url) {
  // 测试HttpClient用法
  HttpClient client=new HttpClient();
  //设置代理服务器地址和端口
  HttpMethod method = null;
  String html = "";
  try {
   method = new GetMethod(url);
   client.executeMethod(method);
   html = method.getResponseBodyAsString();//获取网页内容
  } catch (HttpException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  } catch (IOException e) {
   // TODO Auto-generated catch block
   e.printStackTrace();
  } finally {
   //释放连接
   if (method != null) {
    method.releaseConnection(); 
   }
  }
  return html ;
 }
 
 /**
  * 解析html获取地震bean
  * @param html 解析网页html
  * @return List
  */
 public void getHtmlEarthBean (String html) {
  if (html != null && !"".equals(html)) {
   Document doc = Jsoup.parse(html);  
   Elements linksElements = doc.getElementsByAttributeva lue("class", "news-table");//获取class名字为 news-table
   for (Element ele : linksElements) {
    Elements linksElements1 = ele.getElementsByTag("td");//获取网页td的标签元素
    for (Element ele1 : linksElements1) {
     System.out.println(ele1.text());
    }
   }  
  }
 }
}

需要下载两个 jar 包:commons-httpclient-3.1.jar(用于抓取网页)和 jsoup-1.6.1.jar(用于解析 HTML)。注意 commons-httpclient 3.1 运行时还依赖 commons-logging 和 commons-codec。