java抓取网页内容 (三)

2014-11-24 11:10:38 · 作者: · 浏览: 2
Matcher ma = pa.matcher(s);
while (ma.find()) {
list.add(ma.group());
}
return list;
}

/**
 * Removes markup tags from the given text.
 *
 * <p>Every span that starts with {@code <} and runs to the next {@code >} is
 * deleted. The previous pattern ({@code "<.* >"}) was greedy and required a
 * literal space before the closing bracket, so ordinary tags such as
 * {@code <b>} were left in place.
 *
 * @param s the text to strip; must not be {@code null}
 * @return {@code s} with all {@code <...>} spans removed
 */
public String outTag(final String s) {
    // [^>]* stops at the first '>' so each tag is matched individually,
    // and (unlike '.') it also matches tags that span several lines.
    return s.replaceAll("<[^>]*>", "");
}

public static void main(String[] args) {
Test t = new Test();
String content = t.getHtmlContent("http://www.taobao.com");
//content = content.replaceAll("(
)+ ", "\n");// 转化换行
//content = content.replaceAll("

.*

", "");// 去图片注释
System.out.println(content);
System.out.println(t.getTitle(content));
List a = t.getNews(content);
List news = new ArrayList();
for (String s : a) {
news.add(s.replaceAll("<.* >", ""));
}
System.out.println(news);
//…… 获取js、css等操作省略
}
}
后来我想了想,觉得读取本地文件和通过 URL 读取的原理不是一样的嘛。但是我尝试了好多种写法都不行,不是乱码就是报错,我该怎么办?老天就是这样捉弄人,不过功夫不负有心人:当我尝试到第 999 次的时候,突然眼前一亮,成功实现了读取本地 HTML……说真的,代码不难,但是需要多次尝试。下面把我的代码分享给大家:


[java]
/**
 * Reads a local HTML file and returns its content as a single string.
 *
 * <p>The file is decoded as GB2312. Lines are concatenated without their line
 * terminators, so the result contains no line breaks. If an
 * {@link IOException} occurs (including a missing file), the stack trace is
 * printed and whatever was read so far is returned, possibly an empty string.
 *
 * @param filePath path of the local file to read
 * @return the file content with line terminators removed; never {@code null}
 */
public static String getHtmlContent(String filePath) {
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes the reader (and the underlying file stream)
    // on every path; the original never closed it and leaked the handle.
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(filePath), "GB2312"))) {
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line);
        }
    } catch (IOException e) {
        // Best effort: report the failure and return what was read so far.
        e.printStackTrace();
    }
    return sb.toString();
}

/**
 * Reads a local HTML file and returns its content as a single string.
 *
 * <p>The file is decoded as GB2312. Lines are concatenated without their line
 * terminators, so the result contains no line breaks. If an
 * {@link IOException} occurs (including a missing file), the stack trace is
 * printed and whatever was read so far is returned, possibly an empty string.
 *
 * @param filePath path of the local file to read
 * @return the file content with line terminators removed; never {@code null}
 */
public static String getHtmlContent(String filePath) {
    StringBuilder sb = new StringBuilder();
    // try-with-resources closes the reader (and the underlying file stream)
    // on every path; the original never closed it and leaked the handle.
    try (BufferedReader br = new BufferedReader(
            new InputStreamReader(new FileInputStream(filePath), "GB2312"))) {
        String line;
        while ((line = br.readLine()) != null) {
            sb.append(line);
        }
    } catch (IOException e) {
        // Best effort: report the failure and return what was read so far.
        e.printStackTrace();
    }
    return sb.toString();
}