博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
抓取HTML网页数据
阅读量:5219 次
发布时间:2019-06-14

本文共 5956 字,大约阅读时间需要 19 分钟。

(转)htmlparse filter使用

该类并不是一个通用的工具类,需要根据自己的需求加以修改;这里只记录了 htmlparser.jar 包的一些用法,仅此而已!

详细看这里:

 

import java.util.*;   import org.htmlparser.Node;   import org.htmlparser.NodeFilter;   import org.htmlparser.Parser;   import org.htmlparser.filters.AndFilter;   import org.htmlparser.filters.HasAttributeFilter;   import org.htmlparser.filters.NodeClassFilter;   import org.htmlparser.filters.TagNameFilter;   import org.htmlparser.tags.BodyTag;   import org.htmlparser.tags.LinkTag;   import org.htmlparser.util.NodeList;   import org.htmlparser.util.ParserException;       public class HtmlparseUtil {       WebHttpClient util=new WebHttpClient();             public Map
linkGet(String url, String charset) { String content=util.getWebContentByGet(url,charset); Map
linkMap = new HashMap
(); try { //开始解析 Parser parser = Parser.createParser(content, charset); // 过滤出
标签 NodeFilter linkFilter = new NodeClassFilter(LinkTag.class); NodeList list = parser.extractAllNodesThatMatch(linkFilter); Node node = null; for (int i = 0; i < list.size(); i++) { node = list.elementAt(i); // 获得网页中的链接map(href,text) linkMap.put(((LinkTag) node).getLink(), this.processText(((LinkTag) node).getLinkText())); } } catch (ParserException e) { e.printStackTrace(); } return linkMap; } public String bodyGet(String url, String charset) { String content=util.getWebContentByGet(url,charset); String body = ""; try { Parser parser = Parser.createParser(content, charset); // 过滤标签 NodeFilter bodyFilter = new NodeClassFilter(BodyTag.class); NodeList list = parser.extractAllNodesThatMatch(bodyFilter); Node node = null; for (int i = 0; i < list.size(); i++) { node = list.elementAt(i); // 获得网页内容 保存在content中 body = ((BodyTag) node).getBody(); } } catch (ParserException e) { e.printStackTrace(); } return body; } public Map
termGet(String url, String charset) { String content=util.getWebContentByGet(url,charset); Map
map = new HashMap
(); try { //开始解析 // 过滤出class为term的
元素 Parser parser = Parser.createParser(content, charset); AndFilter filter = new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","term")); Node node = null; NodeList nodeList = parser.parse(filter); for (int i = 0; i < nodeList.size(); i++) { node = nodeList.elementAt(i); map.put("term", node.toPlainTextString()); } // 过滤出class为start-time的元素 Parser parser2 = Parser.createParser(content, charset); AndFilter filter2 = new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","start-time")); NodeList nodeList2 = parser2.parse(filter2); for (int i = 0; i < nodeList2.size(); i++) { node = nodeList2.elementAt(i); map.put("start-time", node.toPlainTextString()); } // 过滤出id为J_SingleEndTimeLabel的元素 Parser parser3 = Parser.createParser(content, charset); AndFilter filter3 = new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("id","J_SingleEndTimeLabel")); NodeList nodeList3 = parser3.parse(filter3); for (int i = 0; i < nodeList3.size(); i++) { node = nodeList3.elementAt(i); map.put("end-time", node.toPlainTextString()); } // 过滤出class为box post的
元素 Parser parser4 = Parser.createParser(content, charset); AndFilter filter4 = new AndFilter(new TagNameFilter("div"),new HasAttributeFilter("class","box post")); NodeList nodeList4 = parser4.parse(filter4); for (int i = 0; i < nodeList4.size(); i++) { node = nodeList4.elementAt(i); String temp=node.toPlainTextString().trim(); temp=temp.substring(10,20).trim(); map.put("pre-term", temp); } // 过滤出class为J_AwardNumber的
元素 Parser parser5 = Parser.createParser(content, charset); // AndFilter filter5 = // new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","J_AwardNumber")); NodeList nodeList5 = parser5.parse(new HasAttributeFilter("class","J_AwardNumber")); StringBuffer buffer=new StringBuffer(); for (int i = 0; i < nodeList5.size(); i++) { node = nodeList5.elementAt(i); buffer.append(","+node.toPlainTextString()); } buffer.append("|"); // 过滤出class为blue J_AwardNumber的元素 Parser parser6 = Parser.createParser(content, charset); // AndFilter filter6 = // new AndFilter(new TagNameFilter("span"),new HasAttributeFilter("class","blue J_AwardNumber")); NodeList nodeList6 = parser6.parse(new HasAttributeFilter("class","blue J_AwardNumber")); for (int i = 0; i < nodeList6.size(); i++) { node = nodeList6.elementAt(i); buffer.append(node.toPlainTextString()+","); } map.put("numbers", buffer.toString()); } catch (ParserException e) { // TODO Auto-generated catch block e.printStackTrace(); } return map; } private String processText(String content){ content=content.trim().replaceAll(" ", ""); // content=content.replaceAll("

", "\n"); // content=content.replaceAll("", ""); // content=content.replaceAll("

", ""); // content=content.replaceAll("", ""); // content=content.replaceAll("", ""); return content; } public static void main(String[] str) { String url="http://caipiao.taobao.com/lottery/order/lottery_dlt.htm?type=1"; HtmlparseUtil util=new HtmlparseUtil(); Map
map=util.termGet(url, "gb2312"); System.out.println("term="+map.get("term"));//
10074 System.out.println("start-time="+map.get("start-time"));// System.out.println("end-time="+map.get("end-time"));// System.out.println("pre-term="+map.get("pre-term"));// System.out.println("numbers="+map.get("numbers"));// } }

 

转载于:https://www.cnblogs.com/shide/p/3394957.html

你可能感兴趣的文章
springSide部署出现AnnotationConfigUtils.processCommonDefinitionAnnotations(…) is not public!
查看>>
ThinkPHP隐藏入口文件的配置方法
查看>>
20180708-Java变量类型
查看>>
CSS编写技巧
查看>>
linux中fork()函数详解 ...
查看>>
12个常用的js正则表达式
查看>>
win7:你需要来自Administrators的权限才能对此文件进行修改的一个文件
查看>>
正则表达式划分CSV
查看>>
笔记 2、分布式系统基础设施
查看>>
git
查看>>
Cisco设备管理
查看>>
【ACM从零开始】LeetCode OJ-Delete Node in a Linked List
查看>>
把ajax包装成promise的形式(3)
查看>>
手指长按事件
查看>>
简单斐波那契
查看>>
Linux内核TC工具链路带宽设计--无类队列规定
查看>>
LAMP环境搭建
查看>>
筹集资金只为那一个梦
查看>>
chkdsk 命令对Raid盘检测和查错、修复
查看>>
什么是文件系统?
查看>>