Skip to content

Instantly share code, notes, and snippets.

@wppurking
Last active December 27, 2015 17:49
Show Gist options
  • Save wppurking/7365373 to your computer and use it in GitHub Desktop.
Save wppurking/7365373 to your computer and use it in GitHub Desktop.
jsoup 抓取 reddit
/**
 * Fetches the Reddit front page, or the Reddit search results for {@code name},
 * and collects every element matched by the {@code .title >a} selector.
 *
 * <p>Each collected link is also printed to standard output. If fetching the
 * page fails with an {@link IOException}, the failure is reported on the
 * console and the (possibly empty) list collected so far is returned.
 *
 * @param name search term; when {@code null} the front page is fetched instead
 *             of running a search
 * @return the collected links, in the order the selector returned them;
 *         never {@code null}
 */
public static List<Title> parseHtml(String name) {
    List<Title> lists = new ArrayList<Title>();
    // CSS selector for the anchors to collect.
    String linkSelector = ".title >a";
    // jsoup's "abs:" attribute prefix resolves the href against the document's base URI.
    String hrefAttribute = "abs:href";
    try {
        String getUrl = "http://www.reddit.com/";
        if (name != null) {
            // The base URL already ends with '/', so the path is appended without a
            // second slash. The term is percent-encoded so that spaces, '&', '#',
            // and non-ASCII characters cannot break or truncate the query string.
            // The fully-qualified name avoids requiring a new import; the checked
            // UnsupportedEncodingException is an IOException and is handled below.
            getUrl = getUrl + "search?q=" + java.net.URLEncoder.encode(name, "UTF-8");
        }
        Document doc = Jsoup.connect(getUrl).timeout(60000).get();
        Elements links = doc.select(linkSelector);
        int i = 0;
        for (Element link : links) {
            String linkTitle = link.text();
            String linkHref = link.attr(hrefAttribute);
            Title redditLink = new Title();
            redditLink.setTitle(linkTitle);
            redditLink.setHref(linkHref);
            lists.add(redditLink);
            // NOTE(review): console output kept for compatibility; a logger would be
            // the better choice if one is available in this project.
            System.out.println("链接 " + i++ + ": " + linkHref + " 标题" + linkTitle);
        }
    } catch (IOException e) {
        System.out.println("抓取异常");
        e.printStackTrace();
    }
    return lists;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment