Last active
December 27, 2015 17:49
-
-
Save wppurking/7365373 to your computer and use it in GitHub Desktop.
jsoup 抓取 reddit
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
public static List<Title> parseHtml(String name) { | |
List<Title> lists = new ArrayList<Title>(); | |
//URL | |
String getUrl = "http://www.reddit.com/"; | |
if(name!=null) { | |
getUrl=getUrl+"/search?q="+name; | |
} | |
//标记选取规则 | |
String g1 = ".title >a"; | |
//属性选取规则 | |
String g2 = "abs:href"; | |
try { | |
//try { --- 代码的格式化也是很重要的哦 | |
Document doc = Jsoup.connect(getUrl).timeout(60000).get(); | |
Elements links = doc.select(g1); | |
//int i=0; | |
int i = 0; // -------- 代码规范 | |
for (Element link : links) { | |
// 变量名要尽量有意义 | |
Title redditLink = new Title(); | |
String bookTitle = link.text(); | |
String bookURL = link.attr(g2); | |
redditLink.setTitle(bookTitle); | |
redditLink.setHref(bookURL); | |
lists.add(redditLink); | |
//System.out.println("链接 "+i+++": "+bookURL + " 标题" + bookTitle); | |
// 这个 +i+++ 让我多理解了一下哈 | |
System.out.println("链接 " + i++ + ": " + bookURL + " 标题" + bookTitle); | |
// 我们可以通过 logger 代替 systemout 来查看系统的运行状况 | |
} | |
} catch (IOException e) { | |
System.out.println("抓取异常"); | |
e.printStackTrace(); | |
} | |
return lists; | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment