Skip to content

Instantly share code, notes, and snippets.

@Terryhung
Created November 23, 2017 16:22
Show Gist options
  • Save Terryhung/3fe1b8d697f85849f0dcf2a70ec1711e to your computer and use it in GitHub Desktop.
Save Terryhung/3fe1b8d697f85849f0dcf2a70ec1711e to your computer and use it in GitHub Desktop.
/**
 * Prepends the inline article stylesheet to an HTML fragment.
 *
 * @param s the HTML fragment to style; must not be {@code null}
 *          ({@link String#concat(String)} throws {@link NullPointerException} for null)
 * @return a new string consisting of the {@code <style>} element followed by {@code s}
 * @throws Exception declared for callers; nothing checked is thrown by this body
 */
public static String AddDocHead(String s) throws Exception {
    // One CSS rule per line; the pieces are compile-time constants that join
    // into the exact single-line <style> element.
    final String styleBlock = "<style> "
            + ".article{padding-left: 18px;padding-right: 18px;padding-top: 15px;word-break: break-all}"
            + ".article-header{font-size: 27px;color: #1a1a1a}"
            + ".article-info{font-size: 16px;color: #c2c2c2;margin-top: 18px;margin-bottom: 18px}"
            + ".article-content{margin-top: 18px;font-size: 22px;color: #818181}"
            + ".main-image{width: 100%;max-height: 500px;margin-top: 20px;margin-bottom: 20px}"
            + "hr{margin-top: 20px;color: #f5f5f5}"
            + " img{width: 100%}"
            + "</style>";
    return styleBlock.concat(s);
}
/**
 * Fetches a fixed CNA news page, extracts the {@code div.news_article} element(s),
 * sanitises them, wraps them in the article template (header, info line, content),
 * prepends the stylesheet via {@link #AddDocHead(String)} and passes the resulting
 * HTML to {@code CreateHTML} as a single-element list.
 *
 * <p>The URL, title, source and date are hard-coded sample values.
 *
 * @throws Exception if the page cannot be fetched, or if {@code AddDocHead} or
 *                   {@code CreateHTML} throws
 */
public static void Parsing() throws Exception {
    String url = "http://www.cna.com.tw/news/aloc/201710030325-1.aspx";
    // Sends a mobile browser User-Agent (presumably to receive the mobile layout — confirm).
    Document doc = Jsoup.connect(url)
            .header("User-Agent", "Mozilla/5.0 (Linux; Android 7.0; SAMSUNG SM-G950U Build/NRD90M) AppleWebKit/537.36 (KHTML, like Gecko) SamsungBrowser/5.2 Chrome/51.0.2704.106 Mobile Safari/537.36")
            .get();

    // Sample metadata shown in the article header and info line.
    String news_title = "苗栗縣鼓勵青年創業苗栗縣鼓勵青年創業苗栗縣鼓勵青年創業";
    String news_source = "CNA";
    String news_source_date = "2017-11-11";

    String target = "div.news_article";
    Elements divs = doc.select(target);

    // Detach scripts and buttons from the document before serialising; divs holds
    // references into the same DOM, so its outerHtml() no longer contains them.
    String remove = "script, button";
    doc.select(remove).remove();

    // Start from the relaxed whitelist and additionally allow the listed tags,
    // class attributes on div/img, and src on img.
    Whitelist wl = Whitelist.relaxed();
    wl.addTags("div", "span", "p", "br", "article", "section", "style");
    wl.addAttributes("div", "class");
    wl.addAttributes("img", "class", "src");

    /* To html */
    // Pass the page URL as base URI: the relaxed whitelist enforces http/https on
    // link and image URLs, so without a base URI relative src/href values are dropped.
    String mProcessedHtml = Jsoup.clean(divs.outerHtml(), url, wl);

    /* Wrap the cleaned article in the template, title first */
    mProcessedHtml = String.format("<div class='article'><div class='article-header'>%s</div><div class='article-info'>%s, %s</div><div class='article-content'>%s</div></div>", news_title, news_source, news_source_date, mProcessedHtml);

    /* Prepend the stylesheet */
    mProcessedHtml = AddDocHead(mProcessedHtml);
    CreateHTML(Arrays.asList(mProcessedHtml));
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment