Skip to content

Instantly share code, notes, and snippets.

@wololock
Created October 25, 2014 19:34
Show Gist options
  • Save wololock/ffd9ef32f7abe3f325b0 to your computer and use it in GitHub Desktop.
Save wololock/ffd9ef32f7abe3f325b0 to your computer and use it in GitHub Desktop.
Extracting paragraphs from a list of links
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Demonstrates how to restrict link scraping to anchors that sit inside a paragraph.
 *
 * <p>A small HTML fragment is parsed with jsoup, every {@code <p>} element that has at least
 * one {@code <a href>} descendant is selected, and the anchors found inside each of those
 * paragraphs are printed to standard output, one paragraph's worth per {@code println} call.
 */
public class ExtractingParentParagraphsFromScrapedLinks {

    /** CSS query matching anchors that carry an {@code href} attribute. */
    private static final String LINK_QUERY = "a[href]";

    /** CSS query matching paragraphs that contain at least one such anchor. */
    private static final String PARAGRAPH_WITH_LINK_QUERY = "p:has(" + LINK_QUERY + ")";

    /**
     * Sample markup: two paragraphs that each wrap a link (one of them nested inside a span)
     * plus one anchor placed directly in the div, outside any paragraph.
     */
    private static final String SAMPLE_HTML =
            "<div class=\"somename\">"
                    + "<p>This is paragraph with <a href=\"home.html\">link</a>. Lorem ipsum</p>"
                    + "<p>This is another paragraph, but <span class=\"highlight\">this one is <a href=\"different.html\">different</a> than</span> the first one</p>"
                    + "<a href=\"no.html\">this wont be scraped</a>"
                    + "</div>";

    /**
     * Parses the sample markup and prints the links of every paragraph that contains one.
     *
     * @param args command-line arguments; not used
     */
    public static void main(String[] args) {
        Document parsedPage = Jsoup.parse(SAMPLE_HTML);
        Elements linkedParagraphs = parsedPage.select(PARAGRAPH_WITH_LINK_QUERY);
        linkedParagraphs.forEach(ExtractingParentParagraphsFromScrapedLinks::printLinks);
    }

    /**
     * Prints the anchors found beneath the given element.
     *
     * @param paragraph element whose {@code a[href]} descendants are written to standard output
     */
    private static void printLinks(Element paragraph) {
        // Pass the Elements object itself so println(Object) renders it via toString().
        Elements anchors = paragraph.select(LINK_QUERY);
        System.out.println(anchors);
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment