Skip to content

Instantly share code, notes, and snippets.

@saasindustries
Created January 12, 2021 15:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save saasindustries/679b70bea866feac3cafe79f24fbef2b to your computer and use it in GitHub Desktop.
Save saasindustries/679b70bea866feac3cafe79f24fbef2b to your computer and use it in GitHub Desktop.
import java.io.IOException;
import java.net.MalformedURLException;
import com.gargoylesoftware.htmlunit.BrowserVersion;
import com.gargoylesoftware.htmlunit.FailingHttpStatusCodeException;
import com.gargoylesoftware.htmlunit.WebClient;
import com.gargoylesoftware.htmlunit.html.DomNode;
import com.gargoylesoftware.htmlunit.html.DomNodeList;
import com.gargoylesoftware.htmlunit.html.HtmlPage;
/**
 * Small HtmlUnit example: loads a subreddit page in a headless browser and
 * prints the text of every heading matched by a CSS selector.
 */
public class myhtmlunit {

    /**
     * Fetches {@code https://www.reddit.com/r/scraping/}, selects the heading
     * elements and writes the text of each one to standard output, one per line.
     *
     * @param args command-line arguments (unused)
     * @throws FailingHttpStatusCodeException declared by {@code WebClient.getPage}; the
     *         client is configured below not to throw on failing status codes
     * @throws MalformedURLException if the page URL cannot be parsed
     * @throws IOException if an I/O error occurs while fetching the page
     */
    public static void main(String[] args) throws FailingHttpStatusCodeException, MalformedURLException, IOException {
        // try-with-resources: WebClient is AutoCloseable, and closing it releases
        // the browser windows and background resources even if getPage throws.
        try (WebClient webClient = new WebClient(BrowserVersion.CHROME)) {
            // configuring options
            webClient.getOptions().setUseInsecureSSL(true);
            webClient.getOptions().setCssEnabled(false);
            // JavaScript is left at the client's default; uncomment to disable it.
            //webClient.getOptions().setJavaScriptEnabled(false);
            // Do not abort on HTTP error statuses or on script errors raised by the page.
            webClient.getOptions().setThrowExceptionOnFailingStatusCode(false);
            webClient.getOptions().setThrowExceptionOnScriptError(false);

            // fetching the web page
            HtmlPage page = webClient.getPage("https://www.reddit.com/r/scraping/");

            // selecting all headings
            // NOTE(review): the class name in this selector looks auto-generated and
            // may change when the site is redeployed; an empty result prints nothing.
            DomNodeList<DomNode> headings = page.querySelectorAll("h3._eYtD2XCVieq6emjKBH3m");

            // iterating and extracting
            for (DomNode content : headings) {
                System.out.println(content.asText());
            }
        }
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment