Skip to content

Instantly share code, notes, and snippets.

@bertrandmartel
Last active October 1, 2018 19:47
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bertrandmartel/ea7b89654fee50f9e6b7b09ae5500c12 to your computer and use it in GitHub Desktop.
Save bertrandmartel/ea7b89654fee50f9e6b7b09ae5500c12 to your computer and use it in GitHub Desktop.
scrap example using Jsoup for parsing HTML & GSON for parsing some JSON REST API call
package some.package;
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.annotations.SerializedName;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
public class CompanyScrap {
class CompanyFilter {
@SerializedName("ids")
private List<Integer> mIds;
@SerializedName("hexdigest")
private String mDigest;
@SerializedName("total")
private String mTotalCount;
@SerializedName("page")
private int mPage;
@SerializedName("sort")
private String mSort;
@SerializedName("new")
private boolean mNew;
public List<Integer> getIds() {
return mIds;
}
public String getDigest() {
return mDigest;
}
public String getTotalCount() {
return mTotalCount;
}
public int getpage() {
return mPage;
}
private String buildRequest() {
String out = "total=" + mTotalCount + "&";
out += "sort=" + mSort + "&";
out += "page=" + mPage + "&";
out += "new=" + mNew + "&";
for (int i = 0; i < mIds.size(); i++) {
out += "ids[]=" + mIds.get(i) + "&";
}
out += "hexdigest=" + mDigest + "&";
return out;
}
}
private static class Company {
private String mLink;
private String mName;
private String mDescription;
public Company(String name, String link, String description) {
mLink = link;
mName = name;
mDescription = description;
}
public String getLink() {
return mLink;
}
public String getName() {
return mName;
}
public String getDescription() {
return mDescription;
}
}
private static class HtmlContainer {
@SerializedName("html")
private String mHtml;
public String getHtml() {
return mHtml;
}
}
private static List<Company> getCompanies(final CompanyFilter companyFilter) throws IOException {
List<Company> companies = new ArrayList<>();
URLConnection urlConn = new URL("https://angel.co/companies/startups?" + companyFilter.buildRequest()).openConnection();
urlConn.setRequestProperty("User-Agent", "Mozilla");
urlConn.connect();
BufferedReader reader = new BufferedReader(new InputStreamReader(urlConn.getInputStream(), "UTF-8"));
HtmlContainer htmlObj = new Gson().fromJson(reader, HtmlContainer.class);
Element doc = Jsoup.parse(htmlObj.getHtml());
Elements data = doc.select("div[data-_tn]");
if (data.size() > 0) {
for (int i = 2; i < data.size(); i++) {
companies.add(new Company(data.get(i).select("a").first().attr("title"),
data.get(i).select("a").first().attr("href"),
data.get(i).select("div.pitch").first().text()));
}
} else {
System.out.println("no data");
}
return companies;
}
/**
* Return company filter object
*/
private static CompanyFilter getCompanyFilter(final String filter, final int page) throws IOException {
String response = Jsoup.connect("https://angel.co/company_filters/search_data")
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8")
.header("X-Requested-With", "XMLHttpRequest")
.data("filter_data[company_types][]=", filter)
.data("sort", "signal")
.data("page", String.valueOf(page))
.userAgent("Mozilla")
.ignoreContentType(true)
.post().body().text();
GsonBuilder gsonBuilder = new GsonBuilder();
Gson gson = gsonBuilder.create();
return gson.fromJson(response, CompanyFilter.class);
}
public static void main(String[] args) throws IOException {
int pageCount = 1;
List<Company> companies = new ArrayList<>();
for (int i = 0; i < 10; i++) {
System.out.println("get page n°" + pageCount);
CompanyFilter companyFilter = getCompanyFilter("Startup", pageCount);
pageCount++;
System.out.println("digest : " + companyFilter.getDigest());
System.out.println("count : " + companyFilter.getTotalCount());
System.out.println("array size : " + companyFilter.getIds().size());
System.out.println("page : " + companyFilter.getpage());
companies.addAll(getCompanies(companyFilter));
if (companies.size() == 0) {
break;
} else {
System.out.println("size : " + companies.size());
}
}
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment