Last active
October 1, 2018 19:47
-
-
Save bertrandmartel/ea7b89654fee50f9e6b7b09ae5500c12 to your computer and use it in GitHub Desktop.
Scraping example that uses Jsoup to parse HTML and Gson to parse the JSON returned by a REST API call
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package some.package; | |
import com.google.gson.Gson;
import com.google.gson.GsonBuilder;
import com.google.gson.annotations.SerializedName;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.net.URL;
import java.net.URLConnection;
import java.net.URLEncoder;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;
public class CompanyScrap { | |
class CompanyFilter { | |
@SerializedName("ids") | |
private List<Integer> mIds; | |
@SerializedName("hexdigest") | |
private String mDigest; | |
@SerializedName("total") | |
private String mTotalCount; | |
@SerializedName("page") | |
private int mPage; | |
@SerializedName("sort") | |
private String mSort; | |
@SerializedName("new") | |
private boolean mNew; | |
public List<Integer> getIds() { | |
return mIds; | |
} | |
public String getDigest() { | |
return mDigest; | |
} | |
public String getTotalCount() { | |
return mTotalCount; | |
} | |
public int getpage() { | |
return mPage; | |
} | |
private String buildRequest() { | |
String out = "total=" + mTotalCount + "&"; | |
out += "sort=" + mSort + "&"; | |
out += "page=" + mPage + "&"; | |
out += "new=" + mNew + "&"; | |
for (int i = 0; i < mIds.size(); i++) { | |
out += "ids[]=" + mIds.get(i) + "&"; | |
} | |
out += "hexdigest=" + mDigest + "&"; | |
return out; | |
} | |
} | |
private static class Company { | |
private String mLink; | |
private String mName; | |
private String mDescription; | |
public Company(String name, String link, String description) { | |
mLink = link; | |
mName = name; | |
mDescription = description; | |
} | |
public String getLink() { | |
return mLink; | |
} | |
public String getName() { | |
return mName; | |
} | |
public String getDescription() { | |
return mDescription; | |
} | |
} | |
private static class HtmlContainer { | |
@SerializedName("html") | |
private String mHtml; | |
public String getHtml() { | |
return mHtml; | |
} | |
} | |
private static List<Company> getCompanies(final CompanyFilter companyFilter) throws IOException { | |
List<Company> companies = new ArrayList<>(); | |
URLConnection urlConn = new URL("https://angel.co/companies/startups?" + companyFilter.buildRequest()).openConnection(); | |
urlConn.setRequestProperty("User-Agent", "Mozilla"); | |
urlConn.connect(); | |
BufferedReader reader = new BufferedReader(new InputStreamReader(urlConn.getInputStream(), "UTF-8")); | |
HtmlContainer htmlObj = new Gson().fromJson(reader, HtmlContainer.class); | |
Element doc = Jsoup.parse(htmlObj.getHtml()); | |
Elements data = doc.select("div[data-_tn]"); | |
if (data.size() > 0) { | |
for (int i = 2; i < data.size(); i++) { | |
companies.add(new Company(data.get(i).select("a").first().attr("title"), | |
data.get(i).select("a").first().attr("href"), | |
data.get(i).select("div.pitch").first().text())); | |
} | |
} else { | |
System.out.println("no data"); | |
} | |
return companies; | |
} | |
/** | |
* Return company filter object | |
*/ | |
private static CompanyFilter getCompanyFilter(final String filter, final int page) throws IOException { | |
String response = Jsoup.connect("https://angel.co/company_filters/search_data") | |
.header("Content-Type", "application/x-www-form-urlencoded;charset=UTF-8") | |
.header("X-Requested-With", "XMLHttpRequest") | |
.data("filter_data[company_types][]=", filter) | |
.data("sort", "signal") | |
.data("page", String.valueOf(page)) | |
.userAgent("Mozilla") | |
.ignoreContentType(true) | |
.post().body().text(); | |
GsonBuilder gsonBuilder = new GsonBuilder(); | |
Gson gson = gsonBuilder.create(); | |
return gson.fromJson(response, CompanyFilter.class); | |
} | |
public static void main(String[] args) throws IOException { | |
int pageCount = 1; | |
List<Company> companies = new ArrayList<>(); | |
for (int i = 0; i < 10; i++) { | |
System.out.println("get page n°" + pageCount); | |
CompanyFilter companyFilter = getCompanyFilter("Startup", pageCount); | |
pageCount++; | |
System.out.println("digest : " + companyFilter.getDigest()); | |
System.out.println("count : " + companyFilter.getTotalCount()); | |
System.out.println("array size : " + companyFilter.getIds().size()); | |
System.out.println("page : " + companyFilter.getpage()); | |
companies.addAll(getCompanies(companyFilter)); | |
if (companies.size() == 0) { | |
break; | |
} else { | |
System.out.println("size : " + companies.size()); | |
} | |
} | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment