Skip to content

Instantly share code, notes, and snippets.

@ictrobot
Created June 26, 2016 14:14
Show Gist options
  • Save ictrobot/dfa8af82df65f87e5f4ae1870776d716 to your computer and use it in GitHub Desktop.
Save ictrobot/dfa8af82df65f87e5f4ae1870776d716 to your computer and use it in GitHub Desktop.
package ethanjones.statistics;
import java.io.File;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.net.URL;
import java.util.Locale;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Scrapes three WHSmith book chart pages (fiction, non-fiction, children's) and writes
 * one CSV file per chart under {@code output/}.
 *
 * <p>Each CSV row holds: chart number, title, author, price (with the pound sign removed),
 * page count and lower-cased format. The last two values are read from the book's own
 * product page, so every chart entry costs one additional HTTP request.
 */
public class BookStatistics {

    /** Timeout, in milliseconds, applied to every page fetch. */
    private static final int TIMEOUT_MILLIS = 1000;

    /**
     * Downloads each chart and writes its CSV file.
     *
     * @param args ignored
     * @throws Exception if a chart page cannot be fetched or an output file cannot be written
     */
    public static void main(String[] args) throws Exception {
        writeStatistics("fiction", "http://www.whsmith.co.uk/chart/books/fiction-02x08974?results=200");
        writeStatistics("nonfiction", "http://www.whsmith.co.uk/chart/books/non-fiction-02x09005?results=200");
        writeStatistics("childrens", "http://www.whsmith.co.uk/chart/books/childrens-02x09004?results=200&filters=FILTER_format_fg%3aHardback%2cPaperback");
    }

    /**
     * Fetches one chart page and writes a CSV row for every entry that can be scraped.
     *
     * <p>Entries whose data cannot be collected (missing element, unreachable product page)
     * are skipped and reported on standard error; they do not abort the run.
     *
     * @param file base name of the CSV file, written to {@code output/<file>.csv}
     * @param page URL of the chart page
     * @throws Exception if the chart page cannot be fetched or the CSV file cannot be written
     */
    private static void writeStatistics(String file, String page) throws Exception {
        File target = new File("output/" + file + ".csv");
        File directory = target.getParentFile();
        if (directory != null && !directory.isDirectory() && !directory.mkdirs()) {
            throw new IOException("Cannot create output directory " + directory);
        }

        // Fetch before opening the file so a failed download does not truncate an existing CSV.
        Document document = Jsoup.parse(new URL(page), TIMEOUT_MILLIS);
        Elements books = document.select("li.chart_product");

        try (PrintWriter out = new PrintWriter(new FileWriter(target))) {
            for (Element book : books) {
                String row;
                try {
                    row = scrapeRow(book);
                } catch (Exception e) {
                    // Best effort: one bad entry must not lose the rest of the chart.
                    System.err.println("Skipping chart entry from " + page + ": " + e);
                    continue;
                }
                out.write(row);
            }
            // PrintWriter never throws on write failure; it only records an error flag.
            if (out.checkError()) {
                throw new IOException("Failed writing " + target);
            }
        }
    }

    /**
     * Builds the CSV line (terminated by {@code \n}) for a single chart entry, fetching the
     * entry's product page to obtain the page count and format.
     *
     * @param book the {@code li.chart_product} element of the entry
     * @return the quoted, comma-separated row
     * @throws Exception if an expected element is missing or the product page cannot be fetched
     */
    private static String scrapeRow(Element book) throws Exception {
        String num = requireFirst(book, "span.chart_number").ownText();
        Element link = requireFirst(book, "a");
        String title = link.attr("title");
        // absUrl resolves a relative href against the chart page's URL.
        String productPage = link.absUrl("href");
        String author = requireFirst(book, "span.product_second").ownText();
        String price = requireFirst(book, "span.price").ownText().replace("£", "");

        Document product = Jsoup.parse(new URL(productPage), TIMEOUT_MILLIS);
        Element attributes = requireFirst(product, "ul.product_attribute_list");
        String pages = requireFirst(attributes, "li[itemprop*=Pages]").ownText().trim();
        String format = requireFirst(attributes, "li[itemprop*=format]").ownText().trim()
                .toLowerCase(Locale.ROOT);

        return quote(num) + "," + quote(title) + "," + quote(author) + ","
                + quote(price) + "," + quote(pages) + "," + quote(format) + "\n";
    }

    /**
     * Returns the first element under {@code scope} matching {@code cssQuery}.
     *
     * @throws IllegalStateException if nothing matches, naming the selector that failed
     */
    private static Element requireFirst(Element scope, String cssQuery) {
        Element match = scope.select(cssQuery).first();
        if (match == null) {
            throw new IllegalStateException("No element matches '" + cssQuery + "'");
        }
        return match;
    }

    /** Wraps a value in double quotes, doubling any embedded quote so the CSV stays parseable. */
    private static String quote(String value) {
        return "\"" + value.replace("\"", "\"\"") + "\"";
    }
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment