Skip to content

Instantly share code, notes, and snippets.

@mark-cooper
Created January 15, 2012 16:11
Show Gist options
  • Save mark-cooper/1616289 to your computer and use it in GitHub Desktop.
Open Library data dumps: getting CSV
package net.libcode.www.openlibrary;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import com.google.common.io.Files;
/*
* Script style, built incrementally, ready for reorganization
* JSONObjects use a lot of Strings:
* -Xmx1024m -XX:MaxPermSize=256m
*/
public class OpenLibraryData {
// Shared JSON parser reused by every parseData() call (single-threaded script).
public static final JSONParser parser = new JSONParser();
// Maximum records parsed per parseData() call; large dumps are processed in chunks of this size.
public static final int MAX_JSON = 50000; // Chunk these
// Hard-coded local paths: the raw Open Library dump files, the files holding search
// terms, the intermediate "matching lines" output, and the final CSV per record type.
public static final File authors = new File("/home/mcooper/OL-Data/ol_dump_authors_2011-12-31.txt");
public static final File authors_search = new File("/home/mcooper/OL-Data/authors_search.txt");
public static final File authors_output = new File("/home/mcooper/OL-Data/authors_output.txt");
public static final File authors_found = new File("/home/mcooper/OL-Data/authors_found.csv");
public static final File works = new File("/home/mcooper/OL-Data/ol_dump_works_2011-12-31.txt");
public static final File works_search = new File("/home/mcooper/OL-Data/works_search.txt");
public static final File works_output = new File("/home/mcooper/OL-Data/works_output.txt");
public static final File works_found = new File("/home/mcooper/OL-Data/works_found.csv");
public static final File editions = new File("/home/mcooper/OL-Data/ol_dump_editions_2011-12-31.txt");
public static final File editions_search = new File("/home/mcooper/OL-Data/editions_search.txt");
public static final File editions_output = new File("/home/mcooper/OL-Data/editions_output.txt");
public static final File editions_found = new File("/home/mcooper/OL-Data/editions_found.csv");
// openlibrary.org page and cover-image URL prefixes, completed with an OLID/cover id.
public static final String author_profile_url = "http://openlibrary.org/authors/";
public static final String author_cover_url = "http://covers.openlibrary.org/a/olid/";
// Tab-separated header rows for each generated CSV.
public static final String author_csv_hdr = "ID\tNAME\tPROFILE\tIMAGE\tBIRTH\tDEATH\tLINKS";
public static final String works_profile_url = "http://openlibrary.org/works/";
public static final String works_cover_url = "http://covers.openlibrary.org/w/id/";
public static final String works_csv_hdr = "ID\tTITLE\tPROFILE\tIMAGE\tAUTHORS\tPUBLISHED\tDESCRIPTION";
public static final String editions_profile_url = "http://openlibrary.org/books/";
public static final String editions_cover_url = "http://covers.openlibrary.org/b/id/";
public static final String editions_csv_hdr = "ID\tTITLE\tWORKS\tAUTHORS\tOCLC\tLCCN\tISBN10\tISBN13\tPLACES\tPUBLISHERS\tDATE\tFORMAT\tPAGES\tPAGINATION\tDIMENSIONS\tSUBJECTS\tSUBJECT PLACES\tSERIES\tCOVERS\tLIBRARYTHING\tGOODREADS";
// Dump-file layout: tab-delimited columns; the OLID starts at the first 'O' in column 2.
public static final String OLID_DELIM = "\t";
public static final String OLID_IDENT = "O";
// The JSON payload of a dump line spans from the first '{' to the last '}'.
public static final String JSON_S = "{";
public static final String JSON_E = "}";
// Cover-image size suffixes (small / medium).
public static final String SMALL_JPG = "-S.jpg";
public static final String MED_JPG = "-M.jpg";
// Output CSV delimiters: tab between columns, '|' between values inside one column.
public static final String CSV_FIELD_DELIM = "\t";
public static final String CSV_CELL_DELIM = "|";
// Platform-specific newline, used when appending lines to files.
public static final String NL = String.format("%n");
// Entry point: runs three search/parse/CSV phases (authors, works, editions)
// against the Open Library dump files, printing progress and timings as it goes.
public static void main(String[] args) throws IOException, ParseException {
// Phase 1 -- AUTHORS: grep the authors dump for the search terms, parse the
// matching lines as JSON, and write a tab-separated authors CSV.
clearFile(authors_found); // Start over searching the authors dump file ...
Set<String> searchTerms = readSearchTermsFile(authors_search, 10);
long startTime = System.currentTimeMillis();
searchData(authors, authors_output, searchTerms, false, true);
searchTerms = null; // drop references early -- heap is tight (see class comment)
Map<String, JSONObject> authorsData = parseData(authors_output, 0, true);
List<String> authors_csv = new ArrayList<String>(authorsData.size() + 1);
authors_csv.add(author_csv_hdr);
System.out.println();
System.out.println("AUTHORS FOUND #" + authorsData.size());
System.out.println();
authors_csv.addAll(getAuthorsCsv(authorsData, "\t", "|", true));
long endTime = System.currentTimeMillis();
System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
System.out.println();
appendToFile(authors_found, authors_csv, Charset.defaultCharset());
authors_csv = null;
// Phase 2 -- WORKS: same pipeline, searching the works dump by author OLID.
clearFile(works_found); // Search the works dump file by author OLID
searchTerms = readSearchTermsFile(works_search, 10);
startTime = System.currentTimeMillis();
searchData(works, works_output, searchTerms, false, false);
searchTerms = null;
Map<String, JSONObject> worksData = parseData(works_output, 0, false);
List<String> works_csv = new ArrayList<String>(worksData.size() + 1);
works_csv.add(works_csv_hdr);
System.out.println();
System.out.println("WORKS FOUND #" + worksData.size());
System.out.println();
works_csv.addAll(getWorksCsv(worksData, "\t", "|", false));
endTime = System.currentTimeMillis();
System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
System.out.println();
appendToFile(works_found, works_csv, Charset.defaultCharset());
works_csv = null;
// Phase 3 -- EDITIONS: the editions dump is huge, so the pre-filtered
// editions_output is parsed in two MAX_JSON-sized chunks. The search step
// itself is commented out (editions_output is assumed to already exist).
clearFile(editions_found); // Search the editions dump file ... this is huge.
// searchTerms = readSearchTermsFile(editions_search, 100000);
// searchDataByOLID(editions, editions_output, searchTerms, true);
// searchTerms = null;
startTime = System.currentTimeMillis();
Map<String, JSONObject> editionsData = parseData(editions_output, 0, true);
List<String> editions_csv = new ArrayList<String>(editionsData.size() + 1);
editions_csv.add(editions_csv_hdr);
System.out.println();
System.out.println("EDITIONS FOUND #" + editionsData.size());
System.out.println();
editions_csv.addAll(getEditionsCsv(editionsData, CSV_FIELD_DELIM, CSV_CELL_DELIM, false));
editionsData = null;
endTime = System.currentTimeMillis();
System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
System.out.println();
appendToFile(editions_found, editions_csv, Charset.defaultCharset());
editions_csv = null;
//////////
// Second chunk: begin=50001 resumes where the first MAX_JSON records ended.
// NOTE(review): only two chunks are coded, so lines beyond 100,000 are never
// processed -- presumably enough for this data set; confirm before reuse.
startTime = System.currentTimeMillis();
editionsData = parseData(editions_output, 50001, true);
editions_csv = new ArrayList<String>(editionsData.size() + 1);
editions_csv.add(editions_csv_hdr);
System.out.println();
System.out.println("EDITIONS FOUND #" + editionsData.size());
System.out.println();
editions_csv.addAll(getEditionsCsv(editionsData, CSV_FIELD_DELIM, CSV_CELL_DELIM, false));
editionsData = null;
endTime = System.currentTimeMillis();
System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
System.out.println();
appendToFile(editions_found, editions_csv, Charset.defaultCharset());
editions_csv = null;
}
/**
 * Truncates {@code file} to zero length (creating it if absent) by writing an
 * empty byte array over it.
 *
 * @throws IOException if the file cannot be written
 */
public static void clearFile(File file) throws IOException {
    byte[] nothing = new byte[0];
    Files.write(nothing, file);
}
/**
 * Appends {@code data} plus a platform newline to {@code file} in the given
 * charset.
 *
 * @throws IOException if the file cannot be written
 */
public static void appendToFile(File file, String data, Charset charset) throws IOException {
    String line = data + NL;
    Files.append(line, file, charset);
}
/**
 * Appends each string in {@code data}, newline-terminated, to {@code file}.
 * Each line is appended individually, exactly as the callers expect.
 *
 * @throws IOException if the file cannot be written
 */
public static void appendToFile(File file, List<String> data, Charset charset) throws IOException {
    for (Iterator<String> it = data.iterator(); it.hasNext();) {
        Files.append(it.next() + NL, file, charset);
    }
}
/**
 * Reads a search-terms file into a set, one term per line (duplicate lines
 * collapse). Uses try-with-resources so the reader is closed even when
 * {@code readLine} throws (the original leaked the reader on error), and the
 * already-imported stdlib reader classes; {@code FileReader} uses the platform
 * default charset, matching the previous guava call.
 *
 * @param file     terms file, one term per line
 * @param capacity initial capacity hint for the returned set
 * @return distinct lines of the file
 * @throws IOException if the file cannot be read
 */
public static Set<String> readSearchTermsFile(File file, int capacity) throws IOException {
    Set<String> lines = new HashSet<String>(capacity);
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = reader.readLine()) != null) {
            lines.add(line);
        }
    }
    return lines;
}
/**
 * Scans {@code file} line by line and copies every line containing any search
 * term to {@code output_file} (cleared first). Fix: the reader is now closed
 * via try-with-resources even if reading or appending throws.
 *
 * @param file        dump file to scan
 * @param output_file destination for matching lines
 * @param search      terms matched by substring; may shrink when {@code unique}
 * @param unique      when true, remove a term after its first hit so each term
 *                    matches at most one line
 * @param print       when true, echo each matching line (with its line number)
 * @return number of matching lines written
 * @throws IOException if reading or writing fails
 * @throws ParseException declared for caller compatibility (not thrown here)
 */
public static int searchData(File file, File output_file, Set<String> search, boolean unique, boolean print) throws IOException, ParseException {
    clearFile(output_file);
    int count = 0;
    int found = 0;
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = reader.readLine()) != null) {
            count += 1;
            for (Iterator<String> i = search.iterator(); i.hasNext();) {
                String term = i.next();
                if (line.contains(term)) {
                    found += 1;
                    appendToFile(output_file, line, Charset.defaultCharset());
                    if (unique) i.remove(); // Iterator.remove: safe removal mid-iteration
                    if (print) System.out.println(count + ": " + line);
                    break; // first matching term is enough for this line
                }
            }
        }
    }
    return found;
}
/*
 * Search by OLID. Fast (6 minutes for 100,000 OLID edition searches on 25gb editions dump)
 */
/**
 * Scans a dump file whose second tab-delimited column holds an Open Library
 * key and copies lines whose OLID (from the first 'O' onward) is in
 * {@code search} to {@code output_file} (cleared first).
 * Fixes: reader closed via try-with-resources; malformed lines (fewer than two
 * columns, or no 'O' in the key column) are skipped instead of throwing
 * ArrayIndexOutOfBoundsException / StringIndexOutOfBoundsException.
 *
 * @return number of matching lines written
 * @throws IOException if reading or writing fails
 */
public static int searchDataByOLID(File file, File output_file, Set<String> search, boolean print) throws IOException {
    clearFile(output_file);
    int count = 0;
    int found = 0;
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = reader.readLine()) != null) {
            count += 1;
            String[] parts = line.split(OLID_DELIM);
            if (parts.length < 2) continue; // malformed line: no key column
            int idStart = parts[1].indexOf(OLID_IDENT);
            if (idStart < 0) continue; // malformed key: no OLID marker
            String id = parts[1].substring(idStart);
            if (search.contains(id)) {
                found += 1;
                appendToFile(output_file, line, Charset.defaultCharset());
                if (print) System.out.println(count + ": " + line);
            }
        }
    }
    return found;
}
/*
 * Return Map of OLID => Json data up to MAX_JSON elements
 */
/**
 * Parses dump lines into a map of OLID to JSON record, starting at line
 * {@code begin} (1-based; 0 means from the start) and stopping after MAX_JSON
 * records, so huge outputs can be processed in chunks.
 * Fix: the reader is closed via try-with-resources even when parsing throws.
 *
 * @param file  file of tab-delimited dump lines with a trailing JSON payload
 * @param begin first line number to include
 * @param print when true, echo each parsed line
 * @return map of OLID to parsed JSON object
 * @throws IOException    if the file cannot be read
 * @throws ParseException if a line's JSON payload is invalid
 */
public static Map<String, JSONObject> parseData(File file, int begin, boolean print) throws IOException, ParseException {
    Map<String, JSONObject> data = new HashMap<String, JSONObject>();
    int lineNumber = 0;
    int count = 0;
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = reader.readLine()) != null && count < MAX_JSON) {
            lineNumber += 1;
            if (lineNumber < begin) continue; // skip lines before this chunk
            count += 1;
            // OLID sits in the second tab-delimited column, from the first 'O'.
            String[] parts = line.split(OLID_DELIM);
            int idStart = parts[1].indexOf(OLID_IDENT);
            String id = parts[1].substring(idStart);
            // The JSON payload spans from the first '{' to the last '}'.
            int jsonStart = line.indexOf(JSON_S);
            int jsonStop = line.lastIndexOf(JSON_E);
            String json = line.substring(jsonStart, jsonStop + 1);
            data.put(id, (JSONObject) parser.parse(json));
            if (print) System.out.println(count + ": " + line);
        }
    }
    return data;
}
/**
 * Builds one CSV row per author, matching author_csv_hdr:
 * ID, NAME, PROFILE, IMAGE, BIRTH, DEATH, LINKS.
 * Fix: each row is joined once and reused (the original joined twice when
 * printing).
 *
 * @param authors        map of author OLID to its parsed JSON record
 * @param fieldDelimiter separator between CSV columns
 * @param cellDelimiter  separator between values within a multi-valued column
 * @param print          when true, echo each row to stdout
 * @return one joined row per author
 */
public static List<String> getAuthorsCsv(Map<String, JSONObject> authors, String fieldDelimiter, String cellDelimiter, boolean print) {
    List<String> data = new ArrayList<String>(authors.size());
    for (Map.Entry<String, JSONObject> entry : authors.entrySet()) {
        List<String> field = new ArrayList<String>();
        String olid = entry.getKey();
        JSONObject json = entry.getValue();
        field.add(olid);
        field.add(getJsonString(json, "name"));
        field.add(author_profile_url + olid);
        field.add(author_cover_url + olid + MED_JPG);
        field.add(getJsonString(json, "birth_date"));
        field.add(getJsonString(json, "death_date"));
        field.add(getJsonHashString(json, "links", "url", cellDelimiter, false));
        String row = join(field, fieldDelimiter); // join once, reuse for output
        data.add(row);
        if (print) System.out.println(row);
    }
    return data;
}
/**
 * Builds one CSV row per work, matching works_csv_hdr:
 * ID, TITLE, PROFILE, IMAGE, AUTHORS, PUBLISHED, DESCRIPTION.
 * Fixes: the description type check uses {@code instanceof} instead of
 * catching ClassCastException (exceptions are not control flow), and each row
 * is joined once instead of twice.
 *
 * @param works          map of work OLID to its parsed JSON record
 * @param fieldDelimiter separator between CSV columns
 * @param cellDelimiter  separator between values within a multi-valued column
 * @param print          when true, echo each row to stdout
 * @return one joined row per work
 */
public static List<String> getWorksCsv(Map<String, JSONObject> works, String fieldDelimiter, String cellDelimiter, boolean print) {
    List<String> data = new ArrayList<String>(works.size());
    for (Map.Entry<String, JSONObject> entry : works.entrySet()) {
        List<String> field = new ArrayList<String>();
        String olid = entry.getKey();
        JSONObject json = entry.getValue();
        field.add(olid);
        // Title is decorated with the subtitle when one is present.
        String title = getJsonString(json, "title");
        String subtitle = getJsonString(json, "subtitle");
        if (!subtitle.isEmpty()) title = title + " - " + subtitle;
        field.add(title);
        field.add(works_profile_url + olid);
        field.add(works_cover_url + olid + MED_JPG);
        field.add(getJsonHashString(json, "authors", "author", "key", cellDelimiter, true));
        field.add(getJsonString(json, "first_publish_date"));
        // "description" is either a plain string or a {"value": ...} object.
        Object description = json.get("description");
        String desc = (description instanceof JSONObject)
                ? getJsonString((JSONObject) description, "value")
                : getJsonString(json, "description");
        field.add(removeNewLines(desc));
        String row = join(field, fieldDelimiter); // join once, reuse for output
        data.add(row);
        if (print) System.out.println(row);
    }
    return data;
}
/**
 * Builds one CSV row per edition, matching editions_csv_hdr:
 * ID, TITLE, WORKS, AUTHORS, OCLC, LCCN, ISBN10, ISBN13, PLACES, PUBLISHERS,
 * DATE, FORMAT, PAGES, PAGINATION, DIMENSIONS, SUBJECTS, SUBJECT PLACES,
 * SERIES, COVERS, LIBRARYTHING, GOODREADS.
 * Fixes: the FORMAT column now reads the dump's "physical_format" key (the old
 * "physical format" with a space never matched, leaving the column empty), and
 * each row is joined once instead of twice.
 *
 * @param editions       map of edition OLID to its parsed JSON record
 * @param fieldDelimiter separator between CSV columns
 * @param cellDelimiter  separator between values within a multi-valued column
 * @param print          when true, echo each row to stdout
 * @return one joined row per edition
 */
public static List<String> getEditionsCsv(Map<String, JSONObject> editions, String fieldDelimiter, String cellDelimiter, boolean print) {
    List<String> data = new ArrayList<String>(editions.size());
    for (Map.Entry<String, JSONObject> entry : editions.entrySet()) {
        List<String> field = new ArrayList<String>();
        field.add(entry.getKey());
        JSONObject json = entry.getValue();
        // Title is decorated with the subtitle and by_statement when present.
        String title = getJsonString(json, "title");
        String subtitle = getJsonString(json, "subtitle");
        String by = getJsonString(json, "by_statement");
        if (!subtitle.isEmpty()) title = title + " - " + subtitle;
        if (!by.isEmpty()) title = title + " - " + by;
        field.add(title);
        field.add(getJsonHashString(json, "works", "key", cellDelimiter, true));
        field.add(getJsonHashString(json, "authors", "key", cellDelimiter, true));
        field.add(getJsonArrayString(json, "oclc_numbers", cellDelimiter));
        field.add(getJsonArrayString(json, "lccn", cellDelimiter));
        field.add(getJsonArrayString(json, "isbn_10", cellDelimiter));
        field.add(getJsonArrayString(json, "isbn_13", cellDelimiter));
        field.add(getJsonArrayString(json, "publish_places", cellDelimiter));
        field.add(getJsonArrayString(json, "publishers", cellDelimiter));
        field.add(getJsonString(json, "publish_date"));
        // BUG FIX: dump field is "physical_format" (underscore), not "physical format".
        field.add(getJsonString(json, "physical_format"));
        field.add(getJsonString(json, "number_of_pages"));
        field.add(getJsonString(json, "pagination"));
        field.add(getJsonString(json, "physical_dimensions"));
        field.add(getJsonArrayString(json, "subjects", cellDelimiter));
        field.add(getJsonArrayString(json, "subject_places", cellDelimiter));
        field.add(getJsonArrayString(json, "series", cellDelimiter));
        field.add(getJsonArrayString(json, "covers", cellDelimiter));
        field.add(getJsonHashArrayString(json, "identifiers", "librarything", cellDelimiter));
        field.add(getJsonHashArrayString(json, "identifiers", "goodreads", cellDelimiter));
        String row = join(field, fieldDelimiter); // join once, reuse for output
        data.add(row);
        if (print) System.out.println(row);
    }
    return data;
}
/**
 * Returns the value for {@code key} as a string, or "" when {@code json} is
 * null or the key is absent/null. Non-string values are rendered with
 * {@code String.valueOf}. Fix: type is tested with {@code instanceof} instead
 * of catching ClassCastException (no exceptions as control flow).
 *
 * @param json record to read, may be null
 * @param key  field name
 * @return the string value, never null
 */
public static String getJsonString(JSONObject json, String key) {
    String value = "";
    if (json != null) {
        Object result = json.get(key);
        if (result != null) {
            value = (result instanceof String) ? (String) result : String.valueOf(result);
        }
    }
    return value;
}
/**
 * Joins the array stored under {@code json_obj -> key} with {@code delimiter},
 * returning "" when {@code json} or the nested object is missing.
 */
public static String getJsonHashArrayString(JSONObject json, String json_obj, String key, String delimiter) {
    if (json == null) {
        return "";
    }
    JSONObject nested = (JSONObject) json.get(json_obj);
    return nested == null ? "" : getJsonArrayString(nested, key, delimiter);
}
/**
 * Joins the JSON array stored under {@code key} with {@code delimiter},
 * returning "" when {@code json} or the array is missing.
 */
public static String getJsonArrayString(JSONObject json, String key, String delimiter) {
    if (json == null) {
        return "";
    }
    JSONArray items = (JSONArray) json.get(key);
    return items == null ? "" : join(items, delimiter);
}
/**
 * Joins the {@code sub_key} values of each object in the array under
 * {@code key}. When {@code olid} is true, each value is trimmed to start at
 * its first 'O' (the Open Library identifier).
 * Fixes: StringBuilder replaces string concatenation in the loop; the trailing
 * delimiter is trimmed by {@code delimiter.length()} rather than a hard-coded
 * single character (the old code broke for multi-character delimiters); a
 * value with no 'O' is kept whole instead of crashing on substring(-1).
 *
 * @return joined values, "" when the array is absent
 */
public static String getJsonHashString(JSONObject json, String key, String sub_key, String delimiter, boolean olid) {
    StringBuilder value = new StringBuilder();
    JSONArray result = (JSONArray) json.get(key);
    if (result != null) {
        for (Object x : result) {
            JSONObject a = (JSONObject) x;
            String element = getJsonString(a, sub_key);
            if (olid) {
                int idStart = element.indexOf(OLID_IDENT);
                if (idStart >= 0) element = element.substring(idStart);
            }
            value.append(element).append(delimiter);
        }
    }
    String out = value.toString();
    if (!out.isEmpty()) out = out.substring(0, out.length() - delimiter.length());
    return out;
}
/**
 * Joins the {@code sub_sub_key} values of the nested objects found under
 * {@code key -> sub_key}. When {@code olid} is true, each value is trimmed to
 * start at its first 'O' (the Open Library identifier).
 * Fixes mirror the five-argument overload: StringBuilder in the loop, trailing
 * delimiter trimmed by {@code delimiter.length()} instead of one character,
 * and values with no 'O' kept whole instead of crashing on substring(-1).
 *
 * @return joined values, "" when the array is absent
 */
public static String getJsonHashString(JSONObject json, String key, String sub_key, String sub_sub_key, String delimiter, boolean olid) {
    StringBuilder value = new StringBuilder();
    JSONArray result = (JSONArray) json.get(key);
    if (result != null) {
        for (Object x : result) {
            JSONObject a = (JSONObject) x;
            JSONObject b = (JSONObject) a.get(sub_key);
            String element = getJsonString(b, sub_sub_key);
            if (olid) {
                int idStart = element.indexOf(OLID_IDENT);
                if (idStart >= 0) element = element.substring(idStart);
            }
            value.append(element).append(delimiter);
        }
    }
    String out = value.toString();
    if (!out.isEmpty()) out = out.substring(0, out.length() - delimiter.length());
    return out;
}
/**
 * Concatenates the {@code toString()} of each element, separated by
 * {@code delimiter}. The separator is only inserted once the buffer is
 * non-empty, exactly as before (so leading empty elements add no separator).
 *
 * @param list      elements to join (must not contain null)
 * @param delimiter separator string
 * @return the joined string, "" for an empty list
 */
public static String join(List<?> list, String delimiter) {
    StringBuilder out = new StringBuilder();
    for (Object item : list) {
        if (out.length() > 0) {
            out.append(delimiter);
        }
        out.append(item.toString());
    }
    return out.toString();
}
/**
 * Replaces every line break with a single space: CRLF pairs first (so the pair
 * becomes one space), then any remaining lone CR or LF.
 */
public static String removeNewLines(String text) {
    return text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ");
}
/**
 * Converts milliseconds to whole seconds, truncating toward zero.
 * Fix: integer division replaces the fragile float round-trip
 * {@code (long) (milli * 0.001)} -- same truncation, no floating point.
 *
 * @param milli a duration in milliseconds
 * @return the duration in whole seconds
 */
public static long millisToSeconds(long milli) {
    return milli / 1000;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment