Skip to content

Instantly share code, notes, and snippets.

@mark-cooper
Created January 15, 2012 16:11
Show Gist options
  • Save mark-cooper/1616289 to your computer and use it in GitHub Desktop.
Open Library data dumps: getting CSV
package net.libcode.www.openlibrary;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.nio.charset.Charset;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import com.google.common.io.Files;
/*
* Script style, built incrementally, ready for reorganization
* JSONObjects use a lot of Strings:
* -Xmx1024m -XX:MaxPermSize=256m
*/
public class OpenLibraryData {
// Shared JSON parser reused by every parseData() call (single-threaded script).
public static final JSONParser parser = new JSONParser();
// Maximum records parsed per parseData() call; large dumps are processed in chunks of this size.
public static final int MAX_JSON = 50000; // Chunk these
// Hard-coded local paths: the raw Open Library dump files, the files holding search
// terms, the intermediate "matching lines" output, and the final CSV per record type.
public static final File authors = new File("/home/mcooper/OL-Data/ol_dump_authors_2011-12-31.txt");
public static final File authors_search = new File("/home/mcooper/OL-Data/authors_search.txt");
public static final File authors_output = new File("/home/mcooper/OL-Data/authors_output.txt");
public static final File authors_found = new File("/home/mcooper/OL-Data/authors_found.csv");
public static final File works = new File("/home/mcooper/OL-Data/ol_dump_works_2011-12-31.txt");
public static final File works_search = new File("/home/mcooper/OL-Data/works_search.txt");
public static final File works_output = new File("/home/mcooper/OL-Data/works_output.txt");
public static final File works_found = new File("/home/mcooper/OL-Data/works_found.csv");
public static final File editions = new File("/home/mcooper/OL-Data/ol_dump_editions_2011-12-31.txt");
public static final File editions_search = new File("/home/mcooper/OL-Data/editions_search.txt");
public static final File editions_output = new File("/home/mcooper/OL-Data/editions_output.txt");
public static final File editions_found = new File("/home/mcooper/OL-Data/editions_found.csv");
// openlibrary.org page and cover-image URL prefixes, completed with an OLID/cover id.
public static final String author_profile_url = "http://openlibrary.org/authors/";
public static final String author_cover_url = "http://covers.openlibrary.org/a/olid/";
// Tab-separated header rows for each generated CSV.
public static final String author_csv_hdr = "ID\tNAME\tPROFILE\tIMAGE\tBIRTH\tDEATH\tLINKS";
public static final String works_profile_url = "http://openlibrary.org/works/";
public static final String works_cover_url = "http://covers.openlibrary.org/w/id/";
public static final String works_csv_hdr = "ID\tTITLE\tPROFILE\tIMAGE\tAUTHORS\tPUBLISHED\tDESCRIPTION";
public static final String editions_profile_url = "http://openlibrary.org/books/";
public static final String editions_cover_url = "http://covers.openlibrary.org/b/id/";
public static final String editions_csv_hdr = "ID\tTITLE\tWORKS\tAUTHORS\tOCLC\tLCCN\tISBN10\tISBN13\tPLACES\tPUBLISHERS\tDATE\tFORMAT\tPAGES\tPAGINATION\tDIMENSIONS\tSUBJECTS\tSUBJECT PLACES\tSERIES\tCOVERS\tLIBRARYTHING\tGOODREADS";
// Dump-file layout: tab-delimited columns; the OLID starts at the first 'O' in column 2.
public static final String OLID_DELIM = "\t";
public static final String OLID_IDENT = "O";
// The JSON payload of a dump line spans from the first '{' to the last '}'.
public static final String JSON_S = "{";
public static final String JSON_E = "}";
// Cover-image size suffixes (small / medium).
public static final String SMALL_JPG = "-S.jpg";
public static final String MED_JPG = "-M.jpg";
// Output CSV delimiters: tab between columns, '|' between values inside one column.
public static final String CSV_FIELD_DELIM = "\t";
public static final String CSV_CELL_DELIM = "|";
// Platform-specific newline, used when appending lines to files.
public static final String NL = String.format("%n");
// Entry point: runs three search/parse/CSV phases (authors, works, editions)
// against the Open Library dump files, printing progress and timings as it goes.
public static void main(String[] args) throws IOException, ParseException {
// Phase 1 -- AUTHORS: grep the authors dump for the search terms, parse the
// matching lines as JSON, and write a tab-separated authors CSV.
clearFile(authors_found); // Start over searching the authors dump file ...
Set<String> searchTerms = readSearchTermsFile(authors_search, 10);
long startTime = System.currentTimeMillis();
searchData(authors, authors_output, searchTerms, false, true);
searchTerms = null; // drop references early -- heap is tight (see class comment)
Map<String, JSONObject> authorsData = parseData(authors_output, 0, true);
List<String> authors_csv = new ArrayList<String>(authorsData.size() + 1);
authors_csv.add(author_csv_hdr);
System.out.println();
System.out.println("AUTHORS FOUND #" + authorsData.size());
System.out.println();
authors_csv.addAll(getAuthorsCsv(authorsData, "\t", "|", true));
long endTime = System.currentTimeMillis();
System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
System.out.println();
appendToFile(authors_found, authors_csv, Charset.defaultCharset());
authors_csv = null;
// Phase 2 -- WORKS: same pipeline, searching the works dump by author OLID.
clearFile(works_found); // Search the works dump file by author OLID
searchTerms = readSearchTermsFile(works_search, 10);
startTime = System.currentTimeMillis();
searchData(works, works_output, searchTerms, false, false);
searchTerms = null;
Map<String, JSONObject> worksData = parseData(works_output, 0, false);
List<String> works_csv = new ArrayList<String>(worksData.size() + 1);
works_csv.add(works_csv_hdr);
System.out.println();
System.out.println("WORKS FOUND #" + worksData.size());
System.out.println();
works_csv.addAll(getWorksCsv(worksData, "\t", "|", false));
endTime = System.currentTimeMillis();
System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
System.out.println();
appendToFile(works_found, works_csv, Charset.defaultCharset());
works_csv = null;
// Phase 3 -- EDITIONS: the editions dump is huge, so the pre-filtered
// editions_output is parsed in two MAX_JSON-sized chunks. The search step
// itself is commented out (editions_output is assumed to already exist).
clearFile(editions_found); // Search the editions dump file ... this is huge.
// searchTerms = readSearchTermsFile(editions_search, 100000);
// searchDataByOLID(editions, editions_output, searchTerms, true);
// searchTerms = null;
startTime = System.currentTimeMillis();
Map<String, JSONObject> editionsData = parseData(editions_output, 0, true);
List<String> editions_csv = new ArrayList<String>(editionsData.size() + 1);
editions_csv.add(editions_csv_hdr);
System.out.println();
System.out.println("EDITIONS FOUND #" + editionsData.size());
System.out.println();
editions_csv.addAll(getEditionsCsv(editionsData, CSV_FIELD_DELIM, CSV_CELL_DELIM, false));
editionsData = null;
endTime = System.currentTimeMillis();
System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
System.out.println();
appendToFile(editions_found, editions_csv, Charset.defaultCharset());
editions_csv = null;
//////////
// Second chunk: begin=50001 resumes where the first MAX_JSON records ended.
// NOTE(review): only two chunks are coded, so lines beyond 100,000 are never
// processed -- presumably enough for this data set; confirm before reuse.
startTime = System.currentTimeMillis();
editionsData = parseData(editions_output, 50001, true);
editions_csv = new ArrayList<String>(editionsData.size() + 1);
editions_csv.add(editions_csv_hdr);
System.out.println();
System.out.println("EDITIONS FOUND #" + editionsData.size());
System.out.println();
editions_csv.addAll(getEditionsCsv(editionsData, CSV_FIELD_DELIM, CSV_CELL_DELIM, false));
editionsData = null;
endTime = System.currentTimeMillis();
System.out.println("Total execution time: " + (millisToSeconds(endTime - startTime)) + "s");
System.out.println();
appendToFile(editions_found, editions_csv, Charset.defaultCharset());
editions_csv = null;
}
/**
 * Truncates {@code file} to zero length (creating it if absent) by writing an
 * empty byte array over it.
 *
 * @throws IOException if the file cannot be written
 */
public static void clearFile(File file) throws IOException {
    byte[] nothing = new byte[0];
    Files.write(nothing, file);
}
/**
 * Appends {@code data} plus a platform newline to {@code file} in the given
 * charset.
 *
 * @throws IOException if the file cannot be written
 */
public static void appendToFile(File file, String data, Charset charset) throws IOException {
    String line = data + NL;
    Files.append(line, file, charset);
}
/**
 * Appends each string in {@code data}, newline-terminated, to {@code file}.
 * Each line is appended individually, exactly as the callers expect.
 *
 * @throws IOException if the file cannot be written
 */
public static void appendToFile(File file, List<String> data, Charset charset) throws IOException {
    for (Iterator<String> it = data.iterator(); it.hasNext();) {
        Files.append(it.next() + NL, file, charset);
    }
}
/**
 * Reads a search-terms file into a set, one term per line (duplicate lines
 * collapse). Uses try-with-resources so the reader is closed even when
 * {@code readLine} throws (the original leaked the reader on error), and the
 * already-imported stdlib reader classes; {@code FileReader} uses the platform
 * default charset, matching the previous guava call.
 *
 * @param file     terms file, one term per line
 * @param capacity initial capacity hint for the returned set
 * @return distinct lines of the file
 * @throws IOException if the file cannot be read
 */
public static Set<String> readSearchTermsFile(File file, int capacity) throws IOException {
    Set<String> lines = new HashSet<String>(capacity);
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = reader.readLine()) != null) {
            lines.add(line);
        }
    }
    return lines;
}
/**
 * Scans {@code file} line by line and copies every line containing any search
 * term to {@code output_file} (cleared first). Fix: the reader is now closed
 * via try-with-resources even if reading or appending throws.
 *
 * @param file        dump file to scan
 * @param output_file destination for matching lines
 * @param search      terms matched by substring; may shrink when {@code unique}
 * @param unique      when true, remove a term after its first hit so each term
 *                    matches at most one line
 * @param print       when true, echo each matching line (with its line number)
 * @return number of matching lines written
 * @throws IOException if reading or writing fails
 * @throws ParseException declared for caller compatibility (not thrown here)
 */
public static int searchData(File file, File output_file, Set<String> search, boolean unique, boolean print) throws IOException, ParseException {
    clearFile(output_file);
    int count = 0;
    int found = 0;
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = reader.readLine()) != null) {
            count += 1;
            for (Iterator<String> i = search.iterator(); i.hasNext();) {
                String term = i.next();
                if (line.contains(term)) {
                    found += 1;
                    appendToFile(output_file, line, Charset.defaultCharset());
                    if (unique) i.remove(); // Iterator.remove: safe removal mid-iteration
                    if (print) System.out.println(count + ": " + line);
                    break; // first matching term is enough for this line
                }
            }
        }
    }
    return found;
}
/*
 * Search by OLID. Fast (6 minutes for 100,000 OLID edition searches on 25gb editions dump)
 */
/**
 * Scans a dump file whose second tab-delimited column holds an Open Library
 * key and copies lines whose OLID (from the first 'O' onward) is in
 * {@code search} to {@code output_file} (cleared first).
 * Fixes: reader closed via try-with-resources; malformed lines (fewer than two
 * columns, or no 'O' in the key column) are skipped instead of throwing
 * ArrayIndexOutOfBoundsException / StringIndexOutOfBoundsException.
 *
 * @return number of matching lines written
 * @throws IOException if reading or writing fails
 */
public static int searchDataByOLID(File file, File output_file, Set<String> search, boolean print) throws IOException {
    clearFile(output_file);
    int count = 0;
    int found = 0;
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = reader.readLine()) != null) {
            count += 1;
            String[] parts = line.split(OLID_DELIM);
            if (parts.length < 2) continue; // malformed line: no key column
            int idStart = parts[1].indexOf(OLID_IDENT);
            if (idStart < 0) continue; // malformed key: no OLID marker
            String id = parts[1].substring(idStart);
            if (search.contains(id)) {
                found += 1;
                appendToFile(output_file, line, Charset.defaultCharset());
                if (print) System.out.println(count + ": " + line);
            }
        }
    }
    return found;
}
/*
 * Return Map of OLID => Json data up to MAX_JSON elements
 */
/**
 * Parses dump lines into a map of OLID to JSON record, starting at line
 * {@code begin} (1-based; 0 means from the start) and stopping after MAX_JSON
 * records, so huge outputs can be processed in chunks.
 * Fix: the reader is closed via try-with-resources even when parsing throws.
 *
 * @param file  file of tab-delimited dump lines with a trailing JSON payload
 * @param begin first line number to include
 * @param print when true, echo each parsed line
 * @return map of OLID to parsed JSON object
 * @throws IOException    if the file cannot be read
 * @throws ParseException if a line's JSON payload is invalid
 */
public static Map<String, JSONObject> parseData(File file, int begin, boolean print) throws IOException, ParseException {
    Map<String, JSONObject> data = new HashMap<String, JSONObject>();
    int lineNumber = 0;
    int count = 0;
    try (BufferedReader reader = new BufferedReader(new FileReader(file))) {
        String line;
        while ((line = reader.readLine()) != null && count < MAX_JSON) {
            lineNumber += 1;
            if (lineNumber < begin) continue; // skip lines before this chunk
            count += 1;
            // OLID sits in the second tab-delimited column, from the first 'O'.
            String[] parts = line.split(OLID_DELIM);
            int idStart = parts[1].indexOf(OLID_IDENT);
            String id = parts[1].substring(idStart);
            // The JSON payload spans from the first '{' to the last '}'.
            int jsonStart = line.indexOf(JSON_S);
            int jsonStop = line.lastIndexOf(JSON_E);
            String json = line.substring(jsonStart, jsonStop + 1);
            data.put(id, (JSONObject) parser.parse(json));
            if (print) System.out.println(count + ": " + line);
        }
    }
    return data;
}
/**
 * Builds one CSV row per author, matching author_csv_hdr:
 * ID, NAME, PROFILE, IMAGE, BIRTH, DEATH, LINKS.
 * Fix: each row is joined once and reused (the original joined twice when
 * printing).
 *
 * @param authors        map of author OLID to its parsed JSON record
 * @param fieldDelimiter separator between CSV columns
 * @param cellDelimiter  separator between values within a multi-valued column
 * @param print          when true, echo each row to stdout
 * @return one joined row per author
 */
public static List<String> getAuthorsCsv(Map<String, JSONObject> authors, String fieldDelimiter, String cellDelimiter, boolean print) {
    List<String> data = new ArrayList<String>(authors.size());
    for (Map.Entry<String, JSONObject> entry : authors.entrySet()) {
        List<String> field = new ArrayList<String>();
        String olid = entry.getKey();
        JSONObject json = entry.getValue();
        field.add(olid);
        field.add(getJsonString(json, "name"));
        field.add(author_profile_url + olid);
        field.add(author_cover_url + olid + MED_JPG);
        field.add(getJsonString(json, "birth_date"));
        field.add(getJsonString(json, "death_date"));
        field.add(getJsonHashString(json, "links", "url", cellDelimiter, false));
        String row = join(field, fieldDelimiter); // join once, reuse for output
        data.add(row);
        if (print) System.out.println(row);
    }
    return data;
}
/**
 * Builds one CSV row per work, matching works_csv_hdr:
 * ID, TITLE, PROFILE, IMAGE, AUTHORS, PUBLISHED, DESCRIPTION.
 * Fixes: the description type check uses {@code instanceof} instead of
 * catching ClassCastException (exceptions are not control flow), and each row
 * is joined once instead of twice.
 *
 * @param works          map of work OLID to its parsed JSON record
 * @param fieldDelimiter separator between CSV columns
 * @param cellDelimiter  separator between values within a multi-valued column
 * @param print          when true, echo each row to stdout
 * @return one joined row per work
 */
public static List<String> getWorksCsv(Map<String, JSONObject> works, String fieldDelimiter, String cellDelimiter, boolean print) {
    List<String> data = new ArrayList<String>(works.size());
    for (Map.Entry<String, JSONObject> entry : works.entrySet()) {
        List<String> field = new ArrayList<String>();
        String olid = entry.getKey();
        JSONObject json = entry.getValue();
        field.add(olid);
        // Title is decorated with the subtitle when one is present.
        String title = getJsonString(json, "title");
        String subtitle = getJsonString(json, "subtitle");
        if (!subtitle.isEmpty()) title = title + " - " + subtitle;
        field.add(title);
        field.add(works_profile_url + olid);
        field.add(works_cover_url + olid + MED_JPG);
        field.add(getJsonHashString(json, "authors", "author", "key", cellDelimiter, true));
        field.add(getJsonString(json, "first_publish_date"));
        // "description" is either a plain string or a {"value": ...} object.
        Object description = json.get("description");
        String desc = (description instanceof JSONObject)
                ? getJsonString((JSONObject) description, "value")
                : getJsonString(json, "description");
        field.add(removeNewLines(desc));
        String row = join(field, fieldDelimiter); // join once, reuse for output
        data.add(row);
        if (print) System.out.println(row);
    }
    return data;
}
/**
 * Builds one CSV row per edition, matching editions_csv_hdr:
 * ID, TITLE, WORKS, AUTHORS, OCLC, LCCN, ISBN10, ISBN13, PLACES, PUBLISHERS,
 * DATE, FORMAT, PAGES, PAGINATION, DIMENSIONS, SUBJECTS, SUBJECT PLACES,
 * SERIES, COVERS, LIBRARYTHING, GOODREADS.
 * Fixes: the FORMAT column now reads the dump's "physical_format" key (the old
 * "physical format" with a space never matched, leaving the column empty), and
 * each row is joined once instead of twice.
 *
 * @param editions       map of edition OLID to its parsed JSON record
 * @param fieldDelimiter separator between CSV columns
 * @param cellDelimiter  separator between values within a multi-valued column
 * @param print          when true, echo each row to stdout
 * @return one joined row per edition
 */
public static List<String> getEditionsCsv(Map<String, JSONObject> editions, String fieldDelimiter, String cellDelimiter, boolean print) {
    List<String> data = new ArrayList<String>(editions.size());
    for (Map.Entry<String, JSONObject> entry : editions.entrySet()) {
        List<String> field = new ArrayList<String>();
        field.add(entry.getKey());
        JSONObject json = entry.getValue();
        // Title is decorated with the subtitle and by_statement when present.
        String title = getJsonString(json, "title");
        String subtitle = getJsonString(json, "subtitle");
        String by = getJsonString(json, "by_statement");
        if (!subtitle.isEmpty()) title = title + " - " + subtitle;
        if (!by.isEmpty()) title = title + " - " + by;
        field.add(title);
        field.add(getJsonHashString(json, "works", "key", cellDelimiter, true));
        field.add(getJsonHashString(json, "authors", "key", cellDelimiter, true));
        field.add(getJsonArrayString(json, "oclc_numbers", cellDelimiter));
        field.add(getJsonArrayString(json, "lccn", cellDelimiter));
        field.add(getJsonArrayString(json, "isbn_10", cellDelimiter));
        field.add(getJsonArrayString(json, "isbn_13", cellDelimiter));
        field.add(getJsonArrayString(json, "publish_places", cellDelimiter));
        field.add(getJsonArrayString(json, "publishers", cellDelimiter));
        field.add(getJsonString(json, "publish_date"));
        // BUG FIX: dump field is "physical_format" (underscore), not "physical format".
        field.add(getJsonString(json, "physical_format"));
        field.add(getJsonString(json, "number_of_pages"));
        field.add(getJsonString(json, "pagination"));
        field.add(getJsonString(json, "physical_dimensions"));
        field.add(getJsonArrayString(json, "subjects", cellDelimiter));
        field.add(getJsonArrayString(json, "subject_places", cellDelimiter));
        field.add(getJsonArrayString(json, "series", cellDelimiter));
        field.add(getJsonArrayString(json, "covers", cellDelimiter));
        field.add(getJsonHashArrayString(json, "identifiers", "librarything", cellDelimiter));
        field.add(getJsonHashArrayString(json, "identifiers", "goodreads", cellDelimiter));
        String row = join(field, fieldDelimiter); // join once, reuse for output
        data.add(row);
        if (print) System.out.println(row);
    }
    return data;
}
/**
 * Returns the value for {@code key} as a string, or "" when {@code json} is
 * null or the key is absent/null. Non-string values are rendered with
 * {@code String.valueOf}. Fix: type is tested with {@code instanceof} instead
 * of catching ClassCastException (no exceptions as control flow).
 *
 * @param json record to read, may be null
 * @param key  field name
 * @return the string value, never null
 */
public static String getJsonString(JSONObject json, String key) {
    String value = "";
    if (json != null) {
        Object result = json.get(key);
        if (result != null) {
            value = (result instanceof String) ? (String) result : String.valueOf(result);
        }
    }
    return value;
}
/**
 * Joins the array stored under {@code json_obj -> key} with {@code delimiter},
 * returning "" when {@code json} or the nested object is missing.
 */
public static String getJsonHashArrayString(JSONObject json, String json_obj, String key, String delimiter) {
    if (json == null) {
        return "";
    }
    JSONObject nested = (JSONObject) json.get(json_obj);
    return nested == null ? "" : getJsonArrayString(nested, key, delimiter);
}
/**
 * Joins the JSON array stored under {@code key} with {@code delimiter},
 * returning "" when {@code json} or the array is missing.
 */
public static String getJsonArrayString(JSONObject json, String key, String delimiter) {
    if (json == null) {
        return "";
    }
    JSONArray items = (JSONArray) json.get(key);
    return items == null ? "" : join(items, delimiter);
}
/**
 * Joins the {@code sub_key} values of each object in the array under
 * {@code key}. When {@code olid} is true, each value is trimmed to start at
 * its first 'O' (the Open Library identifier).
 * Fixes: StringBuilder replaces string concatenation in the loop; the trailing
 * delimiter is trimmed by {@code delimiter.length()} rather than a hard-coded
 * single character (the old code broke for multi-character delimiters); a
 * value with no 'O' is kept whole instead of crashing on substring(-1).
 *
 * @return joined values, "" when the array is absent
 */
public static String getJsonHashString(JSONObject json, String key, String sub_key, String delimiter, boolean olid) {
    StringBuilder value = new StringBuilder();
    JSONArray result = (JSONArray) json.get(key);
    if (result != null) {
        for (Object x : result) {
            JSONObject a = (JSONObject) x;
            String element = getJsonString(a, sub_key);
            if (olid) {
                int idStart = element.indexOf(OLID_IDENT);
                if (idStart >= 0) element = element.substring(idStart);
            }
            value.append(element).append(delimiter);
        }
    }
    String out = value.toString();
    if (!out.isEmpty()) out = out.substring(0, out.length() - delimiter.length());
    return out;
}
/**
 * Joins the {@code sub_sub_key} values of the nested objects found under
 * {@code key -> sub_key}. When {@code olid} is true, each value is trimmed to
 * start at its first 'O' (the Open Library identifier).
 * Fixes mirror the five-argument overload: StringBuilder in the loop, trailing
 * delimiter trimmed by {@code delimiter.length()} instead of one character,
 * and values with no 'O' kept whole instead of crashing on substring(-1).
 *
 * @return joined values, "" when the array is absent
 */
public static String getJsonHashString(JSONObject json, String key, String sub_key, String sub_sub_key, String delimiter, boolean olid) {
    StringBuilder value = new StringBuilder();
    JSONArray result = (JSONArray) json.get(key);
    if (result != null) {
        for (Object x : result) {
            JSONObject a = (JSONObject) x;
            JSONObject b = (JSONObject) a.get(sub_key);
            String element = getJsonString(b, sub_sub_key);
            if (olid) {
                int idStart = element.indexOf(OLID_IDENT);
                if (idStart >= 0) element = element.substring(idStart);
            }
            value.append(element).append(delimiter);
        }
    }
    String out = value.toString();
    if (!out.isEmpty()) out = out.substring(0, out.length() - delimiter.length());
    return out;
}
/**
 * Concatenates the {@code toString()} of each element, separated by
 * {@code delimiter}. The separator is only inserted once the buffer is
 * non-empty, exactly as before (so leading empty elements add no separator).
 *
 * @param list      elements to join (must not contain null)
 * @param delimiter separator string
 * @return the joined string, "" for an empty list
 */
public static String join(List<?> list, String delimiter) {
    StringBuilder out = new StringBuilder();
    for (Object item : list) {
        if (out.length() > 0) {
            out.append(delimiter);
        }
        out.append(item.toString());
    }
    return out.toString();
}
/**
 * Replaces every line break with a single space: CRLF pairs first (so the pair
 * becomes one space), then any remaining lone CR or LF.
 */
public static String removeNewLines(String text) {
    return text.replace("\r\n", " ").replace("\r", " ").replace("\n", " ");
}
/**
 * Converts milliseconds to whole seconds, truncating toward zero.
 * Fix: integer division replaces the fragile float round-trip
 * {@code (long) (milli * 0.001)} -- same truncation, no floating point.
 *
 * @param milli a duration in milliseconds
 * @return the duration in whole seconds
 */
public static long millisToSeconds(long milli) {
    return milli / 1000;
}
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment