Gist jexp/1375679 by @jexp, created November 18, 2011 05:25
package org.neo4j.performance.wiki;
/**
* @author mh
* @since 18.07.11
*/
import org.apache.commons.io.FileUtils;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.index.BatchInserterIndex;
import org.neo4j.graphdb.index.IndexManager;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.impl.lucene.LuceneBatchInserterIndexProvider;
import org.neo4j.kernel.impl.batchinsert.BatchInserterImpl;
import org.neo4j.mailvision.load.Config;
import org.neo4j.mailvision.load.Tracing;
import java.io.*;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import static org.neo4j.helpers.collection.MapUtil.map;
public class WikiImportBatch {
    public static final int BATCH_SIZE = 50000;
    public static final int ARTICLES = 25 * Config.MILLION;
    public static final int AUTHORS = ARTICLES / 5;
    public static final File FILE = new File("wiki_" + ARTICLES + ".csv");
    public static final File STORE_DIR = new File("target/wiki_" + ARTICLES);
    public static final int FILTER_SIZE = 1000000;

    enum MyRelationshipTypes implements RelationshipType { ARTICLE, WROTE, EDIT }
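
    // Generates the synthetic input CSV: one line per (article, author) pair, with one to
    // ten author entries per article, a constant timestamp, and fields separated by "%;% ".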
    private static void createFile(File file) throws IOException {
        Random rnd = new Random();
        BufferedWriter writer = new BufferedWriter(new FileWriter(file));
        for (int article = 0; article < ARTICLES; article++) {
            String articleName = String.format("%0" + (8 + rnd.nextInt(10)) + "d", article);
            final int articleAuthors = rnd.nextInt(10);
            for (int authors = articleAuthors; authors >= 0; authors--) {
                final int author = rnd.nextInt(AUTHORS);
                final int width = 8 + (author % 10);
                final String authorName = String.format("%0" + width + "d", author);
                writer.write(articleName + "%;% 2006-03-05T00:14:27Z%;% " + authorName + "\n");
            }
        }
        writer.close();
    }
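
    // Creates the CSV on first run, then imports it, tracing the elapsed time of both steps.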
    public static void main(String[] args) throws IOException {
        long time = System.currentTimeMillis();
        if (!FILE.exists()) {
            createFile(FILE);
        }
        time = Tracing.traceMillis(time, "Create File");
        BufferedReader bf = new BufferedReader(new FileReader(FILE));
        WikiImportBatch importBatch = new WikiImportBatch();
        importBatch.createGraphDatabase(bf);
        Tracing.traceMillis(time, "Create Database");
    }
    private String lastArticle = "";
    private BatchInserterImpl db;
    private IndexManager index;
    private BatchInserterIndex authorList;
    private int transactionCounter = 0;
    private long article;
    private boolean isFirstAuthor = false;
    private Node author;
    private Relationship relationship;
    private int node;
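
    // Imports the CSV with the non-transactional BatchInserter: articles and authors become
    // nodes, the first contributor of an article gets a WROTE relationship, later ones EDIT.
    // Authors are deduplicated via the Lucene batch index, guarded by a hash-bucket set so
    // that the index is only consulted when a name's hash bucket has been seen before.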
    private void createGraphDatabase(BufferedReader bf) throws IOException {
        if (STORE_DIR.exists()) FileUtils.cleanDirectory(STORE_DIR);
        STORE_DIR.mkdirs();
        db = new BatchInserterImpl(STORE_DIR.getAbsolutePath());
        final LuceneBatchInserterIndexProvider indexProvider = new LuceneBatchInserterIndexProvider(db);
        authorList = indexProvider.nodeIndex("Author", MapUtil.stringMap("type", "exact"));
        authorList.setCacheCapacity("Author", 1000000);
        long startTime = System.currentTimeMillis();
        long time = startTime;
        String zeile;
        final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
        int articles = 0;
        int authors = 0;
        //Map<String,Long> cache=new HashMap<String, Long>();
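        // Not a real Bloom filter: a set of author-name hash buckets. A miss means the
        // author is definitely new; a hit means the (possibly colliding) name has to be
        // checked against the Lucene index.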
        Set<Integer> bloomFilter = new HashSet<Integer>(FILTER_SIZE);
        try {
            // reads the lines of the CSV file
            while ((zeile = bf.readLine()) != null) {
                if (transactionCounter++ % BATCH_SIZE == 0) {
                    time = Tracing.traceMillis(time, String.format("lines %d articles %d authors %d", transactionCounter, articles, authors));
                }
                // each line looks like this: Article%;% Timestamp%;% Author
                String[] artikelinfo = zeile.split("%;% ");
                if (artikelinfo.length != 3) {
                    System.out.println("ERROR: check CSV");
                    for (int i = 0; i < artikelinfo.length; i++) {
                        System.out.println(artikelinfo[i]);
                    }
                    return;
                }
                final String articleName = artikelinfo[0];
                final String authorName = artikelinfo[2];
                final String timestamp = artikelinfo[1];
                if (!articleName.equals(lastArticle)) {
                    articles++;
                    article = createArticle(articleName);
                    lastArticle = articleName;
                    isFirstAuthor = true;
                }
                Long author = null;
                final int hash = authorName.hashCode() % FILTER_SIZE;
                if (bloomFilter.contains(hash)) {
                    author = authorList.get("Author", authorName).getSingle();
                }
                //final Long authorId = cache.get(authorName);
                //if (authorId!=null) author = db.getNodeById(authorId);
                if (author == null) {
                    author = createAuthor(authorName);
                    bloomFilter.add(hash);
                    // cache.put(authorName,author.getId());
                    authors++;
                }
                final MyRelationshipTypes relType = isFirstAuthor ? MyRelationshipTypes.WROTE : MyRelationshipTypes.EDIT;
                db.createRelationship(author, article, relType, map("Timestamp", sdf.parse(timestamp).getTime()));
                if (isFirstAuthor) { isFirstAuthor = false; }
            }
        } catch (Exception e) {
            // don't silently swallow IO or timestamp parse errors
            e.printStackTrace();
        } finally {
            bf.close();
        }
        Tracing.traceMillis(startTime, String.format("Import time lines %d articles %d authors %d", transactionCounter, articles, authors));
        indexProvider.shutdown();
        db.shutdown();
    }
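
    // Creates the author node, adds it to the "Author" index and flushes, so that later
    // authorList.get() lookups in the same run can already see it.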
    private long createAuthor(String authorName) {
        long author = db.createNode(map("Name", authorName));
        authorList.add(author, map("Author", authorName));
        authorList.flush();
        return author;
    }

    private long createArticle(String name) {
        return db.createNode(map("Article", name));
    }
}
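
Not part of the gist: a minimal sketch of how the imported store could be inspected afterwards. It assumes the same Neo4j 1.x embedded API the importer is written against; the class name WikiImportCheck and the use of program arguments for the store path and author name are made up for illustration.

import org.neo4j.graphdb.GraphDatabaseService;
import org.neo4j.graphdb.Node;
import org.neo4j.kernel.EmbeddedGraphDatabase;

public class WikiImportCheck {
    // args[0] = path to the store created by WikiImportBatch (its STORE_DIR),
    // args[1] = zero-padded author name as written into the CSV
    public static void main(String[] args) {
        GraphDatabaseService graphDb = new EmbeddedGraphDatabase(args[0]);
        try {
            // the importer indexed authors under the key "Author" and stored the name as "Name"
            Node author = graphDb.index().forNodes("Author").get("Author", args[1]).getSingle();
            System.out.println(author == null ? "author not found" : author.getProperty("Name"));
        } finally {
            graphDb.shutdown();
        }
    }
}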