WikiImportBatch.java
package org.neo4j.performance.wiki;

import org.apache.commons.io.FileUtils;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.index.BatchInserterIndex;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.impl.lucene.LuceneBatchInserterIndexProvider;
import org.neo4j.kernel.impl.batchinsert.BatchInserterImpl;
import org.neo4j.mailvision.load.Config;
import org.neo4j.mailvision.load.Tracing;

import java.io.*;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;

import static org.neo4j.helpers.collection.MapUtil.map;

/**
 * Generates a synthetic wiki CSV file (article, timestamp, author) and
 * batch-imports it into a Neo4j store via the batch inserter API, using a
 * hash-based filter to skip index lookups for authors not seen before.
 *
 * @author mh
 * @since 18.07.11
 */
public class WikiImportBatch {
 
    public static final int BATCH_SIZE = 50000;             // lines between progress traces
    public static final int ARTICLES = 25 * Config.MILLION; // articles to generate
    public static final int AUTHORS = ARTICLES / 5;         // distinct author ids
    public static final File FILE = new File("wiki_" + ARTICLES + ".csv");
    public static final File STORE_DIR = new File("target/wiki_" + ARTICLES);
    public static final int FILTER_SIZE = 1000000;           // capacity and hash modulus of the author filter
 
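    // WROTE links an article's first author, EDIT every later contributor; ARTICLE is unused here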
    enum MyRelationshipTypes implements RelationshipType { ARTICLE, WROTE, EDIT }
 
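    /**
     * Writes a "%;% "-separated CSV of (article, timestamp, author) lines,
     * with 1-10 author rows per article and zero-padded names of varying width.
     */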
    private static void createFile(File file) throws IOException {
        Random rnd = new Random();
        BufferedWriter writer = new BufferedWriter(new FileWriter(file));
        for (int article = 0; article < ARTICLES; article++) {
            // zero-pad the article id to a random width between 8 and 17 digits
            String articleName = String.format("%0" + (8 + rnd.nextInt(10)) + "d", article);
            final int articleAuthors = rnd.nextInt(10);
            for (int authors = articleAuthors; authors >= 0; authors--) {
                final int author = rnd.nextInt(AUTHORS);
                // the padding width is derived from the author id, so the same
                // author always gets the same name
                final int width = 8 + (author % 10);
                final String authorName = String.format("%0" + width + "d", author);
                writer.write(articleName + "%;% 2006-03-05T00:14:27Z%;% " + authorName + "\n");
            }
        }
        writer.close();
    }
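
    /** Generates the CSV if missing, then runs the import, tracing both phases. */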
    public static void main(String[] args) throws IOException {
        long time = System.currentTimeMillis();
        if (!FILE.exists()) {
            createFile(FILE);
        }
        time = Tracing.traceMillis(time, "Create File");
        BufferedReader bf = new BufferedReader(new FileReader(FILE));
        try {
            WikiImportBatch importBatch = new WikiImportBatch();
            importBatch.createGraphDatabase(bf);
        } finally {
            bf.close();
        }
        Tracing.traceMillis(time, "Create Database");
    }
 
private String lastArticle = "";
private BatchInserterImpl db;
private IndexManager index;
private BatchInserterIndex authorList;
private int transactionCounter = 0;
private long article;
private boolean isFirstAuthor = false;
private Node author;
private Relationship relationship;
private int node;
 
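    /**
     * Streams the CSV through the batch inserter: one node per distinct article,
     * one node per distinct author (deduplicated via a hash filter backed by a
     * Lucene index lookup), and a WROTE/EDIT relationship per input line.
     */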
    private void createGraphDatabase(BufferedReader bf) throws IOException {
        if (STORE_DIR.exists()) FileUtils.cleanDirectory(STORE_DIR);
        STORE_DIR.mkdirs();
        db = new BatchInserterImpl(STORE_DIR.getAbsolutePath());
        final LuceneBatchInserterIndexProvider indexProvider = new LuceneBatchInserterIndexProvider(db);
        authorList = indexProvider.nodeIndex("Author", MapUtil.stringMap("type", "exact"));
        authorList.setCacheCapacity("Author", 1000000);

        long startTime = System.currentTimeMillis();
        long time = startTime;
        String zeile; // current CSV line
        final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'");
        int articles = 0;
        int authors = 0;
        //Map<String,Long> cache = new HashMap<String, Long>();
        Set<Integer> bloomFilter = new HashSet<Integer>(FILTER_SIZE);
        try {
            // read the CSV file line by line
            while ((zeile = bf.readLine()) != null) {
                if (transactionCounter++ % BATCH_SIZE == 0) {
                    time = Tracing.traceMillis(time, String.format("lines %d articles %d authors %d", transactionCounter, articles, authors));
                }
                // each line looks like: Article%;% Timestamp%;% Author
                String[] artikelinfo = zeile.split("%;% ");
                if (artikelinfo.length != 3) {
                    System.out.println("ERROR: malformed CSV line, aborting import");
                    for (int i = 0; i < artikelinfo.length; i++) {
                        System.out.println(artikelinfo[i]);
                    }
                    return;
                }

                final String articleName = artikelinfo[0];
                final String timestamp = artikelinfo[1];
                final String authorName = artikelinfo[2];

                // the file is grouped by article, so a new name means a new article node
                if (!articleName.equals(lastArticle)) {
                    articles++;
                    article = createArticle(articleName);
                    lastArticle = articleName;
                    isFirstAuthor = true;
                }
                Long author = null;
                // the hash set acts as a Bloom-style filter: only names whose hash
                // was seen before are looked up in the (slower) Lucene index;
                // a negative hash value is harmless since it is used consistently
                final int hash = authorName.hashCode() % FILTER_SIZE;
                if (bloomFilter.contains(hash)) {
                    author = authorList.get("Author", authorName).getSingle();
                }
                //final Long authorId = cache.get(authorName);
                //if (authorId != null) author = db.getNodeById(authorId);
                if (author == null) {
                    author = createAuthor(authorName);
                    bloomFilter.add(hash);
                    // cache.put(authorName, author.getId());
                    authors++;
                }
                // the first author of an article WROTE it, everyone after that EDITs it
                final MyRelationshipTypes relType = isFirstAuthor ? MyRelationshipTypes.WROTE : MyRelationshipTypes.EDIT;
                db.createRelationship(author, article, relType, map("Timestamp", sdf.parse(timestamp).getTime()));
                isFirstAuthor = false;
            }
        } catch (Exception e) {
            e.printStackTrace(); // was silently swallowed before, hiding e.g. timestamp parse errors
        }
        Tracing.traceMillis(startTime, String.format("Import time lines %d articles %d authors %d", transactionCounter, articles, authors));
        indexProvider.shutdown();
        db.shutdown();
    }
 
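    /**
     * Creates an author node and indexes it by name; the flush makes the entry
     * visible to subsequent index lookups at the cost of per-insert overhead.
     */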
    private long createAuthor(String authorName) {
        long author = db.createNode(map("Name", authorName));
        authorList.add(author, map("Author", authorName));
        authorList.flush();
        return author;
    }
 
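    /** Creates an article node carrying its name as the "Article" property. */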
    private long createArticle(String name) {
        return db.createNode(map("Article", name));
    }
}
