Created
November 18, 2011 05:25
-
-
Save jexp/1375679 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
package org.neo4j.performance.wiki; | |
/**
 * @author mh
 * @since 18.07.11
 */
import org.apache.commons.io.FileUtils;
import org.neo4j.graphdb.Node;
import org.neo4j.graphdb.Relationship;
import org.neo4j.graphdb.RelationshipType;
import org.neo4j.graphdb.index.BatchInserterIndex;
import org.neo4j.graphdb.index.IndexManager;
import org.neo4j.helpers.collection.MapUtil;
import org.neo4j.index.impl.lucene.LuceneBatchInserterIndexProvider;
import org.neo4j.kernel.impl.batchinsert.BatchInserterImpl;
import org.neo4j.mailvision.load.Config;
import org.neo4j.mailvision.load.Tracing;

import java.io.*;
import java.text.ParseException;
import java.text.SimpleDateFormat;
import java.util.HashSet;
import java.util.Random;
import java.util.Set;
import java.util.TimeZone;

import static org.neo4j.helpers.collection.MapUtil.map;
public class WikiImportBatch { | |
public static final int BATCH_SIZE = 50000; | |
public static final int ARTICLES = 25 * Config.MILLION; | |
public static final int AUTHORS = ARTICLES / 5; | |
public static final File FILE = new File("wiki_"+ARTICLES+".csv"); | |
public static final File STORE_DIR = new File("target/wiki_"+ARTICLES); | |
public static final int FILTER_SIZE = 1000000; | |
enum MyRelationshipTypes implements RelationshipType { ARTICLE, WROTE, EDIT} | |
private static void createFile(File file) throws IOException { | |
Random rnd = new Random(); | |
BufferedWriter writer=new BufferedWriter(new FileWriter(file)); | |
for (int article=0;article<ARTICLES;article++) { | |
String articleName = String.format("%0"+(8+rnd.nextInt(10))+"d",article); | |
final int articleAuthors = rnd.nextInt(10); | |
for (int authors= articleAuthors;authors>=0;authors--) { | |
final int author = rnd.nextInt(AUTHORS); | |
final int width = 8 + (author % 10); | |
final String authorName = String.format("%0" + width + "d", author); | |
writer.write(articleName+"%;% 2006-03-05T00:14:27Z%;% "+authorName+"\n"); | |
} | |
} | |
writer.close(); | |
} | |
public static void main(String[] args) throws IOException { | |
long time = System.currentTimeMillis(); | |
if (!FILE.exists()) { | |
createFile(FILE); | |
} | |
time = Tracing.traceMillis(time,"Create File"); | |
BufferedReader bf = new BufferedReader(new FileReader(FILE)); | |
WikiImportBatch importBatch = new WikiImportBatch(); | |
importBatch.createGraphDatabase(bf); | |
Tracing.traceMillis(time,"Create Database"); | |
} | |
private String lastArticle = ""; | |
private BatchInserterImpl db; | |
private IndexManager index; | |
private BatchInserterIndex authorList; | |
private int transactionCounter = 0; | |
private long article; | |
private boolean isFirstAuthor = false; | |
private Node author; | |
private Relationship relationship; | |
private int node; | |
private void createGraphDatabase(BufferedReader bf) throws IOException { | |
if (STORE_DIR.exists()) FileUtils.cleanDirectory(STORE_DIR); | |
STORE_DIR.mkdirs(); | |
db = new BatchInserterImpl(STORE_DIR.getAbsolutePath()); | |
final LuceneBatchInserterIndexProvider indexProvider = new LuceneBatchInserterIndexProvider(db); | |
authorList = indexProvider.nodeIndex("Author", MapUtil.stringMap("type", "exact")); | |
authorList.setCacheCapacity("Author", 1000000); | |
long startTime = System.currentTimeMillis(); | |
long time = startTime; | |
String zeile; | |
final SimpleDateFormat sdf = new SimpleDateFormat("yyyy-MM-dd'T'HH:mm:ss'Z'"); | |
int articles = 0; | |
int authors = 0; | |
//Map<String,Long> cache=new HashMap<String, Long>(); | |
Set<Integer> bloomFilter = new HashSet<Integer>(FILTER_SIZE); | |
try { | |
// reads lines of CSV-file | |
while ((zeile = bf.readLine()) != null) { | |
if (transactionCounter++ % BATCH_SIZE == 0) { | |
time = Tracing.traceMillis(time, String.format("lines %d articles %d authors %d", transactionCounter, articles, authors)); | |
} | |
// String[] looks like this: Article%;% Timestamp%;% Author | |
String[] artikelinfo = zeile.split("%;% "); | |
if (artikelinfo.length != 3) { | |
System.out.println("ERROR: check CSV"); | |
for (int i = 0; i < artikelinfo.length; i++) { | |
System.out.println(artikelinfo[i]); | |
} | |
return; | |
} | |
final String articleName = artikelinfo[0]; | |
final String authorName = artikelinfo[2]; | |
final String timestamp = artikelinfo[1]; | |
if (!articleName.equals(lastArticle)) { | |
articles++; | |
article = createArticle(articleName); | |
lastArticle = articleName; | |
isFirstAuthor = true; | |
} | |
Long author = null; | |
final int hash = authorName.hashCode() % FILTER_SIZE; | |
if (bloomFilter.contains(hash)) { | |
author = authorList.get("Author", authorName).getSingle(); | |
} | |
//final Long authorId = cache.get(authorName); | |
//if (authorId!=null) author = db.getNodeById(authorId); | |
if (author==null) { | |
author = createAuthor(authorName); | |
bloomFilter.add(hash); | |
// cache.put(authorName,author.getId()); | |
authors++; | |
} | |
final MyRelationshipTypes relType = isFirstAuthor ? MyRelationshipTypes.WROTE : MyRelationshipTypes.EDIT; | |
db.createRelationship(author, article, relType, map("Timestamp", sdf.parse(timestamp).getTime())); | |
if (isFirstAuthor) { isFirstAuthor = false; } | |
} | |
} catch (Exception e) { | |
} finally { | |
} | |
Tracing.traceMillis(startTime, String.format("Import time lines %d articles %d authors %d",transactionCounter,articles,authors)); | |
indexProvider.shutdown(); | |
db.shutdown(); | |
} | |
private long createAuthor(String authorName) { | |
long author = db.createNode(map("Name", authorName)); | |
authorList.add(author, map("Author", authorName)); | |
authorList.flush(); | |
return author; | |
} | |
private long createArticle(String name) { | |
return db.createNode(map("Article", name)); | |
} | |
} |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment