Wikimedia clickstream import
<?php
/**
 * Wikimedia clickstream import.
 *
 * Reads the tab-separated clickstream dump line by line and loads it into
 * Neo4j. Every data row MERGEs the referring page and the target page as
 * :Page nodes; rows whose type is "link" or "other" additionally MERGE a
 * relationship between the two pages that carries the click count.
 *
 * Statements are queued on a transaction and committed in batches, so the
 * whole dataset is never sent in a single request.
 */
require_once(__DIR__.'/vendor/autoload.php');

use Neoxygen\NeoClient\ClientBuilder;

// NOTE(review): the fifth addConnection() argument presumably enables
// authentication with the credentials that follow - confirm against the
// NeoClient documentation.
$client = ClientBuilder::create()
    ->addConnection('default', 'http', 'localhost', 7474, true, 'neo4j', 'password')
    ->setAutoFormatResponse(true)
    ->setDefaultTimeout(500)
    ->build();

$datasetPath = __DIR__.'/dataset/2015_02_clickstream.tsv';
$handle = fopen($datasetPath, 'r');
if ($handle === false) {
    // Fail loudly instead of looping on an invalid handle.
    throw new \RuntimeException('Unable to open dataset file: ' . $datasetPath);
}

// Number of queued statements that triggers a commit.
$batchSize = 1000;

// Number of tab-separated columns read from each row:
// prev_id, curr_id, occurrences, prev_title, curr_title, type.
$expectedColumns = 6;

// Referrers that are not pages of their own; they are all collapsed onto
// one shared placeholder node id.
$extPrevs = ['other-google', 'other-wikipedia', 'other-other'];
$externalPageId = 99999999;

// Row type => relationship type created between the two pages. Row types
// not listed here only create/merge the two page nodes.
$relationshipTypes = [
    'link'  => 'TARGET_BY_LINK',
    'other' => 'TARGET_BY_SEARCH',
];

$mergePagesQuery = 'MERGE (page:Page {id: {prev_id}})
    ON CREATE SET page.title = {prev_title}
    MERGE (target:Page {id: {curr_id}})
    ON CREATE SET target.title = {curr_title}';

$tx = $client->prepareTransaction();

// Loop on the fgets() result rather than feof(): feof() only turns true
// after a read has already failed, which would otherwise produce one extra
// iteration with an empty row at the end of the file.
for ($i = 0; ($line = fgets($handle)) !== false; $i++) {
    echo 'Processing line ' . $i . PHP_EOL;

    // The first line is the column header.
    if ($i === 0) {
        continue;
    }

    $expl = explode("\t", $line);
    if (count($expl) < $expectedColumns) {
        // Blank or truncated line: importing it would create junk nodes.
        echo 'Skipping malformed line ' . $i . PHP_EOL;
        continue;
    }

    $prev_id    = trim($expl[0]);
    $curr_id    = trim($expl[1]);
    $occ        = (int) trim($expl[2]);
    $prev_title = trim($expl[3]);
    $curr_title = trim($expl[4]);
    $type       = trim($expl[5]);

    if (in_array($prev_title, $extPrevs, true)) {
        $prev_id = $externalPageId;
    }

    $q = $mergePagesQuery;
    if (isset($relationshipTypes[$type])) {
        // The relationship type comes from the fixed map above, never from
        // the input file, so concatenating it into the query is safe.
        // The "occurences" spelling is kept because it is the property name
        // stored in the graph.
        $q .= '
    MERGE (page)-[r:' . $relationshipTypes[$type] . ']->(target)
    SET r.occurences = {occ}';
    }

    $p = [
        'prev_id'    => (int) $prev_id,
        'curr_id'    => (int) $curr_id,
        'occ'        => $occ,
        'prev_title' => $prev_title,
        'curr_title' => $curr_title
    ];
    $tx->pushQuery($q, $p);

    if (count($tx->getStatements()) >= $batchSize) {
        $tx->commit();
        $tx = $client->prepareTransaction();
    }
}

fclose($handle);

// Flush the last, partially filled batch (nothing to send if it is empty).
if (count($tx->getStatements()) > 0) {
    $tx->commit();
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment