Skip to content

Instantly share code, notes, and snippets.

@ikwattro
Created March 17, 2015 19:30
Show Gist options
  • Save ikwattro/acf99e7354bbb14b9c4f to your computer and use it in GitHub Desktop.
Save ikwattro/acf99e7354bbb14b9c4f to your computer and use it in GitHub Desktop.
Wikimedia clickstream import
<?php
require_once(__DIR__.'/vendor/autoload.php');
use Neoxygen\NeoClient\ClientBuilder;
// Import the Wikimedia clickstream TSV into Neo4j.
//
// Each data row becomes two MERGEd :Page nodes (referrer and target) and,
// depending on the row's type column, a TARGET_BY_LINK or TARGET_BY_SEARCH
// relationship carrying the occurrence count. Statements are sent to the
// server in batches of 1000 per transaction.

// NOTE(review): connection settings and credentials are hard-coded; the meaning
// of the positional addConnection() arguments and the unit of the timeout should
// be confirmed against the NeoClient version installed in vendor/.
$client = ClientBuilder::create()
    ->addConnection('default', 'http', 'localhost', 7474, true, 'neo4j', 'password')
    ->setAutoFormatResponse(true)
    ->setDefaultTimeout(500)
    ->build();

$datasetPath = __DIR__.'/dataset/2015_02_clickstream.tsv';
$handle = fopen($datasetPath, 'r');
if (false === $handle) {
    // Without this guard feof()/fgets() would be called on `false`, which loops
    // forever emitting warnings on older PHP versions.
    throw new RuntimeException('Unable to open clickstream dataset: ' . $datasetPath);
}

// Loop-invariant data, built once instead of on every row.
// Referrer titles that have no page id of their own; they share one synthetic id.
$extPrevs = ['other-google', 'other-wikipedia', 'other-other'];
$externalPrevId = 99999999;

// Row type => relationship type. Rows with any other type only merge the nodes.
$relationshipByType = [
    'link' => 'TARGET_BY_LINK',
    'other' => 'TARGET_BY_SEARCH',
];

$mergePagesQuery = implode("\n", [
    'MERGE (page:Page {id: {prev_id}})',
    'ON CREATE SET page.title = {prev_title}',
    'MERGE (target:Page {id: {curr_id}})',
    'ON CREATE SET target.title = {curr_title}',
]);

$batchSize = 1000;
$tx = $client->prepareTransaction();
$i = 0;

// Read until fgets() reports EOF. Testing feof() before reading runs the loop
// body one extra time with $line === false, which used to push a bogus query.
while (false !== ($line = fgets($handle))) {
    $lineNumber = $i;
    $i++;
    echo 'Processing line ' . $lineNumber . PHP_EOL;

    // Line 0 is the TSV header.
    if (0 === $lineNumber) {
        continue;
    }

    $expl = explode("\t", $line);
    if (count($expl) < 6) {
        // Blank or truncated row: nothing sensible to import.
        echo 'Skipping malformed line ' . $lineNumber . PHP_EOL;
        continue;
    }

    list($prev_id, $curr_id, $occ, $prev_title, $curr_title, $type) = array_map('trim', $expl);

    if (in_array($prev_title, $extPrevs, true)) {
        $prev_id = $externalPrevId;
    }

    $q = $mergePagesQuery;
    if (isset($relationshipByType[$type])) {
        // The "occurences" spelling is kept as-is: it is the property name
        // already written to the graph by earlier runs.
        $q .= "\n" . 'MERGE (page)-[r:' . $relationshipByType[$type] . ']->(target)'
            . "\n" . 'SET r.occurences = {occ}';
    }

    $p = [
        'prev_id' => (int) $prev_id,
        'curr_id' => (int) $curr_id,
        'occ' => (int) $occ,
        'prev_title' => $prev_title,
        'curr_title' => $curr_title
    ];
    $tx->pushQuery($q, $p);

    // Flush a full batch and start a fresh transaction.
    if (count($tx->getStatements()) >= $batchSize) {
        $tx->commit();
        $tx = $client->prepareTransaction();
    }
}
fclose($handle);

// Commit the remainder, if any; skip the round-trip when the last batch was
// flushed exactly at the end of the file.
if (count($tx->getStatements()) > 0) {
    $tx->commit();
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment