Skip to content

Instantly share code, notes, and snippets.

@naoa
Created April 20, 2014 17:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save naoa/11119518 to your computer and use it in GitHub Desktop.
Save naoa/11119518 to your computer and use it in GitHub Desktop.
<?php
/**
 * Imports MediaWiki-style XML dump files into a local Elasticsearch index.
 *
 * Usage: php <this script> <directory>
 *
 * Every regular file found directly inside <directory> is streamed with
 * XMLReader. For each <page> element with a non-empty <id>, a JSON document
 * holding "id", "title" and "text" (taken from <revision><text>) is sent with
 * PUT http://localhost:9200/wikipedia/text/<id>.
 *
 * Output: one "<id>,<seconds>" line per successfully indexed page and a
 * per-file summary on standard output; problems are reported on standard
 * error. The script exits with status 1 when the directory or one of its
 * files cannot be opened.
 */
if (!isset($argv[1])) {
    fwrite(STDERR, "Usage: php " . basename(__FILE__) . " <directory>\n");
    exit(1);
}
$article = $argv[1];

$handle = opendir($article);
if ($handle === false) {
    fwrite(STDERR, "Failed to open directory: " . $article . "\n");
    exit(1);
}

// A single cURL handle is shared by all requests so the HTTP connection to
// Elasticsearch can be reused instead of being re-established per document.
$ch = curl_init();
curl_setopt($ch, CURLOPT_PORT, 9200);
curl_setopt($ch, CURLOPT_CUSTOMREQUEST, "PUT");
curl_setopt($ch, CURLOPT_RETURNTRANSFER, 1);
curl_setopt($ch, CURLOPT_HEADER, 0);
// The body is JSON; declare it explicitly because cURL would otherwise label
// string POSTFIELDS as application/x-www-form-urlencoded.
curl_setopt($ch, CURLOPT_HTTPHEADER, array("Content-Type: application/json"));

while (false !== ($file = readdir($handle))) {
    $path = $article . "/" . $file;

    // readdir() also returns ".", ".." and subdirectories; only regular
    // files can be XML dumps.
    if (!is_file($path)) {
        continue;
    }

    echo "-------$file------\n";
    echo "XML loading and importing...\n";

    $xml = new XMLReader();
    if (!$xml->open($path)) {
        fwrite(STDERR, "Failed to open file!\n");
        exit(1);
    }

    $rc = 0;
    $startTimeAll = microtime(true);

    while ($xml->read()) {
        // Match only the opening <page> tag. The closing tag carries the same
        // name, and without the node-type check every page would be
        // serialised and parsed a second time.
        if ($xml->nodeType !== XMLReader::ELEMENT || $xml->name !== "page") {
            continue;
        }

        try {
            $node = new SimpleXMLElement($xml->readOuterXML());
        } catch (Exception $e) {
            fwrite(STDERR, "Skipping unparsable <page>: " . $e->getMessage() . "\n");
            continue;
        }

        $page = array(
            "id"    => (string)$node->id,
            "title" => (string)$node->title,
            "text"  => (string)$node->revision->text
        );
        if ($page['id'] === "") {
            continue;
        }

        $startTime = microtime(true);

        $qry = json_encode($page);
        if ($qry === false) {
            // json_encode() fails on e.g. invalid UTF-8; do not send garbage.
            fwrite(STDERR, $page['id'] . ": could not encode page as JSON\n");
            continue;
        }

        curl_setopt($ch, CURLOPT_URL, "http://localhost:9200/wikipedia/text/" . rawurlencode($page['id']));
        curl_setopt($ch, CURLOPT_POSTFIELDS, $qry);

        // With CURLOPT_RETURNTRANSFER the response body is the return value
        // (false on transport failure); nothing is written to the output.
        $data = curl_exec($ch);
        if ($data === false) {
            fwrite(STDERR, $page['id'] . ": request failed: " . curl_error($ch) . "\n");
            continue;
        }

        $status = (int)curl_getinfo($ch, CURLINFO_HTTP_CODE);
        if ($status < 200 || $status >= 300) {
            fwrite(STDERR, $page['id'] . ": HTTP " . $status . ": " . $data . "\n");
            continue;
        }

        $elapsedTime = microtime(true) - $startTime;
        echo $page['id'] . "," . $elapsedTime . "\n";
        $rc++;
    }

    $xml->close();
    $elapsedTimeAll = microtime(true) - $startTimeAll;
    echo $rc . " records done. Total " . $elapsedTimeAll . "\n";
}

curl_close($ch);
closedir($handle);
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment