Skip to content

Instantly share code, notes, and snippets.

@naoa
Last active August 29, 2015 14:04
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save naoa/ad19bc1d2f7debb2da87 to your computer and use it in GitHub Desktop.
Save naoa/ad19bc1d2f7debb2da87 to your computer and use it in GitHub Desktop.
<?php
/**
 * Benchmark: measures how long Elasticsearch takes to make a freshly indexed
 * document searchable.
 *
 * Usage: php <this-script> <wikipedia-dump.xml>
 *
 * For each <page> element in the dump (up to 1000 pages with a non-empty id):
 *   1. PUT the page as a JSON document into the "wikipedia/text" index/type.
 *   2. Poll the _search endpoint with a phrase query on the unique
 *      "<id>_<title>" value until the search reports at least one hit.
 *   3. Print one CSV-like line:
 *        record number, current timestamp, update time, search time, total time
 * A summary line with the record count and overall elapsed time is printed last.
 */

/**
 * Writes a message to STDERR and terminates with a non-zero exit status.
 *
 * @param string $message Human-readable description of the failure.
 */
function fail($message)
{
    fwrite(STDERR, $message . "\n");
    exit(1);
}

/**
 * Encodes a document as JSON, aborting the script if encoding fails.
 *
 * @param array $document Data to serialise.
 * @return string JSON text.
 */
function encodeJsonOrFail(array $document)
{
    $json = json_encode($document);
    if ($json === false) {
        fail('Failed to encode JSON (json_last_error = ' . json_last_error() . ')');
    }
    return $json;
}

/**
 * Creates a cURL handle configured to send a JSON body with the given method.
 * The handle is returned un-executed so callers can time and repeat the request.
 *
 * @param string $method HTTP method, e.g. "PUT" or "GET".
 * @param string $url    Full request URL (including port).
 * @param string $body   Already-encoded JSON request body.
 * @return resource|object cURL handle (resource before PHP 8, CurlHandle after).
 */
function openJsonRequest($method, $url, $body)
{
    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, $url);
    curl_setopt($ch, CURLOPT_CUSTOMREQUEST, $method);
    curl_setopt($ch, CURLOPT_POSTFIELDS, $body);
    // Without this, cURL labels the body as application/x-www-form-urlencoded.
    curl_setopt($ch, CURLOPT_HTTPHEADER, array('Content-Type: application/json'));
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    curl_setopt($ch, CURLOPT_HEADER, false);
    return $ch;
}

/**
 * Executes a prepared cURL handle and returns the response body.
 * Aborts the script on a transport-level failure instead of continuing with
 * a false response.
 *
 * @param resource|object $ch Handle created by openJsonRequest().
 * @return string Response body.
 */
function execOrFail($ch)
{
    $response = curl_exec($ch);
    if ($response === false) {
        $error = curl_error($ch);
        curl_close($ch);
        fail('HTTP request failed: ' . $error);
    }
    return $response;
}

if (!isset($argv[1])) {
    fail('Usage: php ' . (isset($argv[0]) ? $argv[0] : 'script.php') . ' <wikipedia-dump.xml>');
}
$article = $argv[1];

$baseUrl = 'http://localhost:9200/wikipedia/text/';
$maxRecords = 1000;
// Delay between search polls, in microseconds (100 usec = 0.1 msec).
// NOTE(review): the original comment said "10 msec", which would be 10000;
// the runtime value is kept as it was so measurements stay comparable.
$pollIntervalUsec = 100;

$xml = new XMLReader();
if (!$xml->open($article)) {
    fail('Failed to open file!');
}

$rc = 1;
$startTimeAll = microtime(true);
while ($xml->read()) {
    // The reader reports the name "page" for the closing tag as well, so the
    // node type must be checked to handle each page exactly once.
    if ($xml->nodeType !== XMLReader::ELEMENT || $xml->name !== 'page') {
        continue;
    }

    $node = new SimpleXMLElement($xml->readOuterXML());
    $id = (string) $node->id;
    $title = (string) $node->title;
    $text = (string) $node->revision->text;
    if ($id === '') {
        continue;
    }

    // Unique per-document value that the search below looks for.
    $docTitle = $id . '_' . $title;

    // --- Index the document -------------------------------------------------
    $update_startTime = microtime(true);
    $qry = encodeJsonOrFail(array(
        '_key' => $id,
        'title' => $docTitle,
        'text' => $text,
    ));
    $ch = openJsonRequest('PUT', $baseUrl . rawurlencode($id), $qry);
    execOrFail($ch);
    curl_close($ch);
    $updateTime = microtime(true) - $update_startTime;

    // --- Poll until the document is searchable ------------------------------
    $qry = encodeJsonOrFail(array(
        'from' => '0',
        'size' => '0',
        'fields' => array('id'),
        'query' => array(
            'query_string' => array(
                'default_field' => 'title',
                // Quotes and backslashes inside the phrase must be escaped,
                // otherwise a title containing them breaks the query syntax.
                'query' => '"' . addcslashes($docTitle, '"\\') . '"',
            ),
        ),
    ));
    $search_startTime = microtime(true);
    // One handle is reused for every poll of this document.
    $ch = openJsonRequest('GET', $baseUrl . '_search', $qry);
    while (true) {
        $data = execOrFail($ch);
        $json = json_decode($data);
        if (!isset($json->hits->total)) {
            // An error or malformed response would otherwise be read as
            // "0 hits" and keep this loop spinning forever.
            curl_close($ch);
            fail('Unexpected search response: ' . $data);
        }
        if ((int) $json->hits->total !== 0) {
            break;
        }
        usleep($pollIntervalUsec);
    }
    curl_close($ch);
    $searchTime = microtime(true) - $search_startTime;
    $allTime = microtime(true) - $update_startTime;
    $now = microtime(true);
    echo "{$rc}, {$now}, {$updateTime}, {$searchTime}, {$allTime}\n";

    $rc++;
    if ($rc > $maxRecords) {
        break;
    }
}
$xml->close();
$endTimeAll = microtime(true);
$elapsedTimeAll = $endTimeAll - $startTimeAll;
// $rc is one past the number of processed records at this point.
$rc--;
echo $rc . " records done. Total = " . $elapsedTimeAll . "\n";
// The closing tag is kept on purpose: non-PHP text follows it in this file.
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment