Skip to content

Instantly share code, notes, and snippets.

@naoa
Last active December 29, 2015 04:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save naoa/7618016 to your computer and use it in GitHub Desktop.
Save naoa/7618016 to your computer and use it in GitHub Desktop.
This script imports Wikipedia page XML files into a MySQL database. Usage: php wiki_page_import.php <database> <wiki_articles_xml_dir> <output_file>
<?php
/**
 * Import Wikipedia page XML files into a MySQL table and log per-row insert timings.
 *
 * Usage: php wiki_page_import.php <database> <wiki_articles_xml_dir> <output_file>
 *
 * Every regular file in <wiki_articles_xml_dir> is streamed with XMLReader. For each
 * <page> element the id, title and revision text are inserted into the target table
 * with INSERT IGNORE, and one line "<page id>,<elapsed seconds>" is appended to
 * <output_file>. The run aborts on the first failed INSERT.
 */

/**
 * Print an error message to STDERR and terminate with a non-zero exit status.
 *
 * @param string $message Text shown to the operator.
 */
function wiki_page_import_fail($message)
{
    fwrite(STDERR, $message . "\n");
    exit(1);
}

if (!isset($argv[1], $argv[2], $argv[3])) {
    wiki_page_import_fail('Usage: php wiki_page_import.php <database> <wiki_articles_xml_dir> <output_file>');
}

$db = "localhost";
$db_name = $argv[1];
$table = "text";
$username = "mysql";
$password = "";
$article = $argv[2];
$output_file = $argv[3];

// The timing log is opened once and stays open for the whole run.
$fw = fopen($output_file, 'a');
if ($fw === false) {
    wiki_page_import_fail('Failed to open output file!');
}

$handle = opendir($article);
if ($handle === false) {
    wiki_page_import_fail('Failed to open directory!');
}

// Results are checked explicitly below, so disable mysqli's own error reporting
// (its default mode differs between PHP versions).
mysqli_report(MYSQLI_REPORT_OFF);

// One connection serves every input file.
$con = mysqli_connect($db, $username, $password);
if (!$con) {
    wiki_page_import_fail('Database connection error');
}
if (!mysqli_select_db($con, $db_name)) {
    wiki_page_import_fail('Database select error');
}

// Prepared once and re-executed per page; values travel as bound parameters
// instead of being concatenated into the SQL text.
$stmt = mysqli_prepare($con, "INSERT IGNORE INTO `" . $table . "` VALUES (?, ?, ?)");
if (!$stmt) {
    wiki_page_import_fail('Database prepare error: ' . mysqli_error($con));
}

$page_id = '';
$page_title = '';
$page_text = '';
// All three columns are bound as strings; the id is sent as text and converted
// by the server, which avoids integer-size limits on the PHP side.
if (!mysqli_stmt_bind_param($stmt, 'sss', $page_id, $page_title, $page_text)) {
    wiki_page_import_fail('Database bind error: ' . mysqli_stmt_error($stmt));
}

while (false !== ($file = readdir($handle))) {
    $path = $article . "/" . $file;

    // readdir() also returns "." and ".." (and any sub-directories); only files are imported.
    if (!is_file($path)) {
        continue;
    }

    echo "-------$file------\n";
    echo "XML loading and SQL importing...\n";

    $xml = new XMLReader();
    if (!$xml->open($path)) {
        wiki_page_import_fail('Failed to open file!');
    }

    $rc = 0;
    while ($xml->read()) {
        // Only the opening <page> tag is of interest; the reader reports the
        // closing tag under the same name.
        if ($xml->nodeType !== XMLReader::ELEMENT || $xml->name !== "page") {
            continue;
        }

        $node = new SimpleXMLElement($xml->readOuterXML());
        $page_id = (string) $node->id;
        if ($page_id === "") {
            continue;
        }
        $page_title = (string) $node->title;
        $page_text = (string) $node->revision->text;

        $startTime = microtime(true);
        $inserted = mysqli_stmt_execute($stmt);
        $elapsedTime = microtime(true) - $startTime;

        if (!$inserted) {
            $reason = mysqli_stmt_error($stmt);
            $xml->close();
            mysqli_stmt_close($stmt);
            mysqli_close($con);
            fclose($fw);
            wiki_page_import_fail("INSERT ERROR aborted: " . $reason);
        }

        fwrite($fw, $page_id . "," . $elapsedTime . "\n");
        $rc++;
    }

    $xml->close();
    echo $rc . " records done.\n";
}

closedir($handle);
mysqli_stmt_close($stmt);

if (!mysqli_close($con)) {
    wiki_page_import_fail('Database close error');
}
if (!fclose($fw)) {
    wiki_page_import_fail('Failed to close output file!');
}
// The closing tag is kept on purpose: non-PHP text follows this script in the file.
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment