Last active
August 29, 2015 14:20
-
-
Save donwilson/0a5a09a3abc63e75c638 to your computer and use it in GitHub Desktop.
An automated Wikipedia data dump downloader written in PHP. This tool checks for the latest complete English Wikipedia data dump and automatically downloads, extracts, and stores articles into individual files for fast reads. Optionally stores relevant info into an 'articles' mysql database table.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Automated Wikipedia data-dump downloader: configuration / bootstrap.
// The script runs for hours and buffers large XML chunks, so lift the
// usual CLI limits up front.
set_time_limit(0);
ini_set('memory_limit', "1024M");

// Encoding settings.
// FIX(review): setlocale()/mb_internal_encoding() report failure via a
// false return value -- they never throw -- so the original try/catch was
// dead code. Locale setup stays best-effort (same effective behavior),
// with common spellings of the UTF-8 locale tried in order.
if (false === setlocale(LC_ALL, 'en_US.UTF8', 'en_US.utf8', 'en_US.UTF-8')) {
    // Locale unavailable on this host; mbstring below still enforces UTF-8.
}
mb_internal_encoding("UTF-8");

define('DATA_DIR', __DIR__);     // base working directory for all output
define('CHOWN_USER', "apache");  // who to chown the downloaded folders+files
define('DB_HOST', "localhost");
define('DB_USER', "");
define('DB_PASS', "");
define('DB_NAME', "dump_wikipedia");
define('DB_ARTICLES_TABLE', "articles");
// Automated wiki data dump downloading.
// Scrape the backup index for the most recent enwiki dump date (YYYYMMDD).
// FIX(review): request https directly -- dumps.wikimedia.org is https-only
// and the old http URL only worked via a redirect.
if(false === ($contents = file_get_contents("https://dumps.wikimedia.org/backup-index.html"))) {
    print "Unable to check on status of wiki dump.". PHP_EOL;
    die;
}

// NOTE(review): an earlier (commented-out) variant waited for the whole
// enwiki dump to report "Dump complete". Only the date is needed here;
// readiness of the pages-articles files is verified against the per-dump
// status page further down.

// Only look for the enwiki date.
if(!preg_match("#<a href=\"enwiki\/([0-9]{8})\">#si", $contents, $match)) {
    print "Wiki dumping not completed.". PHP_EOL;
    die;
}
unset($contents); // the index page is large; free it immediately

$latest_wiki_date = trim($match[1]);
// Defensive re-validation of the captured date token.
if(!preg_match("#^[0-9]{8}$#si", $latest_wiki_date)) {
    print "latest_wiki_date malformed.". PHP_EOL;
    die;
}
// Working directories (created further down if missing).
$base_dir = DATA_DIR ."/wiki_data";
$data_dir = $base_dir ."/raw_data";     // downloaded/extracted dump files
$page_dir = $base_dir ."/pages_by_id";  // one text file per article

// done.txt records the dump date that was last fully imported.
if(is_file($base_dir ."/done.txt")) {
    // FIX(review): trim + strict compare -- the file may carry a trailing
    // newline, and loose "==" compares two numeric strings numerically.
    $done_contents = trim(file_get_contents($base_dir ."/done.txt"));
    if($done_contents === $latest_wiki_date) {
        // already downloaded
        print "Already done with the current month";
        die;
    }
    // A new dump is available: clear the stale marker and continue.
    unlink($base_dir ."/done.txt");
}

// pending.txt holds the unix timestamp of a run that is still in progress.
if(is_file($base_dir ."/pending.txt")) {
    $pending_date = trim(file_get_contents($base_dir ."/pending.txt"));
    if(is_numeric($pending_date) && ((int)$pending_date >= (time() - (60 * 60 * 24)))) {
        // allow at least a day for the previous attempt to finish
        print "Being processed elsewhere";
        die;
    }
    // The previous attempt looks dead; take over.
    unlink($base_dir ."/pending.txt");
}
// Make sure each working directory exists with the right mode and owner.
// Errors are deliberately suppressed: the directories may already exist,
// and chown only succeeds when the script runs as root.
foreach(array($base_dir, $data_dir, $page_dir) as $dir) {
    @mkdir($dir, 0755);
    @chmod($dir, 0755); // mkdir's mode is masked by umask; force 0755
    @chown($dir, CHOWN_USER);
}
// Verify this dump's "All pages, current versions only" stage is finished.
$contents = file_get_contents("https://dumps.wikimedia.org/enwiki/". $latest_wiki_date);
if(!preg_match("#<span class='status'>\s*done\s*</span>\s*<span class='title'>\s*All pages, current versions only\.\s*</span>#si", $contents)) {
    print "enwiki-". $latest_wiki_date ."-pages-articles#-p#p#.xml.bz2 still processing.";
    die;
}

// Collect the split pages-articles archive URLs from the status page.
if(empty($contents) || !preg_match_all("#\/?enwiki\/". $latest_wiki_date ."\/enwiki\-". $latest_wiki_date ."\-pages\-articles[0-9]+\.xml\-p[0-9]+p[0-9]+\.bz2#", $contents, $matches)) {
    print "Could not find raw dump urls.". PHP_EOL;
    die;
}

// Mark this run as in-progress (see the pending.txt check above).
file_put_contents($base_dir ."/pending.txt", time());

foreach($matches[0] as $url) {
    $url = ltrim(trim($url), "/");

    $url_bits = explode("/", $url);
    $file_name = array_pop($url_bits);
    unset($url_bits);

    $unzipped_file_name = preg_replace("#\.bz2$#si", "", $file_name);

    // Skip archives we already have (compressed or already extracted).
    if(file_exists($data_dir ."/". $file_name) || file_exists($data_dir ."/". $unzipped_file_name)) {
        print "Already downloaded '". $url ."'... skipping.". PHP_EOL;
        continue;
    }

    print "Downloading '". $url ."'... ";

    // FIX(review): check fopen() -- a bad data dir used to hand curl a
    // boolean false instead of a file handle.
    if(false === ($fh = fopen($data_dir ."/". $file_name, "wb"))) {
        print "Unable to open '". $file_name ."' for writing.". PHP_EOL;
        continue;
    }

    $ch = curl_init();
    curl_setopt($ch, CURLOPT_URL, "https://dumps.wikimedia.org/". $url);
    curl_setopt($ch, CURLOPT_FILE, $fh);
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true); // mirrors may redirect

    $exec_ok    = (false !== curl_exec($ch));
    $curl_error = curl_error($ch);
    $curl_info  = curl_getinfo($ch);

    // Close both handles before touching the file again so it is fully
    // flushed to disk (the old code ran bunzip2 with $fh still open).
    curl_close($ch);
    fclose($fh);

    if(!$exec_ok || !empty($curl_error)) {
        // FIX(review): a failed transfer used to leave a partial .bz2 on
        // disk, which the file_exists() check above then treated as a
        // finished download on every later run. Remove the partial file.
        @unlink($data_dir ."/". $file_name);
        if(!$exec_ok) {
            print "Error downloading '". $file_name ."'...". PHP_EOL;
        } else {
            print "done!". PHP_EOL;
            print "Curl error: ". $curl_error . PHP_EOL;
        }
    } else {
        print "done!". PHP_EOL;
        print "Success: ". $file_name ." => ". $curl_info['download_content_length'] ." bytes". PHP_EOL;
        print "Extracting... ";
        flush();

        // FIX(review): quote the path properly for the shell.
        exec("/usr/bin/bunzip2 ". escapeshellarg($data_dir ."/". $file_name));

        @chmod($data_dir ."/". $unzipped_file_name, 0755);
        @chown($data_dir ."/". $unzipped_file_name, CHOWN_USER);

        print "done!". PHP_EOL;
        flush();
    }

    flush();
}
// ---------------------------------------------------------------------------
// Database bootstrap.
// NOTE(review): the ext/mysql API (mysql_*) used throughout was removed in
// PHP 7; this script assumes PHP 5.x. Porting to mysqli/PDO with prepared
// statements is a separate, whole-file change.
// ---------------------------------------------------------------------------

// FIX(review): mysql_real_escape_string() escapes *string literals*, not
// identifiers -- it does nothing about backticks. Escape identifiers for
// use inside `...` by doubling any backticks instead.
function db_ident($name) {
    return str_replace("`", "``", $name);
}

@mysql_connect(DB_HOST, DB_USER, DB_PASS) or die(mysql_error());

// Select the target database, creating it on first run.
if(false === @mysql_select_db(DB_NAME)) {
    @mysql_query("CREATE DATABASE `". db_ident(DB_NAME) ."`") or die("Database create error: ". mysql_error() . PHP_EOL);
    if(false === @mysql_select_db(DB_NAME)) {
        print "Unable to connect to newly created mysql database: ". mysql_error() . PHP_EOL;
        die;
    }
}

// Force utf8 on every layer of the connection; article titles/bodies are
// full Unicode.
@mysql_set_charset("utf8") or die("DB Error Line #". __LINE__ .": ". mysql_error());
@mysql_query("SET character_set_results = 'utf8', character_set_client = 'utf8', character_set_connection = 'utf8', character_set_database = 'utf8', character_set_server = 'utf8'") or die("DB Error Line #". __LINE__ .": ". mysql_error());
@mysql_query("SET NAMES utf8;") or die("DB Error Line #". __LINE__ .": ". mysql_error());
@mysql_query("SET foreign_key_checks = 0;") or die("DB Error Line #". __LINE__ .": ". mysql_error());
@mysql_query("SET sql_mode = 'NO_AUTO_VALUE_ON_ZERO';") or die("DB Error Line #". __LINE__ .": ". mysql_error());

// Create the articles table on first run (probe with a cheap SELECT).
if(false === @mysql_query("SELECT 1 FROM `". db_ident(DB_ARTICLES_TABLE) ."` LIMIT 1")) {
    @mysql_query("
        CREATE TABLE IF NOT EXISTS `". db_ident(DB_ARTICLES_TABLE) ."` (
            `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
            `wiki_id` int(10) unsigned NOT NULL COMMENT '". mysql_real_escape_string("Wikipedia ID") ."',
            `wiki_revision_id` int(10) unsigned NOT NULL COMMENT '". mysql_real_escape_string("This version's wikipedia revision ID") ."',
            `wiki_revision_date` date DEFAULT NULL,
            `title` varchar(512) COLLATE utf8_unicode_ci NOT NULL,
            `title_hash` char(32) COLLATE utf8_unicode_ci NOT NULL COMMENT '". mysql_real_escape_string("md5(". DB_ARTICLES_TABLE .".title)") ."',
            `redirect_to` char(32) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT '". mysql_real_escape_string(DB_ARTICLES_TABLE .".title_hash") ."',
            `type_sifted` enum('0','1') COLLATE utf8_unicode_ci DEFAULT '0' COMMENT '". mysql_real_escape_string("for type_sifter worker") ."',
            PRIMARY KEY (`id`),
            UNIQUE KEY `wiki_id` (`wiki_id`),
            UNIQUE KEY `title_hash` (`title_hash`),
            KEY `title` (`title`(6)),
            KEY `redirect_to` (`redirect_to`)
        ) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci
    ") or die("Table create error: ". mysql_error() . PHP_EOL);

    // Sanity check: the freshly created table must be queryable.
    if(false === @mysql_query("SELECT 1 FROM `". db_ident(DB_ARTICLES_TABLE) ."`")) {
        print "Newly created table returned error: ". mysql_error() . PHP_EOL;
        die;
    }
}
// Prep mysql table for massive importing: disable per-row commits and
// constraint checks for the duration of the bulk load (restored at the end).
@mysql_query("SET AUTOCOMMIT = 0");
@mysql_query("SET FOREIGN_KEY_CHECKS = 0");
@mysql_query("SET UNIQUE_CHECKS = 0");

// Ready to start importing: every extracted split-dump XML file.
$files = glob($data_dir ."/enwiki-*-pages-articles*.xml-p*");

// Determine min allowed revision date (the comparison in the import loop
// uses <, allowing same-day edits as the dump). Defaults to the first day
// of last month; mktime() normalizes month 0 to December of the prior year.
$min_revision_date = mktime(0, 0, 0, (date('n') - 1), 1);

// If a previous import exists, only accept revisions at/after the newest
// revision date stored last time.
$ts_result = @mysql_query("SELECT MAX(`wiki_revision_date`) as `max_date` FROM `". mysql_real_escape_string(DB_ARTICLES_TABLE) ."`") or print(__LINE__ .": ". mysql_error() . PHP_EOL);
if(@mysql_num_rows($ts_result) > 0) {
    $ts_row = @mysql_fetch_assoc($ts_result);
    if(!empty($ts_row['max_date'])) {
        // max_date is "YYYY-MM-DD".
        // FIX(review): the day-of-month starts at offset 8, not 7 --
        // substr($d, 7, 2) grabbed "-D" and fed a bogus day to mktime().
        $min_revision_date = mktime(
            0, 0, 0,
            (int)substr($ts_row['max_date'], 5, 2),  // month
            (int)substr($ts_row['max_date'], 8, 2),  // day
            (int)substr($ts_row['max_date'], 0, 4)   // year
        );
    }
}
// ---------------------------------------------------------------------------
// Import loop: stream every extracted pages-articles XML file, store each
// article body under pages_by_id/ and upsert its metadata into mysql.
// ---------------------------------------------------------------------------
foreach($files as $file) {
    // A leftover .bz2 in the glob means the compressed copy is no longer
    // needed (extraction already produced the .xml); remove it.
    if(preg_match("#\.bz2$#si", $file)) {
        unlink($file);
        continue;
    }

    print "Reading ". $file ."...". PHP_EOL;

    // Stream with XMLReader so multi-GB dump files are never fully loaded.
    $xml = new XMLReader();
    $xml->open($file);

    // Skip forward to the first <page> element.
    while($xml->read() && $xml->name !== 'page');

    while($xml->name === "page") {
        // Expand only this <page> subtree into SimpleXML for easy access,
        // then advance the cursor to the next sibling <page>.
        $doc = new DOMDocument;
        $node = simplexml_import_dom($doc->importNode($xml->expand(), true));
        $xml->next('page');

        $page = array(
            'wiki_id' => (int)$node->id,
            'wiki_revision_id' => (int)$node->revision->id,
            'wiki_namespace' => (int)$node->ns,
            'wiki_revision_date' => (string)$node->revision->timestamp, // ISO-8601, e.g. 2015-08-29T14:20:00Z
            'title' => (string)$node->title,
            'body' => (string)$node->revision->text,
            'redirect_title' => false, // false = not a redirect (resolved below)
        );

        // Blank (0) namespace = english wiki article namespace; skip others.
        if(!empty($page['wiki_namespace'])) {
            unset($doc, $node, $page);
            continue;
        }

        // Ignore revisions older than last dump's latest revision date.
        // FIX(review): the day-of-month lives at offset 8 of the ISO
        // timestamp; substr(..., 7, 2) used to read "-D" and corrupt the
        // comparison, silently skipping or re-importing pages.
        $revision_ts = mktime(
            0, 0, 0,
            (int)substr($page['wiki_revision_date'], 5, 2),  // month
            (int)substr($page['wiki_revision_date'], 8, 2),  // day
            (int)substr($page['wiki_revision_date'], 0, 4)   // year
        );
        if($revision_ts < $min_revision_date) {
            unset($doc, $node, $page);
            continue;
        }

        // Skip non-article namespaces that appear as title prefixes.
        if(preg_match("#^(Media|File|User|Wikipedia|MediaWiki|Template|Help|Category|Portal|Book|Special|Talk|User Talk|Wikipedia talk|File talk|MediaWiki talk|Template talk|Help talk|Category talk|Portal talk|Book talk)\:#si", $page['title'])) {
            unset($doc, $node, $page);
            continue;
        }

        // Redirect target: prefer the <redirect title="..."> attribute...
        if($page['redirect_title'] === false && isset($node->redirect)) {
            $node_redirect = get_object_vars($node->redirect);
            $page['redirect_title'] = (string)$node_redirect["@attributes"]['title'];
        }
        // ...falling back to a #REDIRECT [[Target]] marker in the wikitext.
        if($page['redirect_title'] === false && preg_match("#\#REDIRECT\s*\[{2}\s*(.+?)\s*(?:\|[^\]]*)?\s*\]{2}#si", $page['body'], $redirect_match)) {
            if(strlen(trim($redirect_match['1'])) > 0) {
                $page['redirect_title'] = trim($redirect_match['1']);
            }
        }

        // Shard path for the article body: zero-pad the wiki id to 9 digits
        // and split into 3-char directories, e.g. id 12345 -> 000/012/345.txt
        $save_to_relative = implode("/", str_split(str_pad($page['wiki_id'], 9, "0", STR_PAD_LEFT), 3)) .".txt";
        $save_to = $page_dir ."/". $save_to_relative;

        // Disambig files are a complete waste of space: drop the page (and
        // any previously saved copy) without touching the database.
        if((false === $page['redirect_title']) && (preg_match("#\#DISAMBIG#si", $page['body']) || preg_match("#\{{2}(disambiguation|disambig|dab|disamb|RoadIndex|geodis|hospital disambiguation|hndis|mathematical disambiguation|mountain index|school disambiguation|shipindex)\s*(?:\|([^\}]*))?\}{2}#si", $page['body']))) {
            if(file_exists($save_to)) {
                @unlink($save_to);
            }
            unset($doc, $node, $page, $save_to_relative, $save_to);
            continue;
        }

        // Upsert article metadata (wiki_id and title_hash are unique keys).
        mysql_query("
            INSERT INTO `". mysql_real_escape_string(DB_ARTICLES_TABLE) ."`
            SET
                `wiki_id` = '". mysql_real_escape_string($page['wiki_id']) ."',
                `wiki_revision_id` = '". mysql_real_escape_string($page['wiki_revision_id']) ."',
                `wiki_revision_date` = '". mysql_real_escape_string( substr($page['wiki_revision_date'], 0, 10) ." ". substr($page['wiki_revision_date'], 11, 8) ) ."',
                `title` = '". mysql_real_escape_string($page['title']) ."',
                `title_hash` = '". mysql_real_escape_string( md5($page['title']) ) ."',
                `redirect_to` = ". ((false !== $page['redirect_title'])?"'". mysql_real_escape_string( md5($page['redirect_title']) ) ."'":"NULL") ."
            ON DUPLICATE KEY UPDATE
                `wiki_revision_id` = VALUES(`wiki_revision_id`),
                `wiki_revision_date` = VALUES(`wiki_revision_date`),
                `redirect_to` = VALUES(`redirect_to`)
        ") or die("MySQL Error (". __LINE__ ."): ". mysql_error());

        if(false !== $page['redirect_title']) {
            // Redirects carry no body worth keeping; delete any stale file.
            if(file_exists($save_to)) {
                @unlink($save_to);
            }
        } else {
            // Save the raw wikitext body under the sharded path.
            $save_to_dir = dirname($save_to);
            if(!is_dir($save_to_dir)) {
                @mkdir($save_to_dir, 0755, true);
                @chown($save_to_dir, CHOWN_USER);
            }
            if(!is_writable($save_to_dir)) {
                @chmod($save_to_dir, 0755);
                @chown($save_to_dir, CHOWN_USER);
            }

            file_put_contents($save_to, $page['body']);
            @chmod($save_to, 0755);
            @chown($save_to, CHOWN_USER);
        }

        // FIX(review): $doc was missing from this unset, leaking one
        // DOMDocument per imported page across a multi-million page run.
        unset($doc, $node, $page, $save_to_relative, $save_to, $save_to_dir);
    }

    $xml->close();
    @unlink($file); // imported successfully; reclaim the disk space
    unset($xml);
}
// Restore normal mysql behaviour now that bulk importing is finished.
foreach(array("SET AUTOCOMMIT = 1", "SET FOREIGN_KEY_CHECKS = 1", "SET UNIQUE_CHECKS = 1") as $release_query) {
    @mysql_query($release_query);
}

// Drop the in-progress marker and record this dump date as fully imported
// (done.txt is what short-circuits the next run for the same month).
unlink($base_dir ."/pending.txt");
file_put_contents($base_dir ."/done.txt", $latest_wiki_date);

print "All done!". PHP_EOL;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment