Skip to content

Instantly share code, notes, and snippets.

@donwilson
Last active August 29, 2015 14:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save donwilson/0a5a09a3abc63e75c638 to your computer and use it in GitHub Desktop.
An automated Wikipedia data dump downloader written in PHP. This tool checks for the latest complete English Wikipedia data dump and automatically downloads, extracts, and stores articles into individual files for fast reads. Optionally stores relevant info into an 'articles' mysql database table.
<?php
// Bootstrap: unlimited runtime and a large memory ceiling — dump files are huge.
set_time_limit(0);
ini_set('memory_limit', "1024M");

// Encoding settings.
// FIX: setlocale() and mb_internal_encoding() report failure via a false
// return value — they never throw — so the original try/catch could never
// fire. Check the return values directly instead.
if(false === setlocale(LC_ALL, 'en_US.UTF8')) {
	die("Unable to set local/internal encoding");
}
if(false === mb_internal_encoding("UTF-8")) {
	die("Unable to set local/internal encoding");
}

// Where wiki_data/ is created (alongside this script).
define('DATA_DIR', __DIR__);
define('CHOWN_USER', "apache"); // who to chown the downloaded folders+files

// MySQL connection settings (legacy mysql_* extension is used below).
define('DB_HOST', "localhost");
define('DB_USER', "");
define('DB_PASS', "");
define('DB_NAME', "dump_wikipedia");
define('DB_ARTICLES_TABLE', "articles");
// Automated wiki data dump downloading: grab the backup index page first.
$contents = file_get_contents("http://dumps.wikimedia.org/backup-index.html");
if($contents === false) {
	print "Unable to check on status of wiki dump.". PHP_EOL;
	die;
}

/*
// Alternative check: waits until the entire enwiki data store is done processing
if(!preg_match("#<a href=\"enwiki\/([0-9]{8})\">enwiki<\/a>\:\s*<span(?:[^\>]*)>\s*Dump complete\s*<\/span>#si", $contents, $match)) {
	print "Wiki dumping not completed.". PHP_EOL;
	die;
}
*/

// Only look for the latest enwiki dump date (the YYYYMMDD link on the index).
if(preg_match("#<a href=\"enwiki\/([0-9]{8})\">#si", $contents, $match) !== 1) {
	print "Wiki dumping not completed.". PHP_EOL;
	die;
}
unset($contents);

$latest_wiki_date = trim($match[1]);

// Sanity check: the captured date must be exactly eight digits.
if(preg_match("#^[0-9]{8}$#si", $latest_wiki_date) !== 1) {
	print "latest_wiki_date malformed.". PHP_EOL;
	die;
}
// Working directories beneath the data dir.
$base_dir = DATA_DIR ."/wiki_data";
$data_dir = $base_dir ."/raw_data";  // raw bz2/xml dump files
$page_dir = $base_dir ."/pages_by_id";  // one .txt per article, sharded by id

// done.txt holds the date of the last fully processed dump; if it matches
// the latest available dump there is nothing to do this run.
if(is_file($base_dir ."/done.txt")) {
	$done_contents = file_get_contents($base_dir ."/done.txt");
	if($done_contents == $latest_wiki_date) {
		// already downloaded
		print "Already done with the current month";
		die;
	}
	unlink($base_dir ."/done.txt");
}

// pending.txt holds the unix timestamp of an in-progress run; honor it for
// up to 24 hours before assuming the previous attempt died.
if(is_file($base_dir ."/pending.txt")) {
	$pending_date = trim(file_get_contents($base_dir ."/pending.txt"));
	if(is_numeric($pending_date) && ($pending_date >= (time() - (60 * 60 * 24)))) {
		// allow at least a day for the previous attempt to finish
		print "Being processed elsewhere";
		die;
	}
	unlink($base_dir ."/pending.txt");
}
// Ensure every working directory exists with the expected mode and owner.
// Order matters: $base_dir must be created before its children.
foreach(array($base_dir, $data_dir, $page_dir) as $folder) {
	@mkdir($folder, 0755);
	@chmod($folder, 0755);
	@chown($folder, CHOWN_USER);
}
// Fetch the status page for this dump date and confirm that the
// "All pages, current versions only" job has finished.
$contents = file_get_contents("http://dumps.wikimedia.org/enwiki/". $latest_wiki_date);
if(preg_match("#<span class='status'>\s*done\s*</span>\s*<span class='title'>\s*All pages, current versions only\.\s*</span>#si", $contents) !== 1) {
	print "enwiki-". $latest_wiki_date ."-pages-articles#-p#p#.xml.bz2 still processing.";
	die;
}

// Collect the relative URLs of every split pages-articles .bz2 file.
if(empty($contents) || !preg_match_all("#\/?enwiki\/". $latest_wiki_date ."\/enwiki\-". $latest_wiki_date ."\-pages\-articles[0-9]+\.xml\-p[0-9]+p[0-9]+\.bz2#", $contents, $matches)) {
	print "Could not find raw dump urls.". PHP_EOL;
	die;
}
// Mark this run as in-progress so concurrent runs back off (see pending.txt check above).
file_put_contents($base_dir ."/pending.txt", time());

// Download each split dump file, then extract it in place.
foreach($matches[0] as $url) {
	$url = ltrim(trim($url), "/");
	$url_bits = explode("/", $url);
	$file_name = array_pop($url_bits);
	unset($url_bits);
	
	$unzipped_file_name = preg_replace("#\.bz2$#si", "", $file_name);
	
	// Skip files already present (still compressed, or already extracted).
	if(file_exists($data_dir ."/". $file_name) || file_exists($data_dir ."/". $unzipped_file_name)) {
		print "Already downloaded '". $url ."'... skipping.". PHP_EOL;
		continue;
	}
	
	print "Downloading '". $url ."'... ";
	
	// FIX: fopen() was previously unchecked; on failure curl would have been
	// handed `false` as its output stream.
	$fh = fopen($data_dir ."/". $file_name, "wb");
	if(false === $fh) {
		print "Unable to open '". $data_dir ."/". $file_name ."' for writing.". PHP_EOL;
		continue;
	}
	
	$ch = curl_init();
	curl_setopt($ch, CURLOPT_URL, "http://dumps.wikimedia.org/". $url);
	curl_setopt($ch, CURLOPT_FILE, $fh);
	
	$exec_ok = curl_exec($ch);
	$curl_error = curl_error($ch);
	$curl_info = curl_getinfo($ch);
	
	// Close handles before inspecting the result so a failed download can be unlinked.
	curl_close($ch);
	fclose($fh);
	
	if(false === $exec_ok || !empty($curl_error)) {
		// FIX: remove the partial file — otherwise the next run's
		// file_exists() check would skip this URL and a truncated
		// download would be extracted/imported as if complete.
		@unlink($data_dir ."/". $file_name);
		
		print "Error downloading '". $file_name ."'...". PHP_EOL;
		if(!empty($curl_error)) {
			print "Curl error: ". $curl_error . PHP_EOL;
		}
	} else {
		print "done!". PHP_EOL;
		print "Success: ". $file_name ." => ". $curl_info['download_content_length'] ." bytes". PHP_EOL;
		print "Extracting... ";
		flush();
		
		// bunzip2 deletes the .bz2 and leaves the extracted .xml alongside it.
		exec("/usr/bin/bunzip2 \"". $data_dir ."/". $file_name ."\"");
		@chmod($data_dir ."/". $unzipped_file_name, 0755);
		@chown($data_dir ."/". $unzipped_file_name, CHOWN_USER);
		
		print "done!". PHP_EOL;
		flush();
	}
	
	flush();
}
// Connect to MySQL using the legacy mysql_* extension.
// NOTE(review): ext/mysql was deprecated in PHP 5.5 and removed in PHP 7;
// this script as written requires PHP 5.x. Migrating to mysqli/PDO is the
// long-term fix but is out of scope for a comment-only pass.
@mysql_connect(DB_HOST, DB_USER, DB_PASS) or die(mysql_error());
// Select the target database, creating it on first run if it doesn't exist yet.
if(false === @mysql_select_db(DB_NAME)) {
@mysql_query("CREATE DATABASE `". mysql_real_escape_string(DB_NAME) ."`") or die("Database create error: ". mysql_error() . PHP_EOL);
if(false === @mysql_select_db(DB_NAME)) {
print "Unable to connect to newly created mysql database: ". mysql_error() . PHP_EOL;
die;
}
}
// Force utf8 on every layer of the connection so article titles and bodies
// (which contain non-ASCII text) round-trip without mojibake.
@mysql_set_charset("utf8") or die("DB Error Line #". __LINE__ .": ". mysql_error());
@mysql_query("SET character_set_results = 'utf8', character_set_client = 'utf8', character_set_connection = 'utf8', character_set_database = 'utf8', character_set_server = 'utf8'") or die("DB Error Line #". __LINE__ .": ". mysql_error());
@mysql_query("SET NAMES utf8;") or die("DB Error Line #". __LINE__ .": ". mysql_error());
// Relax FK checks for bulk loading; NO_AUTO_VALUE_ON_ZERO prevents an explicit
// zero id from being replaced with the next auto-increment value.
@mysql_query("SET foreign_key_checks = 0;") or die("DB Error Line #". __LINE__ .": ". mysql_error());
@mysql_query("SET sql_mode = 'NO_AUTO_VALUE_ON_ZERO';") or die("DB Error Line #". __LINE__ .": ". mysql_error());
// Test if table exists (a cheap probe SELECT; failure means "create it").
if(false === @mysql_query("SELECT 1 FROM `". mysql_real_escape_string(DB_ARTICLES_TABLE) ."` LIMIT 1")) {
// Create the articles table. Titles are deduplicated via an md5 hash column
// (title_hash) because a 512-char utf8 varchar is too wide for a unique key;
// redirect_to stores the md5 of the *target* title for the same reason.
@mysql_query("
CREATE TABLE IF NOT EXISTS `". mysql_real_escape_string(DB_ARTICLES_TABLE) ."` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`wiki_id` int(10) unsigned NOT NULL COMMENT '". mysql_real_escape_string("Wikipedia ID") ."',
`wiki_revision_id` int(10) unsigned NOT NULL COMMENT '". mysql_real_escape_string("This version's wikipedia revision ID") ."',
`wiki_revision_date` date DEFAULT NULL,
`title` varchar(512) COLLATE utf8_unicode_ci NOT NULL,
`title_hash` char(32) COLLATE utf8_unicode_ci NOT NULL COMMENT '". mysql_real_escape_string("md5(". DB_ARTICLES_TABLE .".title)") ."',
`redirect_to` char(32) COLLATE utf8_unicode_ci DEFAULT NULL COMMENT '". mysql_real_escape_string(DB_ARTICLES_TABLE .".title_hash") ."',
`type_sifted` enum('0','1') COLLATE utf8_unicode_ci DEFAULT '0' COMMENT '". mysql_real_escape_string("for type_sifter worker") ."',
PRIMARY KEY (`id`),
UNIQUE KEY `wiki_id` (`wiki_id`),
UNIQUE KEY `title_hash` (`title_hash`),
KEY `title` (`title`(6)),
KEY `redirect_to` (`redirect_to`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_unicode_ci
") or die("Table create error: ". mysql_error() . PHP_EOL);
// Verify the table actually came into existence before importing into it.
if(false === @mysql_query("SELECT 1 FROM `". mysql_real_escape_string(DB_ARTICLES_TABLE) ."`")) {
print "Newly created table returned error: ". mysql_error() . PHP_EOL;
die;
}
}
// Prep mysql table for massive importing: defer commits and skip
// FK/uniqueness verification while the bulk insert runs.
@mysql_query("SET AUTOCOMMIT = 0");
@mysql_query("SET FOREIGN_KEY_CHECKS = 0");
@mysql_query("SET UNIQUE_CHECKS = 0");

// Ready to start importing
$files = glob($data_dir ."/enwiki-*-pages-articles*.xml-p*");

// Determine min allowed revision date (below uses < for allowing same-day edits as dump)
$min_revision_date = mktime(0, 0, 0, (date('n') - 1), 1); // defaults to first day of last month

// If the table already holds data, use its newest stored revision date instead.
$ts_result = @mysql_query("SELECT MAX(`wiki_revision_date`) as `max_date` FROM `". mysql_real_escape_string(DB_ARTICLES_TABLE) ."`") or print(__LINE__ .": ". mysql_error() . PHP_EOL);
if(@mysql_num_rows($ts_result) > 0) {
	$ts_row = @mysql_fetch_assoc($ts_result);
	if(!empty($ts_row['max_date'])) {
		// max_date is 'YYYY-MM-DD': year = offset 0, month = offset 5, day = offset 8.
		// FIX: the day offset was previously 7, which captured "-D" (the hyphen
		// plus first day digit); mktime() then received a negative day and
		// rolled the date back into the previous month.
		$min_revision_date = mktime(0, 0, 0, substr($ts_row['max_date'], 5, 2), substr($ts_row['max_date'], 8, 2), substr($ts_row['max_date'], 0, 4));
	}
}
// Import every extracted XML dump file, one <page> element at a time.
foreach($files as $file) {
	// A leftover .bz2 matched by the glob failed to extract; drop it and move on.
	if(preg_match("#\.bz2$#si", $file)) {
		unlink($file);
		continue;
	}
	
	print "Reading ". $file ."...". PHP_EOL;
	
	// Stream the (multi-GB) XML with XMLReader, expanding a single <page>
	// into DOM/SimpleXML at a time so memory use stays bounded.
	$xml = new XMLReader();
	$xml->open($file);
	
	while($xml->read() && $xml->name !== 'page');
	
	while($xml->name === "page") {
		$doc = new DOMDocument;
		$node = simplexml_import_dom($doc->importNode($xml->expand(), true));
		$xml->next('page');
		
		$page = array(
			'wiki_id' => (int)$node->id,
			'wiki_revision_id' => (int)$node->revision->id,
			'wiki_namespace' => (int)$node->ns,
			'wiki_revision_date' => (string)$node->revision->timestamp, // ISO 8601, e.g. 2015-04-20T12:34:56Z
			'title' => (string)$node->title,
			'body' => (string)$node->revision->text,
			'redirect_title' => false, // filled in below if the page is a redirect
		);
		
		// determine to skip
		// blank namespace = english wiki article namespace
		if(!empty($page['wiki_namespace'])) {
			unset($doc, $node, $page);
			continue;
		}
		
		// ignore revisions older than last dump's latest revision date.
		// The timestamp is 'YYYY-MM-DDT...': month = offset 5, day = offset 8.
		// FIX: the day offset was previously 7, which captured "-D" and fed
		// mktime() a negative day, skewing this comparison by about a month.
		if(mktime(0, 0, 0, substr($page['wiki_revision_date'], 5, 2), substr($page['wiki_revision_date'], 8, 2), substr($page['wiki_revision_date'], 0, 4)) < $min_revision_date) {
			unset($doc, $node, $page);
			continue;
		}
		
		//if(preg_match("#\s+\(disambiguation\)$#si", $page['title'])) {
		//	unset($doc, $node, $page);
		//	
		//	continue;
		//}
		
		// Skip every non-article namespace expressed as a title prefix.
		if(preg_match("#^(Media|File|User|Wikipedia|MediaWiki|Template|Help|Category|Portal|Book|Special|Talk|User Talk|Wikipedia talk|File talk|MediaWiki talk|Template talk|Help talk|Category talk|Portal talk|Book talk)\:#si", $page['title'])) {
			unset($doc, $node, $page);
			continue;
		}
		
		// Redirect target: prefer the explicit <redirect title="..."/> element...
		if($page['redirect_title'] === false && isset($node->redirect)) {
			$node_redirect = get_object_vars($node->redirect);
			$page['redirect_title'] = (string)$node_redirect["@attributes"]['title'];
		}
		
		// ...falling back to a #REDIRECT [[Target]] marker in the body text.
		if($page['redirect_title'] === false && preg_match("#\#REDIRECT\s*\[{2}\s*(.+?)\s*(?:\|[^\]]*)?\s*\]{2}#si", $page['body'], $redirect_match)) {
			if(strlen(trim($redirect_match['1'])) > 0) {
				$page['redirect_title'] = trim($redirect_match['1']);
			}
		}
		
		// Prep filename structure for wiki article: zero-pad the id to 9 digits
		// and split into 3-digit path segments, e.g. id 12345 -> 000/012/345.txt
		// (keeps each directory small enough for fast lookups).
		$save_to_relative = implode("/", str_split(str_pad($page['wiki_id'], 9, "0", STR_PAD_LEFT), 3)) .".txt";
		$save_to = $page_dir ."/". $save_to_relative;
		
		if((false === $page['redirect_title']) && (preg_match("#\#DISAMBIG#si", $page['body']) || preg_match("#\{{2}(disambiguation|disambig|dab|disamb|RoadIndex|geodis|hospital disambiguation|hndis|mathematical disambiguation|mountain index|school disambiguation|shipindex)\s*(?:\|([^\}]*))?\}{2}#si", $page['body']))) {
			// Disambig files are a complete waste of space
			if(file_exists($save_to)) {
				// delete newly redirected file
				@unlink($save_to);
			}
			unset($doc, $node, $page, $save_to_relative, $save_to);
			continue;
		}
		
		// Upsert the article row; wiki_id and title_hash are the unique keys.
		mysql_query("
			INSERT INTO `". mysql_real_escape_string(DB_ARTICLES_TABLE) ."`
			SET
				`wiki_id` = '". mysql_real_escape_string($page['wiki_id']) ."',
				`wiki_revision_id` = '". mysql_real_escape_string($page['wiki_revision_id']) ."',
				`wiki_revision_date` = '". mysql_real_escape_string( substr($page['wiki_revision_date'], 0, 10) ." ". substr($page['wiki_revision_date'], 11, 8) ) ."',
				`title` = '". mysql_real_escape_string($page['title']) ."',
				`title_hash` = '". mysql_real_escape_string( md5($page['title']) ) ."',
				`redirect_to` = ". ((false !== $page['redirect_title'])?"'". mysql_real_escape_string( md5($page['redirect_title']) ) ."'":"NULL") ."
			ON DUPLICATE KEY UPDATE
				`wiki_revision_id` = VALUES(`wiki_revision_id`),
				`wiki_revision_date` = VALUES(`wiki_revision_date`),
				`redirect_to` = VALUES(`redirect_to`)
		") or die("MySQL Error (". __LINE__ ."): ". mysql_error());
		
		if(false !== $page['redirect_title']) {
			// Redirects keep only the DB row; any previously saved body file is stale.
			if(file_exists($save_to)) {
				// delete newly redirected file
				@unlink($save_to);
			}
		} else {
			// prepare to save: create the sharded directory on demand.
			$save_to_dir = dirname($save_to);
			if(!is_dir($save_to_dir)) {
				@mkdir($save_to_dir, 0755, true);
				@chown($save_to_dir, "apache");
			}
			if(!is_writable($save_to_dir)) {
				@chmod($save_to_dir, 0755);
				@chown($save_to_dir, "apache");
			}
			
			file_put_contents($save_to, $page['body']);
			@chmod($save_to, 0755);
			@chown($save_to, "apache");
		}
		
		unset($node, $page, $save_to_relative, $save_to, $save_to_dir);
	}
	
	$xml->close();
	
	// Raw XML is no longer needed once its pages are imported.
	@unlink($file);
	unset($xml);
}
// Release Mysql prep: restore normal commit and integrity checking
// now that the bulk import is finished.
@mysql_query("SET AUTOCOMMIT = 1");
@mysql_query("SET FOREIGN_KEY_CHECKS = 1");
@mysql_query("SET UNIQUE_CHECKS = 1");

// Remove pending file so future runs are no longer blocked.
unlink($base_dir ."/pending.txt");

// Lock folder: record the dump date so this month is skipped next time.
file_put_contents($base_dir ."/done.txt", $latest_wiki_date);

print "All done!". PHP_EOL;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment