Instantly share code, notes, and snippets.

Embed
What would you like to do?
Export all current articles about games from PCGW (PCGamingWiki) to XML files.
<?php
// Find all games on PCGW.
//
// Walks the paginated "Category:Games" listing, scrapes every article link
// out of each page's HTML and collects the entity-decoded titles.
//
// Results left for the rest of the script:
//   $all_pages       - unique game titles, in listing order
//   $all_pages_count - number of unique titles
//
// Throws RuntimeException when a listing page cannot be downloaded or parsed,
// so that a network hiccup never results in a silently truncated title list.
echo 'Finding games ...' . PHP_EOL;
$url = 'https://pcgamingwiki.com/wiki/Category:Games';
$all_pages = [];
$all_pages_count = 0;
$i = 0;
do {
    $subject = file_get_contents($url);
    if ($subject === false) {
        throw new RuntimeException('Failed to download game directory page: ' . $url);
    }

    // Each game is a list item holding a single wiki link; the link text is the title.
    $found = preg_match_all('%^(?:<ul>)?<li><a href="\/wiki\/[^"]+" title="[^"]+">([^<]+)<\/a><\/li>%m', $subject, $matches);
    if ($found === false) {
        throw new RuntimeException('Failed to parse game directory page: ' . $url);
    }
    $pages = $matches[1];
    $all_pages_count += count($pages);

    // Clean up titles (the HTML encodes characters such as & and ').
    foreach ($pages as $page) {
        $all_pages[] = html_entity_decode($page, ENT_QUOTES | ENT_HTML5);
    }
    echo 'Got game directory page ' . ++$i . ' (' . $all_pages_count . ' games total)' . PHP_EOL;

    // Get next page: follow the "(next page)" link until a page has none.
    $next_page_found = preg_match('%\(<a href="([^"]+)" title="[^"]+">next page<\/a>\)%', $subject, $next_page_match) === 1;
    if ($next_page_found) {
        $url = html_entity_decode($next_page_match[1], ENT_QUOTES | ENT_HTML5);
        // Site-relative links need the host put back in front.
        $url = $url[0] === '/' ? ('https://pcgamingwiki.com' . $url) : $url;
    }
} while ($next_page_found);

// Dedupe (array_values re-indexes so the titles form a plain 0..n-1 list).
$all_pages = array_values(array_unique($all_pages));
$all_pages_count = count($all_pages);
// Export.
//
// Posts the titles in $all_pages to Special:Export in batches and streams
// each XML response straight into its own numbered file
// (PCGW-export-01.xml, PCGW-export-02.xml, ...) in the working directory.
//
// Throws RuntimeException when an output file cannot be opened or a request
// fails, instead of reporting success for an empty or partial file.
echo 'Exporting ...' . PHP_EOL;

// Titles sent per request, and therefore per output file.
$batch_size = 5000;
$batches = array_chunk($all_pages, $batch_size);
$export_count = count($batches);

$ch = curl_init();
if ($ch === false) {
    throw new RuntimeException('Failed to initialise cURL');
}
curl_setopt($ch, CURLOPT_URL, 'https://pcgamingwiki.com/wiki/Special:Export');
// Keep TLS peer verification on so the export cannot be tampered with in transit.
curl_setopt($ch, CURLOPT_SSL_VERIFYPEER, true);
// Treat HTTP status codes >= 400 as failures rather than saving the error page as XML.
curl_setopt($ch, CURLOPT_FAILONERROR, true);
curl_setopt($ch, CURLOPT_POST, true);

foreach ($batches as $i => $batch) {
    $filename = 'PCGW-export-' . str_pad((string) ($i + 1), 2, '0', STR_PAD_LEFT) . '.xml';
    $fp = fopen($filename, 'w');
    if ($fp === false) {
        curl_close($ch);
        throw new RuntimeException('Failed to open ' . $filename . ' for writing');
    }

    // The response body is written directly to the file handle.
    curl_setopt($ch, CURLOPT_FILE, $fp);
    curl_setopt($ch, CURLOPT_POSTFIELDS,
        http_build_query(
            [
                'catname' => '',
                // One title per line.
                'pages' => implode("\r\n", $batch),
                'curonly' => '1',
                'wpDownload' => '1',
                'wpEditToken' => '+\\',
                'title' => 'Special:Export',
            ]
        )
    );

    $succeeded = curl_exec($ch);
    fclose($fp);
    if ($succeeded === false) {
        $error = curl_error($ch);
        curl_close($ch);
        throw new RuntimeException('Export ' . ($i + 1) . '/' . $export_count . ' failed (' . $filename . ' may be incomplete): ' . $error);
    }

    echo 'Got export ' . ($i + 1) . '/' . $export_count . PHP_EOL;
}
curl_close($ch);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment