Skip to content

Instantly share code, notes, and snippets.

@traverse
Created July 1, 2015 10:43
Show Gist options
  • Save traverse/6a7fa57af60bdc7fde68 to your computer and use it in GitHub Desktop.
Save traverse/6a7fa57af60bdc7fde68 to your computer and use it in GitHub Desktop.
Website scraper for data visualization
<?php
// Scrapes Partyflock event pages and, for every event, appends one line per
// performer ("name,nationality" or just "name") to ./<title>/<year>.txt.
//
// Pages that cannot be downloaded or that do not contain the expected markup
// are logged and skipped instead of producing notices and bogus output paths.

// Event page URLs to scrape; fill this in before running the script.
$urls = [];

// The patterns do not depend on the page being processed, so build them once.
$date_regex = '|<a href="\/party\/day\/(\d*)\/(\d*)\/(\d*)">|';
$title_regex = '|<meta property="og:title" content="(.*)" \/><meta property="og:type" content="partyflock:event" \/>|';
$lineup_regex = '|<div id="lineup">(.*)<div class="l box-column" style="width:50%">|';
// Non-greedy captures so that extra attributes on the anchor, or several
// performers on one line, do not end up inside the artist id or name.
$artist_regex = '|<a itemprop="performer" href="\/artist\/([^"]+)"[^>]*>(.*?)<\/a>|';
$nat_regex = '|<a\b[^>]* href="/country/(.*?)">(.*?)</a>|';

foreach ($urls as $url) {
    $data = file_get_contents($url);
    if ($data === false) {
        error_log("Skipping {$url}: could not download the page");
        continue;
    }

    // All three pieces are required; bail out early when the page layout
    // differs from what the patterns expect.
    if (!preg_match($date_regex, $data, $date)
        || !preg_match($title_regex, $data, $title)
        || !preg_match($lineup_regex, $data, $lineup)
        || $date[1] === ''
    ) {
        error_log("Skipping {$url}: date, title or line-up not found");
        continue;
    }

    $year = $date[1];

    // The title comes from remote HTML and is used as a directory name, so
    // drop the "festival" suffix, trim the whitespace it leaves behind and
    // neutralise anything that could escape the working directory.
    $title = trim(preg_replace('|festival|', '', $title[1]));
    $title = str_replace(['/', '\\', "\0"], '-', $title);
    if ($title === '' || $title === '.' || $title === '..') {
        error_log("Skipping {$url}: unusable event title");
        continue;
    }

    // Reduce the line-up HTML to one performer anchor per line.
    $dat = preg_replace('|</div>|', '&break', $lineup[1]);
    $dat = preg_replace("|<span(.*?)</span>|", "", $dat);
    $dat = preg_replace("|<th(.*?)</th>|", "", $dat);
    $dat = strip_tags($dat, '<*><a>');
    $dat = preg_replace('|&break|', "\r\n", $dat);
    $dat = preg_replace('|&nbsp;|', '', $dat);
    $dat = preg_replace('|live:|', '', $dat);
    $dat = preg_replace('|00:00,-,00:00:|', '00:00 - 00:00:', $dat);

    // Split on any line-break sequence: the breaks inserted above are "\r\n",
    // which PHP_EOL only matches on Windows.
    $dat_arr = preg_split('/\R/', $dat, -1, PREG_SPLIT_NO_EMPTY);

    // The directory already exists when several years of one event are scraped.
    $dir = "./{$title}";
    if (!is_dir($dir) && !mkdir($dir) && !is_dir($dir)) {
        error_log("Skipping {$url}: could not create directory {$dir}");
        continue;
    }

    $file = fopen("{$dir}/{$year}.txt", "a");
    if ($file === false) {
        error_log("Skipping {$url}: could not open {$dir}/{$year}.txt");
        continue;
    }

    foreach ($dat_arr as $artist) {
        // A line may hold more than one performer anchor; handle each of them.
        if (!preg_match_all($artist_regex, $artist, $artist_results, PREG_SET_ORDER)) {
            continue;
        }

        foreach ($artist_results as $artist_result) {
            $result = $artist_result[2];

            // Look up the nationality on the artist page; when the page cannot
            // be fetched or has no country link, write the name on its own.
            $artist_data = file_get_contents("http://partyflock.nl/artist/" . $artist_result[1]);
            if ($artist_data !== false
                && preg_match($nat_regex, $artist_data, $nat_result) === 1
                && isset($nat_result[2])
            ) {
                $result .= ",{$nat_result[2]}";
            }

            fwrite($file, $result . "\r\n");
        }
    }

    fclose($file);
}
?>
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment