Created
December 17, 2010 19:44
-
-
Save jpmckinney/745581 to your computer and use it in GitHub Desktop.
PHP parser for Ville de Montréal rink conditions PDFs
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
// Original by: Philippe Dagenais-Pérusse
// Get the files to parse.
//
// Command-line mode: every argument after the script name is taken as the
// path of a PDF to parse, so $files has numeric keys.
// Otherwise: every "*.pdf" entry of the files directory is parsed, and $files
// maps each file name to its full path.
if (!empty($argc)) { // Command-line interface
  $files = array_slice($argv, 1);
}
else {
  // The directory to scan. It used to be read without ever being set, so it
  // now defaults to the directory the file paths below were already built from.
  if (!isset($path)) {
    $path = $_SERVER['DOCUMENT_ROOT'] . '/glaces/lib/files';
  }
  $files = array();
  if (is_dir($path) && ($handle = opendir($path))) {
    while (($file = readdir($handle)) !== FALSE) {
      // Only keep names ending in ".pdf"; this also skips "." and "..", and
      // no longer accepts names that merely contain ".pdf" (e.g. "x.pdf.bak")
      if (substr($file, -4) === '.pdf') {
        $files[$file] = rtrim($path, '/') . '/' . $file;
      }
    }
    closedir($handle);
  }
}
// Status columns of a rink, in the order they are read from the PDF. The key
// is the column's position: it is multiplied by the number of rinks to find
// the text line holding that column's value for a given rink. The value is the
// code under which the cell is stored in the rink record (initials of the
// French label).
$headers = array(
  1 => 'OO', // Open: yes (Ouverte: oui)
  2 => 'ON', // Open: no (Ouverte: non)
  3 => 'DO', // Cleared: yes (Déblayée: oui)
  4 => 'DN', // Cleared: no (Déblayée: non)
  5 => 'AO', // Flooded: yes (Arrosée: oui)
  6 => 'AN', // Flooded: no (Arrosée: non)
  7 => 'CE', // Condition: excellent (Condition: excellente)
  8 => 'CB', // Condition: good (Condition: bonne)
  9 => 'CM', // Condition: bad (Condition: mauvaise)
);
// The first text line on which a rink name appears
$first = 13;
/**
 * Tells whether a status cell read from the PDF holds an expected value.
 *
 * A status cell is either empty or ticked with "X"; anything else means the
 * text lines did not line up with the expected layout and parsing has failed.
 * The comparison is strict so that a missing line (NULL) or a number is not
 * mistaken for an empty cell.
 *
 * @param mixed $value The text of the status line.
 * @return bool TRUE if the value is exactly '' or 'X', FALSE otherwise.
 */
function parsable($value) {
  return $value === '' || $value === 'X';
}
/**
 * Returns the text line at the given position, or NULL when there is none.
 *
 * @param array $lines The text lines extracted from one PDF.
 * @param int $index The position to read.
 * @return string|null The line, or NULL if the PDF has no such line.
 */
function line_at($lines, $index) {
  return isset($lines[$index]) ? $lines[$index] : NULL;
}

// One record per parsed PDF file
$boroughs = array();
foreach ($files as $basename => $file) {
  // $files has numeric keys when the paths come from the command line, so
  // fall back on the path to get the file name
  $name = is_string($basename) ? $basename : basename($file);

  // Collect all text lines from the PDF: keep the lines containing "Tj", then
  // strip the trailing ") Tj" and the first character (presumably the opening
  // parenthesis of the PDF string)
  $lines = array();
  $raw = file($file);
  if ($raw === FALSE) {
    // Unreadable file: no lines, which is reported below as a failed parse
    $raw = array();
  }
  foreach ($raw as $line) {
    if (strpos($line, 'Tj') !== FALSE) {
      $lines[] = trim(substr(str_replace(') Tj', '', stripslashes($line)), 1));
    }
  }

  // Get the borough data
  $borough = array(
    'Fichier' => $name, // File
    'MAJ-Fichier' => filemtime($file), // File updated at
    'Region' => line_at($lines, 8), // Borough
    'MAJ' => line_at($lines, 0), // Updated at
    'Remarques' => '', // Remarks
    'Patinoires' => array(), // Rinks
    'INTEGRITE' => 'Oui', // Parseable: 'Oui' (yes) or 'Non' (no)
  );
  if ($borough['Region'] === NULL || $borough['MAJ'] === NULL) {
    $borough['INTEGRITE'] = 'Non';
  }

  // Get the borough remarks: walk back from the marker line, prepending up to
  // five lines, and stop at the first empty, "X" or numeric line. The file
  // name (not the full path) selects the marker for the special-cased file.
  $marker = basename($file) == 'L29_79.pdf' ? 'Remarques' : 'Oui';
  $index = array_search($marker, $lines, TRUE);
  if ($index !== FALSE) {
    for ($i = 1; $i <= 5 && $index - $i >= 0; $i++) {
      $text = $lines[$index - $i];
      if ($text === '' || $text === 'X' || is_numeric($text)) break;
      $borough['Remarques'] = $text . ' ' . $borough['Remarques'];
    }
  }

  // Total number of rinks: the line just before the "Entretenues par les
  // citoyens" line. It is cast to an integer so that the loop below never
  // compares its counter with a non-numeric string.
  $index = array_search('Entretenues par les citoyens', $lines, TRUE);
  if ($index !== FALSE && $index > 0 && is_numeric($lines[$index - 1])) {
    $total = (int) $lines[$index - 1];
  }
  else {
    $total = 0;
    $borough['INTEGRITE'] = 'Non';
  }

  // The "resurfaced" columns follow the "Grand total" line: first the "yes"
  // cell of every rink, then the "no" cell of every rink
  $grand = array_search('Grand total', $lines, TRUE);
  if ($grand === FALSE && $total > 0) {
    $borough['INTEGRITE'] = 'Non';
  }

  // Get the rinks data
  for ($i = 0; $i < $total; $i++) {
    $rink = array(
      'Nom' => line_at($lines, $first + $i), // Name
      'RO' => $grand === FALSE ? NULL : line_at($lines, $grand + $i + 1), // Resurfaced: yes
      'RN' => $grand === FALSE ? NULL : line_at($lines, $grand + $i + 1 + $total), // Resurfaced: no
    );
    // A missing line is a failed parse, whatever parsable() makes of NULL
    if ($rink['Nom'] === NULL
      || $rink['RO'] === NULL || !parsable($rink['RO'])
      || $rink['RN'] === NULL || !parsable($rink['RN'])) {
      $borough['INTEGRITE'] = 'Non';
    }
    // The other status columns are laid out column by column after the rink
    // names, each column holding one line per rink
    foreach ($headers as $offset => $key) {
      $value = line_at($lines, $first + $i + ($total * $offset));
      $rink[$key] = $value;
      if ($value === NULL || !parsable($value)) {
        $borough['INTEGRITE'] = 'Non';
      }
    }
    $borough['Patinoires'][] = $rink;
  }
  $boroughs[] = $borough;
}
/**
 * Comparison callback ordering two borough records by their 'Region' value.
 *
 * The comparison is the byte-wise one of strcmp(). A record whose 'Region' is
 * missing or NULL is compared as if it were the empty string, so it sorts
 * first instead of passing NULL to strcmp().
 *
 * @param array $a A borough record.
 * @param array $b Another borough record.
 * @return int Less than, equal to or greater than zero when $a sorts before,
 *   with or after $b.
 */
function cmp($a, $b) {
  $left = isset($a['Region']) ? (string) $a['Region'] : '';
  $right = isset($b['Region']) ? (string) $b['Region'] : '';
  return strcmp($left, $right);
}
usort($boroughs, 'cmp'); // Order the borough records by their 'Region' value (byte-wise string comparison)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment