Skip to content

Instantly share code, notes, and snippets.

@jpmckinney
Created December 17, 2010 19:44
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save jpmckinney/745581 to your computer and use it in GitHub Desktop.
Save jpmckinney/745581 to your computer and use it in GitHub Desktop.
PHP parser for Ville de Montréal rink conditions PDFs
<?php
// Original by: Philippe Dagenais-Pérusse
// Build $files, the list of PDF files to parse.
// - Command line: every argument after the script name is taken as a PDF path
//   (numeric keys).
// - Web request: every "*.pdf" entry of the PDF directory, keyed by file name.
if (!empty($argc)) { // Command-line interface ($argc is undefined otherwise)
    $files = array_slice($argv, 1);
}
else {
    // $path was read but never assigned in this script, so the directory scan
    // could not succeed. Default it to the directory the file paths were
    // built from, unless an including script has already provided one.
    if (!isset($path)) {
        $path = $_SERVER['DOCUMENT_ROOT'] . '/glaces/lib/files/';
    }
    $files = array();
    if (is_dir($path) && ($handle = opendir($path))) {
        while (($file = readdir($handle)) !== FALSE) {
            // Match on the real extension: a substring test also accepted
            // names such as "x.pdf.bak". "." and ".." have no extension and
            // are therefore excluded as well.
            if (pathinfo($file, PATHINFO_EXTENSION) === 'pdf') {
                // Build the path from the directory that was actually scanned.
                $files[$file] = rtrim($path, '/') . '/' . $file;
            }
        }
        closedir($handle);
    }
}
// Status columns of the rink table, keyed 1..9 in the order listed below.
// The key is the column offset: the parsing loop reads the cell of rink $i in
// column $offset at line $first + $i + ($total * $offset).
$headers = array_combine(
    range(1, 9),
    array(
        'OO', // Open: yes
        'ON', // Open: no
        'DO', // Cleared: yes
        'DN', // Cleared: no
        'AO', // Watered: yes
        'AN', // Watered: no
        'CE', // Condition: excellent
        'CB', // Condition: good
        'CM', // Condition: bad
    )
);

// Index of the first text line on which a rink name appears
$first = 13;
// Tells whether a status cell holds an expected value. A cell is expected to
// be either empty or marked "X"; anything else means parsing has failed.
// The comparison is deliberately loose (switch compares with ==).
function parsable($value) {
    switch ($value) {
        case '':
        case 'X':
            return true;
        default:
            return false;
    }
}
// Parse every PDF into one borough record and collect the records in
// $boroughs. A record whose layout does not match expectations (missing
// marker, missing line, unexpected cell content) is still emitted, but with
// 'INTEGRITE' set to 'Non'.
$boroughs = array();
foreach ($files as $basename => $file) {
    // In command-line mode $files has numeric keys and $file may include a
    // directory, so derive the file name from the path instead of the key.
    $name = basename($file);

    // Collect all text lines from the PDF: every line containing "Tj", with
    // its first character and the trailing ") Tj" removed.
    $raw = is_readable($file) ? file($file) : false;
    if ($raw === false) {
        $raw = array(); // Unreadable file: reported as unparseable below
    }
    $lines = array();
    foreach ($raw as $line) {
        if (strpos($line, 'Tj') !== false) {
            $lines[] = trim(substr(str_replace(') Tj', '', stripslashes($line)), 1));
        }
    }

    // Get the borough data
    $borough = array(
        'Fichier' => $name, // File
        'MAJ-Fichier' => filemtime($file), // File updated at
        'Region' => isset($lines[8]) ? $lines[8] : null, // Borough
        'MAJ' => isset($lines[0]) ? $lines[0] : null, // Updated at
        'Remarques' => '', // Remarks
        'Patinoires' => array(), // Rinks
        'INTEGRITE' => 'Oui', // Parseable
    );

    // Get the borough remarks: up to five lines preceding the marker line,
    // stopping at the first empty, "X" or numeric line. The marker differs
    // for L29_79.pdf; compare against the file name, not the full path.
    $marker = array_search($name === 'L29_79.pdf' ? 'Remarques' : 'Oui', $lines, true);
    if ($marker !== false) {
        for ($i = 1; $i <= 5 && $marker - $i >= 0; $i++) {
            $text = $lines[$marker - $i];
            if ($text === '' || $text === 'X' || is_numeric($text)) break;
            $borough['Remarques'] = $text . ' ' . $borough['Remarques'];
        }
    }

    // Total number of rinks: the line just before the marker below. It must
    // be a plain non-negative integer; otherwise no rink can be located (and
    // a non-numeric value would make the loop condition below unreliable).
    $total = 0;
    $marker = array_search('Entretenues par les citoyens', $lines, true);
    if ($marker !== false && $marker > 0 && preg_match('/^[0-9]+$/D', $lines[$marker - 1])) {
        $total = (int) $lines[$marker - 1];
    }
    else {
        $borough['INTEGRITE'] = 'Non';
    }

    // The "resurfaced" cells follow the 'Grand total' line. Without that
    // line they cannot be located, so they are left null rather than read
    // from unrelated lines.
    $grand = array_search('Grand total', $lines, true);

    // Get the rinks data
    for ($i = 0; $i < $total; $i++) {
        $yes = null;
        $no = null;
        if ($grand !== false) {
            $at = $grand + $i + 1;
            $yes = isset($lines[$at]) ? $lines[$at] : null;
            $no = isset($lines[$at + $total]) ? $lines[$at + $total] : null;
        }
        $rink = array(
            'Nom' => isset($lines[$first + $i]) ? $lines[$first + $i] : null, // Name
            'RO' => $yes, // Resurfaced: yes
            'RN' => $no, // Resurfaced: no
        );
        // A missing cell counts as a parse failure, like an unexpected value.
        if ($yes === null || $no === null || !parsable($yes) || !parsable($no)) {
            $borough['INTEGRITE'] = 'Non';
        }
        foreach ($headers as $offset => $key) {
            // Columns are stored one after another, $total lines each.
            $at = $first + $i + ($total * $offset);
            $value = isset($lines[$at]) ? $lines[$at] : null;
            $rink[$key] = $value;
            if ($value === null || !parsable($value)) {
                $borough['INTEGRITE'] = 'Non';
            }
        }
        $borough['Patinoires'][] = $rink;
    }
    $boroughs[] = $borough;
}
// usort() callback ordering two borough records by their 'Region' value.
// Returns strcmp()'s result: negative, zero or positive when $a sorts
// before, with, or after $b.
function cmp($a, $b) {
    $left = $a['Region'];
    $right = $b['Region'];
    return strcmp($left, $right);
}
// Sort the collected borough records in place, using cmp() to compare their
// 'Region' values; usort() reindexes the array numerically.
usort($boroughs, 'cmp'); // Order by borough
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment