Skip to content

Instantly share code, notes, and snippets.

@reinvented
Created December 7, 2018 19:22
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save reinvented/7e50b84cd85a03325bd090f6822b6ae9 to your computer and use it in GitHub Desktop.
Save reinvented/7e50b84cd85a03325bd090f6822b6ae9 to your computer and use it in GitHub Desktop.
A PHP script to convert text files of Prince Edward Island Hansard to a CSV
<?php
/**
 * Convert text files of Prince Edward Island Hansard to a single CSV file.
 *
 * Every "*.txt" file that `find` reports under ./documents is read line by
 * line. A line that opens with a speaker label ("Mr. Smith:", "Leader of the
 * Opposition:", "Chair (...):", "Clerk Assistant (...):") starts a new CSV
 * row; the lines that follow are appended to that row, separated by a single
 * space, until the next speaker label appears.
 *
 * Dropped lines:
 *   - everything before the first speaker label of a file;
 *   - non-speaker lines containing "HANSARD PEI" or starting with a digit;
 *   - speaker-labelled lines containing "Published by Order of the Legislature".
 *
 * Output: hansard.csv in the current working directory with the columns
 * date, text, speaker, wordcount. "date" is the file's base name with
 * "-hansard.txt" removed.
 */

/**
 * Find the speaker label at the start of a transcript line.
 *
 * @param string $line One transcript line, trailing whitespace removed.
 * @return array|null The preg_match() groups ([0] = label including the colon,
 *                    [1] = speaker name), or null when no pattern matches.
 */
function hansard_match_speaker($line)
{
    // Tried in order; the first pattern that matches wins.
    $patterns = [
        "/^(([A-Z][a-zA-Z\.\-]+\s?\b){1,4}):/",
        "/^(Leader of the Opposition):/",
        "/^(Chair \(.*\)):/",
        "/^(Clerk Assistant \(.*\)):/",
    ];

    foreach ($patterns as $pattern) {
        if (preg_match($pattern, $line, $matches)) {
            return $matches;
        }
    }

    return null;
}

/**
 * Group the lines of one transcript into one row per speech.
 *
 * @param array  $lines Transcript lines in file order, trailing whitespace removed.
 * @param string $date  Value stored in the "date" column of every row.
 * @return array List of rows; each row has the keys date, text, speaker and
 *               wordcount, in that order.
 */
function hansard_parse_lines(array $lines, $date)
{
    $rows = [];
    $current = null; // index in $rows of the speech being accumulated

    foreach ($lines as $line) {
        $matches = hansard_match_speaker($line);

        if ($matches !== null) {
            // A speaker-labelled line carrying the publication notice is
            // discarded and does not start a speech.
            if (preg_match("/Published by Order of the Legislature/", $line)) {
                continue;
            }

            // Cut the label off the front of the line only. The previous
            // str_replace() left the label in place when no space followed
            // it and also removed later occurrences inside the speech.
            $rows[] = [
                'date' => $date,
                'text' => ltrim((string) substr($line, strlen($matches[0]))),
                'speaker' => $matches[1],
            ];
            $current = count($rows) - 1;
            continue;
        }

        // Nothing to attach to before the first speaker of the file.
        if ($current === null) {
            continue;
        }

        // Skip "HANSARD PEI" lines and lines that start with a digit.
        if (preg_match("/^.*HANSARD PEI.*/", $line) || preg_match("/^\d+/", $line)) {
            continue;
        }

        $rows[$current]['text'] .= ' ' . $line;
    }

    foreach ($rows as $index => $row) {
        $rows[$index]['wordcount'] = str_word_count($row['text']);
    }

    return $rows;
}

$files = popen('find ./documents -name "*.txt"', 'r');
if ($files === false) {
    file_put_contents('php://stderr', "Unable to run find on ./documents\n");
    exit(1);
}

$out = fopen('hansard.csv', 'w');
if ($out === false) {
    pclose($files);
    file_put_contents('php://stderr', "Unable to open hansard.csv for writing\n");
    exit(1);
}

// Every row has four fields, so the header names all four.
fwrite($out, "date,text,speaker,wordcount\n");

// Testing the fgets() result (instead of looping on feof()) avoids handling a
// phantom empty entry after the last line; no length cap, so long paths are
// read whole.
while (($raw = fgets($files)) !== false) {
    // Strip only the line terminator so file names keep any trailing spaces.
    $filename = rtrim($raw, "\r\n");
    if ($filename === '') {
        continue;
    }

    $date = str_replace("-hansard.txt", "", basename($filename));

    // file() reads whole lines and releases its handle itself.
    $lines = file($filename);
    if ($lines === false) {
        file_put_contents('php://stderr', "Skipping unreadable file: " . $filename . "\n");
        continue;
    }
    $lines = array_map('rtrim', $lines);

    foreach (hansard_parse_lines($lines, $date) as $row) {
        // Delimiter, enclosure and escape are fputcsv()'s long-standing
        // defaults, spelled out so the output does not depend on them.
        fputcsv($out, array_values($row), ',', '"', '\\');
    }
}

pclose($files);
fclose($out);
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment