Skip to content

Instantly share code, notes, and snippets.

@jotaelesalinas
Created September 30, 2017 15:11
Show Gist options
  • Save jotaelesalinas/f6a36eacdb1c47c46b5260dcc15c8f68 to your computer and use it in GitHub Desktop.
Save jotaelesalinas/f6a36eacdb1c47c46b5260dcc15c8f68 to your computer and use it in GitHub Desktop.
Download baby-name data by state from the Social Security Administration
<?php
// Dependencies: composer require guzzlehttp/guzzle jotaelesalinas/php-rwgen
require 'vendor/autoload.php';

// Directory next to this script that receives the downloaded pages and the CSV.
define('DATA_DIR', __DIR__ . '/data');

// Values sent as the "state" form field: the 50 states plus DC.
$states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'];

// Inclusive range of years to request.
$year_init = 1960;
$year_end = 2016;

// Create a client with a base URI; request paths are resolved against it.
$client = new GuzzleHttp\Client(['base_uri' => 'https://www.ssa.gov/']);

// Make sure the data directory exists. The second is_dir() check covers the
// case where the directory appears between the first check and mkdir().
// Abort early on failure instead of failing on every later write.
if (!is_dir(DATA_DIR) && !mkdir(DATA_DIR, 0777, true) && !is_dir(DATA_DIR)) {
    throw new RuntimeException('Could not create data directory: ' . DATA_DIR);
}
// Download one results page per (year, state) pair into DATA_DIR.
// Pages already on disk are skipped, so an interrupted run can be resumed.
echo "Downloading files to ./data:\n";
for ($year = $year_init; $year <= $year_end; $year++) {
    foreach ($states as $state) {
        $filename = DATA_DIR . "/raw_{$year}_{$state}.html";
        if (file_exists($filename)) {
            continue;
        }

        echo " - Downloading " . basename($filename) . "... ";

        try {
            $response = $client->request('POST', '/cgi-bin/namesbystate.cgi', [
                'form_params' => ['state' => $state, 'year' => $year],
                // Without this Guzzle throws on 4xx/5xx responses, so the
                // status check below would never see them and a single bad
                // response would abort the whole run.
                'http_errors' => false,
            ]);
        } catch (GuzzleHttp\Exception\GuzzleException $e) {
            // Transport-level failure (DNS, connection, timeout, ...).
            echo " Failed!\n";
            continue;
        }

        if ($response->getStatusCode() !== 200) {
            echo " Failed!\n";
            continue;
        }

        if (file_put_contents($filename, (string) $response->getBody()) === false) {
            // Remove a partial file so the next run does not mistake it
            // for a finished download.
            if (file_exists($filename)) {
                unlink($filename);
            }
            echo " Failed!\n";
            continue;
        }

        echo " Ok.\n";
    }
}
echo "Finished downloading data.\n";
// Parse every downloaded page and write one CSV record per (row, gender) to
// DATA_DIR/original_data.csv with the fields year, state, gender, pos, name, count.
echo "Parsing files.\n";
$output_csv = new JLSalinas\RWGen\Writers\Csv(DATA_DIR . "/original_data.csv");

// glob() returns false on error; treat that as "no files".
foreach (glob(DATA_DIR . '/*.html') ?: [] as $filename) {
    // Year and state are recovered from the file name written by the download step.
    if (!preg_match('/^raw_(\d+)_(\w+)\.html$/', basename($filename), $name_parts)) {
        echo " - Failed reading " . basename($filename) . ": Wrong name.\n";
        continue;
    }
    $year = $name_parts[1];
    $state = $name_parts[2];

    echo " - Parsing " . basename($filename) . "... ";

    $html = file_get_contents($filename);
    if ($html === false) {
        echo " Failed: Unreadable file.\n";
        continue;
    }

    // Keep only the markup between the first </caption> and the following </table>.
    $segments = explode('</caption>', $html);
    if (count($segments) < 2) {
        // No caption at all, e.g. an error page was saved instead of results.
        echo " Failed: No data table.\n";
        continue;
    }
    $table = explode('</table>', $segments[1])[0];

    // Flatten the markup to one line so the row/cell patterns are not
    // affected by line breaks or tabs.
    $table = str_replace("\r", '', $table);
    $table = preg_replace("/\n|\t/ms", ' ', $table);

    if (!preg_match_all('/<tr[^>]*>(.+?)<\/tr>/ims', $table, $row_matches)) {
        echo " Failed: No rows.\n";
        continue;
    }
    $rows = $row_matches[1];
    // The first row is discarded (presumably the column headers - confirm against a sample page).
    array_shift($rows);

    $skipped = 0;
    foreach ($rows as $row) {
        $found = preg_match_all('/<td[^>]*>\s*(.+?)\s*<\/td>/ims', $row, $cell_matches);
        // Five cells are read below: pos, male name, male count, female name, female count.
        if ($found === false || $found < 5) {
            $skipped++;
            continue;
        }
        $cells = $cell_matches[1];

        // gender => [name cell index, count cell index]; male record first, then female.
        foreach (['m' => [1, 2], 'f' => [3, 4]] as $gender => $columns) {
            $output_csv->send([
                'year' => $year,
                'state' => $state,
                'gender' => $gender,
                'pos' => $cells[0],
                'name' => $cells[$columns[0]],
                // Counts carry thousands separators in the page; strip them.
                'count' => str_replace(',', '', $cells[$columns[1]]),
            ]);
        }
    }

    if ($skipped > 0) {
        echo "Ok ({$skipped} malformed rows skipped).\n";
    } else {
        echo "Ok.\n";
    }
}

// NOTE(review): sending null presumably tells the writer to finish and close the file - confirm in php-rwgen docs.
$output_csv->send(null);
echo "Finished parsing files.\n";
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment