jotaelesalinas/ml-ssa-birthnames-download.php

## ml-ssa-birthnames-download.php
<?php
// Step 1 of the script: POST every (year, state) combination to the
// names-by-state form on www.ssa.gov and store each response body as
// data/raw_<year>_<state>.html. Files already on disk are not requested again,
// so an interrupted run can simply be restarted.
//
// composer require guzzlehttp/guzzle jotaelesalinas/php-rwgen
require 'vendor/autoload.php';

define('DATA_DIR', dirname(__FILE__) . '/data');

// State codes sent in the "state" form field (51 entries, DC included).
$states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'];
// Inclusive range of years to request.
$year_init = 1960;
$year_end = 2016;

// Create a client with a base URI
$client = new GuzzleHttp\Client(['base_uri' => 'https://www.ssa.gov/']);

// Make sure the output directory exists. is_dir() (rather than file_exists())
// rejects a regular file named "data"; the second is_dir() tolerates another
// process creating the directory between the check and mkdir().
if ( !is_dir(DATA_DIR) && !mkdir(DATA_DIR) && !is_dir(DATA_DIR) ) {
    echo "Unable to create directory " . DATA_DIR . "\n";
    exit(1);
}

echo "Downloading files to ./data:\n";

for ( $year = $year_init; $year <= $year_end; $year++ ) {
    foreach ( $states as $state ) {
        // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2.
        $filename = DATA_DIR . "/raw_{$year}_{$state}.html";
        if ( file_exists($filename) ) {
            continue;
        }

        echo " - Downloading " . basename($filename) . "... ";

        try {
            $response = $client->request('POST', '/cgi-bin/namesbystate.cgi', [
                'form_params' => [ 'state' => $state, 'year' => $year ],
                // Without this Guzzle throws on 4xx/5xx responses and the
                // status check below would never see them.
                'http_errors' => false,
            ]);
        } catch ( GuzzleHttp\Exception\GuzzleException $e ) {
            // Connection problems, timeouts, etc.: report and move on to the
            // next file instead of aborting the whole run.
            echo " Failed! (" . $e->getMessage() . ")\n";
            continue;
        }

        if ( $response->getStatusCode() !== 200 ) {
            echo " Failed!\n";
            continue;
        }

        // Casting the body to string reads the whole stream from its start.
        if ( file_put_contents($filename, (string) $response->getBody()) === false ) {
            echo " Failed! (could not write file)\n";
            continue;
        }
        echo " Ok.\n";
    }
}

echo "Finished downloading data.\n";

// Step 2 of the script: read every data/raw_<year>_<state>.html file, pull the
// rows out of the table that follows the first </caption>, and write two CSV
// records per row (gender 'm' and gender 'f') to data/original_data.csv.
echo "Parsing files.\n";

$output_csv = new JLSalinas\RWGen\Writers\Csv(DATA_DIR . "/original_data.csv");

// glob() returns false on error; treat that like an empty directory.
$html_files = glob(DATA_DIR . '/*.html');
if ( $html_files === false ) {
    $html_files = [];
}

foreach ( $html_files as $filename ) {
    // Year and state are recovered from the file name written by the download step.
    if ( !preg_match('/^raw_(\d+)_(\w+)\.html$/', basename($filename), $name_parts) ) {
        echo " - Failed reading " . basename($filename) . ": Wrong name.\n";
        continue;
    }

    $year = $name_parts[1];
    $state = $name_parts[2];

    echo " - Parsing " . basename($filename) . "... ";

    $html = file_get_contents($filename);
    if ( $html === false ) {
        echo " Failed: Cannot read file.\n";
        continue;
    }

    // Keep only the markup between the first </caption> and the next </table>.
    // A page without a caption (e.g. a saved error page) has no such section.
    $caption_parts = explode('</caption>', $html);
    if ( count($caption_parts) < 2 ) {
        echo " Failed: No table.\n";
        continue;
    }
    $html = explode('</table>', $caption_parts[1], 2)[0];

    // Drop carriage returns and turn newlines/tabs into spaces so that each
    // row and cell can be matched as a single line of text.
    $html = str_replace(["\r", "\n", "\t"], ['', ' ', ' '], $html);

    if ( !preg_match_all('/<tr[^>]*>(.+?)<\/tr>/ims', $html, $row_matches) ) {
        echo " Failed: No rows.\n";
        continue;
    }
    $rows = $row_matches[1];
    // The first row is discarded (presumably the header row — confirm against a saved page).
    array_shift($rows);

    $skipped = 0;
    foreach ( $rows as $row ) {
        preg_match_all('/<td[^>]*>\s*(.+?)\s*<\/td>/ims', $row, $cell_matches);
        $cells = isset($cell_matches[1]) ? $cell_matches[1] : [];

        // Cells are read by position: 0 = pos, 1/2 = name and count written as
        // gender 'm', 3/4 = name and count written as gender 'f'. A row with
        // fewer cells cannot be mapped, so skip it instead of emitting
        // records with missing values.
        if ( count($cells) < 5 ) {
            $skipped++;
            continue;
        }

        foreach ( ['m' => 1, 'f' => 3] as $gender => $offset ) {
            $output_csv->send([
                'year' =>   $year,
                'state' =>  $state,
                'gender' => $gender,
                'pos' =>    $cells[0],
                'name' =>   $cells[$offset],
                // Counts are stored without thousands separators.
                'count' =>  str_replace(',', '', $cells[$offset + 1]),
            ]);
        }
    }

    if ( $skipped > 0 ) {
        echo "Ok ({$skipped} malformed rows skipped).\n";
    } else {
        echo "Ok.\n";
    }
}

// NOTE(review): sending null presumably tells the writer that no more records
// follow — confirm against the php-rwgen documentation.
$output_csv->send(null);
echo "Finished parsing files.\n";
	<?php
	// Step 1 of the script: POST every (year, state) combination to the
	// names-by-state form on www.ssa.gov and store each response body as
	// data/raw_<year>_<state>.html. Files already on disk are not requested
	// again, so an interrupted run can simply be restarted.
	//
	// composer require guzzlehttp/guzzle jotaelesalinas/php-rwgen
	require 'vendor/autoload.php';

	define('DATA_DIR', dirname(__FILE__) . '/data');

	// State codes sent in the "state" form field (51 entries, DC included).
	$states = ['AL', 'AK', 'AZ', 'AR', 'CA', 'CO', 'CT', 'DE', 'DC', 'FL', 'GA', 'HI', 'ID', 'IL', 'IN', 'IA', 'KS', 'KY', 'LA', 'ME', 'MD', 'MA', 'MI', 'MN', 'MS', 'MO', 'MT', 'NE', 'NV', 'NH', 'NJ', 'NM', 'NY', 'NC', 'ND', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX', 'UT', 'VT', 'VA', 'WA', 'WV', 'WI', 'WY'];
	// Inclusive range of years to request.
	$year_init = 1960;
	$year_end = 2016;

	// Create a client with a base URI
	$client = new GuzzleHttp\Client(['base_uri' => 'https://www.ssa.gov/']);

	// Make sure the output directory exists. is_dir() (rather than
	// file_exists()) rejects a regular file named "data"; the second is_dir()
	// tolerates another process creating the directory in the meantime.
	if ( !is_dir(DATA_DIR) && !mkdir(DATA_DIR) && !is_dir(DATA_DIR) ) {
	    echo "Unable to create directory " . DATA_DIR . "\n";
	    exit(1);
	}

	echo "Downloading files to ./data:\n";

	for ( $year = $year_init; $year <= $year_end; $year++ ) {
	    foreach ( $states as $state ) {
	        // "{$var}" instead of "${var}": the latter is deprecated since PHP 8.2.
	        $filename = DATA_DIR . "/raw_{$year}_{$state}.html";
	        if ( file_exists($filename) ) {
	            continue;
	        }

	        echo " - Downloading " . basename($filename) . "... ";

	        try {
	            $response = $client->request('POST', '/cgi-bin/namesbystate.cgi', [
	                'form_params' => [ 'state' => $state, 'year' => $year ],
	                // Without this Guzzle throws on 4xx/5xx responses and the
	                // status check below would never see them.
	                'http_errors' => false,
	            ]);
	        } catch ( GuzzleHttp\Exception\GuzzleException $e ) {
	            // Connection problems, timeouts, etc.: report and move on to
	            // the next file instead of aborting the whole run.
	            echo " Failed! (" . $e->getMessage() . ")\n";
	            continue;
	        }

	        if ( $response->getStatusCode() !== 200 ) {
	            echo " Failed!\n";
	            continue;
	        }

	        // Casting the body to string reads the whole stream from its start.
	        if ( file_put_contents($filename, (string) $response->getBody()) === false ) {
	            echo " Failed! (could not write file)\n";
	            continue;
	        }
	        echo " Ok.\n";
	    }
	}

	echo "Finished downloading data.\n";

	// Step 2 of the script: read every data/raw_<year>_<state>.html file, pull
	// the rows out of the table that follows the first </caption>, and write
	// two CSV records per row (gender 'm' and gender 'f') to
	// data/original_data.csv.
	echo "Parsing files.\n";

	$output_csv = new JLSalinas\RWGen\Writers\Csv(DATA_DIR . "/original_data.csv");

	// glob() returns false on error; treat that like an empty directory.
	$html_files = glob(DATA_DIR . '/*.html');
	if ( $html_files === false ) {
	    $html_files = [];
	}

	foreach ( $html_files as $filename ) {
	    // Year and state are recovered from the file name written by the download step.
	    if ( !preg_match('/^raw_(\d+)_(\w+)\.html$/', basename($filename), $name_parts) ) {
	        echo " - Failed reading " . basename($filename) . ": Wrong name.\n";
	        continue;
	    }

	    $year = $name_parts[1];
	    $state = $name_parts[2];

	    echo " - Parsing " . basename($filename) . "... ";

	    $html = file_get_contents($filename);
	    if ( $html === false ) {
	        echo " Failed: Cannot read file.\n";
	        continue;
	    }

	    // Keep only the markup between the first </caption> and the next
	    // </table>. A page without a caption (e.g. a saved error page) has no
	    // such section.
	    $caption_parts = explode('</caption>', $html);
	    if ( count($caption_parts) < 2 ) {
	        echo " Failed: No table.\n";
	        continue;
	    }
	    $html = explode('</table>', $caption_parts[1], 2)[0];

	    // Drop carriage returns and turn every newline and every tab into a
	    // space. (The previous pattern "\n\|\t" escaped the alternation bar and
	    // therefore only matched the literal sequence newline, "|", tab.)
	    $html = str_replace(["\r", "\n", "\t"], ['', ' ', ' '], $html);

	    if ( !preg_match_all('/<tr[^>]*>(.+?)<\/tr>/ims', $html, $row_matches) ) {
	        echo " Failed: No rows.\n";
	        continue;
	    }
	    $rows = $row_matches[1];
	    // The first row is discarded (presumably the header row — confirm against a saved page).
	    array_shift($rows);

	    $skipped = 0;
	    foreach ( $rows as $row ) {
	        // "[^>]*" and "\s*" need their quantifiers: without them the
	        // pattern cannot match "<td>" or a <td> with ordinary attributes.
	        preg_match_all('/<td[^>]*>\s*(.+?)\s*<\/td>/ims', $row, $cell_matches);
	        $cells = isset($cell_matches[1]) ? $cell_matches[1] : [];

	        // Cells are read by position: 0 = pos, 1/2 = name and count written
	        // as gender 'm', 3/4 = name and count written as gender 'f'. A row
	        // with fewer cells cannot be mapped, so skip it instead of emitting
	        // records with missing values.
	        if ( count($cells) < 5 ) {
	            $skipped++;
	            continue;
	        }

	        foreach ( ['m' => 1, 'f' => 3] as $gender => $offset ) {
	            $output_csv->send([
	                'year' => $year,
	                'state' => $state,
	                'gender' => $gender,
	                'pos' => $cells[0],
	                'name' => $cells[$offset],
	                // Counts are stored without thousands separators.
	                'count' => str_replace(',', '', $cells[$offset + 1]),
	            ]);
	        }
	    }

	    if ( $skipped > 0 ) {
	        echo "Ok ({$skipped} malformed rows skipped).\n";
	    } else {
	        echo "Ok.\n";
	    }
	}

	// NOTE(review): sending null presumably tells the writer that no more
	// records follow — confirm against the php-rwgen documentation.
	$output_csv->send(null);
	echo "Finished parsing files.\n";