Skip to content

Instantly share code, notes, and snippets.

@marcoceppi
Created June 7, 2011 19:24
Show Gist options
  • Save marcoceppi/1012943 to your computer and use it in GitHub Desktop.
Save marcoceppi/1012943 to your computer and use it in GitHub Desktop.
Shitty "CSV" Cleaner
#!/usr/bin/env php5
<?php
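/**
* Clean up a messy column-based address import file.
*
* Every *.txt file in the current directory is read line by line. Lines that
* can be parsed are written as pipe-delimited records to
* "<name>.txt.parsed-<timestamp>"; lines matching one of the skip keywords
* (GUARDIAN, C/O, ATTN, ...) are copied unchanged to "<name>.txt.bad-<timestamp>".
*/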
/**
 * Convert one raw import line into a pipe-delimited output record.
 *
 * The line is tokenised on spaces. The first token carries the ZIP code in
 * its first five characters; the remaining tokens are consumed from the END
 * of the line backwards (state, city, optional address2, street address,
 * last name, then first/middle name), because the tail of the line is more
 * regular than the head.
 *
 * @param string $line  Raw line from the import file (a trailing newline is fine).
 * @param array  $addy2 Tokens that introduce a secondary address line
 *                      (e.g. 'APT', 'UNIT', '#', 'BOX', 'SUITE', 'STE').
 *
 * @return string "FIRST|MIDDLE|LAST|ADDRESS1|ADDRESS2|CITY|STATE|ZIP" followed by PHP_EOL.
 */
function clean_import_line($line, array $addy2)
{
    // Take care of crammed columns: where a 20-character column runs straight
    // into the next one, e.g.
    //   123 MAINSTREETUSA LANECHICAGO IL
    // force a space at offsets 200, 180, ..., 100. Walking from the highest
    // offset down means an insertion never shifts an offset still to be checked.
    for ($chunk_index = 200; $chunk_index > 90; $chunk_index -= 20) {
        if (substr($line, $chunk_index, 1) !== ' ') {
            $line = substr_replace($line, ' ', $chunk_index, 0);
        }
    }

    // Collapse every run of spaces into one. preg_replace() is used because
    // ereg_replace() was removed in PHP 7.0.
    $line = trim(preg_replace('/ +/', ' ', $line));

    // Split into tokens.
    $parts = explode(' ', $line);

    // First token is the ZIP, crammed next to other data; keep the first 5 characters.
    $zip = substr($parts[0], 0, 5);
    array_shift($parts);

    // Work from the back of the line, since the end is more regular than the front.
    $parts = array_reverse($parts);

    // The last two tokens are the state and the city.
    $state = array_shift($parts);
    $city = array_shift($parts);

    $address1 = '';
    $address2 = '';
    $first_name = '';
    $middle_name = '';

    // Probably an address2 value, but could still be part of address1: treat
    // the token as a unit/box number when it is numeric, starts with '#' or a
    // digit, or is directly preceded by a known address2 designator.
    $head = isset($parts[0]) ? $parts[0] : '';
    $second = isset($parts[1]) ? $parts[1] : '';
    if (is_numeric($head)
        || substr($head, 0, 1) === '#'
        || is_numeric(substr($head, 0, 1))
        || in_array($second, $addy2, true)
    ) {
        $address2 = array_shift($parts);
        $head = isset($parts[0]) ? $parts[0] : '';
        if (in_array($head, $addy2, true)) {
            if ($head === 'BOX') {
                // Drop 'BOX' and the token before it, then rebuild the
                // address as a normalised "PO BOX <number>".
                array_shift($parts);
                array_shift($parts);
                $address1 = 'PO BOX' . ' ' . $address2;
                $address2 = '';
            } else {
                // The designator is left in $parts on purpose; the street
                // loop below skips designator tokens.
                $address2 = $head . ' ' . $address2;
            }
        }
    }

    // Build up the street address, prepending tokens until the (numeric)
    // house number is reached. The explicit null comparison keeps a literal
    // "0" token from ending the loop early.
    if ($address1 === '') {
        while (($next = array_shift($parts)) !== null) {
            if (!in_array($next, $addy2, true)) {
                $address1 = $next . ' ' . $address1;
                if (is_numeric($next)) {
                    break;
                }
            }
        }
    }

    // Last name, possibly preceded (in reversed order) by a 'JR' suffix.
    $last_name = array_shift($parts);
    if ($last_name === 'JR') {
        $last_name = array_shift($parts) . ' JR';
    }

    // Make sure the "last name" wasn't actually a misplaced address2 value or PO box number.
    $head = isset($parts[0]) ? $parts[0] : '';
    if (in_array($head, $addy2, true)) {
        if ($head === 'BOX') {
            array_shift($parts);
            array_shift($parts);
            $address1 = 'PO BOX' . ' ' . $last_name;
            $address2 = '';
        } else {
            $address2 = array_shift($parts) . ' ' . $last_name;
        }
        $last_name = array_shift($parts);
    }

    // Whatever is left is the first name plus optional middle name(s).
    // The first character of the first-name token is dropped.
    // NOTE(review): presumably a marker character fused to the name in the
    // input format - confirm against a sample file.
    if (count($parts) > 1) {
        // Restore the original left-to-right order.
        $parts = array_reverse($parts);
        $first_name = (string) substr((string) array_shift($parts), 1);
        if (is_numeric($first_name)) {
            // A numeric leading token is not a name; use the next token instead.
            $first_name = (string) substr((string) array_shift($parts), 1);
        }
        $middle_name = implode(' ', $parts);
    } else {
        $first_name = (string) substr((string) array_shift($parts), 1);
    }

    return $first_name . '|' . $middle_name . '|' . $last_name . '|'
        . trim($address1) . '|' . trim($address2) . '|'
        . $city . '|' . $state . '|' . $zip . PHP_EOL;
}

// ---------------------------------------------------------------------------
// Main script: process every *.txt file in the current directory.
// ---------------------------------------------------------------------------
$files = glob('*.txt');
if ($files === false) {
    // glob() returns false on error; treat that as "nothing to process".
    $files = array();
}
$addy2 = array('APT', 'UNIT', '#', 'BOX', 'SUITE', 'STE');
$skipped = 0;
// A single timestamp is shared by all output files of this run.
$now = time();

foreach ($files as $file) {
    $lines = file($file);
    if ($lines === false) {
        fwrite(STDERR, 'Unable to read ' . $file . PHP_EOL);
        continue;
    }

    $output_name = str_replace('.txt', '.txt.parsed-' . $now, $file);
    $output_bad = str_replace('.txt', '.txt.bad-' . $now, $file);
    file_put_contents($output_name, 'FIRST_NAME|MIDDLE|LAST_NAME|ADDRESS1|ADDRESS2|CITY|STATE|ZIP' . PHP_EOL, FILE_APPEND);

    foreach ($lines as $line) {
        // Lines mentioning these keywords are not parsed; they are set aside
        // unchanged in the ".bad" file.
        if (preg_match('/GUARDIAN|C\/O|ATTN|ASSN|CUSTODIAN|ADMINISTRATOR|BANK|REHAB|CONSERVATOR|COMPANY|SPOUSE/', $line)) {
            $skipped++;
            file_put_contents($output_bad, $line, FILE_APPEND);
            continue;
        }

        file_put_contents($output_name, clean_import_line($line, $addy2), FILE_APPEND);
    }
}

echo $skipped . ' lines skipped' . PHP_EOL;
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment