Created
June 7, 2011 19:24
-
-
Save marcoceppi/1012943 to your computer and use it in GitHub Desktop.
Shitty "CSV" Cleaner
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env php5 | |
<?php
/**
 * Clean up a space-padded, column-oriented import file.
 *
 * For every *.txt file in the current working directory, each line is either
 *   - parsed into FIRST_NAME|MIDDLE|LAST_NAME|ADDRESS1|ADDRESS2|CITY|STATE|ZIP
 *     and appended to "<file>.parsed-<timestamp>", or
 *   - set aside verbatim in "<file>.bad-<timestamp>" when it contains one of
 *     the keywords in the skip pattern below (GUARDIAN, C/O, BANK, ...).
 *
 * The total number of set-aside lines is printed when the run finishes.
 */

$files = glob('*.txt');
if( $files === false )
{
    // glob() reports errors with false; treat that like "no input files".
    $files = array();
}

// Tokens that mark a secondary address line (or, for BOX, a PO box).
$addy2 = array('APT', 'UNIT', '#', 'BOX', 'SUITE', 'STE');
$skipped = 0;
// One timestamp per run so every output file of this run shares a suffix.
$now = time();

foreach( $files as $file )
{
    $lines = file($file);
    if( $lines === false )
    {
        // Unreadable input: still emit the header below, but parse nothing.
        $lines = array();
    }

    $output_name = str_replace('.txt', '.txt.parsed-' . $now, $file);
    $output_bad = str_replace('.txt', '.txt.bad-' . $now, $file);

    file_put_contents($output_name, 'FIRST_NAME|MIDDLE|LAST_NAME|ADDRESS1|ADDRESS2|CITY|STATE|ZIP' . PHP_EOL, FILE_APPEND);

    foreach( $lines as $line )
    {
        if( preg_match('/GUARDIAN|C\/O|ATTN|ASSN|CUSTODIAN|ADMINISTRATOR|BANK|REHAB|CONSERVATOR|COMPANY|SPOUSE/', $line) )
        {
            // Lines mentioning any of these keywords are not parsed; keep them for review.
            $skipped++;
            file_put_contents($output_bad, $line, FILE_APPEND);
            continue;
        }

        $str_offset = 20;
        $chunk_index = 200;

        // Take care of crammed columns: a 20-character column that is completely
        // full runs straight into the next one, e.g. "...USA LANECHICAGO IL".
        // Walking backwards over offsets 200, 180, ..., 100, insert a space
        // wherever the character at that offset is not already a space.
        // Going from the back keeps the lower offsets valid after an insertion.
        while( $chunk_index > 90 )
        {
            if( substr($line, $chunk_index, 1) != ' ' )
            {
                $line = substr_replace($line, ' ', $chunk_index, 0);
            }

            $chunk_index -= $str_offset;
        }

        // Collapse every run of spaces to a single space.
        // (preg_replace instead of ereg_replace, which was removed in PHP 7.)
        $line = trim(preg_replace('/ +/', ' ', $line));

        // Split into space-separated tokens.
        $parts = explode(' ', $line);

        // The first token starts with the ZIP; only its first five characters are kept.
        $zip = substr($parts[0], 0, 5);
        array_shift($parts);

        // Work from the back of the line, since the end is more regular than the front.
        $parts = array_reverse($parts);

        // After reversing, the state comes first and the city second.
        $state = array_shift($parts);
        $city = array_shift($parts);

        // Reset the per-line fields.
        $address1 = '';
        $address2 = '';
        $first_name = '';
        $middle_name = '';
        $last_name = '';

        // Guarded look-ahead so short lines do not raise undefined-offset notices.
        $head = isset($parts[0]) ? $parts[0] : '';
        $after_head = isset($parts[1]) ? $parts[1] : '';

        // Probably an ADDRESS2 value (unit number), but could belong to ADDRESS1:
        // it is numeric, starts with '#' or a digit, or is preceded by a known alias.
        if( is_numeric($head) || substr($head, 0, 1) == '#' || is_numeric(substr($head, 0, 1)) || in_array($after_head, $addy2, true) )
        {
            $address2 = array_shift($parts);

            if( isset($parts[0]) && in_array($parts[0], $addy2, true) )
            {
                if( $parts[0] === 'BOX' )
                {
                    // Drop 'BOX' and the token before it, then rebuild as a PO box.
                    array_shift($parts);
                    array_shift($parts);
                    $address1 = 'PO BOX' . ' ' . $address2;
                    $address2 = '';
                }
                else
                {
                    // The alias token stays in $parts; the street loop below skips it.
                    $address2 = $parts[0] . ' ' . $address2;
                }
            }
        }

        // Build up the street address, unless a PO box already filled ADDRESS1.
        if( $address1 === '' )
        {
            // Compare against null so a literal "0" token does not end the loop early.
            while( ($next = array_shift($parts)) !== null )
            {
                if( !in_array($next, $addy2, true) )
                {
                    $address1 = $next . ' ' . $address1;

                    // A purely numeric token is taken to be the house number: stop here.
                    if( is_numeric($next) )
                    {
                        break;
                    }
                }
            }
        }

        // Last name
        $last_name = array_shift($parts);

        // Could be a suffix
        if( $last_name == 'JR' )
        {
            $last_name = array_shift($parts) . ' JR';
        }

        // Make sure the "last name" wasn't actually a misplaced ADDRESS2 value or PO box number.
        if( isset($parts[0]) && in_array($parts[0], $addy2, true) )
        {
            if( $parts[0] === 'BOX' )
            {
                array_shift($parts);
                array_shift($parts);
                $address1 = 'PO BOX' . ' ' . $last_name;
                $address2 = '';
            }
            else
            {
                $address2 = array_shift($parts) . ' ' . $last_name;
            }

            $last_name = array_shift($parts);
        }

        // Whatever is left is the first name plus optional middle name(s).
        // The first character of the first-name token is dropped
        // (presumably junk crammed in front of the name -- confirm against the input format).
        if( count($parts) > 1 )
        {
            // Back to reading order: first name first, then the middle names.
            $parts = array_reverse($parts);
            $first_name = substr((string) array_shift($parts), 1);

            // A numeric remainder is not a name; use the next token instead.
            if( is_numeric($first_name) )
            {
                $first_name = substr((string) array_shift($parts), 1);
            }

            $middle_name = implode(' ', $parts);
        }
        else
        {
            $first_name = substr((string) array_shift($parts), 1);
        }

        // Build the output record and append it.
        $line = $first_name . '|' . $middle_name . '|' . $last_name . '|' . trim($address1) . '|' . trim($address2) . '|' . $city . '|' . $state . '|' . $zip . PHP_EOL;
        file_put_contents($output_name, $line, FILE_APPEND);
    }
}

echo $skipped . ' lines skipped' . PHP_EOL;
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment