Skip to content

Instantly share code, notes, and snippets.

@GRMule
Created December 3, 2013 19:00
Show Gist options
  • Save GRMule/7775384 to your computer and use it in GitHub Desktop.
Save GRMule/7775384 to your computer and use it in GitHub Desktop.
A partially working effort to parse and normalize address data
/**
 * Parse a free-form, single-line postal address into normalized components.
 *
 * The address is consumed from the outside in: trailing unit designator,
 * suffix direction and street type are stripped from the end, then the house
 * number (with optional fraction) or a PO Box / route designator is stripped
 * from the start, then an optional prefix direction. Whatever text is left
 * over is reported as the street name.
 *
 * Only the keys that could be extracted are present in the result, always in
 * this order:
 *   type_definite, address_type,
 *   raw_po_box, po_box,
 *   raw_route, route_number, route_box_number,
 *   raw_hc, hc_number, hc_box_number,
 *   raw_starrt, starrt_number, starrt_box_number,
 *   number, fraction, prefix_direction, raw_prefix_direction,
 *   name, type, raw_type, suffix_direction, raw_suffix_direction,
 *   unit, raw_unit, raw_address
 *
 * The raw_* entries hold the matched text exactly as the regular expression
 * captured it, so they may carry the surrounding whitespace.
 *
 * @param string $address Address line; "." and "," are removed before parsing.
 * @return array Associative array of the components described above.
 */
function parse_address($address){
    // Lookup tables are keyed by lower-case token: the regular expressions
    // below match case-insensitively, so the captured token is lower-cased
    // before it is used as an index.
    $dir=array(
        'n'=>'N','s'=>'S','e'=>'E','w'=>'W','nw'=>'NW','sw'=>'SW','ne'=>'NE','se'=>'SE',
        'north'=>'N','south'=>'S','east'=>'E','west'=>'W',
        'northwest'=>'NW','southwest'=>'SW','northeast'=>'NE','southeast'=>'SE'
    );
    $type=array(
        'ave'=>'Ave','blvd'=>'Blvd','st'=>'St','wy'=>'Wy','cir'=>'Cir','dr'=>'Dr','ln'=>'Ln','pl'=>'Pl','rd'=>'Rd',
        'bvd'=>'Blvd',
        'avenue'=>'Ave','boulevard'=>'Blvd','street'=>'St','way'=>'Wy','circle'=>'Cir','drive'=>'Dr','lane'=>'Ln','place'=>'Pl','road'=>'Rd'
    );
    $dirPattern='North|South|East|West|Northeast|Southeast|Southwest|Northwest|N|S|E|W|NE|SE|SW|NW';

    $b=array();

    // Strip punctuation first and trim afterwards, so that whitespace exposed
    // by a removed trailing "." or "," cannot defeat the "$"-anchored patterns.
    $address=trim(str_replace(array('.', ','), '', (string)$address));
    $b['raw_address']=$address;

    //remove any unit or apt # from the end
    //a number alone at the end is not enough, we need at least # or one of the descriptors in ()
    $unitPrefix='(\s+(Apt|Apartment|Suite|Ste|Unit|Bldg|Building|Room|Rm|#)\s*)+#?';
    if(preg_match('/'.$unitPrefix.'[-a-z0-9]+$/i',$address,$m)){
        $b['raw_unit']=$m[0];
        // the unit identifier is the raw match minus its descriptor(s)
        $b['unit']=preg_replace('/'.$unitPrefix.'/i','',$m[0]);
        $address=(string)substr($address,0,strlen($address)-strlen($m[0]));
    }

    //parse suffix direction (SW)
    if(preg_match('/\s+('.$dirPattern.')$/i',$address,$m)){
        $b['raw_suffix_direction']=$m[0];
        // $m[1] is the bare token; $m[0] still carries the leading whitespace
        $b['suffix_direction']=$dir[strtolower($m[1])];
        $address=(string)substr($address,0,strlen($address)-strlen($m[0]));
    }

    //remove type of street
    if(preg_match('/\s+(St|Bvd|Ave|Wy|Cir|Dr|Ln|Pl|Rd|Boulevard|Blvd|Street|Avenue|Way|Circle|Drive|Lane|Place|Road)$/i',$address,$m)){
        $token=strtolower($m[1]);
        $b['raw_type']=$m[0];
        // NOTE(review): the original computed this flag but never stored it,
        // although 'type_definite' heads the output order. Presumably short
        // standard abbreviations are considered unambiguous while spelled-out
        // words, "way" and "bvd" are not -- confirm the intended meaning.
        $b['type_definite']=!(strlen($token)>3 || $token=='way' || $token=='bvd');
        $b['type']=isset($type[$token]) ? $type[$token] : $m[1];
        $address=(string)substr($address,0,strlen($address)-strlen($m[0]));
    }

    //remove number and fraction
    if(preg_match('/^[0-9]+(\s+[0-9]+\/[0-9]+)*/',$address,$m)){
        $address=(string)substr($address,strlen($m[0]));
        $number=$m[0];
        if(preg_match('/\s+[0-9]+\/[0-9]+$/',$number,$frac)){
            $b['fraction']=trim($frac[0]);
            $number=substr($number,0,strlen($number)-strlen($frac[0]));
        }
        $b['number']=trim($number);
    }else{
        //account for possible P.O. Boxes and Rural Routes
        // The keyword must be followed by whitespace or directly by a digit,
        // so that ordinary words starting with "Box" are not taken for a box.
        if(preg_match('/^(POB|P\s*O\s*Box|Post Office Box|Postal Box|Box|Boite Postal)(?:\s+|(?=[0-9]))([0-9a-z]+(-[0-9a-z]+)*)/i',$address,$m)){
            $b['raw_po_box']=$m[0];
            $b['po_box']=strtoupper($m[2]);
            $b['address_type']="Post Office Box";
            // consume the box so it is not reported again as the street name
            $address=(string)substr($address,strlen($m[0]));
        }
        // Route designators, most specific first: "Star Route 1 Box 2" also
        // contains "Route 1 Box 2", so the generic rural pattern goes last and
        // only the first matching designator is recorded.
        // Each entry: output key stem, address_type label, pattern.
        // Note: a designator without a box number (e.g. "Rte 1") is not recognised.
        $routes=array(
            array('starrt','Star Route','/(\*\s+Rte|\*\s+Route|Star\s+Route|Star\s+Rte)\s+([0-9]+)\s+(Box|Bx)\s+([0-9]+)/i'),
            //Account for HC nomenclature -- for drawmack
            array('hc','Highway County Route','/(HC|Highway County|Hwy Cty|Hwy County)\s+([0-9]+)\s+(Box|Bx)\s+([0-9]+)/i'),
            array('route','Rural Route','/(Rrte|RR|Rural Route|Rt|Rte|Route)\s+([0-9]+)\s+(Box|Bx)\s+([0-9]+)/i')
        );
        foreach($routes as $route){
            if(preg_match($route[2],$address,$m)){
                $b['raw_'.$route[0]]=$m[0];
                $b[$route[0].'_number']=$m[2];
                $b[$route[0].'_box_number']=$m[4];
                $b['address_type']=$route[1];
                // remove exactly the occurrence that was just matched
                $address=(string)preg_replace($route[2],'',$address,1);
                break;
            }
        }
    }

    //what remains is the prefix direction and the street name
    $address=trim($address);
    if(preg_match('/^('.$dirPattern.')\s+/i',$address,$m)){
        $b['prefix_direction']=$dir[strtolower($m[1])];
        // Kept from the original: the raw text is recorded only when the match,
        // including its trailing whitespace, is longer than two characters.
        if(strlen($m[0])>2){
            $b['raw_prefix_direction']=$m[0];
        }
        $address=(string)substr($address,strlen($m[0]));
    }
    //presume all else is the name
    $b['name']=trim($address);
    // do not overwrite a PO Box / route classification made above
    if(!isset($b['address_type'])){
        $b['address_type']="Presumed Standard";
    }

    //present the array visibly in a logical order -- not required for operation but nice
    $order=array(
        'type_definite',
        'address_type',
        'raw_po_box',
        'po_box',
        'raw_route',
        'route_number',
        'route_box_number',
        'raw_hc',
        'hc_number',
        'hc_box_number',
        'raw_starrt',
        'starrt_number',
        'starrt_box_number',
        'number',
        'fraction',
        'prefix_direction',
        'raw_prefix_direction',
        'name',
        'type',
        'raw_type',
        'suffix_direction',
        'raw_suffix_direction',
        'unit',
        'raw_unit',
        'raw_address'
    );
    $c=array();
    foreach($order as $key){
        if(isset($b[$key])){
            $c[$key]=$b[$key];
        }
    }
    return $c;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment