Created
December 3, 2013 19:00
-
-
Save GRMule/7775384 to your computer and use it in GitHub Desktop.
A partially working effort to parse and normalize address data
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Parse a single-line street address into normalized components.
 *
 * Periods and commas are removed, then pieces are peeled off the string in
 * this order: unit (from the end), suffix direction, street type, house
 * number and fraction (from the start), PO Box / route designations (only
 * when there is no leading house number), prefix direction. Whatever is
 * left is reported as the street name.
 *
 * Keys in the result are present only when the component was found:
 *   type_definite (bool), address_type, raw_po_box, po_box, raw_route,
 *   route_number, route_box_number, raw_hc, hc_number, hc_box_number,
 *   raw_starrt, starrt_number, starrt_box_number, number, fraction,
 *   prefix_direction, raw_prefix_direction, name, type, raw_type,
 *   suffix_direction, raw_suffix_direction, unit, raw_unit, raw_address.
 * The raw_* values hold the matched text exactly as matched, including any
 * whitespace the pattern consumed; the other values are normalized.
 *
 * @param string $address Free-form, single-line address.
 * @return array Associative array of the components listed above.
 */
function parse_address($address){
    // Both lookup tables are keyed by lower-case spelling so that anything
    // the case-insensitive patterns below accept can be resolved.
    $dir=array(
        'n'=>'N','s'=>'S','e'=>'E','w'=>'W','nw'=>'NW','sw'=>'SW','ne'=>'NE','se'=>'SE',
        'north'=>'N','south'=>'S','east'=>'E','west'=>'W',
        'northwest'=>'NW','southwest'=>'SW','northeast'=>'NE','southeast'=>'SE'
    );
    $type=array(
        'ave'=>'Ave','blvd'=>'Blvd','st'=>'St','wy'=>'Wy','cir'=>'Cir','dr'=>'Dr','ln'=>'Ln','pl'=>'Pl','rd'=>'Rd',
        'bvd'=>'Blvd',
        'avenue'=>'Ave','boulevard'=>'Blvd','street'=>'St','way'=>'Wy','circle'=>'Cir',
        'drive'=>'Dr','lane'=>'Ln','place'=>'Pl','road'=>'Rd'
    );
    $dirPattern='North|South|East|West|Northeast|Southeast|Southwest|Northwest|N|S|E|W|NE|SE|SW|NW';

    $b=array();

    // Strip punctuation before trimming so a dangling "." or "," cannot leave
    // stray whitespace that would defeat the end-anchored patterns.
    $address=trim(str_replace(array('.',','),'',(string)$address));
    $b['raw_address']=$address;

    // Unit / apartment at the end. A bare trailing number is not enough: a
    // descriptor word or '#' is required.
    $unitLabel='(\s+(Apt|Apartment|Suite|Ste|Unit|Bldg|Building|Room|Rm|#)\s*)+#?';
    if(preg_match('/'.$unitLabel.'[-a-z0-9]+$/i',$address,$m)){
        $b['raw_unit']=$m[0];
        $b['unit']=preg_replace('/'.$unitLabel.'/i','',$m[0]);
        $address=(string)substr($address,0,strlen($address)-strlen($m[0]));
    }

    // Suffix direction, e.g. the "SW" in "Main St SW".
    if(preg_match('/\s+('.$dirPattern.')$/i',$address,$m)){
        $b['raw_suffix_direction']=$m[0];
        $b['suffix_direction']=$dir[strtolower($m[1])];
        $address=(string)substr($address,0,strlen($address)-strlen($m[0]));
    }

    // Street type.
    if(preg_match('/\s+(St|Bvd|Ave|Wy|Cir|Dr|Ln|Pl|Rd|Boulevard|Blvd|Street|Avenue|Way|Circle|Drive|Lane|Place|Road)$/i',
        $address,$m)){
        $b['raw_type']=$m[0];
        $typeKey=strtolower($m[1]);
        // Only abbreviations of three letters or fewer count as "definite";
        // "way" and the misspelling "bvd" are explicitly excluded.
        $b['type_definite']=!(strlen($typeKey)>3 || $typeKey=='way' || $typeKey=='bvd');
        $b['type']=isset($type[$typeKey]) ? $type[$typeKey] : trim($m[0]);
        $address=(string)substr($address,0,strlen($address)-strlen($m[0]));
    }

    // Leading house number with an optional fraction ("123 1/2").
    if(preg_match('/^([0-9]+)(?:\s+([0-9]+\/[0-9]+))?/',$address,$m)){
        $b['number']=$m[1];
        if(isset($m[2]) && $m[2]!==''){
            $b['fraction']=$m[2];
        }
        $address=(string)substr($address,strlen($m[0]));
    }else{
        // No house number: look for PO Boxes and the various route styles.
        // Each designation that is found is removed from $address so it does
        // not leak into the street name below.
        if(preg_match('/^(POB\s+|P\s*O\s*Box|Post Office Box|Postal Box|Box|Boite Postal)\s*([0-9a-z]+(?:-[0-9a-z]+)*)/i',
            $address,$m)){
            $b['raw_po_box']=$m[0];
            $b['po_box']=strtoupper($m[2]);
            $b['address_type']="Post Office Box";
            $address=(string)substr($address,strlen($m[0]));
        }

        // Star Route is tested before the generic Route pattern because
        // "Star Route 1 Box 2" would otherwise also match as "Route 1 Box 2".
        // Each entry: pattern, key stem, address_type label.
        $routes=array(
            array('/(\*\s+Rte|\*\s+Route|Star\s+Route|Star\s+Rte)\s+([0-9]+)\s+(Box|Bx)\s+([0-9]+)/i',
                'starrt','Star Route'),
            array('/(HC|Highway County|Hwy Cty|Hwy County)\s+([0-9]+)\s+(Box|Bx)\s+([0-9]+)/i',
                'hc','Highway County Route'),
            array('/(Rrte|RR|Rural Route|Rt|Rte|Route)\s+([0-9]+)\s+(Box|Bx)\s+([0-9]+)/i',
                'route','Rural Route')
        );
        foreach($routes as $route){
            if(preg_match($route[0],$address,$m)){
                $b['raw_'.$route[1]]=$m[0];
                $b[$route[1].'_number']=$m[2];
                $b[$route[1].'_box_number']=$m[4];
                $b['address_type']=$route[2];
                // Replace with a space so neighbouring words are not glued together.
                $address=(string)preg_replace($route[0],' ',$address,1);
            }
        }
        // Note: partially conforming designations such as "Rte 1" with no box
        // number are not recognised here.
    }

    // What remains is an optional prefix direction followed by the street name.
    $address=trim($address);
    if(preg_match('/^('.$dirPattern.')\s+/i',$address,$m)){
        $b['prefix_direction']=$dir[strtolower($m[1])];
        // The raw text is kept only when the match (including its trailing
        // whitespace) is longer than two bytes, i.e. not for a single letter
        // followed by one space.
        if(strlen($m[0])>2){
            $b['raw_prefix_direction']=$m[0];
        }
        $address=(string)substr($address,strlen($m[0]));
    }

    $name=trim($address);
    if(!isset($b['address_type'])){
        // Nothing irregular was recognised: presume a standard street address.
        $b['name']=$name;
        $b['address_type']="Presumed Standard";
    }elseif($name!==''){
        // Text left over after a PO Box / route was pulled; surface it so the
        // caller can decide whether something was missed.
        $b['name']=$name;
    }

    // Return the components in a stable, readable order.
    $order=array(
        'type_definite',
        'address_type',
        'raw_po_box',
        'po_box',
        'raw_route',
        'route_number',
        'route_box_number',
        'raw_hc',
        'hc_number',
        'hc_box_number',
        'raw_starrt',
        'starrt_number',
        'starrt_box_number',
        'number',
        'fraction',
        'prefix_direction',
        'raw_prefix_direction',
        'name',
        'type',
        'raw_type',
        'suffix_direction',
        'raw_suffix_direction',
        'unit',
        'raw_unit',
        'raw_address'
    );
    $c=array();
    foreach($order as $key){
        if(isset($b[$key])){
            $c[$key]=$b[$key];
        }
    }
    return $c;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment