Last active
November 29, 2023 02:15
-
-
Save RainJayTsai/9552dd413a4c71c62e6b61233485a3b0 to your computer and use it in GitHub Desktop.
Regex for parsing Taiwanese (TW) addresses
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Use the ``re2`` binding under the name ``re`` when it can be imported;
# otherwise fall back to the standard-library ``re`` module.
# NOTE(review): the patterns in this file use lookahead/lookbehind
# assertions -- confirm the installed ``re2`` binding accepts them.
try:
    import re2 as re
except ImportError:
    import re
# Regular expression for a Taiwanese postal address, written as one raw-string
# fragment per named group (raw strings keep ``\d``/``\s``/``\D`` from being
# parsed as invalid string escapes).  Only ``city`` is mandatory; every other
# named group is optional and is ``None`` in ``groupdict()`` when absent.
pattern = (
    r'(?P<code>\d{5}|\d{3})?'                        # 5- or 3-digit code
    r'\s*'
    r'(?P<country>[台臺]灣)?'
    r'(?P<city>\D+?[縣市])'
    r'(?P<district>\D+?(市區|鎮區|鎮市|[鄉鎮市區]))?'
    r'(?P<village>\D+?[村里](?!路))?'                 # 村/里 not followed by 路
    r'(?P<neighbor>\d+[鄰])?'
    r'(?P<road>\D+?(村路|[路街道段]))?'
    r'(?P<section>(\d|\D)?段)?'
    r'(?P<lane>\d+巷)?'
    r'(?P<alley>\d+弄)?'
    r'(?P<no>\d+號?)?'
    r'(?P<seq>[-之]\d+?(號))?'
    r'(?P<floor>(((\D|\d)+樓)|((B|地下室?)\d+)))?'
    r'(?P<other>.+)?'                                # whatever is left over
)
addr_regex = re.compile(pattern)

# Chinese numerals for 1..10, in ascending order.
cn_num = '一二三四五六七八九十'
# Chinese numeral -> decimal string ('一' -> '1', ..., '十' -> '10').
cn_mapping = {
    numeral: str(value) for value, numeral in enumerate(cn_num, start=1)
}
# Decimal string -> Chinese numeral ('1' -> '一', ..., '10' -> '十').
an_mapping = {digits: numeral for numeral, digits in cn_mapping.items()}
# Literal substitutions applied, in order, before any regex rewriting.
_LITERAL_FIXES = (
    ('\u200b', ''),
    ('―', '-'),
    ('臺', '台'),
    ('巿', '市'),
    (',', '、'),
    ('ㄧ樓', '1樓'),
    ('桃園縣', '桃園市'),
)


def _cn_road_number(match):
    """Render one Chinese-numbered road match as ``<arabic number>路``."""
    # Groups are (tens digit, literal 十, final numeral); unmatched -> ''.
    tens, ten_mark, unit = (part or '' for part in match.groups())
    value = int(cn_mapping.get(tens, 0)) * 10 + int(cn_mapping[unit]) % 10
    if value < 10 and ten_mark:
        # "十X" with no leading tens digit means 10 + X.
        value += 10
    if not tens and not ten_mark and unit == '十':
        # A lone 十 is 10 ('10' % 10 would otherwise give 0).
        value = 10
    return f'{value}路'


def _single_digit_road(match):
    """Turn a one-digit road number back into its Chinese numeral."""
    digits = match.group(1)
    if len(digits) != 1:
        return match.group()
    # Digits without a Chinese counterpart (e.g. '0') are left untouched
    # instead of raising KeyError.
    return an_mapping.get(digits, digits) + '路'


def norm_address(address: str) -> str:
    """Normalize an address string before parsing.

    Steps, in order:

    1. Apply the literal substitutions in ``_LITERAL_FIXES``.
    2. Rewrite ``<digits><separator><digits>`` to ``<digits>-<digits>`` for
       the separator characters listed in the pattern below.
    3. Rewrite ``<digits>F`` / ``<digits>f`` (optional whitespace in
       between) to ``<digits>樓``.
    4. Convert Chinese-numbered roads (``十二路``) to Arabic (``12路``),
       unless the final numeral directly follows 雙 or 双.
    5. Convert single-digit Arabic road numbers (``5路``) to the Chinese
       numeral (``五路``); multi-digit road numbers stay Arabic.
    6. Strip surrounding whitespace.

    Steps 4 and 5 substitute at the matched position only, so a short road
    number can no longer rewrite part of a longer one (``5路`` inside
    ``15路``).

    Args:
        address: Raw address text.

    Returns:
        The normalized address.
    """
    for old, new in _LITERAL_FIXES:
        address = address.replace(old, new)

    address = re.sub(r'(\d+)[▔▁——─――一之_](\d+)', r'\1-\2', address)
    address = re.sub(r'(\d+)\s*[Ff]', r'\1樓', address)

    # Built here (not at import time) because it depends on ``cn_num``.
    cn_road = f'([{cn_num[:-1]}])?(十)?(?<![雙双])([{cn_num}])路'
    address = re.sub(cn_road, _cn_road_number, address)
    address = re.sub(r'(\d+)路', _single_digit_road, address)

    return address.strip()
# Keys of the dictionary returned by ``parse_address``; they mirror the named
# groups of the address pattern.
_ADDRESS_FIELDS = (
    'code', 'country', 'city', 'district', 'village', 'neighbor', 'road',
    'section', 'lane', 'alley', 'no', 'seq', 'floor', 'other',
)
# Fields whose Chinese numerals are rewritten as Arabic digits.
_NUMERIC_FIELDS = ('section', 'no', 'seq', 'floor')


def _cn_run_to_arabic(match):
    """Convert one run of Chinese numerals to Arabic digits.

    A run with a single 十 and at most one numeral on each side is read as
    tens and units (``十`` -> 10, ``十二`` -> 12, ``二十三`` -> 23).  Any
    other run is mapped numeral by numeral (``一二`` -> ``12``).
    """
    run = match.group()
    tens, ten_mark, units = run.partition('十')
    if ten_mark and len(tens) <= 1 and len(units) <= 1 and units != '十':
        # A missing tens digit means one ten; a missing unit means zero.
        value = (int(cn_mapping.get(tens, 1)) * 10
                 + int(cn_mapping.get(units, 0)))
        return str(value)
    return ''.join(cn_mapping[numeral] for numeral in run)


def parse_address(address: str) -> dict:
    """Split an address into its named components.

    Args:
        address: Address text, matched from its start against
            ``addr_regex``.

    Returns:
        A dict with the keys in ``_ADDRESS_FIELDS``.  Components that are
        absent are ``None``; when the address does not match at all, every
        value is ``None``.  Matched values are whitespace-stripped, and
        Chinese numerals in ``section``, ``no``, ``seq`` and ``floor`` are
        converted to Arabic digits (``十二樓`` -> ``12樓``).
    """
    match = addr_regex.match(address)
    if match is None:
        parsed = dict.fromkeys(_ADDRESS_FIELDS)
    else:
        parsed = match.groupdict()

    parsed = {
        field: text.strip() if text else text
        for field, text in parsed.items()
    }

    for field in _NUMERIC_FIELDS:
        if parsed[field]:
            parsed[field] = re.sub('[一二三四五六七八九十]+',
                                   _cn_run_to_arabic,
                                   parsed[field])
    return parsed
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment