Skip to content

Instantly share code, notes, and snippets.

@RainJayTsai
Last active November 29, 2023 02:15
Show Gist options
  • Save RainJayTsai/9552dd413a4c71c62e6b61233485a3b0 to your computer and use it in GitHub Desktop.
Save RainJayTsai/9552dd413a4c71c62e6b61233485a3b0 to your computer and use it in GitHub Desktop.
Regex for parsing Taiwanese (TW) addresses
# Use the ``re2`` bindings when they are installed; otherwise fall back to the
# standard-library ``re`` module under the same name.  Only ImportError is
# caught so that KeyboardInterrupt/SystemExit and genuine bugs still surface.
# NOTE(review): ``pattern`` below contains a lookahead, ``(?!路)``; RE2 itself
# has no lookaround support, so presumably the installed ``re2`` wrapper falls
# back to ``re`` for such patterns -- confirm against the wrapper in use.
try:
    import re2 as re
except ImportError:
    import re

# Address pattern, one named group per line.  Raw strings keep ``\d``, ``\s``
# and ``\D`` from being read as (invalid) Python string escapes.  Every group
# except ``city`` is optional.
pattern = (
    r'(?P<code>\d{5}|\d{3})?\s*'                          # 5- or 3-digit code
    r'(?P<country>[台臺]灣)?'                              # 台灣 / 臺灣
    r'(?P<city>\D+?[縣市])'                                # required, ends in 縣 or 市
    r'(?P<district>\D+?(市區|鎮區|鎮市|[鄉鎮市區]))?'      # ends in 鄉/鎮/市/區
    r'(?P<village>\D+?[村里](?!路))?'                      # ends in 村/里, not before 路
    r'(?P<neighbor>\d+[鄰])?'                              # digits + 鄰
    r'(?P<road>\D+?(村路|[路街道段]))?'                    # ends in 路/街/道/段 (or 村路)
    r'(?P<section>(\d|\D)?段)?'                            # at most one character + 段
    r'(?P<lane>\d+巷)?'                                    # digits + 巷
    r'(?P<alley>\d+弄)?'                                   # digits + 弄
    r'(?P<no>\d+號?)?'                                     # digits, 號 optional
    r'(?P<seq>[-之]\d+?(號))?'                             # - or 之, digits, 號
    r'(?P<floor>(((\D|\d)+樓)|((B|地下室?)\d+)))?'         # "...樓", or B / 地下(室) + digits
    r'(?P<other>.+)?'                                      # whatever is left over
)
addr_regex = re.compile(pattern)

# Chinese numerals for 1..10, in ascending order.
cn_num = '一二三四五六七八九十'
# Chinese numeral -> decimal string, e.g. '三' -> '3', '十' -> '10'.
cn_mapping = {numeral: str(value) for value, numeral in enumerate(cn_num, start=1)}
# Decimal string -> Chinese numeral, e.g. '3' -> '三', '10' -> '十'.
an_mapping = {digits: numeral for numeral, digits in cn_mapping.items()}
def _road_numeral_to_arabic(match):
    """Return the Arabic spelling of one Chinese-numeral road match.

    ``match`` carries three groups: an optional tens digit (一..九), an
    optional literal 十, and the final numeral (一..十); the matched text ends
    with 路.
    """
    # Unmatched optional groups come back empty/None; treat both as "absent".
    tens = match.group(1) or ''
    has_ten = bool(match.group(2))
    unit = match.group(3)
    # ``% 10`` turns a trailing 十 ('10') into 0 so that 二十 evaluates to 20.
    value = int(cn_mapping.get(tens, 0)) * 10 + int(cn_mapping[unit]) % 10
    if value < 10 and has_ten:
        # 十X with no leading tens digit, e.g. 十三 -> 13.
        value += 10
    if not tens and not has_ten and unit == '十':
        # A bare 十.
        value = 10
    return str(value) + '路'


def _single_digit_road_to_numeral(match):
    """Spell a one-digit road number with its Chinese numeral (3路 -> 三路)."""
    token = match.group()
    if len(token) != 2:
        # Two or more digits: keep the Arabic form.
        return token
    digit = token[0]
    # '0' and non-ASCII digits have no entry in ``an_mapping``; leave them
    # untouched instead of raising KeyError.
    return an_mapping.get(digit, digit) + token[1:]


def norm_address(address):
    """Normalise a raw address string and return the cleaned-up copy.

    Steps, in order:

    1. Literal clean-ups: drop zero-width spaces, unify look-alike characters
       (臺 -> 台, 巿 -> 市, ― -> -, , -> 、), rewrite ㄧ樓 as 1樓 and 桃園縣 as
       桃園市.
    2. Any of several dash-like separators (including 一, 之 and _) between
       two digit runs becomes ``-``.
    3. Digits followed by ``F``/``f`` become digits followed by 樓.
    4. Road names written with Chinese numerals are converted to Arabic
       digits (二十三路 -> 23路); a numeral directly preceded by 雙/双 is left
       alone.
    5. Road numbers consisting of a single digit are spelled with the Chinese
       numeral (3路 -> 三路).
    6. Surrounding whitespace is stripped.

    Steps 4 and 5 substitute at the matched positions only, so a short road
    name can no longer rewrite part of a longer one (``3路`` inside ``13路``).

    :param address: raw address text.
    :return: the normalised address.
    """
    literal_fixes = (
        ('\u200b', ''),
        ('―', '-'),
        ('臺', '台'),
        ('巿', '市'),
        (',', '、'),
        ('ㄧ樓', '1樓'),
        ('桃園縣', '桃園市'),
    )
    for old, new in literal_fixes:
        address = address.replace(old, new)

    address = re.sub(r'(\d+)[▔▁——─――一之_](\d+)', r'\1-\2', address)
    address = re.sub(r'(\d+)\s*[Ff]', r'\1樓', address)

    road_numeral = f'([{cn_num[:-1]}])?(十)?(?<![雙双])([{cn_num}])路'
    address = re.sub(road_numeral, _road_numeral_to_arabic, address)
    address = re.sub(r'\d+路', _single_digit_road_to_numeral, address)

    return address.strip()
# Keys of the dictionary returned by ``parse_address``; they mirror the named
# groups of the address pattern.
_ADDRESS_FIELDS = (
    'code', 'country', 'city', 'district', 'village', 'neighbor', 'road',
    'section', 'lane', 'alley', 'no', 'seq', 'floor', 'other',
)


def _cn_numerals_to_arabic(text):
    """Replace every run of Chinese numerals (一..十) in ``text`` with digits.

    A run of the form [X]十[Y], where X and Y are each at most one numeral from
    一..九, is evaluated as tens plus units (十二 -> 12, 二十 -> 20,
    二十三 -> 23).  Any other run is mapped character by character
    (一二 -> 12).
    """
    def convert(match):
        run = match.group()
        tens, ten, units = run.partition('十')
        if ten and len(tens) <= 1 and len(units) <= 1 and units != '十':
            # A missing tens digit means 1 (十二); a missing unit means 0 (二十).
            value = int(cn_mapping.get(tens, 1)) * 10 + int(cn_mapping.get(units, 0))
            return str(value)
        return ''.join(cn_mapping[char] for char in run)

    return re.sub('[一二三四五六七八九十]+', convert, text)


def parse_address(address):
    """Split an address into its components.

    The address is matched from its first character against ``addr_regex``.
    Every non-empty component is stripped of surrounding whitespace, and
    Chinese numerals in ``section``, ``no``, ``seq`` and ``floor`` are
    rewritten as Arabic digits (十二樓 -> 12樓).

    :param address: address text to parse.
    :return: dict with the keys ``code``, ``country``, ``city``, ``district``,
        ``village``, ``neighbor``, ``road``, ``section``, ``lane``, ``alley``,
        ``no``, ``seq``, ``floor`` and ``other``.  Components that are absent
        are ``None``; if the address does not match at all, every value is
        ``None``.
    """
    match = addr_regex.match(address)
    if match is None:
        parsed = dict.fromkeys(_ADDRESS_FIELDS)
    else:
        parsed = match.groupdict()

    for key, value in parsed.items():
        if value:
            parsed[key] = value.strip()

    for key in ('section', 'no', 'seq', 'floor'):
        if parsed[key]:
            parsed[key] = _cn_numerals_to_arabic(parsed[key])
    return parsed
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment