Skip to content

Instantly share code, notes, and snippets.

@yatszhash
Created October 18, 2017 02:42
Show Gist options
  • Save yatszhash/a50df1d8a392b33f61848f98dae64db5 to your computer and use it in GitHub Desktop.
Save yatszhash/a50df1d8a392b33f61848f98dae64db5 to your computer and use it in GitHub Desktop.
Check the type (e.g., phone number) of the words in each list, and check whether the proportion of matching words is at least a given threshold.
import re
import numpy as np
class BaseMatcher(object):
    """Decide whether lists of words mostly match a regular expression.

    Args:
        pattern: Regular expression source; compiled once at construction.
        threshold: Minimum proportion of matching words for a word list to
            be reported as a match (compared with ``>=``).
    """

    def __init__(self, pattern, threshold):
        self.pattern = pattern
        self.compiled_pattern = re.compile(pattern=self.pattern)
        self.threshold = threshold

    def match(self, X):
        """Return one boolean per word list in ``X``.

        Each boolean tells whether the proportion of matching words in that
        list is greater than or equal to ``self.threshold``.
        """
        return [
            self.get_match_proportion(word_list) >= self.threshold
            for word_list in X
        ]

    # TODO: is fullmatch valid?
    # TODO: strip should be transferred to a preprocessor
    def pattern_match(self, word_list):
        """Return a list of booleans, one per usable word in ``word_list``.

        ``None`` and empty-string entries are skipped entirely (they do not
        appear in the result). Every other word is stripped of surrounding
        whitespace and must match the compiled pattern in full.
        """
        return [
            self.compiled_pattern.fullmatch(x.strip()) is not None
            for x in word_list
            if x is not None and x != ""
        ]

    def get_match_proportion(self, word_list):
        """Return the fraction of usable words that match the pattern.

        Returns 0.0 when there is no usable word at all; taking the mean of
        an empty list would otherwise yield ``nan`` plus a RuntimeWarning.
        """
        flags = self.pattern_match(word_list)
        if not flags:
            return 0.0
        return np.mean(flags)
class MailaddressMatcher(BaseMatcher):
    """Matcher for word lists that look like e-mail addresses.

    The pattern requires one or more allowed characters, an ``@``, and one
    or more allowed characters again.
    """

    pattern = r"[a-zA-Za-zA-Z0-90-9*!#$%&’*+/=?^`{\|\-_.}]+?@[a-zA-Za-zA-Z0-90-9*!#$%&’*+/=?^`{\|\-_.}]+?"

    def __init__(self, threshold=0.8):
        """Build the matcher with the class pattern and the given threshold."""
        super().__init__(MailaddressMatcher.pattern, threshold)
# if __name__ == "__main__":
# import pandas as pd
# matcher = MailaddressMatcher()
# X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv")
# X = [X_df[value].astype(str).tolist() for value in X_df.columns.values]
# #X = [["______@gamil.com"]]
# #print(list(map(matcher.compiled_pattern.search, X[0])))
# result = matcher.match(X)
# print(result)
class UrlMatcher(BaseMatcher):
    """Matcher for word lists that look like ``www.`` URLs.

    The pattern accepts an optional ``http://`` or ``https://`` scheme,
    followed by a literal ``www.`` and one or more allowed characters.
    """

    # The dot after "www" is escaped so that only a literal "." is accepted;
    # an unescaped "." would let any character through (e.g. "wwwXgoogle").
    pattern = r"(https?://)?www\.[a-zA-Za-zA-Z0-90-9!#\$%&’\*\+/=\?^`\|\-_\.]+"

    def __init__(self, threshold=0.8):
        """Build the matcher with the class pattern and the given threshold."""
        super().__init__(pattern=UrlMatcher.pattern, threshold=threshold)
if __name__ == "__main__":
    # Self-contained demo: one word list of URL-like strings, the first with
    # leading whitespace (including an ideographic space, U+3000).
    # The earlier CSV read was dropped: its result was overwritten right away,
    # so it only added a pandas dependency and a crash when the file is absent.
    matcher = UrlMatcher()
    X = [["\u3000 https://www.google.co.jp/", "http://www.kantei.go.jp/", "www.google.com"]]
    result = matcher.match(X)
    print(result)
class PostalcodeMatcher(BaseMatcher):
    """Matcher for word lists that look like postal codes.

    The pattern accepts an optional postal mark, three digits, an optional
    separator, and four digits.
    """

    pattern = r"[〒]{0,1}[0-90-9]{3}[-ー]{0,1}[0-90-9]{4}"

    def __init__(self, threshold=0.8):
        """Build the matcher with the class pattern and the given threshold."""
        super().__init__(PostalcodeMatcher.pattern, threshold)
# if __name__ == "__main__":
# import pandas as pd
# matcher = PostalcodeMatcher()
# X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv")
# X = [X_df[value].astype(str).tolist() for value in X_df.columns.values]
# #X = [["\u3000 654-3515 ", " 567-9999 ", "〒1234567", "080ー9561", "gdsajgl;daj"]]
# #print(list(map(matcher.compiled_pattern.search, X[0])))
# result = matcher.match(X)
# print(result)
class DataMatcher(BaseMatcher):
    """Matcher for word lists that look like dates (year, month, day).

    The effective pattern is ``year_pattern + mon_pattern + day_pattern``.
    Each part accepts ASCII/Unicode digits or kanji numerals, an optional
    unit character (年 / 月 / 日) and an optional separator (space,
    ideographic space, ``-``, ``_`` or ``/``).

    The remaining class attributes are pattern fragments that are not part
    of ``pattern``; they are kept so existing references keep working.
    """

    # Kanji numeral character sets (not used by ``pattern``).
    year_kansuuji = r"一二三四五六七八九十壱弐参拾百千\u25CB\u25EF\u3007"
    mon_day_kannsuuji = r"一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007"

    # Era / calendar prefixes are written as non-capturing alternations.
    # A character class such as "[昭和|平成]" would match a single character
    # only, so a full era name like "平成" could never be consumed.
    a_year = (
        r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007]?"
        r"[\d一二三四五六七八九壱弐参][十拾]?[\d一二三四五六七八九壱弐参][年]?[ \u3000\-_/]?"
    )
    # Raw string: "\d" and "\-" are regex escapes, not string escapes. The
    # "\uXXXX" sequences are resolved by the ``re`` module itself.
    b_year = (
        r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九壱弐参][十拾\u25CB\u25EF\u3007]?"
        r"[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007][年]?[ \u3000\-_/]?"
    )
    a_mon = (
        r"[0-101一壱]?[十拾\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007][月]?"
        r"[ \u3000\-_/]?"
    )
    chirist_year_pattern = (
        r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,3}[年]?[ \u3000\-_/]?"
    )
    wareki_year_pattern = (
        r"(?:昭和|平成|大正|西暦)?[ \u3000]?"
        r"[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,4}[年]?[ \u3000\-_/]?"
    )

    # Fragments that make up the effective pattern.
    year_pattern = (
        r"(?:昭和|平成|大正|西暦)?[ \u3000]?"
        r"[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,4}[年]?[ \u3000\-_/]?"
    )
    mon_pattern = (
        r"[0-101一壱]?[十\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参拾\u25CB\u25EF\u3007][月]?"
        r"[ \u3000\-_/]?"
    )
    day_pattern = (
        r"[0-3023一二三壱弐参]?[十\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参拾\u25CB\u25EF\u3007][日]?"
        r"[ \u3000\-_/]?"
    )
    pattern = year_pattern + mon_pattern + day_pattern

    def __init__(self, threshold=0.8):
        """Build the matcher with the class pattern and the given threshold."""
        super().__init__(pattern=DataMatcher.pattern, threshold=threshold)
# if __name__ == "__main__":
# import pandas as pd
# matcher = DataMatcher()
# #X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv")
# #X = [X_df[value].astype(str).tolist() for value in X_df.columns.values]
# X = [["2016年09月26日", "1600-8-10", "平成 28年 4月 8日", "昭和三十二年五月壹〇日", "353", "20160"], ["212-0000"],
# ["20160809"]]
# #X = [["2016年", "1600", "平成 28年", "昭和三十二年", "2016"], ["212-0000"],
# # ["2016"]]
# #print(list(map(matcher.compiled_pattern.search, X[0])))
# result = matcher.match(X)
# print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment