Created
October 18, 2017 02:42
-
-
Save yatszhash/a50df1d8a392b33f61848f98dae64db5 to your computer and use it in GitHub Desktop.
Check the type (e.g. phone number) of the words in a list: a list is accepted when the proportion of words matching the type's pattern is at least the given threshold.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import re | |
import numpy as np | |
class BaseMatcher(object):
    """Decide whether lists of words mostly match a regular expression.

    A word list is accepted when the proportion of its usable words that
    fully match ``pattern`` is at least ``threshold``.

    Args:
        pattern: Regular expression source; compiled once at construction.
        threshold: Minimum matching proportion for a word list to be accepted.
    """

    def __init__(self, pattern, threshold):
        self.pattern = pattern
        self.compiled_pattern = re.compile(pattern=self.pattern)
        self.threshold = threshold

    def match(self, X):
        """Return one boolean per word list in ``X``.

        Args:
            X: Iterable of word lists.

        Returns:
            A list with, for each word list, whether its match proportion is
            greater than or equal to ``self.threshold``.
        """
        return [
            self.get_match_proportion(word_list) >= self.threshold
            for word_list in X
        ]

    # TODO: is fullmatch valid?
    # TODO: strip should be transferred to preprocessor
    def pattern_match(self, word_list):
        """Return a match flag for every usable word in ``word_list``.

        ``None`` and empty strings are skipped, so the result may be shorter
        than the input. Each remaining word is stripped of surrounding
        whitespace and must match the whole pattern (``fullmatch``).
        Entries are assumed to be ``str`` (``strip`` is called on them).
        """
        return [
            self.compiled_pattern.fullmatch(word.strip()) is not None
            for word in word_list
            if word is not None and word != ""
        ]

    def get_match_proportion(self, word_list):
        """Return the fraction of usable words in ``word_list`` that match.

        Returns ``0.0`` when there is no usable word (empty list, or only
        ``None``/empty strings). Taking ``np.mean`` of an empty list would
        instead yield ``nan`` and emit a RuntimeWarning.
        """
        flags = self.pattern_match(word_list)
        if not flags:
            return 0.0
        return np.mean(flags)
class MailaddressMatcher(BaseMatcher):
    """Matcher for e-mail-address-like words.

    The expression requires one or more allowed characters, an ``@``, and
    one or more allowed characters again.
    """

    pattern = r"[a-zA-Za-zA-Z0-90-9*!#$%&’*+/=?^`{\|\-_.}]+?@[a-zA-Za-zA-Z0-90-9*!#$%&’*+/=?^`{\|\-_.}]+?"

    def __init__(self, threshold=0.8):
        """Build the matcher; ``threshold`` is the minimum matching proportion."""
        super().__init__(MailaddressMatcher.pattern, threshold)
# if __name__ == "__main__": | |
# import pandas as pd | |
# matcher = MailaddressMatcher() | |
# X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv") | |
# X = [X_df[value].astype(str).tolist() for value in X_df.columns.values] | |
# #X = [["______@gamil.com"]] | |
# #print(list(map(matcher.compiled_pattern.search, X[0]))) | |
# result = matcher.match(X) | |
# print(result) | |
class UrlMatcher(BaseMatcher):
    """Matcher for URL-like words that start with ``www.``.

    An optional ``http://`` or ``https://`` scheme may precede the ``www.``
    prefix; at least one allowed character must follow it.
    """

    # The dot after "www" is escaped so that it matches a literal "." only;
    # an unescaped "." matches any character (e.g. "wwwXexample").
    pattern = r"(https?://)?www\.[a-zA-Za-zA-Z0-90-9!#\$%&’\*\+/=\?^`\|\-_\.]+"
    # Disabled pattern fragments, kept as in the original:
    # r"/.[a-zA-Za-zA-Z0-90-9*!#$%&’*+/=?^`" \
    # r"{\|\-_.}]+?/.[a-zA-Za-zA-Z0-90-9*!#$%&’*+/=?^`{\|\-_.}]+?"

    def __init__(self, threshold=0.8):
        """Build the matcher; ``threshold`` is the minimum matching proportion."""
        super().__init__(pattern=UrlMatcher.pattern, threshold=threshold)
if __name__ == "__main__":
    # Small manual demo of UrlMatcher on a hard-coded sample.
    # The CSV-derived input that used to be built here was overwritten by the
    # sample below before use, so the unused pandas load (which fails when
    # ../datasets/demo-referred_V0.1.0.csv is absent) has been removed.
    matcher = UrlMatcher()
    X = [["\u3000 https://www.google.co.jp/", "http://www.kantei.go.jp/", "www.google.com"]]
    #"〒1234567", "080ー9561", "gdsajgl;daj"]]
    #print(list(map(matcher.compiled_pattern.search, X[0])))
    result = matcher.match(X)
    print(result)
class PostalcodeMatcher(BaseMatcher):
    """Matcher for postal-code-like words.

    The expression accepts an optional ``〒`` mark, three digits, an optional
    ``-`` or ``ー`` separator, and four digits.
    """

    pattern = r"[〒]{0,1}[0-90-9]{3}[-ー]{0,1}[0-90-9]{4}"

    def __init__(self, threshold=0.8):
        """Build the matcher; ``threshold`` is the minimum matching proportion."""
        super().__init__(PostalcodeMatcher.pattern, threshold)
# if __name__ == "__main__": | |
# import pandas as pd | |
# matcher = PostalcodeMatcher() | |
# X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv") | |
# X = [X_df[value].astype(str).tolist() for value in X_df.columns.values] | |
# #X = [["\u3000 654-3515 ", " 567-9999 ", "〒1234567", "080ー9561", "gdsajgl;daj"]] | |
# #print(list(map(matcher.compiled_pattern.search, X[0]))) | |
# result = matcher.match(X) | |
# print(result) | |
class DataMatcher(BaseMatcher):
    """Matcher for date-like words written with Arabic or kanji numerals.

    The expression actually used is ``pattern``, the concatenation of
    ``year_pattern``, ``mon_pattern`` and ``day_pattern``. Each part is a
    short run of numeral characters followed by an optional unit kanji
    (年 / 月 / 日) and an optional separator (space, ideographic space,
    ``-``, ``_`` or ``/``). The other class attributes are not referenced by
    ``pattern``; they are kept so existing references keep working.

    Era / calendar prefixes are written as non-capturing alternations such as
    ``(?:昭和|平成|大正|西暦)``. A character class like ``[昭和|平成]`` matches
    only ONE of the listed characters, so a two-character era name such as
    平成 could never be consumed and e.g. "平成 28年 4月 8日" failed to match.

    \\u25CB, \\u25EF and \\u3007 are circle glyphs accepted as a kanji zero.
    """

    year_kansuuji = r"一二三四五六七八九十壱弐参拾百千\u25CB\u25EF\u3007"
    mon_day_kannsuuji = r"一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007"

    a_year = r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007]?" \
             r"[\d一二三四五六七八九壱弐参][十拾]?[\d一二三四五六七八九壱弐参][年]?[ \u3000\-_/]?"
    # Raw string: the literal contains regex escapes (\d, \-) that are invalid
    # escape sequences in a normal string literal. The \uXXXX escapes are
    # resolved by the re module, so the compiled expression is unchanged.
    b_year = r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九壱弐参][十拾\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007][年]?[ \u3000\-_/]?"
    a_mon = r"[0-101一壱]?[十拾\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007][月]?" \
            r"[ \u3000\-_/]?"

    chirist_year_pattern = r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,3}[年]?[ \u3000\-_/]?"
    wareki_year_pattern = r"(?:昭和|平成|大正|西暦)?[ \u3000]?[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,4}[年]?[ \u3000\-_/]?"

    # Year: optional era/calendar prefix, optional space, 1-4 numeral chars.
    year_pattern = r"(?:昭和|平成|大正|西暦)?[ \u3000]?[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,4}[年]?[ \u3000\-_/]?"
    # Month: optional leading 0/1 (or kanji one), optional ten/zero, one numeral.
    mon_pattern = r"[0-101一壱]?[十\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参拾\u25CB\u25EF\u3007][月]?" \
                  r"[ \u3000\-_/]?"
    # Day: optional leading 0-3 (or kanji 1-3), optional ten/zero, one numeral.
    day_pattern = r"[0-3023一二三壱弐参]?[十\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参拾\u25CB\u25EF\u3007][日]?" \
                  r"[ \u3000\-_/]?"

    pattern = year_pattern + mon_pattern + day_pattern

    def __init__(self, threshold=0.8):
        """Build the matcher; ``threshold`` is the minimum matching proportion."""
        super().__init__(pattern=DataMatcher.pattern, threshold=threshold)
# if __name__ == "__main__": | |
# import pandas as pd | |
# matcher = DataMatcher() | |
# #X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv") | |
# #X = [X_df[value].astype(str).tolist() for value in X_df.columns.values] | |
# X = [["2016年09月26日", "1600-8-10", "平成 28年 4月 8日", "昭和三十二年五月壹〇日", "353", "20160"], ["212-0000"], | |
# ["20160809"]] | |
# #X = [["2016年", "1600", "平成 28年", "昭和三十二年", "2016"], ["212-0000"], | |
# # ["2016"]] | |
# #print(list(map(matcher.compiled_pattern.search, X[0]))) | |
# result = matcher.match(X) | |
# print(result) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment