# yatszhash/CheckValueTypeMatchers.py

## CheckValueTypeMatchers.py
import re
import numpy as np

class BaseMatcher(object):
    """Decide whether columns of strings conform to a regular expression.

    A column (a list of values) is reported as a match when the proportion
    of its usable values that fully match ``pattern`` reaches ``threshold``.
    """

    def __init__(self, pattern, threshold):
        """Compile ``pattern`` and store the acceptance ``threshold``.

        Args:
            pattern: Regular-expression source. Values are tested with
                ``fullmatch``, so the whole (stripped) value must match.
            threshold: Minimum proportion of matching values for a column
                to count as a match in :meth:`match`.
        """
        self.pattern = pattern
        self.compiled_pattern = re.compile(pattern=self.pattern)
        self.threshold = threshold

    def match(self, X):
        """Return one boolean per column in ``X``.

        Args:
            X: Iterable of columns, each column being a list of strings
                (``None`` entries are tolerated).

        Returns:
            A list of plain ``bool`` values, ``True`` where the column's
            match proportion is at least ``self.threshold``.
        """
        return [bool(self.get_match_proportion(word_list) >= self.threshold)
                for word_list in X]

    # TODO: confirm that fullmatch (rather than search/match) is the right test.
    # TODO: stripping should be moved to a preprocessing step.
    def pattern_match(self, word_list):
        """Return a match flag for every usable value in ``word_list``.

        ``None`` and empty-string entries are skipped, so the result may be
        shorter than the input. Remaining values are stripped of surrounding
        whitespace before being tested with ``fullmatch``.
        """
        return [self.compiled_pattern.fullmatch(x.strip()) is not None
                for x in word_list
                if x is not None and x != ""]

    def get_match_proportion(self, word_list):
        """Return the fraction of usable values in ``word_list`` that match.

        Returns ``0.0`` when there are no usable values; ``np.mean`` of an
        empty list would otherwise produce ``nan`` and a RuntimeWarning.
        """
        flags = self.pattern_match(word_list)
        if not flags:
            return 0.0
        return np.mean(flags)

class MailaddressMatcher(BaseMatcher):
    """Matcher for columns whose values look like e-mail addresses.

    The pattern is a run of allowed characters, an ``@``, and another run of
    allowed characters. Half-width and full-width letters and digits are
    accepted alongside a set of punctuation characters.
    """

    # Regex source passed to BaseMatcher, which tests values with fullmatch.
    pattern = r"[a-zA-Zａ-ｚＡ-Ｚ0-9０-９*!#$%&’*+/=?^`{\|\-_.}]+?@[a-zA-Zａ-ｚＡ-Ｚ0-9０-９*!#$%&’*+/=?^`{\|\-_.}]+?"

    def __init__(self, threshold=0.8):
        """Create the matcher.

        Args:
            threshold: Minimum proportion of matching values for a column
                to be classified as e-mail addresses.
        """
        super(MailaddressMatcher, self).__init__(
            MailaddressMatcher.pattern,
            threshold,
        )

# if __name__ == "__main__":
#     import pandas as pd
#     matcher = MailaddressMatcher()
#     X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv")
#     X = [X_df[value].astype(str).tolist() for value in X_df.columns.values]

#     #X = [["______@gamil.com"]]
#     #print(list(map(matcher.compiled_pattern.search, X[0])))

#     result = matcher.match(X)
#     print(result)

class UrlMatcher(BaseMatcher):
    """Matcher for columns whose values look like ``www.`` URLs.

    A value matches when it is an optional ``http://`` or ``https://``
    scheme, the literal prefix ``www.``, and one or more allowed characters
    (half-/full-width letters and digits plus a set of punctuation).
    """

    # The dot after "www" is escaped so that it only matches a literal ".";
    # unescaped it would accept any character (e.g. "wwwXexample.com").
    pattern = r"(https?://)?www\.[a-zA-Zａ-ｚＡ-Ｚ0-9０-９!#\$%&’\*\+/=\?^`\|\-_\.]+"

    def __init__(self, threshold=0.8):
        """Create the matcher.

        Args:
            threshold: Minimum proportion of matching values for a column
                to be classified as URLs.
        """
        super().__init__(pattern=UrlMatcher.pattern, threshold=threshold)

if __name__ == "__main__":
    # Smoke demo for UrlMatcher on a single hand-written column.
    # The previous version also read a CSV through pandas, but its result was
    # overwritten by the literal below before use, so the read only added a
    # dependency on an external file.
    matcher = UrlMatcher()

    # The first value carries leading whitespace (including U+3000) to show
    # that values are stripped before matching.
    X = [["\u3000 https://www.google.co.jp/", "http://www.kantei.go.jp/", "www.google.com"]]

    result = matcher.match(X)
    print(result)


class PostalcodeMatcher(BaseMatcher):
    """Matcher for columns whose values look like 7-digit postal codes.

    The pattern is an optional ``〒`` mark, three digits, an optional ``-``
    or ``ー`` separator, and four digits. Half-width and full-width digits
    are both accepted.
    """

    # Regex source passed to BaseMatcher, which tests values with fullmatch.
    pattern = r"[〒]{0,1}[0-9０-９]{3}[-ー]{0,1}[0-9０-９]{4}"

    def __init__(self, threshold=0.8):
        """Create the matcher.

        Args:
            threshold: Minimum proportion of matching values for a column
                to be classified as postal codes.
        """
        super(PostalcodeMatcher, self).__init__(
            PostalcodeMatcher.pattern,
            threshold,
        )

# if __name__ == "__main__":
#     import pandas as pd
#     matcher = PostalcodeMatcher()
#     X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv")
#     X = [X_df[value].astype(str).tolist() for value in X_df.columns.values]

#     #X = [["\u3000 654-3515 ", " 567-9999 ", "〒1234567", "０８０ー９５６１", "gdsajgl;daj"]]
#     #print(list(map(matcher.compiled_pattern.search, X[0])))

#     result = matcher.match(X)
#     print(result)


class DataMatcher(BaseMatcher):
    """Matcher for columns whose values look like year/month/day dates.

    The effective pattern is ``year_pattern + mon_pattern + day_pattern``.
    Each part accepts digits or kanji numerals, an optional unit character
    (年 / 月 / 日) and an optional single separator (space, U+3000, ``-``,
    ``_`` or ``/``). The year may be prefixed by 昭和, 平成, 大正 or 西暦.

    NOTE(review): the class name is presumably a typo for "DateMatcher"; it
    is kept unchanged so existing callers keep working.
    """

    # Kanji numeral inventories (including the three "zero" circle glyphs).
    year_kansuuji = r"一二三四五六七八九十壱弐参拾百千\u25CB\u25EF\u3007"
    mon_day_kannsuuji = r"一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007"

    # Alternative year/month fragments; not used to build ``pattern`` here.
    # The 西暦 prefix is a non-capturing group: a character class would only
    # match one of its two characters.
    a_year = r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007]?" \
             r"[\d一二三四五六七八九壱弐参][十拾]?[\d一二三四五六七八九壱弐参][年]?[ \u3000\-_/]?"
    # Raw string: the previous non-raw literal relied on the invalid escape
    # sequences "\d" and "\-" being passed through unchanged.
    b_year = r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九壱弐参][十拾\u25CB\u25EF\u3007]?" \
             r"[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007][年]?[ \u3000\-_/]?"

    a_mon = r"[0-1０１一壱]?[十拾\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007][月]?" \
            r"[ \u3000\-_/]?"

    chirist_year_pattern = r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,3}[年]?[ \u3000\-_/]?"
    # Era names are multi-character, so they must be an alternation; inside
    # "[...]" the "|" is a literal and only one character would be consumed.
    wareki_year_pattern = r"(?:昭和|平成|大正|西暦)?[ \u3000]?[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,4}[年]?[ \u3000\-_/]?"

    year_pattern = wareki_year_pattern
    mon_pattern = r"[0-1０１一壱]?[十\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参拾\u25CB\u25EF\u3007][月]?" \
                  r"[ \u3000\-_/]?"
    # The tens digit accepts the full range of full-width digits ０-３; the
    # previous class listed ０, ２ and ３ but omitted １.
    day_pattern = r"[0-3０-３一二三壱弐参]?[十\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参拾\u25CB\u25EF\u3007][日]?" \
                  r"[ \u3000\-_/]?"

    pattern = year_pattern + mon_pattern + day_pattern

    def __init__(self, threshold=0.8):
        """Create the matcher.

        Args:
            threshold: Minimum proportion of matching values for a column
                to be classified as dates.
        """
        super().__init__(pattern=DataMatcher.pattern, threshold=threshold)

# if __name__ == "__main__":
#     import pandas as pd
#     matcher = DataMatcher()
#     #X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv")
#     #X = [X_df[value].astype(str).tolist() for value in X_df.columns.values]

#     X = [["2016年09月26日", "1600-8-10", "平成 ２８年 4月 8日", "昭和三十二年五月壹〇日", "353", "20160"], ["212-0000"],
#          ["20160809"]]

#     #X = [["2016年", "1600", "平成 ２８年", "昭和三十二年", "2016"], ["212-0000"],
#     #    ["2016"]]

#     #print(list(map(matcher.compiled_pattern.search, X[0])))

#     result = matcher.match(X)
#     print(result)
	import re
	import numpy as np

	class BaseMatcher(object):
		"""Decide whether columns of strings conform to a regular expression.

		A column (a list of values) is reported as a match when the proportion
		of its usable values that fully match ``pattern`` reaches ``threshold``.
		"""

		def __init__(self, pattern, threshold):
			"""Compile ``pattern`` and store the acceptance ``threshold``.

			Args:
				pattern: Regular-expression source. Values are tested with
					``fullmatch``, so the whole (stripped) value must match.
				threshold: Minimum proportion of matching values for a column
					to count as a match in :meth:`match`.
			"""
			self.pattern = pattern
			self.compiled_pattern = re.compile(pattern=self.pattern)
			self.threshold = threshold

		def match(self, X):
			"""Return one plain ``bool`` per column in ``X``.

			``X`` is an iterable of columns, each a list of strings; an entry
			is ``True`` where the column's match proportion is at least
			``self.threshold``.
			"""
			return [bool(self.get_match_proportion(word_list) >= self.threshold)
					for word_list in X]

		# TODO: confirm that fullmatch (rather than search/match) is the right test.
		# TODO: stripping should be moved to a preprocessing step.
		def pattern_match(self, word_list):
			"""Return a match flag for every usable value in ``word_list``.

			``None`` and empty-string entries are skipped; remaining values are
			stripped of surrounding whitespace and tested with ``fullmatch``.
			"""
			return [self.compiled_pattern.fullmatch(x.strip()) is not None
					for x in word_list
					if x is not None and x != ""]

		def get_match_proportion(self, word_list):
			"""Return the fraction of usable values in ``word_list`` that match.

			Returns ``0.0`` when there are no usable values; ``np.mean`` of an
			empty list would otherwise produce ``nan`` and a RuntimeWarning.
			"""
			flags = self.pattern_match(word_list)
			if not flags:
				return 0.0
			return np.mean(flags)

	class MailaddressMatcher(BaseMatcher):
		"""Matcher for columns whose values look like e-mail addresses.

		The pattern is a run of allowed characters, an ``@``, and another run
		of allowed characters; half-width and full-width letters and digits
		are accepted alongside a set of punctuation characters.
		"""

		# NOTE(review): "\\|" inside the character class admits a literal
		# backslash as well as "|" - confirm that this is intended.
		pattern = r"[a-zA-Zａ-ｚＡ-Ｚ0-9０-９!#$%&’+/=?^`{\\|\-_.}]+?@[a-zA-Zａ-ｚＡ-Ｚ0-9０-９!#$%&’+/=?^`{\\|\-_.}]+?"

		def __init__(self, threshold=0.8):
			"""Create the matcher with the given match-proportion threshold."""
			super().__init__(pattern=MailaddressMatcher.pattern, threshold=threshold)

	# if __name__ == "__main__":
	# import pandas as pd
	# matcher = MailaddressMatcher()
	# X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv")
	# X = [X_df[value].astype(str).tolist() for value in X_df.columns.values]

	# #X = [["______@gamil.com"]]
	# #print(list(map(matcher.compiled_pattern.search, X[0])))

	# result = matcher.match(X)
	# print(result)

	class UrlMatcher(BaseMatcher):
		"""Matcher for columns whose values look like ``www.`` URLs.

		A value matches when it is an optional ``http://`` or ``https://``
		scheme, the literal prefix ``www.``, and one or more allowed
		characters.
		"""

		# The dot after "www" is escaped so that it only matches a literal ".";
		# unescaped it would accept any character (e.g. "wwwXexample.com").
		# NOTE(review): "\\|" inside the character class admits a literal
		# backslash as well as "|" - confirm that this is intended.
		pattern = r"(https?://)?www\.[a-zA-Zａ-ｚＡ-Ｚ0-9０-９!#\$%&’\*\+/=\?^`\\|\-_\.]+"

		def __init__(self, threshold=0.8):
			"""Create the matcher with the given match-proportion threshold."""
			super().__init__(pattern=UrlMatcher.pattern, threshold=threshold)

	if __name__ == "__main__":
		# Smoke demo for UrlMatcher on a single hand-written column.
		# The previous version also read a CSV through pandas, but its result
		# was overwritten by the literal below before use.
		matcher = UrlMatcher()

		# The first value carries leading whitespace (including U+3000) to
		# show that values are stripped before matching.
		X = [["\u3000 https://www.google.co.jp/", "http://www.kantei.go.jp/", "www.google.com"]]

		result = matcher.match(X)
		print(result)


	class PostalcodeMatcher(BaseMatcher):
		"""Matcher for columns whose values look like 7-digit postal codes.

		The pattern is an optional ``〒`` mark, three digits, an optional
		``-`` or ``ー`` separator, and four digits; half-width and full-width
		digits are both accepted.
		"""

		pattern = r"[〒]{0,1}[0-9０-９]{3}[-ー]{0,1}[0-9０-９]{4}"

		def __init__(self, threshold=0.8):
			"""Create the matcher with the given match-proportion threshold."""
			super().__init__(pattern=PostalcodeMatcher.pattern, threshold=threshold)

	# if __name__ == "__main__":
	# import pandas as pd
	# matcher = PostalcodeMatcher()
	# X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv")
	# X = [X_df[value].astype(str).tolist() for value in X_df.columns.values]

	# #X = [["\u3000 654-3515 ", " 567-9999 ", "〒1234567", "０８０ー９５６１", "gdsajgl;daj"]]
	# #print(list(map(matcher.compiled_pattern.search, X[0])))

	# result = matcher.match(X)
	# print(result)


	class DataMatcher(BaseMatcher):
		"""Matcher for columns whose values look like year/month/day dates.

		The effective pattern is ``year_pattern + mon_pattern + day_pattern``.
		Each part accepts digits or kanji numerals, an optional unit character
		(年 / 月 / 日) and an optional single separator (space, U+3000, ``-``,
		``_`` or ``/``). The year may be prefixed by 昭和, 平成, 大正 or 西暦.

		NOTE(review): the class name is presumably a typo for "DateMatcher";
		it is kept unchanged so existing callers keep working.
		"""

		# Kanji numeral inventories (including the three "zero" circle glyphs).
		year_kansuuji = r"一二三四五六七八九十壱弐参拾百千\u25CB\u25EF\u3007"
		mon_day_kannsuuji = r"一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007"

		# Alternative year/month fragments; not used to build ``pattern`` here.
		# The 西暦 prefix is a non-capturing group: a character class would
		# only match one of its two characters.
		a_year = r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007]?" \
			r"[\d一二三四五六七八九壱弐参][十拾]?[\d一二三四五六七八九壱弐参][年]?[ \u3000\-_/]?"
		# Raw string: the previous non-raw literal relied on the invalid
		# escape sequences "\d" and "\-" being passed through unchanged.
		b_year = r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九壱弐参][十拾\u25CB\u25EF\u3007]?" \
			r"[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007][年]?[ \u3000\-_/]?"

		a_mon = r"[0-1０１一壱]?[十拾\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参\u25CB\u25EF\u3007][月]?" \
			r"[ \u3000\-_/]?"

		chirist_year_pattern = r"(?:西暦)?[ \u3000]?[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,3}[年]?[ \u3000\-_/]?"
		# Era names are multi-character, so they must be an alternation;
		# inside "[...]" only one character would be consumed.
		wareki_year_pattern = r"(?:昭和|平成|大正|西暦)?[ \u3000]?[\d一二三四五六七八九十壱弐参拾\u25CB\u25EF\u3007]{1,4}[年]?[ \u3000\-_/]?"

		year_pattern = wareki_year_pattern
		mon_pattern = r"[0-1０１一壱]?[十\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参拾\u25CB\u25EF\u3007][月]?" \
			r"[ \u3000\-_/]?"
		# The tens digit accepts the full range of full-width digits ０-３;
		# the previous class listed ０, ２ and ３ but omitted １.
		day_pattern = r"[0-3０-３一二三壱弐参]?[十\u25CB\u25EF\u3007]?[\d一二三四五六七八九壱弐参拾\u25CB\u25EF\u3007][日]?" \
			r"[ \u3000\-_/]?"

		pattern = year_pattern + mon_pattern + day_pattern

		def __init__(self, threshold=0.8):
			"""Create the matcher with the given match-proportion threshold."""
			super().__init__(pattern=DataMatcher.pattern, threshold=threshold)

	# if __name__ == "__main__":
	# import pandas as pd
	# matcher = DataMatcher()
	# #X_df = pd.read_csv("../datasets/demo-referred_V0.1.0.csv")
	# #X = [X_df[value].astype(str).tolist() for value in X_df.columns.values]

	# X = [["2016年09月26日", "1600-8-10", "平成２８年 4月 8日", "昭和三十二年五月壹〇日", "353", "20160"], ["212-0000"],
	# ["20160809"]]

	# #X = [["2016年", "1600", "平成２８年", "昭和三十二年", "2016"], ["212-0000"],
	# # ["2016"]]

	# #print(list(map(matcher.compiled_pattern.search, X[0])))

	# result = matcher.match(X)
	# print(result)