Last active
November 10, 2017 16:47
-
-
Save otykhonruk/740dbd74477fefb84cfd90d241dce504 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json | |
import math | |
import re | |
import urllib.parse | |
from collections import Counter | |
from difflib import SequenceMatcher | |
from pprint import pprint | |
from tldextract import extract as tldextract | |
from unidecode import unidecode | |
# Tokens dropped when tokenizing company names in get_names
# (presumably legal-form suffixes such as the Danish "ApS" — verify).
STOPWORDS = ('aps',)
def get_domain(url):
    """Extract the registered domain (no suffix, hyphens stripped) from *url*."""
    host = urllib.parse.urlsplit(url).netloc
    extracted = tldextract(host)
    # drop hyphens so e.g. "my-company" can match the name "mycompany"
    return extracted.domain.replace('-', '')
def get_names(name):
    """Tokenize a company name into lowercase ASCII word parts.

    Transliterates to ASCII, lowercases, splits on non-word characters,
    and drops single-character tokens and stopwords.
    """
    name = unidecode(name.lower())
    # raw string: '\W' in a plain string is an invalid escape sequence
    parts = re.split(r'\W', name)
    return [p for p in parts if len(p) > 1 and p not in STOPWORDS]
def can_compose_word(word, parts):
    """Check whether *word* can be (approximately) composed from *parts*.

    Sums the lengths of every part that occurs as a substring of *word*
    and compares the total against len(word): exact coverage matches
    strictly; otherwise a 20% relative tolerance is accepted.

    TODO: overlapping parts are double-counted.
    """
    target = len(word)
    # idiomatic membership test instead of `word.find(p) != -1`
    matched = sum(len(p) for p in parts if p in word)
    # strict match: matched part lengths cover the word exactly
    if matched == target:
        return True
    # approximate match: within 20% relative tolerance
    return math.isclose(matched, target, rel_tol=0.2)
def is_valid_match(names, url):
    """Return True when the concatenated name approximately matches *url*.

    Joins *names*, finds the difflib matching blocks against *url*, and
    accepts when the total matched length is within 25% (relative) of
    the shorter of the two strings.
    """
    joined = ''.join(names)
    matcher = SequenceMatcher(None, joined, url)
    matched = sum(block.size for block in matcher.get_matching_blocks())
    shorter = min(len(url), len(joined))
    return math.isclose(matched, shorter, rel_tol=0.25)
def process_record(data, debug=False):
    """Match each company's name against the domains of its URLs and print results.

    data: iterable of (id, name, urls) triples.
    debug: when True, also print positive matches; mismatches are always printed.
    """
    for _, name, urls in data:
        # name tokenization does not depend on the url — hoist out of the inner loop
        nameparts = get_names(name)
        joined = ''.join(nameparts)
        for url in urls:
            domain = get_domain(url)
            if is_valid_match(joined, domain):
                if debug:
                    print('MATCH: ', nameparts, url)
            else:
                print(nameparts, domain, ' \t# ', url)
def count(data):
    """Print the total record count and a histogram of urls-per-company.

    data: iterable of (id, name, urls) triples.
    """
    tally = Counter(len(urls) for _, _, urls in data)
    print('Total: {}'.format(sum(tally.values())))
    print('Number of urls per company:')
    for num_urls, companies in sorted(tally.most_common()):
        print('{}: {}'.format(num_urls, companies))
if __name__ == '__main__':
    import argparse

    # CLI: either summarize url counts (--count) or run the name/domain matcher.
    parser = argparse.ArgumentParser()
    parser.add_argument('--debug', '-d', action='store_true')
    parser.add_argument('--count', '-c', action='store_true')
    parser.add_argument('infile', type=argparse.FileType('r'))
    ns = parser.parse_args()

    records = json.load(ns.infile)
    if ns.count:
        count(records)
    else:
        process_record(records, ns.debug)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment