import json
import math
import re
import urllib.parse
from collections import Counter
from difflib import SequenceMatcher

from tldextract import extract as tldextract
from unidecode import unidecode

STOPWORDS = ('aps',)  # name tokens ignored when matching (e.g. the 'ApS' company suffix)


def get_domain(url):
    netloc = urllib.parse.urlsplit(url).netloc
    parts = tldextract(netloc)
    domain = parts.domain.replace('-', '')
    return domain
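
# Illustrative example (not from the original gist), assuming tldextract's
# bundled public suffix list:
#   >>> get_domain('https://www.example-corp.co.uk/about')
#   'examplecorp'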


def get_names(name):
    name = unidecode(name.lower())
    parts = re.split(r'\W+', name)  # raw string avoids the invalid '\W' escape warning
    return [p for p in parts if len(p) > 1 and p not in STOPWORDS]
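
# Illustrative example (not from the original gist); unidecode transliterates
# the accented characters and the 'aps' suffix is dropped as a stopword:
#   >>> get_names('Ødegård & Sønner ApS')
#   ['odegard', 'sonner']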


def can_compose_word(word, parts):
    """Check if the word can be composed from the parts.

    TODO: check overlapping parts.
    """
    nlen = len(word)
    mlen = 0
    for p in parts:
        if word.find(p) != -1:
            mlen += len(p)
            # strict match: the parts cover the word exactly
            if mlen == nlen:
                return True
            # approximate match: the parts cover most of the word
            elif math.isclose(mlen, nlen, rel_tol=0.2):
                return True
    return False
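
# Illustrative examples (not from the original gist):
#   >>> can_compose_word('examplecorp', ['example', 'corp'])
#   True
#   >>> can_compose_word('examplecorp', ['example'])
#   False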


def is_valid_match(names, url):
    name = ''.join(names)
    sm = SequenceMatcher(None, name, url)
    blocks = sm.get_matching_blocks()
    mlen = sum(s.size for s in blocks)
    slen = min(len(url), len(name))
    return math.isclose(mlen, slen, rel_tol=0.25)
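
# Illustrative example (not from the original gist): most of the shorter
# string is covered by matching blocks, so this counts as a match:
#   >>> is_valid_match(['acme', 'industries'], 'acmeind')
#   True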


def process_record(data, debug=False):
    for _, name, urls in data:
        nameparts = get_names(name)
        for url in urls:
            domain = get_domain(url)
            if is_valid_match(nameparts, domain):
                if debug:
                    print('MATCH: ', nameparts, url)
                else:
                    print(nameparts, domain, ' \t# ', url)
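
# The unpacking above implies each record is a triple whose second and third
# fields are the company name and its list of URLs; an illustrative record
# (not from the original data) might look like:
#   [42, "Acme Industries A/S", ["http://www.acme-industries.dk"]]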


def count(data):
    c = Counter()
    for _, _, urls in data:
        c[len(urls)] += 1
    print('Total: {}'.format(sum(c.values())))
    print('Number of urls per company:')
    for v in sorted(c.most_common()):
        print('{}: {}'.format(*v))


if __name__ == '__main__':
    import argparse

    ap = argparse.ArgumentParser()
    ap.add_argument('--debug', '-d', action='store_true')
    ap.add_argument('--count', '-c', action='store_true')
    ap.add_argument('infile', type=argparse.FileType('r'))
    args = ap.parse_args()

    data = json.load(args.infile)
    if args.count:
        count(data)
    else:
        process_record(data, args.debug)
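
# Example invocation (illustrative; 'matchnames.py' and 'companies.json' are
# placeholder names, not from the original gist):
#   python matchnames.py companies.json      # print name/domain pairs that match
#   python matchnames.py -d companies.json   # label matches with a 'MATCH:' prefix
#   python matchnames.py -c companies.json   # URL-count statistics only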