Created
November 22, 2016 16:47
-
-
Save englehardt/9d46b7d581d22de705103540e8298951 to your computer and use it in GitHub Desktop.
Python script used to generate `organizations.json` (https://gist.github.com/englehardt/a8ce765e410615de83bb40533b0eed29).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import json
import os
from collections import defaultdict

import dill
# Directory that holds the two input lists and receives the output file.
DATA_DIR = './'
# Input: webxray organization list (JSON), read relative to DATA_DIR.
WEBXRAY_LIST = 'webxray_orgs.json'
# Input: Disconnect organization list (JSON), read relative to DATA_DIR.
DISCONNECT_LIST = 'disconnect_list.json'
# Output: merged org -> domains mapping, serialized with dill into DATA_DIR.
OUT_LIST = 'merged_organizations.dill'
# Maps organization keys as {disconnect name: webxray name}.
# webxray's names are preferred as they seem much more up to date.
# Most of this mapping was generated automatically by merging orgs that
# shared domains between the two lists. Many, but not all, entries were spot
# checked to ensure an actual merge/rebrand occurred, so bugs are possible.
#
# NOTE: the original literal listed two keys twice. A dict literal keeps only
# the last value for a repeated key, so these earlier entries never took
# effect and are recorded here as comments only:
#     'exponential interactive': 'tribal fusion'
#     'dg': 'extreme reach'
MAPPING = {
    'safecount': 'kantar',
    'exponential interactive': 'exponential advertising intelligence',
    'krux': 'krux digital',
    'shopzilla': 'the e.w. scripps company',
    'amazon.com': 'amazon',
    'tubemogul': 'tube mogul',
    'brighttag': 'signal',
    '33across': '33 across',
    'contextweb': 'pulsepoint',
    'datran media': 'pulsepoint',
    'liveinternet': 'yadro',
    'maxpoint': 'maxpoint interactive',
    'adsafe media': 'integral ad science',
    'glam media': 'mode media',
    'ibm': 'international business machines corporation (ibm)',
    'radiumone': 'radium one',
    '[x+1]': 'x+1',
    'cbs interactive': 'cbs corporation',
    'audiencescience': 'audience science',
    'horyzon media': 'smart ad server',
    'linezing': 'alibaba cloud computing ltd.',
    'cnzz': 'alibaba cloud computing ltd.',
    'rapleaf': 'liveramp, inc.',
    'webtrends': 'netiq',
    'chango': 'chango, inc.',
    'sharethis': 'share this',
    'rocket fuel': 'rocketfuel',
    'burst media': 'blinkx',
    'neustar': 'neustar, inc.',
    'ak': 'neustar, inc.',
    'mediamath': 'media math',
    'm6d': 'dstillery',
    'videology': 'videology group',
    'insightexpress': 'insight express',
    'teads.tv': 'teads',
    'spotxchange': 'spot x change',
    'cox digital solutions': 'coxreps',
    'turn': 'turn, inc.',
    'dg': 'sizmek',
    'sitescout': 'centro',
    'bloom digital platforms': 'adgear technologies',
    'vizu': 'nielsen online',
    'nielsen': 'nielsen online',
    'mochila': 'inform',
    'ndn': 'inform',
    'doubleverify': 'double verify',
    'magnetic': 'magnetic media online',
    'cognitive match': 'magnetic media online',
    'mybuys': 'magnetic media online',
    'fox one stop media': 'rubicon project',
    'switch': 'switch concepts',
}
# Import and parse the Disconnect list.
# Builds d_orgs: lower-cased organization name -> set of its domains.
# Only the four categories named below are read. Each category is iterated as
# a sequence of {organization: info} dicts, and every value of `info` is
# treated as an iterable of domains (the info keys themselves are not used).
d_orgs = defaultdict(set)
with open(os.path.join(DATA_DIR, DISCONNECT_LIST), 'r') as disconnect_file:
    disconnect = json.load(disconnect_file)
for category in ['Content', 'Analytics', 'Social', 'Advertising']:
    for row in disconnect['categories'][category]:
        for org, info in row.items():
            for domains in info.values():
                # Organizations can appear in several rows/categories, so
                # accumulate into the existing set rather than overwrite it.
                d_orgs[org.lower()].update(domains)
# Import and parse the webxray list.
# Builds w_orgs: lower-cased organization name -> set of its domains.
# Each row is read through its 'organization' and 'domains' keys; rows that
# share an organization name (after lower-casing) are merged into one set.
with open(os.path.join(DATA_DIR, WEBXRAY_LIST), 'r') as webxray_file:
    webxray = json.load(webxray_file)
w_orgs = defaultdict(set)
for row in webxray:
    w_orgs[row['organization'].lower()].update(row['domains'])
# Check for overlap in keys.
# `diff` holds the webxray organization names that are neither a Disconnect
# organization name nor already the target of a MAPPING entry. For each of
# them, report every Disconnect organization that shares at least one domain:
# such a pair is a candidate for a new MAPPING entry.
print("Checking for overlap in domains between lists")
diff = set(w_orgs) - set(d_orgs) - set(MAPPING.values())
# webxray name -> Disconnect name; if several Disconnect orgs overlap, the
# last one visited is the one kept.
matches = dict()
for key in diff:
    key_domains = w_orgs[key]
    for org, domains in d_orgs.items():
        if domains.intersection(key_domains):
            print("Likely Match! '%s':'%s'" % (org, key))
            matches[key] = org
# Merge the lists.
# `merged` is the same object as w_orgs (not a copy), so the webxray entries
# are the starting point and the Disconnect entries are folded into them.
merged = w_orgs
for org, domains in d_orgs.items():
    # Pick the destination key: the Disconnect name if webxray already uses
    # it, otherwise its MAPPING translation, otherwise the name unchanged.
    target = org if org in merged else MAPPING.get(org, org)
    # merged is a defaultdict(set), so a missing target starts out as an empty
    # set; updating in place also avoids sharing set objects with d_orgs.
    merged[target].update(domains)
# Manual removal of bad mappings.
# set.remove() and dict.pop() without a default raise KeyError when the
# domain/organization is absent, so a fix-up that no longer applies to the
# input lists fails loudly instead of being skipped.

# Akamai is a CDN for Facebook
merged['facebook'].remove('akamaihd.net')
# Not same company?
merged['exponential advertising intelligence'].remove('tribalfusion.com')
# now owned by facebook (already included)
merged.pop('liverail')
# valueclick is now conversant media
merged['conversant media'] = merged['conversant media'].union(merged['valueclick'])
merged.pop('valueclick')
# not really alibaba (domain might point to them)
merged['alibaba cloud computing ltd.'].remove('360buyimg.com')
# not alibaba (domain might point to them)
merged['alibaba cloud computing ltd.'].remove('ykimg.com')
# subsumed (and owned) by google
merged.pop('adometry')
# DG was the parent company that sizmek spun off from; the rest of DG was
# bought by extreme reach
merged['sizmek'].remove('dgit.com')
# adtech owned by AOL
merged['aol'] = merged['aol'].union(merged['adtech'])
merged.pop('adtech')
# subsumed (and part of) `bv! media`
merged.pop('buzzcity')
# appears to be same company as knownsec
merged.pop('anquan')
# acquired by outbrain (domain already in list)
merged.pop('visual revenue')
# acquired by outbrain (domain already in list)
merged.pop('vertical acuity')
# acquired by AOL (domain already in list)
merged.pop('convertro')
# acquired by AOL (domain already in list)
merged.pop('gravity')
# acquired by AOL (domain already in list)
merged.pop('adap.tv')
# included twice
merged.pop('emar box')
# seem to be same company
merged['quisma'] = merged['quisma'].union(merged['i-behavior'])
merged.pop('i-behavior')
# same company
merged['matomy'] = merged['matomy'].union(merged['matomy market'])
merged.pop('matomy market')
# vimeo is an iac brand
merged['iac'] = merged['iac'].union(merged['vimeo'])
merged.pop('vimeo')
# match.com is an iac brand
merged['iac'] = merged['iac'].union(merged['match.com'])
merged.pop('match.com')
# owned (and subsumed) by yahoo
merged.pop('brightroll')
# owned (and subsumed) by yahoo
merged.pop('tumblr')
# owned (and subsumed) by undertone
merged.pop('legolas media')
# they are admicro
merged.pop('vcmedia')
# adOn acquired by PV
merged['prime visibility'] = merged['prime visibility'].union(merged['adon network'])
merged.pop('adon network')
# bizo is linkedin -- now killed off
merged.pop('bizo')
# bluekai bought by oracle
merged['oracle'] = merged['oracle'].union(merged['bluekai'])
merged.pop('bluekai')
# addthis bought by oracle
merged['oracle'] = merged['oracle'].union(merged['addthis'])
merged.pop('addthis')
# Print domains mapped to multiple orgs (bad mappings).
# Should print nothing if the manual fix-ups above are comprehensive.
# Every ordered pair of distinct organizations is compared, so an overlapping
# pair is reported twice (once in each order).
print("Checking for domains mapped to multiple orgs...")
for org, domains in merged.items():
    for org2, domains2 in merged.items():
        if org == org2:
            continue
        shared = domains.intersection(domains2)
        if shared:
            print("\n\nOrg1: %s" % org)
            print(domains)
            print("Org2: %s" % org2)
            print(domains2)
            print("SHARED DOMAINS")
            print(shared)
# Serialize the merged org -> domains mapping. The pickle stream written by
# dill is binary, so the file is opened in 'wb' (text mode fails on Python 3
# and can corrupt the data on platforms that translate newlines); the `with`
# block guarantees the file is flushed and closed.
with open(os.path.join(DATA_DIR, OUT_LIST), 'wb') as out_file:
    dill.dump(merged, out_file)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment