Skip to content

Instantly share code, notes, and snippets.

@englehardt
Created November 22, 2016 16:47
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save englehardt/9d46b7d581d22de705103540e8298951 to your computer and use it in GitHub Desktop.
Save englehardt/9d46b7d581d22de705103540e8298951 to your computer and use it in GitHub Desktop.
Python script used to generate `organizations.json` (https://gist.github.com/englehardt/a8ce765e410615de83bb40533b0eed29).
from collections import defaultdict
import json
import dill
import os
# Directory that the input lists are read from and the output is written to
DATA_DIR = './'
# File name of the webxray organization list (JSON), joined with DATA_DIR
WEBXRAY_LIST = 'webxray_orgs.json'
# File name of the disconnect list (JSON), joined with DATA_DIR
DISCONNECT_LIST = 'disconnect_list.json'
# File name the merged organization mapping is dumped to (via dill)
OUT_LIST = 'merged_organizations.dill'
# Maps organization keys from the disconnect list to the matching webxray
# organization name (disconnect: webxray). webxray's names are preferred as
# they seem to be kept much more up to date.
# Most of this mapping was generated automatically by merging orgs that shared
# domains between the two lists. Many, but not all, entries were spot checked
# to ensure an actual merge/rebrand occurred, so bugs are possible.
#
# NOTE: a dict literal keeps only the LAST value given for a repeated key, so
# each disconnect org can map to exactly one webxray org. Two keys used to be
# listed twice; the shadowed entries are kept below as comments.
MAPPING = {
    'safecount': 'kantar',
    # 'exponential interactive': 'tribal fusion' was also listed here, but it
    # was always overridden by the entry below, which used the same key.
    'exponential interactive': 'exponential advertising intelligence',
    'krux': 'krux digital',
    'shopzilla': 'the e.w. scripps company',
    'amazon.com': 'amazon',
    'tubemogul': 'tube mogul',
    'brighttag': 'signal',
    '33across': '33 across',
    'contextweb': 'pulsepoint',
    'datran media': 'pulsepoint',
    'liveinternet': 'yadro',
    'maxpoint': 'maxpoint interactive',
    'adsafe media': 'integral ad science',
    'glam media': 'mode media',
    'ibm': 'international business machines corporation (ibm)',
    'radiumone': 'radium one',
    '[x+1]': 'x+1',
    'cbs interactive': 'cbs corporation',
    'audiencescience': 'audience science',
    'horyzon media': 'smart ad server',
    'linezing': 'alibaba cloud computing ltd.',
    'cnzz': 'alibaba cloud computing ltd.',
    'rapleaf': 'liveramp, inc.',
    'webtrends': 'netiq',
    'chango': 'chango, inc.',
    # 'dg': 'extreme reach' was also listed here, but it was always overridden
    # by the later 'dg': 'sizmek' entry, so DG's domains are filed under sizmek.
    'sharethis': 'share this',
    'rocket fuel': 'rocketfuel',
    'burst media': 'blinkx',
    'neustar': 'neustar, inc.',
    'ak': 'neustar, inc.',
    'mediamath': 'media math',
    'm6d': 'dstillery',
    'videology': 'videology group',
    'insightexpress': 'insight express',
    'teads.tv': 'teads',
    'spotxchange': 'spot x change',
    'cox digital solutions': 'coxreps',
    'turn': 'turn, inc.',
    'dg': 'sizmek',
    'sitescout': 'centro',
    'bloom digital platforms': 'adgear technologies',
    'vizu': 'nielsen online',
    'nielsen': 'nielsen online',
    'mochila': 'inform',
    'ndn': 'inform',
    'doubleverify': 'double verify',
    'magnetic': 'magnetic media online',
    'cognitive match': 'magnetic media online',
    'mybuys': 'magnetic media online',
    'fox one stop media': 'rubicon project',
    'switch': 'switch concepts',
}
# Import and parse the disconnect list.
# Builds d_orgs: lower-cased organization name -> set of that org's domains.
d_orgs = defaultdict(set)
# Use a context manager so the file handle is closed as soon as it is parsed
with open(os.path.join(DATA_DIR, DISCONNECT_LIST), 'r') as disconnect_file:
    disconnect = json.load(disconnect_file)
for category in ['Content', 'Analytics', 'Social', 'Advertising']:
    # Each category holds a list of rows; a row maps an org name to a dict
    # whose values are iterables of domains (the dict's keys are not needed).
    for row in disconnect['categories'][category]:
        for org, info in row.items():
            for domains in info.values():
                # Org names are lower-cased so they compare equal across lists
                d_orgs[org.lower()].update(domains)
# Import and parse the webxray list.
# Builds w_orgs: lower-cased organization name -> set of that org's domains.
# Use a context manager so the file handle is closed as soon as it is parsed
with open(os.path.join(DATA_DIR, WEBXRAY_LIST), 'r') as webxray_file:
    webxray = json.load(webxray_file)
w_orgs = defaultdict(set)
for row in webxray:
    org = row['organization'].lower()
    domains = row['domains']
    # Rows sharing an organization name are combined into a single set
    w_orgs[org].update(domains)
# Look for webxray orgs that have no same-named disconnect org and are not yet
# a MAPPING target, but share at least one domain with some disconnect org.
# Each hit is printed in MAPPING syntax ('disconnect':'webxray') and recorded
# in `matches` (webxray name -> disconnect name; the last hit wins).
print("Checking for overlap in domains between lists")
# Names already used as MAPPING targets are removed up front, so no further
# per-key check against MAPPING is needed inside the loop.
diff = set(w_orgs.keys()).difference(set(d_orgs.keys())).difference(set(MAPPING.values()))
matches = dict()
for key in diff:
    for org, domains in d_orgs.items():
        if domains.intersection(w_orgs[key]):
            print("Likely Match! '%s':'%s'" % (org, key))
            matches[key] = org
# Merge the disconnect orgs into the webxray orgs.
# NOTE: `merged` is the same object as `w_orgs` (no copy is made), so the
# webxray dict itself is extended in place. It is a defaultdict(set), so
# looking up a name that is not present yet starts from an empty set.
merged = w_orgs
for org, domains in d_orgs.items():
    if org in merged:
        # Same organization name in both lists
        target = org
    else:
        # Use the mapped webxray name if there is one, otherwise keep the
        # disconnect name as a new organization.
        target = MAPPING.get(org, org)
    merged[target] = merged[target].union(domains)
# Manual clean-up of bad mappings left over after the merge.
# Every step raises KeyError if the org or domain it names is missing.


def _absorb(parent, child):
    """Add all of `child`'s domains to `parent`, then delete the `child` org."""
    merged[parent] = merged[parent] | merged[child]
    del merged[child]


# Akamai is a CDN for Facebook
merged['facebook'].remove('akamaihd.net')
# Not the same company?
merged['exponential advertising intelligence'].remove('tribalfusion.com')
# liverail is now owned by facebook (already included)
del merged['liverail']
# valueclick is now conversant media
_absorb('conversant media', 'valueclick')
# Not really alibaba (the domains might just point to them)
merged['alibaba cloud computing ltd.'].remove('360buyimg.com')
merged['alibaba cloud computing ltd.'].remove('ykimg.com')
# adometry is subsumed (and owned) by google
del merged['adometry']
# DG was the parent company sizmek spun off from; the rest of DG was bought
# by extreme reach
merged['sizmek'].remove('dgit.com')
# adtech is owned by AOL
_absorb('aol', 'adtech')
# buzzcity is subsumed by (and part of) `bv! media`
del merged['buzzcity']
# anquan appears to be the same company as knownsec
del merged['anquan']
# Acquired by outbrain (domain already in list)
del merged['visual revenue']
del merged['vertical acuity']
# Acquired by AOL (domain already in list)
del merged['convertro']
del merged['gravity']
del merged['adap.tv']
# Included twice
del merged['emar box']
# quisma and i-behavior seem to be the same company
_absorb('quisma', 'i-behavior')
# matomy market is the same company as matomy
_absorb('matomy', 'matomy market')
# vimeo and match.com are iac brands
_absorb('iac', 'vimeo')
_absorb('iac', 'match.com')
# Owned (and subsumed) by yahoo
del merged['brightroll']
del merged['tumblr']
# Owned (and subsumed) by undertone
del merged['legolas media']
# vcmedia are admicro
del merged['vcmedia']
# adOn was acquired by PV (prime visibility)
_absorb('prime visibility', 'adon network')
# bizo is linkedin -- now killed off
del merged['bizo']
# bluekai and addthis were bought by oracle
_absorb('oracle', 'bluekai')
_absorb('oracle', 'addthis')
# Print domains mapped to multiple orgs (bad mappings).
# There should be none if the manual fixes above are comprehensive.
print("Checking for domains mapped to multiple orgs...")
org_items = list(merged.items())
for idx, (org, domains) in enumerate(org_items):
    # Compare only against later entries so that every conflicting pair is
    # reported once instead of once per ordering.
    for org2, domains2 in org_items[idx + 1:]:
        shared = domains.intersection(domains2)
        if not shared:
            continue
        print("\n\nOrg1: %s" % org)
        print(domains)
        print("Org2: %s" % org2)
        print(domains2)
        print("SHARED DOMAINS")
        print(shared)
# Serialize the merged mapping to DATA_DIR/OUT_LIST. Pickle streams are binary,
# so the file is opened in 'wb' mode (text mode can corrupt the data on
# Windows and fails on Python 3). The context manager flushes and closes the
# file before the script exits.
with open(os.path.join(DATA_DIR, OUT_LIST), 'wb') as out_file:
    dill.dump(merged, out_file)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment