Skip to content

Instantly share code, notes, and snippets.

@satomacoto
Last active March 24, 2017 05:52
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save satomacoto/5279498 to your computer and use it in GitHub Desktop.
Save satomacoto/5279498 to your computer and use it in GitHub Desktop.
count tag occurrence and cooccurrence on xvideos.com-db.csv
('blowjob', 'hardcore') 594241
('blowjob', 'brunette') 336485
('blowjob', 'teen') 334746
('amateur', 'teen') 322670
('brunette', 'hardcore') 320892
('hardcore', 'teen') 302974
('blonde', 'brunette') 282348
('blonde', 'blowjob') 272567
('blowjob', 'oral') 269461
('blonde', 'hardcore') 252474
('amateur', 'hardcore') 250742
('hardcore', 'pussy') 247591
('amateur', 'blowjob') 244265
('blowjob', 'cumshot') 230831
('blowjob', 'tits') 230508
('hardcore', 'tits') 229731
('blowjob', 'facial') 225786
('blowjob', 'pussy') 225396
('boobs', 'tits') 222670
('brunette', 'teen') 222564
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# count tag cooccurrence
# $ cat xvideos.com-db.csv|python cooccur.py
import sys
from collections import defaultdict
from itertools import combinations
def count_cooccurrences(lines, tag_field=5):
    """Count how often each unordered pair of tags appears on the same row.

    Each line is stripped, split on ';', and the field at index ``tag_field``
    is read as a comma-separated tag list.  Rows too short to contain that
    field, or whose tag field is empty, are skipped.

    Args:
        lines: iterable of text lines (e.g. ``sys.stdin``).
        tag_field: zero-based index of the ';'-separated field holding tags.

    Returns:
        A mapping from ``(tag_a, tag_b)`` tuples (in sorted order) to the
        number of times that pair was seen.
    """
    pair_counts = defaultdict(int)
    for line in lines:
        elements = line.strip().split(';')
        # Blank or truncated rows have no tag column; skip them instead of
        # aborting the whole run with an IndexError.
        if len(elements) <= tag_field:
            continue
        tags = elements[tag_field]
        if not tags:
            continue
        # Sorting first makes ('a', 'b') and ('b', 'a') map to the same key.
        for pair in combinations(sorted(tags.split(',')), 2):
            pair_counts[pair] += 1
    return pair_counts


def main(lines=None, limit=20):
    """Print the ``limit`` most frequent tag pairs as ``<pair> <count>``.

    Reads from ``sys.stdin`` when ``lines`` is not given.
    """
    counts = count_cooccurrences(sys.stdin if lines is None else lines)
    for pair in sorted(counts, key=counts.get, reverse=True)[:limit]:
        # print() with one pre-formatted string behaves the same on
        # Python 2 and Python 3, unlike the old ``print a, b`` statement.
        print("{0} {1}".format(pair, counts[pair]))


if __name__ == "__main__":
    main()
blowjob 931322
hardcore 862966
amateur 672166
teen 617227
brunette 513748
blonde 468833
pussy 432378
tits 378725
anal 374469
oral 335429
cumshot 322557
fucking 312005
boobs 303696
ass 297976
facial 284967
asian 256598
babe 247601
gay 232872
bigtits 227229
sex 214113
#!/usr/bin/env python
# -*- coding:utf-8 -*-
# count tag occurrence
# $ cat xvideos.com-db.csv|python occur.py
import sys
from collections import defaultdict
def count_occurrences(lines, tag_field=5):
    """Count how many times each tag appears across all rows.

    Each line is stripped, split on ';', and the field at index ``tag_field``
    is read as a comma-separated tag list.  Rows too short to contain that
    field, or whose tag field is empty, are skipped.

    Args:
        lines: iterable of text lines (e.g. ``sys.stdin``).
        tag_field: zero-based index of the ';'-separated field holding tags.

    Returns:
        A mapping from tag string to the number of times it was seen.
    """
    tag_counts = defaultdict(int)
    for line in lines:
        elements = line.strip().split(';')
        # Blank or truncated rows have no tag column; skip them instead of
        # aborting the whole run with an IndexError.
        if len(elements) <= tag_field:
            continue
        tags = elements[tag_field]
        if not tags:
            continue
        for tag in tags.split(','):
            tag_counts[tag] += 1
    return tag_counts


def main(lines=None, limit=20):
    """Print the ``limit`` most frequent tags as ``<tag> <count>``.

    Reads from ``sys.stdin`` when ``lines`` is not given.
    """
    counts = count_occurrences(sys.stdin if lines is None else lines)
    for tag in sorted(counts, key=counts.get, reverse=True)[:limit]:
        # print() with one pre-formatted string behaves the same on
        # Python 2 and Python 3, unlike the old ``print a, b`` statement.
        print("{0} {1}".format(tag, counts[tag]))


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment