Skip to content

Instantly share code, notes, and snippets.

@satomacoto
Last active December 15, 2022 10:34
Show Gist options
  • Star 3 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save satomacoto/5290437 to your computer and use it in GitHub Desktop.
Save satomacoto/5290437 to your computer and use it in GitHub Desktop.
kNN on xvideos.com-db.csv
# -*- coding:utf-8 -*-
from pymongo import MongoClient
# Module-level MongoDB handles: a client created with MongoClient's default
# arguments, and its `xvideos` database, which create_db() writes into.
client = MongoClient()
db = client.xvideos
def create_db():
    """Load ``xvideos.com-db.csv`` into the ``videos`` collection of ``db``.

    Every line of the file is split on ';'. Field 1 is stored as ``title``,
    field 5 (a comma-separated string) as the ``tags`` list, and field 6 as
    ``genre``. An empty tag field yields an empty list.

    The import is best-effort: a line with fewer than seven fields, or a
    document the database rejects, is logged and skipped so that one bad
    record does not abort the whole run.
    """
    import logging  # local import so this function stands on its own

    log = logging.getLogger(__name__)

    # `with` guarantees the file handle is closed, even on error.
    with open('xvideos.com-db.csv') as f:
        for lineno, line in enumerate(f, 1):
            # Strip only the line terminator; slicing off the last character
            # would eat real data on a final line that has no newline.
            row = line.rstrip('\r\n').split(';')
            if len(row) < 7:
                log.warning('line %d: expected at least 7 fields, got %d; '
                            'skipped', lineno, len(row))
                continue

            tags = row[5].strip()
            video = {'title': row[1],
                     'tags': tags.split(',') if tags else [],
                     'genre': row[6]}
            try:
                # insert_one() replaces Collection.insert(), which no longer
                # exists in current PyMongo releases.
                db.videos.insert_one(video)
            except Exception:
                # Keep going, but leave a trace instead of failing silently.
                # KeyboardInterrupt / SystemExit still propagate.
                log.exception('line %d: insert failed; skipped', lineno)
# Script entry point: runs the CSV import as soon as this module is executed.
create_db()
# -*- coding:utf-8 -*-
import heapq
import itertools
from pymongo import MongoClient
# Module-level MongoDB handles: a client created with MongoClient's default
# arguments, and its `xvideos` database, which knn() queries.
client = MongoClient()
db = client.xvideos
# http://stackoverflow.com/questions/1518522/python-most-common-element-in-a-list
def most_common(L):
    """Return the item that occurs most often in the list *L*.

    When several items share the highest count, the one whose first
    occurrence comes earliest in *L* wins.

    Args:
        L: a list of mutually orderable items (it is sorted internally, and
            ``L.index`` is used for tie-breaking).

    Returns:
        The most frequent item.

    Raises:
        ValueError: if *L* is empty.
    """
    # groupby() only merges adjacent equal items, hence the sort.
    groups = itertools.groupby(sorted(L))

    def _auxfun(group):
        # Unpack inside the body: tuple parameters in a signature are a
        # syntax error on Python 3.
        item, iterable = group
        count = sum(1 for _ in iterable)
        # Negated first index, so that on equal counts max() prefers the
        # item that appears first in L.
        return count, -L.index(item)

    return max(groups, key=_auxfun)[0]
def jaccard(L, M):
    """Return the Jaccard index of two collections.

    Both arguments are converted to sets; the result is
    ``|L & M| / |L | M|``, a float between 0.0 (disjoint) and 1.0 (equal).

    Args:
        L: an iterable of hashable items.
        M: an iterable of hashable items.

    Returns:
        The similarity as a float. Two empty collections are treated as
        identical and give 1.0 (the usual convention) rather than raising
        ZeroDivisionError.
    """
    a = set(L)
    b = set(M)
    union = a | b
    if not union:
        # Both inputs empty: the ratio would be 0/0.
        return 1.0
    return float(len(a & b)) / len(union)
def knn(query_tags, k=10, dist=jaccard):
    """Predict a genre for *query_tags* with a k-nearest-neighbour vote.

    Candidates are the documents in ``db.videos`` that share at least one
    tag with the query and whose genre is not 'Unknow'. Each candidate is
    scored with ``dist(query_tag_set, video_tag_set)``; the *k* candidates
    with the highest scores vote, and the most common genre among them is
    returned (ties are resolved by most_common()).

    Args:
        query_tags: iterable of tag strings describing the query.
        k: number of neighbours that take part in the vote.
        dist: callable taking two sets and returning a score where a
            HIGHER value means "closer" (a similarity, despite the name).

    Returns:
        The winning genre, or None when no candidate document matched.
    """
    # `$in` takes an array, so materialise the tags once; this also lets
    # callers pass any iterable.
    query_tags = list(query_tags)
    videos = db.videos.find({'tags': {'$in': query_tags},
                             'genre': {'$ne': 'Unknow'}})
    query = set(query_tags)

    # Stream (score, genre) pairs straight into nlargest() instead of first
    # building a list of every candidate.
    scored = ((dist(query, set(video['tags'])), video['genre'])
              for video in videos)
    nearest = heapq.nlargest(k, scored)

    if not nearest:
        # Nothing matched: there is no vote to hold.
        return None
    return most_common([genre for _score, genre in nearest])
def test_knn():
    """Smoke test: print the genre knn() predicts for a sample tag list."""
    query_tags = ['shower', 'morning', 'toy', 'sexy']
    # The function is named `knn`; the former `kNN` spelling raised NameError.
    print(knn(query_tags))
    # Optional wider check: compare stored and predicted genres.
    # queries = db.videos.find({'genre': {'$ne': 'Unknow'}}).limit(100)
    # for query in queries:
    #     print('%s %s' % (query['genre'], knn(query['tags'])))


# Script entry point; must match the function name defined above.
test_knn()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment