Skip to content

Instantly share code, notes, and snippets.

@madrugado
Last active October 22, 2015 21:21
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save madrugado/9263c77ff9481f387149 to your computer and use it in GitHub Desktop.
Save madrugado/9263c77ff9481f387149 to your computer and use it in GitHub Desktop.
We have a dataset of requests and of vacancy titles and descriptions. We need to find similar queries and jobs.
#!/usr/bin/python
# -*- coding: utf-8 -*-
__author__ = 'madrugado'
import sys
import pymystem3
import itertools
from collections import Counter
import math
# Punctuation characters deleted from text before lemmatisation
# (maps Unicode code point -> None for use with ``translate``).
_PUNCTUATION_TABLE = dict.fromkeys(map(ord, u".,()-"))


def _to_text(value):
    """Return *value* as Unicode text; UTF-8 bytes are decoded, invalid bytes dropped."""
    if isinstance(value, bytes):
        return value.decode('utf8', 'ignore')
    return value


def _clean_text(lemmatizer, text):
    """Normalise *text* into a space-separated string of lemmas.

    Punctuation from ``_PUNCTUATION_TABLE`` is removed character by character
    (the original ``replace(".,()-", '')`` only matched that literal
    five-character substring), the text is lower-cased and lemmatised, and
    tokens that are exactly one character long or equal to ``'or'`` are
    discarded.
    """
    prepared = _to_text(text).translate(_PUNCTUATION_TABLE).lower()
    if str is bytes:
        # Python 2: hand the lemmatizer UTF-8 bytes, as the original script did.
        prepared = prepared.encode('utf8')
    # The last element is dropped, as in the original script (presumably the
    # trailing newline token that Mystem appends to its output).
    lemmas = lemmatizer.lemmatize(prepared)[:-1]
    return u" ".join(tok for tok in lemmas if len(tok) != 1 and tok != u'or')


def _parse_queries(lines, lemmatizer):
    """Group cleaned queries by user id.

    Each line is either ``<uid>`` (empty query) or ``<query>\\t<uid>`` where
    the uid may be the literal ``NULL`` (stored under the key ``None``).
    Blank lines and lines with any other number of fields are skipped instead
    of crashing or silently reusing the previous line's values.
    """
    queries = {}
    for raw_line in lines:
        parts = _to_text(raw_line).strip().split(u"\t")
        if parts == [u""]:
            continue  # blank line
        if len(parts) == 1:
            query, uid = u"", int(parts[0])
        elif len(parts) == 2:
            query = parts[0]
            uid = None if parts[1] == u"NULL" else int(parts[1])
        else:
            continue  # unexpected number of fields
        queries.setdefault(uid, []).append(_clean_text(lemmatizer, query))
    return queries


def _parse_vacancies(lines, lemmatizer):
    """Group ``(vacancy_id, cleaned_text)`` pairs by user id.

    Each line is ``<vacancy_id>\\t<uid>[\\t<text>]``; a missing text field is
    treated as an empty string. Blank lines are skipped.
    """
    vacancies = {}
    for raw_line in lines:
        parts = _to_text(raw_line).strip().split(u"\t")
        if parts == [u""]:
            continue  # blank line
        vacancy_id = int(parts[0])
        uid = int(parts[1])
        text = parts[2] if len(parts) >= 3 else u""
        vacancies.setdefault(uid, []).append(
            (vacancy_id, _clean_text(lemmatizer, text)))
    return vacancies


def _rank_queries(clean_query, all_queries, token_counts, unique_count, limit=5):
    """Return the *limit* best ``(query, score)`` pairs for *clean_query*.

    The score is the number of shared tokens multiplied by
    ``log(unique_count / total corpus frequency of the tokens in the union)``.
    Ties keep the order of *all_queries*.
    """
    # Loop-invariant: tokenise the input query once, not once per candidate.
    query_tokens = set(clean_query.split(u' '))
    scored = []
    for candidate in all_queries:
        candidate_tokens = set(candidate.split(u' '))
        overlap = len(candidate_tokens & query_tokens)
        # Every candidate token was counted while building token_counts, so
        # this sum is at least 1 and the logarithm is always defined.
        mass = sum(token_counts[tok] for tok in candidate_tokens | query_tokens)
        scored.append((candidate, overlap * math.log(float(unique_count) / mass)))
    return sorted(scored, key=lambda pair: pair[1], reverse=True)[:limit]


def main(argv=None, lemmatizer=None, stdin=None):
    """Load queries and vacancies, then rank stored queries against stdin input.

    Reads up to 10 lines from *stdin*; for each one prints the five stored
    queries with the highest similarity score as a list of
    ``(query, score)`` tuples. Reading stops early at end of input.

    Args:
        argv: Argument list laid out like ``sys.argv`` (``argv[1]`` is the
            queries file, ``argv[2]`` the vacancies file). Defaults to
            ``sys.argv``.
        lemmatizer: Object with a ``lemmatize(text)`` method returning a list
            of tokens. Defaults to a new ``pymystem3.Mystem()``.
        stdin: Stream providing ``readline()``. Defaults to ``sys.stdin``.

    Returns:
        A list with one entry per processed input line, each entry being the
        printed top-5 list.
    """
    if argv is None:
        argv = sys.argv
    if lemmatizer is None:
        lemmatizer = pymystem3.Mystem()
    if stdin is None:
        stdin = sys.stdin

    # Binary mode plus explicit UTF-8 decoding behaves the same on Python 2
    # and 3 and does not depend on the locale's default encoding.
    with open(argv[1], 'rb') as q_file, open(argv[2], 'rb') as v_file:
        queries = _parse_queries(q_file, lemmatizer)
        # NOTE: vacancies are parsed (as in the original script) but are not
        # used by the ranking below yet.
        vacancies = _parse_vacancies(v_file, lemmatizer)

    all_q = [item for uid in queries for item in queries[uid]]
    uniq_q = set(all_q)
    tokens = Counter(tok for query in all_q for tok in query.split(u' '))

    # TODO: add interesting query
    results = []
    for _ in range(10):
        line = stdin.readline()
        if not line:
            break  # EOF: do not keep ranking empty queries
        clean_query = _clean_text(lemmatizer, line.strip())
        top = _rank_queries(clean_query, all_q, tokens, len(uniq_q))
        # Parenthesised single argument prints identically on Python 2 and 3.
        print(top)
        results.append(top)
    return results
# Script entry point: run the interactive search only when this file is
# executed directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment