Last active
May 10, 2016 03:17
-
-
Save PseudoSky/3afa3170fc522a4084e403f8d2deaa13 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import * | |
import numpy as np | |
from mrjob.job import MRJob | |
from mrjob.step import MRStep | |
from mrjob.protocol import JSONValueProtocol | |
def dist(p1, p2):
    """Count the items two collections have in common.

    Returns a two-element list ``[count, shared_items]``.  When either
    collection is empty the result is the sentinel ``[-1, []]``.  A list
    (not a tuple) is returned because callers adjust the count in place.
    """
    if not len(p1) or not len(p2):
        return [-1, []]
    shared = set(p1) & set(p2)
    return [len(shared), list(shared)]
# Maximum number of recommendations kept (and scored) per user.
REC_COUNT = 20
# Yield label for each project need, project | |
# Yield label for each user, user | |
class MAPR(MRJob):
    """Recommend users for projects (and projects for users) and score the result.

    Step 1 groups users and project needs by label and scores every
    user/project pair inside a label.  Step 2 regroups those scores per user
    and per project and counts how many recommendations coincide with the
    associations already present in the data.  Step 3 prints aggregate
    statistics.

    NOTE(review): the progress ``print`` calls write to stdout.  That is
    harmless with mrjob's inline runner but presumably mixes with task output
    on runners that read a task's stdout -- confirm before using another
    runner.  Each call prints one pre-formatted string so the text matches
    the original Python 2 ``print`` statements.
    """

    OUTPUT_PROTOCOL = JSONValueProtocol
    INPUT_PROTOCOL = JSONValueProtocol

    # Maps the single character at position [-6:-5] of a contributor's second
    # field to the collaborator label added to the project's needs.
    LABEL_BY_CODE = {
        "1": 'Developers',
        "2": 'Designers',
        "3": "Artists",
        "4": "Writers",
        "5": "Scientists",
        "6": "Musicians",
        "7": "Product Managers",
        "8": "Filmmakers",
        "9": "Engineers",
    }

    @staticmethod
    def _region(location):
        """Return the stripped text after the first comma, or None if absent."""
        if not location:
            return None
        parts = location.split(',')
        if len(parts) < 2:
            return None
        return parts[1].strip()

    def mapper(self, _, line):
        """Emit ``(label, record)`` pairs for one input value.

        ``line`` is a decoded JSON list holding either person records or
        project records; the first element's ``_id`` decides which.

        * Persons with at least one project yield ``label -> type 1 record``.
        * Projects yield one ``type 0`` record per need, keyed by the need
          with its last character removed (e.g. "Developers" -> "Developer").
        """
        if not line:
            # Nothing to inspect; the original crashed on line[0] here.
            return
        if 'person' in line[0]['_id']:
            print("PERSON")
            for u in line:
                if len(u['projects']) > 0:
                    yield u['label'], {
                        "type": 1,
                        'id': u['_id'],
                        'skills': u['skills'],
                        'interests': u['interests'],
                        'loc': u['location'],
                        "refs": u['projects'],
                    }
        else:
            print('PROJECT')
            for p in line:
                # Existing contributors imply a need for their kind of role.
                for col in p['contributors']:
                    colab_lbl = self.LABEL_BY_CODE[col[1][-6:-5]]
                    if colab_lbl not in p['needs']:
                        p['needs'].append(colab_lbl)
                # Known associations: the contributors, or the owner if none.
                # Built as a list so it can be serialised and reused.
                if len(p['contributors']) > 0:
                    ass = [contributor[0] for contributor in p['contributors']]
                else:
                    ass = [p['owner']]
                for need in p['needs']:
                    yield need[:-1], {
                        "type": 0,
                        'id': p['_id'],
                        'tags': p['tags'],
                        "associated": ass,
                        'loc': p['locations'],
                    }

    def label_reducer(self, key, values):
        """Score every user against every project that share the label ``key``.

        Yields ``user id -> match record`` for each positive match and
        ``project id -> recommendation summary`` once per project.
        """
        records = list(values)
        # Real lists: ``users`` is walked once per project.
        projects = [r for r in records if r["type"] == 0]
        users = [r for r in records if r["type"] == 1]
        print("\n\nKEY:  %s \n" % (key,))
        for p in projects:
            # Computed once per project instead of once per user.
            project_regions = set(
                region
                for region in (self._region(loc) for loc in p['loc'])
                if region is not None
            )
            reccs = {"label": key, "total": len(p['associated']), "users": [], "hits": 0}
            for u in users:
                d = dist(p['tags'], u['interests'] + u['skills'])
                user_region = self._region(u['loc'])
                # A shared region is worth two extra points.
                if user_region is not None and user_region in project_regions:
                    d[0] += 2
                if d[0] > 0:
                    if u['id'] in p['associated']:
                        reccs['hits'] += 1
                    reccs['users'].append({'id': u['id'], 'match': d[0], 'same': d[1]})
                    yield u['id'], {'id': p['id'], 'match': d[0], 'same': d[1], 'refs': u['refs']}
            # Best matches first.
            reccs['users'] = sorted(reccs['users'], key=lambda k: -k['match'])
            yield p['id'], reccs

    def project_reducer(self, key, values):
        """Summarise the matches collected for one user or one project.

        Keys containing ``/person/`` are users: the best ``REC_COUNT``
        matches are kept and compared with the user's own project list.
        All other keys are projects: the per-label hit counts are summed.
        Yields under key ``2`` for users and ``1`` for projects.
        """
        l = list(values)
        if '/person/' in key:
            # The original discarded the result of sorted(); keep it, best
            # match first, so both the cut-off and the preview are the top ones.
            l = sorted(l, key=lambda k: k['match'], reverse=True)[:REC_COUNT]
            refs = l[0]['refs']
            hits = 0
            score = 0
            tots = 0  # always bound, even when the user has no references
            if len(refs) > 0:
                hits = len(set(refs).intersection(rec["id"] for rec in l))
                # At most REC_COUNT references can be hit by REC_COUNT picks.
                tots = min(len(refs), REC_COUNT)
                score = hits / (tots * 1.)
            print({'user': key, "projects": l[:3]})
            yield 2, {"user": key, "projects": l[:3], "hits": hits,
                      "score": score, "total": tots, "refs": refs}
        else:
            hits = sum(rec['hits'] for rec in l)
            score = hits / (l[0]['total'] * 1.)
            print({'project': key, "positions": l[:3]})
            yield 1, {"project": key, "positions": l[:3], "hits": hits,
                      "score": score, "total": l[0]['total']}

    def assess(self, key, values):
        """Print per-record and aggregate hit statistics; yields nothing.

        ``key`` is ``1`` for project summaries and anything else for user
        summaries, as produced by ``project_reducer``.
        """
        count = 0.
        hits = 0.
        associated = 0.
        score = []
        # Both record kinds share a layout apart from the name field.
        name_field = 'project' if key == 1 else 'user'
        for rec in values:
            print("\n%s:  %s" % (name_field, rec[name_field]))
            print("associated:  %s" % (rec['total'],))
            print("hits:  %s" % (rec['hits'],))
            print("score:  %s" % (rec['score'],))
            count += 1
            hits += rec['hits']
            if rec['score'] > 0:
                score += [rec['score']]
            associated += rec['total']
        print("\n\nRunning score\n")
        print("count:  %s" % (count,))
        print("associated:  %s" % (associated,))
        print("total hits:  %s" % (hits,))
        print("\n\nAveraged Per Record: \n")
        # Guard the empty / zero cases instead of emitting nan or crashing.
        mean_score = np.mean(np.array(score)) if score else 0.0
        print("proportion hit:  %s" % (mean_score,))
        print("associated:  %s" % (associated / (count * 1.) if count else 0.0,))
        print("hit proportion:  %s" % (hits / (associated * 1.) if associated else 0.0,))

    def steps(self):
        """Return the three steps in execution order."""
        return [
            MRStep(mapper=self.mapper, reducer=self.label_reducer),
            MRStep(reducer=self.project_reducer),
            MRStep(reducer=self.assess),
        ]
# Run the job only when this file is executed as a script.
if __name__ == '__main__':
    MAPR.run()
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import * | |
import re,string | |
import operator | |
from pprint import pprint as pp | |
import random | |
# Load the person and project records from the parent directory.
datUsers = read_json('./../person.json')
datProjects = read_json('./../project.json')
# The frames keep their default index on purpose.  The original called
# set_index(['_id']) on both and threw the result away (set_index returns a
# new frame), so it never had any effect -- and runEuc reads the "_id"
# column, which an applied set_index would have removed.
# Words to take out (didn't seem useful / broke normal fmt).
# Each entry is a regex fragment; raw strings keep the backslash escapes
# literal without relying on Python passing unknown escapes through.
stopwords = [
    r'\?', r'\(', r'\)',
    'Framework',
    'Foundation$', 'Foundation ',
    'Architect$', 'Architect ',
    'Developer$', 'Developer ',
    'Development$', 'Development ',
]
# One alternation matching any stopword, compiled once at import time.
sp = '|'.join(stopwords)
pattern = re.compile(sp)
# Remove all stopwords | |
def removem(s, arr):
    """Delete every stopword match from ``s`` and lower-case the remainder.

    ``arr`` is accepted but not used; the module-level ``pattern`` decides
    what gets removed.
    """
    stripped = pattern.sub('', s)
    return stripped.lower()
def euclidean_distance(prefsUser, prefsProj, p1, p2):
    """Return the overlap ratio between a user's items and a project's items.

    Despite the name this is not a Euclidean distance: it is the number of
    shared items divided by the mean length of the two item lists, so larger
    means more similar.

    Args:
        prefsUser: mapping from user key to that user's items.
        prefsProj: mapping from project key to that project's items.
        p1: key into ``prefsUser``.
        p2: key into ``prefsProj``.

    Returns:
        ``(ratio, shared_items)`` where ``shared_items`` is a set, or the
        sentinel ``(-1, [])`` when either item list is empty.
    """
    user_items = prefsUser[p1]
    proj_items = prefsProj[p2]
    if len(user_items) == 0 or len(proj_items) == 0:
        return -1, []
    same = set(user_items).intersection(proj_items)
    mean_size = (len(user_items) + len(proj_items)) / 2.0
    return len(same) / mean_size, same
def runEuc(n, k, column):
    """Rank projects for ``n`` randomly sampled users.

    Each sampled user's ``column`` items are compared with every project's
    ``tags`` via ``euclidean_distance``; projects with a positive score are
    kept and the ``k`` best are returned, best first.

    Args:
        n: number of users to sample from ``datUsers``.
        k: number of projects to keep per user.
        column: name of the ``datUsers`` column holding the user's items.

    Returns:
        dict mapping user ``_id`` to a list of ``(project _id, score)``
        pairs in descending score order.

    Raises:
        ValueError: from ``random.sample`` if ``n`` exceeds the user count.
    """
    dic = {}
    # random.sample needs a real sequence, so materialise the index first.
    users = random.sample(list(datUsers.index), n)
    # Column lookups do not change inside the loops.
    user_prefs = datUsers[column]
    project_tags = datProjects["tags"]
    # Every sampled user is processed.  The original kept a counter that
    # returned just before handling the n-th user, dropping one result.
    for ix in users:
        index = datUsers["_id"][ix]
        scores = {}
        for idx2, p2 in datProjects.iterrows():
            p, _same = euclidean_distance(user_prefs, project_tags, ix, idx2)
            if p > 0:
                scores[p2['_id']] = p
        ranked = sorted(scores.items(), key=operator.itemgetter(1))
        # ranked[-0:] would be the whole list, so handle k <= 0 explicitly.
        dic[index] = ranked[-k:][::-1] if k > 0 else []
    return dic
# Demo run: pretty-print the 3 best "interests" matches for 10 sampled users.
pp(
    runEuc(10, 3, "interests")
)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import * | |
import numpy as np | |
from mrjob.job import MRJob | |
from mrjob.step import MRStep | |
from mrjob.protocol import JSONValueProtocol | |
# Maximum number of recommended users kept per user.
REC_COUNT = 3
def dist(user_a, user_b):
    """Score how well ``user_b`` complements ``user_a``.

    The score is the number of interests the two users share plus the number
    of skills ``user_b`` has that ``user_a`` lacks.

    NOTE(review): the original body referenced undefined names (``p1``,
    ``p2``, ``user_a_interests``, ``user_a_skills``, ``same``) and raised
    NameError on any call.  This version is reconstructed from the local
    names it used; the empty-input guard is assumed to refer to the
    interests, and the second return value to the shared interests --
    confirm.

    Args:
        user_a: mapping with ``"interests"`` and ``"skills"`` collections.
        user_b: mapping with ``"interests"`` and ``"skills"`` collections.

    Returns:
        ``(score, shared_interests)``, or ``(-1, [])`` when either user has
        no interests.
    """
    interests_a = user_a["interests"]
    interests_b = user_b["interests"]
    if len(interests_a) == 0 or len(interests_b) == 0:
        return -1, []
    similar_interests = set(interests_a).intersection(interests_b)
    # Skills the other user brings that this user does not already have.
    different_skills = set(user_b["skills"]) - set(user_a["skills"])
    return len(similar_interests) + len(different_skills), list(similar_interests)
def dist_measure(skills, interests, loc_same, edu, group):
    """Combine the match components into a single weighted score.

    Each skill counts 10 and each interest 7; that subtotal is multiplied by
    ``loc_same``.  Education and group overlaps each add 5 per entry on top
    of a base of 5.
    """
    overlap_points = 10 * len(skills) + 7 * len(interests)
    edu_points = 5 * (len(edu) + 1)
    group_points = 5 * (len(group) + 1)
    return overlap_points * loc_same + edu_points + group_points
def inner(p1, p2):
    """Return the items present in both ``p1`` and ``p2`` as a list.

    An empty list is returned when either input is empty.  The original
    returned the tuple ``(-1, [])`` in that case, which callers that take
    ``len()`` of the result counted as two shared items.
    """
    if len(p1) == 0 or len(p2) == 0:
        return []
    return list(set(p1).intersection(p2))
def outer(p1, p2):
    """Return the items of ``p2`` that are absent from ``p1`` as a list.

    An empty list is returned when either input is empty.  The original
    returned the tuple ``(-1, [])`` in that case, which callers that take
    ``len()`` of the result counted as two differing items.
    """
    if len(p1) == 0 or len(p2) == 0:
        return []
    return list(set(p2) - set(p1))
class MAPR(MRJob):
    """Recommend users to each other from shared interests and differing skills.

    Step 1 groups users by interest and emits one record per pair of users
    sharing it.  Step 2 scores each pair in both directions.  Step 3 keeps
    the ``REC_COUNT`` best-scoring partners per user.

    NOTE(review): the progress ``print`` writes to stdout.  That is harmless
    with mrjob's inline runner but presumably mixes with task output on
    runners that read a task's stdout -- confirm before using another runner.
    """

    OUTPUT_PROTOCOL = JSONValueProtocol
    INPUT_PROTOCOL = JSONValueProtocol

    # Only this many leading records of each input list are processed.
    MAX_USERS = 5000

    @staticmethod
    def _region(location):
        """Return the stripped text after the first comma, or None if absent."""
        if not location:
            return None
        parts = location.split(',')
        if len(parts) < 2:
            return None
        return parts[1].strip()

    def mapper(self, _, line):
        """Yield ``interest -> user record`` for every interest of every user.

        ``line`` is a decoded JSON list of user records.
        """
        print("PERSON")
        for u in line[:self.MAX_USERS]:
            # Parsed once per user; None when the location has no comma
            # (the original raised IndexError on such records).
            region = self._region(u['location'])
            for interest in u['interests']:
                yield interest, {
                    'id': u['_id'],
                    'type': 'interest',
                    "skills": u['skills'],
                    'loc': region,
                    "groups": u["groups"],
                    "education": u["education"],
                }

    def label_reducer(self, key, values):
        """Emit one comparison record per pair of users sharing interest ``key``.

        The pair key is ``"<id a> -> <id b>"`` with the smaller id first, so
        the same two users always land under one key no matter which order
        they arrive in for different interests.
        """
        records = list(values)
        for pos, first in enumerate(records):
            for second in records[pos + 1:]:
                if first['id'] == second['id']:
                    # Same user listed twice under this interest.
                    continue
                if first['id'] <= second['id']:
                    a, b = first, second
                else:
                    a, b = second, first
                # Location multiplier: 2 when both regions are known and equal.
                ls = 2 if (a['loc'] is not None and a['loc'] == b['loc']) else 1
                d = outer(a["skills"], b["skills"])    # skills b has, a lacks
                d2 = outer(b["skills"], a["skills"])   # skills a has, b lacks
                g = inner(a["groups"], b["groups"])
                edu = inner(a["education"], b["education"])
                yield (a['id'] + ' -> ' + b['id']), {
                    "interest": key, "skills": d, "d2": d2,
                    'loc': ls, 'edu': edu, 'groups': g,
                }

    def skill_reducer(self, key, values):
        """Score one user pair in both directions.

        ``values`` holds one record per shared interest.  The whole list is
        passed as ``dist_measure``'s first argument, so its length (the
        number of shared interests) gets the weight of 10; the differing
        skills get the weight of 7.
        """
        l = list(values)
        (u1, u2) = key.split(' -> ')
        first = l[0]
        d = dist_measure(l, first["skills"], first["loc"], first["edu"], first["groups"])
        d2 = dist_measure(l, first["d2"], first["loc"], first["edu"], first["groups"])
        yield u1, {"id": u2, "match": d}
        yield u2, {"id": u1, "match": d2}

    def user_reducer(self, key, values):
        """Keep the ``REC_COUNT`` highest-scoring partners, in ascending order."""
        l = sorted(values, key=lambda k: k['match'])
        if len(l) > REC_COUNT:
            l = l[-REC_COUNT:]
        yield key, l

    def steps(self):
        """Return the three steps in execution order."""
        return [
            MRStep(mapper=self.mapper, reducer=self.label_reducer),
            MRStep(reducer=self.skill_reducer),
            MRStep(reducer=self.user_reducer),
        ]
# Run the job only when this file is executed as a script.
if __name__ == '__main__':
    MAPR.run()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment