Skip to content

Instantly share code, notes, and snippets.

@PseudoSky
Last active May 10, 2016 03:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PseudoSky/3afa3170fc522a4084e403f8d2deaa13 to your computer and use it in GitHub Desktop.
Save PseudoSky/3afa3170fc522a4084e403f8d2deaa13 to your computer and use it in GitHub Desktop.
from pandas import *
import numpy as np
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import JSONValueProtocol
def dist( p1, p2 ):
    """Return [count, shared] where shared holds the items common to both.

    The sentinel [-1, []] marks an empty input on either side (no basis
    for comparison).  Returned as a mutable list so callers can bump the
    count in place (the location bonus in label_reducer does exactly that).
    """
    if not p1 or not p2:
        return [-1, []]
    shared = set(p1) & set(p2)
    return [len(shared), list(shared)]
# Maximum number of recommendations retained per user in project_reducer.
REC_COUNT=20
# Yield label for each project need, project
# Yield label for each user, user
class MAPR(MRJob):
    """Three-step mrjob pipeline recommending users to projects and back.

    Step 1: mapper joins person and project records on a shared role label,
    label_reducer scores each user/project pairing.  Step 2: project_reducer
    ranks the matches per user and per project.  Step 3: assess prints
    aggregate hit-rate statistics.
    """
    # One JSON value per input/output line.
    OUTPUT_PROTOCOL = JSONValueProtocol
    INPUT_PROTOCOL = JSONValueProtocol

    def mapper(self, _, line):
        # `line` is a decoded JSON array of records; the first element's _id
        # tells us whether this input file holds people or projects.
        if 'person' in line[0]['_id']:
            print "PERSON"
            for u in line:
                # Skip users with no projects: they give no reference data.
                if len(u['projects'])>0:
                    # type 1 == user record, keyed by the user's role label.
                    yield u['label'], {"type":1,'id':u['_id'],'skills':u['skills'],'interests':u['interests'],'loc':u['location'],"refs":u['projects']}
        else:
            print 'PROJECT'
            # Single-digit role code (pulled out of a contributor id below)
            # -> human-readable role label used in project "needs".
            lbl_map={
            "1":'Developers',
            "2":'Designers',
            "3":"Artists",
            "4":"Writers",
            "5":"Scientists",
            "6":"Musicians",
            "7":"Product Managers",
            "8":"Filmmakers",
            "9":"Engineers"
            }
            for p in line:
                for col in p['contributors']:
                    # col[1][-6:-5] extracts the one-character role code from
                    # the contributor id; register that role as a need.
                    colab_lbl=lbl_map[col[1][-6:-5]]
                    if colab_lbl not in p['needs']:
                        p['needs'].append(colab_lbl)
                # Associated users: the contributors if any, else the owner.
                if len(p['contributors'])>0:
                    ass=map(lambda e: e[0],p['contributors'])
                else: ass=[p['owner']]
                for need in p['needs']:
                    # type 0 == project record.  need[:-1] presumably strips a
                    # trailing character so needs line up with user labels --
                    # TODO confirm against the input data.
                    yield need[:-1], {"type":0,'id':p['_id'],'tags':p['tags'],"associated":ass,'loc':p['locations']}

    # Yield project id, list of user distances
    def label_reducer(self, key, values):
        # All users and projects sharing this role label arrive together.
        l = list(values)
        projects=filter(lambda e: e["type"]==0, l)
        users=filter(lambda e: e["type"]==1, l)
        print "\n\nKEY: ",key,"\n"
        for p in projects:
            reccs={"label":key,"total":len(p['associated']),"users":[],"hits":0}
            for u in users:
                # d is [count, shared_items]: overlap between project tags
                # and the user's combined interests + skills.
                d=dist(p['tags'],u['interests']+u['skills'])
                # +2 bonus when the user's region (2nd comma field of the
                # location string) matches one of the project's locations.
                # NOTE(review): the lambda parameter `l` shadows the outer
                # record list `l` -- harmless here, but confusing.
                if u['loc'].split(',')[1].strip() in map(lambda l: l.split(',')[1].strip(),p['loc']): d[0]+=2
                if d[0]>0:
                    # A "hit" = we recommended a user already associated with
                    # the project (ground truth for the accuracy score later).
                    if u['id'] in p['associated']: reccs['hits']+=1
                    reccs['users'].append({'id':u['id'],'match':d[0],'same':d[1]})
                    yield u['id'],{'id':p['id'],'match':d[0],'same':d[1],'refs':u['refs']}
            # Best matches first.
            reccs['users']=sorted(reccs['users'], key=lambda k: -k['match'])
            yield p['id'],reccs

    def project_reducer(self,key,values):
        l=list(values)
        if '/person/' in key:
            # NOTE(review): sorted() returns a new list and its result is
            # discarded here, so `l` is NOT actually sorted by match before
            # the tail slice below.
            sorted(l, key=lambda k: k['match'])
            # Keep only the REC_COUNT best (assuming the sort had happened).
            if len(l)>REC_COUNT: l=l[-REC_COUNT:]
            hits=0
            score=0
            # Every record for this user carries the same refs list.
            if len(l[0]['refs'])>0:
                # hits = how many of the user's actual projects appear among
                # the recommendations.
                hits=len(set(l[0]['refs']).intersection(set(map(lambda x: x["id"],l))))
                tots=len(l[0]['refs'])
                if len(l[0]['refs'])>REC_COUNT:tots=REC_COUNT
                score=hits/(tots*1.)
            # NOTE(review): if refs is empty, `tots` is never assigned and
            # the yield below raises NameError -- TODO confirm the intended
            # fallback.
            print {'user':key,"projects":l[:3]}
            yield 2,{"user":key,"projects":l[:3],"hits":hits,"score":score,"total":tots,"refs":l[0]['refs']}
        else:
            # Project key: accumulate hits over all of its need-labels.
            hits=0
            for i in l:
                hits+=i['hits']
            score=hits/(l[0]['total']*1.)
            print {'project':key,"positions":l[:3]}
            yield 1,{"project":key,"positions":l[:3],"hits":hits,"score":score,"total":l[0]['total']}

    def assess(self,key,values):
        # Final step: print evaluation stats for projects (key==1) or
        # users (key==2); emits nothing downstream.
        count=0.
        hits=0.
        associated=0.
        score=[]
        if key==1:
            for proj in values:
                print "\nproject: ",proj['project']
                print "associated: ",proj['total']
                print "hits: ",proj['hits']
                print "score: ",proj['score']
                count+=1
                hits+=proj['hits']
                if proj['score']>0: score+=[proj['score']]
                associated+=proj['total']
        else:
            for user in values:
                print "\nuser: ",user['user']
                print "associated: ",user['total']
                print "hits: ",user['hits']
                print "score: ",user['score']
                count+=1
                hits+=user['hits']
                if user['score']>0: score+=[user['score']]
                associated+=user['total']
        # Summary over all records of this key.  NOTE(review): indentation was
        # lost in this source; the summary is placed after both branches on
        # the assumption it is a per-key total -- confirm against original.
        print "\n\nRunning score\n"
        print "count: ",(count)
        print "associated: ",(associated)
        print "total hits: ",(hits)
        print "\n\nAveraged Per Record: \n"
        print "proportion hit: ",np.mean(np.array(score) )
        print "associated: ", (associated/(count*1.) )
        print "hit proportion: ",(hits/(associated*1.))

    def steps(self):
        # Pipeline: mapper -> label_reducer -> project_reducer -> assess.
        return [MRStep(mapper=self.mapper,
                       reducer=self.label_reducer),
                MRStep(reducer=self.project_reducer),
                MRStep(reducer=self.assess)]
if __name__ == '__main__':
    # mrjob entry point: parses CLI args and runs the configured steps.
    MAPR.run()
from pandas import *
import re,string
import operator
from pprint import pprint as pp
import random
# Load the raw person and project dumps (paths relative to the script's
# parent directory).
datUsers=read_json('./../person.json')
datProjects=read_json('./../project.json')
# NOTE(review): DataFrame.set_index returns a NEW frame; without assignment
# (or inplace=True) these two calls have no effect.  Later code indexes
# datUsers["_id"] positionally, which in fact relies on them being no-ops.
datUsers.set_index(['_id'])
datProjects.set_index(['_id'])
# Words to take out (didn't seem useful / broke normal fmt)
stopwords=['\?','\(','\)','Framework','Foundation$','Foundation ','Architect$','Architect ','Developer$','Developer ','Development$','Development ']
# Make regex for finding all the stopwords, delimited by ors "|"
sp='|'.join(stopwords)
pattern=re.compile(sp)
# Remove all stopwords
def removem(s,arr):
    """Strip every stopword match from *s* and lowercase the result.

    Matching uses the module-level compiled ``pattern``; the *arr*
    argument is kept for call compatibility but is never read.
    """
    return pattern.sub('', s).lower()
def euclidean_distance( prefsUser, prefsProj, p1, p2 ):
    """Similarity between user p1's preference list and project p2's tags.

    prefsUser / prefsProj are mappings (e.g. pandas Series of lists) indexed
    by p1 / p2.  Returns (score, shared) where score is |intersection|
    divided by the mean of the two list lengths, and shared is the set of
    common items; (-1, []) when either list is empty.

    Fix: the original also computed a right set-difference into an unused
    local `diff`; that dead code is removed.
    """
    user_items = prefsUser[p1]
    proj_items = prefsProj[p2]
    if len(user_items) == 0 or len(proj_items) == 0:
        return -1, []
    same = set(user_items).intersection(set(proj_items))
    # Normalize the overlap by the average list length so long lists do not
    # dominate purely by size.
    return (len(same) / ((len(user_items) + len(proj_items)) / 2.0)), same
def runEuc(n,k,column):
    """For n randomly sampled users, rank projects by overlap between the
    user's `column` list (e.g. "interests") and each project's tags.

    Returns {user_id: [(project_id, score), ...]} with best matches first,
    keeping at most k projects per user.
    """
    dic = {}
    count = 0
    # Sample n row labels from the users frame.
    users = random.sample(datUsers.index,n)
    for ix in users:
        index= datUsers["_id"][ix]
        count += 1
        # NOTE(review): off-by-one -- this early return fires BEFORE the
        # nth sampled user is scored, so only n-1 users appear in `dic`.
        if count == n:
            return dic
        dic[index] = {}
        for idx2,p2 in datProjects.iterrows():
            (p,same) = euclidean_distance(datUsers[column], datProjects["tags"], ix, idx2)
            # Keep only projects with a positive overlap score.
            if p > 0:
                dic[index][p2['_id']]=p
        # Sort ascending by score, keep the k largest, then reverse so the
        # best match comes first.
        dic[index] = sorted(dic[index].items(), key=operator.itemgetter(1))[-k:]
        dic[index] = dic[index][::-1]
    return dic
# Demo: 10 sampled users, top 3 projects each, matched on interests.
pp(runEuc(10,3,"interests"))
from pandas import *
import numpy as np
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import JSONValueProtocol
REC_COUNT=3
def dist( user_a, user_b ):
    """Score how compatible two user records are for pairing.

    score = |shared interests| + |skills user_b has that user_a lacks|
    (complementary skills count as a plus).  Returns
    (score, shared_interests_list), or the sentinel (-1, []) when either
    user lists no interests.

    Each argument is a dict with at least "interests" and "skills" lists.
    Fix: the original body referenced undefined names (p1, p2,
    user_a_interests, user_a_skills, same) and raised NameError on every
    call; this restores the evident intent.  Note this helper is not wired
    into the MRJob steps below.
    """
    a_interests = user_a["interests"]
    b_interests = user_b["interests"]
    # Empty-interest guard mirrors the other helpers: no basis to compare.
    if len(a_interests) == 0 or len(b_interests) == 0:
        return -1, []
    similar_interests = set(a_interests).intersection(b_interests)
    different_skills = set(user_b["skills"]) - set(user_a["skills"])
    return len(similar_interests) + len(different_skills), list(similar_interests)
def dist_measure( skills, interests, loc_same, edu, group ):
    """Weighted affinity score for a user pair.

    Skills weigh 10x and interests 7x, scaled by the location multiplier
    `loc_same`; shared education and groups each add a flat 5 * (count + 1).
    """
    overlap_term = 10 * len(skills) + 7 * len(interests)
    edu_bonus = 5 * (len(edu) + 1)
    group_bonus = 5 * (len(group) + 1)
    return overlap_term * loc_same + edu_bonus + group_bonus
def inner( p1, p2 ):
    """Return the items present in both sequences, as a list.

    Fix: the original returned the tuple ``(-1, [])`` for empty input while
    the normal path returned a list; callers feed the result straight into
    ``len`` (via dist_measure), so the sentinel silently counted as two
    shared items.  An empty input now simply yields the empty intersection,
    which is what set arithmetic produces anyway.
    """
    return list(set(p1).intersection(set(p2)))
def outer( p1, p2 ):
    """Return the items of p2 that are not in p1 (right difference), as a list.

    Fix: the original returned the tuple ``(-1, [])`` when EITHER input was
    empty -- an inconsistent type that callers ``len()``, and a wrong answer
    for empty p1 (where the difference is all of p2).  Plain set arithmetic
    handles both edge cases correctly.
    """
    return list(set(p2) - set(p1))
class MAPR(MRJob):
    """Three-step mrjob pipeline recommending similar users to each other:
    pair people who share an interest, score each pair on complementary
    skills / shared groups / education / location, keep each user's top
    REC_COUNT matches.
    """
    # One JSON value per input/output line.
    OUTPUT_PROTOCOL = JSONValueProtocol
    INPUT_PROTOCOL = JSONValueProtocol

    def mapper(self, _, line):
        # `line` is a decoded JSON array of person records; cap at 5000 to
        # bound the O(n^2) pairing done in label_reducer.
        print "PERSON"
        for u in line[:5000]:
            for interest in u['interests']:
                # Key by interest so users who share it meet in one reducer.
                # loc keeps only the 2nd comma field (region) of the address.
                yield interest, {'id':u['_id'],'type':'interest',"skills":u['skills'],'loc':u['location'].split(',')[1].strip(),"groups":u["groups"],"education":u["education"]}

    # Yield project id, list of user distances
    def label_reducer(self, key, values):
        l = list(values)
        # Compare every unordered pair of users sharing this interest.
        for e in range(len(l)):
            for i in l[e+1:]:
                # Location multiplier: 2 when both users share a region.
                ls=1
                if i['loc']==l[e]['loc']:ls=2
                # Skills each user has that the other lacks, both directions.
                d=outer(l[e]["skills"],i["skills"])
                d2=outer(i["skills"],l[e]["skills"])
                # Shared groups and shared education entries.
                g=inner(l[e]["groups"],i["groups"])
                edu=inner(l[e]["education"],i["education"])
                # The key encodes the directed pair so skill_reducer can
                # split it back into the two user ids.
                yield (l[e]['id']+' -> '+i['id']), {"interest": key,"skills":d,"d2":d2, 'loc':ls,'edu':edu,'groups':g}

    def skill_reducer(self,key,values):
        # One record per shared interest for this user pair.
        l=list(values)
        (u1,u2)=key.split(' -> ')
        # NOTE(review): the first argument passed is the full record list
        # `l`, so dist_measure's "skills" weight actually counts the number
        # of shared interests, while l[0]["skills"] / l[0]["d2"] (the
        # complementary-skill lists) fill the "interests" slot.  Possibly
        # intentional -- TODO confirm against dist_measure's parameters.
        d=dist_measure(l,l[0]["skills"],l[0]["loc"],l[0]["edu"],l[0]["groups"])
        d2=dist_measure(l,l[0]["d2"],l[0]["loc"],l[0]["edu"],l[0]["groups"])
        # Emit the match in both directions (scores can differ because the
        # complementary-skill sets differ per direction).
        yield u1,{"id":u2,"match":d}
        yield u2,{"id":u1,"match":d2}

    def user_reducer(self,key,values):
        # Keep each user's REC_COUNT highest-scoring matches; the ascending
        # sort plus tail slice leaves the best matches last in the list.
        l=list(values)
        l=sorted(l, key=lambda k: k['match'])
        if len(l)>REC_COUNT: l=l[-REC_COUNT:]
        yield key,l

    def steps(self):
        # Pipeline: mapper -> label_reducer -> skill_reducer -> user_reducer.
        return [MRStep(mapper=self.mapper,
                       reducer=self.label_reducer),
                MRStep(reducer=self.skill_reducer),
                MRStep(reducer=self.user_reducer)]
if __name__ == '__main__':
    # mrjob entry point: parses CLI args and runs the configured steps.
    MAPR.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment