Skip to content

Instantly share code, notes, and snippets.

@PseudoSky
Last active May 10, 2016 03:17
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PseudoSky/3afa3170fc522a4084e403f8d2deaa13 to your computer and use it in GitHub Desktop.
Save PseudoSky/3afa3170fc522a4084e403f8d2deaa13 to your computer and use it in GitHub Desktop.
from pandas import *
import numpy as np
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import JSONValueProtocol
def dist( p1, p2 ):
    """Return [count, shared] where shared holds the items common to both.

    The sentinel [-1, []] marks an empty input on either side (no basis
    for comparison).  Returned as a mutable list so callers can bump the
    count in place (the location bonus in label_reducer does exactly that).
    """
    if not p1 or not p2:
        return [-1, []]
    shared = set(p1) & set(p2)
    return [len(shared), list(shared)]
# Maximum number of recommendations retained per user in project_reducer.
REC_COUNT=20
# Yield label for each project need, project
# Yield label for each user, user
class MAPR(MRJob):
    """Three-step mrjob pipeline recommending users to projects and back.

    Step 1: mapper joins person and project records on a shared role label,
    label_reducer scores each user/project pairing.  Step 2: project_reducer
    ranks the matches per user and per project.  Step 3: assess prints
    aggregate hit-rate statistics.
    """
    # One JSON value per input/output line.
    OUTPUT_PROTOCOL = JSONValueProtocol
    INPUT_PROTOCOL = JSONValueProtocol

    def mapper(self, _, line):
        # `line` is a decoded JSON array of records; the first element's _id
        # tells us whether this input file holds people or projects.
        if 'person' in line[0]['_id']:
            print "PERSON"
            for u in line:
                # Skip users with no projects: they give no reference data.
                if len(u['projects'])>0:
                    # type 1 == user record, keyed by the user's role label.
                    yield u['label'], {"type":1,'id':u['_id'],'skills':u['skills'],'interests':u['interests'],'loc':u['location'],"refs":u['projects']}
        else:
            print 'PROJECT'
            # Single-digit role code (pulled out of a contributor id below)
            # -> human-readable role label used in project "needs".
            lbl_map={
            "1":'Developers',
            "2":'Designers',
            "3":"Artists",
            "4":"Writers",
            "5":"Scientists",
            "6":"Musicians",
            "7":"Product Managers",
            "8":"Filmmakers",
            "9":"Engineers"
            }
            for p in line:
                for col in p['contributors']:
                    # col[1][-6:-5] extracts the one-character role code from
                    # the contributor id; register that role as a need.
                    colab_lbl=lbl_map[col[1][-6:-5]]
                    if colab_lbl not in p['needs']:
                        p['needs'].append(colab_lbl)
                # Associated users: the contributors if any, else the owner.
                if len(p['contributors'])>0:
                    ass=map(lambda e: e[0],p['contributors'])
                else: ass=[p['owner']]
                for need in p['needs']:
                    # type 0 == project record.  need[:-1] presumably strips a
                    # trailing character so needs line up with user labels --
                    # TODO confirm against the input data.
                    yield need[:-1], {"type":0,'id':p['_id'],'tags':p['tags'],"associated":ass,'loc':p['locations']}

    # Yield project id, list of user distances
    def label_reducer(self, key, values):
        # All users and projects sharing this role label arrive together.
        l = list(values)
        projects=filter(lambda e: e["type"]==0, l)
        users=filter(lambda e: e["type"]==1, l)
        print "\n\nKEY: ",key,"\n"
        for p in projects:
            reccs={"label":key,"total":len(p['associated']),"users":[],"hits":0}
            for u in users:
                # d is [count, shared_items]: overlap between project tags
                # and the user's combined interests + skills.
                d=dist(p['tags'],u['interests']+u['skills'])
                # +2 bonus when the user's region (2nd comma field of the
                # location string) matches one of the project's locations.
                # NOTE(review): the lambda parameter `l` shadows the outer
                # record list `l` -- harmless here, but confusing.
                if u['loc'].split(',')[1].strip() in map(lambda l: l.split(',')[1].strip(),p['loc']): d[0]+=2
                if d[0]>0:
                    # A "hit" = we recommended a user already associated with
                    # the project (ground truth for the accuracy score later).
                    if u['id'] in p['associated']: reccs['hits']+=1
                    reccs['users'].append({'id':u['id'],'match':d[0],'same':d[1]})
                    yield u['id'],{'id':p['id'],'match':d[0],'same':d[1],'refs':u['refs']}
            # Best matches first.
            reccs['users']=sorted(reccs['users'], key=lambda k: -k['match'])
            yield p['id'],reccs

    def project_reducer(self,key,values):
        l=list(values)
        if '/person/' in key:
            # NOTE(review): sorted() returns a new list and its result is
            # discarded here, so `l` is NOT actually sorted by match before
            # the tail slice below.
            sorted(l, key=lambda k: k['match'])
            # Keep only the REC_COUNT best (assuming the sort had happened).
            if len(l)>REC_COUNT: l=l[-REC_COUNT:]
            hits=0
            score=0
            # Every record for this user carries the same refs list.
            if len(l[0]['refs'])>0:
                # hits = how many of the user's actual projects appear among
                # the recommendations.
                hits=len(set(l[0]['refs']).intersection(set(map(lambda x: x["id"],l))))
                tots=len(l[0]['refs'])
                if len(l[0]['refs'])>REC_COUNT:tots=REC_COUNT
                score=hits/(tots*1.)
            # NOTE(review): if refs is empty, `tots` is never assigned and
            # the yield below raises NameError -- TODO confirm the intended
            # fallback.
            print {'user':key,"projects":l[:3]}
            yield 2,{"user":key,"projects":l[:3],"hits":hits,"score":score,"total":tots,"refs":l[0]['refs']}
        else:
            # Project key: accumulate hits over all of its need-labels.
            hits=0
            for i in l:
                hits+=i['hits']
            score=hits/(l[0]['total']*1.)
            print {'project':key,"positions":l[:3]}
            yield 1,{"project":key,"positions":l[:3],"hits":hits,"score":score,"total":l[0]['total']}

    def assess(self,key,values):
        # Final step: print evaluation stats for projects (key==1) or
        # users (key==2); emits nothing downstream.
        count=0.
        hits=0.
        associated=0.
        score=[]
        if key==1:
            for proj in values:
                print "\nproject: ",proj['project']
                print "associated: ",proj['total']
                print "hits: ",proj['hits']
                print "score: ",proj['score']
                count+=1
                hits+=proj['hits']
                if proj['score']>0: score+=[proj['score']]
                associated+=proj['total']
        else:
            for user in values:
                print "\nuser: ",user['user']
                print "associated: ",user['total']
                print "hits: ",user['hits']
                print "score: ",user['score']
                count+=1
                hits+=user['hits']
                if user['score']>0: score+=[user['score']]
                associated+=user['total']
        # Summary over all records of this key.  NOTE(review): indentation was
        # lost in this source; the summary is placed after both branches on
        # the assumption it is a per-key total -- confirm against original.
        print "\n\nRunning score\n"
        print "count: ",(count)
        print "associated: ",(associated)
        print "total hits: ",(hits)
        print "\n\nAveraged Per Record: \n"
        print "proportion hit: ",np.mean(np.array(score) )
        print "associated: ", (associated/(count*1.) )
        print "hit proportion: ",(hits/(associated*1.))

    def steps(self):
        # Pipeline: mapper -> label_reducer -> project_reducer -> assess.
        return [MRStep(mapper=self.mapper,
                       reducer=self.label_reducer),
                MRStep(reducer=self.project_reducer),
                MRStep(reducer=self.assess)]
if __name__ == '__main__':
    # mrjob entry point: parses CLI args and runs the configured steps.
    MAPR.run()
from pandas import *
import re,string
import operator
from pprint import pprint as pp
import random
# Load the raw person and project dumps (paths relative to the script's
# parent directory).
datUsers=read_json('./../person.json')
datProjects=read_json('./../project.json')
# NOTE(review): DataFrame.set_index returns a NEW frame; without assignment
# (or inplace=True) these two calls have no effect.  Later code indexes
# datUsers["_id"] positionally, which in fact relies on them being no-ops.
datUsers.set_index(['_id'])
datProjects.set_index(['_id'])
# Words to take out (didn't seem useful / broke normal fmt)
stopwords=['\?','\(','\)','Framework','Foundation$','Foundation ','Architect$','Architect ','Developer$','Developer ','Development$','Development ']
# Make regex for finding all the stopwords, delimited by ors "|"
sp='|'.join(stopwords)
pattern=re.compile(sp)
# Remove all stopwords
def removem(s,arr):
    """Strip every stopword match from *s* and lowercase the result.

    Matching uses the module-level compiled ``pattern``; the *arr*
    argument is kept for call compatibility but is never read.
    """
    return pattern.sub('', s).lower()
def euclidean_distance( prefsUser, prefsProj, p1, p2 ):
    """Similarity between user p1's preference list and project p2's tags.

    prefsUser / prefsProj are mappings (e.g. pandas Series of lists) indexed
    by p1 / p2.  Returns (score, shared) where score is |intersection|
    divided by the mean of the two list lengths, and shared is the set of
    common items; (-1, []) when either list is empty.

    Fix: the original also computed a right set-difference into an unused
    local `diff`; that dead code is removed.
    """
    user_items = prefsUser[p1]
    proj_items = prefsProj[p2]
    if len(user_items) == 0 or len(proj_items) == 0:
        return -1, []
    same = set(user_items).intersection(set(proj_items))
    # Normalize the overlap by the average list length so long lists do not
    # dominate purely by size.
    return (len(same) / ((len(user_items) + len(proj_items)) / 2.0)), same
def runEuc(n,k,column):
    """For n randomly sampled users, rank projects by overlap between the
    user's `column` list (e.g. "interests") and each project's tags.

    Returns {user_id: [(project_id, score), ...]} with best matches first,
    keeping at most k projects per user.
    """
    dic = {}
    count = 0
    # Sample n row labels from the users frame.
    users = random.sample(datUsers.index,n)
    for ix in users:
        index= datUsers["_id"][ix]
        count += 1
        # NOTE(review): off-by-one -- this early return fires BEFORE the
        # nth sampled user is scored, so only n-1 users appear in `dic`.
        if count == n:
            return dic
        dic[index] = {}
        for idx2,p2 in datProjects.iterrows():
            (p,same) = euclidean_distance(datUsers[column], datProjects["tags"], ix, idx2)
            # Keep only projects with a positive overlap score.
            if p > 0:
                dic[index][p2['_id']]=p
        # Sort ascending by score, keep the k largest, then reverse so the
        # best match comes first.
        dic[index] = sorted(dic[index].items(), key=operator.itemgetter(1))[-k:]
        dic[index] = dic[index][::-1]
    return dic
# Demo: 10 sampled users, top 3 projects each, matched on interests.
pp(runEuc(10,3,"interests"))
from pandas import *
import numpy as np
from mrjob.job import MRJob
from mrjob.step import MRStep
from mrjob.protocol import JSONValueProtocol
REC_COUNT=3
def dist( user_a, user_b ):
    """Score how compatible two user records are for pairing.

    score = |shared interests| + |skills user_b has that user_a lacks|
    (complementary skills count as a plus).  Returns
    (score, shared_interests_list), or the sentinel (-1, []) when either
    user lists no interests.

    Each argument is a dict with at least "interests" and "skills" lists.
    Fix: the original body referenced undefined names (p1, p2,
    user_a_interests, user_a_skills, same) and raised NameError on every
    call; this restores the evident intent.  Note this helper is not wired
    into the MRJob steps below.
    """
    a_interests = user_a["interests"]
    b_interests = user_b["interests"]
    # Empty-interest guard mirrors the other helpers: no basis to compare.
    if len(a_interests) == 0 or len(b_interests) == 0:
        return -1, []
    similar_interests = set(a_interests).intersection(b_interests)
    different_skills = set(user_b["skills"]) - set(user_a["skills"])
    return len(similar_interests) + len(different_skills), list(similar_interests)
def dist_measure( skills, interests, loc_same, edu, group ):
    """Weighted affinity score for a user pair.

    Skills weigh 10x and interests 7x, scaled by the location multiplier
    `loc_same`; shared education and groups each add a flat 5 * (count + 1).
    """
    overlap_term = 10 * len(skills) + 7 * len(interests)
    edu_bonus = 5 * (len(edu) + 1)
    group_bonus = 5 * (len(group) + 1)
    return overlap_term * loc_same + edu_bonus + group_bonus
def inner( p1, p2 ):
    """Return the items present in both sequences, as a list.

    Fix: the original returned the tuple ``(-1, [])`` for empty input while
    the normal path returned a list; callers feed the result straight into
    ``len`` (via dist_measure), so the sentinel silently counted as two
    shared items.  An empty input now simply yields the empty intersection,
    which is what set arithmetic produces anyway.
    """
    return list(set(p1).intersection(set(p2)))
def outer( p1, p2 ):
    """Return the items of p2 that are not in p1 (right difference), as a list.

    Fix: the original returned the tuple ``(-1, [])`` when EITHER input was
    empty -- an inconsistent type that callers ``len()``, and a wrong answer
    for empty p1 (where the difference is all of p2).  Plain set arithmetic
    handles both edge cases correctly.
    """
    return list(set(p2) - set(p1))
class MAPR(MRJob):
    """Three-step mrjob pipeline recommending similar users to each other:
    pair people who share an interest, score each pair on complementary
    skills / shared groups / education / location, keep each user's top
    REC_COUNT matches.
    """
    # One JSON value per input/output line.
    OUTPUT_PROTOCOL = JSONValueProtocol
    INPUT_PROTOCOL = JSONValueProtocol

    def mapper(self, _, line):
        # `line` is a decoded JSON array of person records; cap at 5000 to
        # bound the O(n^2) pairing done in label_reducer.
        print "PERSON"
        for u in line[:5000]:
            for interest in u['interests']:
                # Key by interest so users who share it meet in one reducer.
                # loc keeps only the 2nd comma field (region) of the address.
                yield interest, {'id':u['_id'],'type':'interest',"skills":u['skills'],'loc':u['location'].split(',')[1].strip(),"groups":u["groups"],"education":u["education"]}

    # Yield project id, list of user distances
    def label_reducer(self, key, values):
        l = list(values)
        # Compare every unordered pair of users sharing this interest.
        for e in range(len(l)):
            for i in l[e+1:]:
                # Location multiplier: 2 when both users share a region.
                ls=1
                if i['loc']==l[e]['loc']:ls=2
                # Skills each user has that the other lacks, both directions.
                d=outer(l[e]["skills"],i["skills"])
                d2=outer(i["skills"],l[e]["skills"])
                # Shared groups and shared education entries.
                g=inner(l[e]["groups"],i["groups"])
                edu=inner(l[e]["education"],i["education"])
                # The key encodes the directed pair so skill_reducer can
                # split it back into the two user ids.
                yield (l[e]['id']+' -> '+i['id']), {"interest": key,"skills":d,"d2":d2, 'loc':ls,'edu':edu,'groups':g}

    def skill_reducer(self,key,values):
        # One record per shared interest for this user pair.
        l=list(values)
        (u1,u2)=key.split(' -> ')
        # NOTE(review): the first argument passed is the full record list
        # `l`, so dist_measure's "skills" weight actually counts the number
        # of shared interests, while l[0]["skills"] / l[0]["d2"] (the
        # complementary-skill lists) fill the "interests" slot.  Possibly
        # intentional -- TODO confirm against dist_measure's parameters.
        d=dist_measure(l,l[0]["skills"],l[0]["loc"],l[0]["edu"],l[0]["groups"])
        d2=dist_measure(l,l[0]["d2"],l[0]["loc"],l[0]["edu"],l[0]["groups"])
        # Emit the match in both directions (scores can differ because the
        # complementary-skill sets differ per direction).
        yield u1,{"id":u2,"match":d}
        yield u2,{"id":u1,"match":d2}

    def user_reducer(self,key,values):
        # Keep each user's REC_COUNT highest-scoring matches; the ascending
        # sort plus tail slice leaves the best matches last in the list.
        l=list(values)
        l=sorted(l, key=lambda k: k['match'])
        if len(l)>REC_COUNT: l=l[-REC_COUNT:]
        yield key,l

    def steps(self):
        # Pipeline: mapper -> label_reducer -> skill_reducer -> user_reducer.
        return [MRStep(mapper=self.mapper,
                       reducer=self.label_reducer),
                MRStep(reducer=self.skill_reducer),
                MRStep(reducer=self.user_reducer)]
if __name__ == '__main__':
    # mrjob entry point: parses CLI args and runs the configured steps.
    MAPR.run()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment