Skip to content

Instantly share code, notes, and snippets.

@PseudoSky
Created April 26, 2016 21:46
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save PseudoSky/06239d2ce4f88948a378108e05b34725 to your computer and use it in GitHub Desktop.
Save PseudoSky/06239d2ce4f88948a378108e05b34725 to your computer and use it in GitHub Desktop.
Manipulating skill-based user data with pandas
from pandas import *
import re,string
# Load the user records; the rest of the script reads each row's 'skills'
# and 'label' fields.
dat = read_json('./users.json')
# Regex fragments removed from every skill string before it is split into
# individual skills (they were not useful or broke the normal format).
# Raw strings keep the backslash escapes as regex escapes instead of relying
# on Python passing unknown string escapes through unchanged.
# "Word$" drops the word at the end of the string; "Word " drops it together
# with the space that follows it.
stopwords = [
    r'\?', r'\(', r'\)',
    'Framework',
    'Foundation$', 'Foundation ',
    'Architect$', 'Architect ',
    'Developer$', 'Developer ',
    'Development$', 'Development ',
]
# One alternation pattern that matches any of the stopwords ("a|b|c").
sp = '|'.join(stopwords)
pattern = re.compile(sp)
# Remove all stopwords and lowercase the result
def removem(s, arr):
    """Strip every stopword pattern in *arr* from *s* and lowercase the result.

    s: the raw skill string to clean.
    arr: regular-expression fragments; they are joined with "|" into a single
        alternation pattern. An empty *arr* removes nothing.

    Returns the cleaned, lowercased string. Substitution happens before
    lowercasing, so the patterns match case-sensitively against the
    original text.
    """
    if not arr:
        return s.lower()
    # Use the caller-supplied patterns instead of a module-level regex so the
    # function does what its signature promises. The re module caches compiled
    # patterns, so repeated calls with the same list stay cheap.
    return re.compile('|'.join(arr)).sub('', s).lower()
# Iterate over a user's skills, parse each one and return a list of edges
def at(dat, idx):
    """Return the user-to-skill edges for one user record.

    dat: a single record, indexed by 'skills' and 'label'. 'skills' is
        iterated directly; entries that are None are skipped.
        (assumes the remaining entries are strings -- TODO confirm against
        users.json)
    idx: identifier stored on every edge as 'idx'.

    Each skill string is cleaned with removem(), "/" is treated like ",",
    and the string is split on "," so one entry can produce several skills.
    Returns a list of {'idx', 'skill', 'label'} dicts, one per non-empty
    skill name.
    """
    edges = []
    for s in dat['skills']:
        if s is None:
            continue
        for m in removem(s, stopwords).replace('/', ',').split(','):
            # Strip on the text itself. Encoding to UTF-8 first made this
            # fail on Python 3 (bytes.strip() rejects a str argument) and
            # could raise UnicodeDecodeError on Python 2 for non-ASCII
            # byte strings.
            c = m.strip(' \t\n\r()')
            if c:
                edges.append({'idx': idx, 'skill': c, 'label': dat['label']})
    return edges
# Compile the edges between a user and a skill
def it(dat):
    """Return one list of edges per row of *dat*.

    dat: an object whose iterrows() yields (index, row) pairs; each pair is
        passed to at(row, index).

    Returns a list with at()'s result for every row, in iteration order.
    """
    # A plain comprehension replaces the former "(yield ...)" inside a
    # generator expression. That form is a SyntaxError on Python 3.8+ and on
    # Python 2 it interleaved a None after every real result.
    return [at(row, index) for index, row in dat.iterrows()]
# Flatten the per-user edge lists into one list of edge dicts. Entries that
# are None are skipped, matching the null filter this script has always
# applied. A comprehension avoids the builtin reduce (gone in Python 3),
# which also failed on an empty sequence and re-copied the list at each step.
v = [edge for user_edges in it(dat) if user_edges is not None
     for edge in user_edges]
# Name the columns explicitly so an empty edge list still produces a frame
# that can be grouped.
v = DataFrame(data=v, columns=['idx', 'skill', 'label'])
# Count the edges for each (skill, label) pair. Counting the 'idx' column
# yields a single count per group; reset_index turns the group keys back
# into ordinary columns.
v = v.groupby(['skill', 'label'])['idx'].count().reset_index()
v.columns = ['skill', 'label', 'count']
# Print the frequencies grouped by label: label ascending, count descending,
# then skill name ascending. sort_values replaces DataFrame.sort, which was
# removed from pandas; print(...) with one argument works on Python 2 and 3.
print(v.sort_values(['label', 'count', 'skill'],
                    ascending=[True, False, True]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment