Created
April 26, 2016 21:46
-
-
Save PseudoSky/06239d2ce4f88948a378108e05b34725 to your computer and use it in GitHub Desktop.
Manipulating skill based user data with pandas
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from pandas import * | |
import re,string | |
# Load the user records from a JSON file next to the working directory.
# Later code reads the 'skills' and 'label' fields of each row.
dat = read_json('./users.json')
# Regex fragments to strip from skill names (they did not seem useful or
# broke the normal formatting).  Raw strings keep the backslash escapes
# intact without relying on Python passing unknown escapes through.
stopwords = [
    r'\?', r'\(', r'\)',
    'Framework',
    'Foundation$', 'Foundation ',
    'Architect$', 'Architect ',
    'Developer$', 'Developer ',
    'Development$', 'Development ',
]
# One alternation matching any of the stopwords, delimited by ors "|"
sp = '|'.join(stopwords)
pattern = re.compile(sp)
# Remove all stopwords
def removem(s, arr):
    """Strip every stopword in ``arr`` from ``s`` and lowercase the result.

    Args:
        s: The text to clean.
        arr: Regex fragments to remove; they are joined with ``|`` into a
            single alternation.  An empty list removes nothing.

    Returns:
        ``s`` with all matches deleted, converted to lower case.  Matching
        is done before lowercasing, so the stopwords are case-sensitive.
    """
    # Build the alternation from the argument instead of ignoring it in
    # favour of a module-level pattern; the re module caches compiled
    # patterns, so repeated calls with the same list stay cheap.
    return re.sub('|'.join(arr), '', s).lower()
# Parse the skills of one row into user -> skill edges
def at(dat, idx):
    """Build the edge records for a single row.

    Args:
        dat: A row supporting ``dat['skills']`` (an iterable of skill
            strings whose entries may be None) and ``dat['label']``.
        idx: Identifier of the row, stored under ``'idx'`` in every edge.

    Returns:
        A list of dicts with the keys ``'idx'``, ``'skill'`` and
        ``'label'``, one per non-empty skill token.
    """
    edges = []
    for s in dat['skills']:
        if s is None:
            continue
        # '/' and ',' both separate individual skills inside one entry.
        for m in removem(s, stopwords).replace('/', ',').split(','):
            # Strip the text directly: encoding to UTF-8 first produces
            # bytes on Python 3, where bytes.strip() rejects a str argument.
            c = m.strip(' \t\n\r()')
            if c:
                edges.append({'idx': idx, 'skill': c, 'label': dat['label']})
    return edges
# Compile the edges between a user and a skill
def it(dat):
    """Return one list of edges per row of ``dat``.

    Args:
        dat: An object whose ``iterrows()`` yields ``(index, row)`` pairs.

    Returns:
        A list holding the result of ``at(row, index)`` for every row, in
        iteration order.
    """
    # A plain comprehension replaces the former ``yield`` inside a generator
    # expression, which is a SyntaxError on Python 3.8+ and on Python 2
    # interleaved a None after every row's result.
    return [at(row, index) for index, row in dat.iterrows()]
# Flatten the per-row edge lists into one list, dropping any None entries
# that it() may return.
v = [edge for edges in it(dat) if edges is not None for edge in edges]
# Create the edge frame; naming the columns keeps an empty result usable
v = DataFrame(data=v, columns=['idx', 'skill', 'label'])
# Group by skill then label and count the rows in each group
v = v.groupby(['skill', 'label'])['idx'].count().reset_index(name='count')
# Print the frequencies grouped with their labels ordered by count
# (label ascending, count descending, skill ascending).  sort_values
# replaces DataFrame.sort, which was removed in pandas 0.20.
print(v.sort_values(['label', 'count', 'skill'],
                    ascending=[True, False, True]))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment