Skip to content

Instantly share code, notes, and snippets.

@152334H
Created October 1, 2022 03:29
Show Gist options
  • Save 152334H/799f0f0aa9bab2172fe3cb3fc2bfca47 to your computer and use it in GitHub Desktop.
Save 152334H/799f0f0aa9bab2172fe3cb3fc2bfca47 to your computer and use it in GitHub Desktop.
hacked out script made in <1hr to look at the tech interview spreadsheet data a bit better for my purposes
''' NOTE:
0. install pandas and numpy to run this
1. Download the data at https://docs.google.com/spreadsheets/d/1QtC8efWw0mVkGXW4QA9bX4f0nJhGbmFqCfVLSumMZ0I/edit **AS A CSV FILE**
2. **REMOVE THE FIRST LINE FROM THE CSV FILE BEFORE RUNNING THIS**
'''
import pandas as pd
import numpy as np
import re
def lof(it): print(len(it))
def regex(s: str, nocase=True): return re.compile(s, re.IGNORECASE) if nocase else re.compile(s)
df = pd.read_csv('./singapore_tech_salaries_responses.csv', names=[
'Timestamp', 'Type', 'Company', 'Role',
'Salary', # 'Monthly/Annual Salary (Specify if not SGD)',
'Stocks', # 'Total Stocks (Specify if not SGD)',
'SignBonus',#'Sign On (Specify if not SGD)',
'Comp', # 'Annual Total Compensation (Specify if not SGD)',
'TargetBonus', #'Target Annual Bonus',
'ExtraInfo',#'Extra Information'
], skiprows=[0]) # pyright: ignore
assert isinstance(df, pd.DataFrame) # pyright
df['Salary'] = pd.to_numeric(df['Salary'], errors='coerce')
prev_len = len(df)
df = df.dropna(subset=['Salary'])
assert isinstance(df, pd.DataFrame) # pyright
print(f'dropped {prev_len-len(df)} rows!')
print(df)
print(df.columns)
internships = df[df.Type == 'Internship'].groupby('Company').filter(lambda x: len(x) > 2)
ROLES = {
"ML": [regex("data"), regex('AI', False), regex("ML", False), regex("machine learning"), regex('vision')],
"InfoSec": [regex('cyber'), regex('security'), regex('infosec')],
"SWE": [regex('software'), regex('developer'), regex('backend'), regex('programming'), regex('programmer')],
"infra": [regex("devops"), regex('cloud')],
"dontcare": [regex('trader'), regex('manage'), regex('techops'), regex('design'), regex('business'), regex('consulting'), regex('supply')],
"unknown": [regex('')]
}
role_agg = {k:set() for k in ROLES}
for role in internships.Role.unique():
for typ,ls in ROLES.items():
if any(r.search(role) for r in ls):
role_agg[typ].add(role)
break
print("Categorised job roles: ")
for rtyp,ls in role_agg.items():
print('-----', rtyp, '-----')
for role in ls:
print('\t', role)
def rolecount(group):
d = {'Total': len(group)}
for role,rset in role_agg.items():
d[role] = len(group.Role[group.Role.str.contains('|'.join(rset))])
return pd.Series(d)
print(
internships.groupby('Company')["Salary"].agg(["count", np.mean, np.std, np.var])
)
print(
internships.groupby('Company').apply(rolecount)
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment