Skip to content

Instantly share code, notes, and snippets.

View rileypredum's full-sized avatar
💭
Coding 👍 🔢

Riley Predum rileypredum

💭
Coding 👍 🔢
View GitHub Profile
@rileypredum
rileypredum / webscrape_code.py
Created July 28, 2022 23:23
The core of the webscraping script for the IMDB website publishing.
# Page offsets to scrape. np.arange(1, 100, 50) produces the two values
# [1, 51] (presumably the start index of each results page -- confirm against
# the request URL); this small range is for demonstration only and can be
# raised for your own projects.
# Author's notes: a run takes about 40 min when the stopping point is 9951,
# and only about 10000 items were reachable because beyond that the URI has
# no discernible pattern (to combat web crawlers).
pages = np.arange(1, 100, 50)

# Author's note: if Accept-Language is not specified, the default language
# is Chinese.
headers = {'Accept-Language': 'en-US,en;q=0.8'}

# Empty accumulators, one independent list per scraped variable.
titles, years, ratings, genres = [], [], [], []
@rileypredum
rileypredum / all_skills_list.py
Created August 5, 2020 22:43
EMSI - Global Skills List
def extract_skills_list():
    """Fetch the full EMSI skills list and return it as a DataFrame.

    Sends a GET request to the "all skills" endpoint, authenticated with the
    module-level ``access_token``, then flattens the ``'data'`` entry of the
    JSON response with ``json_normalize``.

    Returns:
        pandas.DataFrame: the flattened contents of the response's ``'data'``
        key.

    Raises:
        requests.HTTPError: if the API answers with an error status code.
        KeyError: if the JSON response has no ``'data'`` key.
    """
    all_skills_endpoint = "https://emsiservices.com/skills/versions/latest/skills"  # List of all skills endpoint
    # The dict key is the header name; the value must be only
    # "Bearer <token>". Prefixing the value with "Authorization: " would send
    # a malformed credential.
    headers = {'authorization': "Bearer " + access_token}
    response = requests.request("GET", all_skills_endpoint, headers=headers)
    # Surface HTTP failures directly instead of failing later on a missing key.
    response.raise_for_status()
    skills = response.json()['data']  # drill down to the level of the 'data' key
    # json_normalize already returns a DataFrame, so no extra wrapping is needed.
    return json_normalize(skills)
@rileypredum
rileypredum / related_skills.py
Last active November 4, 2023 04:39
EMSI - Find Related Skills
@rileypredum
rileypredum / skill_by_substring.py
Created August 5, 2020 22:31
EMSI - Find Skill by Name
# Looking up the ID of a skill based on the name, or whether it contains something
# Could be useful in coming up with a search function for the end user
def find_id_by_skill_name(name_substring):
    """Return the rows of the skills list whose name contains ``name_substring``.

    Args:
        name_substring: text to look for in the ``'name'`` column. It is
            passed to ``Series.str.contains`` with that method's default
            settings, so it is case-sensitive and interpreted as a regular
            expression.

    Returns:
        pandas.DataFrame: the matching rows of the frame returned by
        ``extract_skills_list()`` (all columns kept).
    """
    all_skills_df = extract_skills_list()  # pull all skills into a DF
    # na=False: a missing name counts as "no match". Without it the mask
    # would contain NaN and boolean indexing would raise.
    matches = all_skills_df['name'].str.contains(name_substring, na=False)
    return all_skills_df[matches]  # Filter that DF by substring


find_id_by_skill_name("Python")
@rileypredum
rileypredum / extract_skills_from_doc.py
Created August 5, 2020 22:30
EMSI - Extract Skills from a Doc
# Extract skills from a document
def extract_skills_from_document():
skills_from_doc_endpoint = "https://emsiservices.com/skills/versions/latest/extract"
text = input("Paste the text from which you want to extract skills here: ")
confidence_interval = str(input(".1 to 1, enter the confidence threshold you're willing to accept: "))
payload = "{ \"text\": \"... " + text + " ...\", \"confidenceThreshold\": " + confidence_interval + " }"
headers = {
'authorization': "Bearer " + access_token,
'content-type': "application/json"
@rileypredum
rileypredum / authentication.py
Created August 5, 2020 22:23
EMSI - Authentication
# Request an OAuth access token using the client-credentials grant.
auth_endpoint = "https://auth.emsicloud.com/connect/token"  # auth endpoint
client_id = "your_client_id"  # replace 'your_client_id' with your client id from your api invite email
client_secret = "your_client_secret"  # replace 'your_client_secret' with your client secret from your api invite email
scope = "emsi_open"  # ok to leave as is, this is the scope we will use
# Credentials and scope as form fields. Passing a dict lets requests
# URL-encode every value, so ids/secrets containing characters such as
# '&', '=' or '+' are transmitted intact (hand-built strings are not encoded).
payload = {
    "client_id": client_id,
    "client_secret": client_secret,
    "grant_type": "client_credentials",
    "scope": scope,
}
headers = {'content-type': 'application/x-www-form-urlencoded'}  # headers for the request
auth_response = requests.request("POST", auth_endpoint, data=payload, headers=headers)
# Fail with the HTTP error itself (e.g. bad credentials) rather than with a
# KeyError on the missing 'access_token' field below.
auth_response.raise_for_status()
access_token = auth_response.json()['access_token']  # parse the JSON body and pull the access token from it
@rileypredum
rileypredum / import.py
Created August 5, 2020 22:19
EMSI - Import Statements
import requests
import json
import numpy as np
import pandas as pd
from pandas import json_normalize # easy JSON -> pd.DataFrame
plt.figure(figsize=(10,5));
# Countplot of number of salaries by age
experience = df['years_professional_experience_post_college'] # set the age series to be the x variable
experience_ordering = [
'1 year or less',
'2 - 4 years',
'5-7 years',
'8 - 10 years',
'11 - 20 years',
# Start a new 10x5 figure; the trailing semicolons only suppress the echoed
# return value when this runs as a notebook cell.
plt.figure(figsize=(10,5));
# Count plot of the number of entries in each age bracket.
# NOTE(review): `ages` is not defined in this snippet -- presumably the 'age' column of df; confirm.
bucket_ordering = ['18-24', '25-34', '35-44', '45-54', '55-64', '65 or over'] # explicit bar order, youngest bracket to oldest
# plot the count of ages in the dataset by age bracket
ax = sns.countplot(x=ages, order=bucket_ordering)
# Title and axis labels for the returned axes.
ax.set_title('Distribution of Ages');
ax.set_ylabel('Num people');
ax.set_xlabel('Age group');
# Short aliases for the columns of df that are used repeatedly, to save typing.
# The note beside each column name records its cleaning status.
salaries, ages, location, industry, experience, role_titles = (
    df[column_name]
    for column_name in (
        'salary_clean',  # cleaned to int
        'age',  # clean
        'location',  # needs NLP
        'industry',  # needs NLP
        'years_professional_experience_post_college',  # clean
        'role_title',  # needs NLP
    )
)