This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# NOTE: a full crawl (np.arange stopping point of 9951) takes about 40 minutes.
# `pages` holds the start offsets of the result pages to request. Past roughly
# 10,000 items the URI no longer follows a discernible pattern (an anti-crawler
# measure), so that is the practical upper bound. The short range below yields
# only the offsets 1 and 51, for demonstration purposes; raise the stop value
# for your own projects.
pages = np.arange(1, 100, 50)

# If Accept-Language is not specified, the default language is Chinese.
headers = {'Accept-Language': 'en-US,en;q=0.8'}

# Initialize empty lists to store the scraped variables.
titles = []
years = []
ratings = []
genres = []
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
def extract_skills_list():
    """Fetch the complete skills list and return it as a DataFrame.

    Sends an authenticated GET request to the "all skills" endpoint using the
    module-level ``access_token`` and flattens the ``data`` entry of the JSON
    response with ``json_normalize``.

    Returns:
        pandas.DataFrame: the flattened contents of the response's ``data`` key.

    Raises:
        requests.HTTPError: if the server answers with an error status.
        KeyError: if the JSON response has no ``data`` key.
    """
    all_skills_endpoint = "https://emsiservices.com/skills/versions/latest/skills"
    # The header value must be just "Bearer <token>"; the header name comes
    # from the dict key and must not be repeated inside the value.
    headers = {'authorization': "Bearer " + access_token}
    response = requests.get(all_skills_endpoint, headers=headers)
    # Fail loudly on a 4xx/5xx answer instead of with a confusing KeyError below.
    response.raise_for_status()
    skills = response.json()['data']
    # json_normalize already returns a DataFrame, so no extra wrapping is needed.
    return json_normalize(skills)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Look up skills (and their IDs) whose name contains a given piece of text.
# Could be useful as the basis of a search function for the end user.
def find_id_by_skill_name(name_substring):
    """Return the skills whose name contains ``name_substring``.

    Args:
        name_substring: text to look for inside each skill's ``name``. It is
            matched literally (not as a regular expression), so names with
            regex metacharacters such as "C++" can be searched.

    Returns:
        The DataFrame produced by ``extract_skills_list()``, limited to the
        rows whose ``name`` column contains ``name_substring``.
    """
    all_skills_df = extract_skills_list()  # pull all skills into a DataFrame
    # regex=False: treat the argument as plain text. na=False: a row with a
    # missing name counts as "no match" instead of putting NaN in the mask,
    # which boolean indexing would reject.
    matches = all_skills_df['name'].str.contains(
        name_substring, regex=False, na=False
    )
    return all_skills_df[matches]


find_id_by_skill_name("Python")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Extract skills from a document | |
def extract_skills_from_document(): | |
skills_from_doc_endpoint = "https://emsiservices.com/skills/versions/latest/extract" | |
text = input("Paste the text from which you want to extract skills here: ") | |
confidence_interval = str(input(".1 to 1, enter the confidence threshold you're willing to accept: ")) | |
payload = "{ \"text\": \"... " + text + " ...\", \"confidenceThreshold\": " + confidence_interval + " }" | |
headers = { | |
'authorization': "Bearer " + access_token, | |
'content-type': "application/json" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Request an access token using the client-credentials grant.
auth_endpoint = "https://auth.emsicloud.com/connect/token"  # auth endpoint
client_id = "your_client_id"  # replace with the client id from your API invite email
client_secret = "your_client_secret"  # replace with the client secret from your API invite email
scope = "emsi_open"  # OK to leave as is; this is the scope we will use
# Pass the form fields as a dict so that requests URL-encodes them. A
# hand-built "a=b&c=d" string breaks as soon as a credential contains a
# character such as "&", "+", "=" or "%".
payload = {
    'client_id': client_id,
    'client_secret': client_secret,
    'grant_type': 'client_credentials',
    'scope': scope,
}
headers = {'content-type': 'application/x-www-form-urlencoded'}  # headers for the request
token_response = requests.post(auth_endpoint, data=payload, headers=headers)
token_response.raise_for_status()  # surface rejected credentials as an HTTP error
access_token = token_response.json()['access_token']  # pull the token out of the JSON body
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import requests | |
import json | |
import numpy as np | |
import pandas as pd | |
from pandas import json_normalize # easy JSON -> pd.DataFrame |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
plt.figure(figsize=(10,5)); | |
# Countplot of number of salaries by age | |
experience = df['years_professional_experience_post_college'] # set the age series to be the x variable | |
experience_ordering = [ | |
'1 year or less', | |
'2 - 4 years', | |
'5-7 years', | |
'8 - 10 years', | |
'11 - 20 years', |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
plt.figure(figsize=(10, 5))
# Count plot of the number of entries in the dataset per age bracket.
# The brackets are listed explicitly so the x-axis follows a logical order.
bucket_ordering = ['18-24', '25-34', '35-44', '45-54', '55-64', '65 or over']
ax = sns.countplot(x=ages, order=bucket_ordering)
ax.set_title('Distribution of Ages')
ax.set_ylabel('Num people')
# NOTE(review): the trailing ";" is kept from the original, presumably to
# suppress the echoed return value when this runs as a notebook cell.
ax.set_xlabel('Age group');
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Bind short names to the columns of `df` that are used most often.
(salaries, ages, location, industry, experience, role_titles) = (
    df[column_name]
    for column_name in (
        'salary_clean',  # cleaned to int
        'age',  # clean
        'location',  # needs NLP
        'industry',  # needs NLP
        'years_professional_experience_post_college',  # clean
        'role_title',  # needs NLP
    )
)
NewerOlder