Riley Predum rileypredum

## webscrape_code.py
# Runtime note from the original author: about 40 minutes when the stop value
# passed to np.arange below is raised to 9951.

# Values to iterate over while scraping (presumably start offsets for the
# request URL; the loop itself is not shown here). np.arange(1, 100, 50)
# yields only two values, 1 and 51 -- raise the stop value to fetch more.
# The original author reports that past roughly 10000 items the URI no longer
# follows a discernible pattern (an anti-crawler measure).
pages = np.arange(1, 100, 50)

# Request English content; per the original note, the default language is
# Chinese when this header is not specified.
headers = {
    'Accept-Language': 'en-US,en;q=0.8',
}

# One empty accumulator list per scraped variable.
titles, years, ratings, genres = [], [], [], []

## all_skills_list.py
def extract_skills_list():
  """Fetch the list of all skills and return it as a DataFrame.

  Reads the module-level ``access_token``, sends it as a bearer token to the
  "all skills" endpoint, and flattens the ``data`` entry of the JSON response
  with ``json_normalize``.

  Returns:
    pandas.DataFrame: the normalized contents of the response's ``data`` key.

  Raises:
    requests.HTTPError: if the endpoint answers with an error status.
  """
  all_skills_endpoint = "https://emsiservices.com/skills/versions/latest/skills"  # List of all skills endpoint

  # The header value must be just "Bearer <token>": the header name is already
  # given by the dict key and must not be repeated inside the value.
  headers = {'authorization': "Bearer " + access_token}
  response = requests.get(all_skills_endpoint, headers=headers)
  response.raise_for_status()  # report auth/server errors here, not as a KeyError on 'data'

  # json_normalize already returns a DataFrame, so no extra wrapping is needed.
  return json_normalize(response.json()['data'])

## related_skills.py
# Skill IDs to find related skills for:
# Python (Programming Language): KS125LS6N7WP4S6SFTCK
# Pandas (Python Package): KSGWPO6DSN70GRY20JFT


url = "https://emsiservices.com/skills/versions/latest/related"

payload = "{ \"ids\": [ \"KS125LS6N7WP4S6SFTCK\", \"KSGWPO6DSN70GRY20JFT\" ] }"
headers = {
    'authorization': "Bearer " + access_token,

## skill_by_substring.py
# Looking up the ID of a skill based on the name, or whether it contains something
# Could be useful in coming up with a search function for the end user

def find_id_by_skill_name(name_substring):
  """Return the rows of the skills table whose ``name`` contains a substring.

  Args:
    name_substring: text to look for; matched literally and case-sensitively.

  Returns:
    pandas.DataFrame: the rows of ``extract_skills_list()`` whose ``name``
    column contains ``name_substring``.
  """
  all_skills_df = extract_skills_list()  # pull all skills into a DF

  # regex=False: treat the input as plain text, so names such as "C++" do not
  #   raise a regex error or match unintended rows.
  # na=False: a missing name counts as "no match" instead of yielding a NaN
  #   mask, which cannot be used for boolean indexing.
  matches = all_skills_df['name'].str.contains(name_substring, regex=False, na=False)
  return all_skills_df[matches]

find_id_by_skill_name("Python")

## extract_skills_from_doc.py
# Extract skills from a document
def extract_skills_from_document():
  skills_from_doc_endpoint = "https://emsiservices.com/skills/versions/latest/extract"
  text = input("Paste the text from which you want to extract skills here: ")
  confidence_interval = str(input(".1 to 1, enter the confidence threshold you're willing to accept: "))
  payload = "{ \"text\": \"... " + text + " ...\", \"confidenceThreshold\": " + confidence_interval + " }"

  headers = {
      'authorization': "Bearer " + access_token,
      'content-type': "application/json"

## authentication.py
auth_endpoint = "https://auth.emsicloud.com/connect/token"  # auth endpoint

client_id = "your_client_id"  # replace 'your_client_id' with your client id from your api invite email
client_secret = "your_client_secret"  # replace 'your_client_secret' with your client secret from your api invite email
scope = "emsi_open"  # OK to leave as is; this is the scope used here

# Pass the form fields as a dict so that requests URL-encodes every value.
# Joining them by hand corrupts the body as soon as a credential contains
# characters such as '&', '=' or '+'.
payload = {
    'client_id': client_id,
    'client_secret': client_secret,
    'grant_type': 'client_credentials',
    'scope': scope,
}
headers = {'content-type': 'application/x-www-form-urlencoded'}  # headers for the request
auth_response = requests.post(auth_endpoint, data=payload, headers=headers)
auth_response.raise_for_status()  # fail here on rejected credentials, not on a missing key below
access_token = auth_response.json()['access_token']  # parse the JSON body and pull the access token from it

## import.py
import requests
import json
import numpy as np
import pandas as pd
from pandas import json_normalize # easy JSON -> pd.DataFrame

## exp_dist.py
plt.figure(figsize=(10,5));

# Countplot of number of salaries by years of professional experience
experience = df['years_professional_experience_post_college'] # set the age series to be the x variable
experience_ordering = [
          '1 year or less',
          '2 - 4 years',
          '5-7 years',
          '8 - 10 years',
          '11 - 20 years',

## age_dist.py
# Count plot of the number of salaries per age bracket.
plt.figure(figsize=(10, 5))

# Age brackets listed from youngest to oldest so the bars follow a logical order.
bucket_ordering = ['18-24', '25-34', '35-44', '45-54', '55-64', '65 or over']

# Draw the count of ages in the dataset per bracket, then label the axes.
ax = sns.countplot(x=ages, order=bucket_ordering)
# Trailing semicolon kept from the original; in a notebook cell it suppresses
# the echoed return value.
ax.set(
    title='Distribution of Ages',
    ylabel='Num people',
    xlabel='Age group',
);

## var_assign.py
# save some typing by defining variables of important columns
# Short aliases for the columns used most often; the trailing notes are the
# original author's remarks on each column's state.
(salaries, ages, location, industry, experience, role_titles) = (
    df['salary_clean'],                                # cleaned to int
    df['age'],                                         # clean
    df['location'],                                    # needs NLP
    df['industry'],                                    # needs NLP
    df['years_professional_experience_post_college'],  # clean
    df['role_title'],                                  # needs NLP
)
	# Runtime note from the original author: about 40 minutes when the stop value
	# passed to np.arange below is raised to 9951.

	# Values to iterate over while scraping (presumably start offsets for the
	# request URL; the loop itself is not shown here). np.arange(1, 100, 50)
	# yields only two values, 1 and 51 -- raise the stop value to fetch more.
	# The original author reports that past roughly 10000 items the URI no longer
	# follows a discernible pattern (an anti-crawler measure).
	pages = np.arange(1, 100, 50)

	# Request English content; per the original note, the default language is
	# Chinese when this header is not specified.
	headers = {
	    'Accept-Language': 'en-US,en;q=0.8',
	}

	# One empty accumulator list per scraped variable.
	titles, years, ratings, genres = [], [], [], []
	def extract_skills_list():
	  """Fetch the list of all skills and return it as a DataFrame.

	  Reads the module-level ``access_token``, sends it as a bearer token to the
	  "all skills" endpoint, and flattens the ``data`` entry of the JSON response
	  with ``json_normalize``.

	  Returns:
	    pandas.DataFrame: the normalized contents of the response's ``data`` key.

	  Raises:
	    requests.HTTPError: if the endpoint answers with an error status.
	  """
	  all_skills_endpoint = "https://emsiservices.com/skills/versions/latest/skills"  # List of all skills endpoint

	  # The header value must be just "Bearer <token>": the header name is already
	  # given by the dict key and must not be repeated inside the value.
	  headers = {'authorization': "Bearer " + access_token}
	  response = requests.get(all_skills_endpoint, headers=headers)
	  response.raise_for_status()  # report auth/server errors here, not as a KeyError on 'data'

	  # json_normalize already returns a DataFrame, so no extra wrapping is needed.
	  return json_normalize(response.json()['data'])
	# Skill IDs to find related skills for:
	# Python (Programming Language): KS125LS6N7WP4S6SFTCK
	# Pandas (Python Package): KSGWPO6DSN70GRY20JFT


	url = "https://emsiservices.com/skills/versions/latest/related"

	payload = "{ \"ids\": [ \"KS125LS6N7WP4S6SFTCK\", \"KSGWPO6DSN70GRY20JFT\" ] }"
	headers = {
	'authorization': "Bearer " + access_token,
	# Looking up the ID of a skill based on the name, or whether it contains something
	# Could be useful in coming up with a search function for the end user

	def find_id_by_skill_name(name_substring):
	  """Return the rows of the skills table whose ``name`` contains a substring.

	  Args:
	    name_substring: text to look for; matched literally and case-sensitively.

	  Returns:
	    pandas.DataFrame: the rows of ``extract_skills_list()`` whose ``name``
	    column contains ``name_substring``.
	  """
	  all_skills_df = extract_skills_list()  # pull all skills into a DF

	  # regex=False: treat the input as plain text, so names such as "C++" do not
	  #   raise a regex error or match unintended rows.
	  # na=False: a missing name counts as "no match" instead of yielding a NaN
	  #   mask, which cannot be used for boolean indexing.
	  matches = all_skills_df['name'].str.contains(name_substring, regex=False, na=False)
	  return all_skills_df[matches]

	find_id_by_skill_name("Python")
	# Extract skills from a document
	def extract_skills_from_document():
	skills_from_doc_endpoint = "https://emsiservices.com/skills/versions/latest/extract"
	text = input("Paste the text from which you want to extract skills here: ")
	confidence_interval = str(input(".1 to 1, enter the confidence threshold you're willing to accept: "))
	payload = "{ \"text\": \"... " + text + " ...\", \"confidenceThreshold\": " + confidence_interval + " }"

	headers = {
	'authorization': "Bearer " + access_token,
	'content-type': "application/json"
	auth_endpoint = "https://auth.emsicloud.com/connect/token"  # auth endpoint

	client_id = "your_client_id"  # replace 'your_client_id' with your client id from your api invite email
	client_secret = "your_client_secret"  # replace 'your_client_secret' with your client secret from your api invite email
	scope = "emsi_open"  # OK to leave as is; this is the scope used here

	# Pass the form fields as a dict so that requests URL-encodes every value.
	# Joining them by hand corrupts the body as soon as a credential contains
	# characters such as '&', '=' or '+'.
	payload = {
	    'client_id': client_id,
	    'client_secret': client_secret,
	    'grant_type': 'client_credentials',
	    'scope': scope,
	}
	headers = {'content-type': 'application/x-www-form-urlencoded'}  # headers for the request
	auth_response = requests.post(auth_endpoint, data=payload, headers=headers)
	auth_response.raise_for_status()  # fail here on rejected credentials, not on a missing key below
	access_token = auth_response.json()['access_token']  # parse the JSON body and pull the access token from it
	import requests
	import json
	import numpy as np
	import pandas as pd
	from pandas import json_normalize # easy JSON -> pd.DataFrame
	plt.figure(figsize=(10,5));

	# Countplot of number of salaries by years of professional experience
	experience = df['years_professional_experience_post_college'] # set the age series to be the x variable
	experience_ordering = [
	'1 year or less',
	'2 - 4 years',
	'5-7 years',
	'8 - 10 years',
	'11 - 20 years',
	# Count plot of the number of salaries per age bracket.
	plt.figure(figsize=(10, 5))

	# Age brackets listed from youngest to oldest so the bars follow a logical order.
	bucket_ordering = ['18-24', '25-34', '35-44', '45-54', '55-64', '65 or over']

	# Draw the count of ages in the dataset per bracket, then label the axes.
	ax = sns.countplot(x=ages, order=bucket_ordering)
	# Trailing semicolon kept from the original; in a notebook cell it suppresses
	# the echoed return value.
	ax.set(
	    title='Distribution of Ages',
	    ylabel='Num people',
	    xlabel='Age group',
	);
	# save some typing by defining variables of important columns
	# Short aliases for the columns used most often; the trailing notes are the
	# original author's remarks on each column's state.
	(salaries, ages, location, industry, experience, role_titles) = (
	    df['salary_clean'],                                # cleaned to int
	    df['age'],                                         # clean
	    df['location'],                                    # needs NLP
	    df['industry'],                                    # needs NLP
	    df['years_professional_experience_post_college'],  # clean
	    df['role_title'],                                  # needs NLP
	)