Skip to content

Instantly share code, notes, and snippets.

@KakoozaJerry
Created November 17, 2021 17:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save KakoozaJerry/90a14ca5b4326718fd0bcd6929532f18 to your computer and use it in GitHub Desktop.
Save KakoozaJerry/90a14ca5b4326718fd0bcd6929532f18 to your computer and use it in GitHub Desktop.
# Support code for the `greenhouse` rake task defined below: the crawl list,
# the keyword tables, and the text-classification / fetching helpers. Keeping
# them here leaves the task body as a short, readable pipeline.
module GreenhouseScraper
  module_function

  # Career pages to crawl, in crawl order.
  PAGES = %w[
    https://boards.greenhouse.io/remotecom
    https://boards.greenhouse.io/alt
    https://boards.greenhouse.io/gitlab
    https://boards.greenhouse.io/github
    https://boards.greenhouse.io/gocardless
    https://boards.greenhouse.io/carta
    https://boards.greenhouse.io/twilio
    https://boards.greenhouse.io/tusimple
    https://boards.greenhouse.io/sweeten
    https://boards.greenhouse.io/calm
    https://boards.greenhouse.io/capsulecares
    https://boards.greenhouse.io/nubank
    https://boards.greenhouse.io/oyster
    https://boards.greenhouse.io/invision
    https://boards.greenhouse.io/shopee
    https://boards.greenhouse.io/genius
    https://boards.greenhouse.io/databento
    https://boards.greenhouse.io/ndb
    https://boards.greenhouse.io/nuvocargo
    https://boards.greenhouse.io/greenthumbindustries
    https://boards.greenhouse.io/shearersinternalmobility
    https://boards.greenhouse.io/webflow
    https://boards.greenhouse.io/careem
    https://boards.greenhouse.io/mastercardfoundation
    https://boards.greenhouse.io/cerebral
    https://boards.greenhouse.io/cazoo
    https://boards.greenhouse.io/gomedia
    https://boards.greenhouse.io/jobsforthefuture
    https://boards.greenhouse.io/axios
    https://boards.greenhouse.io/crescolabs
    https://boards.greenhouse.io/circleci
    https://boards.greenhouse.io/pokemoncareers
    https://boards.greenhouse.io/chief
    https://boards.greenhouse.io/bucknerinternational
    https://boards.greenhouse.io/amedigital
    https://boards.greenhouse.io/patreon
    https://boards.greenhouse.io/monzo
    https://robinhood.com/us/en/careers/openings/
    https://boards.greenhouse.io/codefresh
    https://boards.greenhouse.io/hudl
    https://careers.masterclass.com/
    https://boards.greenhouse.io/soluto
    https://boards.greenhouse.io/duolingo
    https://boards.greenhouse.io/recordedfuture
    https://boards.greenhouse.io/fabric
    https://boards.greenhouse.io/tatari
    https://boards.greenhouse.io/slicareers
    https://boards.greenhouse.io/freenow
    https://boards.greenhouse.io/adyen
    https://boards.greenhouse.io/samsungresearchamericainternship
    https://boards.greenhouse.io/anaconda
    https://boards.greenhouse.io/buzzfeed
    https://boards.greenhouse.io/reddit
  ].freeze

  # Host whose boards carry the company slug as the first path segment.
  BOARD_HOST = 'boards.greenhouse.io'

  # Company names for the self-hosted career pages in PAGES.
  EXTERNAL_COMPANIES = {
    'https://robinhood.com/us/en/careers/openings/' => 'Robinhood',
    'https://careers.masterclass.com/' => 'MasterClass'
  }.freeze

  # Seconds to wait after fetching a board page / before fetching a posting.
  PAGE_DELAY = 10
  JOB_DELAY = 20

  HEADING_TAGS = %w[h1 h2 h3 h4 h5 h6].freeze

  # A department section is crawled when its heading contains any of these
  # (case-sensitive substring match).
  DEPARTMENT_KEYWORDS = [
    'Design', 'Engineering', 'Product', 'Operations', 'Infrastructure',
    'Quality', 'Security', 'UX', 'Services', 'Support',
    'Business Systems and IT', 'Data', 'Machine Learning', 'R&D', 'IT',
    '02 Algorithm', '08 IT / InfoSec', 'Engineer', 'Technology',
    'Research & Development', 'Engenharia', 'Operações', 'Cybersecurity',
    'DevOps', 'Agile', 'Lead', 'Cyber'
  ].freeze

  # Titles containing any of these (case-sensitive) are treated as mid-level
  # or senior and are not stored.
  SENIOR_TITLE_KEYWORDS = %w[
    Senior Mid Sr Director Technical Manager Lead Staff Head Principal Chief
    Architect Applied
  ].freeze

  # Ordered [group label, keywords, case_sensitive] rows. Order matters: it
  # fixes the order of the stored tech_stack and grouping arrays. The
  # case-sensitive rows hold short tokens that would be too noisy if matched
  # case-insensitively; their padding/punctuation is part of the matcher and
  # is stored as-is.
  TECH_GROUPS = [
    ['Tech Stacks', ['JamStack', 'M.E.R.N', 'LAMP', 'ASP.NET', 'M.E.A.N', 'MEVN', 'Ruby on Rails'], false],
    ['FrameWorks', %w[flask React Django laravel angularJS Vue expressJs cakePHP jQuery Bootstrap CodeIgnitor Drupal NextJS FastAPI node npm Revel], false],
    ['Programming Languages', %w[Python Javascript Typescript PHP C# C++ Swift Perl MATLAB Dart Kotlin Golang Bash VBA], false],
    ['Database', %w[SQL NoSQL Oracle Cassandra RDBMs MySQL PostgreSQL MongoDB Mongoose Vitess], false],
    ['Design', ['figma', 'InDesign', 'PhotoShop', 'illustrator', 'Adobe Creative Cloud Products', 'UI/UX'], false],
    ['Data Analytics', ['apache spark', 'apache kafka', 'apache flink', 'Kafka', 'Spark', 'Jupyter', 'Looker', 'Tableau', 'Metabase', 'ClickHouse'], false],
    ['Deployment', ['Google Cloud', 'S3', 'Kubernetes', 'Apache', 'Nginx', 'HAproxy', 'Ansible', 'Puppet', 'Chef', 'OpenAPI', 'GraphQL', 'Docker', 'K8S', 'Terraform', 'Hadoop'], false],
    ['Networks', ['TCP/IP', 'DNS', 'HTTP', 'Active Directory'], false],
    ['Operating Systems', ['linux', 'macOS', 'OS X', 'Windows', 'Ubuntu', 'Android', 'iOS'], false],
    ['Vanilla Skills', %w[HTML CSS HTML5 ES6 SaaS SCSS], false],
    ['CI/CD', %w[Github Gitlab Jenkins BitBucket], false],
    ['Data Analytics', ['BI'], true],
    ['Programming Languages', ['Java ', ' C ', ', C,', 'C ', 'C,', 'Rust', 'Scala', 'Go '], true],
    ['Design', %w[Sketch UI UX], true],
    ['CI/CD', ['Git '], true],
    ['Deployment', %w[API AWS], true]
  ].freeze

  # Keywords that are stored verbatim as a tag when found in the description.
  LITERAL_TAGS = %w[Government Financial Nonprofit Javascript].freeze

  # Industry label => keywords; a label is stored once if any keyword is found.
  # Hash order is the order in which labels are appended.
  INDUSTRY_GROUPS = {
    'FAANGM+' => %w[Netflix Google Apple Facebook Amazon Microsoft LinkedIn Uber Lyft],
    'AI' => ['MLFlow', 'MLOps', 'ml-ops', 'ml-based', 'Machine Learning', 'kubernetes'],
    'Financial' => ['shopify', 'cryptocurrency', 'cryptofinance', 'deloitte', 'accenture', 'citi', 'jpmc', 'Wells Fargo', 'credit', 'e-commerce'],
    'Startups' => %w[bespoke MVP],
    'Sports' => %w[soccer NBA NHL NASCAR],
    'Education' => %w[teachers educational],
    'Retail' => %w[jcpenney macys retail],
    'Cloud' => %w[docker azure aws cloud],
    'DevOps' => %w[Agile Lean DevOps],
    'Toys' => %w[lego mattel hasbro],
    'Telecommunications' => %w[t-mobile at&t verizon],
    'Food' => %w[cuisine pizzas dining cater miller],
    'Medical' => %w[nursing post-surgical medically],
    'Cybersecurity' => %w[cybersecurity security],
    'Data' => %w[data information]
  }.freeze

  # "3 years", "10+ yrs", "3-5 years", "2 to 4 years"; captures the first
  # (lowest) number of the expression.
  YEARS_PATTERN = /(\d+)\s*(?:\+|(?:-|\u2013|to)\s*\d+)?\s*(?:years?|yrs?)\b/i

  # Log banners, reproduced exactly as the task has always printed them
  # (the "Skipped" tech-stack banner differs slightly from the "Added" one).
  ADDED_STACKS_BANNER = '>>>>>>>>>>>>>>>>>>>>>>>> Tech Stacks <<<<<<<<<<<<<<<<<<<<<<<<<'
  SKIPPED_STACKS_BANNER = '>>>>>>>>>>>>>>>>>>>>>> Tech Stacks <<<<<<<<<<<<<<<<<<<<<<<<<<'
  GROUPING_BANNER = '>>>>>>>>>>>>>>>>>>>>>>>> Grouping <<<<<<<<<<<<<<<<<<<<<<<<<'
  TAGS_BANNER = '>>>>>>>>>>>>>>>>>>>>>>>> Tags <<<<<<<<<<<<<<<<<<<<<<<<<'

  # Fetches +url+ and parses it as HTML.
  #
  # Returns the parsed document, or nil (after a warning on stderr) when the
  # request fails, so one dead board or posting does not abort the whole crawl.
  # The block form of URI.open closes the underlying IO.
  def fetch_document(url)
    URI.open(url) { |io| Nokogiri::HTML(io) }
  rescue OpenURI::HTTPError, SocketError, SystemCallError, Timeout::Error => e
    warn "Skipping #{url}: #{e.class}: #{e.message}"
    nil
  end

  # Company for a crawl page: the first path segment on Greenhouse-hosted
  # boards, a fixed name for the known self-hosted pages, nil otherwise.
  def company_for(page)
    parts = page.split('/')
    return parts[3] if parts[2].to_s.include?(BOARD_HOST)

    known = EXTERNAL_COMPANIES.find { |prefix, _name| page.include?(prefix) }
    known && known.last
  end

  # Absolute URL of a posting given the page it was listed on and the link's
  # href. Relative hrefs are resolved against +page+; already-absolute hrefs
  # are returned unchanged (blindly prefixing the board host would corrupt
  # them). Returns nil when the href is missing, blank or not a valid URI.
  def posting_url(page, href)
    target = href.to_s.strip
    return nil if target.empty?

    URI.join(page, target).to_s
  rescue URI::InvalidURIError
    nil
  end

  # Text of the highest-level heading (h1..h6) inside +section+, or "" when
  # the section has no non-empty heading.
  def department_name(section)
    HEADING_TAGS.each do |tag|
      text = section.search(tag).text
      return text unless text.empty?
    end
    ''
  end

  # True when the department heading contains one of DEPARTMENT_KEYWORDS.
  def relevant_department?(department)
    DEPARTMENT_KEYWORDS.any? { |keyword| department.include?(keyword) }
  end

  # True when the job title contains one of SENIOR_TITLE_KEYWORDS.
  def senior_title?(title)
    SENIOR_TITLE_KEYWORDS.any? { |keyword| title.include?(keyword) }
  end

  # Smallest number of years mentioned in +text+ ("10+ years" => 10,
  # "3-5 years" => 3), or 0 when no such expression is present.
  def years_of_experience(text)
    text.to_s.scan(YEARS_PATTERN).flatten.map(&:to_i).min.to_i
  end

  # Scans a description for technology keywords.
  #
  # Returns [stacks, grouping]: every matched keyword in table order, and the
  # distinct group labels those keywords belong to in first-match order.
  # Matching is by substring, so short keywords can match inside longer words.
  def tech_stack(description)
    text = description.to_s
    lowered = text.downcase
    stacks = []
    grouping = []
    TECH_GROUPS.each do |label, keywords, case_sensitive|
      hits = keywords.select do |keyword|
        case_sensitive ? text.include?(keyword) : lowered.include?(keyword.downcase)
      end
      next if hits.empty?

      grouping << label unless grouping.include?(label)
      stacks.concat(hits)
    end
    [stacks, grouping]
  end

  # Scans a description for industry keywords and returns the matching tags:
  # first any LITERAL_TAGS found, then one label per matching INDUSTRY_GROUPS
  # entry. Both sides are downcased before comparing; previously the
  # description was downcased but the keywords were not, so every keyword
  # containing a capital letter could never match.
  # NOTE: matching is by substring, so e.g. "Uber" also matches "kubernetes".
  def industry_tags(description)
    lowered = description.to_s.downcase
    tags = LITERAL_TAGS.select { |tag| lowered.include?(tag.downcase) }
    INDUSTRY_GROUPS.each do |label, keywords|
      next if tags.include?(label)

      tags << label if keywords.any? { |keyword| lowered.include?(keyword.downcase) }
    end
    tags
  end

  # First line of a log entry for the job described by +attrs+.
  def headline(prefix, attrs)
    "#{prefix}: #{attrs[:title]}:#{attrs[:company]}:#{attrs[:location]}:#{attrs[:url]}#{attrs[:description]}"
  end

  # Console message printed after a job has been stored.
  def added_message(attrs)
    [
      headline('Added', attrs), '',
      ADDED_STACKS_BANNER, attrs[:tech_stack].to_s, '',
      GROUPING_BANNER, attrs[:grouping].to_s, '',
      TAGS_BANNER, attrs[:tags].to_s
    ].join("\n")
  end

  # Console message printed when a job is already in the database.
  def skipped_message(attrs)
    [
      headline('Skipped', attrs), '',
      SKIPPED_STACKS_BANNER, attrs[:tech_stack].to_s,
      GROUPING_BANNER, attrs[:grouping].to_s, '',
      TAGS_BANNER, attrs[:tags].to_s
    ].join("\n")
  end
end

# Crawls every page in GreenhouseScraper::PAGES, walks the department sections
# whose heading looks technical, and stores each non-senior opening that is not
# already in the database as a Job (with extracted years of experience, tech
# stack, stack grouping and industry tags).
task greenhouse: :environment do
  # Required here rather than at file level so that merely loading the rake
  # file does not need the scraping libraries.
  require 'open-uri'
  require 'nokogiri'

  scraper = GreenhouseScraper

  scraper::PAGES.each do |page|
    board = scraper.fetch_document(page)
    sleep(scraper::PAGE_DELAY)
    next unless board

    company = scraper.company_for(page)

    board.search('section.level-0').each do |section|
      next unless scraper.relevant_department?(scraper.department_name(section))

      section.search('div.opening').each do |opening|
        title = opening.search('a').text.strip
        # Decide on seniority before fetching: senior postings are never
        # stored, so there is no point waiting for and downloading them.
        next if scraper.senior_title?(title)

        link = opening.search('a').first
        url = scraper.posting_url(page, link && link['href'])
        next unless url

        location = opening.search('span').text.strip

        sleep(scraper::JOB_DELAY)
        posting = scraper.fetch_document(url)
        next unless posting

        description = posting.search('div#content').text

        # Postings from a page whose company could not be determined are
        # stored untagged.
        stacks, grouping = company ? scraper.tech_stack(description) : [[], []]
        tags = company ? scraper.industry_tags(description) : []

        attrs = {
          portal_id: 1,
          title: title,
          location: location,
          company: company,
          job_board: 'Greenhouse',
          description: description,
          yearsofexperience: scraper.years_of_experience(description),
          url: url,
          tags: tags,
          tech_stack: stacks,
          grouping: grouping
        }

        if Job.exists?(title: title, location: location, company: company, url: url)
          puts scraper.skipped_message(attrs)
        else
          Job.create(attrs)
          puts scraper.added_message(attrs)
        end
      end
    end
  end
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment