Created
November 17, 2021 17:40
-
-
Save KakoozaJerry/90a14ca5b4326718fd0bcd6929532f18 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pure helpers and lookup tables used by the `greenhouse` rake task below.
#
# Everything except `fetch_document` works on plain strings, so it can be
# exercised without network or database access.
module GreenhouseScraper
  # Job-board index pages that the task walks, in order.
  BOARD_PAGES = [
    "https://boards.greenhouse.io/remotecom",
    "https://boards.greenhouse.io/alt",
    "https://boards.greenhouse.io/gitlab",
    "https://boards.greenhouse.io/github",
    "https://boards.greenhouse.io/gocardless",
    "https://boards.greenhouse.io/carta",
    "https://boards.greenhouse.io/twilio",
    "https://boards.greenhouse.io/tusimple",
    "https://boards.greenhouse.io/sweeten",
    "https://boards.greenhouse.io/calm",
    "https://boards.greenhouse.io/capsulecares",
    "https://boards.greenhouse.io/nubank",
    "https://boards.greenhouse.io/oyster",
    "https://boards.greenhouse.io/invision",
    "https://boards.greenhouse.io/shopee",
    "https://boards.greenhouse.io/genius",
    "https://boards.greenhouse.io/databento",
    "https://boards.greenhouse.io/ndb",
    "https://boards.greenhouse.io/nuvocargo",
    "https://boards.greenhouse.io/greenthumbindustries",
    "https://boards.greenhouse.io/shearersinternalmobility",
    "https://boards.greenhouse.io/webflow",
    "https://boards.greenhouse.io/careem",
    "https://boards.greenhouse.io/mastercardfoundation",
    "https://boards.greenhouse.io/cerebral",
    "https://boards.greenhouse.io/cazoo",
    "https://boards.greenhouse.io/gomedia",
    "https://boards.greenhouse.io/jobsforthefuture",
    "https://boards.greenhouse.io/axios",
    "https://boards.greenhouse.io/crescolabs",
    "https://boards.greenhouse.io/circleci",
    "https://boards.greenhouse.io/pokemoncareers",
    "https://boards.greenhouse.io/chief",
    "https://boards.greenhouse.io/bucknerinternational",
    "https://boards.greenhouse.io/amedigital",
    "https://boards.greenhouse.io/patreon",
    "https://boards.greenhouse.io/monzo",
    "https://robinhood.com/us/en/careers/openings/",
    "https://boards.greenhouse.io/codefresh",
    "https://boards.greenhouse.io/hudl",
    "https://careers.masterclass.com/",
    "https://boards.greenhouse.io/soluto",
    "https://boards.greenhouse.io/duolingo",
    "https://boards.greenhouse.io/recordedfuture",
    "https://boards.greenhouse.io/fabric",
    "https://boards.greenhouse.io/tatari",
    "https://boards.greenhouse.io/slicareers",
    "https://boards.greenhouse.io/freenow",
    "https://boards.greenhouse.io/adyen",
    "https://boards.greenhouse.io/samsungresearchamericainternship",
    "https://boards.greenhouse.io/anaconda",
    "https://boards.greenhouse.io/buzzfeed",
    "https://boards.greenhouse.io/reddit"
  ].freeze

  # Pauses (in seconds) between requests; values are the ones the task has
  # always used.
  BOARD_DELAY = 10
  DEPARTMENT_DELAY = 10
  POSTING_DELAY = 20

  # A department section is scraped only if its heading contains one of
  # these substrings (case-sensitive).
  TECH_DEPARTMENT_KEYWORDS = [
    "Design", "Engineering", "Product", "Operations", "Infrastructure",
    "Quality", "Security", "UX", "Services", "Support",
    "Business Systems and IT", "Data", "Machine Learning", "R&D", "IT",
    "02 Algorithm", "08 IT / InfoSec", "Engineer", "Technology",
    "Research & Development", "Engenharia", "Operações", "Cybersecurity",
    "DevOps", "Agile", "Lead", "Cyber"
  ].freeze

  # Postings whose title contains one of these substrings (case-sensitive)
  # are treated as mid-level or senior and are not saved.
  SENIOR_TITLE_KEYWORDS = [
    "Senior", "Mid", "Sr", "Director", "Technical", "Manager", "Lead",
    "Staff", "Head", "Principal", "Chief", "Architect", "Applied"
  ].freeze

  # Technology keywords, scanned in this order. `case_sensitive: false`
  # entries are compared after downcasing both sides; `case_sensitive: true`
  # entries are short tokens ("C ", "UI", "Go ") that are matched verbatim.
  STACK_GROUPS = [
    { name: "Tech Stacks", case_sensitive: false,
      tags: ["JamStack", "M.E.R.N", "LAMP", "ASP.NET", "M.E.A.N", "MEVN", "Ruby on Rails"] },
    { name: "FrameWorks", case_sensitive: false,
      tags: ["flask", "React", "Django", "laravel", "angularJS", "Vue", "expressJs", "cakePHP", "jQuery",
             "Bootstrap", "CodeIgnitor", "Drupal", "NextJS", "FastAPI", "node", "npm", "Revel"] },
    { name: "Programming Languages", case_sensitive: false,
      tags: ["Python", "Javascript", "Typescript", "PHP", "C#", "C++", "Swift", "Perl", "MATLAB", "Dart",
             "Kotlin", "Golang", "Bash", "VBA"] },
    { name: "Database", case_sensitive: false,
      tags: ["SQL", "NoSQL", "Oracle", "Cassandra", "RDBMs", "MySQL", "PostgreSQL", "MongoDB", "Mongoose",
             "Vitess"] },
    { name: "Design", case_sensitive: false,
      tags: ["figma", "InDesign", "PhotoShop", "illustrator", "Adobe Creative Cloud Products", "UI/UX"] },
    { name: "Data Analytics", case_sensitive: false,
      tags: ["apache spark", "apache kafka", "apache flink", "Kafka", "Spark", "Jupyter", "Looker", "Tableau",
             "Metabase", "ClickHouse"] },
    { name: "Deployment", case_sensitive: false,
      tags: ["Google Cloud", "S3", "Kubernetes", "Apache", "Nginx", "HAproxy", "Ansible", "Puppet", "Chef",
             "OpenAPI", "GraphQL", "Docker", "K8S", "Terraform", "Hadoop"] },
    { name: "Networks", case_sensitive: false,
      tags: ["TCP/IP", "DNS", "HTTP", "Active Directory"] },
    { name: "Operating Systems", case_sensitive: false,
      tags: ["linux", "macOS", "OS X", "Windows", "Ubuntu", "Android", "iOS"] },
    { name: "Vanilla Skills", case_sensitive: false,
      tags: ["HTML", "CSS", "HTML5", "ES6", "SaaS", "SCSS"] },
    { name: "CI/CD", case_sensitive: false,
      tags: ["Github", "Gitlab", "Jenkins", "BitBucket"] },
    { name: "Data Analytics", case_sensitive: true, tags: ["BI"] },
    { name: "Programming Languages", case_sensitive: true,
      tags: ["Java ", " C ", ", C,", "C ", "C,", "Rust", "Scala", "Go "] },
    { name: "Design", case_sensitive: true, tags: ["Sketch", "UI", "UX"] },
    { name: "CI/CD", case_sensitive: true, tags: ["Git "] },
    { name: "Deployment", case_sensitive: true, tags: ["API", "AWS"] }
  ].freeze

  # Keywords that are stored as tags under their own name when found.
  LITERAL_TAGS = ["Government", "Financial", "Nonprofit", "Javascript"].freeze

  # Category tag => keywords that trigger it. Insertion order is the order
  # in which categories are appended to a job's tag list.
  CATEGORY_KEYWORDS = {
    "FAANGM+" => ["Netflix", "Google", "Apple", "Facebook", "Amazon", "Microsoft", "LinkedIn", "Uber", "Lyft"],
    "AI" => ["MLFlow", "MLOps", "ml-ops", "ml-based", "Machine Learning", "kubernetes"],
    "Financial" => ["shopify", "cryptocurrency", "cryptofinance", "deloitte", "accenture", "citi", "jpmc",
                    "Wells Fargo", "credit", "e-commerce"],
    "Startups" => ["bespoke", "MVP"],
    "Sports" => ["soccer", "NBA", "NHL", "NASCAR"],
    "Education" => ["teachers", "educational"],
    "Retail" => ["jcpenney", "macys", "retail"],
    "Cloud" => ["docker", "azure", "aws", "cloud"],
    "DevOps" => ["Agile", "Lean", "DevOps"],
    "Toys" => ["lego", "mattel", "hasbro"],
    "Telecommunications" => ["t-mobile", "at&t", "verizon"],
    "Food" => ["cuisine", "pizzas", "dining", "cater", "miller"],
    "Medical" => ["nursing", "post-surgical", "medically"],
    "Cybersecurity" => ["cybersecurity", "security"],
    "Data" => ["data", "information"]
  }.freeze

  # Matches "5 years", "5+ yrs", "3-5 years", "3 to 5 years"; the capture
  # is the first (lower-bound) number.
  YEARS_PATTERN = /\b(\d{1,2})\s*(?:\+|(?:[-\u2013]|to)\s*\d{1,2}\s*\+?)?\s*(?:years?|yrs?)\b/i.freeze

  # Console banners. The "skipped" variant has always differed slightly from
  # the "added" one; both are kept verbatim so existing log scraping still works.
  ADDED_STACK_BANNER = '>>>>>>>>>>>>>>>>>>>>>>>> Tech Stacks <<<<<<<<<<<<<<<<<<<<<<<<<'
  SKIPPED_STACK_BANNER = '>>>>>>>>>>>>>>>>>>>>>> Tech Stacks <<<<<<<<<<<<<<<<<<<<<<<<<<'
  GROUPING_BANNER = '>>>>>>>>>>>>>>>>>>>>>>>> Grouping <<<<<<<<<<<<<<<<<<<<<<<<<'
  TAGS_BANNER = '>>>>>>>>>>>>>>>>>>>>>>>> Tags <<<<<<<<<<<<<<<<<<<<<<<<<'

  module_function

  # Downloads +url+ and parses it as HTML.
  #
  # @param url [String] absolute http(s) URL
  # @return [Nokogiri::HTML::Document, nil] nil (after a warning on stderr)
  #   when the request fails, so one dead board or posting does not abort
  #   the whole run
  def fetch_document(url)
    Nokogiri::HTML(URI.open(url))
  rescue OpenURI::HTTPError, SocketError, SystemCallError, Timeout::Error => e
    warn "Could not fetch #{url}: #{e.class}: #{e.message}"
    nil
  end

  # Text of the first non-empty heading level (h1..h6) inside +section+.
  #
  # @param section [#search] node whose `search(tag)` result responds to `text`
  # @return [String] heading text, or "" when the section has no headings
  def department_name(section)
    %w[h1 h2 h3 h4 h5 h6].each do |tag|
      heading = section.search(tag).text
      return heading unless heading.empty?
    end
    ''
  end

  # @param department [String]
  # @return [Boolean] whether the heading names a department worth scraping
  def tech_department?(department)
    TECH_DEPARTMENT_KEYWORDS.any? { |keyword| department.include?(keyword) }
  end

  # @param job_title [String]
  # @return [Boolean] whether the title looks mid-level or senior
  def senior_title?(job_title)
    SENIOR_TITLE_KEYWORDS.any? { |keyword| job_title.include?(keyword) }
  end

  # Company label for a board page: the first path segment for
  # boards.greenhouse.io pages, a fixed name for the two custom career sites.
  #
  # @param page [String] board index URL
  # @return [String, nil] nil for a page that matches none of the known shapes
  def company_for(page)
    parts = page.split('/')
    if parts[2].to_s.include?("boards.greenhouse.io")
      parts[3]
    elsif page.include?("https://robinhood.com/us/en/careers/openings/")
      "Robinhood"
    elsif page.include?('https://careers.masterclass.com/')
      "MasterClass"
    end
  end

  # Resolves a scraped link against the board page it came from.
  #
  # The href is untrusted page content. `URI.open` hands anything that is not
  # a URL to `Kernel#open`, so only absolute http/https results are returned.
  # Site-relative hrefs resolve to the same string the old
  # 'https://boards.greenhouse.io' + href concatenation produced.
  #
  # @param page [String] board index URL
  # @param href [String, nil] raw href attribute
  # @return [String, nil] absolute URL, or nil when the href is missing,
  #   malformed, or not http(s)
  def posting_url(page, href)
    return nil if href.nil? || href.strip.empty?

    resolved = URI.join(page, href.strip)
    %w[http https].include?(resolved.scheme) ? resolved.to_s : nil
  rescue URI::InvalidURIError
    nil
  end

  # Smallest "N years" figure mentioned in the description; for a range such
  # as "3-5 years" the lower bound counts.
  #
  # @param description [String, nil]
  # @return [Integer] 0 when no figure is found
  def years_of_experience(description)
    description.to_s.scan(YEARS_PATTERN).flatten.map(&:to_i).min || 0
  end

  # Technology keywords found in the description.
  #
  # Matching is by substring, so short keywords can hit inside longer words.
  #
  # @param description [String, nil]
  # @return [Array(Array<String>, Array<String>)] matched keywords in scan
  #   order, and the distinct group names in first-seen order
  def tech_stack(description)
    text = description.to_s
    folded = text.downcase
    stacks = []
    grouping = []
    STACK_GROUPS.each do |group|
      hits = group[:tags].select do |tag|
        group[:case_sensitive] ? text.include?(tag) : folded.include?(tag.downcase)
      end
      next if hits.empty?

      grouping << group[:name] unless grouping.include?(group[:name])
      stacks.concat(hits)
    end
    [stacks, grouping]
  end

  # Industry/category tags for the description.
  #
  # Both the text and the keyword are downcased before comparing, so
  # capitalised keywords such as "Google" or "Machine Learning" can match.
  # Matching is by substring.
  #
  # @param description [String, nil]
  # @return [Array<String>] literal tags first, then category tags in the
  #   order of CATEGORY_KEYWORDS, without duplicates
  def industry_tags(description)
    folded = description.to_s.downcase
    tags = LITERAL_TAGS.select { |tag| folded.include?(tag.downcase) }
    CATEGORY_KEYWORDS.each do |category, keywords|
      next if tags.include?(category)

      tags << category if keywords.any? { |keyword| folded.include?(keyword.downcase) }
    end
    tags
  end

  # Console message for a processed posting.
  #
  # @param status [Symbol] :added, anything else is reported as skipped
  # @param summary [String] "title:company:location:url" followed by the description
  # @param stacks [Array<String>]
  # @param grouping [Array<String>]
  # @param tags [Array<String>]
  # @return [String]
  def report(status, summary, stacks, grouping, tags)
    added = status == :added
    label = added ? 'Added: ' : 'Skipped: '
    stack_banner = added ? ADDED_STACK_BANNER : SKIPPED_STACK_BANNER
    gap = added ? "\n\n" : "\n"
    "#{label}#{summary}\n\n#{stack_banner}\n#{stacks}#{gap}" \
      "#{GROUPING_BANNER}\n#{grouping}\n\n#{TAGS_BANNER}\n#{tags}"
  end
end

# Scrapes the configured job boards and stores junior-level technical
# postings as Job records (Job is defined elsewhere in the application).
#
# For every board page: each `section.level-0` whose heading looks technical
# is walked, every `div.opening` inside it is resolved to a posting URL, the
# posting page is fetched, and years of experience, tech stack and industry
# tags are derived from the text of `div#content`.
task greenhouse: :environment do
  require 'open-uri'
  require 'nokogiri'

  GreenhouseScraper::BOARD_PAGES.each do |page|
    board = GreenhouseScraper.fetch_document(page)
    sleep(GreenhouseScraper::BOARD_DELAY)
    next unless board

    company = GreenhouseScraper.company_for(page)

    board.search('section.level-0').each do |section|
      department = GreenhouseScraper.department_name(section)
      next unless GreenhouseScraper.tech_department?(department)

      sleep(GreenhouseScraper::DEPARTMENT_DELAY)
      section.search('div.opening').each do |opening|
        links = opening.search('a')
        job_title = links.text.strip

        # Only jobs that are not mid-level or senior are saved; decide that
        # before spending a delay and a request on the posting page.
        next if GreenhouseScraper.senior_title?(job_title)

        first_link = links.first
        url = GreenhouseScraper.posting_url(page, first_link && first_link['href'])
        unless url
          warn "Skipping #{job_title.inspect} on #{page}: no usable posting link"
          next
        end

        location = opening.search('span').text.strip

        sleep(GreenhouseScraper::POSTING_DELAY)
        posting = GreenhouseScraper.fetch_document(url)
        next unless posting

        description = posting.search('div#content').text
        years_value = GreenhouseScraper.years_of_experience(description)

        # Keyword extraction has always been skipped for pages without a
        # recognised company.
        stacks, grouping = company ? GreenhouseScraper.tech_stack(description) : [[], []]
        tags = company ? GreenhouseScraper.industry_tags(description) : []

        summary = "#{job_title}:#{company}:#{location}:#{url}#{description}"

        if Job.where(title: job_title, location: location, company: company, url: url).exists?
          puts GreenhouseScraper.report(:skipped, summary, stacks, grouping, tags)
        else
          Job.create(
            portal_id: 1,
            title: job_title,
            location: location,
            company: company,
            job_board: "Greenhouse",
            description: description,
            yearsofexperience: years_value,
            url: url,
            tags: tags,
            tech_stack: stacks,
            grouping: grouping
          )
          puts GreenhouseScraper.report(:added, summary, stacks, grouping, tags)
        end
      end
    end
  end
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment