Last active
October 8, 2019 15:02
-
-
Save kanevk/58c688849ba6d9e06d27a7d3e25566d7 to your computer and use it in GitHub Desktop.
Crawlers for fetching insights into the Sofia dev market
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'bundler/inline' | |
gemfile do | |
source 'https://rubygems.org' | |
gem 'wombat' | |
gem 'pry' | |
end | |
puts 'Gems installed and loaded!' | |
# Per-technology result: number of crawled job titles and number of titles
# left after filtering.
TECHNOLOGY_STATS = Struct.new(:jobs, :filtred_jobs)

# Verbose output is enabled by passing any command-line argument.
DEBUG = !ARGV.empty?

# A title matching any of these patterns is discarded as noise.
# The `/QA/` pattern is case-sensitive, unlike the first one.
NOISE_REGEXS = [/devops|scien|test|research|assurance|frontend|front-end|стаж/i, /QA/].freeze

# A title is counted for a technology only if it matches that technology's pattern.
REGEX_PER_TECHOLOGY = {
  ruby: /ruby|rails|web|fullstack/i,
  python: /python|django|web|fullstack/i,
  nodejs: /node|javascript|web|fullstack/i,
  dotnet: /net|c#|web|fullstack/i
}.freeze

# Multiplied by the page index to build the `frompage` offset of a jobs.bg search.
JOBSBG_ITEMS_PAGE_SIZE = 15

# Collected results keyed by technology; written to at runtime, so left unfrozen.
STATS = {}
# Crawls `pages_count` result pages for a technology, filters the job titles
# and stores the totals in STATS.
#
# @param technology [Symbol] key of REGEX_PER_TECHOLOGY (raises KeyError if unknown)
# @param pages_count [Integer] number of pages to crawl; page indexes start at 0
# @param crawl_page [#call] called with a page index; must return a Hash with a 'jobs' key
# @return [TECHNOLOGY_STATS] the entry stored under STATS[technology]
def find_job_titles(technology, pages_count:, crawl_page:)
  # Exclusive range: pages are 0-indexed, so `0..pages_count` would request
  # one page more than `pages_count` asks for.
  all_job_titles = (0...pages_count).flat_map { |page| crawl_page.call(page).fetch('jobs') }
  p all_job_titles if DEBUG

  # Keep the titles relevant to the technology, then drop the noisy ones.
  technology_regex = REGEX_PER_TECHOLOGY.fetch(technology)
  jobs_titles_without_noise =
    all_job_titles
    .grep(technology_regex)
    .reject { |title| NOISE_REGEXS.any? { |noise| title.match?(noise) } }
  p jobs_titles_without_noise if DEBUG
  puts if DEBUG

  STATS[technology] = TECHNOLOGY_STATS.new(all_job_titles.count, jobs_titles_without_noise.count)
end
# Ruby

# Requests one jobs.bg search result page for the `ruby` / `rails` keywords.
# `page` is a page index; it is turned into the `frompage` offset of the query.
# Returns the result of Wombat.crawl, from which find_job_titles reads 'jobs'.
def find_jobsbg_ruby_page(page)
  offset = page * JOBSBG_ITEMS_PAGE_SIZE
  search_path = "/front_job_search.php?frompage=#{offset}&distance=0&location_sid=1&all_categories=0&all_type=0&position_level%5B0%5D=8&position_level%5B1%5D=9&all_company_type=1&keywords%5B0%5D=ruby&keywords%5B1%5D=rails&keyword=&csrf_token=ONiOhzWBxZ0ya_xq5qW6BF80oLzUQdkchM2PnM0T2Go&subm=1#paging"

  Wombat.crawl do
    base_url 'https://www.jobs.bg'
    path search_path
    jobs({ css: '.offerslistRow .joblink' }, :list)
  end
end

find_job_titles(:ruby, pages_count: 4, crawl_page: method(:find_jobsbg_ruby_page))
# Python

# Requests one jobs.bg search result page for the `python` / `django` keywords.
# `page` is a page index; it is turned into the `frompage` offset of the query.
# Returns the result of Wombat.crawl, from which find_job_titles reads 'jobs'.
def find_jobsbg_python_page(page)
  offset = page * JOBSBG_ITEMS_PAGE_SIZE
  search_path = "/front_job_search.php?frompage=#{offset}&zone_id=0&distance=0&location_sid=1&all_categories=0&all_type=0&position_level%5B0%5D=8&position_level%5B1%5D=9&all_company_type=1&keywords%5B0%5D=python&keywords%5B1%5D=django&keyword=&csrf_token=ONiOhzWBxZ0ya_xq5qW6BF80oLzUQdkchM2PnM0T2Go&last=0&subm=1#paging"

  Wombat.crawl do
    base_url 'https://www.jobs.bg'
    path search_path
    jobs({ css: '.offerslistRow .joblink' }, :list)
  end
end

find_job_titles(:python, pages_count: 18, crawl_page: method(:find_jobsbg_python_page))
# NodeJS

# Requests one jobs.bg search result page for the Node.js keywords.
# `page` is a page index; it is turned into the `frompage` offset of the query.
# Returns the result of Wombat.crawl, from which find_job_titles reads 'jobs'.
def find_jobsbg_nodejs_page(page)
  offset = page * JOBSBG_ITEMS_PAGE_SIZE
  # The keywords are `node`, `node js` and `nodejs`. `javascript` is left out
  # on purpose, because frontend positions are treated as noise in this research.
  search_path = "/front_job_search.php?frompage=#{offset}&zone_id=0&distance=0&location_sid=1&all_categories=0&all_type=0&position_level%5B0%5D=8&position_level%5B1%5D=9&all_company_type=1&keywords%5B0%5D=node&keywords%5B1%5D=node+js&keywords%5B2%5D=nodejs&keyword=&csrf_token=zWKZg6WleAio7bGWHn6BsRW0HRiQpivf6zVPwacHanI&last=0&email=&subscribe=1&subm=1#paging"

  Wombat.crawl do
    base_url 'https://www.jobs.bg'
    path search_path
    jobs({ css: '.offerslistRow .joblink' }, :list)
  end
end

find_job_titles(:nodejs, pages_count: 10, crawl_page: method(:find_jobsbg_nodejs_page))
# .NET

# Requests one jobs.bg search result page for the .NET keywords.
# `page` is a page index; it is turned into the `frompage` offset of the query.
# Returns the result of Wombat.crawl, from which find_job_titles reads 'jobs'.
def find_jobsbg_dotnet_page(page)
  offset = page * JOBSBG_ITEMS_PAGE_SIZE
  # The keywords are `.net`, `.net developer` and `.net c#`.
  search_path = "/front_job_search.php?frompage=#{offset}&zone_id=0&distance=0&location_sid=1&all_categories=0&all_type=0&position_level%5B0%5D=8&position_level%5B1%5D=9&all_company_type=1&keywords%5B0%5D=.net&keywords%5B1%5D=.net+developer&keywords%5B2%5D=.net+c%23&keyword=&csrf_token=zWKZg6WleAio7bGWHn6BsRW0HRiQpivf6zVPwacHanI&last=0&email=&subscribe=1&subm=1#paging"

  Wombat.crawl do
    base_url 'https://www.jobs.bg'
    path search_path
    jobs({ css: '.offerslistRow .joblink' }, :list)
  end
end

find_job_titles(:dotnet, pages_count: 12, crawl_page: method(:find_jobsbg_dotnet_page))
# Report: after the heading, print each technology name followed by its stats
# struct, with a blank line before every entry.
puts 'JobsBG jobs'
STATS.each_pair do |technology, stats|
  puts
  puts technology
  puts stats
end
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env ruby | |
require 'bundler/inline'
require 'uri'
gemfile do | |
source 'https://rubygems.org' | |
gem 'wombat' | |
gem 'pry' | |
end | |
puts 'Gems installed and loaded!' | |
# Per-technology result: total number of jobs reported by the search and the
# estimated number of jobs left after filtering.
TECHNOLOGY_STATS = Struct.new(:jobs, :filtred_jobs)

# Verbose output is enabled by passing any command-line argument.
DEBUG = !ARGV.empty?

# A title matching any of these patterns is discarded as noise.
# The `/QA/` pattern is case-sensitive, unlike the first one.
NOISE_REGEXS = [/devops|scien|test|research|assurance|frontend|front-end|стаж/i, /QA/].freeze

# A title is counted for a technology only if it matches that technology's pattern.
REGEX_PER_TECHOLOGY = {
  ruby: /ruby|rails|web/i,
  python: /python|django|web/i,
  nodejs: /node|javascript|web/i,
  dotnet: /net|c#|web/i
}.freeze

# Collected results keyed by technology; written to at runtime, so left unfrozen.
STATS = {}
# Runs a crawler once and estimates how many of the reported jobs are relevant.
#
# The share of relevant titles on the first result page is applied to the
# total job count reported by the search, and the outcome is stored in STATS.
#
# @param technology [Symbol] key of REGEX_PER_TECHOLOGY (raises KeyError if unknown)
# @param crawler [#call] called without arguments; must return a Hash with
#   'jobs' (list of titles) and 'jobs_count' (total count) keys
# @return [TECHNOLOGY_STATS] the entry stored under STATS[technology]; its
#   `filtred_jobs` is a Float estimate
def find_job_titles(technology, crawler:)
  result = crawler.call
  first_page_jobs = result.fetch('jobs')
  # Drop thousands separators first: "1,234".to_i stops at the comma and yields 1.
  all_count = result.fetch('jobs_count').to_s.delete(',').to_i
  p first_page_jobs if DEBUG

  # Keep the titles relevant to the technology, then drop the noisy ones.
  first_page_jobs_without_noise =
    first_page_jobs
    .grep(REGEX_PER_TECHOLOGY.fetch(technology))
    .reject { |title| NOISE_REGEXS.any? { |noise| title.match?(noise) } }
  p first_page_jobs_without_noise if DEBUG

  # An empty first page would make the ratio 0.0 / 0, i.e. NaN; report 0.0 instead.
  relevant_share =
    if first_page_jobs.empty?
      0.0
    else
      first_page_jobs_without_noise.count.to_f / first_page_jobs.count
    end

  STATS[technology] = TECHNOLOGY_STATS.new(all_count, all_count * relevant_share)
end
# LinkedIn search (shared by every technology)

# Fetches the first page of LinkedIn job search results for Sofia.
# Despite its name, this crawler is called for all technologies, not only Ruby.
#
# @param keywords_string [String] raw search keywords, e.g. 'ruby developer'
# @return the result of Wombat.crawl; find_job_titles reads the 'jobs' and
#   'jobs_count' keys from it
def linkedin_ruby_crawler(keywords_string)
  # Escape the keywords so spaces and reserved characters (&, #, +) cannot
  # corrupt the query string.
  keywords = URI.encode_www_form_component(keywords_string)
  search_path = "/jobs/search?keywords=#{keywords}&location=Sofia%2C%20Sofia%20City%2C%20Bulgaria&trk=guest_job_search_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0"

  Wombat.crawl do
    base_url 'https://bg.linkedin.com'
    path search_path
    jobs({ css: '.result-card__full-card-link' }, :list)
    jobs_count({ css: '.results-context-header__job-count' })
  end
end
# Search keywords sent to LinkedIn for each technology, in reporting order.
{
  ruby: 'ruby developer',
  python: 'python',
  nodejs: 'node developer',
  dotnet: '.net developer'
}.each do |technology, keywords|
  find_job_titles(technology, crawler: -> { linkedin_ruby_crawler(keywords) })
end
# REPORT
# Prints one summary line per technology, each preceded by a blank line.
puts 'LinkedIn jobs'
STATS.each_pair do |technology, stat|
  summary = "#{technology} - All jobs are #{stat.jobs}, filtred jobs are #{stat.filtred_jobs}"
  puts
  puts summary
end
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment