Skip to content

Instantly share code, notes, and snippets.

@kanevk
Last active October 8, 2019 15:02
Show Gist options
  • Save kanevk/58c688849ba6d9e06d27a7d3e25566d7 to your computer and use it in GitHub Desktop.
Save kanevk/58c688849ba6d9e06d27a7d3e25566d7 to your computer and use it in GitHub Desktop.
Crawlers for fetching insights into the Sofia dev market
#!/usr/bin/env ruby
require 'bundler/inline'
gemfile do
source 'https://rubygems.org'
gem 'wombat'
gem 'pry'
end
puts 'Gems installed and loaded!'
# Per-technology result: total number of crawled titles and how many survived filtering.
TECHNOLOGY_STATS = Struct.new(:jobs, :filtred_jobs)

# Any command-line argument switches on debug printing of the crawled titles.
DEBUG = !!ARGV[0]

# Titles matching any of these patterns are discarded as irrelevant.
# `QA` is deliberately case-sensitive; `стаж` is Bulgarian (presumably "intern/internship").
NOISE_REGEXS = [/devops|scien|test|research|assurance|frontend|front-end|стаж/i, /QA/].freeze

# A title counts for a technology only if it matches that technology's pattern.
# `net` is anchored on word boundaries so that ".NET" / "ASP.NET" still match while
# unrelated words that merely contain "net" (Network, Kubernetes, Internet) do not.
REGEX_PER_TECHOLOGY = {
  ruby: /ruby|rails|web|fullstack/i,
  python: /python|django|web|fullstack/i,
  nodejs: /node|javascript|web|fullstack/i,
  dotnet: /\bnet\b|dotnet|c#|web|fullstack/i
}.freeze

# Number of offers per jobs.bg result page; multiplied by the page index to build `frompage`.
JOBSBG_ITEMS_PAGE_SIZE = 15

# technology => TECHNOLOGY_STATS; filled in by find_job_titles, hence intentionally not frozen.
STATS = {}
# Crawls the result pages for one technology and records how many job titles
# remain after keyword matching and noise removal.
#
# technology  - key of REGEX_PER_TECHOLOGY (an unknown key raises KeyError via `fetch`).
# pages_count - highest page index to request; pages 0..pages_count are crawled.
# crawl_page  - callable taking a page index and returning a hash whose 'jobs'
#               entry is the list of job titles found on that page.
#
# Stores TECHNOLOGY_STATS(total titles, filtered titles) in STATS[technology]
# and returns that struct.
def find_job_titles(technology, pages_count:, crawl_page:)
  # NOTE(review): the inclusive range requests pages_count + 1 pages — confirm this is intended.
  all_job_titles = (0..pages_count).flat_map { |page| crawl_page.call(page).fetch('jobs') }
  p all_job_titles if DEBUG

  # Keep titles that mention the technology, then drop the ones matching any noise pattern.
  jobs_titles_without_noise =
    all_job_titles
      .grep(REGEX_PER_TECHOLOGY.fetch(technology))
      .reject { |title| NOISE_REGEXS.any? { |noise| title.match?(noise) } }
  p jobs_titles_without_noise if DEBUG
  puts if DEBUG

  STATS[technology] = TECHNOLOGY_STATS.new(all_job_titles.count, jobs_titles_without_noise.count)
end
# Ruby
# Crawls a single jobs.bg result page for the `ruby` / `rails` keyword search.
# `page` is a page index; it is turned into the `frompage` item offset.
# Returns the Wombat.crawl result; find_job_titles reads its 'jobs' entry.
def find_jobsbg_ruby_page(page)
  offset = page * JOBSBG_ITEMS_PAGE_SIZE
  search_path = "/front_job_search.php?frompage=#{offset}&distance=0&location_sid=1&all_categories=0&all_type=0&position_level%5B0%5D=8&position_level%5B1%5D=9&all_company_type=1&keywords%5B0%5D=ruby&keywords%5B1%5D=rails&keyword=&csrf_token=ONiOhzWBxZ0ya_xq5qW6BF80oLzUQdkchM2PnM0T2Go&subm=1#paging"

  Wombat.crawl do
    base_url 'https://www.jobs.bg'
    path search_path
    jobs({ css: '.offerslistRow .joblink' }, :list)
  end
end
find_job_titles(:ruby, pages_count: 4, crawl_page: method(:find_jobsbg_ruby_page))
# Python
# Crawls a single jobs.bg result page for the `python` / `django` keyword search.
# `page` is a page index; it is turned into the `frompage` item offset.
# Returns the Wombat.crawl result; find_job_titles reads its 'jobs' entry.
def find_jobsbg_python_page(page)
  offset = page * JOBSBG_ITEMS_PAGE_SIZE
  search_path = "/front_job_search.php?frompage=#{offset}&zone_id=0&distance=0&location_sid=1&all_categories=0&all_type=0&position_level%5B0%5D=8&position_level%5B1%5D=9&all_company_type=1&keywords%5B0%5D=python&keywords%5B1%5D=django&keyword=&csrf_token=ONiOhzWBxZ0ya_xq5qW6BF80oLzUQdkchM2PnM0T2Go&last=0&subm=1#paging"

  Wombat.crawl do
    base_url 'https://www.jobs.bg'
    path search_path
    jobs({ css: '.offerslistRow .joblink' }, :list)
  end
end
find_job_titles(:python, pages_count: 18, crawl_page: method(:find_jobsbg_python_page))
# NodeJS
# Crawls a single jobs.bg result page for the Node.js keyword search.
# `page` is a page index; it is turned into the `frompage` item offset.
# Returns the Wombat.crawl result; find_job_titles reads its 'jobs' entry.
def find_jobsbg_nodejs_page(page)
  offset = page * JOBSBG_ITEMS_PAGE_SIZE
  # The keywords here are `node`, `node js` and `nodejs`, but not `javascript`,
  # because frontend positions are considered noise for this research.
  search_path = "/front_job_search.php?frompage=#{offset}&zone_id=0&distance=0&location_sid=1&all_categories=0&all_type=0&position_level%5B0%5D=8&position_level%5B1%5D=9&all_company_type=1&keywords%5B0%5D=node&keywords%5B1%5D=node+js&keywords%5B2%5D=nodejs&keyword=&csrf_token=zWKZg6WleAio7bGWHn6BsRW0HRiQpivf6zVPwacHanI&last=0&email=&subscribe=1&subm=1#paging"

  Wombat.crawl do
    base_url 'https://www.jobs.bg'
    path search_path
    jobs({ css: '.offerslistRow .joblink' }, :list)
  end
end
find_job_titles(:nodejs, pages_count: 10, crawl_page: method(:find_jobsbg_nodejs_page))
# .NET
# Crawls a single jobs.bg result page for the .NET keyword search.
# `page` is a page index; it is turned into the `frompage` item offset.
# Returns the Wombat.crawl result; find_job_titles reads its 'jobs' entry.
def find_jobsbg_dotnet_page(page)
  offset = page * JOBSBG_ITEMS_PAGE_SIZE
  # The keywords here are `.net`, `.net developer` and `.net c#`.
  search_path = "/front_job_search.php?frompage=#{offset}&zone_id=0&distance=0&location_sid=1&all_categories=0&all_type=0&position_level%5B0%5D=8&position_level%5B1%5D=9&all_company_type=1&keywords%5B0%5D=.net&keywords%5B1%5D=.net+developer&keywords%5B2%5D=.net+c%23&keyword=&csrf_token=zWKZg6WleAio7bGWHn6BsRW0HRiQpivf6zVPwacHanI&last=0&email=&subscribe=1&subm=1#paging"

  Wombat.crawl do
    base_url 'https://www.jobs.bg'
    path search_path
    jobs({ css: '.offerslistRow .joblink' }, :list)
  end
end
find_job_titles(:dotnet, pages_count: 12, crawl_page: method(:find_jobsbg_dotnet_page))
# REPORT
# Prints one human-readable line per technology. STATS is a Hash, so each
# iteration yields the technology key and its stats struct; they are
# destructured here instead of dumping the raw pair.
puts 'JobsBG jobs'
STATS.each do |technology, stat|
  puts
  puts "#{technology} - All jobs are #{stat.jobs}, filtred jobs are #{stat.filtred_jobs}"
end
#!/usr/bin/env ruby
require 'bundler/inline'
require 'uri'
gemfile do
  source 'https://rubygems.org'
  gem 'wombat'
  gem 'pry'
end
puts 'Gems installed and loaded!'
# Per-technology result: total job count and the estimated number of relevant jobs.
TECHNOLOGY_STATS = Struct.new(:jobs, :filtred_jobs)

# Any command-line argument switches on debug printing of the crawled titles.
DEBUG = !!ARGV[0]

# Titles matching any of these patterns are discarded as irrelevant.
# `QA` is deliberately case-sensitive; `стаж` is Bulgarian (presumably "intern/internship").
NOISE_REGEXS = [/devops|scien|test|research|assurance|frontend|front-end|стаж/i, /QA/].freeze

# A title counts for a technology only if it matches that technology's pattern.
# `net` is anchored on word boundaries so that ".NET" / "ASP.NET" still match while
# unrelated words that merely contain "net" (Network, Kubernetes, Internet) do not.
REGEX_PER_TECHOLOGY = {
  ruby: /ruby|rails|web/i,
  python: /python|django|web/i,
  nodejs: /node|javascript|web/i,
  dotnet: /\bnet\b|dotnet|c#|web/i
}.freeze

# technology => TECHNOLOGY_STATS; filled in by find_job_titles, hence intentionally not frozen.
STATS = {}
# Estimates how many of the jobs reported for a technology are relevant.
#
# Only the first result page is inspected: the share of its titles that match
# the technology pattern and are not noise is extrapolated to the total count.
#
# technology - key of REGEX_PER_TECHOLOGY (an unknown key raises KeyError via `fetch`).
# crawler    - callable returning a hash with 'jobs' (list of first-page titles)
#              and 'jobs_count' (the total number of results, as displayed text).
#
# Stores TECHNOLOGY_STATS(total count, estimated relevant count as a Float)
# in STATS[technology] and returns that struct.
def find_job_titles(technology, crawler:)
  result = crawler.call
  first_page_jobs = result.fetch('jobs')
  # Keep only the digits so formatted counts such as "1,234" or "1,000+" are
  # read in full; a plain `to_i` would stop at the first separator.
  all_count = result.fetch('jobs_count').to_s.delete('^0-9').to_i
  p first_page_jobs if DEBUG

  first_page_jobs_without_noise =
    first_page_jobs
      .grep(REGEX_PER_TECHOLOGY.fetch(technology))
      .reject { |title| NOISE_REGEXS.any? { |noise| title.match?(noise) } }
  p first_page_jobs_without_noise if DEBUG

  # An empty first page would otherwise yield 0.0 / 0 => NaN.
  relevant_share =
    if first_page_jobs.empty?
      0.0
    else
      first_page_jobs_without_noise.count.to_f / first_page_jobs.count
    end

  STATS[technology] = TECHNOLOGY_STATS.new(all_count, all_count * relevant_share)
end
# LinkedIn job search
# Crawls the first LinkedIn guest job-search page for Sofia with the given keywords.
# Despite its name, this is used for every technology, not only Ruby.
#
# keywords_string - raw (not URL-encoded) search keywords, e.g. '.net developer'.
#
# Returns the Wombat.crawl result; find_job_titles reads its 'jobs' and
# 'jobs_count' entries.
def linkedin_ruby_crawler(keywords_string)
  # Percent-encode the keywords so characters such as `#`, `&` or spaces cannot
  # break the query string. Spaces become %20 to match the rest of the URL.
  encoded_keywords = URI.encode_www_form_component(keywords_string).gsub('+', '%20')
  search_path = "/jobs/search?keywords=#{encoded_keywords}&location=Sofia%2C%20Sofia%20City%2C%20Bulgaria&trk=guest_job_search_jobs-search-bar_search-submit&redirect=false&position=1&pageNum=0"

  Wombat.crawl do
    base_url 'https://bg.linkedin.com'
    path search_path
    jobs({ css: '.result-card__full-card-link' }, :list)
    jobs_count({ css: '.results-context-header__job-count' })
  end
end
# Collect the stats for every technology, each with its own LinkedIn search keywords.
{
  ruby: 'ruby developer',
  python: 'python',
  nodejs: 'node developer',
  dotnet: '.net developer'
}.each do |technology, keywords|
  find_job_titles(technology, crawler: -> { linkedin_ruby_crawler(keywords) })
end
# REPORT
# One line per technology, each preceded by a blank line.
puts 'LinkedIn jobs'
STATS.each_pair do |technology, stat|
  puts
  puts format('%s - All jobs are %s, filtred jobs are %s', technology, stat.jobs, stat.filtred_jobs)
end
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment