@Guri-ksolves
Created August 28, 2017 14:23
web scraping
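# This script pulls job titles out of a paginated job-search results page (the
# .jobtitle and div.pagination selectors suggest an Indeed-style listing) and
# stores them in PostgreSQL through ActiveRecord.
#
# Hypothetical invocation (the file name and URLs below are placeholders, not
# part of the gist):
#   ruby scraper.rb "https://www.example.com/jobs?q=ruby&l=noida" "https://www.example.com"
# ARGV[0] is the first results page to fetch; ARGV[1] is the site root that is
# prepended to the relative pagination hrefs.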
require 'httparty'
require 'pry'
require 'json'
require 'nokogiri'
require 'csv'
require 'active_record'
require 'mechanize'
# database: web_scraper (tables: companies and jobs)
ActiveRecord::Base.establish_connection(
  adapter: 'postgresql',
  host: 'localhost',
  encoding: 'unicode',
  database: 'web_scraper'
)
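# Note: establish_connection only opens a connection; the web_scraper database
# itself must already exist (for example, created beforehand with PostgreSQL's
# `createdb web_scraper`).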
# Define the schema: a companies table and a jobs table
# (force: true drops and recreates both tables on every run)
ActiveRecord::Schema.define do
  create_table :companies, force: true do |t|
    t.string  :name
    t.integer :no_of_jobs
    t.string  :location
  end

  create_table :jobs, force: true do |t|
    t.string :title
    t.belongs_to :company, index: true
  end
end
# Company model: one company has many jobs
class Company < ActiveRecord::Base
  has_many :jobs
end

# Job model: each job belongs to a company
class Job < ActiveRecord::Base
  belongs_to :company
end
company = Company.create(name: 'infosys', location: 'Noida')

# ARGV[0] is the URL of the first results page to scrape
current = ARGV[0]
page = HTTParty.get(current)
parse_page = Nokogiri::HTML(page.body)

# Each search result sits in a .row element; the job title is in .jobtitle
parse_page.css('.row').each do |row|
  title = row.css('.jobtitle').text.strip
  company.jobs.create!(title: title)
end
# Pagination links are relative hrefs; ARGV[1] is the site root used to turn
# them into absolute URLs
all_links = parse_page.css('div.pagination a').map { |link| link['href'] }
base = ARGV[1]
nxt = base + all_links.last

while nxt
  doc = HTTParty.get(nxt)
  parse_page = Nokogiri::HTML(doc.body)

  parse_page.css('.row').each do |row|
    title = row.css('.jobtitle').text.strip
    company.jobs.create!(title: title)
    puts title
  end

  # Follow the last pagination link on each page; stop once a .np
  # (next/previous) marker shows up
  all_links = parse_page.css('div.pagination a').map { |link| link['href'] }
  temp = base + all_links.last
  nxt = parse_page.css('.np').empty? ? temp : nil
end
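# The script requires csv but never uses it; a natural follow-up once the loop
# finishes is dumping the scraped titles to a file. A minimal sketch (the output
# file name jobs.csv is arbitrary):
CSV.open('jobs.csv', 'w') do |csv|
  csv << %w[company title]
  Job.includes(:company).find_each do |job|
    csv << [job.company&.name, job.title]
  end
end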