@uroybd
Created September 25, 2016 04:25
A simple job crawler for bdjobs
import csv
from functools import reduce

from requests import post
from bs4 import BeautifulSoup


def soup_to_dict(dt):
    """Convert the HTML soup of a single job listing into a dictionary."""
    main_site = 'http://jobs.bdjobs.com/'
    data_dict = {}
    job_title = dt.find('div', {'class': 'job-title-text'})
    data_dict['job_link'] = main_site + job_title.find('a', {'href': True})['href']
    data_dict['job_title_text'] = job_title.text.strip()
    data_dict['company_name'] = dt.find('div', {'class': 'comp-name-text'}).text.strip()
    data_dict['education'] = dt.find('div', {'class': 'edu-text-d'}).text.strip()
    data_dict['experience'] = dt.find('div', {'class': 'exp-text-d'}).text.strip()
    data_dict['deadline'] = dt.find('div', {'class': 'dead-text-d'}).text.strip()
    return data_dict
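
# A returned dictionary looks roughly like this (values are illustrative, not
# taken from a real listing):
# {'job_link': 'http://jobs.bdjobs.com/jobdetails.asp?id=...',
#  'job_title_text': 'Software Engineer',
#  'company_name': 'Example Ltd.',
#  'education': 'B.Sc. in CSE',
#  'experience': 'At least 2 years',
#  'deadline': 'Oct 15, 2016'}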

def crawler(page=1, category_id=8, get_page=False):
    """Crawl one page of the category identified by category_id.

    By default this fetches the first page of the IT section (category 8).
    When get_page is True, return the total number of result pages instead
    of the job listings themselves.
    """
    url = 'http://jobs.bdjobs.com/jobsearch.asp'
    # Form fields mirroring the site's search form; fcat/fcatId select the
    # category and pg selects the result page.
    post_data = {
        'Country': '0',
        'MPostings': '',
        'Newspaper': '0',
        'fcat': '{}'.format(category_id),
        'fcatId': '{}'.format(category_id),
        'hidJobSearch': 'JobSearch',
        'hidOrder': '',
        'iCat': '0',
        'pg': '{}'.format(page),
        'qAge': '0',
        'qDeadline': '0',
        'qExp': '0',
        'qJobLevel': '0',
        'qJobNature': '0',
        'qJobSpecialSkill': '-1',
        'qOT': '0',
        'qPosted': '0',
        'txtsearch': '',
        'ver': ''
    }
    soup = BeautifulSoup(post(url, data=post_data).content, 'html.parser')
    if get_page is True:
        # The last item of the top pagination bar holds the last page number.
        return int(soup.find('div', {'id': 'topPagging'}).find_all('li')[-1].text.strip().replace('.', ''))
    else:
        html_data = soup.find_all('div', {'class': 'norm-jobs-wrapper'})
        return [soup_to_dict(dt) for dt in html_data]
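
# For example (illustrative calls): crawler(get_page=True) returns the total
# page count for the default IT category, while crawler(page=2) returns the
# list of job dictionaries parsed from the second page of results.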

def scrapper(filename="output.csv", category=8):
    """Fetch every job of a category and write the results to a CSV file."""
    column_names = ['job_title_text', 'company_name', 'education', 'experience', 'deadline', 'job_link']
    pages = crawler(category_id=category, get_page=True)
    # Flatten the per-page lists of job dictionaries into a single list.
    data = reduce(lambda x, y: x + y, [crawler(page=pg, category_id=category) for pg in range(1, pages + 1)])
    with open(filename, 'w', newline='') as output:
        csv_writer = csv.DictWriter(output, fieldnames=column_names)
        # Write a human-readable header row instead of the raw field names.
        csv_writer.writerow({'job_title_text': 'Job Title',
                             'company_name': 'Company Name',
                             'education': 'Education',
                             'experience': 'Experience',
                             'deadline': 'Deadline',
                             'job_link': 'Job Link'})
        csv_writer.writerows(data)
    print('Successfully completed!')
    print('Total jobs: {0}'.format(len(data)))
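

if __name__ == '__main__':
    # Usage sketch: scrape the default IT category (8) into jobs.csv.
    # The output filename and the category id here are only illustrative;
    # any valid bdjobs category id should work the same way.
    scrapper(filename='jobs.csv', category=8)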