A simple Job crawler for bdjobs
import csv
from functools import reduce

from bs4 import BeautifulSoup
from requests import post


def soup_to_dict(dt):
    """Convert the HTML soup of a single job listing into a dictionary."""
    main_site = 'http://jobs.bdjobs.com/'
    data_dict = {}
    job_title = dt.find('div', {'class': 'job-title-text'})
    # Listing links are relative, so prefix the site root to get a full URL.
    data_dict['job_link'] = main_site + job_title.find('a', {'href': True})['href']
    data_dict['job_title_text'] = job_title.text.strip()
    data_dict['company_name'] = dt.find('div', {'class': 'comp-name-text'}).text.strip()
    data_dict['education'] = dt.find('div', {'class': 'edu-text-d'}).text.strip()
    data_dict['experience'] = dt.find('div', {'class': 'exp-text-d'}).text.strip()
    data_dict['deadline'] = dt.find('div', {'class': 'dead-text-d'}).text.strip()
    return data_dict
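
# For illustration, a minimal sketch of the markup soup_to_dict expects and
# the dict it produces. The class names mirror the selectors above; the
# sample values and the job id are invented:
#
#     <div class="norm-jobs-wrapper">
#       <div class="job-title-text"><a href="jobdetails.asp?id=123">Software Engineer</a></div>
#       <div class="comp-name-text">Example Ltd.</div>
#       <div class="edu-text-d">B.Sc. in CSE</div>
#       <div class="exp-text-d">2 to 4 years</div>
#       <div class="dead-text-d">Oct 10, 2016</div>
#     </div>
#
# soup_to_dict on that block returns:
#
#     {'job_link': 'http://jobs.bdjobs.com/jobdetails.asp?id=123',
#      'job_title_text': 'Software Engineer',
#      'company_name': 'Example Ltd.',
#      'education': 'B.Sc. in CSE',
#      'experience': '2 to 4 years',
#      'deadline': 'Oct 10, 2016'}
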
def crawler(page=1, category_id=8, get_page=False):
    """Crawl one page of the category with the given category_id.

    By default fetches the first page of the IT section (category 8).
    With get_page=True, returns the total page count instead of job data.
    """
    url = 'http://jobs.bdjobs.com/jobsearch.asp'
    # The search form is submitted via POST; every field except the category
    # and the page number is left at its default value.
    post_data = {
        'Country': '0',
        'MPostings': '',
        'Newspaper': '0',
        'fcat': '{}'.format(category_id),
        'fcatId': '{}'.format(category_id),
        'hidJobSearch': 'JobSearch',
        'hidOrder': '',
        'iCat': '0',
        'pg': '{}'.format(page),
        'qAge': '0',
        'qDeadline': '0',
        'qExp': '0',
        'qJobLevel': '0',
        'qJobNature': '0',
        'qJobSpecialSkill': '-1',
        'qOT': '0',
        'qPosted': '0',
        'txtsearch': '',
        'ver': ''
    }
    soup = BeautifulSoup(post(url, data=post_data).content, 'html.parser')
    if get_page:
        # The last <li> in the pagination bar holds the total page count,
        # padded with dots, so strip the dots before parsing the number.
        return int(soup.find('div', {'id': 'topPagging'}).find_all('li')[-1].text.strip().replace('.', ''))
    html_data = soup.find_all('div', {'class': 'norm-jobs-wrapper'})
    return [soup_to_dict(dt) for dt in html_data]
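
# Usage sketch (these calls hit the live site, so the values below are
# illustrative only):
#
#     crawler(get_page=True)            # e.g. 42 -- total pages in the IT category
#     crawler(page=2, category_id=8)    # list of job dicts from page 2 of IT
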
def scrapper(filename="output.csv", category=8):
    """Get all jobs of a specific category and write them to a CSV file."""
    column_names = ['job_title_text', 'company_name', 'education', 'experience', 'deadline', 'job_link']
    pages = crawler(category_id=category, get_page=True)
    data = reduce(lambda x, y: x + y, [crawler(page=pg, category_id=category) for pg in range(1, pages + 1)])
    # newline='' stops the csv module from emitting blank rows on Windows.
    with open(filename, 'w', newline='') as output:
        csv_writer = csv.DictWriter(output, fieldnames=column_names)
        # Write a human-readable header row, then one row per job dict.
        csv_writer.writerow({'job_title_text': 'Job Title',
                             'company_name': 'Company Name',
                             'education': 'Education',
                             'experience': 'Experience',
                             'deadline': 'Deadline',
                             'job_link': 'Job Link'})
        csv_writer.writerows(data)
    print('Successfully completed!')
    print('Total jobs: {0}'.format(len(data)))
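
To run the scraper, import the module and call scrapper. A minimal sketch, assuming the gist is saved as bdjobs_crawler.py; the output filename is illustrative, and 8 is the IT category used as the default above:

    from bdjobs_crawler import scrapper

    # Crawl every page of the IT category and dump the rows to it_jobs.csv.
    scrapper(filename='it_jobs.csv', category=8)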