A simple Job crawler for bdjobs
import csv
from functools import reduce

from bs4 import BeautifulSoup
from requests import post


def soup_to_dict(dt):
    """Convert the HTML soup of a single job listing into a dictionary."""
    main_site = 'http://jobs.bdjobs.com/'
    data_dict = {}
    job_title = dt.find('div', {'class': 'job-title-text'})
    # Listing links are relative, so prefix the site root to get a full URL.
    data_dict['job_link'] = main_site + job_title.find('a', {'href': True})['href']
    data_dict['job_title_text'] = job_title.text.strip()
    data_dict['company_name'] = dt.find('div', {'class': 'comp-name-text'}).text.strip()
    data_dict['education'] = dt.find('div', {'class': 'edu-text-d'}).text.strip()
    data_dict['experience'] = dt.find('div', {'class': 'exp-text-d'}).text.strip()
    data_dict['deadline'] = dt.find('div', {'class': 'dead-text-d'}).text.strip()
    return data_dict
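
# For illustration, a minimal sketch of the markup soup_to_dict expects and
# the dict it produces. The class names mirror the selectors above; the
# sample values and the job id are invented:
#
#     <div class="norm-jobs-wrapper">
#       <div class="job-title-text"><a href="jobdetails.asp?id=123">Software Engineer</a></div>
#       <div class="comp-name-text">Example Ltd.</div>
#       <div class="edu-text-d">B.Sc. in CSE</div>
#       <div class="exp-text-d">2 to 4 years</div>
#       <div class="dead-text-d">Oct 10, 2016</div>
#     </div>
#
# soup_to_dict on that block returns:
#
#     {'job_link': 'http://jobs.bdjobs.com/jobdetails.asp?id=123',
#      'job_title_text': 'Software Engineer',
#      'company_name': 'Example Ltd.',
#      'education': 'B.Sc. in CSE',
#      'experience': '2 to 4 years',
#      'deadline': 'Oct 10, 2016'}
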
def crawler(page=1, category_id=8, get_page=False):
    """Crawl one page of the category with the given category_id.

    By default fetches the first page of the IT section (category 8).
    With get_page=True, returns the total page count instead of job data.
    """
    url = 'http://jobs.bdjobs.com/jobsearch.asp'
    # The search form is submitted via POST; every field except the category
    # and the page number is left at its default value.
    post_data = {
        'Country': '0',
        'MPostings': '',
        'Newspaper': '0',
        'fcat': '{}'.format(category_id),
        'fcatId': '{}'.format(category_id),
        'hidJobSearch': 'JobSearch',
        'hidOrder': '',
        'iCat': '0',
        'pg': '{}'.format(page),
        'qAge': '0',
        'qDeadline': '0',
        'qExp': '0',
        'qJobLevel': '0',
        'qJobNature': '0',
        'qJobSpecialSkill': '-1',
        'qOT': '0',
        'qPosted': '0',
        'txtsearch': '',
        'ver': ''
    }
    soup = BeautifulSoup(post(url, data=post_data).content, 'html.parser')
    if get_page:
        # The last <li> in the pagination bar holds the total page count,
        # padded with dots, so strip the dots before parsing the number.
        return int(soup.find('div', {'id': 'topPagging'}).find_all('li')[-1].text.strip().replace('.', ''))
    html_data = soup.find_all('div', {'class': 'norm-jobs-wrapper'})
    return [soup_to_dict(dt) for dt in html_data]
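
# Usage sketch (these calls hit the live site, so the values below are
# illustrative only):
#
#     crawler(get_page=True)            # e.g. 42 -- total pages in the IT category
#     crawler(page=2, category_id=8)    # list of job dicts from page 2 of IT
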
def scrapper(filename="output.csv", category=8):
    """Get all jobs of a specific category and write them to a CSV file."""
    column_names = ['job_title_text', 'company_name', 'education', 'experience', 'deadline', 'job_link']
    pages = crawler(category_id=category, get_page=True)
    data = reduce(lambda x, y: x + y, [crawler(page=pg, category_id=category) for pg in range(1, pages + 1)])
    # newline='' stops the csv module from emitting blank rows on Windows.
    with open(filename, 'w', newline='') as output:
        csv_writer = csv.DictWriter(output, fieldnames=column_names)
        # Write a human-readable header row, then one row per job dict.
        csv_writer.writerow({'job_title_text': 'Job Title',
                             'company_name': 'Company Name',
                             'education': 'Education',
                             'experience': 'Experience',
                             'deadline': 'Deadline',
                             'job_link': 'Job Link'})
        csv_writer.writerows(data)
    print('Successfully completed!')
    print('Total jobs: {0}'.format(len(data)))
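
To run the scraper, import the module and call scrapper. A minimal sketch, assuming the gist is saved as bdjobs_crawler.py; the output filename is illustrative, and 8 is the IT category used as the default above:

    from bdjobs_crawler import scrapper

    # Crawl every page of the IT category and dump the rows to it_jobs.csv.
    scrapper(filename='it_jobs.csv', category=8)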