Created July 27, 2019 06:52
Scrape the internship blog and export the job data to a CSV
import csv

import requests
from bs4 import BeautifulSoup
# CSV columns: job number, job name, date opened, category, stipend, resume type
# Config: set DEBUG = 0 and fill in the LDAP credentials to hit the live blog
DEBUG = 1
ldap_user = ''
ldap_pass = ''

jobNumber = 1
name = ''
date = ''
cat = ''
stipend = ''
resume = ''
extra = ''
paged = 1

if DEBUG == 1:
    # Debug mode: scrape a locally saved copy of the blog
    url = "http://localhost/internship-blog/Internship-Blog-2018-19.html"
    response = requests.get(url)
else:
    base_url = "http://placements.iitb.ac.in/trainingblog/blog18/?paged="
    url = base_url + str(paged)
    response = requests.get(url, verify=False, auth=(ldap_user, ldap_pass))
def populate(div):
    """Walk a job's entry div recursively and pull out category, stipend and resume type."""
    global name, date, cat, stipend, resume, extra
    for tag in div:
        if tag == '\n':
            continue
        elif tag.string is None and tag.name != 'div':
            # Tag has mixed content; keep its text so it can still go into the CSV row
            extra = str(tag)
            print(tag, '\n', "tag.string is None")
        elif tag.name == "div":
            populate(tag)
        else:
            if "category" in tag.string.lower():
                for code in ("I1", "I2", "I3", "I4", "I5", "U1", "U2"):
                    if code.lower() in tag.string.lower():
                        cat = code
                        break
                else:
                    cat = tag.string
            elif "stipend" in tag.string.lower():
                stipend = tag.string
            elif "resume" in tag.string.lower():
                resume = tag.string
if __name__ == "__main__":
    while response.status_code != 404:
        print("Analysing page ", paged)
        # Analyse the page and write any newly opened jobs into the CSV
        content = BeautifulSoup(response.content, "html.parser")
        jobs = content.find_all("div", attrs={"class": "status-publish"})
        for job in jobs:
            for tag in job:
                if tag == '\n':
                    continue  # Tag is empty
                elif tag.name == "h2":
                    # Only keep posts announcing that an IAF has just opened
                    if "iaf" in tag.string.lower() and "open" in tag.string.lower():
                        name = tag.string
                        continue
                    else:
                        # print("Skipping: ", tag)
                        break  # Don't process this job
                elif tag.name == "small":
                    date = tag.contents[0]
                    continue
                elif tag.name == "div" and "entry" in tag.get("class", []):
                    # Body of the post: recurse for category, stipend and resume type
                    populate(tag)
            if name != '':
                # Replace commas so each field stays in a single column
                name = name.replace(",", "-")
                date = date.replace(",", "-")
                cat = cat.replace(",", "-")
                stipend = stipend.replace(",", "-")
                resume = resume.replace(",", "-")
                extra = extra.replace(",", "-")
                row = [str(jobNumber), name, date, cat, stipend, resume, extra]
                with open('data.csv', 'a', newline='') as csvFile:
                    writer = csv.writer(csvFile)
                    writer.writerow(row)
                jobNumber = jobNumber + 1
                name = ''
                date = ''
                cat = ''
                stipend = ''
                resume = ''
                extra = ''
        # DEBUG_BREAK: in debug mode analyse only a single page
        if DEBUG == 1:
            break
        elif paged == 1000:
            break
        # Done with this page, fetch the next one
        paged = paged + 1
        url = base_url + str(paged)
        response = requests.get(
            url, verify=False, auth=(ldap_user, ldap_pass))
        print('\n')
    print('\n', "Analysed till page ", paged)
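Note: the script appends rows to data.csv without ever writing a header, so the column order has to be read off the comment at the top. Below is a minimal sketch of writing that header once before a run; the first six names come from that comment, while the trailing "Extra" label is an assumption for the leftover-text field that populate() fills.

import csv

# First six names follow the script's own column comment; "Extra" is an
# assumed label for the leftover tag text captured by populate().
HEADER = ["Job number", "Job name", "Date opened",
          "Category", "Stipend", "Resume type", "Extra"]

with open('data.csv', 'w', newline='') as csvFile:
    csv.writer(csvFile).writerow(HEADER)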
Nailed it, Nautatva!!