Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
Scrape internship blog to get data in CSV
import requests
from bs4 import BeautifulSoup, Comment
import csv
from enum import Enum
# Output CSV columns (one row per job, in this order):
#   job number, job name, date opened, category, stipend, resume type, extra

# --- Configuration ---
# DEBUG == 1 fetches a single locally served copy of the blog; any other
# value fetches the live blog using the LDAP credentials below.
DEBUG = 1
ldap_user = ''
ldap_pass = ''

# --- Scraper state (module globals shared with populate() and the driver) ---
jobNumber = 1  # number written in the first CSV column of the next row
paged = 1      # page number appended to base_url
# Fields of the job currently being parsed; all start out empty.
name = date = cat = stipend = resume = extra = ''

# Fetch the first page up front; the driver loop starts from `response`.
if DEBUG != 1:
    base_url = "http://placements.iitb.ac.in/trainingblog/blog18/?paged="
    url = base_url + str(paged)
    # NOTE(review): verify=False switches off TLS certificate verification
    # while credentials are being sent -- confirm this is intentional.
    response = requests.get(url, auth=(ldap_user, ldap_pass), verify=False)
else:
    url = "http://localhost/internship-blog/Internship-Blog-2018-19.html"
    response = requests.get(url)
def populate(div):
    """Scan the children of ``div`` and record job details in module globals.

    Each child is handled as follows:

    * a bare ``'\\n'`` is skipped;
    * a nested ``div`` is scanned recursively;
    * an element whose ``.string`` is ``None`` is kept as ``extra``
      (its text content), replacing any earlier value;
    * any other node is classified by its text: text mentioning
      "category" sets ``cat`` (normalised to I1-I5 / U1-U2 when one of
      those codes appears, otherwise the raw text), text mentioning
      "stipend" sets ``stipend``, text mentioning "resume" sets ``resume``.

    Args:
        div: an iterable of parsed nodes exposing ``.name`` and ``.string``
            (a BeautifulSoup tag in the calling script).

    Returns:
        None. Results are written to the globals ``cat``, ``stipend``,
        ``resume`` and ``extra``.
    """
    # `extra` must be declared global too; otherwise the assignment below
    # only binds a local and the value never reaches the CSV row.
    global name, date, cat, stipend, resume, extra
    category_codes = ("i1", "i2", "i3", "i4", "i5", "u1", "u2")
    for tag in div:
        if tag == '\n':
            continue
        if tag.name == "div":
            populate(tag)
            continue
        if tag.string is None:
            # No single string to classify: keep the element's text as a
            # plain str so it can be written to the CSV like other fields.
            extra = tag.get_text(" ", strip=True)
            print(tag, '\n', "tag is None")
            continue

        text = tag.string.lower()
        if "category" in text:
            for code in category_codes:
                if code in text:
                    cat = code.upper()
                    break
            else:
                # Unknown category code: keep the original text verbatim.
                cat = tag.string
        elif "stipend" in text:
            stipend = tag.string
        elif "resume" in text:
            resume = tag.string
if __name__ == "__main__":
    # Driver: parse the current `response`, append one CSV row per post whose
    # heading mentions both "iaf" and "open", then fetch the next page.
    # Stops on a 404, after page 1000, or after one page when DEBUG == 1.
    while response.status_code != 404:
        print("Analysing page ", paged)
        # analyse and write into csv
        content = BeautifulSoup(response.content, "html.parser")
        jobs = content.find_all("div", attrs={"class": "status-publish"})
        for job in jobs:
            # Start every post from a clean slate so fields collected for an
            # earlier post can never leak into this one.
            name = date = cat = stipend = resume = extra = ''
            for tag in job:
                if tag == '\n':
                    continue  # Tag is empty
                tag_name = getattr(tag, "name", None)
                if tag_name is None:
                    # Stray text node (whitespace, comment): it has no
                    # attributes, so there is nothing to inspect.
                    continue
                if tag_name == "h2":
                    # .string is None when the heading has several children;
                    # fall back to the concatenated text in that case.
                    title = tag.string if tag.string is not None else tag.get_text()
                    lowered = title.lower()
                    # Check if IAF has just opened
                    if "iaf" in lowered and "open" in lowered:
                        name = title
                        continue
                    break  # Don't process this job
                if tag_name == "small":
                    date = tag.contents[0] if tag.contents else ''
                    continue
                # .get() avoids a KeyError on tags without a class attribute.
                if "entry" in tag.get("class", []):
                    populate(tag)
            if name != '':
                # csv.writer quotes fields containing commas, so the values
                # are written unchanged.
                row = [str(jobNumber), name, date, cat, stipend, resume, extra]
                # newline='' is required by the csv module to avoid blank
                # lines on some platforms; the with-block closes the file.
                with open('data.csv', 'a', newline='', encoding='utf-8') as csvFile:
                    csv.writer(csvFile).writerow(row)
                jobNumber = jobNumber + 1
        # DEBUG_BREAK
        if DEBUG == 1:
            # Single Page
            break
        if paged == 1000:
            break
        # done analysis, switch to next response
        paged = paged + 1
        url = base_url + str(paged)
        # NOTE(review): verify=False disables TLS certificate verification
        # while credentials are sent -- confirm this is intentional.
        response = requests.get(
            url, verify=False, auth=(ldap_user, ldap_pass))
        print('\n')
    print('\n', "Analysed till page ", paged)
@ShubhamKaudewar

This comment has been minimized.

Copy link

ShubhamKaudewar commented Aug 12, 2019

Machaya Nautatva !!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.