Skip to content

Instantly share code, notes, and snippets.

@nautatva
Created July 27, 2019 06:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nautatva/d02da67e1d2b6bf7d0c2c66e989b758a to your computer and use it in GitHub Desktop.
Save nautatva/d02da67e1d2b6bf7d0c2c66e989b758a to your computer and use it in GitHub Desktop.
Scrape the IIT-B internship blog and extract each job posting (name, date, category, stipend, resume type) into a CSV file.
import requests
from bs4 import BeautifulSoup, Comment
import csv
from enum import Enum
# CSV columns: Job number, Job name, date opened, category, stipend, resume type
# Conf: script configuration and mutable per-job fields (reset after each row).
DEBUG = 1          # 1 = scrape a local saved copy; 0 = hit the live blog with LDAP auth
ldap_user = ''     # LDAP credentials for the live placement blog (fill in before running)
ldap_pass = ''
jobNumber = 1      # running counter written as the first CSV column
# Per-job fields filled by populate() / the main loop, then reset after each row.
name = ''
date = ''
cat = ''
stipend = ''
resume = ''
extra = ''
paged = 1          # current page number of the paginated blog
if DEBUG == 1:
    # Local mirror of a single blog page — no auth, no pagination.
    url = "http://localhost/internship-blog/Internship-Blog-2018-19.html"
    response = requests.get(url)
else:
    # Live blog: paginated via ?paged=N, behind HTTP basic auth.
    # NOTE(review): verify=False disables TLS certificate checking — confirm this is intentional.
    base_url = "http://placements.iitb.ac.in/trainingblog/blog18/?paged="
    url = base_url + str(paged)
    response = requests.get(url, verify=False, auth=(ldap_user, ldap_pass))
def populate(div):
    """Walk the children of a job entry tag and copy the category, stipend
    and resume-type strings into the module-level per-job fields.

    Recurses into nested <div>s; any tag with no single string child (and
    that is not a div) is stashed in the module-level ``extra`` column.
    """
    # BUG FIX: the original declared only name/date/cat/stipend/resume as
    # global, so `extra = tag` created a local and the module-level `extra`
    # was never populated. `extra` is now included in the declaration.
    global name, date, cat, stipend, resume, extra
    # Known category codes, checked in the same priority order as before.
    category_codes = ("i1", "i2", "i3", "i4", "i5", "u1", "u2")
    for tag in div:
        if tag == '\n':
            continue  # bare newline between tags — nothing to parse
        elif tag.string is None and tag.name != 'div':
            # No single string child: keep the raw tag for the "extra" column.
            extra = tag
            print(tag, '\n', "tag is None")
        elif tag.name == "div":
            populate(tag)  # recurse into nested wrapper divs
        else:
            # Hoisted: the original re-evaluated tag.string.lower() per branch.
            text = tag.string.lower()
            if "category" in text:
                for code in category_codes:
                    if code in text:
                        cat = code.upper()
                        break
                else:
                    cat = tag.string  # unrecognised category: keep raw text
            elif "stipend" in text:
                stipend = tag.string
            elif "resume" in text:
                resume = tag.string
if __name__ == "__main__":
    # Walk blog pages until a 404 (or the DEBUG / page-1000 safety breaks),
    # appending one CSV row per job whose IAF has just opened.
    while response.status_code != 404:
        print("Analysing page ", paged)
        # Parse the page and pick out the published job entries.
        content = BeautifulSoup(response.content, "html.parser")
        jobs = content.findAll("div", attrs={"class": "status-publish"})
        for job in jobs:
            for tag in job:
                if tag == '\n':
                    continue  # Tag is empty
                elif tag.name == "h2":
                    # Only process jobs whose IAF has just opened.
                    if "iaf" in tag.string.lower() and "open" in tag.string.lower():
                        name = tag.string
                        continue
                    else:
                        break  # Don't process this job
                elif tag.name == "small":
                    date = tag.contents[0]  # posting date
                    continue
                else:
                    if "entry" in tag["class"]:
                        populate(tag)  # fills cat/stipend/resume/extra
            if name != '':
                # BUG FIX: str.replace returns a new string; the original
                # discarded the result, so commas were never removed.
                # str() also guards against a non-string bs4 Tag in `extra`.
                name = str(name).replace(",", "-")
                date = str(date).replace(",", "-")
                cat = str(cat).replace(",", "-")
                stipend = str(stipend).replace(",", "-")
                resume = str(resume).replace(",", "-")
                extra = str(extra).replace(",", "-")
                row = [str(jobNumber), name, date, cat, stipend, resume, extra]
                # `with` closes the file on exit; the explicit close() was redundant.
                with open('data.csv', 'a') as csvFile:
                    writer = csv.writer(csvFile)
                    writer.writerow(row)
                jobNumber = jobNumber + 1
                # Reset the per-job fields for the next entry.
                name = ''
                date = ''
                cat = ''
                stipend = ''
                resume = ''
                extra = ''
        # DEBUG_BREAK
        if DEBUG == 1:
            # Single Page
            break
        else:
            if paged == 1000:
                break  # hard safety cap on pagination
        # done analysis, switch to next response
        paged = paged + 1
        url = base_url + str(paged)
        response = requests.get(
            url, verify=False, auth=(ldap_user, ldap_pass))
        print('\n')
    print('\n', "Analysed till page ", paged)
@ShubhamKaudewar
Copy link

Machaya Nautatva !!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment