Skip to content

Instantly share code, notes, and snippets.

@aljiwala
Created July 25, 2018 13:27
Show Gist options
  • Save aljiwala/aa805a1b3da090ecdd91dfb9ab519d30 to your computer and use it in GitHub Desktop.
Save aljiwala/aa805a1b3da090ecdd91dfb9ab519d30 to your computer and use it in GitHub Desktop.
Extract courses from http://education-india.in
import requests
from time import sleep
from bs4 import BeautifulSoup
# Listing endpoint; course detail links found on a listing page are appended
# to this same URL.
_BASE_URL = 'http://education-india.in/Education/Courses/'

# Cells whose text starts with one of these are skipped when a row is parsed.
_IGNORED_PREFIXES = ('Total Record', 'Showing Page No', '[First] [Prev]')

# Seconds to wait for the server before giving up on a single request.
_REQUEST_TIMEOUT = 30


def _get_soup(url):
    """Fetch *url* and return its body parsed with ``html.parser``.

    Raises:
        requests.RequestException: on connection problems, on timeout, or
            (via ``raise_for_status``) on a 4xx/5xx response.
    """
    resp = requests.get(url, timeout=_REQUEST_TIMEOUT)
    # Fail loudly on an error page instead of trying to parse it as a listing.
    resp.raise_for_status()
    return BeautifulSoup(resp.text, 'html.parser')


def _course_short_name(href, course):
    """Return the course short name from its detail page, or ``None``.

    The heading of the ``table.detail`` element is read, the full course
    name and the literal ``' Details'`` are removed from it, and one
    character is trimmed from each end of what remains.
    ``None`` is returned when the detail table or its heading is missing.
    """
    detail = _get_soup(_BASE_URL + href).find('table', class_='detail')
    if detail is None or detail.th is None:
        return None
    heading = detail.th.text.strip()
    # NOTE(review): the [1:-1] trim presumably drops surrounding brackets
    # around the short name; confirm against a live detail page.
    return heading.replace(course, '').replace(' Details', '')[1:-1]


def _parse_row(row):
    """Convert one ``<tr>`` of the listing table into a dict of course fields.

    Cells are mapped by position: 0 -> ``sr_no``, 1 -> ``course`` (plus
    ``course_href`` and, when available, ``course_sn``), 2 -> ``duration``,
    3 -> ``eligibility``. Any further cells are ignored. An empty dict is
    returned when every cell was skipped.
    """
    record = {}
    for index, td in enumerate(row.find_all('td')):
        text = td.text.strip()
        if text.startswith(_IGNORED_PREFIXES):
            continue

        if index == 0:
            record['sr_no'] = text
        elif index == 1:
            record['course'] = text
            link = td.a
            # A course cell without an anchor simply has no detail page.
            href = link.get('href', '') if link is not None else ''
            record['course_href'] = href
            if href:
                short_name = _course_short_name(href, text)
                if short_name is not None:
                    record['course_sn'] = short_name
        elif index == 2:
            record['duration'] = text
        elif index == 3:
            record['eligibility'] = text
    return record


def extract_from_ed_india(pages=None, delay=2):
    """Scrape the course listing of education-india.in.

    For every listing page the third ``<table>`` of the document is read,
    header rows (those containing a ``<th>``) are skipped and each remaining
    row is converted into a dict. For each course that has a link, its
    detail page is fetched as well to obtain the short name.

    Progress is printed per extracted row, and the complete list is printed
    at the end after three blank lines.

    Args:
        pages: iterable of page numbers to fetch. Defaults to pages 1-31.
        delay: seconds to sleep after each listing page has been processed.

    Returns:
        The list of extracted course dicts, in page and row order.

    Raises:
        requests.RequestException: if any page cannot be fetched.
        IndexError: if a listing page has fewer than three tables.
    """
    if pages is None:
        pages = range(1, 32)

    final = []
    for page in pages:
        soup = _get_soup('{}?PageNumber={}'.format(_BASE_URL, page))
        rows = soup.find_all('table')[2].find_all('tr')
        for row in rows:
            if row.th:
                continue

            record = _parse_row(row)
            if record:
                # .get(): a row may yield fields without a serial number.
                print('sr_no: {}, extracted.'.format(record.get('sr_no')))
                final.append(record)

        # Pause between listing pages to go easy on the server.
        sleep(delay)

    print()
    print()
    print()
    print(final)
    return final
def main():
    """Run the scraper, then terminate the interpreter with a success status.

    Raises:
        SystemExit: always, once the extraction has finished.
    """
    # Function-scope import keeps this block self-contained.
    import sys

    extract_from_ed_india()
    # sys.exit() rather than the bare exit(): the latter is a convenience
    # that the ``site`` module installs for interactive sessions and is not
    # available under ``python -S`` or in some embedded/frozen interpreters.
    sys.exit()
# Script entry point: run main() only when this file is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment