# edison12a/uce_results_scraper.py

## uce_results_scraper.py
# if you want to pull data from an API, requests is your friend
import requests
# BeautifulSoup helps us extract data from html, xml, ..
from bs4 import BeautifulSoup
# this is the type of strings extracted from html
from bs4.element import NavigableString


ug_results = []
# Exclusive upper bounds for the index numbers to probe. Set these to a big
# number like 1000 or 10000 to cover more of the index-number space.
schs = 10  # school numbers 1 .. schs - 1 are tried
stds = 100  # student numbers 1 .. stds - 1 are tried for every school

# Endpoint and request headers used for every lookup (built once, not per request).
url = "https://ereg.uneb.ac.ug/ajax_calls/results_status"
_HEADERS = {
    "accept": "*/*",
    "content-type": "application/x-www-form-urlencoded; charset=UTF-8",
    "x-requested-with": "XMLHttpRequest",
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
}
# Seconds to wait on the server before giving up on a single lookup; without
# a timeout one stalled connection would block the whole run.
_TIMEOUT = 30


def _parse_results(html):
    """Extract one student's grades from a results page.

    Args:
        html: Response body returned for a single index number.

    Returns:
        A dict mapping subject to grade, e.g. ``{'ENG': '4', 'LIT': '7'}``,
        or ``None`` when the page has no ``col-md-10`` results container.
    """
    container = BeautifulSoup(html, 'html.parser').find(class_='col-md-10')
    if container is None:
        return None

    grades = {}
    # The entries are bare strings inside an element that uses <br>s, so walk
    # the direct children and keep only the strings.
    for node in container.children:
        # Exact type check on purpose: isinstance() would also accept
        # NavigableString subclasses such as HTML comments.
        if type(node) is not NavigableString:
            continue
        text = str(node).strip()
        # Strings with a subject and grade, like "ENG : 2", have a length of 7.
        if len(text) != 7:
            continue
        subject, separator, grade = text.partition(' : ')
        # A 7-character string without the separator is not a grade entry;
        # skip it rather than losing the whole student's results.
        if separator:
            grades[subject] = grade
    return grades


# loop over possible school index numbers
for sch in range(1, schs):
    # zero-pad to the 4-digit school part of the index number
    sch_code = str(sch).zfill(4)
    # loop over possible student index numbers from that school
    for std in range(1, stds):
        # zero-pad to the 3-digit student part of the index number
        std_code = str(std).zfill(3)
        # form-encoded body; %2F is the URL-encoded "/" in e.g. "u0001/001"
        data = f"index_no=u{sch_code}%2F{std_code}"

        try:
            response = requests.post(url, headers=_HEADERS, data=data, timeout=_TIMEOUT)
        except requests.RequestException as e:
            # best effort: report the failed lookup and carry on with the next one
            print(f"{data}: {e}")
            continue

        student_results = _parse_results(response.text)
        if student_results is None:
            # no results container in the reply; this is what invalid index
            # numbers produce, so move on to the next one
            continue

        print(student_results)  # {'ENG': '4', 'LIT': '7', 'HIS': '4', 'GEO': '5', 'MAT': '6', 'PHY': '6', 'CHE': '7', 'BIO': '6', 'COM': '6', 'CST': '7'}
        # add results to list
        ug_results.append(student_results)

# do what you want with your results, Have fun!
print(ug_results)
	# if you want to pull data from an API, requests is your friend
	import requests
	# BeautifulSoup helps us extract data from html, xml, ..
	from bs4 import BeautifulSoup
	# this is the type of strings extracted from html
	from bs4.element import NavigableString


	ug_results = []
	# Exclusive upper bounds for the index numbers to probe. Set these to a big
	# number like 1000 or 10000 to cover more of the index-number space.
	schs = 10  # school numbers 1 .. schs - 1 are tried
	stds = 100  # student numbers 1 .. stds - 1 are tried for every school

	# Endpoint and request headers used for every lookup (built once, not per request).
	url = "https://ereg.uneb.ac.ug/ajax_calls/results_status"
	_HEADERS = {
		"accept": "*/*",
		"content-type": "application/x-www-form-urlencoded; charset=UTF-8",
		"x-requested-with": "XMLHttpRequest",
		'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1',
	}
	# Seconds to wait on the server before giving up on a single lookup; without
	# a timeout one stalled connection would block the whole run.
	_TIMEOUT = 30


	def _parse_results(html):
		"""Extract one student's grades from a results page.

		Args:
			html: Response body returned for a single index number.

		Returns:
			A dict mapping subject to grade, e.g. ``{'ENG': '4', 'LIT': '7'}``,
			or ``None`` when the page has no ``col-md-10`` results container.
		"""
		container = BeautifulSoup(html, 'html.parser').find(class_='col-md-10')
		if container is None:
			return None

		grades = {}
		# The entries are bare strings inside an element that uses <br>s, so walk
		# the direct children and keep only the strings.
		for node in container.children:
			# Exact type check on purpose: isinstance() would also accept
			# NavigableString subclasses such as HTML comments.
			if type(node) is not NavigableString:
				continue
			text = str(node).strip()
			# Strings with a subject and grade, like "ENG : 2", have a length of 7.
			if len(text) != 7:
				continue
			subject, separator, grade = text.partition(' : ')
			# A 7-character string without the separator is not a grade entry;
			# skip it rather than losing the whole student's results.
			if separator:
				grades[subject] = grade
		return grades


	# loop over possible school index numbers
	for sch in range(1, schs):
		# zero-pad to the 4-digit school part of the index number
		sch_code = str(sch).zfill(4)
		# loop over possible student index numbers from that school
		for std in range(1, stds):
			# zero-pad to the 3-digit student part of the index number
			std_code = str(std).zfill(3)
			# form-encoded body; %2F is the URL-encoded "/" in e.g. "u0001/001"
			data = f"index_no=u{sch_code}%2F{std_code}"

			try:
				response = requests.post(url, headers=_HEADERS, data=data, timeout=_TIMEOUT)
			except requests.RequestException as e:
				# best effort: report the failed lookup and carry on with the next one
				print(f"{data}: {e}")
				continue

			student_results = _parse_results(response.text)
			if student_results is None:
				# no results container in the reply; this is what invalid index
				# numbers produce, so move on to the next one
				continue

			print(student_results)  # {'ENG': '4', 'LIT': '7', 'HIS': '4', 'GEO': '5', 'MAT': '6', 'PHY': '6', 'CHE': '7', 'BIO': '6', 'COM': '6', 'CST': '7'}
			# add results to list
			ug_results.append(student_results)

	# do what you want with your results, Have fun!
	print(ug_results)