Skip to content

Instantly share code, notes, and snippets.

@kartoch
Created November 24, 2017 15:30
Show Gist options
  • Save kartoch/6935ab592eed72e623263d10227f9c5d to your computer and use it in GitHub Desktop.
Save kartoch/6935ab592eed72e623263d10227f9c5d to your computer and use it in GitHub Desktop.
Python @ Polytech'Lille - TP2
#!/usr/bin/python
# Goal: scrape the "Annuaire" web page of Polytech Lille to gather the list of
# all students and their ids (encoded in base 64). Print the number of
# entries found.
#
# You can use requests for scraping and re for extracting data from the page
import base64
import requests
import re
# Regular expression matching one directory link in the returned HTML.
# Capture groups, in order:
#   1. the base64 token passed as the ``a`` query parameter,
#   2. the text before the ``&nbsp;`` separator (used as the first name),
#   3. the text after it, up to the next ``<`` (used as the last name).
# Written as a raw string so that ``\?``, ``\w`` and ``\&`` reach the regex
# engine untouched instead of being treated as (invalid) string escapes.
FILTER = r"<a href='annuaire.php\?a=([\w=]+)'>([^&]+)\&nbsp;([^<]+)<"

# Collected directory: numeric id -> (firstname, lastname).
annuaire = {}


def decode_student_id(token):
    """Return the integer id carried by a base64 *token*.

    The token is base64-decoded and read as UTF-8 text; everything before
    the first ``*`` is parsed as an integer and the rest is ignored.

    Raises:
        ValueError: if the token is not valid base64, is not valid UTF-8,
            or the part before ``*`` is not an integer.
    """
    decoded = base64.b64decode(token).decode('utf-8')
    return int(decoded.partition('*')[0])


def collect_entries(html, directory):
    """Add every entry matched by ``FILTER`` in *html* to *directory*.

    *directory* maps an id to a ``(firstname, lastname)`` tuple and is
    updated in place. An id that is already present keeps its first value.
    """
    for token, firstname, lastname in re.findall(FILTER, html):
        # setdefault keeps the first occurrence: the same entry can show up
        # again in the results for another letter.
        directory.setdefault(decode_student_id(token),
                             (firstname, lastname))


if __name__ == '__main__':
    # Query the directory once per letter 'a'..'z' and merge the results.
    for c in range(ord('a'), ord('z') + 1):
        r = requests.post('http://www.polytech-lille.fr/annuaire.php',
                          data={'nom': chr(c)},
                          timeout=30)  # never block forever on a dead server
        # Fail loudly on an HTTP error: an error page would otherwise just
        # produce zero matches and silently undercount the entries.
        r.raise_for_status()
        collect_entries(r.text, annuaire)
        print(chr(c) + " loaded")
    print("number of entries: %i" % len(annuaire))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment