Skip to content

Instantly share code, notes, and snippets.

@Eastkap
Last active January 24, 2017 11:24
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Eastkap/878c120d57a764197205b427990cc9be to your computer and use it in GitHub Desktop.
Save Eastkap/878c120d57a764197205b427990cc9be to your computer and use it in GitHub Desktop.
import sys, re, csv, requests, time, random
from bs4 import BeautifulSoup
if sys.version_info[0] == 3:
from urllib.request import *
else:
# Not Python 3 - today, it is most likely to be Python 2
# But note that this might need an update when Python 4
# might be around one day
from urllib import urlopen
#filename = '/Users/Jacobo/pr/upmatch/a.jpg'
#with open(filename, 'wb') as file:
# file.write(urlopen('https://etu.math.upmc.fr/math/photos/3520121.jpg').read())
def wait(min_ms=500, max_ms=7500):
    """Pause for a random duration to space out successive requests.

    A whole number of milliseconds is drawn uniformly from the inclusive
    range [min_ms, max_ms] and the call blocks for that long.

    Args:
        min_ms: Shortest possible pause, in milliseconds.
        max_ms: Longest possible pause, in milliseconds.
    """
    # Divide by a float: on Python 2 `int / int` truncates, which would
    # round every delay down to a whole number of seconds.
    delay_seconds = random.randint(min_ms, max_ms) / 1000.0
    time.sleep(delay_seconds)
def get_infos(r):
    """Extract a student record from a directory search response.

    Every ``<td class="result">`` cell of the response is visited in turn.
    For each cell the name and student number are read, the student's
    detail page is fetched, and the e-mail address is pulled out of it.
    The same dict is overwritten on every iteration, so the returned
    record describes the last cell visited.

    NOTE(review): the original indentation was lost; the control flow here
    is reconstructed with the detail-page lookup inside the loop and the
    final ``return`` after it -- confirm against the original script.

    Args:
        r: Response object whose ``.text`` attribute holds the HTML of the
            search result page.

    Returns:
        A dict with keys ``'name'``, ``'etu'`` and ``'mail'`` (``mail`` is
        ``0`` when the detail page could not be fetched or parsed). An empty
        dict is returned when the page has no result cell, or as soon as one
        cell carries a student number that fails the filter below.
    """
    soup = BeautifulSoup(r.text, 'html.parser')
    info = dict()
    for cell in soup.findAll("td", {'class': 'result'}):
        markup = str(cell)
        info['name'] = cell.text.strip()  # first name, then LAST NAME
        # The student number is taken from fixed character offsets in the
        # cell's HTML, so this depends on the exact markup of the page.
        etu = markup[77:84]
        info['etu'] = etu
        try:
            # Keep only numbers whose first digit is 3 and second is >= 3.
            rejected = int(etu[0]) != 3 or int(etu[1]) < 3
        except (IndexError, ValueError):
            # Slice too short or not numeric: treat it like any other
            # number that fails the filter instead of aborting the run.
            rejected = True
        if rejected:
            print(etu)
            return dict()
        uid_start = re.search(r'Interne=', markup).span()[1]
        uid_end = re.search(r',ou=', markup).span()[0]
        adresse = ('https://www.annuaire.upmc.fr/upmc/list.upmc?method=list&dn=uidInterne='
                   + markup[uid_start:uid_end]
                   + ',ou=People&mode=display')
        print(adresse)
        try:
            wait()
            # `hdr` is the module-level header dict set up by the script's
            # main section.
            req = Request(adresse, headers=hdr)
            with urlopen(req) as url:
                s = url.read()
            full = BeautifulSoup(s, 'html.parser')
            cells = full.findAll("td", {'class': 'attributeDisplayer_value'})
            mail = str(cells[5])
            mail_start = re.search(r'@etu.upmc.fr">', mail).span()[1]
            mail_end = re.search(r'</a>', mail).span()[0]
            info['mail'] = mail[mail_start:mail_end]
        except Exception:
            # Best effort: any fetch/parse failure is recorded as 0.
            # `Exception` rather than a bare `except` so that Ctrl-C
            # (KeyboardInterrupt) during the pause or download still stops
            # the script.
            info['mail'] = 0
    return info
def make_csv(infos, filename):
    """Write a list of record dicts to a CSV file with a header row.

    The columns are the union of all keys found in ``infos``, in the order
    in which each key is first encountered, so the layout is the same on
    every run. Records lacking a column get an empty cell.

    Args:
        infos: Iterable of dicts, one per output row.
        filename: Path of the CSV file to create (overwritten if present).
    """
    fieldnames = []
    for info in infos:
        for key in info:
            if key not in fieldnames:
                fieldnames.append(key)

    # The csv module wants newline translation disabled: `newline=''` on
    # Python 3, binary mode on Python 2. Otherwise extra blank lines can
    # appear on platforms that translate line endings.
    if sys.version_info[0] >= 3:
        csvfile = open(filename, 'w', newline='')
    else:
        csvfile = open(filename, 'wb')

    with csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(infos)
if __name__ == '__main__':
    # Script entry point: query the directory search form once for every
    # three-letter name prefix ('aaa' .. 'zzz') and write the collected
    # records to one CSV file per two-letter prefix.
    random.seed()
    url = "https://www.annuaire.upmc.fr/upmc/simpleSearch.upmc"
    infos = list()
    filedebut = "/Users/Jacobo/pr/upmatch/data/"
    # Kept as a module-level name: get_infos() reads `hdr` as a global.
    hdr = {'User-Agent': 'Mozilla/5.0'}
    letters = [chr(code) for code in range(ord('a'), ord('z') + 1)]
    for first in letters:
        for second in letters:
            for third in letters:
                name = first + second + third
                payload = {
                    'name': name,
                    'name_query_type': 'NAME*',
                    'surname': '',
                    'surname_query_type': 'SURNAME*',
                    'filter': 'objectClass=etudiant',
                    'number': '2500',
                    'bouton.x': '38',
                    'bouton.y': '15',
                    'inputPage': 'inputBadSearchWithoutFilters',
                }
                wait()
                try:
                    # A timeout keeps one stalled connection from hanging
                    # the whole multi-hour run.
                    r = requests.post(url, data=payload, headers=hdr,
                                      timeout=30)
                except requests.RequestException as err:
                    # Skip this prefix but keep going with the others.
                    print('request failed for ' + name + ': ' + str(err))
                    continue
                information = get_infos(r)
                # An empty dict means "nothing usable for this prefix".
                if information:
                    infos.append(information)
            # One output file per two-letter prefix, then start afresh.
            filename = filedebut + first + second + '.csv'
            print(filename)
            make_csv(infos, filename)
            infos = list()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment