Created
March 26, 2015 03:14
-
-
Save anonymous/3bc472ad0f9631025dce to your computer and use it in GitHub Desktop.
Bajador de carpetas SIDING — downloads all folder contents of a SIDING (PUC intranet) course.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import cookielib | |
import os | |
import urllib | |
import urllib2 | |
from bs4 import BeautifulSoup | |
import re | |
# SIDING account credentials (fill these in before running).
username = "<username>"
password = "<pass>"
# File where the session cookies are persisted between runs.
cookie_filename = "siding.cookies"
# URL templates for the SIDING intranet.
root_url = 'https://intrawww.ing.puc.cl/'
siding_url = root_url + 'siding/dirdes/ingcursos/cursos/'
# Course catalogue listing page.
catalogo_url = siding_url + 'index.phtml?acc_inicio=catalogo'
# Course front page; %s is the course id.
curso_url = (siding_url + 'vista.phtml' +
             '?accion_curso=avisos&acc_aviso=mostrar&id_curso_ic=%s')
# Folder listing page; %s placeholders are (course id, folder id).
carpeta_url = (siding_url + 'vista.phtml' +
               '?accion_curso=carpetas&acc_carp=abrir_carpeta' +
               '&id_curso_ic=%s&id_carpeta=%s')
# File download endpoint; %s placeholders are (course id, file id).
archivo_url = siding_url + 'descarga.phtml?id_curso_ic=%s&id_archivo=%s'
class SidingLogger(object): | |
def __init__(self, login, password): | |
""" Start up... """ | |
self.login = login | |
self.password = password | |
self.cj = cookielib.MozillaCookieJar(cookie_filename) | |
if os.access(cookie_filename, os.F_OK): | |
self.cj.load() | |
self.opener = urllib2.build_opener( | |
urllib2.HTTPRedirectHandler(), | |
urllib2.HTTPHandler(debuglevel=0), | |
urllib2.HTTPSHandler(debuglevel=0), | |
urllib2.HTTPCookieProcessor(self.cj) | |
) | |
self.opener.addheaders = [ | |
('User-agent', ('Mozilla/4.0 (compatible; MSIE 6.0; ' | |
'Windows NT 5.2; .NET CLR 1.1.4322)')) | |
] | |
# need this twice - once to set cookies, once to log in... | |
self.loginSiding() | |
self.loginSiding() | |
self.cj.save() | |
def loginSiding(self): | |
""" | |
Handle login. This should populate our cookie jar. | |
""" | |
login_data = urllib.urlencode({ | |
'passwd': self.password, | |
'login': self.login, | |
}) | |
print 'logineando' | |
response = self.opener.open( | |
root_url + 'siding/index.phtml', | |
login_data) | |
return ''.join(response.readlines()) | |
def getData(self, site): | |
response = self.opener.open(site) | |
return ''.join(response.readlines()) | |
def openUrl(self, url): | |
return self.opener.open(url) | |
def parse_catalogo(html):
    """Parse the catalogue page and return the courses found.

    html -- HTML of the catalogue page.
    Returns a dict mapping course code (sigla) to a dict with keys
    'id', 'nombre' and 'sigla'.
    """
    soup = BeautifulSoup(html)
    # Course links all point at vista.phtml; the course id is the value
    # of the last query parameter in the href.
    link_cursos = soup.find_all(href=re.compile(r"vista\.phtml.*"))
    cursos = {}
    for link in link_cursos:
        curso_id = link['href'].split('=')[-1]
        # Link text is "<SIGLA> <course name...>".
        partes = link.text.split(' ')
        curso_sigla = partes[0]
        curso_nombre = ' '.join(partes[1:])
        cursos[curso_sigla] = {
            'id': curso_id,
            'nombre': curso_nombre,
            'sigla': curso_sigla}
    return cursos
def parse_curso(html, curso):
    """Populate curso['carpetas'] from the course front page.

    html  -- HTML of the course page.
    curso -- course dict (as built by parse_catalogo); mutated in place
             to gain a 'carpetas' dict keyed by folder name.
    """
    soup = BeautifulSoup(html)
    # Folder links also point at vista.phtml; the folder id is the last
    # query parameter of the href.
    link_carpetas = soup.find_all(href=re.compile(r"vista\.phtml.*"))
    curso['carpetas'] = {}
    for link in link_carpetas:
        carpeta_id = link['href'].split('=')[-1]
        carpeta_nombre = link.text
        curso['carpetas'][carpeta_nombre] = {
            'id': carpeta_id,
            'nombre': carpeta_nombre}
def parse_carpeta(html, curso, carpeta):
    """Populate the file list of one folder of a course.

    html    -- HTML of the folder page.
    curso   -- course dict; mutated in place.
    carpeta -- folder name (key into curso['carpetas']).
    """
    soup = BeautifulSoup(html)
    # Locate the "carpeta <name>" heading; the files live in the table
    # that follows it.  Escape the name so regex metacharacters in a
    # folder name cannot break (or widen) the search.
    titulo = soup.find(text=re.compile('carpeta ' + re.escape(carpeta),
                                       re.IGNORECASE))
    archivos = []
    curso['carpetas'][carpeta]['archivos'] = archivos
    if titulo is None:
        # Heading not found (empty folder or unexpected markup): leave
        # the folder with no files instead of crashing on None.
        return
    links = titulo.find_next('table').find_all(href=True)
    for link in links:
        archivos.append({
            'id': link['href'].split('=')[-1],
            'nombre': link.text,
            'link': link['href']
        })
l = SidingLogger(username, password) | |
html = l.getData(catalogo_url) | |
cursos = parse_catalogo(html) | |
print 'Ingrese sigla para bajar archivos' | |
sigla = raw_input() | |
while sigla not in cursos: | |
print 'No existe ese curso D:' | |
print 'Ingrese sigla para bajar archivos' | |
sigla = raw_input() | |
curso = cursos[sigla] | |
html = l.getData(curso_url % (curso['id'],)) | |
parse_curso(html, curso) | |
for carpeta in curso['carpetas']: | |
html = l.getData(carpeta_url % (curso['id'], | |
curso['carpetas'][carpeta]['id'])) | |
parse_carpeta(html, curso, carpeta) | |
for carpeta in curso['carpetas']: | |
for archivo in curso['carpetas'][carpeta]['archivos']: | |
f = l.openUrl(archivo_url % (curso['id'], | |
archivo['id'])) | |
ar = open(archivo['nombre'], 'w') | |
ar.write(f.read()) | |
ar.close() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment