Skip to content

Instantly share code, notes, and snippets.

@aaferrari
Last active November 27, 2018 03:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save aaferrari/096a8de1f26acbb4242c73e3c9c55e62 to your computer and use it in GitHub Desktop.
Save aaferrari/096a8de1f26acbb4242c73e3c9c55e62 to your computer and use it in GitHub Desktop.
Script para extraer información del catálogo de Emerald Video
from BeautifulSoup import BeautifulSoup as bs
import re, requests
from braceexpand import braceexpand
# Module-level patterns, compiled once and reused for every film page.
# Raw strings keep the regex escapes (\. and \?) out of Python's own
# string-escape processing; the compiled patterns are unchanged.

# Captures the visible name inside each performer link, i.e. anchors whose
# href is a "peliculas.php?action=search&tipo=interprete&que=..." search.
patron_interpretes = re.compile(r'<a class="textoNormal" href="peliculas\.php\?action=search&tipo=interprete&que=[^"]+">([^<]+)')
# Captures the text after the "Otros extras" label, up to the "<br></span>"
# that closes it ("." does not cross line breaks, so the match stays on one line).
patron_extras = re.compile(r'<b>Otros extras</b>: (.+)(?:<br>)</span>')
# Matches a single carriage-return or line-feed character.
saltos = re.compile(r"[\r\n]")
def obtener_sinopsis():
    """Return the synopsis text of the film page held in the global ``pagina``.

    Side effect: parses ``pagina`` with ``bs`` and stores the resulting tree
    in the global ``sopa``.

    Returns:
        The ``.string`` values of the direct children of the first ``<span>``
        following the "Sinopsis:" label, concatenated, stripped and passed
        through ``saltos.sub``. An empty string when either the label or the
        span is missing.
    """
    global sopa
    sopa = bs(pagina)
    try:
        sinopsis = sopa.find("b", text="Sinopsis:").findNext("span")
    except AttributeError:
        # find() returned None: the page has no "Sinopsis:" label.
        return ""
    if sinopsis is None:
        # Label present but no <span> after it; reading .contents on None
        # would raise AttributeError outside the try above.
        return ""
    # Children whose .string is None (e.g. tags with several children)
    # contribute nothing to the text.
    texto = "".join(tag.string for tag in sinopsis.contents if tag.string is not None)
    return saltos.sub("\\n", texto.strip())
def _obtener_portada():
    """Return the ``src`` of the cover image in the global ``sopa``, or None.

    Searches ``sopa`` for the first ``<img>`` whose ``src`` matches
    ``.*/images-peliculas/<digits>_pic2.jpg``. ``sopa`` must already hold the
    parsed tree of the page being processed.
    """
    imagen = sopa.find("img", {"src": re.compile(".*/images-peliculas/[0-9]+_pic2.jpg")})
    # find() gives None when nothing matches; subscripting None raises TypeError.
    return imagen["src"] if imagen is not None else None


def _por_patron(patron):
    """Return a zero-argument parser that looks ``patron`` up via ``buscador_regex``."""
    return lambda: buscador_regex(patron)


# Field name -> regex whose single capture group holds the field's value.
# The "(?:&oacute;|\xf3)" alternative accepts the accented o either as an
# HTML entity or as the literal character.
patrones_campos = {
    "titulo": '<b class="pelisTitulo">([^<]+)</b>',
    "codigo": '<b>C(?:&oacute;|\xf3)digo: ([^<]+)</b>',
    "pais": '<b>Pa&iacute;s de origen:</b> <a class="textoNormal" href="[^"]+">([^<]+)</a>',
    "sello": '<b>Sello:</b> <span class="textoNormal">([^<]+)</span>',
    "director": '<b>Director:</b> <a class="textoNormal" href="[^"]+">([^<]+)</a>',
    # The three accented patterns below used to carry a stray "&" before the
    # alternation ("Calificaci&(?:...)n"), so they could never match.
    "calificacion": '<b>Calificaci(?:&oacute;|\xf3)n:</b> <span class="textoNormal">([^<]+)</span>',
    "duracion": '<b>Duraci(?:&oacute;|\xf3)n:</b> <span class="textoNormal">([^<]+)</span>',
    "estreno": '<b>Fecha de estreno:</b> <span class="textoNormal">([^<]+)</span>',
    "discos": '<b>Discos</b>: ([^<]+)',
    "pantalla": '<b>Pantalla</b>: ([^<]+)',
    "idiomas": '<b>Idiomas</b>: ([^<]+)',
    "region": '<b>Region</b>: ([^<]+)',
    "subtitulos": '<b>Subtitulos</b>: ([^<]+)',
    "seleccion": '<b>Selecci(?:&oacute;|\xf3)n de escenas</b>: ([^<]+)',
    "menu": '<b>Menu interactivo</b>: ([^<]+)',
    "trailer": '<b>Trailer</b>: ([^<]+)',
}

# Field name -> zero-argument callable returning that field's value for the
# page currently held in the module-level ``pagina`` / ``sopa``.
parseadores = {
    # Every performer name found in the page, comma-separated.
    "interpretes": lambda: saltos.sub("\\n", ", ".join(patron_interpretes.findall(pagina))),
    "portada": _obtener_portada,
    "otros extras": lambda: saltos.sub("\\n", ", ".join(patron_extras.findall(pagina))),
    # "-" flags that the "Extras en dvd" heading is present; None otherwise.
    "extras": lambda: "-" if buscador_regex('<b>Extras en dvd:</b><br>') is not None else None,
    # Looked up at call time, like every other entry.
    "sinopsis": lambda: obtener_sinopsis(),
}
for clave_campo in patrones_campos:
    parseadores[clave_campo] = _por_patron(patrones_campos[clave_campo])
del clave_campo
def buscador_regex(patron):
    """Return the first case-insensitive match of ``patron`` in the global ``pagina``.

    The value is whatever ``re.findall`` yields for the first hit: the
    captured group when ``patron`` has one group, the whole match when it
    has none. Returns None when ``patron`` does not occur in the page.
    """
    coincidencias = re.findall(patron, pagina, re.I)
    if not coincidencias:
        return None
    return coincidencias[0]
def obtener_videos(html):
    """Return the absolute URL of every film page linked from ``html``.

    Side effect: the parsed tree of ``html`` is stored in the global ``sopa``.

    Args:
        html: markup of a catalogue listing page.

    Returns:
        A list with "http://www.emeraldvideo.com.ar/" prepended to the
        ``href`` of each ``<a>`` whose href matches
        ``pelicula-ficha.php?id=<digits>``, in document order.
    """
    global sopa
    sopa = bs(html)
    patron_ficha = re.compile('pelicula-ficha\.php\?id=[0-9]+')
    enlaces = []
    for ancla in sopa.findAll("a", {"href": patron_ficha}):
        enlaces.append("http://www.emeraldvideo.com.ar/" + ancla["href"])
    return enlaces
# One dict per film: its page URL under "enlace" plus one entry per parser.
peliculas = []
# Parsed tree of the page currently being processed (rebound below and by
# the functions that declare it global).
sopa = None
# braceexpand yields the letters A..Z (bash-style range expansion); the
# catalogue is requested once per letter.
for caracter in braceexpand(r'{A..Z}'):
    # Parenthesised single-argument print behaves the same on Python 2 and 3.
    print("Iterando %s" % caracter)
    catalogo = requests.get("http://www.emeraldvideo.com.ar/peliculas-catalogo.php?letra=%s" % caracter).text
    for ficha in obtener_videos(catalogo):
        print("\tIterando %s" % ficha)
        pagina = requests.get(ficha).text
        # Parse the film page before any parser runs. The "portada" parser
        # reads ``sopa`` directly, and dict iteration order does not
        # guarantee that the "sinopsis" parser (which also refreshes
        # ``sopa``) runs first; without this it could inspect the tree of
        # the catalogue or of the previous film.
        sopa = bs(pagina)
        datos = {"enlace": ficha}
        # .items() exists on both Python 2 and 3, unlike .iteritems().
        for clave, valor in parseadores.items():
            datos[clave] = valor()
        peliculas.append(datos)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment