Skip to content

Instantly share code, notes, and snippets.

Last active February 11, 2020 09:34
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save aaferrari/fbd3d861771aa255b52b9f5500fe9e3b to your computer and use it in GitHub Desktop.
Obtiene la cantidad de comentarios que hay en los posts de un sitio (funciona con Python 2 y 3)
import json
import re
import sys
from sys import version_info

if version_info.major == 2:
    from urllib import urlencode
elif version_info.major == 3:
    from urllib.parse import urlencode

from requests import request
def html2char(cadena):
    """Replace decimal HTML character references (``&#NNN;``) with characters.

    Args:
        cadena: Text that may contain numeric entities such as ``&#233;``.

    Returns:
        The text with every decimal reference decoded in a single pass.
        References whose number is not a valid code point are left untouched.
    """
    # ``unichr`` only exists on Python 2; on Python 3 ``chr`` does the same job.
    try:
        a_caracter = unichr
    except NameError:
        a_caracter = chr

    def _decodificar(coincidencia):
        try:
            return a_caracter(int(coincidencia.group(1)))
        except (ValueError, OverflowError):
            # Number outside the valid code point range: keep the original text.
            return coincidencia.group(0)

    # A single substitution pass never re-scans already decoded text, so the
    # result does not depend on the order in which entities are processed.
    return re.sub("&#([0-9]+);", _decodificar, cadena)
# Sitemap URLs to crawl for post links.
# NOTE(review): both entries are empty strings in this copy of the script;
# fill in the real sitemap URLs before running.
sitemaps = ["", ""]
enlaces = []

# Collect the post links: the content of every <loc> element in each sitemap.
for mapa in sitemaps:
    peticion = request('GET', mapa)
    enlaces.extend(re.findall("<loc>([^<]+)</loc>", peticion.text))

# Fetch the title and the number of comments of each post and print one
# "url;title;count" line per post.
for post in enlaces:
    # NOTE(review): the URL template literal is empty in this copy; it needs
    # a "%s" placeholder for the encoded query string, otherwise the "%"
    # operation below raises TypeError.
    comentarios = request('GET', "" % urlencode({"t_u": post}))
    coincidencias = re.findall(
        '"posts":[0-9]+.+"title":"[^"]+"', comentarios.text
    )
    if not coincidencias:
        # The response lacks the expected fragment; report it and move on
        # instead of aborting the whole run with an IndexError.
        sys.stderr.write("No comment data found for %s\n" % post)
        continue
    # The matched fragment is a run of JSON members; wrap it in braces so it
    # parses as an object.
    datos = json.loads("{" + coincidencias[0] + "}")
    datos["title"] = html2char(datos["title"])
    print("%s;%s;%i" % (post, datos["title"], datos["posts"]))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment