Skip to content

Instantly share code, notes, and snippets.

Created September 21, 2013 17:11
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save anonymous/6652357 to your computer and use it in GitHub Desktop.
Save anonymous/6652357 to your computer and use it in GitHub Desktop.
Getting only the interesting part of lemonde.fr
from requests import get
from pyquery import PyQuery as P
from urlparse import urljoin as join
from os import system
src = "http://lemonde.fr"
open("plus.html","w").write((
r"""
<!doctype html>
<html lang="fr">
<head>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<link rel="stylesheet" type="text/css" href="%s/s1.lemde.fr/bootstrap/www/d9319e90ad4f9a4bde5e9c60f3f02c8b.css"/>
<title>Le Monde.fr - Actualite a la Une</title>
</head>
<body>
""" % src ).decode("latin-1").encode("latin-1")
)
interst = P(
url = "http://lemonde.fr",
opener =lambda url,**kw:get(url, **kw).content
)(".plus_partages")
for i,el in enumerate(interst("[href]")):
P(interst("[href]")[i]).attr("href",join(src,P(el).attr("href")))
interst("a").each(lambda el : P(el).attr("target", "_blank"))
open("plus.html","a").write(
unicode.encode(
interst.html(),
"latin-1") + """</body></html>""")
system("firefox plus.html")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment