Skip to content

Instantly share code, notes, and snippets.

@ezioruan
Created November 14, 2013 06:16
Show Gist options
  • Save ezioruan/7462277 to your computer and use it in GitHub Desktop.
get_content
# -*- coding: utf-8 -*-
from xml.sax.saxutils import escape
import urllib, re, os, urlparse
import HTMLParser
from BeautifulSoup import BeautifulSoup
from pprint import pprint
import urllib2
import codecs
import sys
# Wrap stdout in a UTF-8 StreamWriter so printing unicode objects works
# regardless of the terminal's declared encoding (Python 2 idiom).
streamWriter = codecs.lookup('utf-8')[-1]
sys.stdout = streamWriter(sys.stdout)
# Class/id name fragments that suggest a container is NOT the article body.
NEGATIVE = re.compile("comment|meta|footer|footnote|foot")
# Class/id name fragments that suggest a container IS the article body.
POSITIVE = re.compile("post|hentry|entry|content|text|body|article")
# NOTE(review): PUNCTUATION appears to be unused in this file.
PUNCTUATION = re.compile("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]""")
def get_content(url):
    """Fetch *url* with a desktop User-Agent and return its extracted article text.

    Downloads the page, decodes it using the charset declared in the
    Content-Type header (defaulting to utf-8 when absent), and passes the
    markup to grabContent() for readability-style extraction.

    Returns the extracted text, or "" when the page cannot be parsed.
    """
    opener = urllib2.build_opener()
    # Some sites serve different (or no) content to the default urllib2 agent.
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.114 Safari/537.36'),
    ]
    response = opener.open(url)
    content_type = response.headers.get('content-type') or ''
    # [\w-]+ (not \w+) so hyphenated charsets like "iso-8859-1" match fully;
    # fall back to utf-8 when no charset is declared — the original code
    # raised AttributeError on such responses.
    match = re.search(r'charset=(?P<encoding>[\w-]+)', content_type)
    encoding = match.group('encoding') if match else 'utf-8'
    # 'replace' keeps a single malformed byte from aborting the whole fetch.
    content = response.read().decode(encoding, 'replace')
    return grabContent(content)
def grabContent (html):
    """Extract the main article text from an HTML page (readability heuristic).

    Scores the parent of every <p> tag using class/id naming hints and the
    amount of text it holds, picks the highest-scoring parent, strips
    scripts/styles/boilerplate from it, and returns its plain text.
    Returns "" when the markup cannot be parsed or no paragraphs exist.
    """
    # Two consecutive <br> tags usually separate paragraphs; normalize them
    # into real <p> boundaries so the paragraph scoring below can see them.
    replaceBrs = re.compile("<br */? *>[ \r\n]*<br */? *>")
    html = re.sub(replaceBrs, "</p><p>", html)
    try:
        soup = BeautifulSoup(html)
    except HTMLParser.HTMLParseError:
        # Markup too broken for BeautifulSoup 3 to recover from.
        return ""
    # REMOVE SCRIPTS
    for s in soup.findAll("script"):
        s.extract()
    allParagraphs = soup.findAll("p")
    topParent = None
    parents = []
    for paragraph in allParagraphs:
        parent = paragraph.parent
        if (parent not in parents):
            parents.append(parent)
            # A custom `score` attribute is attached directly to the tag.
            parent.score = 0
            # Reward/penalize the container once based on its class/id
            # naming (e.g. "content" is likely the article, "footer" is not).
            if (parent.has_key("class")):
                if (NEGATIVE.match(parent["class"])):
                    parent.score -= 50
                if (POSITIVE.match(parent["class"])):
                    parent.score += 25
            if (parent.has_key("id")):
                if (NEGATIVE.match(parent["id"])):
                    parent.score -= 50
                if (POSITIVE.match(parent["id"])):
                    parent.score += 25
        # Defensive reset in case `score` was never initialised to a number.
        if (parent.score == None):
            parent.score = 0
        innerText = paragraph.renderContents() #"".join(paragraph.findAll(text=True))
        # Substantial paragraphs and comma-rich prose raise the parent's score.
        if (len(innerText) > 10):
            parent.score += 1
        parent.score += innerText.count(",")
    # Keep the highest-scoring parent as the presumed article container.
    for parent in parents:
        if ((not topParent) or (parent.score > topParent.score)):
            topParent = parent
    if (not topParent):
        return ""
    # REMOVE LINK'D STYLES
    styleLinks = soup.findAll("link", attrs={"type" : "text/css"})
    for s in styleLinks:
        s.extract()
    # REMOVE ON PAGE STYLES
    for s in soup.findAll("style"):
        s.extract()
    # CLEAN STYLES FROM ELEMENTS IN TOP PARENT
    for ele in topParent.findAll(True):
        del(ele['style'])
        del(ele['class'])
    # Strip boilerplate divs and embedded widgets from the winner.
    killDivs(topParent)
    clean(topParent, "form")
    clean(topParent, "object")
    clean(topParent, "iframe")
    return topParent.text
def fixLinks(parent, link):
    """Rewrite every href/src attribute under *parent* into an absolute URL.

    Relative references are resolved against *link*, the page's base URL.
    """
    for tag in parent.findAll(True):
        for attr in ("href", "src"):
            if tag.has_key(attr):
                tag[attr] = urlparse.urljoin(link, tag[attr])
def clean(top, tag, minWords=10000):
    """Remove each *tag* element under *top* whose rendered content contains
    fewer than *minWords* spaces (a crude word-count proxy).

    The high default effectively removes every matching element unless the
    caller lowers the threshold.
    """
    for candidate in top.findAll(tag):
        rendered = candidate.renderContents()
        if rendered.count(" ") < minWords:
            candidate.extract()
def killDivs(parent):
    """Drop <div>s under *parent* that look like navigation or boilerplate.

    A div survives when it is comma-rich (>= 10 commas), contains <pre> or
    <code>, or holds more paragraphs than images/list items/links and no
    <embed> elements.
    """
    for div in parent.findAll("div"):
        # Comma-rich divs read like prose — keep them.
        if div.renderContents().count(",") >= 10:
            continue
        # Never discard divs that carry code samples.
        if len(div.findAll("pre")) != 0 or len(div.findAll("code")) != 0:
            continue
        paragraphs = len(div.findAll("p"))
        looks_like_chrome = (
            len(div.findAll("img")) > paragraphs
            or len(div.findAll("li")) > paragraphs
            or len(div.findAll("a")) > paragraphs
            or paragraphs == 0
            or len(div.findAll("embed")) > 0
        )
        if looks_like_chrome:
            div.extract()
if __name__ == "__main__":
print get_content('http://ent.163.com/13/1109/22/9D99KPFU00034JAU.html#p=9D9A99CT00AJ0003')
print get_content('http://blog.csdn.net/pleasecallmewhy/article/details/8923067')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment