Created November 14, 2013 06:16
get_content
# -*- coding: utf-8 -*-
from xml.sax.saxutils import escape
import urllib, re, os, urlparse
import HTMLParser
from BeautifulSoup import BeautifulSoup
from pprint import pprint
import urllib2
import codecs
import sys

# Make sure printing the extracted text does not choke on non-ASCII characters.
streamWriter = codecs.lookup('utf-8')[-1]
sys.stdout = streamWriter(sys.stdout)

# Class/id hints used to penalize or boost candidate content containers.
NEGATIVE = re.compile("comment|meta|footer|footnote|foot")
POSITIVE = re.compile("post|hentry|entry|content|text|body|article")
PUNCTUATION = re.compile("""[!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~]""")

def get_content(url):
    """Fetch a URL, decode it using the charset from the response headers,
    and return the extracted main text."""
    opener = urllib2.build_opener()
    opener.addheaders = [
        ('User-agent', 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/30.0.1599.114 Safari/537.36'),
        # ('Accept-Encoding', 'gzip,deflate,sdch'),
        # ('Accept-Language', 'zh-CN,zh;q=0.8'),
    ]
    response = opener.open(url)
    content_type = response.headers.get('content-type', '')
    # Fall back to UTF-8 when the Content-Type header carries no charset.
    match = re.search(r'charset=(?P<encoding>[\w-]+)', content_type)
    encoding = match.group('encoding') if match else 'utf-8'
    print encoding
    content = response.read().decode(encoding)
    return grabContent(content)

def grabContent(html):
    # Collapse double <br> tags into paragraph breaks so they count as <p> blocks.
    replaceBrs = re.compile("<br */? *>[ \r\n]*<br */? *>")
    html = re.sub(replaceBrs, "</p><p>", html)

    try:
        soup = BeautifulSoup(html)
    except HTMLParser.HTMLParseError:
        return ""

    # REMOVE SCRIPTS
    for s in soup.findAll("script"):
        s.extract()

    allParagraphs = soup.findAll("p")
    topParent = None

    # Score each paragraph's parent: class/id hints plus paragraph length and
    # comma count, then pick the highest-scoring container as the main content.
    parents = []
    for paragraph in allParagraphs:
        parent = paragraph.parent

        if (parent not in parents):
            parents.append(parent)
            parent.score = 0

            if (parent.has_key("class")):
                if (NEGATIVE.match(parent["class"])):
                    parent.score -= 50
                if (POSITIVE.match(parent["class"])):
                    parent.score += 25

            if (parent.has_key("id")):
                if (NEGATIVE.match(parent["id"])):
                    parent.score -= 50
                if (POSITIVE.match(parent["id"])):
                    parent.score += 25

        if (parent.score == None):
            parent.score = 0

        innerText = paragraph.renderContents()  # "".join(paragraph.findAll(text=True))
        if (len(innerText) > 10):
            parent.score += 1
        parent.score += innerText.count(",")

    for parent in parents:
        if ((not topParent) or (parent.score > topParent.score)):
            topParent = parent

    if (not topParent):
        return ""

    # REMOVE LINK'D STYLES
    styleLinks = soup.findAll("link", attrs={"type": "text/css"})
    for s in styleLinks:
        s.extract()

    # REMOVE ON PAGE STYLES
    for s in soup.findAll("style"):
        s.extract()

    # CLEAN STYLES FROM ELEMENTS IN TOP PARENT
    for ele in topParent.findAll(True):
        del(ele['style'])
        del(ele['class'])

    killDivs(topParent)
    clean(topParent, "form")
    clean(topParent, "object")
    clean(topParent, "iframe")

    return topParent.text

def fixLinks(parent, link):
    # Rewrite relative href/src attributes as absolute URLs.
    # (Defined in the gist but not called by get_content.)
    tags = parent.findAll(True)
    for t in tags:
        if (t.has_key("href")):
            t["href"] = urlparse.urljoin(link, t["href"])
        if (t.has_key("src")):
            t["src"] = urlparse.urljoin(link, t["src"])

def clean(top, tag, minWords=10000):
    # Drop tags whose rendered text contains fewer than minWords spaces;
    # with the default of 10000 this removes practically every match.
    tags = top.findAll(tag)
    for t in tags:
        if (t.renderContents().count(" ") < minWords):
            t.extract()

def killDivs(parent):
    # Remove divs that look like boilerplate: little punctuation, no pre/code,
    # and more images/links/list items than paragraphs (or no paragraphs at all).
    divs = parent.findAll("div")
    for d in divs:
        p = len(d.findAll("p"))
        img = len(d.findAll("img"))
        li = len(d.findAll("li"))
        a = len(d.findAll("a"))
        embed = len(d.findAll("embed"))
        pre = len(d.findAll("pre"))
        code = len(d.findAll("code"))

        if (d.renderContents().count(",") < 10):
            if ((pre == 0) and (code == 0)):
                if ((img > p) or (li > p) or (a > p) or (p == 0) or (embed > 0)):
                    d.extract()

if __name__ == "__main__":
    print get_content('http://ent.163.com/13/1109/22/9D99KPFU00034JAU.html#p=9D9A99CT00AJ0003')
    print get_content('http://blog.csdn.net/pleasecallmewhy/article/details/8923067')
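
The script above targets Python 2 (urllib2, print statements) and BeautifulSoup 3. A minimal sketch of the same fetch-and-decode step on Python 3 with BeautifulSoup 4 might look like the following; the helper name fetch_html and the UTF-8 fallback are illustrative assumptions, not part of the original gist.

# A minimal Python 3 / BeautifulSoup 4 sketch of get_content's fetch step.
# fetch_html is a hypothetical helper name; the original gist uses urllib2.
import urllib.request

from bs4 import BeautifulSoup

def fetch_html(url):
    """Download a page and decode it using the charset in the response headers."""
    request = urllib.request.Request(url, headers={
        'User-Agent': ('Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 '
                       '(KHTML, like Gecko) Chrome/30.0.1599.114 Safari/537.36'),
    })
    with urllib.request.urlopen(request) as response:
        # get_content_charset() reads the charset= parameter of Content-Type.
        encoding = response.headers.get_content_charset() or 'utf-8'
        return response.read().decode(encoding, errors='replace')

if __name__ == "__main__":
    html = fetch_html('http://ent.163.com/13/1109/22/9D99KPFU00034JAU.html')
    soup = BeautifulSoup(html, 'html.parser')
    # The scoring loop above would then walk soup.find_all('p') the same way;
    # BeautifulSoup 4 keeps findAll as an alias for find_all.
    print(len(soup.find_all('p')), 'paragraphs found')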