Last active
July 27, 2016 17:35
-
-
Save nkt1546789/dfc4f01dbf4aa8a9d32762e904865560 to your computer and use it in GitHub Desktop.
Content Extractor from HTML documents via Semi-Supervised PageRank.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import scipy.sparse.linalg as la | |
import bs4 | |
from scipy import sparse | |
def get_text_elements(elem):
    """Yield every non-empty, visible text node in the subtree rooted at
    *elem*, skipping comments, declarations, and <script>/<style> subtrees."""
    if isinstance(elem, bs4.NavigableString):
        # Leaf string: keep it only if it is real text (not a comment or
        # a declaration) and is not whitespace-only.
        is_markup = type(elem) in (bs4.Comment, bs4.Declaration)
        if not is_markup and elem.strip():
            yield elem
        return
    if elem.name in ('script', 'style'):
        # Never descend into script/style: their text is not page content.
        return
    for child in elem.contents:
        for text_node in get_text_elements(child):
            yield text_node
class SSPageRank(object):
    """Semi-Supervised PageRank.

    Scores the nodes of a graph by solving the regularized Laplacian
    system (beta*L + alpha*I) f = u, where u is a vector of prior
    (seed) weights, then clipping negative scores to zero.
    """

    def __init__(self, alpha=0.1, beta=1.0):
        # alpha: ridge regularization strength (larger -> f closer to u).
        # beta:  weight of the graph-smoothness (Laplacian) term.
        self.alpha = alpha
        self.beta = beta

    def fit(self, A, u):
        """Compute node scores from adjacency matrix A and prior u.

        A -- (n, n) scipy sparse symmetric adjacency matrix.
        u -- length-n array of non-negative prior weights.
        Returns self; scores are stored in self.f.
        """
        n = A.shape[0]
        # Unnormalized graph Laplacian L = D - A (D = diagonal degree matrix).
        D = sparse.diags(np.asarray(A.sum(axis=1).T), [0])
        L = D - A
        # NOTE: the fully normalized formula would use beta*L/(n**2) instead
        # of beta*L; the unnormalized variant is kept for convenience, as in
        # the original formulation.
        # BUG FIX: the original code solved the linear system twice — a first
        # spsolve (without beta) whose result was immediately discarded, then
        # the real one. Only the single solve below is needed.
        self.f = la.spsolve(self.beta*L + self.alpha*sparse.identity(n), u)
        # Clip to non-negative scores.
        self.f = np.maximum(0.0, self.f)
        return self

    def fit_predict(self, A, u):
        """Fit on (A, u) and return the score vector f."""
        self.fit(A, u)
        return self.f
class DomPageRank(SSPageRank):
    """SSPageRank applied to the DOM tree of an HTML document.

    Each DOM node becomes a graph node; undirected edges connect every
    parent/child pair. The prior weight vector u is produced by
    weight_func(), which subclasses may override.
    """

    def weight_func(self):
        """
        Weight function to DOM elements
        default: text uniform weight
        You can implement your own weight function by overriding this function
        """
        # text uniform weight (default)
        u = np.zeros(self.n_elems)
        # Weight 1.0 for every non-empty text node under <body>; ids were
        # assigned to the nodes by fit() before this is called.
        mask = np.array([elem.id for elem in get_text_elements(self.soup.body)])
        u[mask] = 1.0
        return u

    def fit(self, html):
        """Parse *html*, build the DOM adjacency matrix, and run SSPageRank.

        Returns self; the per-element score vector is available as self.f.
        """
        self.soup = bs4.BeautifulSoup(html, "lxml")
        # assign id to each element
        # The soup root itself gets id 0; every descendant gets 1..n.
        # (bs4 nodes accept arbitrary attribute assignment.)
        setattr(self.soup, "id", 0)
        self.elems = []
        for elem_id, elem in enumerate(self.soup.descendants, start=1):
            setattr(elem, "id", elem_id)
            self.elems.append(elem)
        # NOTE(review): <body> is itself among soup.descendants, so it appears
        # twice in self.elems; ids stay consistent (len == max id + 1), but
        # its edges are recorded twice — confirm this is intended.
        self.elems = [self.soup.body] + self.elems
        self.n_elems = len(self.elems)
        # create undirected adjacency matrix (child -> parent)
        A = sparse.lil_matrix((self.n_elems, self.n_elems))
        for elem in self.elems:
            i = elem.id
            j = elem.parent.id
            A[i,j] = 1
            # NavigableString leaves have no .children attribute.
            if not hasattr(elem, "children"):
                continue
            for child in elem.children:
                j = child.id
                A[j,i] = 1
        # Symmetrize. Since most parent/child pairs were already written in
        # both directions above, their final weight is 2 rather than 1 —
        # presumably harmless for the ranking; TODO confirm.
        A = A + A.T
        # compute initial weight. default: text_uniform_weight
        u = self.weight_func()
        super(DomPageRank, self).fit(A, u)
        return self
class ContentExtractor(DomPageRank):
    """Main-content extractor built on DomPageRank.

    Seeds the PageRank prior with text length (longer text -> higher prior),
    then scores container elements and extracts the top-ranked subtree's
    text and images.
    """

    # Container tags eligible to be scored as content blocks.
    tags = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'table', 'map', 'section', 'article', 'ul'])
    # Subtrees never descended into when collecting text.
    irrelevant_tags = set(['script', 'style', 'nav', 'aside'])

    def weight_func(self):
        """Prior weight = length of each stripped text node under <body>.

        Overrides DomPageRank.weight_func. Uses a local variant of
        get_text_elements that also skips the irrelevant_tags subtrees
        (<nav>, <aside>) in addition to <script>/<style>.
        """
        def get_text_elements(elem):
            if isinstance(elem, bs4.NavigableString):
                if type(elem) not in (bs4.Comment, bs4.Declaration) and elem.strip():
                    yield elem
            elif elem.name not in ContentExtractor.irrelevant_tags:
                for content in elem.contents:
                    for text_elem in get_text_elements(content):
                        yield text_elem
        u = np.zeros(self.n_elems)
        elems = [elem for elem in get_text_elements(self.soup.body)]
        for elem in elems:
            # elem is a NavigableString, so elem.string is the text itself.
            u[elem.id] = len(elem.string.strip())
        return u

    def get_weighted_texts(self, root=None):
        """Return [(text, weight)] for every text node under *root*.

        A node's weight is the sum of self.g scores of the scored container
        elements on its path from *root*. Lazily computes self.g if needed.
        """
        root = root if root is not None else self.soup.body
        if not hasattr(self, "g"):
            self.weight_elements(root)
        h = []
        def extract(elem, weight):
            if isinstance(elem, bs4.NavigableString):
                if type(elem) not in (bs4.Comment, bs4.Declaration) and elem.strip():
                    h.append((elem.string.strip(), weight))
            elif elem.name not in ContentExtractor.irrelevant_tags:
                for content in elem.contents:
                    # Accumulate this container's score down the recursion
                    # only when it was scored (its tag is in self.g).
                    if elem.id in self.g:
                        extract(content, weight + self.g[elem.id])
                    else:
                        extract(content, weight)
        extract(root, 0.0)
        return h

    def weight_elements(self, root=None):
        """Build self.g: element id -> score for container tags under *root*.

        An element's score is its own PageRank value plus the values of its
        direct children that are also container tags.
        """
        root = root if root is not None else self.soup.body
        elems = [root] + list(root.descendants)
        self.g = {}
        for elem in elems:
            if elem.name not in ContentExtractor.tags:
                continue
            score = self.f[elem.id]
            # NavigableStrings have no .children attribute.
            if hasattr(elem, "children"):
                for child in elem.children:
                    if child.name in ContentExtractor.tags:
                        score += self.f[child.id]
            # setdefault keeps the first score if an id repeats.
            self.g.setdefault(elem.id, score)

    def extract_elements(self, root=None, topn=1):
        """Return the *topn* highest-scoring (element, score) pairs under *root*."""
        root = root if root is not None else self.soup.body
        if not hasattr(self, "g"):
            self.weight_elements(root)
        elems = [elem for elem in [root] + list(root.descendants) if elem.id in self.g]
        scores = [self.g[elem.id] for elem in elems]
        # Sort descending by score and keep the top *topn*.
        return [(elems[i], scores[i]) for i in np.argsort(scores)[::-1][:topn]]

    def extract_text(self, deliminator=u" "):
        """Join the text of the single best element with *deliminator*.

        NOTE(review): less robust than get_weighted_texts(), as stated by
        the author — a single wrong top element loses everything.
        """
        elem, score = self.extract_elements(topn=1)[0]
        self.text = deliminator.join(elem.string.strip() for elem in get_text_elements(elem) if elem.string.strip())
        return self.text

    def extract_images(self, topn=1):
        """Return [(img src, score)] for all <img> tags inside the topn elements."""
        image_scores = []
        for elem, score in self.extract_elements(topn=topn):
            for image in elem.find_all("img"):
                image_scores.append((image.attrs["src"], score))
        return image_scores
if __name__ == '__main__': | |
import sys, requests | |
url = sys.argv[1] | |
html = url_open(url) | |
ce = ContentExtractor(alpha=0.1, beta=1.0).fit(html) | |
# extract weighted texts | |
h = ce.get_weighted_texts() | |
for text, weight in sorted(h, key=lambda x:x[1], reverse=True)[:10]: | |
print text, weight | |
# extract bs4's elements | |
for elem, score in ce.extract_elements(topn=5): | |
print elem.name, elem.attrs, score | |
# extract content text (this is not robust, please use "weighted texts") | |
print ce.extract_text(deliminator=u"\n") | |
# extract images from topn elements and its confidences. | |
for src, score in ce.extract_images(topn=10): | |
print src, score |
しょこたんブログもいけました.
対象:http://ameblo.jp/nakagawa-shoko/entry-12180749681.html
抽出結果:
2016-07-15 09:53:22 NEW ! テーマ: ブログ いよいよ明日はポケモン映画公開
‼️ 10年連続で映画ゲスト声優として参加させていただけるなんて幸せの極みです。感謝と愛‼️ であしたの舞台挨拶を楽しみにしています 夏はポケモン‼️ 今日までしか買えない前売り券でボルケニオンをもらおう AD いいね! リブログする NEW リブログボタンが追加されました! さっそく試してみよう リブログって何? いいね!した人 |
リブログ(0) 中川翔子さんの読者になろう ブログの更新情報が受け取れて、アクセスが簡単になります 読者になる ツイート 記事一覧 | ポケモンGO » 最近の画像つき記事 ドラゴンクエストライ… 昨日 アリーナ おととい ドラゴンクエストライ… おととい
Now cesspr.py can extract content images.
Moved to the repository: https://github.com/nkt1546789/weightress
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
日本語解説記事:http://nktmemoja.github.io/jekyll/update/2016/07/10/content-extraction-sspagerank.html