Last active
July 27, 2016 17:35
-
-
Save nkt1546789/dfc4f01dbf4aa8a9d32762e904865560 to your computer and use it in GitHub Desktop.
Content Extractor from HTML documents via Semi-Supervised PageRank.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import numpy as np | |
import scipy.sparse.linalg as la | |
import bs4 | |
from scipy import sparse | |
def get_text_elements(elem):
    """Yield every non-empty, visible text node in the subtree rooted at
    *elem*, skipping comments, declarations, and <script>/<style> subtrees."""
    if isinstance(elem, bs4.NavigableString):
        # Leaf string: keep it only if it is real text (not a comment or
        # a declaration) and is not whitespace-only.
        is_markup = type(elem) in (bs4.Comment, bs4.Declaration)
        if not is_markup and elem.strip():
            yield elem
        return
    if elem.name in ('script', 'style'):
        # Never descend into script/style: their text is not page content.
        return
    for child in elem.contents:
        for text_node in get_text_elements(child):
            yield text_node
class SSPageRank(object):
    """Semi-Supervised PageRank.

    Scores the nodes of a graph by solving the regularized Laplacian
    system (beta*L + alpha*I) f = u, where u is a vector of prior
    (seed) weights, then clipping negative scores to zero.
    """

    def __init__(self, alpha=0.1, beta=1.0):
        # alpha: ridge regularization strength (larger -> f closer to u).
        # beta:  weight of the graph-smoothness (Laplacian) term.
        self.alpha = alpha
        self.beta = beta

    def fit(self, A, u):
        """Compute node scores from adjacency matrix A and prior u.

        A -- (n, n) scipy sparse symmetric adjacency matrix.
        u -- length-n array of non-negative prior weights.
        Returns self; scores are stored in self.f.
        """
        n = A.shape[0]
        # Unnormalized graph Laplacian L = D - A (D = diagonal degree matrix).
        D = sparse.diags(np.asarray(A.sum(axis=1).T), [0])
        L = D - A
        # NOTE: the fully normalized formula would use beta*L/(n**2) instead
        # of beta*L; the unnormalized variant is kept for convenience, as in
        # the original formulation.
        # BUG FIX: the original code solved the linear system twice — a first
        # spsolve (without beta) whose result was immediately discarded, then
        # the real one. Only the single solve below is needed.
        self.f = la.spsolve(self.beta*L + self.alpha*sparse.identity(n), u)
        # Clip to non-negative scores.
        self.f = np.maximum(0.0, self.f)
        return self

    def fit_predict(self, A, u):
        """Fit on (A, u) and return the score vector f."""
        self.fit(A, u)
        return self.f
class DomPageRank(SSPageRank):
    """SSPageRank applied to the DOM tree of an HTML document.

    Each DOM node becomes a graph node; undirected edges connect every
    parent/child pair. The prior weight vector u is produced by
    weight_func(), which subclasses may override.
    """

    def weight_func(self):
        """
        Weight function to DOM elements
        default: text uniform weight
        You can implement your own weight function by overriding this function
        """
        # text uniform weight (default)
        u = np.zeros(self.n_elems)
        # Weight 1.0 for every non-empty text node under <body>; ids were
        # assigned to the nodes by fit() before this is called.
        mask = np.array([elem.id for elem in get_text_elements(self.soup.body)])
        u[mask] = 1.0
        return u

    def fit(self, html):
        """Parse *html*, build the DOM adjacency matrix, and run SSPageRank.

        Returns self; the per-element score vector is available as self.f.
        """
        self.soup = bs4.BeautifulSoup(html, "lxml")
        # assign id to each element
        # The soup root itself gets id 0; every descendant gets 1..n.
        # (bs4 nodes accept arbitrary attribute assignment.)
        setattr(self.soup, "id", 0)
        self.elems = []
        for elem_id, elem in enumerate(self.soup.descendants, start=1):
            setattr(elem, "id", elem_id)
            self.elems.append(elem)
        # NOTE(review): <body> is itself among soup.descendants, so it appears
        # twice in self.elems; ids stay consistent (len == max id + 1), but
        # its edges are recorded twice — confirm this is intended.
        self.elems = [self.soup.body] + self.elems
        self.n_elems = len(self.elems)
        # create undirected adjacency matrix (child -> parent)
        A = sparse.lil_matrix((self.n_elems, self.n_elems))
        for elem in self.elems:
            i = elem.id
            j = elem.parent.id
            A[i,j] = 1
            # NavigableString leaves have no .children attribute.
            if not hasattr(elem, "children"):
                continue
            for child in elem.children:
                j = child.id
                A[j,i] = 1
        # Symmetrize. Since most parent/child pairs were already written in
        # both directions above, their final weight is 2 rather than 1 —
        # presumably harmless for the ranking; TODO confirm.
        A = A + A.T
        # compute initial weight. default: text_uniform_weight
        u = self.weight_func()
        super(DomPageRank, self).fit(A, u)
        return self
class ContentExtractor(DomPageRank):
    """Main-content extractor built on DomPageRank.

    Seeds the PageRank prior with text length (longer text -> higher prior),
    then scores container elements and extracts the top-ranked subtree's
    text and images.
    """

    # Container tags eligible to be scored as content blocks.
    tags = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'table', 'map', 'section', 'article', 'ul'])
    # Subtrees never descended into when collecting text.
    irrelevant_tags = set(['script', 'style', 'nav', 'aside'])

    def weight_func(self):
        """Prior weight = length of each stripped text node under <body>.

        Overrides DomPageRank.weight_func. Uses a local variant of
        get_text_elements that also skips the irrelevant_tags subtrees
        (<nav>, <aside>) in addition to <script>/<style>.
        """
        def get_text_elements(elem):
            if isinstance(elem, bs4.NavigableString):
                if type(elem) not in (bs4.Comment, bs4.Declaration) and elem.strip():
                    yield elem
            elif elem.name not in ContentExtractor.irrelevant_tags:
                for content in elem.contents:
                    for text_elem in get_text_elements(content):
                        yield text_elem
        u = np.zeros(self.n_elems)
        elems = [elem for elem in get_text_elements(self.soup.body)]
        for elem in elems:
            # elem is a NavigableString, so elem.string is the text itself.
            u[elem.id] = len(elem.string.strip())
        return u

    def get_weighted_texts(self, root=None):
        """Return [(text, weight)] for every text node under *root*.

        A node's weight is the sum of self.g scores of the scored container
        elements on its path from *root*. Lazily computes self.g if needed.
        """
        root = root if root is not None else self.soup.body
        if not hasattr(self, "g"):
            self.weight_elements(root)
        h = []
        def extract(elem, weight):
            if isinstance(elem, bs4.NavigableString):
                if type(elem) not in (bs4.Comment, bs4.Declaration) and elem.strip():
                    h.append((elem.string.strip(), weight))
            elif elem.name not in ContentExtractor.irrelevant_tags:
                for content in elem.contents:
                    # Accumulate this container's score down the recursion
                    # only when it was scored (its tag is in self.g).
                    if elem.id in self.g:
                        extract(content, weight + self.g[elem.id])
                    else:
                        extract(content, weight)
        extract(root, 0.0)
        return h

    def weight_elements(self, root=None):
        """Build self.g: element id -> score for container tags under *root*.

        An element's score is its own PageRank value plus the values of its
        direct children that are also container tags.
        """
        root = root if root is not None else self.soup.body
        elems = [root] + list(root.descendants)
        self.g = {}
        for elem in elems:
            if elem.name not in ContentExtractor.tags:
                continue
            score = self.f[elem.id]
            # NavigableStrings have no .children attribute.
            if hasattr(elem, "children"):
                for child in elem.children:
                    if child.name in ContentExtractor.tags:
                        score += self.f[child.id]
            # setdefault keeps the first score if an id repeats.
            self.g.setdefault(elem.id, score)

    def extract_elements(self, root=None, topn=1):
        """Return the *topn* highest-scoring (element, score) pairs under *root*."""
        root = root if root is not None else self.soup.body
        if not hasattr(self, "g"):
            self.weight_elements(root)
        elems = [elem for elem in [root] + list(root.descendants) if elem.id in self.g]
        scores = [self.g[elem.id] for elem in elems]
        # Sort descending by score and keep the top *topn*.
        return [(elems[i], scores[i]) for i in np.argsort(scores)[::-1][:topn]]

    def extract_text(self, deliminator=u" "):
        """Join the text of the single best element with *deliminator*.

        NOTE(review): less robust than get_weighted_texts(), as stated by
        the author — a single wrong top element loses everything.
        """
        elem, score = self.extract_elements(topn=1)[0]
        self.text = deliminator.join(elem.string.strip() for elem in get_text_elements(elem) if elem.string.strip())
        return self.text

    def extract_images(self, topn=1):
        """Return [(img src, score)] for all <img> tags inside the topn elements."""
        image_scores = []
        for elem, score in self.extract_elements(topn=topn):
            for image in elem.find_all("img"):
                image_scores.append((image.attrs["src"], score))
        return image_scores
if __name__ == '__main__': | |
import sys, requests | |
url = sys.argv[1] | |
html = url_open(url) | |
ce = ContentExtractor(alpha=0.1, beta=1.0).fit(html) | |
# extract weighted texts | |
h = ce.get_weighted_texts() | |
for text, weight in sorted(h, key=lambda x:x[1], reverse=True)[:10]: | |
print text, weight | |
# extract bs4's elements | |
for elem, score in ce.extract_elements(topn=5): | |
print elem.name, elem.attrs, score | |
# extract content text (this is not robust, please use "weighted texts") | |
print ce.extract_text(deliminator=u"\n") | |
# extract images from topn elements and its confidences. | |
for src, score in ce.extract_images(topn=10): | |
print src, score |
しょこたんブログもいけました.
対象:http://ameblo.jp/nakagawa-shoko/entry-12180749681.html
抽出結果:
2016-07-15 09:53:22 NEW ! テーマ: ブログ いよいよ明日はポケモン映画公開
‼️ 10年連続で映画ゲスト声優として参加させていただけるなんて幸せの極みです。感謝と愛‼️ であしたの舞台挨拶を楽しみにしています 夏はポケモン‼️ 今日までしか買えない前売り券でボルケニオンをもらおう AD いいね! リブログする NEW リブログボタンが追加されました! さっそく試してみよう リブログって何? いいね!した人 |
リブログ(0) 中川翔子さんの読者になろう ブログの更新情報が受け取れて、アクセスが簡単になります 読者になる ツイート 記事一覧 | ポケモンGO » 最近の画像つき記事 ドラゴンクエストライ… 昨日 アリーナ おととい ドラゴンクエストライ… おととい
Now cesspr.py can extract content images.
Moved to the repository: https://github.com/nkt1546789/weightress
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
日本語解説記事:http://nktmemoja.github.io/jekyll/update/2016/07/10/content-extraction-sspagerank.html