Skip to content

Instantly share code, notes, and snippets.

@nkt1546789
Last active July 27, 2016 17:35
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save nkt1546789/dfc4f01dbf4aa8a9d32762e904865560 to your computer and use it in GitHub Desktop.
Save nkt1546789/dfc4f01dbf4aa8a9d32762e904865560 to your computer and use it in GitHub Desktop.
Content extractor for HTML documents based on semi-supervised PageRank.
import numpy as np
import scipy.sparse.linalg as la
import bs4
from scipy import sparse
def get_text_elements(elem):
    """Yield the visible, non-blank text nodes below ``elem`` in document order.

    Strings whose exact type is ``bs4.Comment`` or ``bs4.Declaration`` are
    skipped, as are whitespace-only strings and everything nested inside
    ``<script>`` or ``<style>`` tags.
    """
    skipped_tags = ('script', 'style')
    hidden_types = (bs4.Comment, bs4.Declaration)
    # Explicit depth-first walk; children are pushed reversed so that they
    # are popped (and therefore yielded) left to right.
    pending = [elem]
    while pending:
        node = pending.pop()
        if isinstance(node, bs4.NavigableString):
            if type(node) in hidden_types or not node.strip():
                continue
            yield node
        elif node.name not in skipped_tags:
            pending.extend(reversed(node.contents))
class SSPageRank(object):
    """Semi-supervised PageRank scores on an undirected graph.

    The scores ``f`` solve the sparse linear system
    ``(beta * L + alpha * I) f = u`` where ``L = D - A`` is the graph
    Laplacian of the adjacency matrix ``A``, ``D`` is the diagonal matrix of
    row sums of ``A`` and ``u`` is the initial weight of every node.
    Negative entries of the solution are clipped to zero.
    """

    def __init__(self, alpha=0.1, beta=1.0):
        """Store the coefficients of the identity (``alpha``) and Laplacian (``beta``) terms."""
        self.alpha = alpha
        self.beta = beta

    def fit(self, A, u):
        """Solve for the node scores and store them in ``self.f``.

        :param A: square sparse adjacency matrix of the graph.
        :param u: initial weight vector with one entry per node.
        :return: ``self``, so calls can be chained.
        """
        n = A.shape[0]
        # Row sums flattened to a 1-D vector give the degree diagonal D.
        degrees = np.asarray(A.sum(axis=1)).ravel()
        L = sparse.diags([degrees], [0]) - A
        # NOTE: per the original author, beta*L/(n**2) is the correct
        # formula; the unnormalised Laplacian is used here for convenience.
        # The system is solved exactly once (the previous implementation
        # solved it twice and discarded the first result).
        system = sparse.csc_matrix(self.beta * L + self.alpha * sparse.identity(n))
        self.f = np.maximum(0.0, la.spsolve(system, u))
        return self

    def fit_predict(self, A, u):
        """Run :meth:`fit` and return the resulting score vector ``self.f``."""
        self.fit(A, u)
        return self.f
class DomPageRank(SSPageRank):
    """Semi-supervised PageRank over the DOM tree of an HTML document.

    Every node of the parsed document (tags and strings) becomes a graph
    node; each node is linked to its parent by an undirected edge.  The
    initial node weights come from :meth:`weight_func`.
    """

    def weight_func(self):
        """
        Weight function to DOM elements
        default: text uniform weight
        You can implement your own weight function by overriding this function

        :return: array of length ``self.n_elems`` holding 1.0 for every
            visible text node below ``<body>`` and 0.0 elsewhere.
        """
        # text uniform weight (default)
        u = np.zeros(self.n_elems)
        # dtype=int keeps this a valid index array even when the page has no
        # text nodes (an empty list would otherwise become a float array,
        # which numpy refuses to index with).
        text_ids = np.array(
            [elem.id for elem in get_text_elements(self.soup.body)], dtype=int)
        u[text_ids] = 1.0
        return u

    def fit(self, html):
        """Parse ``html``, build the DOM graph and compute the node scores.

        :param html: HTML markup handed to ``bs4.BeautifulSoup`` (lxml parser).
        :return: ``self``; scores are in ``self.f``, indexed by the ``id``
            attribute assigned to each node here.
        :raises ValueError: if the parsed document has no ``<body>``.
        """
        self.soup = bs4.BeautifulSoup(html, "lxml")
        if self.soup.body is None:
            raise ValueError("HTML document has no <body> element")
        # assign id to each element: 0 is the document root, descendants
        # are numbered from 1 in traversal order
        setattr(self.soup, "id", 0)
        descendants = []
        for elem_id, elem in enumerate(self.soup.descendants, start=1):
            setattr(elem, "id", elem_id)
            descendants.append(elem)
        # The leading body entry is kept from the original layout; it makes
        # the list length equal to the number of ids (root + descendants).
        self.elems = [self.soup.body] + descendants
        self.n_elems = len(self.elems)
        # create undirected adjacency matrix: one child -> parent entry per
        # node, then symmetrise.  Every node records its own parent edge, so
        # no separate pass over children is needed.
        rows = np.array([elem.id for elem in descendants], dtype=int)
        cols = np.array([elem.parent.id for elem in descendants], dtype=int)
        A = sparse.coo_matrix(
            (np.ones(len(descendants)), (rows, cols)),
            shape=(self.n_elems, self.n_elems))
        A = A + A.T
        # compute initial weight. default: text_uniform_weight
        u = self.weight_func()
        super(DomPageRank, self).fit(A, u)
        return self
class ContentExtractor(DomPageRank):
    """Main-content extractor built on :class:`DomPageRank`.

    Text nodes are seeded with their stripped length, PageRank scores are
    computed over the DOM, and block-level elements listed in ``tags`` are
    then ranked by their own score plus the scores of their direct children
    that are also in ``tags``.
    """

    # tags that may be scored and returned as content containers
    tags = set(['h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'p', 'div', 'table', 'map', 'section', 'article', 'ul'])
    # subtrees under these tags are ignored when collecting text
    irrelevant_tags = set(['script', 'style', 'nav', 'aside'])

    def fit(self, html):
        """Fit on ``html``, discarding element scores cached from an earlier document.

        Without this reset a second ``fit`` would keep using ``self.g`` keyed
        by the node ids of the previously fitted document.

        :return: ``self``.
        """
        self.__dict__.pop("g", None)
        return super(ContentExtractor, self).fit(html)

    def _iter_texts(self, elem):
        """Yield visible, non-blank strings below ``elem``, skipping ``irrelevant_tags`` subtrees."""
        if isinstance(elem, bs4.NavigableString):
            if type(elem) not in (bs4.Comment, bs4.Declaration) and elem.strip():
                yield elem
        elif elem.name not in self.irrelevant_tags:
            for content in elem.contents:
                for text_elem in self._iter_texts(content):
                    yield text_elem

    def weight_func(self):
        """Return initial weights: each visible text node gets its stripped length, others 0."""
        u = np.zeros(self.n_elems)
        for text_elem in self._iter_texts(self.soup.body):
            u[text_elem.id] = len(text_elem.string.strip())
        return u

    def get_weighted_texts(self, root=None):
        """Return ``(text, weight)`` pairs for the visible strings below ``root``.

        The weight of a string is the sum of the element scores (``self.g``)
        of all its ancestors from ``root`` downwards.

        :param root: subtree to walk; defaults to ``<body>``.
        :return: list of ``(stripped_text, weight)`` tuples in document order.
        """
        root = root if root is not None else self.soup.body
        if not hasattr(self, "g"):
            self.weight_elements(root)
        weighted = []

        def extract(elem, weight):
            if isinstance(elem, bs4.NavigableString):
                if type(elem) not in (bs4.Comment, bs4.Declaration) and elem.strip():
                    weighted.append((elem.string.strip(), weight))
            elif elem.name not in self.irrelevant_tags:
                # The contribution of this element is the same for all of
                # its children, so compute it once outside the loop.
                if elem.id in self.g:
                    child_weight = weight + self.g[elem.id]
                else:
                    child_weight = weight
                for content in elem.contents:
                    extract(content, child_weight)

        extract(root, 0.0)
        return weighted

    def weight_elements(self, root=None):
        """Compute ``self.g``: a score for every element below ``root`` whose tag is in ``tags``.

        An element's score is its PageRank score plus the PageRank scores of
        its direct children that are also in ``tags``.

        :param root: subtree to score; defaults to ``<body>``.
        """
        root = root if root is not None else self.soup.body
        self.g = {}
        for elem in [root] + list(root.descendants):
            if elem.name not in self.tags:
                continue
            score = self.f[elem.id]
            for child in getattr(elem, "children", ()):
                if child.name in self.tags:
                    score += self.f[child.id]
            # ids are unique per node; setdefault keeps the first value seen
            self.g.setdefault(elem.id, score)

    def extract_elements(self, root=None, topn=1):
        """Return the ``topn`` highest scoring elements below ``root``.

        :param root: subtree to search; defaults to ``<body>``.
        :param topn: maximum number of elements to return.
        :return: list of ``(element, score)`` tuples, best first; empty when
            no element below ``root`` has a score.
        """
        root = root if root is not None else self.soup.body
        if not hasattr(self, "g"):
            self.weight_elements(root)
        candidates = [elem for elem in [root] + list(root.descendants) if elem.id in self.g]
        scores = [self.g[elem.id] for elem in candidates]
        return [(candidates[i], scores[i]) for i in np.argsort(scores)[::-1][:topn]]

    def extract_text(self, deliminator=u" "):
        """Return the text of the single best element, joined by ``deliminator``.

        The result is also stored in ``self.text``.  An empty string is
        returned when the document contains no scorable element.
        """
        best = self.extract_elements(topn=1)
        if not best:
            self.text = u""
            return self.text
        top_elem, _score = best[0]
        self.text = deliminator.join(
            text_elem.string.strip() for text_elem in get_text_elements(top_elem))
        return self.text

    def extract_images(self, topn=1):
        """Return ``(src, score)`` for every ``<img>`` inside the ``topn`` best elements.

        Images without a ``src`` attribute are skipped instead of raising
        ``KeyError``.
        """
        image_scores = []
        for elem, score in self.extract_elements(topn=topn):
            for image in elem.find_all("img"):
                src = image.get("src")
                if src is not None:
                    image_scores.append((src, score))
        return image_scores
if __name__ == '__main__':
    # Command-line demo: fetch the page at the URL given as first argument
    # and print the extracted content.
    import sys
    import requests

    if len(sys.argv) < 2:
        sys.exit("usage: python %s URL" % sys.argv[0])
    url = sys.argv[1]
    # The original called an undefined url_open(); fetch with requests,
    # which was imported for this purpose.
    response = requests.get(url)
    response.raise_for_status()
    # Pass the raw bytes so BeautifulSoup detects the page encoding itself.
    html = response.content
    ce = ContentExtractor(alpha=0.1, beta=1.0).fit(html)
    # extract weighted texts
    h = ce.get_weighted_texts()
    for text, weight in sorted(h, key=lambda x: x[1], reverse=True)[:10]:
        print("%s %s" % (text, weight))
    # extract bs4's elements
    for elem, score in ce.extract_elements(topn=5):
        print("%s %s %s" % (elem.name, elem.attrs, score))
    # extract content text (this is not robust, please use "weighted texts")
    print(ce.extract_text(deliminator=u"\n"))
    # extract images from topn elements and its confidences.
    for src, score in ce.extract_images(topn=10):
        print("%s %s" % (src, score))
@nkt1546789
Copy link
Author

@nkt1546789
Copy link
Author

しょこたんブログもいけました.
対象:http://ameblo.jp/nakagawa-shoko/entry-12180749681.html
抽出結果:

2016-07-15 09:53:22 NEW ! テーマ: ブログ いよいよ明日はポケモン映画公開‼️ 10年連続で映画ゲスト声優として参加させていただけるなんて幸せの極みです。感謝と愛‼️であしたの舞台挨拶を楽しみにしています 夏はポケモン‼️ 今日までしか買えない前売り券でボルケニオンをもらおう AD いいね! リブログする NEW リブログボタンが追加されました! さっそく試してみよう リブログって何? いいね!した人 |
リブログ(0) 中川翔子さんの読者になろう ブログの更新情報が受け取れて、アクセスが簡単になります 読者になる ツイート 記事一覧 | ポケモンGO » 最近の画像つき記事 ドラゴンクエストライ… 昨日 アリーナ おととい ドラゴンクエストライ… おととい

@nkt1546789
Copy link
Author

Now cesspr.py can extract content images.

@nkt1546789
Copy link
Author

Moved to the repository: https://github.com/nkt1546789/weightress

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment