Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sat Mar 31 22:57:58 2018
https://www.quora.com/How-can-I-extract-only-text-data-from-HTML-pages
https://qiita.com/matsu0228/items/edf7dbba9b0b0246ef8f
@author: ks
"""
import datetime
import re
import sys
import urllib
import urllib.request

import MeCab as mc
from bs4 import BeautifulSoup
# Characters stripped from the article text before it is split into lines.
# NOTE(review): the class lists a blank twice; the second one may originally
# have been a full-width space (U+3000) - confirm against the author's copy.
_BLANKS = re.compile('[ \t ]')


def default_output_name():
    """Return the output file name ``out_<YYYY-MM-DDTHHMMSS>.txt`` for now."""
    stamp = datetime.datetime.today().strftime('%Y-%m-%dT%H%M%S')
    return '_'.join(['out', stamp]) + '.txt'


def iter_text_lines(text):
    """Yield every non-empty line of *text* after removing blanks and tabs.

    Blanks are removed from the whole text first, so a line that held only
    blanks or tabs becomes empty and is skipped.
    """
    for line in _BLANKS.sub('', text).split('\n'):
        if line:
            yield line


def fetch_entry_text(url):
    """Download *url* and return the text of its ``div.entry-content``.

    ``<pre>`` and ``<table>`` elements are removed from the whole document
    before the text is extracted.

    Raises:
        ValueError: if the page has no ``<div class="entry-content">``.
    """
    # The context manager closes the HTTP response once the page is parsed.
    with urllib.request.urlopen(url) as response:
        soup = BeautifulSoup(response, 'lxml')
    for tag_name in ('pre', 'table'):
        for tag in soup.find_all(tag_name):
            tag.decompose()
    content = soup.find('div', attrs={'class': 'entry-content'})
    if content is None:
        raise ValueError('no <div class="entry-content"> found in ' + url)
    return content.get_text()


def main(argv=None):
    """Fetch the page named on the command line and write its segmented text.

    Args:
        argv: argument vector; defaults to ``sys.argv``. ``argv[1]`` is the
            URL of the page to process.

    Each non-empty line of the extracted text is passed through MeCab and the
    result is written to the file named by ``default_output_name()`` in the
    current directory. Exits with a message if the URL is missing or the page
    has no entry content.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: ' + (argv[0] if argv else 'script') + ' URL')
    fname = default_output_name()
    # '-Owakati' selects MeCab's wakati output; morpheme details are not needed.
    tagger = mc.Tagger('-Owakati')
    url = argv[1]
    print('URL: ' + url)
    try:
        text = fetch_entry_text(url)
    except ValueError as err:
        sys.exit(str(err))
    # Explicit UTF-8 so the Japanese text is written the same on every locale.
    with open(fname, 'w', encoding='utf-8') as f:
        for line in iter_text_lines(text):
            f.write(tagger.parse(line))


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.