Skip to content

Instantly share code, notes, and snippets.

What would you like to do?
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Fetch a web page and save its entry text as MeCab-segmented lines.

Created on Sat Mar 31 22:57:58 2018
@author: ks

Usage: pass the page URL as the first command-line argument. The text of
the page's ``<div class="entry-content">`` element is extracted, split into
lines, and each non-empty line is run through the MeCab tagger and written
to a timestamped ``out_<timestamp>.txt`` file in the current directory.
"""
import datetime
import re
import sys
# `import urllib` alone does not load the `request` submodule; importing it
# explicitly also binds the top-level name `urllib`.
import urllib.request

from bs4 import BeautifulSoup
import MeCab as mc

# Timestamped output file name, e.g. out_2018-03-31T225758.txt.
fname = '_'.join(['out',
                  datetime.datetime.now().strftime('%Y-%m-%dT%H%M%S'),
                  ]) + '.txt'
tagger = mc.Tagger('-Owakati')  # detailed morpheme information is not needed

if len(sys.argv) < 2:
    sys.exit('usage: ' + sys.argv[0] + ' URL')
url = sys.argv[1]
print('URL: ' + url)

# Close the HTTP response as soon as the document has been parsed.
with urllib.request.urlopen(url) as html:
    soup = BeautifulSoup(html, 'lxml')

# NOTE(review): the bodies of these two loops were missing from the source;
# dropping <pre> and <table> elements before text extraction is the presumed
# intent -- confirm against the original script.
for x in soup.find_all('pre'):
    x.decompose()
for x in soup.find_all('table'):
    x.decompose()

entry = soup.find('div', attrs={'class': 'entry-content'})
if entry is None:
    # find() returns None when nothing matches; fail with a readable message
    # instead of an AttributeError on None.
    sys.exit('no <div class="entry-content"> element found at ' + url)
res = entry.get_text()

# Write UTF-8 explicitly so the output does not depend on the platform's
# default encoding.
with open(fname, 'w', encoding='utf-8') as f:
    # Strip spaces and tabs, then process the text line by line.
    # NOTE(review): the character class may originally have contained a
    # full-width space that was lost in transcription -- confirm.
    for x in re.sub('[ \t ]', '', res).split('\n'):
        if x != '':
            # NOTE(review): the body of this `if` was missing from the source;
            # writing the tagger output is the presumed intent (it is the only
            # use of `tagger`). Presumably parse() output already ends with a
            # newline -- confirm.
            f.write(tagger.parse(x))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.