Skip to content

Instantly share code, notes, and snippets.

@k5trismegistus
Created January 9, 2015 14:34
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save k5trismegistus/65675211b870287c00fa to your computer and use it in GitHub Desktop.
Save k5trismegistus/65675211b870287c00fa to your computer and use it in GitHub Desktop.
blogger
import urllib.request
import bs4
import time
def fetch_page(url):
    """Download *url* and return it parsed as a BeautifulSoup document.

    The response body is decoded as UTF-8 before parsing.

    Args:
        url: Address of the page to fetch.

    Returns:
        A ``bs4.BeautifulSoup`` object for the downloaded HTML.

    Raises:
        urllib.error.URLError: If the request fails.
        UnicodeDecodeError: If the body is not valid UTF-8.
    """
    # Close the HTTP response deterministically instead of leaking it
    # until garbage collection.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode('utf-8')
    # Name the parser explicitly: without it bs4 picks whichever parser is
    # installed (and warns), so results could differ between machines.
    return bs4.BeautifulSoup(html, 'html.parser')
def get_content(soup):
    """Return the text of the post body found in *soup*.

    Looks up the ``div`` whose class is ``post-body entry-content``, removes
    every nested ``div`` with class ``amazlet-box`` from it, and returns the
    remaining text. The removal mutates *soup* in place.

    Args:
        soup: Parsed page, as returned by ``fetch_page``.

    Returns:
        The post body text, or an empty string when the page contains no
        post body element.
    """
    content = soup.find('div', class_='post-body entry-content')
    if content is None:
        # A page without a post body contributes no text instead of
        # failing with AttributeError on None.
        return ''
    for box in content.find_all('div', class_='amazlet-box'):
        box.extract()
    return content.text
def get_next(soup):
    """Return the URL of the older-posts page linked from *soup*.

    The link is the ``a`` element with class ``blog-pager-older-link``.

    Args:
        soup: Parsed page, as returned by ``fetch_page``.

    Returns:
        The value of the link's ``href`` attribute, or ``None`` when the
        page has no such link (or the link has no ``href``).
    """
    link = soup.find('a', class_='blog-pager-older-link')
    if link is None:
        # No older page: report that with None rather than letting an
        # incidental AttributeError escape.
        return None
    return link.get('href')
if __name__ == "__main__":
    # Crawl the blog from its front page through every "older posts" link,
    # collect the text of each page, and write it all to text.txt.
    blog_url = 'http://awakara.blogspot.jp/'

    parts = []
    url = blog_url
    while url is not None:
        soup = fetch_page(url)
        parts.append(get_content(soup))
        try:
            url = get_next(soup)
        except AttributeError:
            # get_next found no older-page link: this was the last page.
            break
        # Pause between requests so the blog host is not hammered.
        time.sleep(1)

    # str.replace returns a new string; the result must be kept for the
    # non-breaking spaces to actually become ordinary spaces.
    text = ''.join(parts).replace('\xa0', ' ')

    # Write as UTF-8 explicitly (the pages were decoded as UTF-8) so the
    # output does not depend on the platform's default encoding.
    with open('text.txt', 'w', encoding='utf-8') as f:
        f.write(text)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment