Created
January 7, 2018 20:02
-
-
Save ellimilial/bdf6ea5db4733d351ec6086fdfe70efd to your computer and use it in GitHub Desktop.
Sample scraper for the blog.pl website, which is about to be decommissioned.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import OrderedDict | |
import os | |
import bs4 | |
import requests | |
import re | |
import json | |
from requests.adapters import HTTPAdapter | |
from urllib3 import Retry | |
# Scrape configuration: which blog to archive and where to write the output.
BLOG_ADDRESS = 'http://arianrhod.blog.pl'
OUTPUT_BASE_DIR = 'arianrhod_blog_pl_archive'
# Inclusive (year, month) bounds of the archive span to fetch.
YM_START = (2001, 12)
YM_END = (2009, 3)

# Shared HTTP session that retries transient server errors with backoff.
s = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=0.2,
    status_forcelist=[500, 502, 503, 504],
)
s.mount('http://', HTTPAdapter(max_retries=retries))
def get_posts_for_month(address, y, m):
    """Yield scraped posts for one archive month, oldest first.

    The month page lists posts newest-first, so the collected links are
    walked in reverse to yield them in chronological order.
    """
    month_html = s.get(_get_month_page_url(address, y, m)).content
    page = bs4.BeautifulSoup(month_html, 'html.parser')
    links = [heading.span.a['href']
             for heading in page.find_all('h2', {'class': 'postTitle'})]
    for link in reversed(links):
        yield get_post(link)
def _get_month_page_url(ad, y, m): | |
return '{}/{}/{}'.format(ad, y, m) | |
# Matches the id attribute of comment-body <div> elements, e.g. "commentbody-123".
RE_COMMENT_CONTENT_ID = re.compile(r'commentbody-.*')

def get_post(url):
    """Fetch a single post page and return it as an OrderedDict.

    Keys, in insertion order: url, name, published, content, comment_count,
    comments.  The [1:-1] slices drop the first and last character of the
    extracted text (presumably surrounding newlines — confirm against the
    page markup).
    """
    post = OrderedDict(url=url)
    soup = bs4.BeautifulSoup(s.get(url).content, 'html.parser')
    post['name'] = soup.find('h1', {'class': 'postTitle'}).span.a.text
    post['published'] = soup.find('span', {'class': 'postDate'}).text
    post['content'] = soup.find('div', {'class': 'postContent'}).text[1:-1]
    post['comment_count'] = (
        soup.find('a', {'class': 'postCommentLabel'}).span.text.split()[0])
    comments = []
    for item in soup.find_all('li', {'class': 'comment'}):
        title = item.find('span', {'class': 'title'}).text
        # The author's name sits between the Polish words "przez " and "około".
        author = title[title.index('przez ') + 6:title.index('około') - 1]
        body = item.find('div', {'id': RE_COMMENT_CONTENT_ID}).text[1:-1]
        comments.append({
            'author': author,
            'content': body
        })
    post['comments'] = comments
    return post
def main():
    """Scrape every month in [YM_START, YM_END] and dump posts as JSON lines.

    Writes under OUTPUT_BASE_DIR:
      - all.json / readable_all.json: every post across the whole span.
      - {y}-{m}.json / readable_{y}-{m}.json: per-month dumps.
    The "readable" variants replace escaped newlines inside JSON strings
    with spaces for easier human reading.
    """
    # Fix: the original raised FileNotFoundError when the output directory
    # did not already exist.
    os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)
    # Fix: files are opened with an explicit UTF-8 encoding — with
    # ensure_ascii=False the dumps contain non-ASCII (Polish) text, which
    # fails to encode under a non-UTF-8 platform default.
    with open(os.path.join(OUTPUT_BASE_DIR, 'all.json'), 'w', encoding='utf-8') as f_all, \
            open(os.path.join(OUTPUT_BASE_DIR, 'readable_all.json'), 'w', encoding='utf-8') as f_all_no_newlines:
        for y in range(YM_START[0], YM_END[0] + 1):
            # Clamp the month range at both edges of the overall span.
            m = 1 if y != YM_START[0] else YM_START[1]
            m_e = 12 if y != YM_END[0] else YM_END[1]
            while m <= m_e:
                print(y, m)
                posts = get_posts_for_month(BLOG_ADDRESS, y, m)
                with open(os.path.join(OUTPUT_BASE_DIR, '{}-{}.json'.format(y, m)), 'w', encoding='utf-8') as f_month, \
                        open(os.path.join(OUTPUT_BASE_DIR, 'readable_{}-{}.json'.format(y, m)), 'w', encoding='utf-8') as f_month_no_newline:
                    for p in posts:
                        js = json.dumps(p, ensure_ascii=False, indent=4)
                        # Collapse escaped newlines for the readable variant.
                        js_no_newlines = js.replace('\\n', ' ')
                        f_all.write(js + '\n')
                        f_month.write(js + '\n')
                        f_all_no_newlines.write(js_no_newlines + '\n')
                        f_month_no_newline.write(js_no_newlines + '\n')
                m += 1
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment