Created
January 7, 2018 20:02
-
-
Save ellimilial/bdf6ea5db4733d351ec6086fdfe70efd to your computer and use it in GitHub Desktop.
Sample scraper for the blog.pl website, which is about to be decommissioned.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections import OrderedDict | |
import os | |
import bs4 | |
import requests | |
import re | |
import json | |
from requests.adapters import HTTPAdapter | |
from urllib3 import Retry | |
# Scrape configuration: which blog to archive and where to write the output.
BLOG_ADDRESS = 'http://arianrhod.blog.pl'
OUTPUT_BASE_DIR = 'arianrhod_blog_pl_archive'
# Inclusive (year, month) bounds of the archive span to fetch.
YM_START = (2001, 12)
YM_END = (2009, 3)

# Shared HTTP session that retries transient server errors with backoff.
s = requests.Session()
retries = Retry(
    total=5,
    backoff_factor=0.2,
    status_forcelist=[500, 502, 503, 504],
)
s.mount('http://', HTTPAdapter(max_retries=retries))
def get_posts_for_month(address, y, m):
    """Yield scraped posts for one archive month, oldest first.

    The month page lists posts newest-first, so the collected links are
    walked in reverse to yield them in chronological order.
    """
    month_html = s.get(_get_month_page_url(address, y, m)).content
    page = bs4.BeautifulSoup(month_html, 'html.parser')
    links = [heading.span.a['href']
             for heading in page.find_all('h2', {'class': 'postTitle'})]
    for link in reversed(links):
        yield get_post(link)
def _get_month_page_url(ad, y, m): | |
return '{}/{}/{}'.format(ad, y, m) | |
# Matches the id attribute of comment-body <div> elements, e.g. "commentbody-123".
RE_COMMENT_CONTENT_ID = re.compile(r'commentbody-.*')

def get_post(url):
    """Fetch a single post page and return it as an OrderedDict.

    Keys, in insertion order: url, name, published, content, comment_count,
    comments.  The [1:-1] slices drop the first and last character of the
    extracted text (presumably surrounding newlines — confirm against the
    page markup).
    """
    post = OrderedDict(url=url)
    soup = bs4.BeautifulSoup(s.get(url).content, 'html.parser')
    post['name'] = soup.find('h1', {'class': 'postTitle'}).span.a.text
    post['published'] = soup.find('span', {'class': 'postDate'}).text
    post['content'] = soup.find('div', {'class': 'postContent'}).text[1:-1]
    post['comment_count'] = (
        soup.find('a', {'class': 'postCommentLabel'}).span.text.split()[0])
    comments = []
    for item in soup.find_all('li', {'class': 'comment'}):
        title = item.find('span', {'class': 'title'}).text
        # The author's name sits between the Polish words "przez " and "około".
        author = title[title.index('przez ') + 6:title.index('około') - 1]
        body = item.find('div', {'id': RE_COMMENT_CONTENT_ID}).text[1:-1]
        comments.append({
            'author': author,
            'content': body
        })
    post['comments'] = comments
    return post
def main():
    """Scrape every month in [YM_START, YM_END] and dump posts as JSON lines.

    Writes under OUTPUT_BASE_DIR:
      - all.json / readable_all.json: every post across the whole span.
      - {y}-{m}.json / readable_{y}-{m}.json: per-month dumps.
    The "readable" variants replace escaped newlines inside JSON strings
    with spaces for easier human reading.
    """
    # Fix: the original raised FileNotFoundError when the output directory
    # did not already exist.
    os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)
    # Fix: files are opened with an explicit UTF-8 encoding — with
    # ensure_ascii=False the dumps contain non-ASCII (Polish) text, which
    # fails to encode under a non-UTF-8 platform default.
    with open(os.path.join(OUTPUT_BASE_DIR, 'all.json'), 'w', encoding='utf-8') as f_all, \
            open(os.path.join(OUTPUT_BASE_DIR, 'readable_all.json'), 'w', encoding='utf-8') as f_all_no_newlines:
        for y in range(YM_START[0], YM_END[0] + 1):
            # Clamp the month range at both edges of the overall span.
            m = 1 if y != YM_START[0] else YM_START[1]
            m_e = 12 if y != YM_END[0] else YM_END[1]
            while m <= m_e:
                print(y, m)
                posts = get_posts_for_month(BLOG_ADDRESS, y, m)
                with open(os.path.join(OUTPUT_BASE_DIR, '{}-{}.json'.format(y, m)), 'w', encoding='utf-8') as f_month, \
                        open(os.path.join(OUTPUT_BASE_DIR, 'readable_{}-{}.json'.format(y, m)), 'w', encoding='utf-8') as f_month_no_newline:
                    for p in posts:
                        js = json.dumps(p, ensure_ascii=False, indent=4)
                        # Collapse escaped newlines for the readable variant.
                        js_no_newlines = js.replace('\\n', ' ')
                        f_all.write(js + '\n')
                        f_month.write(js + '\n')
                        f_all_no_newlines.write(js_no_newlines + '\n')
                        f_month_no_newline.write(js_no_newlines + '\n')
                m += 1
# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment