Skip to content

Instantly share code, notes, and snippets.

@ellimilial
Created January 7, 2018 20:02
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save ellimilial/bdf6ea5db4733d351ec6086fdfe70efd to your computer and use it in GitHub Desktop.
Save ellimilial/bdf6ea5db4733d351ec6086fdfe70efd to your computer and use it in GitHub Desktop.
Sample scraper for the blog.pl website, which is about to be decommissioned.
from collections import OrderedDict
import os
import bs4
import requests
import re
import json
from requests.adapters import HTTPAdapter
from urllib3 import Retry
# Base URL of the blog to archive (no trailing slash; month pages are
# addressed as BLOG_ADDRESS/<year>/<month>).
BLOG_ADDRESS = 'http://arianrhod.blog.pl'
# Directory that receives all generated JSON files.
OUTPUT_BASE_DIR = 'arianrhod_blog_pl_archive'
# Inclusive (year, month) bounds of the range to scrape.
YM_START = (2001, 12)
YM_END = (2009, 3)

# One shared session for every request made by this script.
s = requests.Session()
# Retry up to 5 times with a backoff factor of 0.2 when the server answers
# with one of the listed 5xx status codes.
retries = Retry(
    total=5,
    backoff_factor=0.2,
    status_forcelist=[500, 502, 503, 504],
)
# Mount the retry policy for both schemes: post URLs are taken verbatim from
# the scraped pages, so an https:// link must not silently bypass the retries.
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
def get_posts_for_month(address, y, m):
    """Lazily yield every post linked from one monthly archive page.

    The month page for blog ``address``, year ``y`` and month ``m`` is
    downloaded with the shared session, and the link of every
    ``<h2 class="postTitle">`` heading is collected.  Posts are then fetched
    one by one with ``get_post`` and yielded in the reverse of the order in
    which their links appear on the page (presumably turning a newest-first
    listing into chronological order -- confirm against the site).

    This is a generator: nothing is downloaded until it is first iterated.
    """
    month_url = _get_month_page_url(address, y, m)
    response = s.get(month_url)
    page = bs4.BeautifulSoup(response.content, 'html.parser')

    # Collect every link up front so a malformed heading fails before any
    # individual post is fetched.
    links = []
    for heading in page.find_all('h2', {'class': 'postTitle'}):
        links.append(heading.span.a['href'])

    for link in links[::-1]:
        yield get_post(link)
def _get_month_page_url(ad, y, m):
return '{}/{}/{}'.format(ad, y, m)
# Matches the ``id`` attribute of the <div> that holds a comment's body
# (ids of the form "commentbody-...").
RE_COMMENT_CONTENT_ID = re.compile(r'commentbody-.*')


def get_post(url):
    """Download a single post page and return its data as an OrderedDict.

    Keys, in order: ``url``, ``name``, ``published``, ``content``,
    ``comment_count`` and ``comments``.  ``comment_count`` is the first
    whitespace-separated token of the comment label and is kept as a string.
    ``comments`` is a list of ``{'author', 'content'}`` dicts.

    Raises AttributeError when an expected element is missing from the page
    and ValueError when a comment title lacks the 'przez ' / 'około' markers.
    """
    response = s.get(url)
    page = bs4.BeautifulSoup(response.content, 'html.parser')

    name = page.find('h1', {'class': 'postTitle'}).span.a.text
    published = page.find('span', {'class': 'postDate'}).text
    # The first and last characters are dropped (presumably a leading and
    # trailing newline in the markup -- confirm against a live page).
    body = page.find('div', {'class': 'postContent'}).text[1:-1]
    count_label = page.find('a', {'class': 'postCommentLabel'}).span.text
    comment_count = count_label.split()[0]

    comments = []
    for item in page.find_all('li', {'class': 'comment'}):
        heading = item.find('span', {'class': 'title'}).text
        text = item.find('div', {'id': RE_COMMENT_CONTENT_ID}).text[1:-1]
        # The author is whatever sits between 'przez ' (Polish "by") and the
        # space preceding 'około' (Polish "about") in the comment title.
        author_start = heading.index('przez ') + 6
        author_end = heading.index('około') - 1
        comments.append({
            'author': heading[author_start:author_end],
            'content': text,
        })

    return OrderedDict([
        ('url', url),
        ('name', name),
        ('published', published),
        ('content', body),
        ('comment_count', comment_count),
        ('comments', comments),
    ])
def main():
    """Scrape every month from YM_START to YM_END (inclusive) into JSON files.

    Inside OUTPUT_BASE_DIR this writes, for the whole run, ``all.json`` and
    ``readable_all.json`` and, for each month, ``<y>-<m>.json`` and
    ``readable_<y>-<m>.json``.  Each post is written as its own indented JSON
    document followed by a newline.  The ``readable_`` variants have every
    JSON-escaped newline sequence (backslash + ``n``) inside string values
    replaced by a space.
    """
    # open(..., 'w') does not create missing directories, so make sure the
    # archive directory exists before the first file is opened.
    os.makedirs(OUTPUT_BASE_DIR, exist_ok=True)

    def open_output(file_name):
        # ensure_ascii=False emits raw non-ASCII text, so force UTF-8 instead
        # of relying on the platform's default encoding.
        return open(os.path.join(OUTPUT_BASE_DIR, file_name), 'w',
                    encoding='utf-8')

    first_year, first_month = YM_START
    last_year, last_month = YM_END

    with open_output('all.json') as f_all, \
            open_output('readable_all.json') as f_all_no_newlines:
        for y in range(first_year, last_year + 1):
            # Only the first and last year are partial; the rest span 1..12.
            m_start = first_month if y == first_year else 1
            m_end = last_month if y == last_year else 12
            for m in range(m_start, m_end + 1):
                print(y, m)
                posts = get_posts_for_month(BLOG_ADDRESS, y, m)
                with open_output('{}-{}.json'.format(y, m)) as f_month, \
                        open_output('readable_{}-{}.json'.format(y, m)) as f_month_no_newline:
                    for p in posts:
                        js = json.dumps(p, ensure_ascii=False, indent=4)
                        # '\\n' is the two-character escape json.dumps emits
                        # for a newline inside a string value.
                        js_no_newlines = js.replace('\\n', ' ')
                        f_all.write(js + '\n')
                        f_month.write(js + '\n')
                        f_all_no_newlines.write(js_no_newlines + '\n')
                        f_month_no_newline.write(js_no_newlines + '\n')


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment