Created
October 30, 2019 14:41
-
-
Save jtprogru/64f149a7414a8b3869d0c23d056e5d22 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# coding=utf-8 | |
# Created by JTProgru | |
# Date: 2019-10-29 | |
# https://jtprog.ru/ | |
__author__ = 'jtprogru' | |
__version__ = '0.0.1' | |
__author_email__ = 'mail@jtprog.ru' | |
import hashlib as h
import json
import os
from time import sleep

import requests
from bs4 import BeautifulSoup
def dump_post(post, name):
    """Serialize *post* as JSON into ``posts/post_<name>.json``.

    Args:
        post: JSON-serializable object to store.
        name: Value used to build the file name; converted with ``str``.

    Raises:
        OSError: If the directory or file cannot be created or written.
    """
    f_name = 'posts/post_' + str(name) + '.json'
    # Create the target directory on first use instead of failing with
    # FileNotFoundError when it does not exist yet.
    os.makedirs(os.path.dirname(f_name), exist_ok=True)
    # ensure_ascii=False writes non-ASCII characters verbatim, so the file
    # encoding must be explicit rather than depend on the platform locale.
    with open(f_name, "w", encoding="utf-8") as f:
        json.dump(post, fp=f, ensure_ascii=False)
    print(f"Post saved in file: {f_name}")
def get_post_obj(article):
    """Extract author, publication date and body text from a post element.

    Args:
        article: Parsed element exposing ``find`` (the caller passes
            ``BeautifulSoup(...).article``). May be ``None``.

    Returns:
        dict: On success, the keys ``author``, ``author_url``, ``pub_date``
        and ``post_text``. Extraction is best-effort: if an element or
        attribute is missing, only the keys stored before the failure are
        returned (possibly an empty dict). ``post_text`` is stored last, so
        its presence means every field was found.
    """
    post = {}
    try:
        post_text = article.find("div", attrs={"id": "post-content-body"}).text
        author = article.find('a', attrs={"class": "post__user-info user-info"})
        pub_date = article.find("span", attrs={"class": "post__time"})
        post['author'] = author.text.strip()
        post['author_url'] = author.attrs['href']
        post['pub_date'] = pub_date.attrs['data-time_published']
        post['post_text'] = post_text
    except (AttributeError, KeyError):
        # AttributeError: `article` or a looked-up element is None.
        # KeyError: the element exists but lacks the expected attribute.
        # Anything else is an unexpected error and is left to propagate.
        pass
    return post
def post_dumper(link):
    """Download the page at *link*, extract its post and save it as JSON.

    The post is stored through ``dump_post`` under a name equal to the
    SHA-256 hex digest of its text, as a ``[digest, post_dict]`` pair.

    Args:
        link: URL of the page to fetch.

    This function never raises: any failure is reported on stdout and the
    link is skipped, so one bad link does not abort a batch run.
    """
    try:
        # A timeout keeps one unresponsive server from hanging the run.
        res = requests.get(link, timeout=30)
        # Turn HTTP error statuses into exceptions instead of parsing an
        # error page as if it were a post.
        res.raise_for_status()
        s = BeautifulSoup(res.text, "lxml")
        prep_post = get_post_obj(s.article)
        if 'post_text' not in prep_post:
            # get_post_obj could not find the post body on this page.
            print(f"Post skipped, content not found: {link}")
            return
        pid = h.sha256(str(prep_post['post_text']).encode()).hexdigest()
        post = [pid, prep_post]
        dump_post(post=post, name=pid)
    except Exception as exc:
        # Batch boundary: keep the best-effort behaviour, but say what went
        # wrong rather than swallowing the error silently.
        print(f"Post skipped, failed to process {link}: {exc!r}")
def links_loader():
    """Return the lines of ``data/links.txt`` as a list of strings.

    Line terminators are kept, exactly as ``readlines`` yields them.

    Raises:
        OSError: If the file cannot be opened for reading.
    """
    # Read-only mode: loading links must not require write permission.
    with open("data/links.txt", "r", encoding="utf-8") as f:
        return f.readlines()
if __name__ == '__main__':
    # Script entry point: process every link from the links file in order.
    links = links_loader()
    for li in links:
        # strip() removes the line terminator without chopping the last
        # character of a final line that has no trailing newline.
        url = li.strip()
        if not url:
            # Blank line in the links file: nothing to fetch.
            continue
        # Pause 3 s before each request (presumably to go easy on the server).
        sleep(3)
        post_dumper(url)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment