#!/usr/bin/env python
# coding=utf-8
# Created by JTProgru
# Date: 2019-10-29
# https://jtprog.ru/

__author__ = 'jtprogru'
__version__ = '0.0.1'
__author_email__ = 'mail@jtprog.ru'

import hashlib as h
import json
from time import sleep

import requests
from bs4 import BeautifulSoup


def dump_post(post, name):
    """Write a scraped post to posts/post_<name>.json."""
    f_name = 'posts/post_' + str(name) + '.json'
    with open(f_name, "w") as f:
        json.dump(post, fp=f, ensure_ascii=False)
    print(f"Post saved in file: {f_name}")


def get_post_obj(article):
    """Extract author, publication date and body text from an <article> tag."""
    post = {}
    try:
        post_text = article.find("div", attrs={"id": "post-content-body"}).text
        author = article.find('a', attrs={"class": "post__user-info user-info"})
        pub_date = article.find("span", attrs={"class": "post__time"})
        post['author'] = author.text.strip()
        post['author_url'] = author.attrs['href']
        post['pub_date'] = pub_date.attrs['data-time_published']
        post['post_text'] = post_text
        return post
    except Exception:
        # If the page layout differs, return whatever was collected so far.
        return post


def post_dumper(link):
    """Download a post by URL, hash its body and dump it to disk."""
    try:
        res = requests.get(link)
        s = BeautifulSoup(res.text, "lxml")
        prep_post = get_post_obj(s.article)
        # The SHA-256 of the post body doubles as a stable file name.
        pid = h.sha256(str(prep_post['post_text']).encode()).hexdigest()
        post = [pid, prep_post]
        dump_post(post=post, name=pid)
    except Exception:
        pass


def links_loader():
    """Read the list of post URLs, one per line."""
    with open("data/links.txt", "r") as f:
        return f.readlines()


if __name__ == '__main__':
    links = links_loader()
    for li in links:
        sleep(3)  # be polite to the server between requests
        post_dumper(li.strip())  # strip the trailing newline from each URL
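A quick sketch of reading the dumps back, assuming the script above has already run (it expects data/links.txt with one post URL per line and an existing posts/ directory; each output file post_<sha256>.json holds the two-element list [pid, post] written by dump_post):

import glob
import json

# Each dump is a two-element list: [sha256_of_body, post_dict]
for path in glob.glob("posts/post_*.json"):
    with open(path) as f:
        pid, post = json.load(f)
    print(pid, post.get("author"), post.get("pub_date"))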