Created
November 14, 2018 04:53
-
-
Save mikolasan/6133721ce4902b239f101fe6ae751807 to your computer and use it in GitHub Desktop.
yap scraping
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from requests import get | |
from requests.exceptions import RequestException | |
from contextlib import closing | |
from bs4 import BeautifulSoup | |
from bs4 import Comment | |
from datetime import datetime | |
def simple_get(url, timeout=10):
    """
    Attempts to get the content at `url` by making an HTTP GET request.

    Returns the response body (`resp.content`) when `is_good_response`
    accepts the response, otherwise returns None. Any `RequestException`
    is reported through `log_error` and also results in None.

    `timeout` (seconds) is forwarded to `requests.get` so that a server
    which never answers cannot block the caller indefinitely.
    """
    try:
        # closing() guarantees the streamed connection is released.
        with closing(get(url, stream=True, timeout=timeout)) as resp:
            if is_good_response(resp):
                return resp.content
            return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.

    A response qualifies when its status code is 200 and its
    Content-Type header contains "html" (case-insensitive). A response
    without a Content-Type header is reported as not good instead of
    raising KeyError.
    """
    content_type = resp.headers.get('Content-Type')
    return (resp.status_code == 200
            and content_type is not None
            and 'html' in content_type.lower())
def log_error(e):
    """
    Report an error by printing it to standard output.

    Kept as a separate function so the reporting mechanism can be
    swapped out in one place.
    """
    text = str(e)
    print(text)
def get_yap_soup():
    """
    Download the site front page and return it parsed by BeautifulSoup.

    Raises YapError when `simple_get` returns None (request failed or
    the response was not accepted as HTML), rather than handing None to
    the HTML parser.
    """
    url = 'http://www.yaplakal.com/'
    raw_html = simple_get(url)
    if raw_html is None:
        raise YapError('Could not fetch HTML from {0}'.format(url))
    return BeautifulSoup(raw_html, 'html.parser')
class YapError(Exception):
    """Root exception type for errors raised by the Yap parser."""
class ParserError(YapError):
    """Exception raised if html template has been changed"""

    def __init__(self, message):
        # Initialise the base exception explicitly so `args` / `str()`
        # are populated by the normal Exception machinery.
        super().__init__(message)
        # Human-readable description, kept as an attribute for callers.
        self.message = message
def parse_content(post, soup):
    """
    Detect which kind of media a post body contains and print it.

    `post` is a dict; only `post['id']` is read here. `soup` is the
    parsed element holding the post body. Exactly one line is printed
    for the first match, checked in this order: YouTube iframe, Coub
    iframe, HTML comments starting with "Begin Video:", plain image.
    Nothing is printed when none of them is present. Returns None.

    Tags that lack a `src` attribute no longer raise KeyError: a
    src-less iframe is simply not treated as Coub, and a missing `src`
    is printed as None.
    """
    picture = soup.img
    youtube_video = soup.find('iframe', attrs={'class': 'youtube-player'})

    # Only look for Coub when the iframe is not a YouTube player.
    coub_video = None
    if not youtube_video and soup.iframe:
        link = soup.iframe.get('src', '')
        if 'coub.com' in link:
            coub_video = link

    # The site's own player is recognised by marker comments in the markup.
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    yap_video = [c for c in comments if c.startswith('Begin Video:')]

    if youtube_video:
        print(post['id'], "YouTube video", youtube_video.get('src'))
    elif coub_video:
        print(post['id'], "Coub video", coub_video)
    elif yap_video:
        print(post['id'], "Yap video", yap_video)
    elif picture:
        print(post['id'], "Static picture", picture.get('src'))
def get_page_titles(max_count=50):
    """
    Walk the front-page feed table and assemble a dict for each post.

    Posts are numbered from 1 to `max_count`. For post N the rows after
    the `table.lenta` element are scanned in document order:

    * a `newshead` cell must have id `topic_N`; it supplies `rating`,
      `link` and `title`;
    * a `news-content` cell must have id `news_N`; it supplies
      `content` and is passed to `parse_content`;
    * a `newsbottom` cell supplies the footer fields and ends the post.

    Rows whose first cell is missing or has no `class` attribute are
    skipped. The assembled dict is currently discarded (nothing is
    returned or stored).

    Raises ParserError when the feed table is missing, when the rows
    run out before a post is complete, or when a cell id does not match
    the expected post number.
    """
    soup = get_yap_soup()
    lenta = soup.find('table', attrs={'class': 'lenta'})
    if lenta is None:
        raise ParserError('feed table not found')

    tr = lenta.find_next('tr')
    for count in range(1, max_count + 1):
        post = {'id': count}
        while True:
            if tr is None:
                raise ParserError(
                    'ran out of rows at post {0}'.format(count))
            cell = tr.td
            if cell is None or 'class' not in cell.attrs:
                tr = tr.find_next('tr')
                continue
            cell_class = cell['class']
            if 'newshead' in cell_class:
                if cell['id'] != 'topic_' + str(count):
                    raise ParserError('scipping title')
                rating_link = cell.div.div.a
                # Posts without a rating link get a rating of 0.
                post['rating'] = (rating_link.text
                                  if rating_link is not None else 0)
                post['link'] = cell.div.h2.a['href']
                post['title'] = cell.div.h2.a.text
            elif 'news-content' in cell_class:
                if cell['id'] != 'news_' + str(count):
                    raise ParserError('scipping content')
                post['content'] = cell.contents
                parse_content(post, cell)
            elif 'newsbottom' in cell_class:
                _parse_post_footer(tr, post)
                break
            tr = tr.find_next('tr')
        # TODO: send post to DB (the assembled dict is not used yet).
        tr = tr.find_next('tr')


def _parse_post_footer(row, post):
    """Copy author, section, date and comment count from a footer row."""
    for info in row.find_all('b'):
        # A <b> without a class attribute carries none of the fields.
        classes = info.get('class', [])
        if 'icon-user' in classes:
            post['author'] = info.text
        elif 'icon-forum' in classes:
            post['section'] = info.text
        elif 'icon-date' in classes:
            post['date'] = datetime.strptime(info.text, "%d.%m.%Y - %H:%M")
        elif 'icon-comment' in classes:
            post['comments'] = info.span.text
# Script entry point: this call sits at module level, so the scrape starts
# as soon as the file is executed.
get_page_titles()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment