# yap scraping
# @mikolasan, created November 14, 2018

from requests import get
from requests.exceptions import RequestException
from contextlib import closing
from bs4 import BeautifulSoup
from bs4 import Comment
from datetime import datetime

def simple_get(url):
    """
    Attempts to get the content at `url` by making an HTTP GET request.
    If the content-type of the response is some kind of HTML/XML, return
    the raw content, otherwise return None.
    """
    try:
        with closing(get(url, stream=True)) as resp:
            if is_good_response(resp):
                return resp.content
            else:
                return None
    except RequestException as e:
        log_error('Error during requests to {0} : {1}'.format(url, str(e)))
        return None
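
# Hedged usage sketch (not part of the original gist): simple_get returns raw
# bytes on success and None on failure, so callers should check before parsing.
def demo_simple_get():
    raw_html = simple_get('http://www.yaplakal.com/')
    if raw_html is None:
        print('download failed')
    else:
        print('downloaded {0} bytes'.format(len(raw_html)))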

def is_good_response(resp):
    """
    Returns True if the response seems to be HTML, False otherwise.
    """
    # .get avoids a KeyError when the server sends no Content-Type header
    content_type = resp.headers.get('Content-Type', '').lower()
    return (resp.status_code == 200
            and 'html' in content_type)
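
# Minimal check sketch: is_good_response only touches .status_code and
# .headers, so a stub object is enough to illustrate it. The stub below is an
# assumption for illustration, not part of the original gist.
class _FakeResponse:
    status_code = 200
    headers = {'Content-Type': 'text/html; charset=utf-8'}

def demo_is_good_response():
    assert is_good_response(_FakeResponse())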

def log_error(e):
    """
    It is always a good idea to log errors.
    This function just prints them, but you can
    make it do anything.
    """
    print(e)

def get_yap_soup():
    url = 'http://www.yaplakal.com/'
    raw_html = simple_get(url)
    if raw_html is None:
        # BeautifulSoup(None) would raise a confusing TypeError; fail loudly
        raise YapError('could not download ' + url)
    return BeautifulSoup(raw_html, 'html.parser')

class YapError(Exception):
    """Base class for Yap parser errors"""
    pass

class ParserError(YapError):
    """Exception raised if the html template has been changed"""
    def __init__(self, message):
        super().__init__(message)
        self.message = message

def parse_content(post, soup):
    picture = soup.img
    youtube_video = soup.find('iframe', attrs={'class': 'youtube-player'})
    coub_video = None
    if not youtube_video and soup.iframe:
        link = soup.iframe['src']
        coub_video = link if link.find('coub.com') != -1 else None
    # Yap's own player is embedded behind an HTML comment starting with "Begin Video:"
    comments = soup.find_all(string=lambda text: isinstance(text, Comment))
    yap_video = list(filter(lambda x: x.find('Begin Video:') == 0, comments))
    if youtube_video:
        print(post['id'], "YouTube video", youtube_video['src'])
    elif coub_video:
        print(post['id'], "Coub video", coub_video)
    elif yap_video:
        print(post['id'], "Yap video", yap_video)
    elif picture:
        print(post['id'], "Static picture", picture['src'])

def get_page_titles():
    soup = get_yap_soup()
    lenta = soup.find('table', attrs={'class': 'lenta'})
    tr = lenta.find_next('tr')
    count = 0
    max_count = 50
    while count < max_count:
        count = count + 1
        post = {'id': count}
        while True:
            if 'class' not in tr.td.attrs:
                tr = tr.find_next('tr')
                continue
            cell_class = tr.td['class']
            if 'newshead' in cell_class:
                if tr.td['id'] == 'topic_' + str(count):
                    if tr.td.div.div.a is not None:
                        post['rating'] = tr.td.div.div.a.text
                    else:
                        post['rating'] = 0
                    post['link'] = tr.td.div.h2.a['href']
                    post['title'] = tr.td.div.h2.a.text
                else:
                    raise ParserError('skipping title')
            elif 'news-content' in cell_class:
                if tr.td['id'] == 'news_' + str(count):
                    post['content'] = tr.td.contents
                    parse_content(post, tr.td)
                else:
                    raise ParserError('skipping content')
            elif 'newsbottom' in cell_class:
                post_info = tr.find_all('b')
                for info in post_info:
                    if 'icon-user' in info['class']:
                        post['author'] = info.text
                    elif 'icon-forum' in info['class']:
                        post['section'] = info.text
                    elif 'icon-date' in info['class']:
                        post['date'] = datetime.strptime(info.text, "%d.%m.%Y - %H:%M")
                    elif 'icon-comment' in info['class']:
                        post['comments'] = info.span.text
                # the bottom row ends this post; leave the inner loop
                break
            tr = tr.find_next('tr')
        # send post to DB (see the save_post sketch below)
        #print(post)
        #print(count, post['date'], post['rating'])
        tr = tr.find_next('tr')
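
# The loop above only marks where a post would be persisted ("send post to
# DB"). A minimal sketch with sqlite3; the table name and column set are
# assumptions for illustration, not part of the original gist.
import sqlite3

def save_post(post, conn):
    conn.execute(
        'CREATE TABLE IF NOT EXISTS posts '
        '(id INTEGER PRIMARY KEY, title TEXT, link TEXT, rating TEXT, '
        'author TEXT, section TEXT, date TEXT, comments TEXT)')
    conn.execute(
        'INSERT OR REPLACE INTO posts VALUES (?, ?, ?, ?, ?, ?, ?, ?)',
        (post['id'], post.get('title'), post.get('link'),
         str(post.get('rating')), post.get('author'), post.get('section'),
         str(post.get('date')), post.get('comments')))
    conn.commit()
# usage: save_post(post, sqlite3.connect('yap.db')) inside the loop above
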
get_page_titles()