Created
February 15, 2018 13:28
-
-
Save bmwant/6aa277eb22093f619a9c59b0dde36152 to your computer and use it in GitHub Desktop.
Get an article from Telegra.ph and transform the resulting HTML
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse
import os
import re
from html import escape
from html.parser import HTMLParser
from urllib import request
from urllib.parse import urljoin
class ArticleParser(HTMLParser):
    """Rebuild a simplified HTML fragment from an article's markup.

    Only the text of ``<p>``, ``<h3>`` and ``<blockquote>`` elements is kept
    (re-wrapped in the same tag), ``<br>`` inside such an element becomes a
    newline, and every ``<img>`` is downloaded into ``IMAGES_DIR`` and
    re-emitted as ``<figure><img src="/images/..."></figure>``.

    Feed markup with :meth:`feed` and read the result from :attr:`html`.
    """

    # Directory (relative to the working directory) where images are stored,
    # and the first path segment of the rewritten image URLs.
    IMAGES_DIR = 'images'
    # Elements whose text content is collected and re-emitted.
    _TEXT_TAGS = ('p', 'h3', 'blockquote')
    # Void elements: they are never pushed on the tag stack, so their
    # (optional, e.g. ``<img/>``) end-tag events must not pop it either.
    _VOID_TAGS = ('br', 'img')

    def __init__(self, base_url):
        """Create a parser.

        Args:
            base_url: URL against which relative image paths are resolved.
        """
        super().__init__()
        self.base_url = base_url
        self.resulting_html = ''
        # True while inside an element whose text is being collected.
        self._appending = False
        # Escaped text collected for the element currently being read.
        self._data_buf = ''
        # Currently open (non-void) tags, innermost last.
        self._tags_stack = []

    @property
    def html(self):
        """The HTML fragment produced so far."""
        return self.resulting_html

    @staticmethod
    def _wrap_in_tag(tag, data):
        """Return ``data`` (leading whitespace removed) wrapped in ``tag``."""
        return '<{tag}>{data}</{tag}>'.format(tag=tag, data=data.lstrip())

    def handle_starttag(self, tag, attrs):
        """Process an opening tag; see the class docstring for the rules."""
        if tag == 'br':
            # A line break only matters while text is being collected.
            if self._appending:
                self._data_buf += '\n'
            return

        # Download images
        if tag == 'img':
            for attr, value in attrs:
                # ``value`` is None for a bare ``src`` attribute.
                if attr == 'src' and value:
                    filename = self.download_file(value)
                    new_url_path = os.path.join('/', self.IMAGES_DIR, filename)
                    # The path ends up inside a double-quoted attribute.
                    img = '<img src="{}">'.format(escape(new_url_path, quote=True))
                    self.resulting_html += self._wrap_in_tag('figure', img)
            return

        # Section titles, main text and quotes all start text collection.
        if tag in self._TEXT_TAGS:
            self._appending = True
        self._tags_stack.append(tag)

    def download_file(self, path):
        """Download an image into ``IMAGES_DIR`` and return its file name.

        Args:
            path: image URL or path, resolved against ``base_url``.

        Returns:
            The base name of ``path``, which is also the name of the file
            written inside ``IMAGES_DIR``.
        """
        filename = os.path.basename(path)
        # urlretrieve does not create missing directories.
        os.makedirs(self.IMAGES_DIR, exist_ok=True)
        filepath = os.path.join(self.IMAGES_DIR, filename)
        url = urljoin(self.base_url, path)
        request.urlretrieve(url, filename=filepath)
        return filename

    def handle_endtag(self, tag):
        """Process a closing tag, emitting collected text for text elements.

        Raises:
            ValueError: if there is no open tag to close, or if a
                ``p``/``h3``/``blockquote`` end tag does not match the
                innermost open tag.
        """
        if tag in self._VOID_TAGS:
            return

        if not self._tags_stack:
            raise ValueError('Open/closing tags are not balanced')

        current_tag = self._tags_stack.pop()
        if tag not in self._TEXT_TAGS:
            # Inline/other tags only affect the stack; collected text is kept.
            return

        if current_tag != tag:
            raise ValueError(
                'Invalid closing tag: %s. Current on stack: %s.'
                % (tag, current_tag)
            )

        if self._data_buf:
            self.resulting_html += self._wrap_in_tag(current_tag, self._data_buf)
        self._appending = False
        self._data_buf = ''

    def handle_data(self, data):
        """Collect text while inside a text element.

        The parser hands over text with character references already
        decoded, so it is escaped again before being written back as HTML.
        """
        if self._appending:
            self._data_buf += escape(data, quote=False)
def find_tag(tag_name, html_data):
    """Return the inner markup of the first ``tag_name`` element.

    The search is a non-greedy regular-expression match, so nested elements
    of the same name are cut at the first closing tag.

    Args:
        tag_name: element name, e.g. ``'article'``.
        html_data: markup to search.

    Returns:
        Everything between the first opening ``tag_name`` tag and the
        nearest following closing tag.

    Raises:
        ValueError: if ``html_data`` contains no such element.
    """
    # \b stops 'p' from matching '<pre>'; DOTALL lets content span lines.
    exp = r'<{tag}\b[^>]*>(.*?)</{tag}>'.format(tag=re.escape(tag_name))
    m = re.search(exp, html_data, re.DOTALL)
    if m is None:
        raise ValueError('Tag <{}> was not found'.format(tag_name))
    return m.group(1)  # Match within a tag
def parse_args():
    """Read the script's command-line options.

    Returns:
        The parsed namespace; its ``url`` attribute holds the value of the
        mandatory ``--url`` option.
    """
    cli = argparse.ArgumentParser(description='Grab article from telegra.ph')
    url_option = {
        'dest': 'url',
        'required': True,
        'help': 'URL to the article',
    }
    cli.add_argument('--url', **url_option)
    return cli.parse_args()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment