Skip to content

Instantly share code, notes, and snippets.

@bmwant
Created February 15, 2018 13:28
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save bmwant/6aa277eb22093f619a9c59b0dde36152 to your computer and use it in GitHub Desktop.
Save bmwant/6aa277eb22093f619a9c59b0dde36152 to your computer and use it in GitHub Desktop.
Get an article from Telegra.ph and transform the resulting HTML
import argparse
import os
import posixpath
import re
from html import escape
from html.parser import HTMLParser
from urllib import request
from urllib.parse import urljoin
class ArticleParser(HTMLParser):
    """Rebuild a simplified HTML document from an article's markup.

    Only ``<h3>``, ``<p>`` and ``<blockquote>`` elements are kept.  The text
    inside each of them is collected (inline tags such as ``<b>`` are dropped,
    ``<br>`` becomes a newline) and re-emitted wrapped in the same tag.  Every
    ``<img>`` is downloaded into ``IMAGES_DIR`` and re-emitted as
    ``<figure><img src="/IMAGES_DIR/name"></figure>``.

    Feed markup with ``feed()`` and read the result from the ``html``
    property.

    NOTE(review): block tags are not tracked per nesting level; a ``<p>``
    nested inside a ``<blockquote>`` is flushed as ``<p>`` when the inner tag
    closes, and the outer ``<blockquote>`` then emits nothing.
    """

    # Directory downloaded images are written to; also used as the URL prefix
    # of the rewritten ``src`` attributes.
    IMAGES_DIR = 'images'

    # Tags whose text content is collected and re-emitted.
    _BLOCK_TAGS = ('p', 'h3', 'blockquote')

    # Elements that never have a closing tag.  They must not be pushed on the
    # tag stack, and the synthetic end tag produced for the self-closing form
    # (``<img/>``) must not pop anything from it.
    _VOID_TAGS = frozenset((
        'area', 'base', 'br', 'col', 'embed', 'hr', 'img', 'input',
        'link', 'meta', 'source', 'track', 'wbr',
    ))

    def __init__(self, base_url):
        """Create a parser.

        Args:
            base_url: URL that relative image paths are resolved against.
        """
        super().__init__()
        self.base_url = base_url
        self.resulting_html = ''
        # True while inside one of ``_BLOCK_TAGS``.
        self._appending = False
        # Escaped text collected for the block currently being read.
        self._data_buf = ''
        # Currently open (non-void) tags, innermost last.
        self._tags_stack = []

    @property
    def html(self):
        """Return the HTML produced so far."""
        return self.resulting_html

    @staticmethod
    def _wrap_in_tag(tag, data):
        """Return *data*, stripped of leading whitespace, wrapped in *tag*."""
        return '<{tag}>{data}</{tag}>'.format(tag=tag, data=data.lstrip())

    def handle_starttag(self, tag, attrs):
        """Process an opening tag: track it and react to br/img/block tags."""
        if tag == 'br':
            # A line break only matters inside a block being collected.
            if self._appending:
                self._data_buf += '\n'
            return

        if tag == 'img':
            for attr, value in attrs:
                # ``value`` is None for a valueless attribute (``<img src>``).
                if attr == 'src' and value:
                    filename = self.download_file(value)
                    # Build a URL path, not a filesystem path: always use
                    # forward slashes regardless of the host OS.
                    new_url_path = posixpath.join(
                        '/', self.IMAGES_DIR, filename)
                    image = '<img src="{}">'.format(
                        escape(new_url_path, quote=True))
                    self.resulting_html += self._wrap_in_tag('figure', image)
            return

        if tag in self._VOID_TAGS:
            # No closing tag will follow, so keep it off the stack.
            return

        if tag in self._BLOCK_TAGS:
            self._appending = True

        self._tags_stack.append(tag)

    def download_file(self, path):
        """Download an image and return the file name it was saved under.

        Args:
            path: Image location, absolute or relative to ``base_url``.

        Returns:
            The base name of *path*; the file itself is written to
            ``IMAGES_DIR/<base name>``.
        """
        filename = os.path.basename(path)
        # urlretrieve does not create missing directories.
        os.makedirs(self.IMAGES_DIR, exist_ok=True)
        filepath = os.path.join(self.IMAGES_DIR, filename)
        url = urljoin(self.base_url, path)
        request.urlretrieve(url, filename=filepath)
        return filename

    def handle_endtag(self, tag):
        """Process a closing tag and flush the buffer when a block ends.

        Raises:
            ValueError: If the closing tag has no matching opening tag, or a
                block tag is closed while another tag is still open.
        """
        if tag in self._VOID_TAGS:
            # Reached for ``</br>`` or for self-closing forms such as
            # ``<img/>``; nothing was pushed for them.
            return

        if not self._tags_stack:
            raise ValueError('Open/closing tags are not balanced')

        current_tag = self._tags_stack.pop()
        if tag not in self._BLOCK_TAGS:
            return

        if current_tag != tag:
            raise ValueError(
                'Invalid closing tag: {}. Current on stack: {}.'.format(
                    tag, current_tag))

        if self._data_buf:
            self.resulting_html += self._wrap_in_tag(tag, self._data_buf)
        self._appending = False
        self._data_buf = ''

    def handle_data(self, data):
        """Collect text that appears inside a block tag."""
        if self._appending:
            # The parser hands over text with character references already
            # decoded; re-escape it so the output stays valid HTML.
            self._data_buf += escape(data, quote=False)
def find_tag(tag_name, html_data):
    """Return the inner HTML of the first ``<tag_name>`` element.

    The match is non-greedy, so the content ends at the first closing
    ``</tag_name>``; nested elements of the same name are not balanced.

    Args:
        tag_name: Element name to look for, e.g. ``'article'``.
        html_data: HTML text to search.

    Returns:
        Everything between the opening and the closing tag.

    Raises:
        ValueError: If *html_data* contains no such element.
    """
    # ``(?:\s[^>]*)?`` allows attributes but stops ``<p`` from matching
    # ``<pre>``; DOTALL lets the content span several lines.
    pattern = r'<{tag}(?:\s[^>]*)?>(.*?)</{tag}>'.format(
        tag=re.escape(tag_name))
    match = re.search(pattern, html_data, flags=re.DOTALL)
    if match is None:
        raise ValueError('Tag <{}> was not found'.format(tag_name))
    return match.group(1)
def parse_args(argv=None):
    """Parse the command-line options of the script.

    Args:
        argv: Argument list to parse.  Defaults to ``None``, in which case
            argparse reads ``sys.argv[1:]``.

    Returns:
        The parsed namespace; ``url`` holds the required ``--url`` value.
    """
    parser = argparse.ArgumentParser(
        description='Grab article from telegra.ph')
    parser.add_argument('--url', dest='url', required=True,
                        help='URL to the article')
    return parser.parse_args(argv)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment