Created
April 15, 2017 16:36
-
-
Save berekuk/0ae9780538194b7c3928280ad9ebf384 to your computer and use it in GitHub Desktop.
html2vk
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3 | |
import argparse | |
from bs4 import BeautifulSoup | |
import requests | |
import urllib.parse | |
import re | |
def extract_tumblr_content(soup):
    """Locate and return the post body element (class 'post-content').

    Raises Exception when the page has no such element.
    """
    content = soup.find(class_='post-content')
    if not content:
        raise Exception("Can't find .post-content")
    return content
def italic(text):
    """Wrap *text* in wiki italic markup (doubled single quotes)."""
    marker = "''"
    return marker + text + marker
def bold(text):
    """Wrap *text* in wiki bold markup (tripled single quotes)."""
    marker = "'''"
    return marker + text + marker
def center(text):
    """Wrap *text* in an HTML <center> tag pair."""
    return '<center>' + text + '</center>'
def convert_unknown(item):
    """Fallback renderer: emit the raw node wrapped in an UNKNOWN[...] marker."""
    return 'UNKNOWN[{}]'.format(str(item))
def convert_link(item):
    """Convert an <a> tag into wiki link markup '[url|text]'.

    Unwraps tumblr's t.umblr.com redirect URLs to their real destination.
    Link text already wrapped in [brackets] keeps the brackets outside
    the generated link.

    Raises:
        Exception: if *item* is not an <a> tag.
    """
    if item.name != 'a':
        raise Exception("Expected <a> tag")
    link = item['href']
    if link.startswith('http://t.umblr.com/redirect'):
        # The real destination is carried in the 'z' query parameter.
        link = urllib.parse.parse_qs(urllib.parse.urlparse(link).query)['z'][0]
    # Bug fix: item.string is None when the <a> contains nested tags
    # (e.g. <a><b>text</b></a>), which crashed on .startswith below.
    # Fall back to the concatenated descendant text in that case.
    text = item.string
    if text is None:
        text = item.get_text()
    wrap = lambda x: x
    if text.startswith('[') and text.endswith(']'):
        text = text[1:-1]
        wrap = lambda x: '[' + x + ']'
    return wrap('[{link}|{text}]'.format(link=link, text=text))
def convert_header(item):
    """Render an <h1> element as a bold line terminated by a newline."""
    return '{}\n'.format(bold(convert_plain(item)))
def convert_plain(item):
    """Return the node's text content with all embedded newlines removed."""
    return item.string.replace('\n', '')
def convert_list(children):
    """Convert every child node and concatenate the resulting markup."""
    return ''.join(convert_item(child) for child in children)
def convert_i(item):
    """Render an <i> element's contents as wiki italics."""
    inner = convert_list(item.children)
    return italic(inner)
def convert_b(item):
    """Render a <b> element's contents as wiki bold."""
    inner = convert_list(item.children)
    return bold(inner)
def convert_ul(item):
    """Render a <ul> element as wiki bullet lines ('* ...'), one per <li>.

    Whitespace-only text nodes between the <li> tags are skipped; any other
    non-<li> child is an error.
    """
    lines = []
    for child in item.children:
        if not child:
            continue
        # Skip inter-tag whitespace text nodes.
        if not child.name and not str(child).strip():
            continue
        if child.name != 'li':
            raise Exception("Expected <li> in <ul>, got {} instead".format(child))
        lines.append('* ' + convert_list(child.children) + '\n')
    lines.append('\n')
    return ''.join(lines)
def convert_br(item):
    """A <br> becomes a newline, followed by whatever children it carries."""
    tail = convert_list(item.children)
    return '\n' + tail
def convert_center(item):
    """Render a <center> element's contents wrapped in <center> markup."""
    inner = convert_list(item.children)
    return center(inner)
def convert_paragraph(item):
    """Convert a <p> element.

    Attribute-free paragraphs end with a blank line; center-styled ones are
    additionally wrapped in <center>. Anything else is reported as unknown
    (its attrs are printed for debugging).
    """
    attrs = item.attrs
    if attrs == {}:
        return convert_list(item.children) + '\n\n'
    if attrs == {'style': 'text-align: center;'}:
        return center(convert_list(item.children)) + '\n\n'
    print(attrs)
    return convert_unknown(item)
def convert_div(item):
    """Handle <div>: only the top-level .post-content wrapper is understood."""
    if item.attrs == {'class': ['post-content']}:
        # Top-level wrapper: iterating the tag itself yields its children.
        return convert_list(item)
    return convert_unknown(item)
def convert_twitter(item):
    """Convert an embedded twitter-tweet <blockquote> into wiki markup.

    Extracts the tweet body from the first <p>, the permalink URL and date
    from the last <a>, and the @login from the "(@name)" attribution text.
    """
    body = ''.join([s for s in item.find('p').strings])
    link = item.find_all('a')[-1]['href']
    date = item.find_all('a')[-1].string
    # Bug fix: '\(' in a non-raw string is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on modern Python) — use a raw string.
    # NOTE(review): repr() leaves quote characters around each fragment; the
    # "(@login)" pattern still matches inside them — confirm this is intended.
    login = re.search(r'\(@(.*?)\)', ''.join([repr(s) for s in item.stripped_strings])).group(1)
    return '<left><blockquote>{body}\n<right>[https://twitter.com/{login}|@{login}], [{link}|{date}]</right></blockquote></left>'.format(body=body, login=login, link=link, date=date)
def convert_blockquote(item):
    """Convert <blockquote>: plain quotes and embedded tweets are supported."""
    if item.attrs == {}:
        body = convert_list(item.children).strip('\n')
        return '<blockquote>{}</blockquote>\n'.format(body)
    if 'twitter-tweet' in item.attrs.get('class', []):
        return convert_twitter(item)
    return convert_unknown(item)
def convert_item(item):
    """Dispatch one parsed node to the matching converter by tag name.

    Falsy nodes render as '', bare text nodes go through convert_plain,
    <script> tags are dropped, and unrecognized tags fall back to
    convert_unknown.
    """
    if not item:
        return ''
    if not item.name:
        # Text node (no tag name).
        return convert_plain(item)
    if item.name == 'script':
        return ''
    handlers = {
        'h1': convert_header,
        'a': convert_link,
        'i': convert_i,
        'b': convert_b,
        'ul': convert_ul,
        'p': convert_paragraph,
        'br': convert_br,
        'center': convert_center,
        'div': convert_div,
        'blockquote': convert_blockquote,
    }
    handler = handlers.get(item.name, convert_unknown)
    return handler(item)
def get_next_part(part):
    """Return the roman numeral that follows *part* (e.g. 'I' -> 'II').

    Raises:
        Exception: if *part* is not a known numeral, or if it is the last
            one ('X') — the original surfaced that case as an unhelpful
            bare KeyError from a successor dict rebuilt on every call.
    """
    parts = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X']
    if part not in parts:
        raise Exception('Unknown part ' + part)
    index = parts.index(part)
    if index == len(parts) - 1:
        raise Exception('No part after ' + part)
    return parts[index + 1]
def extract_ssc_part(text, part):
    """Extract the section of *text* belonging to roman-numeral *part*.

    The section runs from the bold "'''<part>.'''" heading up to either the
    next part's heading or the end of the text.

    Raises:
        Exception: if the part heading is not present in *text*.
    """
    next_part = get_next_part(part)
    # Bug fix: '\.' in a non-raw string is an invalid escape sequence
    # (DeprecationWarning, SyntaxWarning on modern Python) — use a raw string.
    regex = r"('''{}\.'''.*?)(?:$|'''{}\.''')".format(part, next_part)
    match = re.search(regex, text, flags=re.DOTALL)
    if not match:
        raise Exception("Part {} not found".format(part))
    return match.group(1)
def main():
    """Fetch a tumblr post (from a URL or a local file), convert it to wiki
    markup, optionally cut out one SSC part, and print the result."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--url')
    parser.add_argument('--file')
    parser.add_argument('--part')
    args = parser.parse_args()
    if not args.url and not args.file:
        raise Exception("One of --url or --file should be specified")
    # Bug fix: this check lived in an unreachable `else` branch (the
    # neither-given case already raised above), so passing both --url and
    # --file silently ignored --file. Check it explicitly.
    if args.url and args.file:
        raise Exception("Only one of --url or --file should be specified")
    if args.url:
        text = requests.get(args.url).text
    else:
        with open(args.file) as fh:
            text = fh.read()
    soup = BeautifulSoup(text, 'html.parser')
    post = extract_tumblr_content(soup)
    result = convert_item(post)
    if args.part:
        result = extract_ssc_part(result, args.part)
    print(result)
main() |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment