Skip to content

Instantly share code, notes, and snippets.

@berekuk
Created April 15, 2017 16:36
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save berekuk/0ae9780538194b7c3928280ad9ebf384 to your computer and use it in GitHub Desktop.
Save berekuk/0ae9780538194b7c3928280ad9ebf384 to your computer and use it in GitHub Desktop.
html2vk
#!/usr/bin/env python3
import argparse
from bs4 import BeautifulSoup
import requests
import urllib.parse
import re
def extract_tumblr_content(soup):
    """Return the first element of *soup* that has the ``post-content`` class.

    Raises:
        Exception: if the lookup yields nothing.
    """
    post_body = soup.find(class_='post-content')
    if post_body:
        return post_body
    raise Exception("Can't find .post-content")
def italic(text):
    """Wrap *text* in a pair of doubled single quotes (italic markup)."""
    marker = "''"
    return ''.join((marker, text, marker))
def bold(text):
    """Wrap *text* in a pair of tripled single quotes (bold markup)."""
    marker = "'''"
    return ''.join((marker, text, marker))
def center(text):
    """Enclose *text* in a ``<center>`` ... ``</center>`` pair."""
    opening, closing = "<center>", "</center>"
    return ''.join((opening, text, closing))
def convert_unknown(item):
    """Return a visible ``UNKNOWN[...]`` placeholder holding ``str(item)``.

    Used as the fallback for nodes no other converter handles.
    """
    return 'UNKNOWN[{}]'.format(str(item))
def convert_link(item):
    """Convert an ``<a>`` element into ``[link|text]`` markup.

    Tumblr redirect URLs (``t.umblr.com/redirect``, http or https) are
    replaced by the target stored in their ``z`` query parameter. When the
    link text is itself wrapped in square brackets (e.g. ``[1]``), the
    brackets are moved outside the link: ``[[link|1]]``.

    Args:
        item: element exposing ``name``, ``item['href']``, ``string`` and
            ``strings``.

    Returns:
        The link in ``[link|text]`` form.

    Raises:
        Exception: if *item* is not an ``<a>`` element.
    """
    if item.name != 'a':
        raise Exception("Expected <a> tag")

    link = item['href']
    redirect_prefixes = (
        'http://t.umblr.com/redirect',
        'https://t.umblr.com/redirect',
    )
    if link.startswith(redirect_prefixes):
        targets = urllib.parse.parse_qs(urllib.parse.urlparse(link).query).get('z')
        # Keep the redirect URL itself when it carries no usable target.
        if targets:
            link = targets[0]

    text = item.string
    if text is None:
        # ``string`` is None when the anchor has several children; fall back
        # to the concatenation of all text found inside it.
        text = ''.join(item.strings)

    bracketed = text.startswith('[') and text.endswith(']')
    if bracketed:
        text = text[1:-1]

    converted = '[{link}|{text}]'.format(link=link, text=text)
    return '[' + converted + ']' if bracketed else converted
def convert_header(item):
    """Render a heading as bold plain text followed by a newline."""
    title = convert_plain(item)
    emphasized = bold(title)
    return ''.join((emphasized, '\n'))
def convert_plain(item):
    """Return ``item.string`` with every newline character removed."""
    raw_text = item.string
    without_newlines = re.sub('\n', '', raw_text)
    return without_newlines
def convert_list(children):
    """Convert every node in *children* and concatenate the results in order."""
    return ''.join(convert_item(node) for node in children)
def convert_i(item):
    """Convert the children of an ``<i>`` element and wrap them as italic."""
    inner = convert_list(item.children)
    return italic(inner)
def convert_b(item):
    """Convert the children of a ``<b>`` element and wrap them as bold."""
    inner = convert_list(item.children)
    return bold(inner)
def convert_ul(item):
    """Convert a ``<ul>`` element into ``* `` bullet lines.

    Each ``<li>`` child becomes one line; falsy children and text nodes that
    are only whitespace are ignored. The list is followed by an empty line.

    Raises:
        Exception: if any other child is not an ``<li>`` element.
    """
    pieces = []
    for node in item.children:
        if not node:
            continue
        is_blank_text = not node.name and not str(node).strip()
        if is_blank_text:
            continue
        if node.name != 'li':
            raise Exception("Expected <li> in <ul>, got {} instead".format(node))
        pieces.append('* ' + convert_list(node.children) + '\n')
    pieces.append('\n')
    return ''.join(pieces)
def convert_br(item):
    """Emit a newline followed by whatever the ``<br>`` element contains."""
    trailing = convert_list(item.children)
    return ''.join(('\n', trailing))
def convert_center(item):
    """Convert the children of a ``<center>`` element and re-wrap them centered."""
    inner = convert_list(item.children)
    return center(inner)
def convert_paragraph(item):
    """Convert a ``<p>`` element into a block followed by an empty line.

    Only two attribute sets are recognised: no attributes at all, and exactly
    ``style="text-align: center;"`` (rendered centered). Any other paragraph
    is returned as an ``UNKNOWN[...]`` placeholder and its attributes are
    reported on stderr.

    Args:
        item: element exposing ``attrs`` (a dict) and ``children``.

    Returns:
        The converted paragraph text, or the placeholder.
    """
    import sys  # local import: only needed for the diagnostic below

    if item.attrs == {}:
        return convert_list(item.children) + '\n\n'
    if item.attrs == {'style': 'text-align: center;'}:
        return center(convert_list(item.children)) + '\n\n'

    # Diagnostics go to stderr: stdout carries the converted document, and
    # mixing the attribute dump into it would corrupt the result.
    print(item.attrs, file=sys.stderr)
    return convert_unknown(item)
def convert_div(item):
    """Convert a ``<div>``; only the top-level ``post-content`` div is supported.

    A div whose attributes are exactly ``class="post-content"`` has its
    contents converted; every other div becomes an ``UNKNOWN[...]`` placeholder.
    """
    is_post_root = item.attrs == {'class': ['post-content']}
    if not is_post_root:
        return convert_unknown(item)
    # Iterating the element itself yields the nodes to convert.
    return convert_list(item)
def convert_twitter(item):
    """Convert an embedded tweet into a left-aligned blockquote with attribution.

    The tweet body is the text of the first ``<p>``. The last ``<a>`` supplies
    the tweet link (``href``) and the date (its text). The author's login is
    the first ``(@login)`` group found in the element's text.

    Args:
        item: element exposing ``find``, ``find_all`` and ``stripped_strings``.

    Returns:
        ``<left><blockquote>...</blockquote></left>`` markup with the body and
        a right-aligned line linking to the author and to the tweet.

    Raises:
        Exception: if the ``<p>``, the ``<a>`` or the ``(@login)`` is missing.
    """
    paragraph = item.find('p')
    if paragraph is None:
        raise Exception("Can't find <p> in tweet")
    anchors = item.find_all('a')
    if not anchors:
        raise Exception("Can't find <a> in tweet")
    last_anchor = anchors[-1]

    body = ''.join(paragraph.strings)
    link = last_anchor['href']
    date = last_anchor.string

    # The strings are joined in repr() form; the login is the first
    # '(@...)' group in that text.
    haystack = ''.join(repr(s) for s in item.stripped_strings)
    match = re.search(r'\(@(.*?)\)', haystack)
    if match is None:
        raise Exception("Can't find (@login) in tweet")
    login = match.group(1)

    return '<left><blockquote>{body}\n<right>[https://twitter.com/{login}|@{login}], [{link}|{date}]</right></blockquote></left>'.format(body=body, login=login, link=link, date=date)
def convert_blockquote(item):
    """Convert a ``<blockquote>`` element.

    A quote without attributes is converted recursively, stripped of leading
    and trailing newlines, and re-wrapped in ``<blockquote>`` followed by a
    newline. A quote whose classes include ``twitter-tweet`` is handled as an
    embedded tweet. Anything else becomes an ``UNKNOWN[...]`` placeholder.
    """
    attrs = item.attrs
    if attrs == {}:
        quoted = convert_list(item.children).strip('\n')
        return ''.join(('<blockquote>', quoted, '</blockquote>\n'))
    if 'twitter-tweet' in attrs.get('class', []):
        return convert_twitter(item)
    return convert_unknown(item)
def convert_item(item):
    """Convert a single node by dispatching on its ``name``.

    Falsy nodes and ``<script>`` elements produce an empty string, nodes
    without a name are treated as plain text, and element names without a
    dedicated converter fall back to the ``UNKNOWN[...]`` placeholder.
    """
    if not item:
        return ''

    tag = item.name
    if not tag:
        return convert_plain(item)
    if tag == 'script':
        return ''

    # Element name -> converter for every supported element.
    handlers = {
        'h1': convert_header,
        'a': convert_link,
        'i': convert_i,
        'b': convert_b,
        'ul': convert_ul,
        'p': convert_paragraph,
        'br': convert_br,
        'center': convert_center,
        'div': convert_div,
        'blockquote': convert_blockquote,
    }
    handler = handlers.get(tag, convert_unknown)
    return handler(item)
def get_next_part(part):
    """Return the Roman numeral that follows *part*, or None for the last one.

    Supported parts are 'I' through 'X'.

    Raises:
        Exception: if *part* is not one of the supported numerals.
    """
    parts = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X']
    if part not in parts:
        raise Exception('Unknown part {}'.format(part))
    successors = dict(zip(parts, parts[1:]))
    # 'X' has no successor, so the lookup yields None for it.
    return successors.get(part)


def extract_ssc_part(text, part):
    """Return the section of *text* that starts at the bold heading for *part*.

    A heading looks like ``'''II.'''``. The section runs from the heading of
    *part* up to, but not including, the heading of the following part, or to
    the end of the text when there is no such heading.

    Args:
        text: converted text to search.
        part: Roman numeral of the wanted part ('I' through 'X').

    Returns:
        The matched section, including its own heading.

    Raises:
        Exception: if *part* is unknown or its heading is not found.
    """
    next_part = get_next_part(part)
    # Both numerals come from the fixed list in get_next_part, so they contain
    # no regex metacharacters and need no escaping.
    terminators = ['$']
    if next_part is not None:
        terminators.append(r"'''{}\.'''".format(next_part))
    regex = r"('''{}\.'''.*?)(?:{})".format(part, '|'.join(terminators))
    match = re.search(regex, text, flags=re.DOTALL)
    if not match:
        raise Exception("Part {} not found".format(part))
    return match.group(1)
def main():
    """Convert a Tumblr post to wiki-style markup and print it to stdout.

    Command-line options:
        --url   fetch the post HTML over HTTP
        --file  read the post HTML from a local file
        --part  print only the given part (Roman numeral) of the result

    Exactly one of ``--url`` and ``--file`` must be given.

    Raises:
        Exception: if neither or both of ``--url`` and ``--file`` are given,
            or if the post content or the requested part cannot be found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('--url')
    parser.add_argument('--file')
    parser.add_argument('--part')
    args = parser.parse_args()

    if not args.url and not args.file:
        raise Exception("One of --url or --file should be specified")
    if args.url and args.file:
        raise Exception("Only one of --url or --file should be specified")

    if args.url:
        response = requests.get(args.url)
        # Fail on HTTP errors instead of trying to parse an error page.
        response.raise_for_status()
        text = response.text
    else:
        with open(args.file) as fh:
            text = fh.read()

    soup = BeautifulSoup(text, 'html.parser')
    post = extract_tumblr_content(soup)
    result = convert_item(post)
    if args.part:
        result = extract_ssc_part(result, args.part)
    print(result)


# Run the conversion only when executed as a script, not when imported.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment