woe formatter
#!/usr/bin/env python3.4
from collections import Counter
from difflib import SequenceMatcher
import sys, re, pathlib
import subprocess
import requests
from bs4 import BeautifulSoup
from bs4.element import NavigableString, Tag
import mistune

replacements = {
    '--': '—',
    '–': '—',
    'Oblast': 'Province',
    'oblast': 'Province',
    'Raion': 'District',
    'raion': 'District',
    'Krai': 'Region',
    'krai': 'region',
    'Kray': 'Region',
    'kray': 'region',
    'Mensk': 'Minsk',
    'Alyaksandr': 'Alexander',
    'Lukashenka': 'Lukashenko',
    'Luhansk': 'Lugansk',
    'Daghestan': 'Dagestan',
    'gastarbeiter': 'labor migrant'
}

def is_proper(word, words):
    if word.istitle():
        if word.casefold() in words:
            return words.count(word) > words.count(word.casefold())
        else:
            return True
    else:
        return False
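
# Quick sketch of is_proper(): a capitalised word counts as a proper noun
# unless its lower-case form is at least as frequent in the same word list.
#   is_proper('Moscow', ['Moscow', 'says', 'the'])   # True
#   is_proper('The', ['The', 'the', 'the'])          # False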

def partial_in(string: str, iterable: (str),
               clever=True, threshold=0.7) -> bool:
    if clever:
        matcher = SequenceMatcher(None, a=None, b=string)
        for i in iterable:
            matcher.set_seq1(i)
            if matcher.quick_ratio() >= threshold:
                return True
        else:
            # no element was a close enough match
            return False
    else:
        for i in iterable:
            if string in i:
                return True
        else:
            # no element contained the string
            return False
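
# Rough behaviour of partial_in() with clever=True (fuzzy matching via
# difflib.SequenceMatcher.quick_ratio against each element of the iterable):
#   partial_in('Lukashenko', ['Lukashenka'])   # True, ratio 0.9 >= 0.7
#   partial_in('Minsk', ['Moscow'])            # False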

def fetch_page(link):
    r = requests.get(link)
    return r.text

def process_string(string):
    if string.strip() == '':
        # empty strings should be ignored
        return None
    else:
        # it's actual text
        return string.strip().replace('\n', ' ')

def extract_strings(element):
    body_strings = []
    links = []
    for c in element.children:
        if isinstance(c, NavigableString):
            string = process_string(c)
            if string:
                body_strings.append(string)
        elif c.name == 'i':
            body_strings.append('*{}*'.format(c.string))
        elif c.name == 'a':
            body_strings.append('({})'.format(c['href']))
    return (body_strings, links)
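
# For a fragment like '<div>Text <i>word</i> <a href="http://example.com">link</a></div>'
# (example.com is a placeholder), extract_strings() on the div should yield
# (['Text', '*word*', '(http://example.com)'], []); note that the links list is
# returned but never actually filled here.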

def has_strings(element):
    return len(list(element.stripped_strings)) > 0

def process_content(content, process_spans=True):
    body_strings = []
    links = []
    for child in content.children:
        if child.name == 'div':
            result = extract_strings(child)
            body_strings += result[0]
            links += result[1]
            # extract text from spans
            spans = child.find_all('span')
            for span in spans:
                result = extract_strings(span)
                body_strings += result[0]
                links += result[1]
        elif child.name == 'ul':
            ul = child
            for li in ul.children:
                try:
                    spantext = process_content(li)
                    body_strings.append('* ' + spantext)
                except AttributeError:
                    pass
        elif child.name == 'span':
            body_strings += [process_string(s) for s in child.strings
                             if process_string(s)]
    body_text = '\n\n'.join(body_strings)
    return body_text

def extract_text(html, process_spans=True):
    soup = BeautifulSoup(html)
    entry_div = soup.find(class_='post hentry')
    title = entry_div.find('h3').string
    content = entry_div.find('div', class_='post-body entry-content')
    body_text = process_content(content, process_spans)
    return title, body_text
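
# extract_text() appears to expect Blogger-style markup: the post wrapped in a
# div with class 'post hentry', the headline in its first <h3>, and the body in
# a div with class 'post-body entry-content'.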

def fix_terms(text: str) -> str:
    '''fix potential wrong terms'''
    for k, v in replacements.items():
        text = re.sub(r'(?<!\w){}'.format(k), v, text)
    return text
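
# fix_terms() applies the replacements table with a negative lookbehind, so a
# term directly preceded by a word character is left alone. For example:
#   fix_terms('Luhansk Oblast')   # -> 'Lugansk Province'
#   fix_terms('the raion head')   # -> 'the District head'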

def fix_title(title, maxlen=55):
    title = fix_terms(title)
    nocaps = ('a', 'the')
    titlewords = []
    for w in title.split():
        if w in nocaps:
            titlewords.append(w.casefold())
        else:
            titlewords.append(w.capitalize())
    newtitle = ' '.join(titlewords)
    # if the title is over a given length, get rid of 'x says'
    if len(newtitle) > maxlen and ',' in newtitle:
        newtitle = newtitle.rsplit(',', 1)[0]
    return newtitle
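
# Sketch of fix_title(): every word except 'a' and 'the' is capitalised, and a
# title longer than maxlen is cut back to its last comma. For example:
#   fix_title('putin says the crisis is over')   # -> 'Putin Says the Crisis Is Over'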

def tagify(text):
    text = text.replace('\n', ' ')
    sentences = [s.lstrip() for s in text.split('.')]
    words = []
    for s in sentences:
        try:
            # drop the first word of each sentence (likely capitalised regardless)
            words += s.split(' ', 1)[1].split(' ')
        except IndexError:
            pass
    # match words or words separated with dashes
    justword = re.compile(r'\w+[\-\w+]*|\w+')
    words = [re.match(justword, w).group() for w in words
             if re.match(justword, w)]
    # get frequencies to weed out common words
    #frequencies = Counter(w.casefold() for w in words)
    #avg = statistics.mean(frequencies.values())
    #uncased_words = frequencies.keys()
    tags = []
    tag = ''
    for word in words:
        if is_proper(word, words):
            if not partial_in(word.strip(), tags):
                tag += word + ' '
        else:
            if tag != '':
                # we have to do this again
                # because sometimes parts of a tag seep through
                if not partial_in(tag.strip(), tags):
                    tags.append(tag.strip())
                tag = ''
    return tags
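
# Roughly, tagify() drops the first word of every sentence, then collects runs
# of consecutive proper nouns (per is_proper) into candidate tags, closing a tag
# when a non-proper word appears. A sketch with a plain one-sentence input:
#   tagify('The president met Vladimir Putin in Moscow yesterday.')
#   # -> ['Vladimir Putin', 'Moscow']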

def fix_article(link, text):
    dateline_regex = re.compile(r'Staunton, \w+ \d+')
    dateline = re.search(dateline_regex, text).group()
    linked_dateline = '[**{}**]({})'.format(dateline, link)
    # turn the dateline into a link
    text = re.sub(dateline_regex, linked_dateline, text)
    text = fix_terms(text)
    return text
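
# fix_article() looks for a dateline of the form 'Staunton, <Month> <day>' and
# turns it into a bold Markdown link back to the source post, e.g. (placeholder URL):
#   'Staunton, March 3' -> '[**Staunton, March 3**](http://example.com/post)'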

def write_to_file(article, extension='.txt', path=None):
    '''article is a tuple of (title, text, tags)'''
    if not path:
        path = pathlib.Path('WOEs')
    if not path.exists():
        path.mkdir(parents=True)
    title, text, tags = article
    filename = title + extension
    newfile_path = pathlib.Path(path, filename)
    with newfile_path.open('w') as newfile:
        newfile.write('tags: ' + ','.join(tags) + '\n\n')
        newfile.write(title + '\n\n')
        newfile.write(text + '\n\n')
    return newfile_path
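
# The file written by write_to_file() has this shape (sketch with made-up values):
#   tags: Vladimir Putin,Moscow
#
#   Some Title
#
#   ...body text...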

def process_article(link, markdown=False):
    try:
        page_html = fetch_page(link)
    except requests.exceptions.MissingSchema:
        link = 'http://' + link
        page_html = fetch_page(link)
    title, body_text = extract_text(page_html)
    title = fix_title(title)
    body_text = fix_article(link, body_text)
    if markdown:
        text = mistune.markdown(body_text)
    else:
        text = body_text
    tags = tagify(body_text)
    article = (title, text, tags)
    return article

arg = sys.argv[1]
if arg == 'clean':
    clean_woes()
else:
    link = arg
    article = process_article(link)
    path = pathlib.Path('/tmp', 'WOEs')
    newpath = write_to_file(article, extension='.markdown', path=path)
    subprocess.call(['open', str(newpath)])
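
# Usage sketch (the file name and URL below are placeholders, not from the gist):
#   python3.4 woe_formatter.py http://example.blogspot.com/2015/03/some-post.html
#   python3.4 woe_formatter.py clean    # calls clean_woes(), which is not defined here
# The formatted article is written to /tmp/WOEs/<Title>.markdown and opened via the
# 'open' command (macOS).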