Last active
August 29, 2015 14:16
-
-
Save anna-hope/b181afd315719aca9f06 to your computer and use it in GitHub Desktop.
woe formatter
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3.4 | |
from collections import Counter | |
from difflib import SequenceMatcher | |
import sys, re, pathlib | |
import subprocess | |
import requests | |
from bs4 import BeautifulSoup | |
from bs4.element import NavigableString, Tag | |
import mistune | |
# Term-substitution table used by fix_terms(): maps discouraged spellings,
# transliterations, and Russian administrative terms to the preferred
# English forms.  Dashes are normalized to an em dash.
# NOTE: insertion order is the order fix_terms() applies substitutions.
replacements = {
    '--': '—',
    '–': '—',
    'Oblast': 'Province',
    'oblast': 'Province',
    'Raion': 'District',
    'raion': 'District',
    'Krai': 'Region',
    'krai': 'region',
    'Kray': 'Region',
    'kray': 'region',
    'Mensk': 'Minsk',
    'Alyaksandr': 'Alexander',
    'Lukashenka': 'Lukashenko',
    'Luhansk': 'Lugansk',
    'Daghestan': 'Dagestan',
    'gastarbeiter': 'labor migrant'
}
def is_proper(word, words):
    """Heuristically decide whether *word* is a proper noun.

    A title-cased word counts as proper unless its case-folded form also
    occurs in *words* at least as often as the title-cased form (which
    suggests the capitalization was positional, not semantic).
    """
    if not word.istitle():
        return False
    folded = word.casefold()
    if folded not in words:
        return True
    return words.count(word) > words.count(folded)
def partial_in(string: str, iterable,
               clever=True, threshold=0.7) -> bool:
    """Return True if *string* occurs (possibly fuzzily) in *iterable*.

    With clever=True a difflib.SequenceMatcher comparison is used and an
    element matches when quick_ratio() >= threshold; otherwise a plain
    substring test is used.

    Bug fix: the original attached ``else: return False`` to the ``if``
    inside each loop, so only the FIRST element of *iterable* was ever
    examined.  Now every element is checked before returning False.
    """
    if clever:
        matcher = SequenceMatcher(None, a=None, b=string)
        for candidate in iterable:
            matcher.set_seq1(candidate)
            if matcher.quick_ratio() >= threshold:
                return True
        return False
    for candidate in iterable:
        if string in candidate:
            return True
    return False
def fetch_page(link):
    """Download *link* over HTTP and return the response body as text."""
    return requests.get(link).text
def process_string(string):
    """Normalize one chunk of page text.

    Whitespace-only input yields None (callers drop those); otherwise the
    stripped text is returned with interior newlines collapsed to spaces.
    """
    stripped = string.strip()
    if not stripped:
        # empty strings should be ignored
        return None
    return stripped.replace('\n', ' ')
def extract_strings(element):
    """Collect markdown-ish text fragments from an element's direct children.

    Plain strings are normalized via process_string, <i> tags become
    *italic* markdown, and <a> tags contribute their href in parentheses.
    Returns (fragments, links); note that `links` is never populated here —
    it is kept so callers that unpack two values keep working.
    """
    fragments = []
    links = []
    for child in element.children:
        if isinstance(child, NavigableString):
            cleaned = process_string(child)
            if cleaned:
                fragments.append(cleaned)
        elif child.name == 'i':
            fragments.append('*{}*'.format(child.string))
        elif child.name == 'a':
            fragments.append('({})'.format(child['href']))
    return (fragments, links)
def has_strings(element):
    """True when *element* yields at least one non-whitespace string."""
    return any(True for _ in element.stripped_strings)
def process_content(content, process_spans=True):
    """Flatten a post's content element into markdown-ish body text.

    Walks the direct children of *content*: <div>s contribute their own
    strings plus those of any nested <span>s, <ul>s become '* ' bullet
    lines (recursing into each <li>), and bare <span>s contribute their
    normalized strings.  Fragments are joined with blank lines.

    NOTE(review): *process_spans* is accepted but never consulted below —
    spans are always processed; confirm whether that is intended.
    """
    body_strings = []
    links = []
    for child in content.children:
        if child.name == 'div':
            result = extract_strings(child)
            body_strings += result[0]
            links += result[1]
            # extract text from spans
            spans = child.find_all('span')
            for span in spans:
                result = extract_strings(span)
                body_strings += result[0]
                links += result[1]
        elif child.name == 'ul':
            ul = child
            for li in ul.children:
                try:
                    # recurse per <li>; a NavigableString child has no
                    # .children attribute, raising AttributeError, which
                    # is how non-tag list items get skipped
                    spantext = process_content(li)
                    body_strings.append('* ' + spantext)
                except AttributeError:
                    pass
        elif child.name == 'span':
            body_strings += [process_string(s) for s in child.strings
                             if process_string(s)]
    body_text = '\n\n'.join(body_strings)
    return body_text
def extract_text(html, process_spans=True):
    """Parse a blog-post page and return (title, body_text).

    Expects Blogger-style markup: a 'post hentry' container holding an
    <h3> title and a 'post-body entry-content' div with the article.
    Raises AttributeError when those elements are absent.
    """
    # Name the parser explicitly: bare BeautifulSoup(html) guesses from
    # whatever is installed, which emits a warning and can vary between
    # machines; html.parser is always available in the stdlib.
    soup = BeautifulSoup(html, 'html.parser')
    entry_div = soup.find(class_='post hentry')
    title = entry_div.find('h3').string
    content = entry_div.find('div', class_='post-body entry-content')
    body_text = process_content(content, process_spans)
    return title, body_text
def fix_terms(text: str, mapping=None) -> str:
    """Replace discouraged terms and spellings with the preferred forms.

    Each key is replaced only when not directly preceded by a word
    character (so a key embedded inside a longer word is left alone).

    *mapping* defaults to the module-level ``replacements`` table; an
    explicit dict may be passed instead (backward-compatible extension).
    """
    if mapping is None:
        mapping = replacements
    for term, preferred in mapping.items():
        # re.escape guards against keys containing regex metacharacters;
        # the original interpolated keys into the pattern verbatim.
        text = re.sub(r'(?<!\w)' + re.escape(term), preferred, text)
    return text
def fix_title(title, maxlen=55):
    """Normalize an article title: fix terms, title-case it, shorten it.

    Words in `nocaps` are forced lower-case; when the result exceeds
    *maxlen* and contains a comma, the last comma-clause (typically an
    'x says' attribution) is dropped.
    """
    title = fix_terms(title)
    nocaps = ('a', 'the')
    titlewords = []
    for w in title.split():
        if w in nocaps:
            titlewords.append(w.casefold())
        else:
            # Upper-case only the first letter: str.capitalize() also
            # lower-cases the rest of the word, mangling acronyms and
            # mixed-case names ('USA' -> 'Usa', 'McFaul' -> 'Mcfaul').
            titlewords.append(w[:1].upper() + w[1:])
    newtitle = ' '.join(titlewords)
    # if the title is over a given length, get rid of 'x says'
    if len(newtitle) > maxlen and ',' in newtitle:
        newtitle = newtitle.rsplit(',', 1)[0]
    return newtitle
def tagify(text):
    """Extract candidate tags (runs of consecutive proper nouns) from text.

    Splits into sentences, discards each sentence's first word (its
    capitalization is positional, not semantic), then groups adjacent
    proper nouns into multi-word tags, de-duplicating fuzzily with
    partial_in().
    """
    # Bug fix: str.replace returns a new string; the original discarded
    # the result, so newlines were never actually collapsed.
    text = text.replace('\n', ' ')
    sentences = [s.lstrip() for s in text.split('.')]
    words = []
    for s in sentences:
        try:
            words += s.split(' ', 1)[1].split(' ')
        except IndexError:
            # single-word (or empty) sentence: nothing after the first word
            pass
    # match words or words separated with dashes
    justword = re.compile(r'\w+[\-\w+]*|\w+')
    # match each token once instead of twice as the original did
    words = [m.group() for m in map(justword.match, words) if m]
    tags = []
    tag = ''
    for word in words:
        if is_proper(word, words):
            if not partial_in(word.strip(), tags):
                tag += word + ' '
        else:
            if tag != '':
                # re-check before flushing because parts of a tag
                # sometimes seep through
                if not partial_in(tag.strip(), tags):
                    tags.append(tag.strip())
                tag = ''
    # Bug fix: flush a trailing tag when the text ends on a proper noun;
    # the original silently dropped it.
    if tag != '' and not partial_in(tag.strip(), tags):
        tags.append(tag.strip())
    return tags
def fix_article(link, text):
    """Clean up article text: link the Staunton dateline and fix terms.

    Any 'Staunton, <Month> <day>' dateline is rewritten as a bold
    markdown link pointing back at *link*.
    """
    dateline_regex = re.compile(r'Staunton, \w+ \d+')
    match = dateline_regex.search(text)
    # Robustness fix: the original called .group() on the search result
    # unconditionally and crashed with AttributeError when no dateline
    # was present; now the text is simply term-fixed as-is.
    if match:
        linked_dateline = '[**{}**]({})'.format(match.group(), link)
        # turn the dateline into a link
        text = dateline_regex.sub(linked_dateline, text)
    text = fix_terms(text)
    return text
def write_to_file(article, extension='.txt', path=None):
    """Write an article to <path>/<title><extension> and return the path.

    *article* is a tuple of (title, text, tags).  The target directory
    (default ./WOEs) is created if it does not exist.
    """
    if not path:
        path = pathlib.Path('WOEs')
    if not path.exists():
        path.mkdir(parents=True)
    title, text, tags = article
    filename = title + extension
    newfile_path = pathlib.Path(path, filename)
    # Explicit UTF-8: the text regularly contains em dashes and
    # non-ASCII names, which would crash under a narrow locale default.
    with newfile_path.open('w', encoding='utf-8') as newfile:
        newfile.write('tags: ' + ','.join(tags) + '\n\n')
        newfile.write(title + '\n\n')
        newfile.write(text + '\n\n')
    return newfile_path
def process_article(link, markdown=False):
    """Fetch an article by URL and return a (title, text, tags) tuple.

    A schema-less link is retried with an 'http://' prefix.  When
    *markdown* is true the body is rendered to HTML via mistune;
    tags are always derived from the raw (unrendered) body.
    """
    try:
        page_html = fetch_page(link)
    except requests.exceptions.MissingSchema:
        link = 'http://' + link
        page_html = fetch_page(link)
    title, body_text = extract_text(page_html)
    title = fix_title(title)
    body_text = fix_article(link, body_text)
    text = mistune.markdown(body_text) if markdown else body_text
    return (title, text, tagify(body_text))
def _main():
    """Command-line entry point: pass 'clean' or an article URL."""
    # Robustness fix: the original indexed sys.argv[1] unconditionally
    # and crashed with IndexError when run without arguments.
    if len(sys.argv) < 2:
        sys.exit('usage: woe_formatter <link> | clean')
    arg = sys.argv[1]
    if arg == 'clean':
        # NOTE(review): clean_woes is not defined anywhere in this file —
        # confirm it exists elsewhere before relying on this branch.
        clean_woes()
    else:
        article = process_article(arg)
        path = pathlib.Path('/tmp', 'WOEs')
        newpath = write_to_file(article, extension='.markdown', path=path)
        # macOS 'open'; list form avoids shell interpretation of the path
        subprocess.call(['open', str(newpath)])


# Guard so importing this module (e.g. for testing) has no side effects.
if __name__ == '__main__':
    _main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment