Last active
April 30, 2017 08:22
-
-
Save kgullikson88/832a15a2205b4fa73559 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function

import logging
import os
import re
import sys
import time

import numpy as np
from bs4 import BeautifulSoup
from pymarkovchain import MarkovChain

try:
    from urllib.request import urlopen
    from urllib.error import HTTPError
except ImportError:
    from urllib2 import urlopen, HTTPError
class FakeEntry(object):
    """
    Lightweight container for a generated (title, abstract) pair that
    renders itself as readable text: title, blank line, abstract.
    """

    def __init__(self, title, abstract):
        # Stored verbatim; surrounding whitespace is trimmed at display time.
        self.title = title
        self.abstract = abstract

    def __str__(self):
        return '{:s}\n\n{:s}'.format(self.title.strip(), self.abstract.strip())

    def __repr__(self):
        return str(self)
class VixRa(object):
    """
    Parse vixra.org listing pages, collect paper titles and abstracts, and
    train markov chains on the collected text to generate fake entries.
    """

    def __init__(self, filename=None, url=None, markov_db='./MCDB'):
        """
        Initialize the class.

        Parameters:
        ============
        - filename: The filename of a local html file. VixRa will read in the
                    file and parse it, finding all of the titles and abstracts.
        - url: The url for a vixra page. This is generally more useful than filename.
        - markov_db: The base name for the markov chain database. This argument gets
                     passed to pymarkovchain.MarkovChain.

        Returns:
        ========
        None
        """
        self.titles = []
        self.abstracts = []
        self.title_markov = MarkovChain('{}-titles'.format(markov_db))
        self.abstract_markov = MarkovChain('{}-abstracts'.format(markov_db))
        # Stop words ignored when measuring title/abstract overlap.
        # (Set before any parsing so helpers can rely on it.)
        self._articles = ['the', 'a', 'an', 'of', 'and', 'in', 'on', 'with', 'for', 'is', 'as']
        if filename is not None:
            self.parse_html_file(filename)
        if url is not None:
            self.parse_url(url)

    def parse_url(self, url, update_markov=True):
        """
        Parse the html from a url, and append the titles and abstracts to the
        instance variable lists.  If update_markov is True, update the markov
        chain databases to reflect the new text.
        """
        try:
            # BUG FIX: the original ignored `url` and always fetched
            # http://vixra.org/astro/ regardless of the argument.
            text = urlopen(url).read()
        except HTTPError:
            logging.warning('URL does not exist: \n{}'.format(url))
            return
        self._parse_text(text, update_markov=update_markov)

    def parse_html_file(self, filename, update_markov=True):
        """
        Parse an html file, and append the titles and abstracts to the
        instance variable lists.  If update_markov is True, update the markov
        chain databases to reflect the new text.
        """
        with open(filename) as f:
            text = f.read()
        self._parse_text(text, update_markov=update_markov)

    def _parse_text(self, lines, update_markov=True):
        """Extract every (title, abstract) entry from raw html text."""
        # Use beautifulsoup to strip the markup and keep only the text.
        soup = BeautifulSoup(lines)
        text = soup.get_text()
        # Each paper entry starts with '[N] viXra:'; consecutive start
        # indices bracket one entry.
        entry_indices = self._find_abstract_starts(text)
        for start, end in zip(entry_indices[:-1], entry_indices[1:]):
            title, abstract = self._parse_entry(text[start:end])
            self.titles.append(title)
            self.abstracts.append(abstract)
        if update_markov:
            self._train_markov()

    def _get_wordlist(self, string):
        """
        Get a word-list out of string, removing parentheses and quotations
        where necessary.  Words are lower-cased.
        """
        words = []
        for w in string.split():
            w = w.strip('()"')
            # Keep apostrophes that are part of contractions/possessives
            # (e.g. "don't", "earth's"); strip quoting apostrophes.
            if "'" in w and "n't" not in w and "h's" not in w:
                w = w.strip("'")
            words.append(w.strip().lower())
        return words

    def _get_matchiness(self, title, abstract):
        """
        Return a measure of how much the title and abstract match.
        Measured as the average occurrence rate, within the abstract, of the
        title words (excluding articles such as 'the', 'an', ...).
        Abstracts shorter than 50 words score 0.0.
        """
        title_words = self._get_wordlist(title)
        # Remove definite and indefinite articles from the title.
        title_words = [w for w in title_words if w not in self._articles]
        abstract_words = self._get_wordlist(abstract)
        # Basically the original abstract, without parentheses or quotes.
        abstract_str = ' '.join(abstract_words)
        n_words = len(abstract_words)
        if n_words < 50:
            return 0.0
        # BUG FIX: float() guards against integer division under Python 2,
        # where the rate silently truncated to 0.
        # NOTE: str.count matches substrings, so short title words can be
        # over-counted inside longer abstract words.
        occurence_rate = {w: abstract_str.count(w) / float(n_words) for w in title_words}
        # Average occurrence rate over all title words.
        return np.mean(list(occurence_rate.values())) if len(occurence_rate) > 0 else 0.0

    def generate_entry(self, get_match=True, N_trials=10):
        """
        Generate a fake title and abstract.

        If get_match is True, sample N_trials candidates and return the pair
        whose title best matches its abstract; otherwise return a single
        sample.
        """
        if not get_match:
            N_trials = 1
        best_title = best_abstract = None
        # BUG FIX: start below any possible score so the first trial always
        # binds best_title/best_abstract (the original raised
        # UnboundLocalError when every trial scored 0.0).
        best_matchiness = -1.0
        for _ in range(N_trials):
            title = self.title_markov.generateString()
            abstract = self.abstract_markov.generateString()
            # Remove the '----' separator used to join training abstracts.
            idx = abstract.find('----')
            abstract = abstract[idx + 4:]
            matchiness = self._get_matchiness(title, abstract)
            if matchiness > best_matchiness:
                best_title = title
                best_abstract = abstract
                best_matchiness = matchiness
        return FakeEntry(best_title, best_abstract)

    def _train_markov(self):
        """(Re)build both markov databases from the collected text."""
        if len(self.titles) < 1 or len(self.abstracts) < 1:
            logging.warning('You must give html text before training the markov sampler!')
            return
        title_combined = '. '.join(self.titles)
        # '----' acts as the sentence separator between abstracts.
        abstract_combined = '----'.join(self.abstracts)
        self.title_markov.generateDatabase(title_combined)
        self.abstract_markov.generateDatabase(abstract_combined, sentenceSep='----')

    def _find_abstract_starts(self, text, pattern=r'\[[0-9]+\] viXra:'):
        """Return the start index of every '[N] viXra:' entry marker in text."""
        return [m.start() for m in re.finditer(pattern, text)]

    def _parse_entry(self, text):
        """
        Parse the title and abstract out of one entry's text.

        Relies on the vixra listing layout: the title is always the 4th line,
        and the abstract is everything between the line starting with
        'Comments:' and the line starting with 'Category:'.
        """
        lines = text.split('\n')
        title = lines[3]
        tmp = text.find('\nComments:')
        first = text.find('\n', tmp + 1)
        last = text.rfind('\nCategory:')
        abstract = text[first:last].replace('\n', ' ')
        return title, abstract
class AutoVixra(VixRa):
    """
    Automatically reads in several years of viXra posts.
    """
    # Month number -> month name, for log messages.
    int2month = {1: 'January', 2: 'February', 3: 'March', 4: 'April',
                 5: 'May', 6: 'June', 7: 'July', 8: 'August',
                 9: 'September', 10: 'October', 11: 'November',
                 12: 'December'}

    def __init__(self, first_year=2010, last_year=2015, first_month=1, last_month=12):
        """
        Download and parse every monthly astro listing from
        first_year/first_month through last_year/last_month (inclusive).
        Years are encoded as two digits in the vixra url (e.g. 2010 -> 10).
        """
        super(AutoVixra, self).__init__()
        # BUG FIX: the original ranges excluded last_year and last_month,
        # so with the defaults December (and all of last_year) was never read.
        for year in range(first_year - 2000, last_year - 2000 + 1):
            for month in range(first_month, last_month + 1):
                logging.info('Reading viXra posts for {} {}'.format(self.int2month[month], year + 2000))
                self.parse_url('http://vixra.org/astro/{:02d}{:02d}'.format(year, month))
                # Be polite to the server between requests.
                time.sleep(2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment