@kgullikson88
Last active April 30, 2017 08:22
from __future__ import print_function, division
from bs4 import BeautifulSoup
from pymarkovchain import MarkovChain
import logging
import re
import time
import numpy as np

# HTTPError lives in different modules in Python 2 and 3.
try:
    from urllib.request import urlopen
    from urllib.error import HTTPError
except ImportError:
    from urllib2 import urlopen, HTTPError


class FakeEntry(object):
    """
    A simple class to print out the title and abstract in a prettier way
    """

    def __init__(self, title, abstract):
        self.title = title
        self.abstract = abstract

    def __repr__(self):
        return self.__str__()

    def __str__(self):
        return '{:s}\n\n{:s}'.format(self.title.strip(), self.abstract.strip())


class VixRa(object):
    """
    A class to parse the html in a vixra.org page.
    """

    def __init__(self, filename=None, url=None, markov_db='./MCDB'):
        """
        Initialize the class.

        Parameters:
        ===========
        - filename: The filename of a local html file. VixRa will read in the
                    file and parse it, finding all of the titles and abstracts.
        - url: The url for a vixra page. This is generally more useful than filename.
        - markov_db: The base name for the markov chain database. This argument gets
                     passed to pymarkovchain.MarkovChain.

        Returns:
        ========
        None
        """
        self.titles = []
        self.abstracts = []
        self.title_markov = MarkovChain('{}-titles'.format(markov_db))
        self.abstract_markov = MarkovChain('{}-abstracts'.format(markov_db))

        # Common words that are ignored when measuring title/abstract overlap.
        self._articles = ['the', 'a', 'an', 'of', 'and', 'in', 'on', 'with', 'for', 'is', 'as']

        if filename is not None:
            self.parse_html_file(filename)
        if url is not None:
            self.parse_url(url)
        return

    def parse_url(self, url, update_markov=True):
        """
        Parse the html from a url, and append the titles and abstracts to the instance variable lists.
        If update_markov is True, it will update the markov chain databases to reflect the new text.
        """
        try:
            text = urlopen(url).read()
        except HTTPError:
            logging.warning('URL does not exist: \n{}'.format(url))
            return

        self._parse_text(text, update_markov=update_markov)
        return

    def parse_html_file(self, filename, update_markov=True):
        """
        Parse an html file, and append the titles and abstracts to the instance variable lists.
        If update_markov is True, it will update the markov chain databases to reflect the new text.
        """
        with open(filename) as f:
            text = f.read()

        self._parse_text(text, update_markov=update_markov)
        return

    def _parse_text(self, lines, update_markov=True):
        """ Extract the title and abstract of every entry in the given html text.
        """
        # Use beautifulsoup to extract all the text
        soup = BeautifulSoup(lines, 'html.parser')
        text = soup.get_text()

        # Find where each entry starts (each one begins with '[N] viXra:')
        entry_indices = self._find_abstract_starts(text)

        # Get the title and abstract for each entry
        for start, end in zip(entry_indices[:-1], entry_indices[1:]):
            title, abstract = self._parse_entry(text[start:end])
            self.titles.append(title)
            self.abstracts.append(abstract)

        if update_markov:
            self._train_markov()
        return

    def _get_wordlist(self, string):
        """ Get a word-list out of string, removing parentheses and quotations where necessary
        """
        words = []
        for w in string.split():
            if '(' in w:
                w = w.strip('(')
            if ')' in w:
                w = w.strip(')')
            if '"' in w:
                w = w.strip('"')
            if "'" in w and "n't" not in w and "h's" not in w:
                w = w.strip("'")
            words.append(w.strip().lower())
        return words

    def _get_matchiness(self, title, abstract):
        """ Return a measure of how much the title and abstract match.
            Right now, I do this by searching for the occurrence rate in
            the abstract of words that are in the title (except the, an, etc.)
        """
        # Convert the title into a word list. Remove parentheses when appropriate.
        title_words = self._get_wordlist(title)

        # Remove definite and indefinite articles from the title.
        title_words = [w for w in title_words if w not in self._articles]

        # Get the number of occurrences of each of the title words in the abstract
        abstract_words = self._get_wordlist(abstract)
        abstract_str = ' '.join(abstract_words)  # basically the same as the original abstract, but without parentheses or quotes
        n_words = len(abstract_words)
        if n_words < 50:
            return 0.0
        occurrence_rate = {w: abstract_str.count(w) / n_words for w in title_words}

        # Get average occurrence rate. Might want to change this...
        stat = np.mean(list(occurrence_rate.values())) if len(occurrence_rate) > 0 else 0.0
        return stat
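
    # A rough worked example of the metric above (the numbers are made up for
    # illustration, not taken from the original gist): for the title
    # "Dark Matter in Spiral Galaxies" and a 100-word abstract that contains
    # "dark" twice, "matter" twice, "spiral" once, and "galaxies" once, the
    # per-word occurrence rates are 0.02, 0.02, 0.01, and 0.01, so the
    # matchiness statistic returned is their mean, 0.015.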

    def generate_entry(self, get_match=True, N_trials=10):
        """
        Generate a fake title and abstract. If get_match is True, make N_trials
        attempts and return the pair whose title and abstract match best.
        """
        if not get_match:
            N_trials = 1

        # Start below zero so the first trial is always kept, even if it has zero overlap.
        best_matchiness = -1.0
        best_title, best_abstract = None, None
        for i in range(N_trials):
            title = self.title_markov.generateString()
            abstract = self.abstract_markov.generateString()

            # Remove the leading '----' separator (used to split the training abstracts), if present.
            idx = abstract.find('----')
            if idx >= 0:
                abstract = abstract[idx + 4:]

            matchiness = self._get_matchiness(title, abstract)
            if matchiness > best_matchiness:
                best_title = title
                best_abstract = abstract
                best_matchiness = matchiness

        return FakeEntry(best_title, best_abstract)

    def _train_markov(self):
        if len(self.titles) < 1 or len(self.abstracts) < 1:
            logging.warning('You must give html text before training the markov sampler!')
            return

        title_combined = '. '.join(self.titles)
        abstract_combined = '----'.join(self.abstracts)
        self.title_markov.generateDatabase(title_combined)
        self.abstract_markov.generateDatabase(abstract_combined, sentenceSep='----')
        return

    def _find_abstract_starts(self, text, pattern=r'\[[0-9]+\] viXra:'):
        """ Find the start index of each entry, marked by '[N] viXra:'.
        """
        ab_starts = [m.start() for m in re.finditer(pattern, text)]
        return ab_starts

    def _parse_entry(self, text):
        """ Parses the title and abstract text from an entry.
        """
        # The title is always the 4th line
        lines = text.split('\n')
        title = lines[3]

        # The abstract is everything between the line that starts with 'Comments:'
        # and the line that starts with 'Category:'
        tmp = text.find('\nComments:')
        first = text.find('\n', tmp + 1)
        last = text.rfind('\nCategory:')
        abstract = text[first:last].replace('\n', ' ')
        return title, abstract


class AutoVixra(VixRa):
    """
    This class automatically reads in several years of viXra posts.
    """
    int2month = {1: 'January', 2: 'February', 3: 'March', 4: 'April',
                 5: 'May', 6: 'June', 7: 'July', 8: 'August',
                 9: 'September', 10: 'October', 11: 'November',
                 12: 'December'}

    def __init__(self, first_year=2010, last_year=2015, first_month=1, last_month=12):
        super(AutoVixra, self).__init__()
        for year in range(first_year, last_year + 1):
            for month in range(first_month, last_month + 1):
                logging.info('Reading viXra posts for {} {}'.format(self.int2month[month], year))
                # viXra archive URLs use a two-digit year followed by the month, e.g. /astro/1501
                self.parse_url('http://vixra.org/astro/{:02d}{:02d}'.format(year - 2000, month))
                time.sleep(2)
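

# A minimal usage sketch, not part of the original gist. It assumes network access
# to vixra.org and that the pymarkovchain database files can be written next to this
# script; the year/month range and the trial count are illustrative choices only.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)

    # Build the Markov databases from a few months of astrophysics posts.
    vixra = AutoVixra(first_year=2014, last_year=2014, first_month=1, last_month=3)

    # Generate a fake title/abstract pair, keeping the best-matching of 10 trials.
    entry = vixra.generate_entry(get_match=True, N_trials=10)
    print(entry)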