Last active
April 30, 2017 08:22
-
-
Save kgullikson88/832a15a2205b4fa73559 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from __future__ import print_function

import logging
import os
import re
import sys
import time

import numpy as np
from bs4 import BeautifulSoup
from pymarkovchain import MarkovChain

try:
    from urllib.request import urlopen
    from urllib.error import HTTPError
except ImportError:
    from urllib2 import urlopen, HTTPError
class FakeEntry(object):
    """
    Lightweight container for a generated (title, abstract) pair that
    renders itself as readable text: title, blank line, abstract.
    """

    def __init__(self, title, abstract):
        # Stored verbatim; surrounding whitespace is trimmed at display time.
        self.title = title
        self.abstract = abstract

    def __str__(self):
        return '{:s}\n\n{:s}'.format(self.title.strip(), self.abstract.strip())

    def __repr__(self):
        return str(self)
class VixRa(object):
    """
    Parse vixra.org listing pages, collect paper titles and abstracts, and
    train markov chains on the collected text to generate fake entries.
    """

    def __init__(self, filename=None, url=None, markov_db='./MCDB'):
        """
        Initialize the class.

        Parameters:
        ============
        - filename: The filename of a local html file. VixRa will read in the
                    file and parse it, finding all of the titles and abstracts.
        - url: The url for a vixra page. This is generally more useful than filename.
        - markov_db: The base name for the markov chain database. This argument gets
                     passed to pymarkovchain.MarkovChain.

        Returns:
        ========
        None
        """
        self.titles = []
        self.abstracts = []
        self.title_markov = MarkovChain('{}-titles'.format(markov_db))
        self.abstract_markov = MarkovChain('{}-abstracts'.format(markov_db))
        # Stop words ignored when measuring title/abstract overlap.
        # (Set before any parsing so helpers can rely on it.)
        self._articles = ['the', 'a', 'an', 'of', 'and', 'in', 'on', 'with', 'for', 'is', 'as']
        if filename is not None:
            self.parse_html_file(filename)
        if url is not None:
            self.parse_url(url)

    def parse_url(self, url, update_markov=True):
        """
        Parse the html from a url, and append the titles and abstracts to the
        instance variable lists.  If update_markov is True, update the markov
        chain databases to reflect the new text.
        """
        try:
            # BUG FIX: the original ignored `url` and always fetched
            # http://vixra.org/astro/ regardless of the argument.
            text = urlopen(url).read()
        except HTTPError:
            logging.warning('URL does not exist: \n{}'.format(url))
            return
        self._parse_text(text, update_markov=update_markov)

    def parse_html_file(self, filename, update_markov=True):
        """
        Parse an html file, and append the titles and abstracts to the
        instance variable lists.  If update_markov is True, update the markov
        chain databases to reflect the new text.
        """
        with open(filename) as f:
            text = f.read()
        self._parse_text(text, update_markov=update_markov)

    def _parse_text(self, lines, update_markov=True):
        """Extract every (title, abstract) entry from raw html text."""
        # Use beautifulsoup to strip the markup and keep only the text.
        soup = BeautifulSoup(lines)
        text = soup.get_text()
        # Each paper entry starts with '[N] viXra:'; consecutive start
        # indices bracket one entry.
        entry_indices = self._find_abstract_starts(text)
        for start, end in zip(entry_indices[:-1], entry_indices[1:]):
            title, abstract = self._parse_entry(text[start:end])
            self.titles.append(title)
            self.abstracts.append(abstract)
        if update_markov:
            self._train_markov()

    def _get_wordlist(self, string):
        """
        Get a word-list out of string, removing parentheses and quotations
        where necessary.  Words are lower-cased.
        """
        words = []
        for w in string.split():
            w = w.strip('()"')
            # Keep apostrophes that are part of contractions/possessives
            # (e.g. "don't", "earth's"); strip quoting apostrophes.
            if "'" in w and "n't" not in w and "h's" not in w:
                w = w.strip("'")
            words.append(w.strip().lower())
        return words

    def _get_matchiness(self, title, abstract):
        """
        Return a measure of how much the title and abstract match.
        Measured as the average occurrence rate, within the abstract, of the
        title words (excluding articles such as 'the', 'an', ...).
        Abstracts shorter than 50 words score 0.0.
        """
        title_words = self._get_wordlist(title)
        # Remove definite and indefinite articles from the title.
        title_words = [w for w in title_words if w not in self._articles]
        abstract_words = self._get_wordlist(abstract)
        # Basically the original abstract, without parentheses or quotes.
        abstract_str = ' '.join(abstract_words)
        n_words = len(abstract_words)
        if n_words < 50:
            return 0.0
        # BUG FIX: float() guards against integer division under Python 2,
        # where the rate silently truncated to 0.
        # NOTE: str.count matches substrings, so short title words can be
        # over-counted inside longer abstract words.
        occurence_rate = {w: abstract_str.count(w) / float(n_words) for w in title_words}
        # Average occurrence rate over all title words.
        return np.mean(list(occurence_rate.values())) if len(occurence_rate) > 0 else 0.0

    def generate_entry(self, get_match=True, N_trials=10):
        """
        Generate a fake title and abstract.

        If get_match is True, sample N_trials candidates and return the pair
        whose title best matches its abstract; otherwise return a single
        sample.
        """
        if not get_match:
            N_trials = 1
        best_title = best_abstract = None
        # BUG FIX: start below any possible score so the first trial always
        # binds best_title/best_abstract (the original raised
        # UnboundLocalError when every trial scored 0.0).
        best_matchiness = -1.0
        for _ in range(N_trials):
            title = self.title_markov.generateString()
            abstract = self.abstract_markov.generateString()
            # Remove the '----' separator used to join training abstracts.
            idx = abstract.find('----')
            abstract = abstract[idx + 4:]
            matchiness = self._get_matchiness(title, abstract)
            if matchiness > best_matchiness:
                best_title = title
                best_abstract = abstract
                best_matchiness = matchiness
        return FakeEntry(best_title, best_abstract)

    def _train_markov(self):
        """(Re)build both markov databases from the collected text."""
        if len(self.titles) < 1 or len(self.abstracts) < 1:
            logging.warning('You must give html text before training the markov sampler!')
            return
        title_combined = '. '.join(self.titles)
        # '----' acts as the sentence separator between abstracts.
        abstract_combined = '----'.join(self.abstracts)
        self.title_markov.generateDatabase(title_combined)
        self.abstract_markov.generateDatabase(abstract_combined, sentenceSep='----')

    def _find_abstract_starts(self, text, pattern=r'\[[0-9]+\] viXra:'):
        """Return the start index of every '[N] viXra:' entry marker in text."""
        return [m.start() for m in re.finditer(pattern, text)]

    def _parse_entry(self, text):
        """
        Parse the title and abstract out of one entry's text.

        Relies on the vixra listing layout: the title is always the 4th line,
        and the abstract is everything between the line starting with
        'Comments:' and the line starting with 'Category:'.
        """
        lines = text.split('\n')
        title = lines[3]
        tmp = text.find('\nComments:')
        first = text.find('\n', tmp + 1)
        last = text.rfind('\nCategory:')
        abstract = text[first:last].replace('\n', ' ')
        return title, abstract
class AutoVixra(VixRa):
    """
    Automatically reads in several years of viXra posts.
    """
    # Month number -> month name, for log messages.
    int2month = {1: 'January', 2: 'February', 3: 'March', 4: 'April',
                 5: 'May', 6: 'June', 7: 'July', 8: 'August',
                 9: 'September', 10: 'October', 11: 'November',
                 12: 'December'}

    def __init__(self, first_year=2010, last_year=2015, first_month=1, last_month=12):
        """
        Download and parse every monthly astro listing from
        first_year/first_month through last_year/last_month (inclusive).
        Years are encoded as two digits in the vixra url (e.g. 2010 -> 10).
        """
        super(AutoVixra, self).__init__()
        # BUG FIX: the original ranges excluded last_year and last_month,
        # so with the defaults December (and all of last_year) was never read.
        for year in range(first_year - 2000, last_year - 2000 + 1):
            for month in range(first_month, last_month + 1):
                logging.info('Reading viXra posts for {} {}'.format(self.int2month[month], year + 2000))
                self.parse_url('http://vixra.org/astro/{:02d}{:02d}'.format(year, month))
                # Be polite to the server between requests.
                time.sleep(2)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment