rcdilorenzo/bisynoptic_text_analysis.py

## bisynoptic_text_analysis.py
from word_frequency import WordFrequency

# Analyse both gospels up front; each constructor reads and processes its file.
luke = WordFrequency('Luke', './Luke.txt')
matthew = WordFrequency('Matt', './Matthew.txt')

# Report header, followed by one empty line.
for banner_line in (
    '================================',
    '= Frequently Occurring Phrases =',
    '= Between Matthew & Luke (Max  =',
    '= 4 words/phrase, Min freq 15) =',
    '================================',
    '',
):
    print(banner_line)

# Matthew's frequent phrases against Luke first, then the reverse direction.
for source, target in ((matthew, luke), (luke, matthew)):
    for comparison in source.compare_to(target):
        print(comparison)

## output-no-color.txt
================================
= Frequently Occurring Phrases =
= Between Matthew & Luke (Max  =
= 4 words/phrase, Min freq 15) =
================================

Matt: 'man'|127                  Luke: 'man'|132, 'the son of man'|26, 'son of man is'|5
Matt: 'heaven'|59                Luke: 'heaven'|20
Matt: 'kingdom'|55               Luke: 'kingdom'|43, 'the kingdom of god'|33, 'kingdom of god and'|10, 'kingdom of god is'|7, 'in the kingdom of'|5
Matt: 'day'|39                   Luke: 'day'|64, 'on the sabbath day'|8, 'the sabbath day and'|5
Matt: 'house'|36                 Luke: 'house'|58
Matt: 'the kingdom of heaven'|33 Luke: Less than 5
Matt: 'the son of man'|32        Luke: 'the son of man'|26
Matt: 'hand'|31                  Luke: 'hand'|19
Matt: 'way'|28                   Luke: 'way'|24
Matt: 'mother'|27                Luke: 'mother'|22
Matt: 'multitude'|26             Luke: 'multitude'|18
Matt: 'earth'|25                 Luke: 'earth'|15
Matt: 'time'|25                  Luke: 'time'|23
Matt: 'prophet'|24               Luke: 'prophet'|9
Matt: 'son'|22                   Luke: 'son'|101, 'which was the son'|75, 'was the son of'|75, 'the son of man'|26, 'son of man is'|5
Matt: 'city'|22                  Luke: 'city'|36
Matt: 'forth'|21                 Luke: 'forth'|14
Matt: 'hour'|21                  Luke: 'hour'|13
Matt: 'father'|20                Luke: 'father'|25
Matt: 'lord'|20                  Luke: 'lord'|15
Matt: 'temple'|20                Luke: 'temple'|18
Matt: 'name'|20                  Luke: 'name'|23
Matt: 'brother'|19               Luke: 'brother'|12
Matt: 'word'|19                  Luke: 'word'|20, 'the word of god'|5
Matt: 'world'|18                 Luke: 'world'|10
Matt: 'sea'|18                   Luke: Less than 5
Matt: 'child'|18                 Luke: 'child'|17
Matt: 'wife'|18                  Luke: 'wife'|19
Matt: 'hast'|16                  Luke: 'hast'|14
Matt: 'eye'|16                   Luke: 'eye'|10
Matt: 'body'|15                  Luke: 'body'|13
Matt: 'field'|15                 Luke: 'field'|6
Matt: 'life'|15                  Luke: 'life'|15
Matt: 'heart'|15                 Luke: 'heart'|12
Matt: 'fruit'|15                 Luke: 'fruit'|12
Luke: 'man'|132                  Matt: 'man'|127, 'the son of man'|32, 'son of man be'|6, 'son of man shall'|6, 'son of man is'|5
Luke: 'son'|101                  Matt: 'the son of man'|32, 'son'|22, 'son of man be'|6, 'son of man shall'|6, 'son of man is'|5
Luke: 'which was the son'|75     Matt: Less than 5
Luke: 'was the son of'|75        Matt: Less than 5
Luke: 'day'|64                   Matt: 'day'|39
Luke: 'house'|58                 Matt: 'house'|36
Luke: 'kingdom'|43               Matt: 'kingdom'|55, 'the kingdom of heaven'|33, 'kingdom of heaven is'|14, 'in the kingdom of'|7, 'into the kingdom of'|6, 'for the kingdom of'|5, 'enter into the kingdom'|5, 'the kingdom of god'|5, 'kingdom of heaven and'|5
Luke: 'city'|36                  Matt: 'city'|22
Luke: 'the kingdom of god'|33    Matt: 'the kingdom of god'|5
Luke: 'the son of man'|26        Matt: 'the son of man'|32
Luke: 'father'|25                Matt: 'father'|20
Luke: 'way'|24                   Matt: 'way'|28
Luke: 'name'|23                  Matt: 'name'|20, 'in the name of'|6
Luke: 'time'|23                  Matt: 'time'|25
Luke: 'mother'|22                Matt: 'mother'|27
Luke: 'heaven'|20                Matt: 'heaven'|59, 'the kingdom of heaven'|33, 'which is in heaven'|14, 'kingdom of heaven is'|14, 'of heaven is like'|8, 'kingdom of heaven and'|5
Luke: 'word'|20                  Matt: 'word'|19
Luke: 'hand'|19                  Matt: 'hand'|31
Luke: 'wife'|19                  Matt: 'wife'|18
Luke: 'peace'|18                 Matt: 'peace'|6
Luke: 'sabbath'|18               Matt: 'sabbath'|9
Luke: 'temple'|18                Matt: 'temple'|20, 'in the temple and'|5
Luke: 'multitude'|18             Matt: 'multitude'|26
Luke: 'power'|18                 Matt: 'power'|8
Luke: 'woman'|17                 Matt: 'woman'|10
Luke: 'servant'|17               Matt: 'servant'|9
Luke: 'child'|17                 Matt: 'child'|18, 'the young child and'|5
Luke: 'meat'|16                  Matt: 'meat'|10
Luke: 'country'|16               Matt: 'country'|8
Luke: 'place'|16                 Matt: 'place'|13
Luke: 'spirit'|16                Matt: 'spirit'|5
Luke: 'earth'|15                 Matt: 'earth'|25
Luke: 'lord'|15                  Matt: 'lord'|20, 'angel of the lord'|5
Luke: 'life'|15                  Matt: 'life'|15

## phrase_comparison.py
from termcolor import colored

# Minimum fraction of this phrase's words that must also appear in another
# collection's phrase for that phrase to be reported as a match.
PHRASE_MATCH_THRESHOLD = 0.4
# Width handed to str.ljust for the left-hand column.
# NOTE(review): the left column embeds the escape sequences added by
# termcolor's colored(), which ljust counts as characters, so the visible
# column is presumably narrower than 45 -- confirm before changing.
PADDING = 45


class PhraseComparison:
    """Relate one phrase of a collection to the phrases of another collection.

    The related phrases are computed once, at construction time, and stored in
    ``phrase_matches`` as a list of ``{'phrase': ..., 'count': ...}`` dicts in
    the order they appear in ``other_results``.
    """

    def __init__(self, collection_name, phrase, count,
                 other_collection_name, other_results, threshold):
        """Store the phrase data and compute the matches.

        Args:
            collection_name: Label of the collection ``phrase`` comes from.
            phrase: Space-separated phrase being compared.
            count: Number of occurrences of ``phrase`` in its collection.
            other_collection_name: Label of the collection compared against.
            other_results: Iterable of ``(phrase, count)`` pairs of the other
                collection.
            threshold: Number shown in the "Less than N" message when no
                phrase of the other collection matches.
        """
        self.collection_name = collection_name
        self.phrase = phrase
        self.count = count
        self.other_collection_name = other_collection_name
        self.threshold = threshold
        self.__compare(other_results)

    def __compare(self, other_results):
        """Populate ``self.phrase_matches`` from ``other_results``.

        A phrase of the other collection matches when the share of this
        phrase's words that it contains reaches PHRASE_MATCH_THRESHOLD.
        """
        words = self.phrase.split()
        self.phrase_matches = []
        if not words:
            # An empty phrase has no words to share; also avoids dividing by 0.
            return
        for (other_phrase, other_count) in other_results:
            # Split once per candidate instead of once per word.
            other_words = other_phrase.split()
            matches = sum(1 for word in words if word in other_words)
            # NOTE(review): relies on true division (Python 3). Under
            # Python 2 this `/` floors, so only complete matches would pass.
            if (matches / len(words)) >= PHRASE_MATCH_THRESHOLD:
                self.phrase_matches.append(dict(
                    phrase = other_phrase,
                    count = other_count
                ))

    def __str__(self):
        """Return one report line: this phrase on the left, matches on the right."""
        # A real list (not a lazy map object) so emptiness can be tested.
        matches = [self.__str_match_phrase(match) for match in self.phrase_matches]
        less_than_msg = colored('Less than ' + str(self.threshold), 'red')
        matches_str = ', '.join(matches) if matches else less_than_msg
        count_str = colored(str(self.count), 'blue', attrs = ['bold'])
        left = self.collection_name + ": '" + self.phrase + "'|" + count_str
        right = self.other_collection_name + ": " + matches_str
        return left.ljust(PADDING) + " " + right

    def __str_match_phrase(self, result):
        """Format one match; green when its count is at least ours, red otherwise."""
        color = 'green' if result['count'] >= self.count else 'red'
        return "'" + result['phrase'] + "'|" + colored(str(result['count']), color)

## word_frequency.py
import nltk
import re
import operator

from nltk.util import ngrams
from nltk.tokenize import WordPunctTokenizer

from phrase_comparison import PhraseComparison

# Minimum occurrence count: words/phrases seen fewer times are dropped from
# the results, and the same value is handed to PhraseComparison as the number
# shown in its "Less than N" message.
THRESHOLD = 5


class WordFrequency:
    """Count frequent nouns and noun-bearing n-grams in a text file.

    On construction the file is read, tokenized and reduced to ``results``: a
    list of ``(phrase, count)`` pairs with ``count >= THRESHOLD``, sorted by
    descending count.
    """

    # Lower-cased words that are never counted.
    kjv_stopwords = ["thy", "thou", "art", "hath", "ye", "thee", "thine", "shalt", "unto"]

    # NOTE: the default tokenizer is created once, when the class body is
    # evaluated, and is shared by every instance that does not pass its own.
    def __init__(self, name, filename, gram_count = 4, tokenizer = WordPunctTokenizer()):
        """Store the settings and immediately analyse ``filename``.

        Args:
            name: Label used when printing or comparing results.
            filename: Path of the text file to analyse.
            gram_count: Number of tokens per n-gram phrase.
            tokenizer: Object whose ``tokenize(text)`` returns the tokens.
        """
        self.name = name
        self.filename = filename
        self.gram_count = gram_count
        self.tokenizer = tokenizer
        self.run()

    def run(self):
        """(Re)read the file and rebuild ``self.results``."""
        self.results = self.__top_words(self.__compact_content(self.filename))

    def compare_to(self, other, min_frequency = 15):
        """Compare each sufficiently frequent phrase with ``other``'s results.

        Args:
            other: Another WordFrequency whose ``name`` and ``results`` are used.
            min_frequency: Only phrases counted at least this often are compared.

        Returns:
            A list of PhraseComparison objects, in the order of ``self.results``.
        """
        return [self.__compare(phrase, count, other)
                for (phrase, count) in self.results
                if count >= min_frequency]

    def print_results(self):
        """Print the collection name and the raw results, re-running if empty."""
        if not self.results:
            self.run()
        print(self.name)
        print(self.results)

    def __compare(self, phrase, count, other):
        """Build the PhraseComparison of one phrase against ``other``."""
        return PhraseComparison(
            self.name, phrase, count, other.name, other.results, THRESHOLD
        )

    def __should_append_word(self, word):
        """Return a truthy value when ``word`` is countable.

        Countable means: longer than one character, made only of ASCII letters
        and whitespace, and not one of ``kjv_stopwords`` (case-insensitive).
        """
        lower = word.lower()
        return (len(lower) > 1 and re.match(r'^[a-zA-Z\s]+$', lower) and
               lower not in self.kjv_stopwords)

    def __should_append(self, word, pos):
        """Accept countable words tagged "NN" and countable n-grams ("GRAM")."""
        return self.__should_append_word(word) and (pos == "NN" or pos == "GRAM")

    def __append_word(self, word, acc):
        """Increment the count of lower-cased ``word`` in the dict ``acc``."""
        lower = word.lower()
        acc[lower] = acc.get(lower, 0) + 1

    def __contains_noun(self, grams):
        """Return True when any token of the n-gram is tagged "NN".

        NOTE(review): every n-gram is tagged on its own, separately from the
        tagging of the full token list done in the word-frequency pass.
        """
        return any("NN" == pos for (_word, pos) in nltk.pos_tag(grams))

    def __combined_grams(self, tokens):
        """Return ``(phrase, "GRAM")`` pairs for each noun-bearing n-gram."""
        list_of_subgrams = ngrams(tokens, self.gram_count)
        return [(' '.join(subgrams), "GRAM")
                for subgrams in list_of_subgrams
                if self.__contains_noun(subgrams)]

    def __word_freq(self, content):
        """Return a dict mapping each counted word/phrase to its occurrences."""
        freq_count = dict()
        # Uncountable tokens are dropped before the n-grams are built, so an
        # n-gram can span tokens that were not adjacent in the text.
        tokens = [token for token in self.tokenizer.tokenize(content)
                  if self.__should_append_word(token)]
        grams = self.__combined_grams(tokens)
        tagged = nltk.pos_tag(tokens) + grams

        for (word, pos) in tagged:
            if self.__should_append(word, pos):
                self.__append_word(word, freq_count)

        return freq_count

    def __top_words(self, content):
        """Return ``(phrase, count)`` pairs with count >= THRESHOLD, most frequent first."""
        freq = {word: count
                for word, count in self.__word_freq(content).items()
                if count >= THRESHOLD}

        return sorted(freq.items(), key = operator.itemgetter(1), reverse = True)

    def __compact_content(self, name):
        """Read file ``name`` and return its stripped lines joined on one line.

        Every line is stripped and prefixed with a single space, so the
        result starts with a space when the file is non-empty.
        """
        # The context manager closes the handle even if reading fails.
        with open(name) as f:
            return "".join(" " + line.strip() for line in f)
	from word_frequency import WordFrequency

	# Analyse both gospels up front; each constructor reads and processes its file.
	luke = WordFrequency('Luke', './Luke.txt')
	matthew = WordFrequency('Matt', './Matthew.txt')

	# Report header; every banner line is 32 characters wide.
	print('================================')
	print('= Frequently Occurring Phrases =')
	print('= Between Matthew & Luke (Max  =')
	print('= 4 words/phrase, Min freq 15) =')
	print('================================')
	print('')

	# Matthew's frequent phrases compared against Luke.
	mtl_comparisons = matthew.compare_to(luke)
	for comparison in mtl_comparisons:
	    print(str(comparison))

	# Luke's frequent phrases compared against Matthew.
	ltm_comparisons = luke.compare_to(matthew)
	for comparison in ltm_comparisons:
	    print(str(comparison))
	================================
	= Frequently Occurring Phrases =
	= Between Matthew & Luke (Max  =
	= 4 words/phrase, Min freq 15) =
	================================

	Matt: 'man'\|127 Luke: 'man'\|132, 'the son of man'\|26, 'son of man is'\|5
	Matt: 'heaven'\|59 Luke: 'heaven'\|20
	Matt: 'kingdom'\|55 Luke: 'kingdom'\|43, 'the kingdom of god'\|33, 'kingdom of god and'\|10, 'kingdom of god is'\|7, 'in the kingdom of'\|5
	Matt: 'day'\|39 Luke: 'day'\|64, 'on the sabbath day'\|8, 'the sabbath day and'\|5
	Matt: 'house'\|36 Luke: 'house'\|58
	Matt: 'the kingdom of heaven'\|33 Luke: Less than 5
	Matt: 'the son of man'\|32 Luke: 'the son of man'\|26
	Matt: 'hand'\|31 Luke: 'hand'\|19
	Matt: 'way'\|28 Luke: 'way'\|24
	Matt: 'mother'\|27 Luke: 'mother'\|22
	Matt: 'multitude'\|26 Luke: 'multitude'\|18
	Matt: 'earth'\|25 Luke: 'earth'\|15
	Matt: 'time'\|25 Luke: 'time'\|23
	Matt: 'prophet'\|24 Luke: 'prophet'\|9
	Matt: 'son'\|22 Luke: 'son'\|101, 'which was the son'\|75, 'was the son of'\|75, 'the son of man'\|26, 'son of man is'\|5
	Matt: 'city'\|22 Luke: 'city'\|36
	Matt: 'forth'\|21 Luke: 'forth'\|14
	Matt: 'hour'\|21 Luke: 'hour'\|13
	Matt: 'father'\|20 Luke: 'father'\|25
	Matt: 'lord'\|20 Luke: 'lord'\|15
	Matt: 'temple'\|20 Luke: 'temple'\|18
	Matt: 'name'\|20 Luke: 'name'\|23
	Matt: 'brother'\|19 Luke: 'brother'\|12
	Matt: 'word'\|19 Luke: 'word'\|20, 'the word of god'\|5
	Matt: 'world'\|18 Luke: 'world'\|10
	Matt: 'sea'\|18 Luke: Less than 5
	Matt: 'child'\|18 Luke: 'child'\|17
	Matt: 'wife'\|18 Luke: 'wife'\|19
	Matt: 'hast'\|16 Luke: 'hast'\|14
	Matt: 'eye'\|16 Luke: 'eye'\|10
	Matt: 'body'\|15 Luke: 'body'\|13
	Matt: 'field'\|15 Luke: 'field'\|6
	Matt: 'life'\|15 Luke: 'life'\|15
	Matt: 'heart'\|15 Luke: 'heart'\|12
	Matt: 'fruit'\|15 Luke: 'fruit'\|12
	Luke: 'man'\|132 Matt: 'man'\|127, 'the son of man'\|32, 'son of man be'\|6, 'son of man shall'\|6, 'son of man is'\|5
	Luke: 'son'\|101 Matt: 'the son of man'\|32, 'son'\|22, 'son of man be'\|6, 'son of man shall'\|6, 'son of man is'\|5
	Luke: 'which was the son'\|75 Matt: Less than 5
	Luke: 'was the son of'\|75 Matt: Less than 5
	Luke: 'day'\|64 Matt: 'day'\|39
	Luke: 'house'\|58 Matt: 'house'\|36
	Luke: 'kingdom'\|43 Matt: 'kingdom'\|55, 'the kingdom of heaven'\|33, 'kingdom of heaven is'\|14, 'in the kingdom of'\|7, 'into the kingdom of'\|6, 'for the kingdom of'\|5, 'enter into the kingdom'\|5, 'the kingdom of god'\|5, 'kingdom of heaven and'\|5
	Luke: 'city'\|36 Matt: 'city'\|22
	Luke: 'the kingdom of god'\|33 Matt: 'the kingdom of god'\|5
	Luke: 'the son of man'\|26 Matt: 'the son of man'\|32
	Luke: 'father'\|25 Matt: 'father'\|20
	Luke: 'way'\|24 Matt: 'way'\|28
	Luke: 'name'\|23 Matt: 'name'\|20, 'in the name of'\|6
	Luke: 'time'\|23 Matt: 'time'\|25
	Luke: 'mother'\|22 Matt: 'mother'\|27
	Luke: 'heaven'\|20 Matt: 'heaven'\|59, 'the kingdom of heaven'\|33, 'which is in heaven'\|14, 'kingdom of heaven is'\|14, 'of heaven is like'\|8, 'kingdom of heaven and'\|5
	Luke: 'word'\|20 Matt: 'word'\|19
	Luke: 'hand'\|19 Matt: 'hand'\|31
	Luke: 'wife'\|19 Matt: 'wife'\|18
	Luke: 'peace'\|18 Matt: 'peace'\|6
	Luke: 'sabbath'\|18 Matt: 'sabbath'\|9
	Luke: 'temple'\|18 Matt: 'temple'\|20, 'in the temple and'\|5
	Luke: 'multitude'\|18 Matt: 'multitude'\|26
	Luke: 'power'\|18 Matt: 'power'\|8
	Luke: 'woman'\|17 Matt: 'woman'\|10
	Luke: 'servant'\|17 Matt: 'servant'\|9
	Luke: 'child'\|17 Matt: 'child'\|18, 'the young child and'\|5
	Luke: 'meat'\|16 Matt: 'meat'\|10
	Luke: 'country'\|16 Matt: 'country'\|8
	Luke: 'place'\|16 Matt: 'place'\|13
	Luke: 'spirit'\|16 Matt: 'spirit'\|5
	Luke: 'earth'\|15 Matt: 'earth'\|25
	Luke: 'lord'\|15 Matt: 'lord'\|20, 'angel of the lord'\|5
	Luke: 'life'\|15 Matt: 'life'\|15
	from termcolor import colored

	# Minimum fraction of this phrase's words that must also appear in another
	# collection's phrase for that phrase to be reported as a match.
	PHRASE_MATCH_THRESHOLD = 0.4
	# Width handed to str.ljust for the left-hand column.
	# NOTE(review): the left column embeds the escape sequences added by
	# termcolor's colored(), which ljust counts as characters, so the visible
	# column is presumably narrower than 45 -- confirm before changing.
	PADDING = 45


	class PhraseComparison:
	    """Relate one phrase of a collection to the phrases of another collection.

	    The related phrases are computed once, at construction time, and stored
	    in ``phrase_matches`` as a list of ``{'phrase': ..., 'count': ...}``
	    dicts in the order they appear in ``other_results``.
	    """

	    def __init__(self, collection_name, phrase, count,
	                 other_collection_name, other_results, threshold):
	        """Store the phrase data and compute the matches.

	        Args:
	            collection_name: Label of the collection ``phrase`` comes from.
	            phrase: Space-separated phrase being compared.
	            count: Number of occurrences of ``phrase`` in its collection.
	            other_collection_name: Label of the collection compared against.
	            other_results: Iterable of ``(phrase, count)`` pairs of the
	                other collection.
	            threshold: Number shown in the "Less than N" message when no
	                phrase of the other collection matches.
	        """
	        self.collection_name = collection_name
	        self.phrase = phrase
	        self.count = count
	        self.other_collection_name = other_collection_name
	        self.threshold = threshold
	        self.__compare(other_results)

	    def __compare(self, other_results):
	        """Populate ``self.phrase_matches`` from ``other_results``.

	        A phrase of the other collection matches when the share of this
	        phrase's words that it contains reaches PHRASE_MATCH_THRESHOLD.
	        """
	        words = self.phrase.split()
	        self.phrase_matches = []
	        if not words:
	            # An empty phrase has no words to share; also avoids dividing by 0.
	            return
	        for (other_phrase, other_count) in other_results:
	            # Split once per candidate instead of once per word.
	            other_words = other_phrase.split()
	            matches = sum(1 for word in words if word in other_words)
	            # NOTE(review): relies on true division (Python 3). Under
	            # Python 2 this `/` floors, so only complete matches would pass.
	            if (matches / len(words)) >= PHRASE_MATCH_THRESHOLD:
	                self.phrase_matches.append(dict(
	                    phrase = other_phrase,
	                    count = other_count
	                ))

	    def __str__(self):
	        """Return one report line: this phrase on the left, matches on the right."""
	        # A real list (not a lazy map object) so emptiness can be tested.
	        matches = [self.__str_match_phrase(match) for match in self.phrase_matches]
	        less_than_msg = colored('Less than ' + str(self.threshold), 'red')
	        matches_str = ', '.join(matches) if matches else less_than_msg
	        count_str = colored(str(self.count), 'blue', attrs = ['bold'])
	        # Plain "|" separator: the backslash before it was an escaping artifact.
	        left = self.collection_name + ": '" + self.phrase + "'|" + count_str
	        right = self.other_collection_name + ": " + matches_str
	        return left.ljust(PADDING) + " " + right

	    def __str_match_phrase(self, result):
	        """Format one match; green when its count is at least ours, red otherwise."""
	        color = 'green' if result['count'] >= self.count else 'red'
	        return "'" + result['phrase'] + "'|" + colored(str(result['count']), color)
	import nltk
	import re
	import operator

	from nltk.util import ngrams
	from nltk.tokenize import WordPunctTokenizer

	from phrase_comparison import PhraseComparison

	# Minimum occurrence count: words/phrases seen fewer times are dropped from
	# the results, and the same value is handed to PhraseComparison as the
	# number shown in its "Less than N" message.
	THRESHOLD = 5


	class WordFrequency:
	    """Count frequent nouns and noun-bearing n-grams in a text file.

	    On construction the file is read, tokenized and reduced to ``results``:
	    a list of ``(phrase, count)`` pairs with ``count >= THRESHOLD``, sorted
	    by descending count.
	    """

	    # Lower-cased words that are never counted.
	    kjv_stopwords = ["thy", "thou", "art", "hath", "ye", "thee", "thine", "shalt", "unto"]

	    # NOTE: the default tokenizer is created once, when the class body is
	    # evaluated, and is shared by every instance that does not pass its own.
	    def __init__(self, name, filename, gram_count = 4, tokenizer = WordPunctTokenizer()):
	        """Store the settings and immediately analyse ``filename``.

	        Args:
	            name: Label used when printing or comparing results.
	            filename: Path of the text file to analyse.
	            gram_count: Number of tokens per n-gram phrase.
	            tokenizer: Object whose ``tokenize(text)`` returns the tokens.
	        """
	        self.name = name
	        self.filename = filename
	        self.gram_count = gram_count
	        self.tokenizer = tokenizer
	        self.run()

	    def run(self):
	        """(Re)read the file and rebuild ``self.results``."""
	        self.results = self.__top_words(self.__compact_content(self.filename))

	    def compare_to(self, other, min_frequency = 15):
	        """Compare each sufficiently frequent phrase with ``other``'s results.

	        Args:
	            other: Another WordFrequency whose ``name`` and ``results`` are used.
	            min_frequency: Only phrases counted at least this often are compared.

	        Returns:
	            A list of PhraseComparison objects, in the order of ``self.results``.
	        """
	        return [self.__compare(phrase, count, other)
	                for (phrase, count) in self.results
	                if count >= min_frequency]

	    def print_results(self):
	        """Print the collection name and the raw results, re-running if empty."""
	        if not self.results:
	            self.run()
	        print(self.name)
	        print(self.results)

	    def __compare(self, phrase, count, other):
	        """Build the PhraseComparison of one phrase against ``other``."""
	        return PhraseComparison(
	            self.name, phrase, count, other.name, other.results, THRESHOLD
	        )

	    def __should_append_word(self, word):
	        """Return a truthy value when ``word`` is countable.

	        Countable means: longer than one character, made only of ASCII
	        letters and whitespace, and not one of ``kjv_stopwords``
	        (case-insensitive).
	        """
	        lower = word.lower()
	        return (len(lower) > 1 and re.match(r'^[a-zA-Z\s]+$', lower) and
	               lower not in self.kjv_stopwords)

	    def __should_append(self, word, pos):
	        """Accept countable words tagged "NN" and countable n-grams ("GRAM")."""
	        return self.__should_append_word(word) and (pos == "NN" or pos == "GRAM")

	    def __append_word(self, word, acc):
	        """Increment the count of lower-cased ``word`` in the dict ``acc``."""
	        lower = word.lower()
	        acc[lower] = acc.get(lower, 0) + 1

	    def __contains_noun(self, grams):
	        """Return True when any token of the n-gram is tagged "NN".

	        NOTE(review): every n-gram is tagged on its own, separately from the
	        tagging of the full token list done in the word-frequency pass.
	        """
	        return any("NN" == pos for (_word, pos) in nltk.pos_tag(grams))

	    def __combined_grams(self, tokens):
	        """Return ``(phrase, "GRAM")`` pairs for each noun-bearing n-gram."""
	        list_of_subgrams = ngrams(tokens, self.gram_count)
	        return [(' '.join(subgrams), "GRAM")
	                for subgrams in list_of_subgrams
	                if self.__contains_noun(subgrams)]

	    def __word_freq(self, content):
	        """Return a dict mapping each counted word/phrase to its occurrences."""
	        freq_count = dict()
	        # Uncountable tokens are dropped before the n-grams are built, so an
	        # n-gram can span tokens that were not adjacent in the text.
	        tokens = [token for token in self.tokenizer.tokenize(content)
	                  if self.__should_append_word(token)]
	        grams = self.__combined_grams(tokens)
	        tagged = nltk.pos_tag(tokens) + grams

	        for (word, pos) in tagged:
	            if self.__should_append(word, pos):
	                self.__append_word(word, freq_count)

	        return freq_count

	    def __top_words(self, content):
	        """Return ``(phrase, count)`` pairs with count >= THRESHOLD, most frequent first."""
	        freq = {word: count
	                for word, count in self.__word_freq(content).items()
	                if count >= THRESHOLD}

	        return sorted(freq.items(), key = operator.itemgetter(1), reverse = True)

	    def __compact_content(self, name):
	        """Read file ``name`` and return its stripped lines joined on one line.

	        Every line is stripped and prefixed with a single space, so the
	        result starts with a space when the file is non-empty.
	        """
	        # The context manager closes the handle even if reading fails.
	        with open(name) as f:
	            return "".join(" " + line.strip() for line in f)