Skip to content

Instantly share code, notes, and snippets.

@ftfarias
Last active November 28, 2023 03:14
Show Gist options
  • Save ftfarias/4bae08b493bcbf287ac212e132ef5143 to your computer and use it in GitHub Desktop.
Save ftfarias/4bae08b493bcbf287ac212e132ef5143 to your computer and use it in GitHub Desktop.
Bigram Detector
import math
from collections import Counter
def calculate_mutual_information(words, bigrams):
    """Score each word and each bigram by a PMI-style association measure.

    For every (word, bigram) pair in which the word is one of the bigram's
    tokens, the pair score is
    ``log2(joint_probability / (word_probability * bigram_probability))``
    where ``word_probability = word_count / total_words``,
    ``bigram_probability = bigram_count / total_bigrams`` and
    ``joint_probability = bigram_count / total_words``.  A word's score is
    the sum of its pair scores over all bigrams containing it; a bigram's
    score is the sum of its pair scores over all words it contains.

    Args:
        words: Iterable of hashable words (duplicates are counted).
        bigrams: Iterable of hashable bigrams (duplicates are counted).
            A string bigram is split on whitespace into tokens; any other
            bigram (e.g. a tuple of words) is tested with its own ``in``.

    Returns:
        A dict mapping every distinct word, followed by every distinct
        bigram, to its float score.  Entries that take part in no pair
        score ``0.0``.  If a bigram is equal to a word, the bigram's score
        is the one kept under that key.
    """
    word_counts = Counter(words)
    bigram_counts = Counter(bigrams)

    total_words = sum(word_counts.values())
    total_bigrams = sum(bigram_counts.values())

    # Tokenise each distinct bigram once.  Matching on tokens rather than on
    # the raw string avoids substring hits such as "Han" in "Hannibal Lecter".
    bigram_tokens = {
        bigram: bigram.split() if isinstance(bigram, str) else bigram
        for bigram in bigram_counts
    }

    # Start from 0.0 so every value in the result is a float.
    word_scores = dict.fromkeys(word_counts, 0.0)
    bigram_scores = dict.fromkeys(bigram_counts, 0.0)

    for word, word_count in word_counts.items():
        word_probability = word_count / total_words
        for bigram, bigram_count in bigram_counts.items():
            if word not in bigram_tokens[bigram]:
                continue
            bigram_probability = bigram_count / total_bigrams
            joint_probability = bigram_count / total_words
            # NOTE: algebraically this ratio is total_bigrams / word_count,
            # so the pair score does not depend on the bigram's own count.
            pair_score = math.log2(
                joint_probability / (word_probability * bigram_probability)
            )
            # The same pair score contributes to both the word and the bigram.
            word_scores[word] += pair_score
            bigram_scores[bigram] += pair_score

    # Words first, then bigrams; a bigram equal to a word overrides its value.
    return {**word_scores, **bigram_scores}
"""
In this function, the words parameter represents a list of individual words,
and the bigrams parameter represents a list of word pairs (bigrams).
The function uses the Counter class from the collections module to count the
frequency of each word and bigram.
The mutual information is calculated for each word by iterating over the word
counts and bigram counts. For each word, the function calculates the word
probability and then iterates over the bigrams to check if the word is present in them.
If the word is present in a bigram, the mutual information is calculated using the
formula log2(joint_probability / (word_probability * bigram_probability)).
The function returns a dictionary whose keys are the individual words
and the bigrams, and whose values are their corresponding mutual information scores.
You can use this function as follows:
"""
# Example: score a small vocabulary against two known bigrams.
words = ["Tatooine", "Palpatine", "Darth", "Vader", "Han", "Solo"]
bigrams = ["Darth Vader", "Han Solo"]

mutual_info_scores = calculate_mutual_information(words, bigrams)

# The returned mapping holds the words first and then the bigrams, so each
# key is printed with the same "Word:" label.
for item, score in mutual_info_scores.items():
    print(f"Word: {item}, Mutual Information: {score}")
"""
Word: Tatooine, Mutual Information: 0.0
Word: Palpatine, Mutual Information: 0.0
Word: Darth, Mutual Information: 1.0
Word: Vader, Mutual Information: 1.0
Word: Han, Mutual Information: 1.0
Word: Solo, Mutual Information: 1.0
Word: Darth Vader, Mutual Information: 2.0
Word: Han Solo, Mutual Information: 2.0
In this example, the words "Darth," "Vader," "Han," and "Solo"
show mutual information scores of 1.0, indicating a strong
association with the bigrams. The words "Tatooine" and "Palpatine"
have a mutual information score of 0.0, indicating no
association with the given bigrams.
"""
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment