-
-
Save astariul/359f2bda89d77b49721ed77db1b54f00 to your computer and use it in GitHub Desktop.
Script to analyze the positions of typos in the Twitter typo corpus
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Download the Twitter typo corpus with:
#   wget https://luululu.com/tweet/typo-corpus-r1.txt
from collections import Counter, defaultdict
# Marker character that introduces the typo inside the annotated field,
# keyed by typo type. Any other type falls back to "[".
_TYPO_MARKERS = {"IN": "<", "RM": "("}
_DEFAULT_MARKER = "["


def _format_ratio(count, total):
    """Return ``count / total`` with 3 decimals, or ``"n/a"`` if total is 0."""
    if not total:
        # No word is long enough to have this position: the ratio is
        # undefined, so report that instead of raising ZeroDivisionError.
        return "n/a"
    return f"{count / total:.3f}"


def _print_section(title, position_counts, size_counts, max_positions,
                   normalized_header="\nNormalized :"):
    """Print the word-size, raw and normalized typo-position tables."""
    print(f"\n\n=========== {title} ===========")
    for i in range(max_positions):
        print(f"Number of words of size at least {i + 1} : {size_counts[i]}")
    print("\nUnnormalized :")
    for i in range(max_positions):
        print(f"Typo at {i} : {position_counts[i]}")
    print(normalized_header)
    for i in range(max_positions):
        ratio = _format_ratio(position_counts[i], size_counts[i])
        print(f"Typo at {i} : {ratio}")


def main(path="typo-corpus-r1.txt", max_positions=12):
    """Print statistics about where typos occur inside words.

    Each line of the corpus is split on tabs. Field 1 is the word whose
    length is counted, field 2 is the typo type and field 3 is the string in
    which the typo marker is searched ("<" for "IN", "(" for "RM", "[" for
    anything else).
    NOTE(review): presumably field 1 is the corrected word and field 3 its
    annotated form -- confirm against the corpus documentation.

    For every typo type, and then for all types together, three tables are
    printed: how many words have at least ``i + 1`` characters, how many
    typos start at index ``i``, and the ratio of the two.

    Args:
        path: Location of the tab-separated corpus file.
        max_positions: Number of leading character positions to report.
    """
    position_counts = defaultdict(Counter)
    word_sizes = defaultdict(Counter)

    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            fields = line.rstrip("\n").split("\t")
            if len(fields) < 4:
                # Blank or truncated line: nothing to analyze.
                continue
            word, typo_type, annotated = fields[1], fields[2], fields[3]
            marker = _TYPO_MARKERS.get(typo_type, _DEFAULT_MARKER)

            # A word of length n counts as "size at least i + 1" for every
            # i in 0..n-1.
            word_sizes[typo_type].update(range(len(word)))

            # str.find yields -1 when the marker is absent; that entry is
            # kept in the counter but never falls in the reported range.
            position_counts[typo_type][annotated.find(marker)] += 1

    total_positions = Counter()
    total_word_sizes = Counter()
    for typo_type, counts in position_counts.items():
        _print_section(typo_type, counts, word_sizes[typo_type], max_positions)
        total_positions.update(counts)
        total_word_sizes.update(word_sizes[typo_type])

    # The TOTAL section has always used an extra blank line before its
    # normalized table; keep the output identical.
    _print_section("TOTAL", total_positions, total_word_sizes, max_positions,
                   normalized_header="\n\nNormalized :")
if __name__ == "__main__":
    # Run the analysis only when executed as a script, not when imported.
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment