Skip to content

Instantly share code, notes, and snippets.

@astariul
Created July 11, 2023 02:31
Show Gist options
  • Save astariul/359f2bda89d77b49721ed77db1b54f00 to your computer and use it in GitHub Desktop.
Save astariul/359f2bda89d77b49721ed77db1b54f00 to your computer and use it in GitHub Desktop.
Script to analyze the positions of typos in the Twitter typo corpus
# Download the Twitter typo corpus with:
# wget https://luululu.com/tweet/typo-corpus-r1.txt
from collections import Counter, defaultdict
def _print_section(title, typo_counts, size_counts, max_position,
                   normalized_heading="\nNormalized :"):
    """Print one report section (a single typo type, or the grand total).

    Args:
        title: Label shown in the section banner.
        typo_counts: Counter mapping a character index to the number of
            entries whose marker was found at that index.
        size_counts: Counter mapping index ``i`` to the number of words
            that have more than ``i`` characters.
        max_position: Number of leading positions (0-based) to report.
        normalized_heading: Heading printed before the normalized table.
    """
    print(f"\n\n=========== {title} ===========")
    for i in range(max_position):
        print(f"Number of words of size at least {i + 1} : {size_counts[i]}")

    print("\nUnnormalized :")
    for i in range(max_position):
        print(f"Typo at {i} : {typo_counts[i]}")

    print(normalized_heading)
    for i in range(max_position):
        denominator = size_counts[i]
        if denominator:
            print(f"Typo at {i} : {typo_counts[i] / denominator:.3f}")
        else:
            # No word reaches this position, so the ratio is undefined;
            # report that instead of raising ZeroDivisionError.
            print(f"Typo at {i} : n/a")


def main(path="typo-corpus-r1.txt", max_position=12):
    """Report where typos occur inside words, per typo type and overall.

    Each line of the corpus is split on tabs. Field 2 is used as the typo
    type and field 3 as the annotated correction; the typo position is the
    index of the first marker character in that annotated correction.
    Field 1 is used only for its length (presumably the correctly spelled
    word -- confirm against the corpus documentation).

    Args:
        path: Path of the tab-separated corpus file to read.
        max_position: Number of leading character positions to report.

    Raises:
        OSError: If the corpus file cannot be opened.
    """
    indexes = defaultdict(list)
    word_sizes = defaultdict(Counter)

    with open(path, "r") as f:
        for line in f:
            fields = line.split("\t")
            if len(fields) < 4:
                # Blank or truncated line: the type/correction columns are
                # missing, so there is nothing to count.
                continue
            word, typo_type, correction = fields[1], fields[2], fields[3]

            # The marker to look for in the annotated correction depends
            # on the typo type.
            if typo_type == "IN":
                sep = "<"
            elif typo_type == "RM":
                sep = "("
            else:
                sep = "["

            # A word of length n contributes to every position 0..n-1, so
            # word_sizes[type][i] is the number of words longer than i.
            word_sizes[typo_type].update(range(len(word)))

            # str.find returns -1 when the marker is absent; such entries
            # are stored under -1 and never appear in the report.
            indexes[typo_type].append(correction.find(sep))

    total_indexes = Counter()
    total_word_sizes = Counter()
    for typo_type, idxs in indexes.items():
        idxs_c = Counter(idxs)
        sizes = word_sizes[typo_type]
        _print_section(typo_type, idxs_c, sizes, max_position)
        # Accumulate totals in place rather than concatenating lists.
        total_indexes.update(idxs_c)
        total_word_sizes.update(sizes)

    # The TOTAL section has always printed an extra blank line before its
    # normalized table; keep the output unchanged.
    _print_section("TOTAL", total_indexes, total_word_sizes, max_position,
                   normalized_heading="\n\nNormalized :")
# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment