Skip to content

Instantly share code, notes, and snippets.

@michaelchadwick
Last active June 25, 2024 19:58
Show Gist options
  • Save michaelchadwick/855fe8e119a7def4a92c2c5c59f01b12 to your computer and use it in GitHub Desktop.
Save michaelchadwick/855fe8e119a7def4a92c2c5c59f01b12 to your computer and use it in GitHub Desktop.
Filter a corpus of words to remove certain ones
#!/usr/bin/env python3
"""
Filter corpus of words to remove certain ones
Current filters:
* Vulgarity (offensive words)
* Esoteria (uncommon words)
* Anagrams (re-orderings of existing words)
* By default, this is only used for pangrams (9-letter words)
Future filters:
* Plurals?
"""
import json
import os
import sys
output_filename = None
input_json_str = None
output_json_str = None
words = {}
def word_count(words: dict):
return len(sorted({x for v in words.values() for x in v}))
def load_files():
global output_filename
global words
cli_filename = '';
if len(sys.argv) < 2:
print("Error: No filename provided.")
return False
cli_filename = sys.argv[1]
output_filename = f"{cli_filename[:-5]}.filtered.json"
try:
input_json_str = open(cli_filename, 'r').read()
words = json.loads(input_json_str)
print(f"checking _{word_count(words)}_ words from input: {cli_filename}")
return True
except FileNotFoundError:
print(f"Error: input file not found: {cli_filename}")
return False
# remove any words that are potentially offensive
def filter_vulgarity(words: dict):
vulgarity_file_path = 'vulgarity.txt'
filtered_words = {}
vulgarity_words = []
if not words:
print("Error: No words to check for vulgarity")
return filtered_words
def is_vulgar(word: str):
return word.lower() in vulgarity_words
try:
vulgarity_words = open(vulgarity_file_path, 'r').read()
except FileNotFoundError:
print(f"Error: could not open {vulgarity_file_path}")
if not vulgarity_words:
return words
print(f"- removing vulgarity")
for cat in words:
for i in range(0, len(words[cat])):
print(f"\r-- checking words[{cat}][{i}]", end='', flush=True)
word = words[cat][i]
if not is_vulgar(word):
if not cat in filtered_words:
filtered_words[cat] = []
filtered_words[cat].append(word)
print(f"\r", end='', flush=True)
print(f"--- filtered _{word_count(words)}_ -> _{word_count(filtered_words)}_ vulgar word(s)")
# print(filtered_words)
return filtered_words
# remove any words that are too esoteric
def filter_esoteria(words: dict):
filtered_words = {}
if not words:
print("Error: No words to check for esoteria")
return filtered_words
import nltk
# corpus types: https://www.nltk.org/nltk_data/
from nltk.corpus import brown
try:
# Get the list of common words from the Brown corpus
corpus_brown = set(w.lower() for w in brown.words())
except LookupError:
nltk.download('brown')
corpus_brown = set(w.lower() for w in brown.words())
def is_common(word: str, corpus = 'brown'):
match corpus:
case 'brown':
return word.lower() in corpus_brown
print(f"- removing esoteria")
for cat in words:
for i in range(0, len(words[cat])):
print(f"\r-- checking words[{cat}][{i}]", end='', flush=True)
word = words[cat][i]
if is_common(word):
if not cat in filtered_words:
filtered_words[cat] = []
filtered_words[cat].append(word)
print(f"\r", end='', flush=True)
print(f"--- filtered _{word_count(words)}_ -> _{word_count(filtered_words)}_ common word(s)")
# print(filtered_words)
return filtered_words
# remove any words that are anagrams of other words in input data
def filter_anagrams(words: dict, ignored_cats: list = None):
filtered_words = {}
if not words:
print("Error: No words to check for anagrams")
return filtered_words
print(f"- removing anagrams")
for cat in words:
if ignored_cats:
if int(cat) not in ignored_cats:
for i in range(0, len(words[cat])):
print(f"\r-- words[{cat}][{i}]", end='', flush=True)
anagramsFound = 0
potentialWord = words[cat][i]
# print(f" {potentialWord} vs ...")
for word in words[cat]:
# skip comparing word to itself
if word != potentialWord:
# print(f" {word}")
list1 = list(word.lower())
list1.sort()
word1 = ''.join(map(str, list1))
list2 = list(potentialWord.lower())
list2.sort()
word2 = ''.join(map(str, list2))
if word1 == word2:
# print(f" anagram found: {word}")
anagramsFound += 1
if anagramsFound == 0:
if not cat in filtered_words:
filtered_words[cat] = []
filtered_words[cat].append(potentialWord)
else:
filtered_words[cat] = words[cat]
print(f"\r", end='', flush=True)
print(f"--- filtered _{word_count(words)}_ -> _{word_count(filtered_words)}_ unique word(s)")
# print(filtered_words)
return filtered_words
def main():
global output_filename
if not load_files():
exit(1)
filtered_words = filter_anagrams(
filter_esoteria(filter_vulgarity(words)),
[3, 4, 5, 6, 7, 8]
)
print("----------------------------------------------------------")
print(f"FINAL WORD COUNT: {word_count(filtered_words)}")
print("----------------------------------------------------------")
print('')
output_file = open(output_filename, 'w+')
if output_file:
output_file.write(json.dumps(filtered_words))
print(f"-> output json written: {output_filename}")
output_file.close()
else:
print("Error: Could not write to output file.")
exit(1)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment