Skip to content

Instantly share code, notes, and snippets.

@michaelchadwick
Last active June 25, 2024 19:58
Show Gist options
  • Save michaelchadwick/855fe8e119a7def4a92c2c5c59f01b12 to your computer and use it in GitHub Desktop.
Save michaelchadwick/855fe8e119a7def4a92c2c5c59f01b12 to your computer and use it in GitHub Desktop.
Filter a corpus of words to remove certain ones
#!/usr/bin/env python3
"""
Filter corpus of words to remove certain ones
Current filters:
* Vulgarity (offensive words)
* Esoteria (uncommon words)
* Anagrams (re-orderings of existing words)
* By default, this is only used for pangrams (9-letter words)
Future filters:
* Plurals?
"""
import json
import os
import sys
output_filename = None
input_json_str = None
output_json_str = None
words = {}
def word_count(words: dict):
return len(sorted({x for v in words.values() for x in v}))
def load_files():
global output_filename
global words
cli_filename = '';
if len(sys.argv) < 2:
print("Error: No filename provided.")
return False
cli_filename = sys.argv[1]
output_filename = f"{cli_filename[:-5]}.filtered.json"
try:
input_json_str = open(cli_filename, 'r').read()
words = json.loads(input_json_str)
print(f"checking _{word_count(words)}_ words from input: {cli_filename}")
return True
except FileNotFoundError:
print(f"Error: input file not found: {cli_filename}")
return False
# remove any words that are potentially offensive
def filter_vulgarity(words: dict):
vulgarity_file_path = 'vulgarity.txt'
filtered_words = {}
vulgarity_words = []
if not words:
print("Error: No words to check for vulgarity")
return filtered_words
def is_vulgar(word: str):
return word.lower() in vulgarity_words
try:
vulgarity_words = open(vulgarity_file_path, 'r').read()
except FileNotFoundError:
print(f"Error: could not open {vulgarity_file_path}")
if not vulgarity_words:
return words
print(f"- removing vulgarity")
for cat in words:
for i in range(0, len(words[cat])):
print(f"\r-- checking words[{cat}][{i}]", end='', flush=True)
word = words[cat][i]
if not is_vulgar(word):
if not cat in filtered_words:
filtered_words[cat] = []
filtered_words[cat].append(word)
print(f"\r", end='', flush=True)
print(f"--- filtered _{word_count(words)}_ -> _{word_count(filtered_words)}_ vulgar word(s)")
# print(filtered_words)
return filtered_words
# remove any words that are too esoteric
def filter_esoteria(words: dict):
filtered_words = {}
if not words:
print("Error: No words to check for esoteria")
return filtered_words
import nltk
# corpus types: https://www.nltk.org/nltk_data/
from nltk.corpus import brown
try:
# Get the list of common words from the Brown corpus
corpus_brown = set(w.lower() for w in brown.words())
except LookupError:
nltk.download('brown')
corpus_brown = set(w.lower() for w in brown.words())
def is_common(word: str, corpus = 'brown'):
match corpus:
case 'brown':
return word.lower() in corpus_brown
print(f"- removing esoteria")
for cat in words:
for i in range(0, len(words[cat])):
print(f"\r-- checking words[{cat}][{i}]", end='', flush=True)
word = words[cat][i]
if is_common(word):
if not cat in filtered_words:
filtered_words[cat] = []
filtered_words[cat].append(word)
print(f"\r", end='', flush=True)
print(f"--- filtered _{word_count(words)}_ -> _{word_count(filtered_words)}_ common word(s)")
# print(filtered_words)
return filtered_words
# remove any words that are anagrams of other words in input data
def filter_anagrams(words: dict, ignored_cats: list = None):
filtered_words = {}
if not words:
print("Error: No words to check for anagrams")
return filtered_words
print(f"- removing anagrams")
for cat in words:
if ignored_cats:
if int(cat) not in ignored_cats:
for i in range(0, len(words[cat])):
print(f"\r-- words[{cat}][{i}]", end='', flush=True)
anagramsFound = 0
potentialWord = words[cat][i]
# print(f" {potentialWord} vs ...")
for word in words[cat]:
# skip comparing word to itself
if word != potentialWord:
# print(f" {word}")
list1 = list(word.lower())
list1.sort()
word1 = ''.join(map(str, list1))
list2 = list(potentialWord.lower())
list2.sort()
word2 = ''.join(map(str, list2))
if word1 == word2:
# print(f" anagram found: {word}")
anagramsFound += 1
if anagramsFound == 0:
if not cat in filtered_words:
filtered_words[cat] = []
filtered_words[cat].append(potentialWord)
else:
filtered_words[cat] = words[cat]
print(f"\r", end='', flush=True)
print(f"--- filtered _{word_count(words)}_ -> _{word_count(filtered_words)}_ unique word(s)")
# print(filtered_words)
return filtered_words
def main():
global output_filename
if not load_files():
exit(1)
filtered_words = filter_anagrams(
filter_esoteria(filter_vulgarity(words)),
[3, 4, 5, 6, 7, 8]
)
print("----------------------------------------------------------")
print(f"FINAL WORD COUNT: {word_count(filtered_words)}")
print("----------------------------------------------------------")
print('')
output_file = open(output_filename, 'w+')
if output_file:
output_file.write(json.dumps(filtered_words))
print(f"-> output json written: {output_filename}")
output_file.close()
else:
print("Error: Could not write to output file.")
exit(1)
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment