@andrewbolster
Created April 21, 2024 15:17
import nltk
import random

from nltk.corpus import brown

# Ensure the necessary NLTK resources are downloaded
nltk.download('brown')
nltk.download('universal_tagset')


def get_common_words_by_pos(tag, num_words=100):
    """Return a list of the most common words for a given part-of-speech tag."""
    # Frequency distribution of Brown corpus words carrying the requested universal POS tag
    word_freq = nltk.FreqDist(
        w.lower() for w, pos in brown.tagged_words(tagset='universal') if pos == tag
    )
    # Return the most common words, up to the requested number
    return [word for word, freq in word_freq.most_common(num_words)]


def generate_common_pairs(num_pairs=1):
    """Generate hyphenated adjective-noun pairs drawn from common Brown corpus words."""
    adjectives = get_common_words_by_pos('ADJ', 1000)  # Top 1000 adjectives
    nouns = get_common_words_by_pos('NOUN', 1000)      # Top 1000 nouns
    pairs = []
    for _ in range(num_pairs):
        adjective = random.choice(adjectives)
        noun = random.choice(nouns)
        pairs.append(f"{adjective}-{noun}")
    return pairs


# Example usage:
print(generate_common_pairs(5))
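
Because the pairs are drawn with random.choice, each run produces different output. One possible way to make runs reproducible (a small sketch, not part of the original gist; the seed value is arbitrary) is to seed Python's random module before generating:

# Reproducibility sketch: seeding the RNG makes repeated runs return the same pairs
import random

random.seed(42)  # any fixed integer works; 42 is an arbitrary choice
print(generate_common_pairs(3))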