Skip to content

Instantly share code, notes, and snippets.

Embed
What would you like to do?
import sys
import json
import nltk
from nltk.tokenize import RegexpTokenizer
# Penn Treebank POS tags that mark superlatives:
#   JJS - adjective, superlative (e.g. "greatest")
#   RBS - adverb, superlative (e.g. "most")
# For a list of all POS tags and their definitions, see:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
SUPERLATIVE_TAGS = {"JJS", "RBS"}
# Since nltk's default word_tokenize splits on apostrophes and other punctuation,
# we use a simple regex tokenizer that matches runs of word characters in order
# to simplify the calculation of each token's position.
# The pattern is a raw string so that "\w" reaches the regex engine untouched
# instead of being treated as an (invalid) string escape sequence.
tokenizer = RegexpTokenizer(r'\w+')
def tokenize(text):
    """
    Tokenize a blob of text and record the
    starting and ending position of each token.

    Returns a ``(tokens, offsets)`` pair. ``tokens`` is a list of token
    strings and ``offsets`` is a parallel list of ``(start, end)`` tuples
    of character positions in ``text``, with ``end`` exclusive, so that
    ``text[start:end] == token``.
    """
    # Take the positions from the tokenizer's own match spans instead of
    # assuming that exactly one character separates consecutive tokens:
    # that assumption breaks on leading whitespace, repeated spaces and
    # any punctuation the tokenizer drops (e.g. "Hello,  world").
    offsets = list(tokenizer.span_tokenize(text))
    tokens = [text[start:end] for start, end in offsets]
    return tokens, offsets
def detect_superlatives(text):
    """
    Find superlatives in text by POS-tagging its tokens.

    This is a generator: for every token whose tag is in SUPERLATIVE_TAGS
    it yields a dict with the token itself under "token" and the position
    recorded for it by ``tokenize`` under "position".
    """
    words, spans = tokenize(text)
    tagged_words = nltk.pos_tag(words)
    for index, tagged_word in enumerate(tagged_words):
        # Each tagged entry carries its POS tag in the second slot.
        pos = tagged_word[1]
        if pos not in SUPERLATIVE_TAGS:
            continue
        yield {"token": words[index], "position": spans[index]}
if __name__ == "__main__":
    # The script takes exactly one argument: the text to scan.
    if len(sys.argv) == 2:
        # Print every detected superlative as a pretty-printed JSON object.
        for match in detect_superlatives(sys.argv[1]):
            print(json.dumps(match, indent=2))
    else:
        print('usage: python detect_superlatives.py "The world\'s greatest program"')
        sys.exit(1)
@abelsonlive

This comment has been minimized.

Copy link
Owner Author

commented Aug 11, 2019

Superlative Spotter

This script detects superlatives in arbitrary blobs of text: superlative adjectives and adverbs such as "greatest" or "coolest", which often signal exaggerated or hyperbolic expressions of praise.

To use it, first install the dependencies by opening up a terminal and running:

pip install nltk
python -c "import nltk; nltk.download('punkt'); nltk.download('averaged_perceptron_tagger')"

Next, save this file as detect_superlatives.py and execute it like so:

python path/to/detect_superlatives.py 'The coolest cooler'

You should see the following output:

{
  "token": "coolest",
  "position": [
    4,
    11
  ]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.