Last active
August 17, 2019 16:58
-
-
Save abelsonlive/753ee9f692cab0181e32c17e7e30fe75 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import json | |
import nltk | |
from nltk.tokenize import RegexpTokenizer | |
# Penn Treebank POS tags treated as superlatives:
#   JJS - adjective, superlative
#   RBS - adverb, superlative
# For a list of all POS tags and their definitions, see:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
SUPERLATIVE_TAGS = {"JJS", "RBS"}

# nltk's default word tokenizer splits on apostrophes and other punctuation,
# so instead we tokenize with a simple regex that matches runs of word
# characters (punctuation and whitespace are dropped). This simplifies the
# calculation of each token's position in the original text.
# The pattern is a raw string so that "\w" is not parsed as a (invalid)
# string escape sequence.
tokenizer = RegexpTokenizer(r'\w+')
def tokenize(text):
    """
    Tokenize a blob of text and record the starting and ending
    position of each token within the original string.

    Args:
        text: The string to tokenize with the module-level ``tokenizer``.

    Returns:
        A ``(tokens, offsets)`` pair of equal-length lists, where
        ``offsets[i]`` is the ``(start, end)`` character span of
        ``tokens[i]``, i.e. ``text[start:end] == tokens[i]``.
    """
    # Take the real match spans from the tokenizer instead of assuming that
    # consecutive tokens are separated by exactly one character; that
    # assumption yields wrong positions for leading whitespace, repeated
    # spaces, or punctuation followed by a space (e.g. "Hello, world").
    offsets = list(tokenizer.span_tokenize(text))
    tokens = [text[start:end] for start, end in offsets]
    return tokens, offsets
def detect_superlatives(text):
    """
    Find superlatives in ``text`` using part-of-speech tagging.

    The text is split with ``tokenize()``, tagged with ``nltk.pos_tag``,
    and every token whose tag is in ``SUPERLATIVE_TAGS`` is reported.

    Yields:
        A dict ``{"token": ..., "position": ...}`` for each match, where
        ``position`` is the offset pair that ``tokenize()`` recorded
        for that token.
    """
    words, spans = tokenize(text)
    tagged_words = nltk.pos_tag(words)
    for index, tagged in enumerate(tagged_words):
        pos = tagged[1]  # each entry is a (word, tag) pair
        if pos not in SUPERLATIVE_TAGS:
            continue
        yield {"token": words[index], "position": spans[index]}
if __name__ == "__main__":
    # The script takes exactly one argument: the text to scan.
    if len(sys.argv) != 2:
        print('usage: python detect_superlatives.py "The world\'s greatest program"')
        sys.exit(1)
    # Iterate the generator directly; there is no need to build a
    # throwaway list just to loop over it once.
    for superlative in detect_superlatives(sys.argv[1]):
        print(json.dumps(superlative, indent=2))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Superlative Spotter
This script detects superlatives (exaggerated or hyperbolical expressions of praise) in arbitrary blobs of text.
To use it, first install the dependencies by opening up a terminal and running:
Next, save this file as
detect_superlatives.py
and execute it like so:
You should see the following output: