# abelsonlive/detect_superlatives.py

## detect_superlatives.py
import sys
import json

import nltk
from nltk.tokenize import RegexpTokenizer

# For a list of all POS tags and their definitions, see:
# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
# JJS = superlative adjective, RBS = superlative adverb.
SUPERLATIVE_TAGS = {"JJS", "RBS"}

# Since nltk's default word tokenizer splits on apostrophes and other
# punctuation, we use a plain regexp tokenizer that keeps runs of word
# characters (dropping punctuation and whitespace) in order to simplify the
# calculation of each token's position.
# The pattern is a raw string so that `\w` reaches the regex engine verbatim
# instead of being parsed as an invalid string escape sequence.
tokenizer = RegexpTokenizer(r'\w+')


def tokenize(text):
    """
    Tokenize a blob of text and record the
    starting and ending position of each token.

    Returns a ``(tokens, offsets)`` pair. ``tokens`` is the list of token
    strings and ``offsets`` is a parallel list of ``(start, end)`` character
    indices into ``text`` such that ``text[start:end]`` is the token.
    """
    # Use the spans reported by the tokenizer rather than assuming that
    # tokens start at index 0 and are separated by exactly one character:
    # leading whitespace, repeated whitespace, or punctuation dropped by the
    # tokenizer would otherwise shift every subsequent offset.
    offsets = list(tokenizer.span_tokenize(text))
    # Slicing by span keeps tokens and offsets aligned by construction.
    tokens = [text[start:end] for start, end in offsets]
    return tokens, offsets


def detect_superlatives(text):
    """
    Yield one record per superlative found in ``text``.

    The text is tokenized and POS-tagged with nltk. Every token whose tag is
    in ``SUPERLATIVE_TAGS`` is yielded as a dict with the keys ``"token"``
    (the token string) and ``"position"`` (that token's offsets as reported
    by ``tokenize``).
    """
    words, spans = tokenize(text)
    tagged = nltk.pos_tag(words)
    # Walk tokens, positions and (token, tag) pairs in lockstep.
    for word, span, (_, pos) in zip(words, spans, tagged):
        if pos not in SUPERLATIVE_TAGS:
            continue
        yield {"token": word, "position": span}


if __name__ == "__main__":
    # Exactly one positional argument (the text to scan) is expected.
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print('usage: python detect_superlatives.py "The world\'s greatest program"')
        sys.exit(1)
    # Collect every match first, then print each one as an indented JSON object.
    matches = list(detect_superlatives(cli_args[0]))
    for match in matches:
        print(json.dumps(match, indent=2))
	import sys
	import json

	import nltk
	from nltk.tokenize import RegexpTokenizer

	# For a list of all POS tags and their definitions, see:
	# https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html
	# JJS = superlative adjective, RBS = superlative adverb.
	SUPERLATIVE_TAGS = {"JJS", "RBS"}

	# Since nltk's default word tokenizer splits on apostrophes and other
	# punctuation, we use a plain regexp tokenizer that keeps runs of word
	# characters (dropping punctuation and whitespace) in order to simplify the
	# calculation of each token's position.
	# The pattern is a raw string so that `\w` reaches the regex engine verbatim
	# instead of being parsed as an invalid string escape sequence.
	tokenizer = RegexpTokenizer(r'\w+')


	def tokenize(text):
		"""
		Tokenize a blob of text and record the
		starting and ending position of each token.

		Returns a ``(tokens, offsets)`` pair. ``tokens`` is the list of token
		strings and ``offsets`` is a parallel list of ``(start, end)`` character
		indices into ``text`` such that ``text[start:end]`` is the token.
		"""
		# Use the spans reported by the tokenizer rather than assuming that
		# tokens start at index 0 and are separated by exactly one character:
		# leading whitespace, repeated whitespace, or punctuation dropped by the
		# tokenizer would otherwise shift every subsequent offset.
		offsets = list(tokenizer.span_tokenize(text))
		# Slicing by span keeps tokens and offsets aligned by construction.
		tokens = [text[start:end] for start, end in offsets]
		return tokens, offsets


	def detect_superlatives(text):
		"""
		Detect superlatives in text via POS tagging
		and yield the position(s) of the superlative(s) in the original text.

		Each yielded item is a dict with the keys ``"token"`` (the token
		string) and ``"position"`` (that token's offsets as reported by
		``tokenize``).
		"""
		tokens, offsets = tokenize(text)
		pos_tags = nltk.pos_tag(tokens)
		# pos_tags is parallel to tokens/offsets; each entry is (token, tag).
		for i, tag in enumerate(pos_tags):
			if tag[1] in SUPERLATIVE_TAGS:
				yield {"token": tokens[i], "position": offsets[i]}


	if __name__ == "__main__":
		# Exactly one positional argument (the text to scan) is expected.
		if len(sys.argv) != 2:
			print('usage: python detect_superlatives.py "The world\'s greatest program"')
			sys.exit(1)
		# Print each detected superlative as an indented JSON object.
		for token in list(detect_superlatives(sys.argv[1])):
			print(json.dumps(token, indent=2))