Last active
March 4, 2017 21:31
-
-
Save ubergarm/38711b7c9c15fce68c61a8ee6ad2f7b4 to your computer and use it in GitHub Desktop.
CLI Text Phrase Extraction using RAKE
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
# Example Usage:
# $ curl -s https://sherlock-holm.es/stories/plain-text/croo.txt | ./rake.py --limit=10 --minscore=5
# Download SmartStoplist.txt (the default --stoplist file name) from:
# https://github.com/fabianvf/python-rake
import sys | |
import RAKE | |
import argparse | |
# Characters removed from the input text before keyword extraction.
# NOTE(review): '<' is absent from this list although '>' is present --
# confirm whether that omission is intentional.
SCRUBLIST = '!"#$%&\'()*+,-./:;=>?@[\\]^_`{|}~'


def build_parser():
    """Return the argument parser describing the command-line interface."""
    parser = argparse.ArgumentParser()
    # Accept input from stdin or filename
    parser.add_argument('infile',
                        nargs='?',
                        type=argparse.FileType('r'),
                        default=sys.stdin,
                        help="text from stdin or file from which to extract keywords.")
    # Optionally pass in a custom stoplist
    parser.add_argument('--stoplist',
                        default="./SmartStoplist.txt",
                        help="text file with one stop word per line")
    # Optionally pass in max number of phrases to return
    parser.add_argument('--limit',
                        type=int,
                        default=5,
                        help="Return at most LIMIT phrases")
    # Optionally pass in minimum required score; the comparison is inclusive,
    # so the help text says "at least" rather than "greater than".
    parser.add_argument('--minscore',
                        type=float,
                        default=2.0,
                        help="Return only phrases with score of at least MINSCORE")
    return parser


def scrub_text(raw):
    """Return *raw* without SCRUBLIST characters and with single spaces.

    Every run of whitespace (including newlines) collapses to one space and
    leading/trailing whitespace is dropped.
    """
    stripped = raw.translate(str.maketrans('', '', SCRUBLIST))
    return " ".join(stripped.split())


def select_phrases(results, limit, minscore):
    """Return the (phrase, score) pairs to report.

    Only the first *limit* entries of *results* are considered, and of those
    only the ones whose score is >= *minscore* are kept. Phrases are returned
    with surrounding whitespace stripped.
    """
    return [(phrase.strip(), score)
            for phrase, score in results[:limit]
            if score >= minscore]


def main():
    """Read text, extract key phrases with RAKE and print `phrase,score` lines."""
    args = build_parser().parse_args()

    # Close the input handle as soon as the text has been read.
    with args.infile as infile:
        text = scrub_text(infile.read())

    # calculate results
    rake = RAKE.Rake(args.stoplist)
    # presumably Rake.run() yields (phrase, score) pairs ordered best-first,
    # so slicing to the limit keeps the top phrases -- verify against python-rake.
    ranked = rake.run(text)

    # output no more than the limit scores with minimum score of minscore
    for phrase, score in select_phrases(ranked, args.limit, args.minscore):
        print('{},{}'.format(phrase, score))


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Example output: