-
-
Save tecknicaltom/d50e9beb637d516be42b to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
Like strings(1) but only return stuff that is probably English (or whatever locale you set) | |
TODO: | |
* Add a parameter to ignore punctuation as token counts towards the threshold. | |
e.g. "Foo." vs "Foo........" would both return true. | |
* Add conditional for not counting digit only results if alphanumeric=True in is_natlang() | |
Copyleft 2015 Ian Gallagher <crash@neg9.org> | |
Dependencies:
pip install nltk | |
pip install pyenchant | |
""" | |
import sys, os, enchant, nltk | |
# Shared pyenchant dictionary consulted by is_natlang(); the locale is
# hard-coded to US English.
dictionary = enchant.Dict("en_US")
def is_natlang(data, threshold, alphanumeric):
    """Decide whether *data* looks like natural-language text.

    The text is split with ``nltk.wordpunct_tokenize``.  A token counts as
    natural language when it is longer than one character, consists only of
    alphanumeric characters and is accepted by the module-level
    ``dictionary``.

    Args:
        data: Text to classify (typically one line of input).
        threshold: Minimum fraction (0.0 - 1.0) of all tokens, punctuation
            tokens included, that must be natural-language tokens.
        alphanumeric: When true, tokens made up only of digits are never
            counted as natural language, even if the dictionary accepts
            them.  They still count towards the total number of tokens.

    Returns:
        True if the fraction of natural-language tokens is at least
        *threshold*; False otherwise, including for empty input.

    Raises:
        enchant.errors.Error: Re-raised from the dictionary lookup after the
            offending token has been written to stderr.
    """
    if not data:
        # Empty (or missing) input contains no tokens at all.
        return False

    tokens = nltk.wordpunct_tokenize(data)
    if not tokens:
        # Whitespace-only input; also protects the division below.
        return False

    num_natlang = 0
    for token in tokens:
        # Single characters and punctuation are never dictionary words.
        if len(token) <= 1 or not token.isalnum():
            continue
        # Numeric-only tokens are excluded on request.
        if alphanumeric and token.isdigit():
            continue
        try:
            if dictionary.check(token):
                num_natlang += 1
        except enchant.errors.Error:
            # Report the offending token on stderr so stdout stays clean for
            # the filtered data, then let the caller handle the error.
            sys.stderr.write("%r\n" % (token,))
            raise

    return float(num_natlang) / len(tokens) >= threshold
def do_stdin(threshold, alphanumeric):
    """Copy every stdin line that passes is_natlang() to stdout.

    Args:
        threshold: Passed through to is_natlang() as the minimum fraction of
            natural-language tokens a line must contain.
        alphanumeric: Passed through to is_natlang().

    Any exception raised while reading or classifying input is reported on
    stderr and ends the loop; it is not propagated to the caller.
    """
    try:
        for line in sys.stdin:
            if is_natlang(line, threshold, alphanumeric):
                sys.stdout.write(line)
                # Flush after each match so output appears immediately when
                # used in a pipeline.  This replaces re-opening stdout
                # unbuffered, which fails for text streams on Python 3.
                sys.stdout.flush()
    except Exception as ex:
        sys.stderr.write("Error reading/operating on stdin: %s\n" % str(ex))
def main():
    """Parse the command line and filter stdin through do_stdin().

    Returns:
        0, intended to be used as the process exit status.
    """
    import optparse

    parser = optparse.OptionParser(usage="Usage: %prog [options] - read input from stdin")
    # NOTE: --debug is parsed but its value is not used anywhere in this
    # function.
    parser.add_option('-d', '--debug', dest='debug', type='int', default=1,
                      help='Debug level (0, 1, 2; default 1)')
    parser.add_option('-t', '--threshold', dest='threshold', type='float', default=0.50,
                      help='Percentage of content to be natural language tokens (default 0.5 or 50%)')
    parser.add_option('-a', '--alpha', dest='alpha', action='store_true', default=False,
                      help='Return only alphanumeric results and not results which are numeric only.')

    # Positional arguments are accepted and ignored; input always comes from
    # stdin.  (The former ``len(args) < 0`` check could never be true.)
    options, _args = parser.parse_args()

    do_stdin(options.threshold, options.alpha)
    return 0
if __name__ == '__main__':
    # Run as a script: hand main()'s return value to the shell as the
    # process exit status.
    exit_status = main()
    sys.exit(exit_status)
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment