Skip to content

Instantly share code, notes, and snippets.

@tecknicaltom
Forked from craSH/natstrings.py
Last active August 29, 2015 14:19
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save tecknicaltom/d50e9beb637d516be42b to your computer and use it in GitHub Desktop.
Save tecknicaltom/d50e9beb637d516be42b to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
"""
Like strings(1) but only return stuff that is probably English (or whatever locale you set)
TODO:
* Add a parameter to ignore punctuation tokens when counting towards the threshold,
e.g. so that "Foo." and "Foo........" would both return true.
* Add conditional for not counting digit only results if alphanumeric=True in is_natlang()
Copyleft 2015 Ian Gallagher <crash@neg9.org>
Dependencies:
pip install nltk
pip install pyenchant
"""
import sys, os, enchant, nltk
# Shared en_US spell-check dictionary; is_natlang() calls its check() method
# on each token to decide whether the token is a known word.
dictionary = enchant.Dict("en_US")
def is_natlang(data, threshold, alphanumeric):
    """
    Decide whether a string is probably natural language.

    The string is split with nltk.wordpunct_tokenize() and every token is
    looked up in the module-level enchant dictionary. A token counts as
    natural language when it is longer than one character, is alphanumeric
    and passes the dictionary check.

    Arguments:
    data         -- the text to classify.
    threshold    -- minimum fraction of all tokens (punctuation tokens
                    included) that must count as natural language.
    alphanumeric -- when true, tokens consisting only of digits are never
                    counted as natural language.

    Returns True when the fraction of natural-language tokens is at least
    threshold, and False otherwise (including when data yields no tokens).

    Raises enchant.errors.Error if the dictionary lookup fails; the offending
    token is reported on stderr before the exception is re-raised.
    """
    tokens = nltk.wordpunct_tokenize(data)
    if not tokens:
        return False

    num_natlang = 0
    for token in tokens:
        # Single characters and punctuation never count as words.
        if len(token) < 2 or not token.isalnum():
            continue
        # In alphanumeric mode a bare number is not treated as a word, even
        # though the spell checker may accept it.
        if alphanumeric and token.isdigit():
            continue
        try:
            if dictionary.check(token):
                num_natlang += 1
        except enchant.errors.Error:
            # Report on stderr so the diagnostic never mixes with the
            # filtered output on stdout.
            sys.stderr.write("%r\n" % (token,))
            raise

    return float(num_natlang) / float(len(tokens)) >= threshold
def do_stdin(threshold, alphanumeric):
    """
    Copy to stdout every line of stdin that is_natlang() accepts.

    Arguments:
    threshold    -- passed through to is_natlang().
    alphanumeric -- passed through to is_natlang().

    Output is flushed after each matching line, so results appear promptly
    when the tool is used in a pipeline. Any exception raised while reading
    or classifying is reported on stderr and ends processing; nothing is
    re-raised to the caller.
    """
    # The explicit flush() below makes an unbuffered re-open of stdout
    # unnecessary; os.fdopen(fd, 'w', 0) is also rejected by Python 3, which
    # does not allow unbuffered text streams.
    out = sys.stdout
    try:
        for line in sys.stdin:
            if is_natlang(line, threshold, alphanumeric):
                out.write(line)
                out.flush()
    except Exception as ex:
        sys.stderr.write("Error reading/operating on stdin: %s\n" % str(ex))
def main():
    """
    Command-line entry point: parse options and filter stdin.

    Options:
    -d / --debug     -- integer debug level (parsed but not currently used).
    -t / --threshold -- fraction of tokens that must be natural language.
    -a / --alpha     -- do not count digit-only tokens as natural language.

    Input is always read from stdin. A lone "-" positional argument is
    accepted as the conventional stdin marker; any other positional argument
    is a usage error.

    Returns the process exit status: 0 after processing stdin, 1 on a usage
    error.
    """
    import optparse

    parser = optparse.OptionParser(usage="Usage: %prog [options] - read input from stdin")

    parser.add_option('-d', '--debug', dest='debug', type='int', default=1,
                      help='Debug level (0, 1, 2; default 1)')
    parser.add_option('-t', '--threshold', dest='threshold', type='float', default=0.50,
                      help='Percentage of content to be natural language tokens (default 0.5 or 50%)')
    parser.add_option('-a', '--alpha', dest='alpha', action='store_true',
                      help='Return only alphanumeric results and not results which are numeric only.')

    (options, args) = parser.parse_args()

    # The original check (len(args) < 0) could never be true, so stray
    # arguments such as file names were silently ignored while the tool
    # waited on stdin. Reject them explicitly, on stderr so stdout stays clean.
    unexpected = [arg for arg in args if arg != '-']
    if unexpected:
        parser.print_usage(sys.stderr)
        return 1

    do_stdin(options.threshold, options.alpha)
    return 0
# Run main() only when executed as a script, and use its return value as the
# process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment