Skip to content

Instantly share code, notes, and snippets.

@craSH
Last active August 29, 2015 14:19
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 1 You must be signed in to fork a gist
  • Save craSH/3a49384d82dd94c5b172 to your computer and use it in GitHub Desktop.
Save craSH/3a49384d82dd94c5b172 to your computer and use it in GitHub Desktop.
Like strings(1) but only return stuff that is probably english (or whatever locale you set)
#!/usr/bin/env python
"""
Like strings(1) but only return stuff that is probably English (or whatever locale you set)
TODO:
* Add a parameter to ignore punctuation tokens when counting towards the threshold,
e.g. so that "Foo." and "Foo........" would both return true.
* Add conditional for not counting digit only results if alphanumeric=True in is_natlang()
* Unicode support - currently it doesn't work with unicode, quite bad for non-english input
Copyleft 2015 Ian Gallagher <crash@neg9.org>
Dependencies:
pip install nltk
pip install pyenchant
"""
import sys, os, enchant, nltk, codecs
def is_natlang(data, options):
    """
    Decide whether a piece of text looks like natural language.

    The text is split with nltk.wordpunct_tokenize(). A token counts as
    natural language when it is longer than one character, is purely
    alphanumeric and is accepted by options.dictionary.check(). Every token,
    including the ones that are not counted, contributes to the total, so
    the result is the fraction of natural-language tokens among all tokens.

    Parameters:
    data    -- the text to classify
    options -- object providing:
               dictionary   - object with a check(token) method (this script
                              stores an enchant.Dict here)
               threshold    - minimum fraction of tokens that must be natural
                              language for the text to be accepted
               alphanumeric - optional; when true, digit-only tokens are not
                              counted as natural language

    Returns True when the fraction of natural-language tokens is at least
    options.threshold, and False otherwise (including for empty input).

    Raises enchant.errors.Error when the dictionary lookup fails; the
    offending token is written to stderr before the error is re-raised.
    """
    # Nothing to tokenize, so there cannot be any natural-language tokens.
    if not data:
        return False

    tokens = nltk.wordpunct_tokenize(data)
    if not tokens:
        return False

    # The option may be absent when callers build their own options object.
    skip_digits = getattr(options, 'alphanumeric', False)

    num_natlang = 0
    for token in tokens:
        if len(token) < 2 or not token.isalnum():
            continue
        if skip_digits and token.isdigit():
            continue
        try:
            if options.dictionary.check(token):
                num_natlang += 1
        except enchant.errors.Error:
            # Diagnostics go to stderr: stdout carries the filtered output.
            sys.stderr.write(repr(token) + "\n")
            raise

    return float(num_natlang) / len(tokens) >= options.threshold
def do_stdin(options):
    """
    Copy to stdout every line of stdin that is_natlang() accepts.

    Input is decoded as latin-1 and output is encoded as latin-1 again, so
    every input byte maps to one character and back, and accepted lines are
    written out unchanged. Output is flushed after each accepted line so the
    filter can be used in a live pipeline.

    Parameters:
    options -- passed through unchanged to is_natlang()

    Returns None. Any exception raised while reading, classifying or writing
    is reported on stderr and ends processing; it is not re-raised.
    """
    # Python 3 exposes the underlying byte streams as .buffer; fall back to
    # the stream itself where that attribute does not exist (Python 2).
    raw_in = getattr(sys.stdin, 'buffer', sys.stdin)
    raw_out = getattr(sys.stdout, 'buffer', sys.stdout)

    # Wrap local streams instead of rebinding sys.stdin, and encode on the
    # way out so non-ASCII characters do not fail on a piped stdout.
    reader = codecs.getreader('latin-1')(raw_in)
    writer = codecs.getwriter('latin-1')(raw_out)

    try:
        for line in reader:
            if is_natlang(line, options):
                writer.write(line)
                writer.flush()
    except Exception as ex:
        sys.stderr.write("Error reading/operating on stdin: %s\n" % str(ex))
def main():
    """
    Command-line entry point: parse options, load the dictionary, filter stdin.

    Returns the process exit status: 1 when unexpected positional arguments
    are given or the dictionary for the requested language cannot be loaded,
    otherwise 0 once do_stdin() has returned.
    """
    import optparse

    parser = optparse.OptionParser(usage="Usage: %prog [options] - read input from stdin")
    parser.add_option('-d', '--debug', dest='debug', type='int', default=1,
                      help='Debug level (0, 1, 2; default 1)')
    parser.add_option('-l', '--language', dest='language', type='str', default='en_US',
                      help='Language string to use for matching (e.g. en_US, the default)')
    parser.add_option('-t', '--threshold', dest='threshold', type='float', default=0.50,
                      help='Percentage of content to be natural language tokens (default 0.5 or 50%)')
    parser.add_option('-a', '--alpha', dest='alphanumeric', action='store_true', default=False,
                      help='Return only alphanumeric results and not results which are numeric only.')
    (options, args) = parser.parse_args()

    # Input only ever comes from stdin, so a positional argument is a mistake.
    if args:
        parser.print_usage(sys.stderr)
        return 1

    try:
        options.dictionary = enchant.Dict(options.language)
    except enchant.errors.Error as ex:
        # E.g. no dictionary installed for the requested language.
        sys.stderr.write("Unable to load dictionary for %r: %s\n" % (options.language, ex))
        return 1

    do_stdin(options)
    return 0
# Run the filter only when executed as a script; main()'s return value
# becomes the process exit status.
if __name__ == '__main__':
    raise SystemExit(main())
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment