Last active
August 29, 2015 14:19
-
-
Save craSH/3a49384d82dd94c5b172 to your computer and use it in GitHub Desktop.
Like strings(1) but only return stuff that is probably english (or whatever locale you set)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
"""
Like strings(1) but only return stuff that is probably English (or whatever locale you set)

TODO:
    * Add a parameter to ignore punctuation tokens when counting towards the threshold,
      e.g. "Foo." vs "Foo........" would both return true.
    * Add conditional for not counting digit-only results if alphanumeric=True in is_natlang()
    * Unicode support - currently it doesn't work with Unicode, which is quite bad for non-English input

Copyleft 2015 Ian Gallagher <crash@neg9.org>

Dependencies:
    pip install nltk
    pip install pyenchant
"""
import sys, os, enchant, nltk, codecs | |
def is_natlang(data, options):
    """Decide whether *data* looks like natural language.

    The text is split with ``nltk.wordpunct_tokenize``. A token counts as
    natural language when it is longer than one character, is alphanumeric,
    and is accepted by ``options.dictionary.check``. The result is the
    comparison of the natural-language fraction of all tokens against
    ``options.threshold``.

    Args:
        data: Text to classify (typically one line of input).
        options: Object providing ``dictionary`` (with a ``check(word)``
            method) and ``threshold`` (fraction between 0 and 1). If it has
            a truthy ``alphanumeric`` attribute, digit-only tokens are never
            counted as natural language.

    Returns:
        True when the fraction of natural-language tokens is at least
        ``options.threshold``; False otherwise, including for empty input.

    Raises:
        enchant.errors.Error: re-raised from the dictionary lookup after the
            offending token has been reported on stderr.
    """
    # Nothing to tokenize: empty input can never be natural language.
    if not data:
        return False

    tokens = nltk.wordpunct_tokenize(data)
    if not tokens:
        return False

    # Honour the -a/--alpha flag; absent attribute means "count digits too".
    skip_digit_only = bool(getattr(options, 'alphanumeric', False))

    num_natlang = 0
    for token in tokens:
        if len(token) < 2 or not token.isalnum():
            continue
        if skip_digit_only and token.isdigit():
            continue
        try:
            if options.dictionary.check(token):
                num_natlang += 1
        except enchant.errors.Error:
            # Report on stderr: stdout carries the filtered data stream.
            sys.stderr.write("Dictionary lookup failed for token %r\n" % (token,))
            raise

    # Punctuation and rejected tokens stay in the denominator on purpose.
    return float(num_natlang) / float(len(tokens)) >= options.threshold
def do_stdin(options):
    """Filter stdin to stdout, keeping only lines that look like natural language.

    Each input line is decoded as latin-1 (so every byte value is accepted),
    passed to ``is_natlang``, and, when accepted, written back encoded as
    latin-1 so the output bytes match the input bytes. Output is flushed
    after every accepted line.

    Args:
        options: Passed through unchanged to ``is_natlang``.

    Returns:
        None. Any exception raised while reading, classifying or writing is
        reported on stderr and ends the loop (best-effort behaviour).
    """
    try:
        # Python 3 exposes the byte stream as ``.buffer``; Python 2 file
        # objects are already byte streams.
        raw_in = getattr(sys.stdin, 'buffer', sys.stdin)
        raw_out = getattr(sys.stdout, 'buffer', sys.stdout)

        reader = codecs.getreader('latin-1')(raw_in)
        # Encode explicitly on the way out: writing decoded text straight to
        # a piped Python 2 stdout fails on any character above 0x7F.
        writer = codecs.getwriter('latin-1')(raw_out)

        for line in reader:
            if is_natlang(line, options):
                writer.write(line)
                writer.flush()
    except Exception as ex:
        sys.stderr.write("Error reading/operating on stdin: %s\n" % str(ex))
def main():
    """Parse command-line options, load the dictionary and filter stdin.

    Returns:
        0 after stdin has been processed; 1 when unexpected positional
        arguments were given (usage is printed on stderr). If the requested
        dictionary cannot be loaded, ``parser.error`` exits with status 2.
    """
    import optparse

    parser = optparse.OptionParser(usage="Usage: %prog [options] - read input from stdin")
    parser.add_option('-d', '--debug', dest='debug', type='int', default=1,
                      help='Debug level (0, 1, 2; default 1)')
    parser.add_option('-l', '--language', dest='language', type='str', default='en_US',
                      help='Language string to use for matching (e.g. en_US, the default)')
    parser.add_option('-t', '--threshold', dest='threshold', type='float', default=0.50,
                      help='Percentage of content to be natural language tokens (default 0.5 or 50%)')
    parser.add_option('-a', '--alpha', dest='alphanumeric', action='store_true', default=False,
                      help='Return only alphanumeric results and not results which are numeric only.')
    (options, args) = parser.parse_args()

    # Input only ever comes from stdin. A lone "-" (the conventional spelling
    # of "stdin") is tolerated; anything else is a usage error rather than
    # being silently ignored.
    if [arg for arg in args if arg != '-']:
        parser.print_usage(sys.stderr)
        return 1

    try:
        options.dictionary = enchant.Dict(options.language)
    except enchant.errors.Error as ex:
        # e.g. no dictionary installed for the requested language tag.
        parser.error("cannot load dictionary for %r: %s" % (options.language, ex))

    do_stdin(options)
    return 0
# Script entry point: run main() and use its return value as the process
# exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)
# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment