craSH/natstrings.py

## natstrings.py
#!/usr/bin/env python
"""
Like strings(1) but only return stuff that is probably English (or whatever locale you set)

TODO:
    * Add a parameter to exclude punctuation tokens from the count used for the threshold,
       e.g. so that "Foo." and "Foo........" would both return true.
    * Add conditional for not counting digit only results if alphanumeric=True in is_natlang()
    * Unicode support - currently it doesn't work with unicode, quite bad for non-english input

Copyleft 2015 Ian Gallagher <crash@neg9.org>

Dependencies:
    pip install nltk
    pip install pyenchant
"""
import sys, os, enchant, nltk, codecs

def is_natlang(data, options):
    """
    Decide whether a piece of text looks like natural language.

    The text is split with nltk.wordpunct_tokenize().  A token counts as
    natural language when it is longer than one character, is alphanumeric
    and is accepted by options.dictionary.check().  Every token (including
    punctuation) counts towards the total, so punctuation lowers the ratio.

    Arguments:
        data    -- the text to classify
        options -- object providing:
                     dictionary   -- object with a check(word) method
                     threshold    -- minimum fraction of tokens that must be
                                     natural language
                     alphanumeric -- optional; when true, tokens made up of
                                     digits only are not counted as natural
                                     language

    Returns True when the fraction of natural-language tokens is at least
    options.threshold, and False otherwise (including when the text yields
    no tokens at all).

    Raises enchant.errors.Error when the dictionary check fails; the
    offending token is reported on stderr before the error is re-raised.
    """
    tokens = nltk.wordpunct_tokenize(data)
    if not tokens:
        return False

    # Honour the -a/--alpha option: numeric-only tokens must not be counted.
    skip_digits = bool(getattr(options, 'alphanumeric', False))
    check = options.dictionary.check

    num_natlang = 0
    for token in tokens:
        if len(token) <= 1 or not token.isalnum():
            continue
        if skip_digits and token.isdigit():
            continue
        try:
            if check(token):
                num_natlang += 1
        except enchant.errors.Error:
            # stdout carries the filtered data, so diagnostics go to stderr.
            sys.stderr.write("Dictionary check failed for token %r\n" % (token,))
            raise

    return float(num_natlang) / len(tokens) >= options.threshold

def do_stdin(options):
    """
    Copy to stdout every line of stdin that is_natlang() accepts.

    Input bytes are decoded as latin-1 and matching lines are encoded back
    as latin-1, so the bytes written are the bytes that were read.  Output
    is flushed after every matching line.

    Arguments:
        options -- passed through unchanged to is_natlang()

    Returns None.  Any exception raised while reading or classifying is
    reported on stderr and ends processing; it is not propagated.
    """
    # Use the underlying byte streams where they exist (sys.stdin.buffer),
    # otherwise the stream itself, and leave sys.stdin/sys.stdout untouched.
    raw_in = getattr(sys.stdin, 'buffer', sys.stdin)
    raw_out = getattr(sys.stdout, 'buffer', sys.stdout)
    reader = codecs.getreader('latin-1')(raw_in)
    # Encode explicitly: writing decoded text to an unwrapped stdout fails
    # on non-ASCII characters when the output is a pipe or a file.
    writer = codecs.getwriter('latin-1')(raw_out)

    try:
        for line in reader:
            if is_natlang(line, options):
                writer.write(line)
                writer.flush()
    except Exception as ex:
        sys.stderr.write("Error reading/operating on stdin: %s\n" % str(ex))

def main():
    """
    Command-line entry point: parse options, load the dictionary and
    filter stdin.

    Returns 0 after stdin has been processed.  Exits through
    parser.error() (usage message on stderr, exit status 2) when the
    dictionary for the requested language cannot be loaded.
    """
    import optparse
    parser = optparse.OptionParser(usage="Usage: %prog [options] - read input from stdin")

    parser.add_option('-d', '--debug', dest='debug', type='int', default=1,
            help='Debug level (0, 1, 2; default 1)')
    parser.add_option('-l', '--language', dest='language', type='str', default='en_US',
            help='Language string to use for matching (e.g. en_US, the default)')
    parser.add_option('-t', '--threshold', dest='threshold', type='float', default=0.50,
            help='Percentage of content to be natural language tokens (default 0.5 or 50%)')
    parser.add_option('-a', '--alpha', dest='alphanumeric', action='store_true', default=False,
            help='Return only alphanumeric results and not results which are numeric only.')

    # Positional arguments are accepted and ignored; input always comes
    # from stdin.  (The former "len(args) < 0" check could never be true.)
    (options, args) = parser.parse_args()

    try:
        options.dictionary = enchant.Dict(options.language)
    except enchant.errors.Error as ex:
        # Report an unusable language as a usage error, not a traceback.
        parser.error("cannot load dictionary for language %r: %s" % (options.language, ex))

    do_stdin(options)

    return 0

# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == '__main__':
    exit_status = main()
    sys.exit(exit_status)

# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
	#!/usr/bin/env python
	"""
	Like strings(1) but only return stuff that is probably English (or whatever locale you set)

	TODO:
	* Add a parameter to exclude punctuation tokens from the count used for the threshold,
	e.g. so that "Foo." and "Foo........" would both return true.
	* Add conditional for not counting digit only results if alphanumeric=True in is_natlang()
	* Unicode support - currently it doesn't work with unicode, quite bad for non-english input

	Copyleft 2015 Ian Gallagher <crash@neg9.org>

	Dependencies:
	pip install nltk
	pip install pyenchant
	"""
	import sys, os, enchant, nltk, codecs

	def is_natlang(data, options):
	    """
	    Decide whether a piece of text looks like natural language.

	    The text is split with nltk.wordpunct_tokenize().  A token counts as
	    natural language when it is longer than one character, is alphanumeric
	    and is accepted by options.dictionary.check().  Every token (including
	    punctuation) counts towards the total, so punctuation lowers the ratio.

	    Arguments:
	        data    -- the text to classify
	        options -- object providing:
	                     dictionary   -- object with a check(word) method
	                     threshold    -- minimum fraction of tokens that must be
	                                     natural language
	                     alphanumeric -- optional; when true, tokens made up of
	                                     digits only are not counted as natural
	                                     language

	    Returns True when the fraction of natural-language tokens is at least
	    options.threshold, and False otherwise (including when the text yields
	    no tokens at all).

	    Raises enchant.errors.Error when the dictionary check fails; the
	    offending token is reported on stderr before the error is re-raised.
	    """
	    tokens = nltk.wordpunct_tokenize(data)
	    if not tokens:
	        return False

	    # Honour the -a/--alpha option: numeric-only tokens must not be counted.
	    skip_digits = bool(getattr(options, 'alphanumeric', False))
	    check = options.dictionary.check

	    num_natlang = 0
	    for token in tokens:
	        if len(token) <= 1 or not token.isalnum():
	            continue
	        if skip_digits and token.isdigit():
	            continue
	        try:
	            if check(token):
	                num_natlang += 1
	        except enchant.errors.Error:
	            # stdout carries the filtered data, so diagnostics go to stderr.
	            sys.stderr.write("Dictionary check failed for token %r\n" % (token,))
	            raise

	    return float(num_natlang) / len(tokens) >= options.threshold

	def do_stdin(options):
	    """
	    Copy to stdout every line of stdin that is_natlang() accepts.

	    Input bytes are decoded as latin-1 and matching lines are encoded back
	    as latin-1, so the bytes written are the bytes that were read.  Output
	    is flushed after every matching line.

	    Arguments:
	        options -- passed through unchanged to is_natlang()

	    Returns None.  Any exception raised while reading or classifying is
	    reported on stderr and ends processing; it is not propagated.
	    """
	    # Use the underlying byte streams where they exist (sys.stdin.buffer),
	    # otherwise the stream itself, and leave sys.stdin/sys.stdout untouched.
	    raw_in = getattr(sys.stdin, 'buffer', sys.stdin)
	    raw_out = getattr(sys.stdout, 'buffer', sys.stdout)
	    reader = codecs.getreader('latin-1')(raw_in)
	    # Encode explicitly: writing decoded text to an unwrapped stdout fails
	    # on non-ASCII characters when the output is a pipe or a file.
	    writer = codecs.getwriter('latin-1')(raw_out)

	    try:
	        for line in reader:
	            if is_natlang(line, options):
	                writer.write(line)
	                writer.flush()
	    except Exception as ex:
	        sys.stderr.write("Error reading/operating on stdin: %s\n" % str(ex))

	def main():
	    """
	    Command-line entry point: parse options, load the dictionary and
	    filter stdin.

	    Returns 0 after stdin has been processed.  Exits through
	    parser.error() (usage message on stderr, exit status 2) when the
	    dictionary for the requested language cannot be loaded.
	    """
	    import optparse
	    parser = optparse.OptionParser(usage="Usage: %prog [options] - read input from stdin")

	    parser.add_option('-d', '--debug', dest='debug', type='int', default=1,
	            help='Debug level (0, 1, 2; default 1)')
	    parser.add_option('-l', '--language', dest='language', type='str', default='en_US',
	            help='Language string to use for matching (e.g. en_US, the default)')
	    parser.add_option('-t', '--threshold', dest='threshold', type='float', default=0.50,
	            help='Percentage of content to be natural language tokens (default 0.5 or 50%)')
	    parser.add_option('-a', '--alpha', dest='alphanumeric', action='store_true', default=False,
	            help='Return only alphanumeric results and not results which are numeric only.')

	    # Positional arguments are accepted and ignored; input always comes
	    # from stdin.  (The former "len(args) < 0" check could never be true.)
	    (options, args) = parser.parse_args()

	    try:
	        options.dictionary = enchant.Dict(options.language)
	    except enchant.errors.Error as ex:
	        # Report an unusable language as a usage error, not a traceback.
	        parser.error("cannot load dictionary for language %r: %s" % (options.language, ex))

	    do_stdin(options)

	    return 0

	# Run main() only when executed as a script; its return value becomes the
	# process exit status.
	if __name__ == '__main__':
	    exit_status = main()
	    sys.exit(exit_status)

	# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4