tecknicaltom/natstrings.py

## natstrings.py
#!/usr/bin/env python
"""
Like strings(1) but only return stuff that is probably English (or whatever locale you set)

TODO:
    * Add a parameter to exclude punctuation tokens from the count used for the threshold.
       e.g. "Foo." vs "Foo........" would both return true.
    * Add conditional for not counting digit only results if alphanumeric=True in is_natlang()

Copyleft 2015 Ian Gallagher <crash@neg9.org>

Dependencies:
    pip install nltk
    pip install pyenchant
"""
import sys, os, enchant, nltk

# Spell-checking dictionary consulted by is_natlang() via dictionary.check();
# "en_US" is the language tag handed to enchant.
dictionary = enchant.Dict("en_US")

def is_natlang(data, threshold, alphanumeric):
    """
    Decide whether a piece of text looks like natural language.

    The text is split with nltk.wordpunct_tokenize().  A token counts as
    natural language when it is longer than one character, is alphanumeric
    and is accepted by the module-level enchant dictionary.  Every token,
    punctuation included, counts towards the total.

    Parameters:
        data         - the text to examine
        threshold    - minimum fraction (0.0 to 1.0) of tokens that must be
                       natural language
        alphanumeric - when true, digit-only tokens are never counted as
                       natural language

    Returns True when the natural-language fraction is >= threshold and
    False otherwise (always False when the text yields no tokens).

    Raises enchant.errors.Error when the dictionary lookup fails; the
    offending token is reported on stderr before the error propagates.
    """
    tokens = nltk.wordpunct_tokenize(data)
    if not tokens:
        return False

    num_natlang = 0
    for token in tokens:
        # Single characters and punctuation never count as words.
        if len(token) <= 1 or not token.isalnum():
            continue
        # In alphanumeric mode, purely numeric tokens are not words either.
        if alphanumeric and token.isdigit():
            continue
        try:
            if dictionary.check(token):
                num_natlang += 1
        except enchant.errors.Error:
            # Diagnostics go to stderr so the filtered output stays clean.
            sys.stderr.write("%r\n" % (token,))
            raise

    return float(num_natlang) / float(len(tokens)) >= threshold

def do_stdin(threshold, alphanumeric):
    """
    Copy to stdout every line of stdin that is_natlang() accepts.

    Parameters:
        threshold    - passed through to is_natlang()
        alphanumeric - passed through to is_natlang()

    sys.stdout is replaced by a new file object opened on the same file
    descriptor with a buffer size of 0.  Any exception raised while reading
    or filtering is reported on stderr and not re-raised.
    """
    out = os.fdopen(sys.stdout.fileno(), 'w', 0)
    sys.stdout = out
    try:
        for text_line in sys.stdin:
            if not is_natlang(text_line, threshold, alphanumeric):
                continue
            out.write(text_line)
            out.flush()
    except Exception as err:
        print >> sys.stderr, "Error reading/operating on stdin: %s" % str(err)

def main():
    """
    Parse the command line and filter stdin through do_stdin().

    Options:
        -d / --debug      integer debug level (parsed, not used in this function)
        -t / --threshold  fraction of tokens that must be natural language
        -a / --alpha      enable alphanumeric mode

    Returns 0, which the caller uses as the process exit status.
    """
    import optparse
    parser = optparse.OptionParser(usage="Usage: %prog [options] - read input from stdin")

    parser.add_option('-d', '--debug', dest='debug', type='int', default=1,
            help='Debug level (0, 1, 2; default 1)')
    parser.add_option('-t', '--threshold', dest='threshold', type='float', default=0.50,
            help='Percentage of content to be natural language tokens (default 0.5 or 50%)')
    parser.add_option('-a', '--alpha', dest='alpha', action='store_true',
            help='Return only alphanumeric results and not results which are numeric only.')

    # Positional arguments are accepted but ignored: input always comes from
    # stdin.  (The previous `len(args) < 0` check could never be true.)
    (options, _args) = parser.parse_args()

    do_stdin(options.threshold, options.alpha)

    return 0

# Run main() only when executed as a script; its return value becomes the
# process exit status.
if __name__ == '__main__':
    sys.exit(main())

# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4
	#!/usr/bin/env python
	"""
	Like strings(1) but only return stuff that is probably English (or whatever locale you set)

	TODO:
	* Add a parameter to exclude punctuation tokens from the count used for the threshold.
	e.g. "Foo." vs "Foo........" would both return true.
	* Add conditional for not counting digit only results if alphanumeric=True in is_natlang()

	Copyleft 2015 Ian Gallagher <crash@neg9.org>

	Dependencies:
	pip install nltk
	pip install pyenchant
	"""
	import sys, os, enchant, nltk

	# Spell-checking dictionary consulted by is_natlang() via dictionary.check();
	# "en_US" is the language tag handed to enchant.
	dictionary = enchant.Dict("en_US")

	def is_natlang(data, threshold, alphanumeric):
	    """
	    Decide whether a piece of text looks like natural language.

	    The text is split with nltk.wordpunct_tokenize().  A token counts as
	    natural language when it is longer than one character, is alphanumeric
	    and is accepted by the module-level enchant dictionary.  Every token,
	    punctuation included, counts towards the total.

	    Parameters:
	        data         - the text to examine
	        threshold    - minimum fraction (0.0 to 1.0) of tokens that must be
	                       natural language
	        alphanumeric - when true, digit-only tokens are never counted as
	                       natural language

	    Returns True when the natural-language fraction is >= threshold and
	    False otherwise (always False when the text yields no tokens).

	    Raises enchant.errors.Error when the dictionary lookup fails; the
	    offending token is reported on stderr before the error propagates.
	    """
	    tokens = nltk.wordpunct_tokenize(data)
	    if not tokens:
	        return False

	    num_natlang = 0
	    for token in tokens:
	        # Single characters and punctuation never count as words.
	        if len(token) <= 1 or not token.isalnum():
	            continue
	        # In alphanumeric mode, purely numeric tokens are not words either.
	        if alphanumeric and token.isdigit():
	            continue
	        try:
	            if dictionary.check(token):
	                num_natlang += 1
	        except enchant.errors.Error:
	            # Diagnostics go to stderr so the filtered output stays clean.
	            sys.stderr.write("%r\n" % (token,))
	            raise

	    return float(num_natlang) / float(len(tokens)) >= threshold

	def do_stdin(threshold, alphanumeric):
	    """
	    Copy to stdout every line of stdin that is_natlang() accepts.

	    Parameters:
	        threshold    - passed through to is_natlang()
	        alphanumeric - passed through to is_natlang()

	    sys.stdout is replaced by a new file object opened on the same file
	    descriptor with a buffer size of 0.  Any exception raised while reading
	    or filtering is reported on stderr and not re-raised.
	    """
	    sys.stdout = os.fdopen(sys.stdout.fileno(), 'w', 0)
	    try:
	        for line in sys.stdin:
	            if is_natlang(line, threshold, alphanumeric):
	                sys.stdout.write(line)
	                sys.stdout.flush()
	    except Exception as ex:
	        print >> sys.stderr, "Error reading/operating on stdin: %s" % str(ex)

	def main():
	    """
	    Parse the command line and filter stdin through do_stdin().

	    Options:
	        -d / --debug      integer debug level (parsed, not used in this function)
	        -t / --threshold  fraction of tokens that must be natural language
	        -a / --alpha      enable alphanumeric mode

	    Returns 0, which the caller uses as the process exit status.
	    """
	    import optparse
	    parser = optparse.OptionParser(usage="Usage: %prog [options] - read input from stdin")

	    parser.add_option('-d', '--debug', dest='debug', type='int', default=1,
	            help='Debug level (0, 1, 2; default 1)')
	    parser.add_option('-t', '--threshold', dest='threshold', type='float', default=0.50,
	            help='Percentage of content to be natural language tokens (default 0.5 or 50%)')
	    parser.add_option('-a', '--alpha', dest='alpha', action='store_true',
	            help='Return only alphanumeric results and not results which are numeric only.')

	    # Positional arguments are accepted but ignored: input always comes from
	    # stdin.  (The previous `len(args) < 0` check could never be true.)
	    (options, _args) = parser.parse_args()

	    do_stdin(options.threshold, options.alpha)

	    return 0

	# Run main() only when executed as a script; its return value becomes the
	# process exit status.
	if '__main__' == __name__:
	    sys.exit(main())

	# vim: tabstop=8 expandtab shiftwidth=4 softtabstop=4