tinybike/wordcount.py

## wordcount.py
#!/usr/bin/env python
"""
parse a text file and count the occurrences of words in the file
@author Jack Peterson (jack@tinybike.net)
"""
import sys
import math
import sqlite3

def wordcount(data):
    count = {}
    for item in data:
        count[item] = count[item] + 1 if item in count else 1
    return count

def convert_text_to_list(text, allowed=(' ', '\'', '-', '/')):
    text = text.strip().replace('\n', ' ')
    while '  ' in text:
        text = text.replace('  ', ' ')
    text = ''.join(e for e in text if e.isalnum() or e in allowed)
    return (word.strip('\'') for word in text.split())

def get_chunk(textfile, chunk_size=2048, line_breaks=(' ', '\r', '\n')):
    chunk = textfile.read(chunk_size)
    next_chunk = False
    while True:
        if not chunk: break
        if len(chunk) > 1 and chunk[-1] in line_breaks:
            next_chunk = True
            yield chunk
        if next_chunk:
            next_chunk = False
            chunk = textfile.read(chunk_size)
        else:
            next_char = textfile.read(1)
            if not next_char:
                yield chunk
                break
            chunk += next_char

def process_chunk(cur, chunk, case_sensitive=True):
    if not case_sensitive:
        chunk = chunk.lower()
    chunk = convert_text_to_list(chunk)
    chunk_count = wordcount(chunk)
    update_query = 'UPDATE counts SET cnt = cnt + ? WHERE word = ?'
    insert_query = 'INSERT INTO counts VALUES (?, ?)'
    for word in chunk_count:
        cur.execute(update_query, (chunk_count[word], word))
        if not cur.rowcount:
            cur.execute(insert_query, (word, chunk_count[word]))

def textparse(infile, outfile, fetch_rows=1000):
    conn = sqlite3.connect('count.db')
    with conn, open(infile, 'r') as textfile, open(outfile, 'w') as countfile:
        cur = conn.cursor()
        cur.execute('CREATE TABLE counts (word TEXT, cnt INT)')
        for chunk in get_chunk(textfile):
            if chunk:
                process_chunk(cur, chunk)
        cur.execute('SELECT COUNT(*) FROM counts')
        num_words = cur.fetchone()[0]
        num_queries = int(math.ceil(float(num_words) / float(fetch_rows)))
        for i in xrange(num_queries):
            cur.execute('SELECT * FROM counts LIMIT ? OFFSET ?', (fetch_rows, i*fetch_rows))
            for row in cur:
                print >>countfile, str(row[0]) + ',' + str(row[1])
        cur.execute('DROP TABLE IF EXISTS counts')

if __name__ == '__main__':
    infile = sys.argv[1]
    outfile = ''.join(infile.split('.')[:-1]) + '.csv'
    textparse(infile, outfile)
	#!/usr/bin/env python
	"""
	parse a text file and count the occurrences of words in the file
	@author Jack Peterson (jack@tinybike.net)
	"""
	import sys
	import math
	import sqlite3

	def wordcount(data):
	count = {}
	for item in data:
	count[item] = count[item] + 1 if item in count else 1
	return count

	def convert_text_to_list(text, allowed=(' ', '\'', '-', '/')):
	text = text.strip().replace('\n', ' ')
	while ' ' in text:
	text = text.replace(' ', ' ')
	text = ''.join(e for e in text if e.isalnum() or e in allowed)
	return (word.strip('\'') for word in text.split())

	def get_chunk(textfile, chunk_size=2048, line_breaks=(' ', '\r', '\n')):
	chunk = textfile.read(chunk_size)
	next_chunk = False
	while True:
	if not chunk: break
	if len(chunk) > 1 and chunk[-1] in line_breaks:
	next_chunk = True
	yield chunk
	if next_chunk:
	next_chunk = False
	chunk = textfile.read(chunk_size)
	else:
	next_char = textfile.read(1)
	if not next_char:
	yield chunk
	break
	chunk += next_char

	def process_chunk(cur, chunk, case_sensitive=True):
	if not case_sensitive:
	chunk = chunk.lower()
	chunk = convert_text_to_list(chunk)
	chunk_count = wordcount(chunk)
	update_query = 'UPDATE counts SET cnt = cnt + ? WHERE word = ?'
	insert_query = 'INSERT INTO counts VALUES (?, ?)'
	for word in chunk_count:
	cur.execute(update_query, (chunk_count[word], word))
	if not cur.rowcount:
	cur.execute(insert_query, (word, chunk_count[word]))

	def textparse(infile, outfile, fetch_rows=1000):
	conn = sqlite3.connect('count.db')
	with conn, open(infile, 'r') as textfile, open(outfile, 'w') as countfile:
	cur = conn.cursor()
	cur.execute('CREATE TABLE counts (word TEXT, cnt INT)')
	for chunk in get_chunk(textfile):
	if chunk:
	process_chunk(cur, chunk)
	cur.execute('SELECT COUNT(*) FROM counts')
	num_words = cur.fetchone()[0]
	num_queries = int(math.ceil(float(num_words) / float(fetch_rows)))
	for i in xrange(num_queries):
	cur.execute('SELECT * FROM counts LIMIT ? OFFSET ?', (fetch_rows, i*fetch_rows))
	for row in cur:
	print >>countfile, str(row[0]) + ',' + str(row[1])
	cur.execute('DROP TABLE IF EXISTS counts')

	if __name__ == '__main__':
	infile = sys.argv[1]
	outfile = ''.join(infile.split('.')[:-1]) + '.csv'
	textparse(infile, outfile)