Skip to content

Instantly share code, notes, and snippets.

@tinybike
Last active August 29, 2015 14:05
Show Gist options
  • Save tinybike/78c4506ef16a58e9eb86 to your computer and use it in GitHub Desktop.
Save tinybike/78c4506ef16a58e9eb86 to your computer and use it in GitHub Desktop.
parse a text file and count the occurrences of words in the file
#!/usr/bin/env python
"""
parse a text file and count the occurrences of words in the file
@author Jack Peterson (jack@tinybike.net)
"""
import sys
import math
import sqlite3
def wordcount(data):
count = {}
for item in data:
count[item] = count[item] + 1 if item in count else 1
return count
def convert_text_to_list(text, allowed=(' ', '\'', '-', '/')):
text = text.strip().replace('\n', ' ')
while ' ' in text:
text = text.replace(' ', ' ')
text = ''.join(e for e in text if e.isalnum() or e in allowed)
return (word.strip('\'') for word in text.split())
def get_chunk(textfile, chunk_size=2048, line_breaks=(' ', '\r', '\n')):
chunk = textfile.read(chunk_size)
next_chunk = False
while True:
if not chunk: break
if len(chunk) > 1 and chunk[-1] in line_breaks:
next_chunk = True
yield chunk
if next_chunk:
next_chunk = False
chunk = textfile.read(chunk_size)
else:
next_char = textfile.read(1)
if not next_char:
yield chunk
break
chunk += next_char
def process_chunk(cur, chunk, case_sensitive=True):
if not case_sensitive:
chunk = chunk.lower()
chunk = convert_text_to_list(chunk)
chunk_count = wordcount(chunk)
update_query = 'UPDATE counts SET cnt = cnt + ? WHERE word = ?'
insert_query = 'INSERT INTO counts VALUES (?, ?)'
for word in chunk_count:
cur.execute(update_query, (chunk_count[word], word))
if not cur.rowcount:
cur.execute(insert_query, (word, chunk_count[word]))
def textparse(infile, outfile, fetch_rows=1000):
conn = sqlite3.connect('count.db')
with conn, open(infile, 'r') as textfile, open(outfile, 'w') as countfile:
cur = conn.cursor()
cur.execute('CREATE TABLE counts (word TEXT, cnt INT)')
for chunk in get_chunk(textfile):
if chunk:
process_chunk(cur, chunk)
cur.execute('SELECT COUNT(*) FROM counts')
num_words = cur.fetchone()[0]
num_queries = int(math.ceil(float(num_words) / float(fetch_rows)))
for i in xrange(num_queries):
cur.execute('SELECT * FROM counts LIMIT ? OFFSET ?', (fetch_rows, i*fetch_rows))
for row in cur:
print >>countfile, str(row[0]) + ',' + str(row[1])
cur.execute('DROP TABLE IF EXISTS counts')
if __name__ == '__main__':
infile = sys.argv[1]
outfile = ''.join(infile.split('.')[:-1]) + '.csv'
textparse(infile, outfile)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment