Last active
August 29, 2015 14:05
-
-
Save tinybike/78c4506ef16a58e9eb86 to your computer and use it in GitHub Desktop.
parse a text file and count the occurrences of words in the file
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
parse a text file and count the occurrences of words in the file | |
@author Jack Peterson (jack@tinybike.net) | |
""" | |
import sys | |
import math | |
import sqlite3 | |
def wordcount(data): | |
count = {} | |
for item in data: | |
count[item] = count[item] + 1 if item in count else 1 | |
return count | |
def convert_text_to_list(text, allowed=(' ', '\'', '-', '/')): | |
text = text.strip().replace('\n', ' ') | |
while ' ' in text: | |
text = text.replace(' ', ' ') | |
text = ''.join(e for e in text if e.isalnum() or e in allowed) | |
return (word.strip('\'') for word in text.split()) | |
def get_chunk(textfile, chunk_size=2048, line_breaks=(' ', '\r', '\n')): | |
chunk = textfile.read(chunk_size) | |
next_chunk = False | |
while True: | |
if not chunk: break | |
if len(chunk) > 1 and chunk[-1] in line_breaks: | |
next_chunk = True | |
yield chunk | |
if next_chunk: | |
next_chunk = False | |
chunk = textfile.read(chunk_size) | |
else: | |
next_char = textfile.read(1) | |
if not next_char: | |
yield chunk | |
break | |
chunk += next_char | |
def process_chunk(cur, chunk, case_sensitive=True): | |
if not case_sensitive: | |
chunk = chunk.lower() | |
chunk = convert_text_to_list(chunk) | |
chunk_count = wordcount(chunk) | |
update_query = 'UPDATE counts SET cnt = cnt + ? WHERE word = ?' | |
insert_query = 'INSERT INTO counts VALUES (?, ?)' | |
for word in chunk_count: | |
cur.execute(update_query, (chunk_count[word], word)) | |
if not cur.rowcount: | |
cur.execute(insert_query, (word, chunk_count[word])) | |
def textparse(infile, outfile, fetch_rows=1000): | |
conn = sqlite3.connect('count.db') | |
with conn, open(infile, 'r') as textfile, open(outfile, 'w') as countfile: | |
cur = conn.cursor() | |
cur.execute('CREATE TABLE counts (word TEXT, cnt INT)') | |
for chunk in get_chunk(textfile): | |
if chunk: | |
process_chunk(cur, chunk) | |
cur.execute('SELECT COUNT(*) FROM counts') | |
num_words = cur.fetchone()[0] | |
num_queries = int(math.ceil(float(num_words) / float(fetch_rows))) | |
for i in xrange(num_queries): | |
cur.execute('SELECT * FROM counts LIMIT ? OFFSET ?', (fetch_rows, i*fetch_rows)) | |
for row in cur: | |
print >>countfile, str(row[0]) + ',' + str(row[1]) | |
cur.execute('DROP TABLE IF EXISTS counts') | |
if __name__ == '__main__': | |
infile = sys.argv[1] | |
outfile = ''.join(infile.split('.')[:-1]) + '.csv' | |
textparse(infile, outfile) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment