Skip to content

Instantly share code, notes, and snippets.

@rpq
Created April 8, 2015 17:19
Show Gist options
  • Save rpq/7379de407fc66f254a62 to your computer and use it in GitHub Desktop.
Save rpq/7379de407fc66f254a62 to your computer and use it in GitHub Desktop.
Gets the byte offset at which each individual message starts in an mbox-format file (this implementation works on Gmail mbox files).
import re
class BufferFileReader(object):
    """Read a file in fixed-size chunks and locate mbox message separators.

    The reader remembers the offset it was last positioned at
    (``last_offset``) so that a match position inside data read since then
    can be converted to an absolute file offset with ``absolute_position``.

    Args:
        file_path: Path of the file to read.
        size: Maximum amount of data returned by each ``read()`` call.
    """

    # Separator recognised as the start of a message: "From - <weekday>".
    # A more general form would accept any non-space token in place of "-".
    _SEPARATOR_PATTERN = r'From [-] (Sun|Mon|Tue|Wed|Thu|Fri|Sat)'
    # Compiled once for text and once for bytes so ``match_position`` accepts
    # either string type (on Python 2 ``str`` and ``bytes`` are the same type).
    _TEXT_SEPARATOR = re.compile(_SEPARATOR_PATTERN)
    _BYTES_SEPARATOR = re.compile(_SEPARATOR_PATTERN.encode('ascii'))

    def __init__(self, file_path, size):
        self.last_offset = 0
        self.file_path = file_path
        self.size = size
        # Most recent chunk returned by the file; None until read() is called.
        self.buffer = None
        # Binary mode: the offsets handed to seek() are byte positions, and
        # text mode may translate newlines on some platforms.
        self.fp = open(self.file_path, 'rb')

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()

    def close(self):
        """Close the underlying file; safe to call more than once."""
        self.fp.close()

    def start_at(self, offset):
        """Seek to absolute ``offset`` and make it the base for later matches."""
        self.last_offset = offset
        print('seeking to %s from %s' % (offset, 0))
        self.fp.seek(offset, 0)
        print('fp is at %s' % (self.fp.tell(),))

    def get_index(self, fn):
        """Return the separator position in the most recently read chunk.

        ``fn`` is accepted for backward compatibility and is not used.
        Returns None when nothing has been read yet or no separator is found.
        """
        if not self.buffer:
            return None
        return self.match_position(self.buffer)

    def match_position(self, line):
        """Return the index in ``line`` where the first separator starts.

        Returns None when ``line`` contains no separator. The index is the
        start of the actual separator match, so an unrelated earlier
        occurrence of "From " does not affect the result.
        """
        if isinstance(line, bytes):
            pattern = self._BYTES_SEPARATOR
        else:
            pattern = self._TEXT_SEPARATOR
        match = pattern.search(line)
        if match is None:
            return None
        return match.start()

    def read(self):
        """Read the next chunk of up to ``size``; return None at end of file."""
        self.buffer = self.fp.read(self.size)
        if self.buffer:
            return self.buffer
        return None

    def absolute_position(self, position):
        """Convert ``position``, relative to ``last_offset``, to a file offset."""
        return self.last_offset + position
class ChunkFileReadStringMatcher(object):
    """Collect the absolute offsets of message separators using two readers.

    ``buffer1`` locates the separator that starts a message; ``buffer2`` is
    then positioned one byte past it and scans forward for the next
    separator. Both readers must provide ``read()``,
    ``match_position(data)``, ``absolute_position(index)``,
    ``start_at(offset)`` and an ``fp`` attribute with ``tell()``.
    """

    def __init__(self, buffer1, buffer2):
        self.buffer1 = buffer1
        self.buffer2 = buffer2

    @staticmethod
    def _scan_for_match(reader):
        """Read from ``reader`` until a separator is found or data runs out.

        Chunks are accumulated so that a separator straddling two reads is
        still detected. Returns the match position relative to where the
        scan started, or None if the end of the data is reached first.
        """
        data = reader.read()
        while data:
            position = reader.match_position(data)
            if position is not None:
                return position
            chunk = reader.read()
            if not chunk:
                # End of data: stop instead of concatenating None.
                return None
            data += chunk
        return None

    def get_indexes(self):
        """Return a flat list of absolute offsets, one per separator found.

        Offsets are appended in file order. A final message that is not
        followed by another separator contributes its start offset only.
        """
        indexes = []
        while True:
            index_one = self._scan_for_match(self.buffer1)
            if index_one is None:
                break
            start = self.buffer1.absolute_position(index_one)
            print('found index one %s (%s)' % (index_one, start))

            # Resume one byte past the match so the same separator is not
            # found again by the second reader.
            self.buffer2.start_at(start + 1)
            print('buffer2 changed to %s' % self.buffer2.fp.tell())

            index_two = self._scan_for_match(self.buffer2)
            if index_two is None:
                # Last message in the file: nothing follows it.
                indexes.append(start)
                break
            end = self.buffer2.absolute_position(index_two)
            print('found index two %s (%s)' % (index_two, end))

            indexes.append(start)
            indexes.append(end)
            print('found indexes %s, %s' % (start, end))

            # Continue the search for the next message start after ``end``.
            self.buffer1.start_at(end + 1)
            print('buffer1 changed to %s' % self.buffer1.fp.tell())

        print('complete!')
        print('here are all the %s' % (indexes,))
        return indexes
# Script entry: scan the file named 'Inbox' for message separator offsets,
# then store each offset as a document in RethinkDB.
file_path = 'Inbox'
buffer1 = BufferFileReader(file_path, 1024)
buffer2 = BufferFileReader(file_path, 1024)
chunked_file_reader = ChunkFileReadStringMatcher(buffer1, buffer2)
print('chunked file reader create')
try:
    indexes = chunked_file_reader.get_indexes()
finally:
    # Both readers hold an open handle on the same file; release them once
    # scanning is finished, whether or not it succeeded.
    buffer1.fp.close()
    buffer2.fp.close()
print('found %s indexes...' % len(indexes))
print('inserting into rethinkdb...')
import rethinkdb as r
# repl() registers this connection as the default used by later run() calls.
r.connect("localhost", 28015).repl()
for index in indexes:
    index_d = {'index': index}
    # The insert has to run here: its response is printed below, and leaving
    # it disabled made that print fail with NameError.
    insert_response = r.db('mailboy').table('mbox_indexes').insert(index_d).run()
    print('inserted %s into rethinkdb...' % index)
    print(insert_response)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment