Skip to content

Instantly share code, notes, and snippets.

@matthewrobertbell
Created December 11, 2011 20:12
Show Gist options
  • Save matthewrobertbell/1462489 to your computer and use it in GitHub Desktop.
Save matthewrobertbell/1462489 to your computer and use it in GitHub Desktop.
Python Random Lines
import os.path
import random
class RandomLines(object):
    """Iterate over the non-blank lines of a file in random order.

    The constructor records the byte offset of every line that is not empty
    after stripping whitespace.  Each iteration step removes one offset at
    random, seeks to it and returns that line stripped, so every non-blank
    line is produced exactly once and the instance is exhausted afterwards.

    The offsets can be cached next to the source file in
    ``<filename>.lineindex`` (one decimal offset per line) so later
    instances skip the scan.  NOTE(review): an existing index file is
    trusted as-is; nothing here detects that the source changed after the
    index was written.

    Args:
        input_file: Either a path (opened here in binary mode) or an already
            open, seekable file object.  Offsets are computed from
            ``len(line)``, so a file object should be opened in binary mode
            for them to be byte-accurate.
        cache_index: When true and no index file exists yet, write the
            freshly built index to ``<filename>.lineindex``.  Ignored when
            the file object has no usable string ``name``.

    Works as an iterator on both Python 2 (``next``) and Python 3
    (``__next__``), and as a context manager that closes the source file on
    exit if this object opened it.
    """

    def __init__(self, input_file, cache_index=True):
        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str  # Python 3

        # Offsets of the lines that have not been returned yet.
        self.index = []

        if isinstance(input_file, string_types):
            self.source_file = open(input_file, 'rb')
            self._owns_file = True
            filename = input_file
        else:
            self.source_file = input_file
            self._owns_file = False
            # In-memory streams have no name and fd-backed files expose an
            # int; neither can be turned into an index path.
            filename = getattr(input_file, 'name', None)
            if not isinstance(filename, string_types):
                filename = None

        try:
            self._load_or_build_index(filename, cache_index)
        except Exception:
            # Do not leak the handle we opened if indexing fails.
            self.close()
            raise

    def _load_or_build_index(self, filename, cache_index):
        """Fill ``self.index`` from the cache file, or by scanning the source."""
        index_path = None if filename is None else filename + '.lineindex'

        if index_path is not None and os.path.isfile(index_path):
            with open(index_path) as index_file:
                for entry in index_file:
                    entry = entry.strip()
                    if entry:  # tolerate blank / trailing lines
                        self.index.append(int(entry))
            return

        self._build_index()
        if cache_index and index_path is not None:
            with open(index_path, 'w') as index_file:
                index_file.write(
                    '\n'.join(str(offset) for offset in self.index))

    def _build_index(self):
        """Scan the source file and record the offset of each non-blank line."""
        # Offsets are later used with an absolute seek, so counting has to
        # start at the beginning of the file, not at the current position.
        self.source_file.seek(0)
        offset = 0
        for line in self.source_file:
            if line.strip():
                self.index.append(offset)
            offset += len(line)

    def __iter__(self):
        return self

    def next(self):
        """Return one not-yet-returned line, stripped, chosen at random.

        Raises:
            StopIteration: when every indexed line has been returned.
        """
        if not self.index:
            raise StopIteration
        offset = self.index.pop(random.randrange(0, len(self.index)))
        self.source_file.seek(offset, 0)
        return self.source_file.readline().strip()

    # Python 3 iterator protocol.
    __next__ = next

    def close(self):
        """Close the source file if it was opened by this object."""
        if getattr(self, '_owns_file', False):
            self.source_file.close()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment