Skip to content

Instantly share code, notes, and snippets.

@msukmanowsky
Created November 11, 2013 21:40
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save msukmanowsky/7420914 to your computer and use it in GitHub Desktop.
Save msukmanowsky/7420914 to your computer and use it in GitHub Desktop.
from collections import defaultdict
try:
import cStringIO as StringIO
except ImportError:
import StringIO
class EscapedLineReader(object):
    """Iterate over the logical lines of a file that may contain quoted newlines.

    A line delimiter only terminates a line when every quote character seen
    so far in that line has occurred an even number of times, i.e. the
    delimiter is not inside an open quoted section.

    Limitations:

    * This is not an efficient implementation and should move to Cython.
    * The line delimiter is assumed to be a single character, so Windows
      style CR+LF endings are not supported.
    * Quote balancing is a plain per-character count; a lone apostrophe in
      unquoted text leaves the count odd and swallows the following lines.
    * ``fp`` must support ``tell()`` and ``seek()``, and the value returned
      by ``tell()`` must advance by one per character read (true for
      byte-oriented files and in-memory string buffers).

    >>> with open('somefile.txt', 'r') as fp:
    ...     reader = EscapedLineReader(fp)
    ...     for line in reader:
    ...         do_stuff()
    """

    # Maximum number of characters requested from ``fp`` per ``read()`` call.
    MAX_BYTES = 1000

    def __init__(self, fp, quote_chars=None, line_delimiter='\n'):
        """Wrap ``fp`` for quote-aware line iteration.

        :param fp: seekable file-like object opened for reading text.
        :param quote_chars: iterable of single-character quote marks; when
            omitted, the single quote and the double quote are used.
        :param line_delimiter: single character that ends a line.
        """
        self.fp = fp
        # Build a fresh list per instance so the default is never shared
        # (and never mutated) across readers.
        if quote_chars is None:
            quote_chars = ['\'', '"']
        self.quote_chars = quote_chars
        self.line_delimiter = line_delimiter

    def __iter__(self):
        return self

    def _get_line(self):
        """Read and return the next logical line, delimiter included.

        The file is read in chunks of ``MAX_BYTES``; once the terminating
        delimiter is found the file pointer is rewound to just after it so
        the next call starts on the following line.

        :returns: the line text. The final line is returned without a
            delimiter if the file does not end with one.
        :raises StopIteration: when the file is exhausted.
        """
        start = self.fp.tell()  # where this line begins; needed for the rewind
        quote_chars = self.quote_chars
        delimiter = self.line_delimiter
        quote_counts = {}  # quote char -> occurrences seen in this line
        pieces = []        # chunks (or chunk prefixes) making up the line
        consumed = 0       # characters already stored in ``pieces``

        while True:
            chunk = self.fp.read(self.MAX_BYTES)
            if not chunk:
                # EOF. Only stop when nothing is pending; otherwise hand back
                # the unterminated tail instead of silently dropping it.
                if not pieces:
                    raise StopIteration()
                return ''.join(pieces)

            for index, char in enumerate(chunk):
                if char in quote_chars:
                    quote_counts[char] = quote_counts.get(char, 0) + 1
                if char == delimiter and all(
                        count % 2 == 0 for count in quote_counts.values()):
                    # Properly terminated line: keep the chunk up to and
                    # including the delimiter, then rewind the file pointer
                    # over the part of the chunk that was read ahead.
                    end = index + 1
                    pieces.append(chunk[:end])
                    self.fp.seek(start + consumed + end)
                    return ''.join(pieces)

            # No terminator in this chunk; keep it all and read on.
            pieces.append(chunk)
            consumed += len(chunk)

    def next(self):
        """Return the next logical line (Python 2 iterator protocol)."""
        return self._get_line()

    # Python 3 iterator protocol.
    __next__ = next
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment