Skip to content

Instantly share code, notes, and snippets.

@salmoni
Created April 12, 2014 20:51
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
Star You must be signed in to star a gist
Save salmoni/10556194 to your computer and use it in GitHub Desktop.
Parses a single line of CSV with a set of (multiple!) delimiters and a set of (multiple!) quotation characters. Embedded quotes are kept honest (see the examples at the bottom of the file).
def ParseLine(line, delims, quotes):
    """
    Split a line of text into tokens, honouring several delimiter and
    quote characters at once.

    line   -- the text to split; it is scanned one character at a time.
    delims -- container of delimiter characters (e.g. the string ",; ").
    quotes -- container of quote characters (e.g. the string '"' + "'").

    Returns a list of token strings.

    Rules, in the order they are checked for each character:
      * Inside a quoted section only the quote character that opened it
        can close it. Delimiters and other quote characters are kept as
        literal text. The closing quote ends the current token at once,
        even when that token is empty (so "" yields an empty token).
      * Outside quotes, a delimiter ends the current token. Runs of
        adjacent delimiters never produce empty tokens.
      * Outside quotes, a quote character opens a quoted section. Text
        already collected for the current token is kept, so the quoted
        text is appended to it.
      * Any other character is added to the current token.

    If the line ends inside an unterminated quote, whatever was
    collected is still returned as the final token (when non-empty).
    """
    tokens = []        # finished tokens, in order of appearance
    buf = []           # characters of the token currently being built
    open_quote = None  # quote char that opened the current section, or None

    for char in line:
        if open_quote is not None:
            # Inside a quotation: only the matching quote char ends it.
            if char == open_quote:
                # Emit unconditionally so an empty quoted field survives.
                tokens.append(''.join(buf))
                buf = []
                open_quote = None
            else:
                buf.append(char)
        elif char in delims:
            # Only flush when something was collected; this is what
            # collapses adjacent delimiters.
            if buf:
                tokens.append(''.join(buf))
                buf = []
        elif char in quotes:
            # Remember which quote opened the section so that the other
            # quote characters can appear literally inside it.
            open_quote = char
        else:
            buf.append(char)

    # Flush a trailing token (also covers an unterminated quote).
    if buf:
        tokens.append(''.join(buf))
    return tokens
# Some slightly gnarly test data: mixed delimiters, plus quoted sections
# that contain delimiter characters and the "other" quote character.
delims = ",; " # comma, semi-colon and space as delimiters
quotes = '"'+"'" # double and single quotes together
# print(...) with a single argument gives the same output on Python 2 and 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
line = """Col1, 'col2"' col3; col4 "col5,;", col6"""
print(ParseLine(line, delims, quotes)) # ['Col1', 'col2"', 'col3', 'col4', 'col5,;', 'col6']
line = """1,2 3.3;4 5,6"""
print(ParseLine(line, delims, quotes)) # ['1', '2', '3.3', '4', '5', '6']
line = """6,5 4;3 2.2,1"""
print(ParseLine(line, delims, quotes)) # ['6', '5', '4', '3', '2.2', '1']
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment