Skip to content

Instantly share code, notes, and snippets.

@jcreixell
Created May 15, 2012 09:39
Show Gist options
  • Save jcreixell/2700416 to your computer and use it in GitHub Desktop.
Save jcreixell/2700416 to your computer and use it in GitHub Desktop.
Google Doc pandas integration
import gspread
from pandas import *
import logging
class GDocError(Exception):
def __init__(self, value):
self.parameter = value
def __str__(self):
return repr(self.parameter)
class GDoc():
def __init__(self, user, passwd, key):
session = gspread.login(user, passwd)
self.spreadsheet = session.open_by_key(key)
def upsert(self, dataframe, index_col, blank_columns=0, sheet_name=None):
"""take care as the dataframe is expected to have an index in order to be able to update rows by key"""
if sheet_name is None:
worksheet = self.spreadsheet.sheet1
else:
worksheet = self.spreadsheet.worksheet(sheet_name)
total_sheet_rows = worksheet.row_count
cols = dataframe.shape[1]
index_col_nr = list(dataframe.columns).index(index_col) + 1
existing_ids = worksheet.col_values(index_col_nr + blank_columns)
existing_ids = [str(ex_id).lower() for ex_id in existing_ids]
nrows = len(existing_ids)
col_letter = self.__get_column_letter(blank_columns + cols)
first_col_letter = self.__get_column_letter(blank_columns + 1)
rangestr = '%s1:%s1' % (first_col_letter, col_letter)
cell_list = worksheet.range(rangestr)
for i, cell in enumerate(cell_list):
cell.value = dataframe.columns[i]
worksheet.update_cells(cell_list)
ids_to_insert = Series([str(id_to_insert).lower() for id_to_insert in dataframe[index_col]])
data_old = dataframe[ids_to_insert.isin(existing_ids)] # to lower case in order to be insensitive to typoes case/non case with key columns. is equal to the string matching in mysql
data_new = dataframe[-ids_to_insert.isin(existing_ids)]
# update existing indexes
if data_old.shape[0] > 0:
for data_id, row in data_old.T.iteritems():
if str(row[index_col]).lower() in existing_ids:
row_nr = existing_ids.index(str(row[index_col]).lower()) + 1
logging.debug("Updating row: %s" % row_nr)
rangestr = '%s%s:%s%s' % (first_col_letter, row_nr, col_letter, row_nr)
cell_list = worksheet.range(rangestr)
for i, cell in enumerate(cell_list):
if notnull(row[i]):
cell.value = str(row[i])
else:
cell.value = ""
worksheet.update_cells(cell_list)
# add new indexes
counter = 0 # needed to iterate through cell list which is not a matrix
new_rows = data_new.shape[0]
if new_rows > 0:
free_cells = total_sheet_rows - len(existing_ids) # +1 because of headers
if free_cells < new_rows:
worksheet.add_rows(new_rows - free_cells)
rangestr = '%s%s:%s%s' % (first_col_letter, nrows + 1, col_letter, nrows + new_rows)
cell_list = worksheet.range(rangestr)
for data_id, row in data_new.T.iteritems():
for i, cell in enumerate(cell_list[counter * cols:(counter * cols + cols)]):
if notnull(row[i]):
cell.value = str(row[i])
else:
cell.value = ""
counter += 1
# Then update the value parameter of this cells
worksheet.update_cells(cell_list)
def insert(self, dataframe, index_col, blank_columns=0, sheet_name=None):
"""take care as the dataframe is expected to have an index in order to be able to update rows by key"""
if sheet_name is None:
worksheet = self.spreadsheet.sheet1
else:
worksheet = self.spreadsheet.worksheet(sheet_name)
total_sheet_rows = worksheet.row_count
cols = dataframe.shape[1]
index_col_nr = list(dataframe.columns).index(index_col) + 1
existing_ids = worksheet.col_values(index_col_nr + blank_columns)
existing_ids = [str(ex_id).lower() for ex_id in existing_ids]
nrows = len(existing_ids)
col_letter = self.__get_column_letter(blank_columns + cols)
first_col_letter = self.__get_column_letter(blank_columns + 1)
rangestr = '%s1:%s1' % (first_col_letter, col_letter)
cell_list = worksheet.range(rangestr)
for i, cell in enumerate(cell_list):
cell.value = dataframe.columns[i]
worksheet.update_cells(cell_list)
ids_to_insert = Series([str(id_to_insert).lower() for id_to_insert in dataframe[index_col]])
data_new = dataframe[-ids_to_insert.isin(existing_ids)]
# add new indexes
counter = 0 # needed to iterate through cell list which is not a matrix
new_rows = data_new.shape[0]
if new_rows > 0:
free_cells = total_sheet_rows - len(existing_ids) # +1 because of headers
if free_cells < new_rows:
worksheet.add_rows(new_rows - free_cells)
rangestr = '%s%s:%s%s' % (first_col_letter, nrows + 1, col_letter, nrows + new_rows)
cell_list = worksheet.range(rangestr)
for data_id, row in data_new.T.iteritems():
for i, cell in enumerate(cell_list[counter * cols:(counter * cols + cols)]):
if notnull(row[i]):
cell.value = str(row[i])
else:
cell.value = ""
counter += 1
# Then update the value parameter of this cells
worksheet.update_cells(cell_list)
def retrieve_columns(self, index_col, sheet_name=None):
if sheet_name is None:
worksheet = self.spreadsheet.sheet1
else:
worksheet = self.spreadsheet.worksheet(sheet_name)
header = worksheet.row_values(1)
index_col_nr = header.index(index_col) + 1
data = worksheet.col_values(index_col_nr)
return data
def __get_column_letter(self, col_idx):
"""Convert a column number into a column letter (3 -> 'C')
Right shift the column col_idx by 26 to find column letters in reverse
order. These numbers are 1-based, and can be converted to ASCII
ordinals by adding 64.
"""
# these indicies corrospond to A -> ZZZ and include all allowed
# columns
if not 1 <= col_idx <= 18278:
raise GDocError('Column index out of bounds: %s' % col_idx)
ordinals = []
temp = col_idx
while temp:
quotient, remainder = divmod(temp, 26)
# check for exact division and borrow if needed
if remainder == 0:
quotient -= 1
remainder = 26
ordinals.append(remainder + 64)
temp = quotient
ordinals.reverse()
return ''.join([chr(ordinal) for ordinal in ordinals])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment