Skip to content

Instantly share code, notes, and snippets.

@guyromm
Last active September 23, 2018 18:26
Show Gist options
  • Save guyromm/9e93dfe5b0dce0e8ae7fffade01d36d9 to your computer and use it in GitHub Desktop.
Save guyromm/9e93dfe5b0dce0e8ae7fffade01d36d9 to your computer and use it in GitHub Desktop.
synchronize a local .tsv file onto a google spreadsheet where row identity is calculated by composite keys
#!/usr/bin/env python
"""
synchronize data from a local file to a google spreadsheet, where rows are located according to keys.
achtung! lots of case-specific bits (marked with FIXME). use with caution.
tested on python 3.6.6
"""
import gspread,json,sys
import threading,time # for rate limiting
from functools import wraps
from random import shuffle
shid = sys.argv[2] # google sheet to write to
cfn='auth.json'
from oauth2client.service_account import ServiceAccountCredentials
scope = ['https://spreadsheets.google.com/feeds',
'https://www.googleapis.com/auth/drive']
do_not_update = ['notes'] # FIXME: case-specific. the list of columns not to update on the destination sheet.
credentials = ServiceAccountCredentials.from_json_keyfile_name(cfn, scope)
gc = gspread.authorize(credentials)
sh = gc.open_by_key(shid)
worksheet = sh.sheet1
print('spreadsheet',sh.id)
makeheader=True
from fabfile import sslheader as sheetheader # FIXME: case-specific. this is a list of all columns
lastcol = chr(ord('A')+len(sheetheader)-1)
def rate_limited(max_per_second: int):
"""Rate-limits the decorated function locally, for one process."""
lock = threading.Lock()
min_interval = 1.0 / max_per_second
def decorate(func):
last_time_called = time.perf_counter()
@wraps(func)
def rate_limited_function(*args, **kwargs):
lock.acquire()
nonlocal last_time_called
try:
elapsed = time.perf_counter() - last_time_called
left_to_wait = min_interval - elapsed
if left_to_wait > 0:
time.sleep(left_to_wait)
return func(*args, **kwargs)
finally:
last_time_called = time.perf_counter()
lock.release()
return rate_limited_function
return decorate
if makeheader:
# establish headers if absent
header = worksheet.row_values(1)
if header!=sheetheader:
print(header,'!=',sheetheader)
rng = 'A1:%s1'%lastcol
print('rng',rng)
cl = worksheet.range(rng)
i=0
for c in cl:
print(i)
c.value = sheetheader[i]
print(c,'=',sheetheader[i])
i+=1
worksheet.update_cells(cl)
print('headers established')
else:
print('headers ok')
else:
header = sheetheader
# walk over each output row and look for its keys in existing worksheet
cnt=0
keys = ['repo','hostname'] # FIXME: case-specific . these are the keys by which row uniqueness is identified
colvals={}
# obtain key column values
for k in keys:
kpos = header.index(k)
vals = worksheet.col_values(kpos+1)[1:]
colvals[k]=vals
rows = open(sys.argv[1],'r').read().split("\n")[1:]
shuffle(rows)
for row in rows:
cnt+=1
r = row.split("\t")
if r==['']: continue
#assert len(header)==len(r),Exception(header,r)
try:
rv = dict([(header[i],
(len(r)-1>i and r[i] or None)) \
for i in range(len(header))])
except IndexError as e:
print('could not expand',r,'by',header)
raise
# determine if we have row in spread for this
def calcrowsfound():
rowsfound={}
for k in keys:
rowsfound[k] = [i for i in range(len(colvals[k])) if colvals[k][i]==rv[k]]
return rowsfound
rowsfound = calcrowsfound()
# find the intersection of all these sets
isect = set.intersection(*[set(i[1]) for i in rowsfound.items()])
if not len(isect):
@rate_limited(1)
def apnd():
worksheet.resize(cols=len(header)) # otherwise append misbehaves..
worksheet.append_row(r)
print('appending',r)
apnd()
else:
while len(isect)>1:
todel = isect.pop()
print('deleting row',todel,[rv[k] for k in keys])
for k in keys:
print('colval',k,':',colvals[k][todel])
del colvals[k][todel]
tdr = todel+1
worksheet.delete_row(tdr)
rowsfound = calcrowsfound()
isect = set.intersection(*[set(i[1]) for i in rowsfound.items()])
print('deleted row',tdr,worksheet.row_values(tdr))
#isect,[rv[k] for k in keys])
rownum = list(isect)[0]+2
rng = 'A%s:%s%s'%(rownum,lastcol,rownum)
#print('going to update',rng,'=>',r)
@rate_limited(1)
def rngf():
cl = worksheet.range(rng)
return cl
cl = rngf()
i=0
changed=False
for c in cl:
if header[i] in ['valid_fr','valid_to']: # FIXME: case-specific
cmpf = lambda x: x.replace(' 0',' ')
else:
cmpf = lambda x: x
if (len(r)-1>i) and cmpf(c.value)!=cmpf(r[i]) and header[i] not in do_not_update:
jkey=(isect,'::'.join([rv[k] for k in keys]))
print(jkey,'.',c,'=',r[i],'(',i,header[i],')')
assert c.value!=r[i], Exception('but why?',c.value==r[i],header[i],type(c.value),type(r[i]))
c.value= r[i]
changed=True
i+=1
if changed:
@rate_limited(1)
def updc():
worksheet.update_cells(cl)
updc()
print(cnt,'/',len(rows))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment