Last active
September 23, 2018 18:26
-
-
Save guyromm/9e93dfe5b0dce0e8ae7fffade01d36d9 to your computer and use it in GitHub Desktop.
synchronize a local .tsv file onto a google spreadsheet where row identity is calculated by composite keys
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
""" | |
synchronize data from a local file to a google spreadsheet, where rows are located according to keys. | |
achtung! lots of case-specific bits (marked with FIXME). use with caution. | |
tested on python 3.6.6 | |
""" | |
import gspread,json,sys | |
import threading,time # for rate limiting | |
from functools import wraps | |
from random import shuffle | |
shid = sys.argv[2] # google sheet to write to | |
cfn='auth.json' | |
from oauth2client.service_account import ServiceAccountCredentials | |
scope = ['https://spreadsheets.google.com/feeds', | |
'https://www.googleapis.com/auth/drive'] | |
do_not_update = ['notes'] # FIXME: case-specific. the list of columns not to update on the destination sheet. | |
credentials = ServiceAccountCredentials.from_json_keyfile_name(cfn, scope) | |
gc = gspread.authorize(credentials) | |
sh = gc.open_by_key(shid) | |
worksheet = sh.sheet1 | |
print('spreadsheet',sh.id) | |
makeheader=True | |
from fabfile import sslheader as sheetheader # FIXME: case-specific. this is a list of all columns | |
lastcol = chr(ord('A')+len(sheetheader)-1) | |
def rate_limited(max_per_second: int): | |
"""Rate-limits the decorated function locally, for one process.""" | |
lock = threading.Lock() | |
min_interval = 1.0 / max_per_second | |
def decorate(func): | |
last_time_called = time.perf_counter() | |
@wraps(func) | |
def rate_limited_function(*args, **kwargs): | |
lock.acquire() | |
nonlocal last_time_called | |
try: | |
elapsed = time.perf_counter() - last_time_called | |
left_to_wait = min_interval - elapsed | |
if left_to_wait > 0: | |
time.sleep(left_to_wait) | |
return func(*args, **kwargs) | |
finally: | |
last_time_called = time.perf_counter() | |
lock.release() | |
return rate_limited_function | |
return decorate | |
if makeheader: | |
# establish headers if absent | |
header = worksheet.row_values(1) | |
if header!=sheetheader: | |
print(header,'!=',sheetheader) | |
rng = 'A1:%s1'%lastcol | |
print('rng',rng) | |
cl = worksheet.range(rng) | |
i=0 | |
for c in cl: | |
print(i) | |
c.value = sheetheader[i] | |
print(c,'=',sheetheader[i]) | |
i+=1 | |
worksheet.update_cells(cl) | |
print('headers established') | |
else: | |
print('headers ok') | |
else: | |
header = sheetheader | |
# walk over each output row and look for its keys in existing worksheet | |
cnt=0 | |
keys = ['repo','hostname'] # FIXME: case-specific . these are the keys by which row uniqueness is identified | |
colvals={} | |
# obtain key column values | |
for k in keys: | |
kpos = header.index(k) | |
vals = worksheet.col_values(kpos+1)[1:] | |
colvals[k]=vals | |
rows = open(sys.argv[1],'r').read().split("\n")[1:] | |
shuffle(rows) | |
for row in rows: | |
cnt+=1 | |
r = row.split("\t") | |
if r==['']: continue | |
#assert len(header)==len(r),Exception(header,r) | |
try: | |
rv = dict([(header[i], | |
(len(r)-1>i and r[i] or None)) \ | |
for i in range(len(header))]) | |
except IndexError as e: | |
print('could not expand',r,'by',header) | |
raise | |
# determine if we have row in spread for this | |
def calcrowsfound(): | |
rowsfound={} | |
for k in keys: | |
rowsfound[k] = [i for i in range(len(colvals[k])) if colvals[k][i]==rv[k]] | |
return rowsfound | |
rowsfound = calcrowsfound() | |
# find the intersection of all these sets | |
isect = set.intersection(*[set(i[1]) for i in rowsfound.items()]) | |
if not len(isect): | |
@rate_limited(1) | |
def apnd(): | |
worksheet.resize(cols=len(header)) # otherwise append misbehaves.. | |
worksheet.append_row(r) | |
print('appending',r) | |
apnd() | |
else: | |
while len(isect)>1: | |
todel = isect.pop() | |
print('deleting row',todel,[rv[k] for k in keys]) | |
for k in keys: | |
print('colval',k,':',colvals[k][todel]) | |
del colvals[k][todel] | |
tdr = todel+1 | |
worksheet.delete_row(tdr) | |
rowsfound = calcrowsfound() | |
isect = set.intersection(*[set(i[1]) for i in rowsfound.items()]) | |
print('deleted row',tdr,worksheet.row_values(tdr)) | |
#isect,[rv[k] for k in keys]) | |
rownum = list(isect)[0]+2 | |
rng = 'A%s:%s%s'%(rownum,lastcol,rownum) | |
#print('going to update',rng,'=>',r) | |
@rate_limited(1) | |
def rngf(): | |
cl = worksheet.range(rng) | |
return cl | |
cl = rngf() | |
i=0 | |
changed=False | |
for c in cl: | |
if header[i] in ['valid_fr','valid_to']: # FIXME: case-specific | |
cmpf = lambda x: x.replace(' 0',' ') | |
else: | |
cmpf = lambda x: x | |
if (len(r)-1>i) and cmpf(c.value)!=cmpf(r[i]) and header[i] not in do_not_update: | |
jkey=(isect,'::'.join([rv[k] for k in keys])) | |
print(jkey,'.',c,'=',r[i],'(',i,header[i],')') | |
assert c.value!=r[i], Exception('but why?',c.value==r[i],header[i],type(c.value),type(r[i])) | |
c.value= r[i] | |
changed=True | |
i+=1 | |
if changed: | |
@rate_limited(1) | |
def updc(): | |
worksheet.update_cells(cl) | |
updc() | |
print(cnt,'/',len(rows)) |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment