Skip to content

Instantly share code, notes, and snippets.

@computercolin
Created February 27, 2019 06:52
Show Gist options
  • Save computercolin/52de5dc5349822ca088c40d744ef9718 to your computer and use it in GitHub Desktop.
Save computercolin/52de5dc5349822ca088c40d744ef9718 to your computer and use it in GitHub Desktop.
#
# Note, in all examples, could be class, or function, or for loop.
# In many cases, can replace with list comprehension + fn. But if fn will never
# be used with scalar input, I usually write it as a generator.
#
# Nothing that special about generators, they're just an easy way to make
# iteration producers or filters that are fairly readable.
#
# Can also chain together (producer, transformer, transformer, filter, etc).
#
## MK_USER_USER_FRIEND_FOLLOW_ABVR_MAP_FILE_TASK_GENER
#
# Object that takes paths and outputs file_tasks is simple abstraction so
# reasonable place to put code boundary.
# Makes calling code much easier to read.
#
def mk_user_user_inter_abvr_map_file_task_gener(fetchtype: str = None, data_in_dir=None,
                                                data_out_dir=None, stageno=None):
    """Yield one file-task dict per fetched input file.

    Input paths and their metadata come from ``get_fetch_infiles_fpaths_fmeta``.
    Each yielded dict has the keys ``name``, ``file_dep``, ``actions`` and
    ``targets``; its single action calls ``extract_write_user_user_abvr_map_f``
    with ``(in_fpath, out_fpath, stageno, fetchtype)``.

    Raises:
        InputDataError: when a file's metadata ``'td'`` entry differs from
            ``fetchtype``. The check runs lazily, as each task is requested.
    """
    src_paths, src_metas = get_fetch_infiles_fpaths_fmeta(data_in_dir, fetchtype)

    # Output names for *all* inputs are worked out before the first task is yielded.
    dst_names = []
    for meta in src_metas:
        dst_names.append(
            gen_out_fname_from_fetch_meta(meta, "stg-%d-abvrmap" % stageno, '.tsv'))

    for src_path, meta, dst_name in zip(src_paths, src_metas, dst_names):
        # Guard: refuse files whose declared data type is not the requested one.
        if meta['td'] != fetchtype:
            problem = ("Err File contains wrong data type! Expected %s"
                       " Got %s -- %s" % (fetchtype, meta['td'], meta['fname']))
            raise InputDataError(problem)

        dst_path = pathj(data_out_dir, dst_name)
        task_name = "stg%d-%s-%s" % (stageno, fetchtype, meta['sn'])
        action = (extract_write_user_user_abvr_map_f,
                  (src_path, dst_path, stageno, fetchtype))
        yield {
            'name': task_name,
            'file_dep': [src_path],
            'actions': [action],
            'targets': [dst_path],
        }
## UNAME_RAND_GENER
#
# Generate and don't stop.
# Note: Designed for few k outputs.
# If this were expected to exhaust int range, would need checks, mem efficiency.
#
def uname_rand_gener(min_id=100000, max_id=999999):
    """Yield unique random user names of the form ``u<id>.urlyte3.so.``.

    Ids are drawn uniformly from the inclusive range ``[min_id, max_id]`` and
    are never repeated. Once every id in the range has been handed out the
    generator finishes, rather than looping forever in search of an unused id.

    Designed for a few thousand outputs: every issued id is kept in memory,
    and rejection sampling gets slower as the range fills up.

    Args:
        min_id: smallest id that may be produced (inclusive).
        max_id: largest id that may be produced (inclusive).

    Raises:
        ValueError: if ``min_id`` is greater than ``max_id``. Because this is
            a generator, the error surfaces on the first iteration.
    """
    if min_id > max_id:
        raise ValueError("min_id (%r) must not exceed max_id (%r)" % (min_id, max_id))

    base = 'urlyte3.so.'
    pool_size = max_id - min_id + 1
    seen = set()
    while len(seen) < pool_size:
        # random.randint() is inclusive on BOTH ends, so max_id is passed as-is;
        # adding 1 would let an id above max_id slip out.
        n = random.randint(min_id, max_id)
        if n in seen:
            continue
        seen.add(n)
        yield "u%d.%s" % (n, base)
### CSV_ROWREADER
#
# Utility rowreader written in relatively few lines.
# It is easy to read "transform" code that uses multifile_csv_reader!
# It is easy to read these generators.
# Everyone wins!
#
def mk_multifile_csv_rowreader(tgt_fpaths, delimeter=',',
                               quotechar='"', escapechar='\\', lineterminator='\n'):
    """Yield rows, parsed as csv, from each file in tgt_fpaths.

    File contents are "chained" together into one continuous iteration.
    Requires "unix" style csv (each row spans only 1 line).

    Args:
        tgt_fpaths: iterable of file paths, read in the order given.
        delimeter: field separator. The (misspelled) parameter name is kept
            so existing keyword callers keep working.
        quotechar: quote character handed to ``csv.reader``.
        escapechar: escape character handed to ``csv.reader``.
        lineterminator: used as the ``newline`` argument when opening the
            files, and also forwarded to ``csv.reader``.

    Yields:
        One list of string fields per csv row.
    """
    linereader = adv_open_r_multifile_linereader(tgt_fpaths, text_mode=True, newline=lineterminator)
    # csv.reader's keyword is spelled "delimiter"; passing "delimeter" raises
    # TypeError as soon as the generator is first iterated.
    csvr = csv.reader(linereader, delimiter=delimeter, quotechar=quotechar,
                      escapechar=escapechar, lineterminator=lineterminator)
    yield from csvr
def adv_open_r_multifile_linereader(fpaths, text_mode=True, newline='\n', raise_on_missing=True,
                                    gzip_override_assume_gzip=DEF_GZIP_OVERRIDE_ASSUME_GZIPPED,
                                    gzip_heur_enable_file_ext_detect=DEF_GZIP_HEUR_ENABLE_FILE_EXT_DETECT,
                                    gzip_heur_enable_file_startbytes_detect=DEF_GZIP_HEUR_ENABLE_FILE_STARTBYTES_DETECT):
    """Yield the lines of every file in fpaths, one file after another.

    Files are opened in the order given via ``adv_open_r_fh`` and their
    contents are "chained" into a single continuous iteration. Each file is
    closed (via its context manager) once its lines are exhausted.

    Args:
        fpaths: iterable of file paths.
        text_mode, newline: forwarded unchanged to ``adv_open_r_fh``.
        raise_on_missing: when true, a ``FileNotFoundError`` from opening a
            file propagates; when false, that file is silently skipped.
        gzip_override_assume_gzip, gzip_heur_enable_file_ext_detect,
        gzip_heur_enable_file_startbytes_detect: forwarded unchanged to
            ``adv_open_r_fh``.
    """
    # The open options are the same for every file, so collect them once.
    open_opts = dict(
        text_mode=text_mode,
        newline=newline,
        gzip_override_assume_gzip=gzip_override_assume_gzip,
        gzip_heur_enable_file_ext_detect=gzip_heur_enable_file_ext_detect,
        gzip_heur_enable_file_startbytes_detect=gzip_heur_enable_file_startbytes_detect,
    )
    for path in fpaths:
        try:
            handle = adv_open_r_fh(path, **open_opts)
        except FileNotFoundError:
            if not raise_on_missing:
                continue
            raise
        else:
            with handle:
                yield from handle
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment