rpgoldman/Dask column assignment snippet.py

## Dask column assignment snippet.py
# Load the background data (the CSV itself is not public, sorry).
df = dd.read_csv('r1c5va879uaex_r1c639xp952g4.csv', assume_missing=True)

# Adding a column requires metadata describing the resulting frame;
# without it, mysterious errors about failing to infer types show up.
# Start from a copy of the original metadata ...
newmeta = df._meta.copy()
# ... append a placeholder 'well' column after the existing columns ...
newmeta.insert(loc=newmeta.shape[1], column='well', value='foo')
# ... and declare the dtype of that new column.
newmeta = newmeta.astype(dtype={'well': str})

# the function we use to compute the new column values --
# note that efm suggests using a field splitter instead of
# regular-expression matching, because the regex approach is less efficient
def find_well(x):
    """Extract the well ID from a single ``id`` value.

    Args:
        x: The value to parse; must be a ``str``.

    Returns:
        The first capture group of ``id_re`` as matched at the start of ``x``.
        (``id_re`` is not defined in this snippet; it is expected to be
        provided elsewhere.)

    Raises:
        TypeError: If ``x`` is not a string.
        ValueError: If ``id_re`` does not match ``x``.
    """
    # Raise explicitly instead of asserting: asserts are stripped under
    # ``python -O``, which would let non-string values slip through.
    if not isinstance(x, str):
        raise TypeError(f"Trying to find well in non-string value {x}")
    match = re.match(id_re, x)
    if match is None:
        raise ValueError(f"Couldn't find well ID in {x}")
    return match.group(1)


# Element-wise version of find_well. Pinning ``otypes`` keeps the result a
# string array and allows size-0 input; without it np.vectorize raises
# ValueError when handed an empty array.
find_wellv = np.vectorize(find_well, otypes=[str])

# Compute the 'well' values partition by partition and attach them as a
# new column, using newmeta to describe the resulting frame.
df4 = df.map_partitions(
    lambda part: part.assign(well=find_wellv(part['id'])),
    meta=newmeta,
)

## Dask column assignment.ipynb

      
Display the source blob

    
Display the rendered blob

    
    Raw
  

              Dask column assignment.ipynb
            
          
      Sorry, something went wrong. Reload?
      Sorry, we cannot display this file.
      Sorry, this file is invalid so it cannot be displayed.
      
          Viewer requires iframe.
	# Load the background data (the CSV itself is not public, sorry).
	df = dd.read_csv('r1c5va879uaex_r1c639xp952g4.csv', assume_missing=True)

	# Adding a column requires metadata describing the resulting frame;
	# without it, mysterious errors about failing to infer types show up.
	# Start from a copy of the original metadata ...
	newmeta = df._meta.copy()
	# ... append a placeholder 'well' column after the existing columns ...
	newmeta.insert(loc=newmeta.shape[1], column='well', value='foo')
	# ... and declare the dtype of that new column.
	newmeta = newmeta.astype(dtype={'well': str})

	# the function we use to compute the new column values --
	# note that efm suggests using a field splitter instead of
	# regular-expression matching, because the regex approach is less efficient
	def find_well(x):
		"""Extract the well ID from a single ``id`` value.

		Args:
			x: The value to parse; must be a ``str``.

		Returns:
			The first capture group of ``id_re`` as matched at the start
			of ``x``. (``id_re`` is not defined in this snippet; it is
			expected to be provided elsewhere.)

		Raises:
			TypeError: If ``x`` is not a string.
			ValueError: If ``id_re`` does not match ``x``.
		"""
		# Raise explicitly instead of asserting: asserts are stripped under
		# ``python -O``, which would let non-string values slip through.
		if not isinstance(x, str):
			raise TypeError(f"Trying to find well in non-string value {x}")
		match = re.match(id_re, x)
		if match is None:
			raise ValueError(f"Couldn't find well ID in {x}")
		return match.group(1)


	# Element-wise version of find_well. Pinning ``otypes`` keeps the result
	# a string array and allows size-0 input; without it np.vectorize raises
	# ValueError when handed an empty array.
	find_wellv = np.vectorize(find_well, otypes=[str])

	# Compute the 'well' values partition by partition and attach them as a
	# new column, using newmeta to describe the resulting frame.
	df4 = df.map_partitions(
		lambda part: part.assign(well=find_wellv(part['id'])),
		meta=newmeta,
	)