# bwv988/pandas_data_munging.py

## pandas_data_munging.py
import pandas as pd
import numpy as np

# Goal: clean up col2, which records the equipment of each row as a
# space-separated string, by expanding it into one boolean column per
# equipment type.
data = {'col1': [1, 2, 4], 'col2': ["e70", "e70 e80 ua", "b20"]}

# Create a Pandas data frame.
df = pd.DataFrame(data)


def flatten(nested):
    """Return a flat list holding the items of every sub-list of *nested*.

    Equivalent to ``list(itertools.chain.from_iterable(nested))``.
    """
    return [item for sublist in nested for item in sublist]


# Tokenize each equipment string. ``str.split()`` without an argument splits
# on any run of whitespace and never yields empty strings, so repeated or
# leading/trailing spaces cannot create a bogus "" equipment column.
equip_raw = [e.split() for e in df["col2"]]

# Get unique equipment column names (np.unique also returns them sorted).
equip_cols = np.unique(flatten(equip_raw))

# Add one boolean column per equipment type: True when the row lists it.
# Exact token membership is used instead of ``Series.str.contains``, which
# performs regex *substring* matching and would, for example, flag "ua"
# inside "quad" or misinterpret tokens containing regex metacharacters.
for col in equip_cols:
    # str(col): use a plain ``str`` column label rather than ``numpy.str_``.
    df[str(col)] = [col in tokens for tokens in equip_raw]