jmquintana79/filtering_pandas.py

## filtering_pandas.py
""" Filter by condition """
# conditional operators for pandas:
# EQUAL: ==
# DIFFERENT TO: !=
# BIGGER THAN: >
# LOWER THAN: <
# BIGGER OR EQUAL TO: >=
# LOWER OR EQUEAL TO: <=
# AND: &
# OR: |

# condition is set through single values
DF_filtered = DF[DF.column == value] # or other condition
DF_filtered = DF[(DF.column1 == value1) & (DF.column2 == value2)] # or other condition


""" Filtering by "isin()" method """
# condition is set through multiple values

#When values is a list:
df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
df.isin([1, 3, 12, 'a'])
"""
       A      B
0   True   True
1  False  False
2   True  False
"""

# When values is a dict:
df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
df.isin({'A': [1, 3], 'B': [4, 7, 12]})
"""
       A      B
0   True  False  # Note that B didn't match the 1 here.
1  False   True
2   True   True
"""

# When values is a Series or DataFrame:
df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
df.isin(other)
"""
       A      B
0   True  False
1  False  False  # Column A in `other` has a 3, but not at index 1.
2   True   True
"""

# filter dataframe through a condition with values of list
df_filtered = df[df.column.isin(list_values_to_filter)]

# filter dataframe through a condition with values out of list
df_filtered = df[~df.column.isin(list_values_to_filter)]

# filter according to a column values if appear in a list
df[df.columna.isin(list_targets)]

# filter according to a string column value contain a substring
df[df.columna.map(lambda x: 'substr' in x)]
df[df.COLUMN.str.contains('substring')]

## filtering nan values
lindex = list(df[np.isnan(df["column"])].index) # only possible with float values
lindex = np.where(pd.isnull(df["column"])) # only possible with float values
lindex = df[pd.isnull(df["column"])].index.tolist() # always possible
df_filtered = df[~df.index.isin(lindex)]

## other way
df_filtered = df.dropna(inplace=False)
# dropping rows with NaN values in only a list of column
df.dropna(subset=['column_name'])

## other way with notna()
df[df.notna()]
df[df['column'].notna()]

""" duplicated """
# return True / False column
DF['duplicated'] = DF.duplicated(list_columns_considered, keep = 'first') # keep = {‘first’, ‘last’, False}, default ‘first’
# filter duplicated values
DF = DF[~DF['duplicated']]

# DROP DUPLICATES
df.drop_duplicates(['col1','col2'],keep='first',inplace=True)
	""" Filter by condition """
	# conditional operators for pandas:
	# EQUAL: ==
	# DIFFERENT TO: !=
	# BIGGER THAN: >
	# LOWER THAN: <
	# BIGGER OR EQUAL TO: >=
	# LOWER OR EQUEAL TO: <=
	# AND: &
	# OR: \|

	# condition is set through single values
	DF_filtered = DF[DF.column == value] # or other condition
	DF_filtered = DF[(DF.column1 == value1) & (DF.column2 == value2)] # or other condition



	""" Filtering by "isin()" method """
	# condition is set through multiple values

	#When values is a list:
	df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
	df.isin([1, 3, 12, 'a'])
	"""
	A B
	0 True True
	1 False False
	2 True False
	"""

	# When values is a dict:
	df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
	df.isin({'A': [1, 3], 'B': [4, 7, 12]})
	"""
	A B
	0 True False # Note that B didn't match the 1 here.
	1 False True
	2 True True
	"""

	# When values is a Series or DataFrame:
	df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
	other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
	df.isin(other)
	"""
	A B
	0 True False
	1 False False # Column A in `other` has a 3, but not at index 1.
	2 True True
	"""

	# filter dataframe through a condition with values of list
	df_filtered = df[df.column.isin(list_values_to_filter)]

	# filter dataframe through a condition with values out of list
	df_filtered = df[~df.column.isin(list_values_to_filter)]

	# filter according to a column values if appear in a list
	df[df.columna.isin(list_targets)]

	# filter according to a string column value contain a substring
	df[df.columna.map(lambda x: 'substr' in x)]
	df[df.COLUMN.str.contains('substring')]

	## filtering nan values
	lindex = list(df[np.isnan(df["column"])].index) # only possible with float values
	lindex = np.where(pd.isnull(df["column"])) # only possible with float values
	lindex = df[pd.isnull(df["column"])].index.tolist() # always possible
	df_filtered = df[~df.index.isin(lindex)]

	## other way
	df_filtered = df.dropna(inplace=False)
	# dropping rows with NaN values in only a list of column
	df.dropna(subset=['column_name'])

	## other way with notna()
	df[df.notna()]
	df[df['column'].notna()]

	""" duplicated """
	# return True / False column
	DF['duplicated'] = DF.duplicated(list_columns_considered, keep = 'first') # keep = {‘first’, ‘last’, False}, default ‘first’
	# filter duplicated values
	DF = DF[~DF['duplicated']]

	# DROP DUPLICATES
	df.drop_duplicates(['col1','col2'],keep='first',inplace=True)