Skip to content

Instantly share code, notes, and snippets.

@jmquintana79
Last active January 22, 2022 00:04
Show Gist options
  • Save jmquintana79/0ae540676993bf6cbbc61a1513326859 to your computer and use it in GitHub Desktop.
Save jmquintana79/0ae540676993bf6cbbc61a1513326859 to your computer and use it in GitHub Desktop.
Filter data stored into pandas dataframe
""" Filter by condition """
# conditional operators for pandas:
# EQUAL: ==
# DIFFERENT TO: !=
# BIGGER THAN: >
# LOWER THAN: <
# BIGGER OR EQUAL TO: >=
# LOWER OR EQUEAL TO: <=
# AND: &
# OR: |
# condition is set through single values
DF_filtered = DF[DF.column == value] # or other condition
DF_filtered = DF[(DF.column1 == value1) & (DF.column2 == value2)] # or other condition
""" Filtering by "isin()" method """
# condition is set through multiple values
#When values is a list:
df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
df.isin([1, 3, 12, 'a'])
"""
A B
0 True True
1 False False
2 True False
"""
# When values is a dict:
df = DataFrame({'A': [1, 2, 3], 'B': [1, 4, 7]})
df.isin({'A': [1, 3], 'B': [4, 7, 12]})
"""
A B
0 True False # Note that B didn't match the 1 here.
1 False True
2 True True
"""
# When values is a Series or DataFrame:
df = DataFrame({'A': [1, 2, 3], 'B': ['a', 'b', 'f']})
other = DataFrame({'A': [1, 3, 3, 2], 'B': ['e', 'f', 'f', 'e']})
df.isin(other)
"""
A B
0 True False
1 False False # Column A in `other` has a 3, but not at index 1.
2 True True
"""
# filter dataframe through a condition with values of list
df_filtered = df[df.column.isin(list_values_to_filter)]
# filter dataframe through a condition with values out of list
df_filtered = df[~df.column.isin(list_values_to_filter)]
# filter according to a column values if appear in a list
df[df.columna.isin(list_targets)]
# filter according to a string column value contain a substring
df[df.columna.map(lambda x: 'substr' in x)]
df[df.COLUMN.str.contains('substring')]
## filtering nan values
lindex = list(df[np.isnan(df["column"])].index) # only possible with float values
lindex = np.where(pd.isnull(df["column"])) # only possible with float values
lindex = df[pd.isnull(df["column"])].index.tolist() # always possible
df_filtered = df[~df.index.isin(lindex)]
## other way
df_filtered = df.dropna(inplace=False)
# dropping rows with NaN values in only a list of column
df.dropna(subset=['column_name'])
## other way with notna()
df[df.notna()]
df[df['column'].notna()]
""" duplicated """
# return True / False column
DF['duplicated'] = DF.duplicated(list_columns_considered, keep = 'first') # keep = {‘first’, ‘last’, False}, default ‘first’
# filter duplicated values
DF = DF[~DF['duplicated']]
# DROP DUPLICATES
df.drop_duplicates(['col1','col2'],keep='first',inplace=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment