Skip to content

Instantly share code, notes, and snippets.

@cmaureir
Created March 7, 2019 12:14
Show Gist options
  • Save cmaureir/c63f66c36cbe92aa1d4435201714164a to your computer and use it in GitHub Desktop.
Save cmaureir/c63f66c36cbe92aa1d4435201714164a to your computer and use it in GitHub Desktop.
Compare two data frames by column values
import numpy as np
import pandas as pd
# content from '1.csv'
#
# a,b,c,d
# 1,0.1,"a-10","hello"
# 2,0.2,"a-11","hola"
# 3,0.3,"a-12","hallo"
# 4,0.5,"a-13","hello"
# content from '2.csv'
# other,other_c
# 3.1,"a-99"
# 4.2,"a-10"
# 6.3,"a-22"
# 1.5,"a-11"
def rows_matching(left, right, left_on, right_on):
    """Return the rows of ``left`` whose key also occurs in ``right``.

    Args:
        left: DataFrame to filter.
        right: DataFrame supplying the set of keys to look for.
        left_on: Name of the key column in ``left``.
        right_on: Name of the key column in ``right``.

    Returns:
        The subset of ``left`` (original index preserved) for which
        ``left[left_on]`` appears somewhere in ``right[right_on]``.
    """
    # The mask must be computed over ``left`` itself.  Building it from
    # ``right`` and indexing ``left`` with it only lines rows up by position,
    # which yields wrong rows and fails when the frames differ in length.
    return left[left[left_on].isin(right[right_on])]


def rows_not_matching(left, right, left_on, right_on):
    """Return the rows of ``left`` whose key does NOT occur in ``right``.

    Takes the same arguments as ``rows_matching`` and returns the
    complementary subset of ``left``.
    """
    return left[~left[left_on].isin(right[right_on])]


if __name__ == "__main__":
    df_a = pd.read_csv("1.csv")
    df_b = pd.read_csv("2.csv")

    # Keep only the columns that take part in the comparison.
    a = df_a[['a', 'b', 'c']]
    b = df_b[['other', 'other_c']]
    print(a)
    print(b)

    print("-"*10)
    print("values from `1.csv` that are in `2.csv` based on column `c`")
    contains = rows_matching(a, b, 'c', 'other_c')
    print(contains)

    print("-"*10)
    print("values from `1.csv` that are NOT in `2.csv` based on column `c`")
    diff = rows_not_matching(a, b, 'c', 'other_c')
    print(diff)

# Output
#
# a b c
# 0 1 0.1 a-10
# 1 2 0.2 a-11
# 2 3 0.3 a-12
# 3 4 0.5 a-13
# other other_c
# 0 3.1 a-99
# 1 4.2 a-10
# 2 6.3 a-22
# 3 1.5 a-11
# ----------
# values from `1.csv` that are in `2.csv` based on column `c`
# a b c
# 0 1 0.1 a-10
# 1 2 0.2 a-11
# ----------
# values from `1.csv` that are NOT in `2.csv` based on column `c`
# a b c
# 2 3 0.3 a-12
# 3 4 0.5 a-13
#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment