# cmaureir/compare.py

## compare.py
import numpy as np
import pandas as pd

# content from '1.csv'
#
# a,b,c,d
# 1,0.1,"a-10","hello"
# 2,0.2,"a-11","hola"
# 3,0.3,"a-12","hallo"
# 4,0.5,"a-13","hello"

# content from '2.csv'
# other,other_c
# 3.1,"a-99"
# 4.2,"a-10"
# 6.3,"a-22"
# 1.5,"a-11"

df_a = pd.read_csv("1.csv")
df_b = pd.read_csv("2.csv")

# Keep only the columns used in the comparison (drops `d` from `1.csv`).
a = df_a[['a', 'b', 'c']]
b = df_b[['other', 'other_c']]

print(a)
print(b)
print("-"*10)

# Row-wise membership mask over `a`: True where the row's `c` value appears
# anywhere in `b['other_c']`. The mask has to be built from `a` itself; a
# mask built from `b` only lines up with `a` by row position, which selects
# unrelated rows and can raise when the two files differ in length.
in_b = a['c'].isin(b['other_c'])

print("values from `1.csv` that are in `2.csv` based on column `c`")
contains = a[in_b]
print(contains)
print("-"*10)

print("values from `1.csv` that are NOT in `2.csv` based on column `c`")
diff = a[~in_b]
print(diff)

# Output
#
#    a    b     c
# 0  1  0.1  a-10
# 1  2  0.2  a-11
# 2  3  0.3  a-12
# 3  4  0.5  a-13
#    other other_c
# 0    3.1    a-99
# 1    4.2    a-10
# 2    6.3    a-22
# 3    1.5    a-11
# ----------
# values from `1.csv` that are in `2.csv` based on column `c`
#    a    b     c
# 0  1  0.1  a-10
# 1  2  0.2  a-11
# ----------
# values from `1.csv` that are NOT in `2.csv` based on column `c`
#    a    b     c
# 2  3  0.3  a-12
# 3  4  0.5  a-13
#
	import numpy as np
	import pandas as pd

	# content from '1.csv'
	#
	# a,b,c,d
	# 1,0.1,"a-10","hello"
	# 2,0.2,"a-11","hola"
	# 3,0.3,"a-12","hallo"
	# 4,0.5,"a-13","hello"

	# content from '2.csv'
	# other,other_c
	# 3.1,"a-99"
	# 4.2,"a-10"
	# 6.3,"a-22"
	# 1.5,"a-11"

	df_a = pd.read_csv("1.csv")
	df_b = pd.read_csv("2.csv")

	# Keep only the columns used in the comparison (drops `d` from `1.csv`).
	a = df_a[['a', 'b', 'c']]
	b = df_b[['other', 'other_c']]

	print(a)
	print(b)
	print("-"*10)

	# Row-wise membership mask over `a`: True where the row's `c` value appears
	# anywhere in `b['other_c']`. The mask has to be built from `a` itself; a
	# mask built from `b` only lines up with `a` by row position, which selects
	# unrelated rows and can raise when the two files differ in length.
	in_b = a['c'].isin(b['other_c'])

	print("values from `1.csv` that are in `2.csv` based on column `c`")
	contains = a[in_b]
	print(contains)
	print("-"*10)

	print("values from `1.csv` that are NOT in `2.csv` based on column `c`")
	diff = a[~in_b]
	print(diff)

	# Output
	#
	# a b c
	# 0 1 0.1 a-10
	# 1 2 0.2 a-11
	# 2 3 0.3 a-12
	# 3 4 0.5 a-13
	# other other_c
	# 0 3.1 a-99
	# 1 4.2 a-10
	# 2 6.3 a-22
	# 3 1.5 a-11
	# ----------
	# values from `1.csv` that are in `2.csv` based on column `c`
	# a b c
	# 0 1 0.1 a-10
	# 1 2 0.2 a-11
	# ----------
	# values from `1.csv` that are NOT in `2.csv` based on column `c`
	# a b c
	# 2 3 0.3 a-12
	# 3 4 0.5 a-13
	#