Navigation Menu

Skip to content

Instantly share code, notes, and snippets.

@gabefair
Created August 9, 2018 19:54
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gabefair/da976005c64e432030d479b4141956c2 to your computer and use it in GitHub Desktop.
Save gabefair/da976005c64e432030d479b4141956c2 to your computer and use it in GitHub Desktop.
merge rows of two csv files based on a column-key and place one of them ahead infront of the other file's rows after the merge
import sys, os
try:
import pandas as pd
except ImportError:
print("Please run `pip install pandas` from a command line")
exit()
def get_file_head(file_name):
with open(file_name, "r") as f:
counter = 0
for line in f:
print(line)
counter += 1
if counter == 5: break
def merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name):
# Extract only the matches
print("Now spliting matches from not matches")
new_and_old_matches = old_map_df[old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0)
remaining_old = old_map_df[~old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0)
file_name = source_file_column_name + "_and_" + target_file_column_name + "_merged.csv"
print("Now saving to disk")
# First dump the new ones to the top of the csv file
new_and_old_matches.to_csv(file_name, index=False)
# Now append the rows from the old file
remaining_old.to_csv(file_name, header=False, index=False, mode='a')
# pandas python2 bug adding index. Unable to drop or ignore it. Reading it back in to save it again without it
remaining_old = pd.read_csv(file_name, header=0, index_col='Unnamed: 0')
remaining_old.to_csv(file_name, index=False)
print("Merge Complete. Here is the merged file's head()")
get_file_head(file_name)
def read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format):
print("Reading in files now. This will take ~.15mins")
# Read in new file
if (target_file_format == '.csv'):
new_name_list = pd.read_csv(target_file, header=0)
new_name_list.columns = new_name_list.columns.str.strip() #removes any spaces that might be in the column names
else:
new_name_list = pd.read_csv(target_file, header=None)
# Read in old file
if (source_file_format == '.csv'):
old_map_df = pd.read_csv(source_file, header=0)
else:
old_map_df = pd.read_csv(source_file, header=None)
merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name)
def get_file_format(file_name):
filename, file_extension = os.path.splitext(file_name)
return file_extension
def reconcile_inputs(arg_length, target_file, source_file, source_file_column_name):
target_file_format = get_file_format(target_file)
source_file_format = get_file_format(source_file)
if (arg_length < 4):
target_file_column_name = source_file_column_name
else:
target_file_column_name = sys.argv[4]
read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format)
def main():
arg_length = len(sys.argv)
if (arg_length < 3):
print("Please specify at least 3 parameters: <name_of_file_to_merge_on> <name_of_sorce_file_to_merge_from> <source_file_column_name> <OPTIONAL: target_file_column_name>")
exit()
reconcile_inputs(arg_length, sys.argv[1], sys.argv[2], sys.argv[3])
if __name__ == '__main__':
main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment