gabefair/setup.py

## setup.py
import sys, os

try:
    import pandas as pd
except ImportError:
    print("Please run `pip install pandas` from a command line")
    exit()


def get_file_head(file_name):
    with open(file_name, "r") as f:
        counter = 0
        for line in f:
            print(line)
            counter += 1
            if counter == 5: break


def merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name):
    # Extract only the matches
    print("Now spliting matches from not matches")

	new_and_old_matches = old_map_df[old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0)
	remaining_old = old_map_df[~old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0)


    file_name = source_file_column_name + "_and_" + target_file_column_name + "_merged.csv"

    print("Now saving to disk")
    # First dump the new ones to the top of the csv file
    new_and_old_matches.to_csv(file_name, index=False)

    # Now append the rows from the old file
    remaining_old.to_csv(file_name, header=False, index=False, mode='a')

    # pandas python2 bug adding index. Unable to drop or ignore it. Reading it back in to save it again without it
    remaining_old = pd.read_csv(file_name, header=0, index_col='Unnamed: 0')
    remaining_old.to_csv(file_name, index=False)

    print("Merge Complete. Here is the merged file's head()")
    get_file_head(file_name)


def read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format):
    print("Reading in files now. This will take ~.15mins")

    # Read in new file
    if (target_file_format == '.csv'):
        new_name_list = pd.read_csv(target_file, header=0)
        new_name_list.columns = new_name_list.columns.str.strip() #removes any spaces that might be in the column names
    else:
        new_name_list = pd.read_csv(target_file, header=None)

    # Read in old file
    if (source_file_format == '.csv'):
        old_map_df = pd.read_csv(source_file, header=0)
    else:
        old_map_df = pd.read_csv(source_file, header=None)

    merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name)


def get_file_format(file_name):
    filename, file_extension = os.path.splitext(file_name)
    return file_extension


def reconcile_inputs(arg_length, target_file, source_file, source_file_column_name):
    target_file_format = get_file_format(target_file)
    source_file_format = get_file_format(source_file)

	if (arg_length < 4):
		target_file_column_name = source_file_column_name
	else:
		target_file_column_name = sys.argv[4]


    read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format)


def main():
    arg_length = len(sys.argv)
    if (arg_length < 3):
        print("Please specify at least 3 parameters: <name_of_file_to_merge_on> <name_of_sorce_file_to_merge_from> <source_file_column_name> <OPTIONAL: target_file_column_name>")
        exit()
    reconcile_inputs(arg_length, sys.argv[1], sys.argv[2], sys.argv[3])


if __name__ == '__main__':
    main()
	import sys, os

	try:
	import pandas as pd
	except ImportError:
	print("Please run `pip install pandas` from a command line")
	exit()


	def get_file_head(file_name):
	with open(file_name, "r") as f:
	counter = 0
	for line in f:
	print(line)
	counter += 1
	if counter == 5: break


	def merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name):
	# Extract only the matches
	print("Now spliting matches from not matches")

	new_and_old_matches = old_map_df[old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0)
	remaining_old = old_map_df[~old_map_df.full_name_h.isin(new_name_list[target_file_column_name])]#.dropna(how='any', axis=0)


	file_name = source_file_column_name + "_and_" + target_file_column_name + "_merged.csv"

	print("Now saving to disk")
	# First dump the new ones to the top of the csv file
	new_and_old_matches.to_csv(file_name, index=False)

	# Now append the rows from the old file
	remaining_old.to_csv(file_name, header=False, index=False, mode='a')

	# pandas python2 bug adding index. Unable to drop or ignore it. Reading it back in to save it again without it
	remaining_old = pd.read_csv(file_name, header=0, index_col='Unnamed: 0')
	remaining_old.to_csv(file_name, index=False)

	print("Merge Complete. Here is the merged file's head()")
	get_file_head(file_name)


	def read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format):
	print("Reading in files now. This will take ~.15mins")

	# Read in new file
	if (target_file_format == '.csv'):
	new_name_list = pd.read_csv(target_file, header=0)
	new_name_list.columns = new_name_list.columns.str.strip() #removes any spaces that might be in the column names
	else:
	new_name_list = pd.read_csv(target_file, header=None)

	# Read in old file
	if (source_file_format == '.csv'):
	old_map_df = pd.read_csv(source_file, header=0)
	else:
	old_map_df = pd.read_csv(source_file, header=None)

	merge_files(new_name_list, old_map_df, source_file_column_name, target_file_column_name)


	def get_file_format(file_name):
	filename, file_extension = os.path.splitext(file_name)
	return file_extension


	def reconcile_inputs(arg_length, target_file, source_file, source_file_column_name):
	target_file_format = get_file_format(target_file)
	source_file_format = get_file_format(source_file)

	if (arg_length < 4):
	target_file_column_name = source_file_column_name
	else:
	target_file_column_name = sys.argv[4]


	read_in_files(target_file, source_file, source_file_column_name, target_file_column_name, target_file_format, source_file_format)


	def main():
	arg_length = len(sys.argv)
	if (arg_length < 3):
	print("Please specify at least 3 parameters: <name_of_file_to_merge_on> <name_of_sorce_file_to_merge_from> <source_file_column_name> <OPTIONAL: target_file_column_name>")
	exit()
	reconcile_inputs(arg_length, sys.argv[1], sys.argv[2], sys.argv[3])


	if __name__ == '__main__':
	main()