@shawngraham
Last active December 10, 2023 01:11
Comparing the offshoreleaks database (https://offshoreleaks.icij.org/pages/database) against my knowledge-graph.csv in order to augment it. Blog post: https://carleton.ca/xlab/2023/trawling-data-the-panama-papers-and-our-knowledge-graph/
# (exact-match version of the compare script; this file's name is not shown on the page)
import argparse
import csv

# Initialize the argument parser
parser = argparse.ArgumentParser(description='Match entities from two CSV files and save the results to a new file.')
parser.add_argument('first_csv', help='The CSV file with "source" and "target" columns.')
parser.add_argument('second_csv', help='The CSV file to compare against.')
parser.add_argument('--output_csv', default='matched_rows.csv', help='The output CSV file with matched rows. Defaults to "matched_rows.csv".')

# Parse the command-line arguments
args = parser.parse_args()

# Load source and target entities from the first CSV
entities_set = set()
with open(args.first_csv, mode='r', newline='', encoding='utf-8-sig') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        entities_set.add(row['source'].strip())
        entities_set.add(row['target'].strip())

# Read the second CSV and append matches to the output CSV
match_count = 0  # Counter for matches
with open(args.second_csv, mode='r', newline='', encoding='utf-8-sig') as read_file, \
        open(args.output_csv, mode='a', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    # Check whether the output CSV is empty, to decide whether to write the header
    write_file.seek(0, 2)  # Go to the end of the file.
    if write_file.tell() == 0:  # Check if it's empty.
        write_header = True
    else:
        write_header = False
        write_file.seek(0)  # Go back to the beginning (harmless in append mode).
    csv_writer = csv.DictWriter(write_file, fieldnames=csv_reader.fieldnames)
    # If it's the first time writing to the file, write the header
    if write_header:
        csv_writer.writeheader()
    # Check each row in the second CSV for matches and append to the output CSV if matched
    for row in csv_reader:
        if row['name'].strip() in entities_set:
            csv_writer.writerow(row)
            match_count += 1

if match_count == 0:
    print("No matches were found.")
else:
    print(f"{match_count} matching {'row' if match_count == 1 else 'rows'} have been appended to the output CSV file.")
# 1_fuzzy_compare.py
import argparse
import csv
from rapidfuzz import process, fuzz

# Initialize the argument parser
parser = argparse.ArgumentParser(description='Match entities from two CSV files and save the results to a new file.')
parser.add_argument('first_csv', help='The CSV file with "source" and "target" columns.')
parser.add_argument('second_csv', help='The CSV file to compare against.')
parser.add_argument('--output_csv', default='matched_rows.csv', help='The output CSV file with matched rows.')
parser.add_argument('--score_threshold', type=int, default=90, help='The match score threshold for fuzzy matching.')

# Parse the command-line arguments
args = parser.parse_args()

# Load source and target entities from the first CSV
entities_set = set()
with open(args.first_csv, mode='r', newline='', encoding='utf-8-sig') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        entities_set.add(row['source'].strip())
        entities_set.add(row['target'].strip())

# Read the second CSV and append fuzzy matches to the output CSV
match_count = 0  # Counter for matches
with open(args.second_csv, mode='r', newline='', encoding='utf-8-sig') as read_file, \
        open(args.output_csv, mode='a', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    # Extend the fieldnames to include the matched_name field
    fieldnames = csv_reader.fieldnames + ['matched_name']
    csv_writer = csv.DictWriter(write_file, fieldnames=fieldnames)
    # If it's the first time writing to the file, write the header
    write_file.seek(0, 2)  # Go to the end of the file
    if write_file.tell() == 0:
        csv_writer.writeheader()
    # Check each row in the second CSV for fuzzy matches and append to the output CSV if matched
    for row in csv_reader:
        name_to_match = row['name'].strip()
        result = process.extractOne(name_to_match, entities_set, scorer=fuzz.WRatio, score_cutoff=args.score_threshold)
        # Check if a match was found and unpack the result (which includes the match, score, and index)
        if result:
            best_match, score, _ = result
            # Write the original row to the new file, appending the matched_name
            row['matched_name'] = best_match
            csv_writer.writerow(row)
            match_count += 1

if match_count == 0:
    print("No matches were found.")
else:
    print(f"{match_count} matching {'row' if match_count == 1 else 'rows'} have been appended to the output CSV file.")
# 2_relationshipper.py
import csv

# File paths for your CSV files
matched_rows_path = 'matched_rows.csv'
relationships_path = 'orig_data/relationships.csv'
output_relationships_path = 'output_relationships.csv'

# Load the node_id values from matched_rows.csv
node_ids_set = set()
with open(matched_rows_path, mode='r', newline='', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        node_ids_set.add(row['node_id'].strip())

# Read relationships.csv and write matches to a new CSV
match_count = 0  # Counter for matches
with open(relationships_path, mode='r', newline='', encoding='utf-8') as read_file, \
        open(output_relationships_path, mode='w', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    csv_writer = csv.DictWriter(write_file, fieldnames=csv_reader.fieldnames)
    # Write the header to the output file
    csv_writer.writeheader()
    # Check each row for a matching node_id_start or node_id_end and write it to the new file
    for row in csv_reader:
        if row['node_id_start'].strip() in node_ids_set or row['node_id_end'].strip() in node_ids_set:
            csv_writer.writerow(row)
            match_count += 1

if match_count == 0:
    print("No matching relationships were found.")
else:
    print(f"{match_count} matching {'relationship' if match_count == 1 else 'relationships'} have been written to the output CSV file.")
# 3_compare_node_ids.py
import argparse
import csv

# Initialize the argument parser
parser = argparse.ArgumentParser(description='Match entities from two CSV files and save the results to a new file.')
parser.add_argument('first_csv', help='The CSV file with "source" and "target" columns.')
parser.add_argument('second_csv', help='The CSV file to compare against.')
parser.add_argument('--output_csv', default='matched_rows.csv', help='The output CSV file with matched rows. Defaults to "matched_rows.csv".')

# Parse the command-line arguments
args = parser.parse_args()

# Load source and target node IDs from the first CSV
entities_set = set()
with open(args.first_csv, mode='r', newline='', encoding='utf-8-sig') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        entities_set.add(row['source'].strip())
        entities_set.add(row['target'].strip())
print(entities_set)  # Debug: show the IDs that were loaded

# Read the second CSV and append matches to the output CSV
match_count = 0  # Counter for matches
with open(args.second_csv, mode='r', newline='', encoding='utf-8-sig') as read_file, \
        open(args.output_csv, mode='a', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    # Check whether the output CSV is empty, to decide whether to write the header
    write_file.seek(0, 2)  # Go to the end of the file.
    if write_file.tell() == 0:  # Check if it's empty.
        write_header = True
    else:
        write_header = False
        write_file.seek(0)  # Go back to the beginning (harmless in append mode).
    csv_writer = csv.DictWriter(write_file, fieldnames=csv_reader.fieldnames)
    # If it's the first time writing to the file, write the header
    if write_header:
        csv_writer.writeheader()
    # Check each row in the second CSV for matches and append to the output CSV if matched
    for row in csv_reader:
        if row['node_id'].strip() in entities_set:
            csv_writer.writerow(row)
            match_count += 1

if match_count == 0:
    print("No matches were found.")
else:
    print(f"{match_count} matching {'row' if match_count == 1 else 'rows'} have been appended to the output CSV file.")
# 4_last_step.py
import argparse
import csv

# Set up the argument parser
parser = argparse.ArgumentParser(description='Replace node IDs with their corresponding names.')
parser.add_argument('csv1', help='The CSV file with "source" and "target" columns, containing numeric node identifiers.')
parser.add_argument('csv2', help='The CSV file with "node_id" and "name" columns, for mapping identifiers to names.')
parser.add_argument('--output_csv', default='named_relationships.csv', help='The output CSV file with "source" and "target" replaced by names.')

# Parse the command-line arguments
args = parser.parse_args()

# Build the mapping from node_id to name from csv2
node_id_to_name = {}
with open(args.csv2, mode='r', newline='', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        node_id_to_name[row['node_id'].strip()] = row['name'].strip()

# Read csv1 and replace node IDs with names using the mapping, then write to the new CSV
with open(args.csv1, mode='r', newline='', encoding='utf-8') as read_file, \
        open(args.output_csv, mode='w', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    # Keep just the 'source' and 'target' columns in the output
    fieldnames = ['source', 'target']
    csv_writer = csv.DictWriter(write_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    for row in csv_reader:
        # Use the mapping to replace each node ID with its name; fall back to the raw ID
        source_name = node_id_to_name.get(row['source'].strip(), row['source'])
        target_name = node_id_to_name.get(row['target'].strip(), row['target'])
        csv_writer.writerow({'source': source_name, 'target': target_name})

print(f"Names have been replaced in {args.output_csv}.")
# workflow.py
import os
import subprocess
import pandas as pd

# Define the paths to the original data
knowledge_graph_path = 'orig_data/knowledge-graph.csv'
entities_path = 'orig_data/nodes-entities.csv'
intermediaries_path = 'orig_data/nodes-intermediaries.csv'
officers_path = 'orig_data/nodes-officers.csv'

# Step 1: Run fuzzy_compare on various CSVs
print("And awaaaay we go! \n\n...fuzzy compare of kg to entities\n")
subprocess.run(['python', '1_fuzzy_compare.py', knowledge_graph_path, entities_path, '--output_csv', 'matched_rows.csv', '--score_threshold', '95'])
print("... fuzzy compare of kg to intermediaries\n")
subprocess.run(['python', '1_fuzzy_compare.py', knowledge_graph_path, intermediaries_path, '--output_csv', 'matched_rows.csv', '--score_threshold', '95'])
print("... fuzzy compare of kg to officers\n")
subprocess.run(['python', '1_fuzzy_compare.py', knowledge_graph_path, officers_path, '--output_csv', 'matched_rows.csv', '--score_threshold', '95'])

# Step 2: Process the matched rows to generate relationships
print("...digging out relationships...")
subprocess.run(['python', '2_relationshipper.py'])

# Manually selecting the first three columns of data is replaced by pandas
print("...sorting out relationships...")
df = pd.read_csv('output_relationships.csv', usecols=[0, 1, 2])
df.columns = ['source', 'target', 'rel_type']
df.to_csv('thirdstep.csv', index=False)

# Step 3: Compare node IDs and append to laststep.csv
print("...matching names to ids...")
subprocess.run(['python', '3_compare_node_ids.py', 'thirdstep.csv', entities_path, '--output_csv', 'laststep.csv'])
subprocess.run(['python', '3_compare_node_ids.py', 'thirdstep.csv', intermediaries_path, '--output_csv', 'laststep.csv'])
subprocess.run(['python', '3_compare_node_ids.py', 'thirdstep.csv', officers_path, '--output_csv', 'laststep.csv'])

# Final step: Combine the third step data with the last step
print("...just about done...")
subprocess.run(['python', '4_last_step.py', 'thirdstep.csv', 'laststep.csv'])

# Function to rearrange names from 'Lastname - Firstname Middlename' to 'Firstname Middlename Lastname'
def rearrange_names(name):
    if '-' in name:
        parts = name.split(' - ')
        if len(parts) == 2:
            return f"{parts[1]} {parts[0]}"
    return name  # Return the name unchanged if it doesn't fit the pattern
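# For example (made-up names): rearrange_names('Smith - John A') returns 'John A Smith',
# while 'ACME Holdings' contains no ' - ' separator and is returned unchanged.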
# Last element of workflow.py: rearrange names in named_relationships.csv
def rearrange_names_in_csv(csv_file_path):
    df = pd.read_csv(csv_file_path)
    df['source'] = df['source'].apply(rearrange_names)
    df.to_csv(csv_file_path, index=False)  # Overwrite the original CSV with the rearranged names

# Call the function to rearrange names
rearrange_names_in_csv('named_relationships.csv')
print("Names have been rearranged in named_relationships.csv.")