@shawngraham
Last active December 10, 2023 01:11
Comparing the offshoreleaks database (https://offshoreleaks.icij.org/pages/database) against my knowledge-graph.csv in order to augment it. Blog post: https://carleton.ca/xlab/2023/trawling-data-the-panama-papers-and-our-knowledge-graph/
# (exact-match version of the compare script; this file's name is not shown on the page)
import argparse
import csv

# Initialize the argument parser
parser = argparse.ArgumentParser(description='Match entities from two CSV files and save the results to a new file.')
parser.add_argument('first_csv', help='The CSV file with "source" and "target" columns.')
parser.add_argument('second_csv', help='The CSV file to compare against.')
parser.add_argument('--output_csv', default='matched_rows.csv', help='The output CSV file with matched rows. Defaults to "matched_rows.csv".')

# Parse the command-line arguments
args = parser.parse_args()

# Load source and target entities from the first CSV
entities_set = set()
with open(args.first_csv, mode='r', newline='', encoding='utf-8-sig') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        entities_set.add(row['source'].strip())
        entities_set.add(row['target'].strip())

# Read the second CSV and append matches to the output CSV
match_count = 0  # Counter for matches
with open(args.second_csv, mode='r', newline='', encoding='utf-8-sig') as read_file, \
        open(args.output_csv, mode='a', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    # Check whether the output CSV is empty, to decide whether to write the header
    write_file.seek(0, 2)  # Go to the end of the file.
    if write_file.tell() == 0:  # Check if it's empty.
        write_header = True
    else:
        write_header = False
        write_file.seek(0)  # Go back to the beginning (harmless in append mode).
    csv_writer = csv.DictWriter(write_file, fieldnames=csv_reader.fieldnames)
    # If it's the first time writing to the file, write the header
    if write_header:
        csv_writer.writeheader()
    # Check each row in the second CSV for matches and append to the output CSV if matched
    for row in csv_reader:
        if row['name'].strip() in entities_set:
            csv_writer.writerow(row)
            match_count += 1

if match_count == 0:
    print("No matches were found.")
else:
    print(f"{match_count} matching {'row' if match_count == 1 else 'rows'} have been appended to the output CSV file.")
# 1_fuzzy_compare.py
import argparse
import csv
from rapidfuzz import process, fuzz

# Initialize the argument parser
parser = argparse.ArgumentParser(description='Match entities from two CSV files and save the results to a new file.')
parser.add_argument('first_csv', help='The CSV file with "source" and "target" columns.')
parser.add_argument('second_csv', help='The CSV file to compare against.')
parser.add_argument('--output_csv', default='matched_rows.csv', help='The output CSV file with matched rows.')
parser.add_argument('--score_threshold', type=int, default=90, help='The match score threshold for fuzzy matching.')

# Parse the command-line arguments
args = parser.parse_args()

# Load source and target entities from the first CSV
entities_set = set()
with open(args.first_csv, mode='r', newline='', encoding='utf-8-sig') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        entities_set.add(row['source'].strip())
        entities_set.add(row['target'].strip())

# Read the second CSV and append fuzzy matches to the output CSV
match_count = 0  # Counter for matches
with open(args.second_csv, mode='r', newline='', encoding='utf-8-sig') as read_file, \
        open(args.output_csv, mode='a', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    # Extend the fieldnames to include the matched_name field
    fieldnames = csv_reader.fieldnames + ['matched_name']
    csv_writer = csv.DictWriter(write_file, fieldnames=fieldnames)
    # If it's the first time writing to the file, write the header
    write_file.seek(0, 2)  # Go to the end of the file
    if write_file.tell() == 0:
        csv_writer.writeheader()
    # Check each row in the second CSV for fuzzy matches and append to the output CSV if matched
    for row in csv_reader:
        name_to_match = row['name'].strip()
        result = process.extractOne(name_to_match, entities_set, scorer=fuzz.WRatio, score_cutoff=args.score_threshold)
        # Check if a match was found and unpack the result (which includes the match, score, and index)
        if result:
            best_match, score, _ = result
            # Write the original row to the new file, appending the matched_name
            row['matched_name'] = best_match
            csv_writer.writerow(row)
            match_count += 1

if match_count == 0:
    print("No matches were found.")
else:
    print(f"{match_count} matching {'row' if match_count == 1 else 'rows'} have been appended to the output CSV file.")
# 2_relationshipper.py
import csv

# File paths for your CSV files
matched_rows_path = 'matched_rows.csv'
relationships_path = 'orig_data/relationships.csv'
output_relationships_path = 'output_relationships.csv'

# Load the node_id values from matched_rows.csv
node_ids_set = set()
with open(matched_rows_path, mode='r', newline='', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        node_ids_set.add(row['node_id'].strip())

# Read relationships.csv and write matches to a new CSV
match_count = 0  # Counter for matches
with open(relationships_path, mode='r', newline='', encoding='utf-8') as read_file, \
        open(output_relationships_path, mode='w', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    csv_writer = csv.DictWriter(write_file, fieldnames=csv_reader.fieldnames)
    # Write the header to the output file
    csv_writer.writeheader()
    # Check each row for a matching node_id_start or node_id_end and write it to the new file
    for row in csv_reader:
        if row['node_id_start'].strip() in node_ids_set or row['node_id_end'].strip() in node_ids_set:
            csv_writer.writerow(row)
            match_count += 1

if match_count == 0:
    print("No matching relationships were found.")
else:
    print(f"{match_count} matching {'relationship' if match_count == 1 else 'relationships'} have been written to the output CSV file.")
# 3_compare_node_ids.py
import argparse
import csv

# Initialize the argument parser
parser = argparse.ArgumentParser(description='Match entities from two CSV files and save the results to a new file.')
parser.add_argument('first_csv', help='The CSV file with "source" and "target" columns.')
parser.add_argument('second_csv', help='The CSV file to compare against.')
parser.add_argument('--output_csv', default='matched_rows.csv', help='The output CSV file with matched rows. Defaults to "matched_rows.csv".')

# Parse the command-line arguments
args = parser.parse_args()

# Load source and target node IDs from the first CSV
entities_set = set()
with open(args.first_csv, mode='r', newline='', encoding='utf-8-sig') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        entities_set.add(row['source'].strip())
        entities_set.add(row['target'].strip())
print(entities_set)  # Debug: show the IDs that were loaded

# Read the second CSV and append matches to the output CSV
match_count = 0  # Counter for matches
with open(args.second_csv, mode='r', newline='', encoding='utf-8-sig') as read_file, \
        open(args.output_csv, mode='a', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    # Check whether the output CSV is empty, to decide whether to write the header
    write_file.seek(0, 2)  # Go to the end of the file.
    if write_file.tell() == 0:  # Check if it's empty.
        write_header = True
    else:
        write_header = False
        write_file.seek(0)  # Go back to the beginning (harmless in append mode).
    csv_writer = csv.DictWriter(write_file, fieldnames=csv_reader.fieldnames)
    # If it's the first time writing to the file, write the header
    if write_header:
        csv_writer.writeheader()
    # Check each row in the second CSV for matches and append to the output CSV if matched
    for row in csv_reader:
        if row['node_id'].strip() in entities_set:
            csv_writer.writerow(row)
            match_count += 1

if match_count == 0:
    print("No matches were found.")
else:
    print(f"{match_count} matching {'row' if match_count == 1 else 'rows'} have been appended to the output CSV file.")
# 4_last_step.py
import argparse
import csv

# Set up the argument parser
parser = argparse.ArgumentParser(description='Replace node IDs with their corresponding names.')
parser.add_argument('csv1', help='The CSV file with "source" and "target" columns, containing numeric node identifiers.')
parser.add_argument('csv2', help='The CSV file with "node_id" and "name" columns, for mapping identifiers to names.')
parser.add_argument('--output_csv', default='named_relationships.csv', help='The output CSV file with "source" and "target" replaced by names.')

# Parse the command-line arguments
args = parser.parse_args()

# Build the mapping from node_id to name from csv2
node_id_to_name = {}
with open(args.csv2, mode='r', newline='', encoding='utf-8') as file:
    csv_reader = csv.DictReader(file)
    for row in csv_reader:
        node_id_to_name[row['node_id'].strip()] = row['name'].strip()

# Read csv1 and replace node IDs with names using the mapping, then write to the new CSV
with open(args.csv1, mode='r', newline='', encoding='utf-8') as read_file, \
        open(args.output_csv, mode='w', newline='', encoding='utf-8') as write_file:
    csv_reader = csv.DictReader(read_file)
    # Keep just the 'source' and 'target' columns in the output
    fieldnames = ['source', 'target']
    csv_writer = csv.DictWriter(write_file, fieldnames=fieldnames)
    csv_writer.writeheader()
    for row in csv_reader:
        # Use the mapping to replace each node ID with its name; fall back to the raw ID
        source_name = node_id_to_name.get(row['source'].strip(), row['source'])
        target_name = node_id_to_name.get(row['target'].strip(), row['target'])
        csv_writer.writerow({'source': source_name, 'target': target_name})

print(f"Names have been replaced in {args.output_csv}.")
# workflow.py
import os
import subprocess
import pandas as pd

# Define the paths to the original data
knowledge_graph_path = 'orig_data/knowledge-graph.csv'
entities_path = 'orig_data/nodes-entities.csv'
intermediaries_path = 'orig_data/nodes-intermediaries.csv'
officers_path = 'orig_data/nodes-officers.csv'

# Step 1: Run fuzzy_compare on various CSVs
print("And awaaaay we go! \n\n...fuzzy compare of kg to entities\n")
subprocess.run(['python', '1_fuzzy_compare.py', knowledge_graph_path, entities_path, '--output_csv', 'matched_rows.csv', '--score_threshold', '95'])
print("... fuzzy compare of kg to intermediaries\n")
subprocess.run(['python', '1_fuzzy_compare.py', knowledge_graph_path, intermediaries_path, '--output_csv', 'matched_rows.csv', '--score_threshold', '95'])
print("... fuzzy compare of kg to officers\n")
subprocess.run(['python', '1_fuzzy_compare.py', knowledge_graph_path, officers_path, '--output_csv', 'matched_rows.csv', '--score_threshold', '95'])

# Step 2: Process the matched rows to generate relationships
print("...digging out relationships...")
subprocess.run(['python', '2_relationshipper.py'])

# Manually selecting the first three columns of data is replaced by pandas
print("...sorting out relationships...")
df = pd.read_csv('output_relationships.csv', usecols=[0, 1, 2])
df.columns = ['source', 'target', 'rel_type']
df.to_csv('thirdstep.csv', index=False)

# Step 3: Compare node IDs and append to laststep.csv
print("...matching names to ids...")
subprocess.run(['python', '3_compare_node_ids.py', 'thirdstep.csv', entities_path, '--output_csv', 'laststep.csv'])
subprocess.run(['python', '3_compare_node_ids.py', 'thirdstep.csv', intermediaries_path, '--output_csv', 'laststep.csv'])
subprocess.run(['python', '3_compare_node_ids.py', 'thirdstep.csv', officers_path, '--output_csv', 'laststep.csv'])

# Final step: Combine the third step data with the last step
print("...just about done...")
subprocess.run(['python', '4_last_step.py', 'thirdstep.csv', 'laststep.csv'])

# Function to rearrange names from 'Lastname - Firstname Middlename' to 'Firstname Middlename Lastname'
def rearrange_names(name):
    if '-' in name:
        parts = name.split(' - ')
        if len(parts) == 2:
            return f"{parts[1]} {parts[0]}"
    return name  # Return the name unchanged if it doesn't fit the pattern
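# For example (made-up names): rearrange_names('Smith - John A') returns 'John A Smith',
# while 'ACME Holdings' contains no ' - ' separator and is returned unchanged.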
# Last element of workflow.py: rearrange names in named_relationships.csv
def rearrange_names_in_csv(csv_file_path):
    df = pd.read_csv(csv_file_path)
    df['source'] = df['source'].apply(rearrange_names)
    df.to_csv(csv_file_path, index=False)  # Overwrite the original CSV with the rearranged names

# Call the function to rearrange names
rearrange_names_in_csv('named_relationships.csv')
print("Names have been rearranged in named_relationships.csv.")