Skip to content

Instantly share code, notes, and snippets.

@lokal-profil
Last active November 2, 2023 15:38
Show Gist options
  • Save lokal-profil/0bb32eb05d7a401757e7734fde55c4f7 to your computer and use it in GitHub Desktop.
Save lokal-profil/0bb32eb05d7a401757e7734fde55c4f7 to your computer and use it in GitHub Desktop.
Merges the two files in wcvp.zip from Kew Gardens on plant_name_id, then splits the result by family.
import csv
from collections import defaultdict
from tqdm import tqdm
# --- Input / output locations (pipe-delimited CSV files) ---
names_file = 'wcvp_names.csv'
distribution_file = 'wcvp_distribution.csv'
# Path template; "{}" is filled in with the family value of each output file.
merge_file = 'output_family/merge_{}.csv'

# --- Run options ---
# When True, only distribution rows whose plant_name_id is '1' or '2' are kept.
demo = False

# --- Shared state filled in while the distribution file is read ---
# Maps a plant_name_id value to the list of distribution rows carrying it.
plant_id = defaultdict(list)
# Column names of the distribution file; set once that file has been opened.
fieldnames_distribution = None
def make_writer(family, fieldnames_out):
    """Open the output file for *family* and return a writer for it.

    The path is built by formatting the module-level ``merge_file`` template
    with ``family``. The header row is written before returning.

    Args:
        family: Value substituted into the ``merge_file`` path template.
        fieldnames_out: Ordered column names for the output file.

    Returns:
        A ``(writer, out_file)`` tuple: the ``csv.DictWriter`` and the open
        file object behind it. The caller is responsible for closing
        ``out_file``.
    """
    import os  # local import so this block does not depend on file-level edits

    path = merge_file.format(family)
    # open() does not create missing parent directories, so make sure the
    # output directory exists instead of failing on the first family.
    directory = os.path.dirname(path)
    if directory:
        os.makedirs(directory, exist_ok=True)

    # newline='' lets the csv module control line endings itself.
    out_file = open(path, 'w', newline='')
    try:
        writer = csv.DictWriter(
            out_file, fieldnames=fieldnames_out, delimiter='|')
        writer.writeheader()
    except Exception:
        # Do not leak the handle if the writer cannot be set up.
        out_file.close()
        raise
    return writer, out_file
# Step 1: index every distribution row by its plant_name_id so that each
# name row can later be joined with a single dictionary lookup.
# newline='' is what the csv module expects from its input file objects; it
# keeps quoted fields containing line breaks intact.
with open(distribution_file, "r", newline='') as infile_distribution:
    distribution_reader = csv.DictReader(infile_distribution, delimiter='|')
    # An empty file yields fieldnames of None; fall back to "no columns".
    fieldnames_distribution = distribution_reader.fieldnames or []
    for row in tqdm(distribution_reader, desc="Reading distribution"):
        # In demo mode only the rows for plant_name_id '1' and '2' are kept.
        if not demo or row.get('plant_name_id') in ('1', '2'):
            plant_id[row.get('plant_name_id')].append(row)

# Step 2: read the names file, grouping its rows by their 'family' column.
families = defaultdict(list)
fieldnames_out = None
with open(names_file, "r", newline='') as infile_names:
    names_reader = csv.DictReader(infile_names, delimiter='|')
    # Output columns: all name columns first, then any distribution columns
    # not already present (e.g. the shared plant_name_id appears only once).
    fieldnames_out = list(names_reader.fieldnames or [])
    fieldnames_out.extend(
        x for x in fieldnames_distribution if x not in fieldnames_out)
    for row in tqdm(names_reader, desc="Reading names"):
        families[row.get('family')].append(row)

# Step 3: write one merged file per family, one output row for every
# (name row, distribution row) pair sharing a plant_name_id.
for family, rows in tqdm(families.items(), desc="Processing families"):
    writer = None  # only initiate for families with hits
    out_file = None
    try:
        for row in rows:
            # .get() does not create entries in the defaultdict, so ids
            # without distribution data simply come back as None.
            distributions = plant_id.get(row.get('plant_name_id'))
            if not distributions:
                continue
            if writer is None:
                writer, out_file = make_writer(family, fieldnames_out)
            for dist in distributions:
                # On a column-name clash the distribution value wins.
                writer.writerow(row | dist)
    finally:
        # Close the family's file even if writing a row failed part-way.
        if out_file:
            out_file.close()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment