Skip to content

Instantly share code, notes, and snippets.

@adigitoleo
Created July 27, 2022 02:26
Show Gist options
  • Save adigitoleo/856ca22fe9132ede3470b70a73ac9d74 to your computer and use it in GitHub Desktop.
Save adigitoleo/856ca22fe9132ede3470b70a73ac9d74 to your computer and use it in GitHub Desktop.
Script to parse IMA mineral list into CSV file
import re
import csv
import pdfplumber
# Create a list of minerals as classified by the IMA and save it as CSV.
# Get PDF from http://cnmnc.main.jp/
PDF_PATH = "IMA_MineralList_202207.pdf"
CSV_PATH = "IMA_MineralList_202207.csv"
# First two pages are header stuff.
HEADER_PAGES = 2
# Need larger tolerance to pick up sub-/superscripts.
TABLE_SETTINGS = {"text_y_tolerance": 6}

# Set up regexp to fix chemical formulae.
# A digit run followed by a sign is an ionic charge, e.g. "2+".
ion_sup = re.compile(r"(\d+[+-])")
# Any other digit run preceded by a character other than "{" is a count;
# excluding "{" avoids re-wrapping the charges already wrapped by ion_sup.
count_sub = re.compile(r"([^{])(\d+)")


def clean_cell(cell):
    """Return the text of a table cell with newlines removed and ends stripped.

    A cell of None (no text extracted for that cell) becomes an empty string.
    """
    if cell is None:
        return ""
    return cell.replace("\n", "").strip()


def format_formula(text):
    """Mark up a plain-text chemical formula with ^{...} and _{...} groups.

    Charges such as "2+" become "^{2+}", remaining digit runs that follow
    another character become "_{n}", and "·" characters are removed.
    """
    # NOTE(review): because "·" is dropped after the digits are wrapped, a
    # coefficient such as the 2 in "·2H2O" ends up as a subscript attached to
    # the preceding group. This matches the original behaviour -- confirm
    # whether it is intended.
    with_charges = ion_sup.sub(r"^{\1}", text)
    return count_sub.sub(r"\1_{\2}", with_charges).replace("·", "")


def parse_row(headers, row):
    """Convert one raw table row into a list of cleaned cells, one per header.

    Rows shorter than the header are padded with empty strings and extra
    trailing cells are ignored. Cells in columns whose header contains
    "formula" are passed through format_formula().
    """
    cells = [clean_cell(cell) for cell in row[:len(headers)]]
    cells.extend([""] * (len(headers) - len(cells)))
    return [
        format_formula(cell) if "formula" in header else cell
        for header, cell in zip(headers, cells)
    ]


def extract_minerals(pdf_path=PDF_PATH):
    """Read the mineral table from the PDF and return (headers, rows).

    The first row of the first table found after the header pages provides
    the column names; every later table row is one mineral. The number of
    rows is taken from the PDF itself (the 202207 list has 5828 species)
    rather than being hard-coded.

    Raises ValueError if no table is found in the PDF.
    """
    headers = None
    rows = []
    # The context manager closes the PDF even if extraction fails.
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages[HEADER_PAGES:]:
            table = page.extract_table(table_settings=TABLE_SETTINGS)
            if not table:
                # extract_table() gives None for a page without a table.
                continue
            if headers is None:
                headers = [clean_cell(cell) for cell in table[0]]
                table = table[1:]
            rows.extend(parse_row(headers, row) for row in table)
    if headers is None:
        raise ValueError(f"no table found in {pdf_path!r}")
    return headers, rows


def write_csv(headers, rows, csv_path=CSV_PATH):
    """Write the header line followed by all rows to a UTF-8 CSV file."""
    # newline="" lets the csv module control line endings; without it every
    # row is followed by an extra blank line on Windows.
    with open(csv_path, mode="w", encoding="utf8", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(headers)
        writer.writerows(rows)


def main():
    """Convert the IMA mineral list PDF into a CSV file."""
    headers, rows = extract_minerals()
    write_csv(headers, rows)


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment