Skip to content

Instantly share code, notes, and snippets.

@Recursing
Last active May 24, 2022 20:44
Show Gist options
  • Save Recursing/ad4297e7729251e491198557ae105580 to your computer and use it in GitHub Desktop.
Save Recursing/ad4297e7729251e491198557ae105580 to your computer and use it in GitHub Desktop.
Extract biomarkers from pdf
import re
import os
import pdfminer.high_level as pdf_reader
import csv
# Compiled multi-line regexes, tried in order against the text extracted from
# each PDF.  Every pattern has exactly two capture groups:
#   1. the Tumor Mutational Burden value (a number or "Not Evaluable"),
#   2. the rest of the line holding the Microsatellite Instability status.
# The three entries cover different orderings of the labels/values in the
# extracted text (presumably different report layouts -- confirm against
# real samples).
PATTERNS = [
    re.compile(p, re.M)
    for p in (
        # Inline layout: "Tumor Mutational Burden <value> mutations-per-megabase"
        # followed (after any whitespace) by "Microsatellite Instability" and
        # the status on the next non-blank text.
        r"^Tumor Mutational Burden\s+(\S+|Not Evaluable)\s+mutations-per-megabase$"
        r"\s+Microsatellite Instability\s+([^\n]+)$",
        # Table layout with the "Status/Score" header before the TMB/MSI labels.
        # The number is an integer with an optional decimal part; a plain
        # single digit such as "5" must match, and the separator must be a
        # real decimal mark rather than an arbitrary character.
        r"^Biomarker\n+Status/Score\n+TMB\nMSI\n+(\d+(?:[.,]\d+)?|Not Evaluable)\n+(.+)$",
        # Table layout with the "Status/Score" header after the TMB/MSI labels.
        r"^Biomarker\n+TMB\nMSI\n+Status/Score\n+(\d+(?:[.,]\d+)?|Not Evaluable)\n+(.+)$",
    )
]
# Maps each PDF file name to the list of values captured by the first
# pattern in PATTERNS that matched its text.
biomarkers: dict[str, list[str]] = {}
for fname in sorted(os.listdir(".")):
    # Compare case-insensitively so "REPORT.PDF" is not silently skipped.
    if not fname.lower().endswith(".pdf"):
        continue
    # Announce the file *before* extracting, so the file being processed is
    # visible even if extraction raises.
    print("Loading", fname)
    text = pdf_reader.extract_text(fname)
    matches = None
    # Try each known pattern in order and stop at the first one that hits.
    for pattern in PATTERNS:
        matches = pattern.findall(text)
        if matches:
            break
    if not matches:
        print("NO DATA FOUND", fname)
        continue
    # findall() yields one tuple of groups per match; only the first match
    # in the document is kept.
    biomarkers[fname] = list(matches[0])
# Write the collected values to "biomarkers.csv": a header row, then one row
# per PDF consisting of the file name followed by its captured values.
columns = ["fname", "Tumor Mutational Burden", "Microsatellite Instability"]
with open("biomarkers.csv", "w", newline="") as csv_file:
    csv_out = csv.writer(csv_file)
    csv_out.writerow(columns)
    csv_out.writerows(
        [pdf_name, *values] for pdf_name, values in biomarkers.items()
    )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment