Skip to content

Instantly share code, notes, and snippets.

@Recursing
Last active May 24, 2022 20:27
Show Gist options
  • Save Recursing/9d9bbf830fda8496fce0a89a287232ee to your computer and use it in GitHub Desktop.
Save Recursing/9d9bbf830fda8496fce0a89a287232ee to your computer and use it in GitHub Desktop.
PDF to Excel
import re
import os
import pdfminer.high_level as pdf_reader
import csv
# Entries of the gene column that flatten_matches() skips. The empty string
# covers the trailing element produced by splitting a newline-terminated block;
# "GENE" and "MSI-High" are presumably a column header and a status label
# rather than gene names -- TODO confirm against the report layout.
ignore_list = ["GENE", "MSI-High", ""]
# Shared tail of both layouts: one or more alteration lines, each built from
# non-whitespace characters plus a few known multi-word phrases.
_ALTERATIONS_GROUP = (
    r"(?P<ALTERATIONS>(?:^(?:[^\s]|splice site | rearrangement| fusion|promoter )+$\n?)+)"
)

# Layout for reports containing "FMI Sample ID": at least two gene lines, a
# blank line, an optional "ALTERATION" heading, then the alteration lines.
_FMI_PATTERN = re.compile(
    r"(?P<GENES>(?:^[^\s]+$\n){2,})\n(?:ALTERATION\n\n)?" + _ALTERATIONS_GROUP,
    re.M,
)

# Layout for every other report: one or more gene lines, a blank line, then
# the alteration lines.
_TEMPLATE_PATTERN = re.compile(
    r"(?P<GENES>(?:^[^\s]+$\n)+)\n" + _ALTERATIONS_GROUP,
    re.M,
)

# The dot is escaped so that only a literal "<digit>.<digit>" version matches.
_TEMPLATE_VERSION = re.compile(r"Template Version: \d\.\d")


def get_pattern(text: str) -> re.Pattern[str]:
    """Return the regex that extracts gene/alteration blocks from a report.

    The layout is chosen from marker strings found in *text*. The returned
    pattern is compiled with ``re.M`` and exposes two named groups, ``GENES``
    and ``ALTERATIONS``, each holding a newline-separated block of lines.

    Args:
        text: Full text extracted from one report.

    Returns:
        The compiled pattern for the detected layout.

    Raises:
        AssertionError: If *text* contains "FMI Sample ID" but not
            "Status/Score", or contains neither "FMI Sample ID" nor a
            "Template Version: N.N" marker. The exception is raised
            explicitly (instead of via ``assert``) so the check is not
            stripped when Python runs with ``-O``.
    """
    if "FMI Sample ID" in text:
        if "Status/Score" not in text:
            raise AssertionError(
                'report contains "FMI Sample ID" but no "Status/Score" marker'
            )
        return _FMI_PATTERN

    if not _TEMPLATE_VERSION.search(text):
        raise AssertionError(
            'report contains neither "FMI Sample ID" nor "Template Version: N.N"'
        )
    return _TEMPLATE_PATTERN
def flatten_matches(matches: list[tuple[str, str]]) -> list[str]:
    """Turn (genes, alterations) text blocks into a flat list of labels.

    Each element of *matches* is a pair of newline-separated blocks. The
    blocks are split into lines and paired positionally; pairing stops at the
    shorter of the two. For every gene line not listed in ``ignore_list`` the
    result receives a ``"<gene>_<alteration>"`` label, followed by the bare
    gene name the first time that gene is seen.

    Args:
        matches: Pairs of (gene block, alteration block).

    Returns:
        The labels in the order they were encountered.

    Raises:
        AssertionError: If the same ``"<gene>_<alteration>"`` label would be
            added twice.
    """
    flattened: list[str] = []
    for gene_block, alteration_block in matches:
        gene_lines = gene_block.split("\n")
        alteration_lines = alteration_block.split("\n")
        for gene, alteration in zip(gene_lines, alteration_lines):
            if gene not in ignore_list:
                label = f"{gene}_{alteration}"
                assert label not in flattened
                flattened.append(label)
                # Also record the gene on its own, once per gene.
                if gene not in flattened:
                    flattened.append(gene)
    return flattened
# Maps each PDF file name that yielded at least one match to its flattened
# list of gene / gene_alteration labels.
alterations: dict[str, list[str]] = {}

for pdf_name in sorted(os.listdir(".")):
    if not pdf_name.endswith(".pdf"):
        continue

    report_text = pdf_reader.extract_text(pdf_name)
    print("Loading", pdf_name)

    # The layout is detected on the complete text, before any trimming below.
    pattern = get_pattern(report_text)

    if "Reason for Sample Failure" in report_text:
        continue

    # Keep only what follows the "Enrollment Eligible Alterations" heading,
    # when that heading is present.
    _, heading, after_heading = report_text.partition("Enrollment Eligible Alterations")
    if heading:
        report_text = after_heading

    # Cut the text at each marker in turn. partition()[0] returns the whole
    # string when the marker is absent, so no membership check is needed.
    for end_marker in (
        "Status/Score",
        "Variants of Unknown Significance Identified",
        "GENOMIC SIGNATURES",
    ):
        report_text = report_text.partition(end_marker)[0]

    matches = pattern.findall(report_text)
    if matches:
        alterations[pdf_name] = flatten_matches(matches)
# Gather every label from every report; duplicates are removed when the list
# is turned into a set below.
columns: list[str] = [
    label for labels in alterations.values() for label in labels
]
all_alterations = sorted(set(columns), key=str.lower)
columns = ["filename", *all_alterations]

# One row per report: the file name followed by "True"/"False" for each label
# column, indicating whether that label was found in the report.
with open("mutations.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(columns)
    for report_name, found_labels in alterations.items():
        flags = [str(label in found_labels) for label in all_alterations]
        writer.writerow([report_name, *flags])
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment