Last active
May 24, 2022 20:27
-
-
Save Recursing/9d9bbf830fda8496fce0a89a287232ee to your computer and use it in GitHub Desktop.
Pdf to excel
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import csv
import os
import re

import pdfminer.high_level as pdf_reader
# Entries of the captured gene column that flatten_matches() skips instead of
# treating as gene names. The empty string covers the blank item produced when
# a newline-terminated capture is split on "\n"; "GENE" and "MSI-High" are
# presumably a table heading and a signature label (verify against real
# reports).
ignore_list = ["GENE", "MSI-High", ""]
# Tail shared by both layouts: one or more consecutive lines consisting of
# non-whitespace characters, where the only whitespace tolerated is inside the
# literal phrases listed in the alternation.
_ALTERATIONS_GROUP = (
    r"(?P<ALTERATIONS>"
    r"(?:^(?:[^\s]|splice site | rearrangement| fusion|promoter )+$\n?)+"
    r")"
)

# Layout with "FMI Sample ID": at least two gene lines, a blank line, an
# optional "ALTERATION" heading followed by a blank line, then the alterations.
_FMI_PATTERN = re.compile(
    r"(?P<GENES>(?:^[^\s]+$\n){2,})\n(?:ALTERATION\n\n)?" + _ALTERATIONS_GROUP,
    re.M,
)

# Layout with "Template Version: d.d": one or more gene lines, a blank line,
# then the alterations.
_TEMPLATE_PATTERN = re.compile(
    r"(?P<GENES>(?:^[^\s]+$\n)+)\n" + _ALTERATIONS_GROUP,
    re.M,
)

# The dot is escaped so that only a literal "." separates the two digits.
_TEMPLATE_VERSION = re.compile(r"Template Version: \d\.\d")


def get_pattern(text: str) -> re.Pattern[str]:
    """Return the gene/alteration regex that fits the layout of *text*.

    The layout is chosen from marker strings in the text:

    * text containing ``"FMI Sample ID"`` must also contain
      ``"Status/Score"`` and gets the pattern requiring at least two gene
      lines and allowing an ``ALTERATION`` heading before the alterations;
    * any other text must contain ``"Template Version: <digit>.<digit>"`` and
      gets the pattern accepting one or more gene lines.

    Both patterns are compiled with ``re.MULTILINE`` and capture exactly two
    groups, ``GENES`` and ``ALTERATIONS``, so ``findall`` yields
    ``(genes, alterations)`` tuples.

    Args:
        text: Full text of one report.

    Returns:
        The compiled pattern for the detected layout.

    Raises:
        ValueError: If the marker required by the detected layout is missing.
    """
    # Explicit raises (rather than assert) so the checks survive ``python -O``.
    if "FMI Sample ID" in text:
        if "Status/Score" not in text:
            raise ValueError(
                'text contains "FMI Sample ID" but no "Status/Score" marker'
            )
        return _FMI_PATTERN

    if not _TEMPLATE_VERSION.search(text):
        raise ValueError(
            'text contains neither "FMI Sample ID" nor a "Template Version: N.N" marker'
        )
    return _TEMPLATE_PATTERN
def flatten_matches(
    matches: list[tuple[str, str]],
    *,
    ignored: "list[str] | None" = None,
) -> list[str]:
    """Flatten ``(genes, alterations)`` text pairs into a list of labels.

    Each element of *matches* holds two newline-separated strings. Their lines
    are paired positionally with ``zip``, so surplus lines on the longer side
    are dropped. For every pair whose gene is not ignored, the label
    ``"<gene>_<alteration>"`` is appended, followed by the bare gene name the
    first time that gene name is not already present in the result.

    Args:
        matches: ``(genes, alterations)`` tuples, e.g. the output of
            ``findall`` on a pattern with those two capture groups.
        ignored: Gene-column values to skip. Defaults to the module-level
            ``ignore_list``.

    Returns:
        Labels and gene names in first-seen order, without repetitions.

    Raises:
        ValueError: If the same ``"<gene>_<alteration>"`` label occurs twice.
    """
    skip = ignore_list if ignored is None else ignored
    gene_alterations: list[str] = []
    # Mirrors gene_alterations so membership tests do not rescan the list.
    seen: set[str] = set()

    for genes, alterations in matches:
        for gene, alteration in zip(genes.split("\n"), alterations.split("\n")):
            if gene in skip:
                continue
            label = f"{gene}_{alteration}"
            # Raised explicitly (not asserted) so the check survives ``-O``.
            if label in seen:
                raise ValueError(f"duplicate gene alteration: {label!r}")
            seen.add(label)
            gene_alterations.append(label)
            if gene not in seen:
                seen.add(gene)
                gene_alterations.append(gene)
    return gene_alterations
# Maps each processed PDF file name to the labels extracted from it.
alterations: dict[str, list[str]] = {}
for fname in sorted(os.listdir(".")):
    if not fname.endswith(".pdf"):
        continue
    text = pdf_reader.extract_text(fname)
    print("Loading", fname)
    # Layout detection must see the untrimmed text: it looks for markers
    # (e.g. "Status/Score") that the trimming below cuts away.
    pattern = get_pattern(text)
    if "Reason for Sample Failure" in text:
        continue
    # Keep only what follows this heading; the guard is needed because
    # partition()[-1] is empty when the heading is absent.
    if "Enrollment Eligible Alterations" in text:
        text = text.partition("Enrollment Eligible Alterations")[-1]
    # Cut the text at each of these headings in turn. partition()[0] returns
    # the whole string when the heading is absent, so no guard is required.
    for heading in (
        "Status/Score",
        "Variants of Unknown Significance Identified",
        "GENOMIC SIGNATURES",
    ):
        text = text.partition(heading)[0]
    matches = pattern.findall(text)
    if not matches:
        continue
    alterations[fname] = flatten_matches(matches)
# Every distinct label found in any report, ordered case-insensitively. The
# label itself is the tie-breaker so that names differing only in case get a
# reproducible order instead of depending on set iteration order.
all_alterations = sorted(
    set().union(*alterations.values()),
    key=lambda name: (name.lower(), name),
)
columns: list[str] = ["filename"] + all_alterations

# One row per report: the file name followed by "True"/"False" for each column.
with open("mutations.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(columns)
    for fname, patient_alterations in alterations.items():
        present = set(patient_alterations)  # O(1) lookups per column
        row = [fname]
        row.extend(str(alteration in present) for alteration in all_alterations)
        writer.writerow(row)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment