# Recursing/mosco pdf extractor.py

## mosco pdf extractor.py
import re
import os
import pdfminer.high_level as pdf_reader
import csv

# Values that flatten_matches skips when they show up in a GENES capture.
# The empty string is the trailing piece left over when a capture that ends
# in a newline is split on "\n"; "GENE" and "MSI-High" are presumably header /
# signature text rather than gene names -- TODO confirm against the reports.
ignore_list = ["GENE", "MSI-High", ""]


# Shared tail of both report patterns: one or more alteration lines. A line is
# made of non-whitespace characters plus the few multi-word phrases listed here.
_ALTERATIONS_GROUP = (
    r"(?P<ALTERATIONS>"
    r"(?:^(?:[^\s]|splice site | rearrangement| fusion|promoter )+$\n?)+)"
)

# Reports containing "FMI Sample ID": at least two gene lines, a blank line,
# an optional "ALTERATION" header, then the alteration lines.
_FMI_PATTERN = re.compile(
    r"(?P<GENES>(?:^[^\s]+$\n){2,})\n(?:ALTERATION\n\n)?" + _ALTERATIONS_GROUP,
    re.M,
)

# All other reports: one or more gene lines, a blank line, the alteration lines.
_TEMPLATE_PATTERN = re.compile(
    r"(?P<GENES>(?:^[^\s]+$\n)+)\n" + _ALTERATIONS_GROUP,
    re.M,
)

# Version stamp expected in every report that is not an "FMI Sample ID" report.
_TEMPLATE_VERSION = re.compile(r"Template Version: \d+\.\d+")


def get_pattern(text: str) -> re.Pattern[str]:
    """Return the compiled regex that extracts gene/alteration columns.

    The report layout is chosen from marker strings in ``text``:

    * text containing ``"FMI Sample ID"`` must also contain
      ``"Status/Score"`` and gets the pattern that requires two or more
      consecutive gene lines and tolerates an ``ALTERATION`` header;
    * any other text must contain a ``Template Version: <digits>.<digits>``
      stamp and gets the pattern that accepts a single gene line.

    Both patterns are compiled with ``re.M`` and expose the named groups
    ``GENES`` and ``ALTERATIONS``.

    Args:
        text: Full text extracted from one report.

    Returns:
        The compiled pattern for the detected layout.

    Raises:
        ValueError: If the text matches neither layout.
    """
    # Explicit raises (not asserts) so the checks still run under ``python -O``.
    if "FMI Sample ID" in text:
        if "Status/Score" not in text:
            raise ValueError(
                'report contains "FMI Sample ID" but no "Status/Score" section'
            )
        return _FMI_PATTERN

    if not _TEMPLATE_VERSION.search(text):
        raise ValueError(
            'report has neither "FMI Sample ID" nor a "Template Version: N.N" stamp'
        )
    return _TEMPLATE_PATTERN


def flatten_matches(matches: list[tuple[str, str]]) -> list[str]:
    """Flatten (GENES, ALTERATIONS) capture pairs into a list of labels.

    Each capture is split on newlines and the two resulting lists are paired
    position by position (pairing stops at the shorter list). For every gene
    that is not in ``ignore_list`` a ``"<gene>_<alteration>"`` label is
    appended, followed by the bare gene name the first time that name is seen.

    Args:
        matches: ``(genes, alterations)`` string pairs, one per regex match.

    Returns:
        Labels in the order they were encountered.

    Raises:
        AssertionError: If the same ``"<gene>_<alteration>"`` label occurs twice.
    """
    labels: list[str] = []
    for gene_block, alteration_block in matches:
        gene_lines = gene_block.split("\n")
        alteration_lines = alteration_block.split("\n")
        for gene, alteration in zip(gene_lines, alteration_lines):
            if gene not in ignore_list:
                combined = f"{gene}_{alteration}"
                # A repeated gene/alteration pair is treated as a parsing error.
                assert combined not in labels
                labels.append(combined)
                if gene not in labels:
                    labels.append(gene)
    return labels


# Parse every PDF in the current directory; map file name -> flattened labels.
alterations: dict[str, list[str]] = {}
for fname in sorted(os.listdir(".")):
    is_pdf = fname.endswith(".pdf")
    if not is_pdf:
        continue

    report_text = pdf_reader.extract_text(fname)
    print("Loading", fname)
    # The layout is detected on the full text, before any trimming below.
    report_pattern = get_pattern(report_text)

    # Reports that describe a failed sample are skipped entirely.
    if "Reason for Sample Failure" in report_text:
        continue

    # Keep only what follows the eligible-alterations heading, when present.
    _, heading, after_heading = report_text.partition(
        "Enrollment Eligible Alterations"
    )
    if heading:
        report_text = after_heading

    # Cut the text at each of these headings in turn; partition() returns the
    # text unchanged as its first element when the heading is absent.
    for end_marker in (
        "Status/Score",
        "Variants of Unknown Significance Identified",
        "GENOMIC SIGNATURES",
    ):
        report_text = report_text.partition(end_marker)[0]

    found = report_pattern.findall(report_text)
    if found:
        alterations[fname] = flatten_matches(found)

# Every distinct label seen in any report, ordered case-insensitively. The
# raw string is the tie-breaker so that labels differing only by case come out
# in the same order on every run (set iteration order is not stable across
# processes).
all_alterations = sorted(
    set().union(*alterations.values()),
    key=lambda label: (label.lower(), label),
)

# CSV header: the file-name column followed by one column per label.
columns: list[str] = ["filename"] + all_alterations

# Write one row per parsed report: the file name, then "True"/"False" for each
# label column depending on whether that report contains the label.
with open("mutations.csv", "w", newline="") as out_file:
    writer = csv.writer(out_file)
    writer.writerow(columns)
    for fname, patient_alterations in alterations.items():
        # One set per row keeps each membership test O(1) instead of
        # rescanning the report's label list for every column.
        present = set(patient_alterations)
        row = [fname]
        row.extend(str(label in present) for label in all_alterations)
        writer.writerow(row)
	import re
	import os
	import pdfminer.high_level as pdf_reader
	import csv

	# Values that flatten_matches skips when they show up in a GENES capture.
	# The empty string is the trailing piece left over when a capture that ends
	# in a newline is split on "\n"; "GENE" and "MSI-High" are presumably header /
	# signature text rather than gene names -- TODO confirm against the reports.
	ignore_list = ["GENE", "MSI-High", ""]


	# Shared tail of both report patterns: one or more alteration lines. A line is
	# made of non-whitespace characters plus the few multi-word phrases listed here.
	_ALTERATIONS_GROUP = (
		r"(?P<ALTERATIONS>"
		r"(?:^(?:[^\s]|splice site | rearrangement| fusion|promoter )+$\n?)+)"
	)

	# Reports containing "FMI Sample ID": at least two gene lines, a blank line,
	# an optional "ALTERATION" header, then the alteration lines.
	_FMI_PATTERN = re.compile(
		r"(?P<GENES>(?:^[^\s]+$\n){2,})\n(?:ALTERATION\n\n)?" + _ALTERATIONS_GROUP,
		re.M,
	)

	# All other reports: one or more gene lines, a blank line, the alteration lines.
	_TEMPLATE_PATTERN = re.compile(
		r"(?P<GENES>(?:^[^\s]+$\n)+)\n" + _ALTERATIONS_GROUP,
		re.M,
	)

	# Version stamp expected in every report that is not an "FMI Sample ID" report.
	_TEMPLATE_VERSION = re.compile(r"Template Version: \d+\.\d+")


	def get_pattern(text: str) -> re.Pattern[str]:
		"""Return the compiled regex that extracts gene/alteration columns.

		The report layout is chosen from marker strings in ``text``:

		* text containing ``"FMI Sample ID"`` must also contain
		  ``"Status/Score"`` and gets the pattern that requires two or more
		  consecutive gene lines and tolerates an ``ALTERATION`` header;
		* any other text must contain a ``Template Version: <digits>.<digits>``
		  stamp and gets the pattern that accepts a single gene line.

		Both patterns are compiled with ``re.M`` and expose the named groups
		``GENES`` and ``ALTERATIONS``.

		Args:
			text: Full text extracted from one report.

		Returns:
			The compiled pattern for the detected layout.

		Raises:
			ValueError: If the text matches neither layout.
		"""
		# Explicit raises (not asserts) so the checks still run under ``python -O``.
		if "FMI Sample ID" in text:
			if "Status/Score" not in text:
				raise ValueError(
					'report contains "FMI Sample ID" but no "Status/Score" section'
				)
			return _FMI_PATTERN

		if not _TEMPLATE_VERSION.search(text):
			raise ValueError(
				'report has neither "FMI Sample ID" nor a "Template Version: N.N" stamp'
			)
		return _TEMPLATE_PATTERN


	def flatten_matches(matches: list[tuple[str, str]]) -> list[str]:
		"""Flatten (GENES, ALTERATIONS) capture pairs into a list of labels.

		Each capture is split on newlines and the two resulting lists are paired
		position by position (pairing stops at the shorter list). For every gene
		that is not in ``ignore_list`` a ``"<gene>_<alteration>"`` label is
		appended, followed by the bare gene name the first time that name is seen.

		Args:
			matches: ``(genes, alterations)`` string pairs, one per regex match.

		Returns:
			Labels in the order they were encountered.

		Raises:
			AssertionError: If the same ``"<gene>_<alteration>"`` label occurs twice.
		"""
		gene_alterations: list[str] = []
		for genes, alterations in matches:
			for gene, alteration in zip(genes.split("\n"), alterations.split("\n")):
				if gene in ignore_list:
					continue
				# A repeated gene/alteration pair is treated as a parsing error.
				assert f"{gene}_{alteration}" not in gene_alterations
				gene_alterations.append(f"{gene}_{alteration}")
				if gene not in gene_alterations:
					gene_alterations.append(gene)
		return gene_alterations


	# Parse every PDF in the current directory; map file name -> flattened labels.
	alterations: dict[str, list[str]] = {}
	for fname in sorted(os.listdir(".")):
		if not fname.endswith(".pdf"):
			continue
		text = pdf_reader.extract_text(fname)
		print("Loading", fname)
		# The layout is detected on the full text, before any trimming below.
		pattern = get_pattern(text)
		# Reports that describe a failed sample are skipped entirely.
		if "Reason for Sample Failure" in text:
			continue
		# Keep only what follows the eligible-alterations heading, when present.
		if "Enrollment Eligible Alterations" in text:
			text = text.partition("Enrollment Eligible Alterations")[-1]
		# Cut the text at each of these headings in turn, when present.
		if "Status/Score" in text:
			text = text.partition("Status/Score")[0]
		if "Variants of Unknown Significance Identified" in text:
			text = text.partition("Variants of Unknown Significance Identified")[0]
		if "GENOMIC SIGNATURES" in text:
			text = text.partition("GENOMIC SIGNATURES")[0]
		matches = pattern.findall(text)
		if not matches:
			continue
		alterations[fname] = flatten_matches(matches)

	# Every distinct label seen in any report, ordered case-insensitively. The
	# raw string is the tie-breaker so that labels differing only by case come out
	# in the same order on every run (set iteration order is not stable across
	# processes).
	all_alterations = sorted(
		set().union(*alterations.values()),
		key=lambda label: (label.lower(), label),
	)

	# CSV header: the file-name column followed by one column per label.
	columns: list[str] = ["filename"] + all_alterations

	# Write one row per parsed report: the file name, then "True"/"False" for each
	# label column depending on whether that report contains the label.
	with open("mutations.csv", "w", newline="") as out_file:
		writer = csv.writer(out_file)
		writer.writerow(columns)
		for fname, patient_alterations in alterations.items():
			# One set per row keeps each membership test O(1) instead of
			# rescanning the report's label list for every column.
			present = set(patient_alterations)
			row = [fname]
			row.extend(str(label in present) for label in all_alterations)
			writer.writerow(row)