Created
June 21, 2023 22:31
-
-
Save mrbesher/6f44c551130ed603862887349eed88a3 to your computer and use it in GitHub Desktop.
A script that counts the words in old Turkish literature books, excluding footnotes and side notes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
import argparse | |
import string | |
import re | |
from typing import List, Tuple, Optional | |
from itertools import islice | |
import numpy as np | |
from pdfminer.high_level import extract_pages | |
from pdfminer.layout import LTTextBox, LTPage | |
from tqdm import tqdm | |
# Patterns used to clean every line; compiled once instead of once per line.
# Leading list marker: a single digit or ASCII letter followed by '-', '•' or '.'.
_LEADING_MARKER_RE = re.compile(r"^\s*[\da-zA-Z]\s*[-•\.]\s*")
# Trailing number, optionally preceded by '-', '.' or '*'.
_TRAILING_NUMBER_RE = re.compile(r"\s*[-\.\*]?\s*\d+\s*$")
# A single character that is not adjacent to any word character, unless it is
# one of 'u', 'ü', 'm', 'O', 'o' or whitespace (those are kept).
_STRAY_CHAR_RE = re.compile(r"(?<![\wü])[^uümOo\s](?![\wü])")


def _clean_line(line: str, min_characters: int, max_digit_ratio: float,
                max_special_chars_ratio: float) -> Optional[str]:
    """Return the cleaned *line*, or ``None`` when the line must be dropped."""
    line = _LEADING_MARKER_RE.sub("", line)
    line = _TRAILING_NUMBER_RE.sub("", line)
    line = _STRAY_CHAR_RE.sub("", line)

    # An empty line carries no words. Rejecting it explicitly also keeps the
    # ratio computation below from dividing by zero when min_characters <= 0.
    if not line or len(line) < min_characters:
        return None

    digit_count = sum(char.isdigit() for char in line)
    if digit_count / len(line) > max_digit_ratio:
        return None

    special_count = sum(not char.isalnum() and not char.isspace() for char in line)
    if special_count > len(line) * max_special_chars_ratio:
        return None

    return line


def filter_text_boxes(page: LTPage, max_y_ratio: float = .95, min_y_ratio: float = .08, bottom_y_ratio: float = .2,
                      max_width_ratio: float = .6, hor_dim_threshold_ratio: float = .1,
                      max_special_chars_ratio: float = .3, min_characters: int = 10,
                      max_digit_ratio: float = .05) -> Tuple[List[str], List[LTTextBox]]:
    """Collect the text boxes of *page* that look like main body text.

    Boxes are rejected based on their position and width, and the remaining
    boxes are cleaned line by line.

    Args:
        page: Page whose ``LTTextBox`` children are inspected.
        max_y_ratio: Boxes whose ``y0`` lies above this fraction of the page
            height are skipped.
        min_y_ratio: Boxes whose ``y0`` lies below this fraction of the page
            height are skipped.
        bottom_y_ratio: Fraction of the page height below which wide boxes
            are skipped (see ``max_width_ratio``).
        max_width_ratio: A box wider than this fraction of the page width is
            skipped when its ``y0`` is also below the ``bottom_y_ratio`` limit.
        hor_dim_threshold_ratio: Boxes narrower than this fraction of the page
            width (short side notes) are skipped.
        max_special_chars_ratio: Lines with a larger share of characters that
            are neither alphanumeric nor whitespace are dropped.
        min_characters: Lines shorter than this after cleaning are dropped.
        max_digit_ratio: Lines with a larger share of digits are dropped.

    Returns:
        ``(texts, boxes)``: for every box that kept at least one line, the
        kept lines joined with newlines and the box itself, at matching
        indices.
    """
    # Absolute limits derived from the page dimensions.
    max_y = page.height * max_y_ratio
    min_y = page.height * min_y_ratio
    bottom_y = page.height * bottom_y_ratio
    max_width = page.width * max_width_ratio
    hor_dim_threshold = page.width * hor_dim_threshold_ratio

    texts: List[str] = []
    boxes: List[LTTextBox] = []

    for box in page:
        if not isinstance(box, LTTextBox):
            continue

        # Skip if it's a page number or footnote (too high or too low on the page).
        if box.y0 > max_y or box.y0 < min_y:
            continue

        # Skip short side notes.
        if box.width < hor_dim_threshold:
            continue

        # Skip wide boxes sitting in the bottom part of the page.
        if box.width > max_width and box.y0 < bottom_y:
            continue

        filtered_lines = []
        for raw_line in box.get_text().split('\n'):
            cleaned = _clean_line(raw_line, min_characters, max_digit_ratio, max_special_chars_ratio)
            if cleaned is not None:
                filtered_lines.append(cleaned)

        # Only boxes that still contain text are reported.
        if filtered_lines:
            texts.append('\n'.join(filtered_lines))
            boxes.append(box)

    return texts, boxes
def analyze_pdf(file_name: str, start_page: Optional[int] = None, end_page: Optional[int] = None, **kwargs):
    """Count the words on each page of a PDF and print summary statistics.

    Args:
        file_name: Path of the PDF file to analyze.
        start_page: 1-based number of the first page to analyze. ``None``
            (or ``0``) starts at the first page of the document.
        end_page: 1-based number of the last page to analyze (inclusive).
            ``None`` analyzes through the last page.
        **kwargs: Filtering options forwarded unchanged to
            ``filter_text_boxes``.

    Returns:
        None. The results are printed to standard output.
    """
    # Page numbers are 1-based. Always resolving the first page to an integer
    # keeps enumerate() from receiving None when only end_page is given, and
    # keeps the reported page numbers aligned with the pages actually read
    # when start_page is 0.
    first_page = start_page if start_page else 1
    page_iterator = islice(extract_pages(file_name), first_page - 1, end_page)

    pages_word_counts = []
    for page in tqdm(page_iterator, desc="Analyzing PDF"):
        texts, _ = filter_text_boxes(page, **kwargs)
        pages_word_counts.append(sum(len(text.split()) for text in texts))

    # Nothing to report when the document or the requested range has no pages;
    # without this guard the statistics below would be undefined.
    if not pages_word_counts:
        print("No pages were analyzed.")
        return

    total_word_count = sum(pages_word_counts)

    # list.index() returns the first occurrence, so ties resolve to the
    # earliest page.
    max_word_count = max(pages_word_counts)
    max_word_page = first_page + pages_word_counts.index(max_word_count)
    min_word_count = min(pages_word_counts)
    min_word_page = first_page + pages_word_counts.index(min_word_count)

    # Compute the average word count per page.
    mean_word_count = np.mean(pages_word_counts)

    # Print the analysis results.
    print(f"*** PDF Analysis Results ***\n"
          f"Total word count: {total_word_count}\n"
          f"Average word count per page: {mean_word_count:.2f}\n"
          f"Max word count on a page: {max_word_count} (Page: {max_word_page})\n"
          f"Min word count on a page: {min_word_count} (Page: {min_word_page})\n")
if __name__ == "__main__":
    # Command-line entry point: parse the options and run the analysis.
    cli = argparse.ArgumentParser(description="Analyze a PDF file.")
    cli.add_argument("file", type=str, help="PDF file to be analyzed.")

    # Optional arguments as (flag, type, default, help), registered in this order.
    option_specs = (
        ("--start", int, None, "Start page for analysis."),
        ("--end", int, None, "End page for analysis."),
        ("--max_y_ratio", float, 0.95, "Maximum y ratio for text box placement."),
        ("--min_y_ratio", float, 0.08, "Minimum y ratio for text box placement."),
        ("--bottom_y_ratio", float, 0.2, "Bottom y ratio for text box placement."),
        ("--max_width_ratio", float, 0.6, "Maximum width ratio for text boxes."),
        ("--hor_dim_threshold_ratio", float, 0.1, "Horizontal dimension threshold ratio for short side notes."),
        ("--max_special_chars_ratio", float, 0.3, "Maximum ratio of special characters in a text box."),
        ("--min_characters", int, 10, "Minimum number of characters in a text box."),
        ("--max_digit_ratio", float, 0.05, "Maximum ratio of digits in a text box."),
    )
    for flag, value_type, default_value, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)

    # Work on a copy of the parsed namespace so the entries can be split up.
    options = dict(vars(cli.parse_args()))
    pdf_path = options.pop("file")
    first_page = options.pop("start")
    last_page = options.pop("end")

    # Whatever remains are the filtering thresholds, passed on as keyword arguments.
    analyze_pdf(pdf_path, start_page=first_page, end_page=last_page, **options)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment