Skip to content

Instantly share code, notes, and snippets.

@mrbesher
Created June 21, 2023 22:31
Show Gist options
  • Save mrbesher/6f44c551130ed603862887349eed88a3 to your computer and use it in GitHub Desktop.
Save mrbesher/6f44c551130ed603862887349eed88a3 to your computer and use it in GitHub Desktop.
A script that counts the words in old Turkish literature books, excluding footnotes and side notes.
#!/usr/bin/env python
import argparse
import string
import re
from typing import List, Tuple, Optional
from itertools import islice
import numpy as np
from pdfminer.high_level import extract_pages
from pdfminer.layout import LTTextBox, LTPage
from tqdm import tqdm
# Leading list marker: a single digit or ASCII letter followed by "-", "•" or ".".
_LEADING_MARKER_RE = re.compile(r"^\s*[\da-zA-Z]\s*[-•\.]\s*")
# Trailing number, optionally preceded by "-", "." or "*".
_TRAILING_NUMBER_RE = re.compile(r"\s*[-\.\*]?\s*\d+\s*$")
# A single non-space character with no word character on either side,
# except "u", "ü", "m", "O" and "o", which are kept.
_STRAY_CHAR_RE = re.compile(r"(?<![\wü])[^uümOo\s](?![\wü])")


def filter_text_boxes(page: LTPage, max_y_ratio: float = .95, min_y_ratio: float = .08, bottom_y_ratio: float = .2,
                      max_width_ratio: float = .6, hor_dim_threshold_ratio: float = .1,
                      max_special_chars_ratio: float = .3, min_characters: int = 10,
                      max_digit_ratio: float = .05) -> Tuple[List[str], List[LTTextBox]]:
    """Return the body text of a page, dropping headers, footnotes and side notes.

    Text boxes are first filtered by position and size, then every line of a
    surviving box is cleaned and filtered individually. A box is reported only
    if at least one of its lines survives.

    Args:
        page: Page layout whose ``LTTextBox`` children are inspected.
        max_y_ratio: Boxes whose ``y0`` exceeds ``page.height * max_y_ratio``
            are skipped.
        min_y_ratio: Boxes whose ``y0`` is below ``page.height * min_y_ratio``
            are skipped.
        bottom_y_ratio: Together with ``max_width_ratio``: a box whose ``y0``
            is below ``page.height * bottom_y_ratio`` and that is wider than
            ``page.width * max_width_ratio`` is skipped.
        max_width_ratio: See ``bottom_y_ratio``.
        hor_dim_threshold_ratio: Boxes narrower than
            ``page.width * hor_dim_threshold_ratio`` are skipped (short side
            notes).
        max_special_chars_ratio: A line is dropped when the number of
            characters that are neither alphanumeric nor whitespace exceeds
            this fraction of the line length.
        min_characters: A cleaned line shorter than this is dropped.
        max_digit_ratio: A line is dropped when its fraction of digit
            characters exceeds this value.

    Returns:
        A ``(texts, boxes)`` pair of equal length; ``texts[i]`` holds the
        surviving lines of ``boxes[i]`` joined with newlines.
    """
    # Absolute thresholds derived from the page dimensions.
    max_y = page.height * max_y_ratio
    min_y = page.height * min_y_ratio
    bottom_y = page.height * bottom_y_ratio
    max_width = page.width * max_width_ratio
    hor_dim_threshold = page.width * hor_dim_threshold_ratio

    texts: List[str] = []
    boxes: List[LTTextBox] = []

    for box in page:
        if not isinstance(box, LTTextBox):
            continue
        # Skip page numbers / footnotes located outside the vertical band.
        if box.y0 > max_y or box.y0 < min_y:
            continue
        # Skip narrow boxes (short side notes).
        if box.width < hor_dim_threshold:
            continue
        # Skip wide boxes that sit near the bottom of the page.
        if box.width > max_width and box.y0 < bottom_y:
            continue

        kept_lines = []
        for line in box.get_text().split('\n'):
            line = _LEADING_MARKER_RE.sub("", line)
            line = _TRAILING_NUMBER_RE.sub("", line)
            line = _STRAY_CHAR_RE.sub("", line)

            # An empty line has no words. Skipping it explicitly also keeps the
            # ratio below from dividing by zero when min_characters <= 0.
            if not line or len(line) < min_characters:
                continue

            # Drop lines dominated by digits.
            digit_count = sum(char.isdigit() for char in line)
            if digit_count / len(line) > max_digit_ratio:
                continue

            # Drop lines dominated by punctuation / symbols.
            special_count = sum(not char.isalnum() and not char.isspace() for char in line)
            if special_count > len(line) * max_special_chars_ratio:
                continue

            kept_lines.append(line)

        if kept_lines:
            texts.append('\n'.join(kept_lines))
            boxes.append(box)

    return texts, boxes
def analyze_pdf(file_name: str, start_page: Optional[int] = None, end_page: Optional[int] = None, **kwargs):
    """Count the words on each page of a PDF and print summary statistics.

    Each page is passed through ``filter_text_boxes`` and the remaining text
    is split on whitespace to count words. The total, the per-page average and
    the pages with the highest and lowest counts are printed to stdout.

    Args:
        file_name: Path of the PDF file to analyze.
        start_page: 1-based number of the first page to analyze. ``None`` or
            ``0`` starts at the first page.
        end_page: 1-based number of the last page to analyze (inclusive).
            ``None`` reads to the end of the document.
        **kwargs: Forwarded unchanged to ``filter_text_boxes``.
    """
    # Resolve the first page once so that slicing and the reported page
    # numbers always agree, including when only end_page is given.
    first_page = start_page if start_page else 1
    page_iterator = islice(extract_pages(file_name), first_page - 1, end_page)

    pages_word_counts = []
    max_word_count = min_word_count = None
    max_word_page = min_word_page = None

    for page_number, page in enumerate(tqdm(page_iterator, desc="Analyzing PDF"), start=first_page):
        texts, _ = filter_text_boxes(page, **kwargs)
        word_count = sum(len(text.split()) for text in texts)
        pages_word_counts.append(word_count)

        # Strict comparisons keep the first page on ties.
        if max_word_count is None or word_count > max_word_count:
            max_word_count = word_count
            max_word_page = page_number
        if min_word_count is None or word_count < min_word_count:
            min_word_count = word_count
            min_word_page = page_number

    # Nothing was read (empty document or a range past the last page):
    # there is no max/min page to report and the mean is undefined.
    if not pages_word_counts:
        print("*** PDF Analysis Results ***\n"
              "No pages were found in the requested range.\n")
        return

    total_word_count = sum(pages_word_counts)
    # Average number of words per page.
    mean_word_count = np.mean(pages_word_counts)

    # Print the analysis results.
    print(f"*** PDF Analysis Results ***\n"
          f"Total word count: {total_word_count}\n"
          f"Average word count per page: {mean_word_count:.2f}\n"
          f"Max word count on a page: {max_word_count} (Page: {max_word_page})\n"
          f"Min word count on a page: {min_word_count} (Page: {min_word_page})\n")
if __name__ == "__main__":
    # Command-line entry point: every option except the file and the page
    # range is named after the filter_text_boxes keyword it is forwarded to.
    cli = argparse.ArgumentParser(description="Analyze a PDF file.")
    cli.add_argument("file", type=str, help="PDF file to be analyzed.")

    # (flag, type, default, help) for each optional argument, in display order.
    option_specs = (
        ("--start", int, None, "Start page for analysis."),
        ("--end", int, None, "End page for analysis."),
        ("--max_y_ratio", float, 0.95, "Maximum y ratio for text box placement."),
        ("--min_y_ratio", float, 0.08, "Minimum y ratio for text box placement."),
        ("--bottom_y_ratio", float, 0.2, "Bottom y ratio for text box placement."),
        ("--max_width_ratio", float, 0.6, "Maximum width ratio for text boxes."),
        ("--hor_dim_threshold_ratio", float, 0.1, "Horizontal dimension threshold ratio for short side notes."),
        ("--max_special_chars_ratio", float, 0.3, "Maximum ratio of special characters in a text box."),
        ("--min_characters", int, 10, "Minimum number of characters in a text box."),
        ("--max_digit_ratio", float, 0.05, "Maximum ratio of digits in a text box."),
    )
    for flag, value_type, default_value, help_text in option_specs:
        cli.add_argument(flag, type=value_type, default=default_value, help=help_text)

    options = vars(cli.parse_args())
    pdf_path = options.pop("file")
    first = options.pop("start")
    last = options.pop("end")
    # Whatever remains in `options` are the filter thresholds.
    analyze_pdf(pdf_path, start_page=first, end_page=last, **options)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment