Skip to content

Instantly share code, notes, and snippets.

@yig
Last active May 2, 2024 05:00
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save yig/d55eba6221997d12d94fe6976a357edd to your computer and use it in GitHub Desktop.
Save yig/d55eba6221997d12d94fe6976a357edd to your computer and use it in GitHub Desktop.
Converts a PDF file assumed to be a two-column ACM or CGF article to text. Ignores reviewer red numbering.
'''
# Author: Yotam Gingold <yotam@yotamgingold.com>
# License: CC0
# URL: <https://gist.github.com/yig/d55eba6221997d12d94fe6976a357edd>
## About
Converts a PDF file assumed to be a two-column ACM or CGF article to text. Ignores reviewer red numbering.
## Install
pip install pypdf==4.0.2
To remove hyphenation:
pip install spacy==3.7.4
python -m spacy download en_core_web_sm
## Usage
By default, saves to `.txt` next to the PDF. Pass `-` as the output file to print to stdout.
python3 sig2text.py [-metrics {ACM,CGF}] file.pdf [out.txt]
'''
import argparse
import sys

## Command-line interface: an input PDF, an optional output path, and the
## name of the page layout whose column bounding boxes should be used.
parser = argparse.ArgumentParser( description = 'Convert two-column ACM articles to text.' )
parser.add_argument( 'inpath', type = str, help = 'Path to input PDF file.' )
parser.add_argument( 'outpath', type = str, nargs = '?', default = None, help = 'Path to save output text (default is .txt next to the PDF). Pass `-` to print to stdout.' )
parser.add_argument( '-metrics', type = str, choices = ['ACM', 'CGF'], default = 'ACM', help = 'Which bounding boxes to use. Choices: ACM, CGF. Default is ACM.' )
args = parser.parse_args()

from pypdf import PdfReader

## When the extracted text itself is printed to stdout (`-`), status messages
## must not be mixed into it, so send them to stderr in that case.
status_stream = sys.stderr if args.outpath == '-' else sys.stdout

print( "Loading:", args.inpath, file = status_stream )
reader = PdfReader( args.inpath )
print( f"Using {args.metrics} metrics.", file = status_stream )
## Crop boxes, one entry per column, each keyed by layout name.
## Every box is [ left, right, top, bottom ].
## The boxes are kept as lists (not tuples) because the page loop
## assigns into the crop box it is currently using.
metrics = [
    ## Column 1
    dict(
        ACM = [ 43, 313, 75, 705 ],
        CGF = [ 41, 308, 78, 726 ],
    ),
    ## Column 2
    dict(
        ACM = [ 313, 575, 75, 705 ],
        CGF = [ 308, 575, 78, 726 ],
    ),
]
## Collect the text of every page, column 1 first and then column 2.
parts = []
for page_index, page in enumerate( reader.pages ):
    # Looked up once per page instead of once per text fragment.
    page_top = page.mediabox[3]

    def visit_crop( text, user_matrix, tm_matrix, font_dict, font_size ):
        '''
        Visitor passed to `page.extract_text()`. Appends `text` to `parts`
        when its position lies inside the current `crop` box.

        `crop` is read when the visitor is called, so rebinding it between
        the two `extract_text()` calls below switches columns.
        '''
        x, y = tm_matrix[4:6]
        # y is from the bottom, so flip it
        y = page_top - y

        ## Keep only what's inside the crop box
        if x < crop[0] or x > crop[1] or y < crop[2] or y > crop[3]: return

        ## Check for the noise on the first page.
        ## Raise the bottom of the crop box so later fragments below it are skipped.
        if page_index == 0 and text.startswith( "Permission to make digital or hard copies" ):
            crop[3] = y
            return

        parts.append( text )

    ## Column 1
    # left, right, top, bottom
    # Copy the box: `visit_crop` may shrink it on the first page, and that
    # must not leak into the shared `metrics` table used by later pages.
    crop = list( metrics[0][args.metrics] )
    page.extract_text( visitor_text = visit_crop )
    if parts: parts[-1] += '\n'

    ## Column 2
    # left, right, top, bottom
    crop = list( metrics[1][args.metrics] )
    page.extract_text( visitor_text = visit_crop )
    if parts: parts[-1] += '\n'

text_body = "".join( parts )
## Remove hyphenation
## This step is optional: it runs only if spaCy and its small English model load.
REMOVE_HYPHENATION = False
try:
    import spacy
    # Load the English NLP model
    nlp = spacy.load("en_core_web_sm")
    # Let's remove hyphenation!
    REMOVE_HYPHENATION = True
except Exception as exc:
    # Stay best-effort, but let KeyboardInterrupt/SystemExit propagate and
    # say why the step was skipped. stderr keeps `-` (stdout) output clean.
    import sys
    print( "Hyphenation removal disabled:", exc, file = sys.stderr )

if REMOVE_HYPHENATION:
    import re

    def is_english( word ):
        '''Return whether the spaCy document built from `word` reports English.'''
        doc = nlp( word )
        # Check if the language of the word is English
        # NOTE(review): `lang_` appears to be the language of the loaded
        # pipeline rather than a per-word detection, so with the English
        # model this is presumably always True -- confirm.
        return doc.lang_ == "en"

    # A word fragment, a hyphen, one or more newlines, then another fragment.
    pattern = re.compile( r'([a-zA-Z]+)-[\n]+([a-zA-Z]+)' )

    def replace_if_english( match ):
        '''
        Substitution callback for `pattern`.

        Joins the two fragments into one word placed after a single newline,
        or returns the matched text unchanged if `is_english()` rejects it.
        '''
        dehyphen = '\n' + match.group(1) + match.group(2)
        return dehyphen if is_english( dehyphen ) else match.group(0)

    original_text_body = text_body
    text_body = pattern.sub( replace_if_english, original_text_body )
## Write the result.
## With no output path given, save next to the PDF with a `.txt` suffix.
if args.outpath is None:
    from pathlib import Path
    args.outpath = Path( args.inpath ).with_suffix( '.txt' )

if args.outpath == '-':
    print( text_body )
else:
    # Explicit UTF-8: extracted PDF text often contains characters that a
    # platform-default legacy encoding cannot represent.
    with open( args.outpath, 'w', encoding = 'utf-8' ) as f:
        f.write( text_body )
    print( "Saved:", args.outpath )
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment