Skip to content

Instantly share code, notes, and snippets.

@sergiolucero
Last active April 12, 2023 13:49
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save sergiolucero/e3253ac0d5f16e309963194cc4ecb967 to your computer and use it in GitHub Desktop.
Save sergiolucero/e3253ac0d5f16e309963194cc4ecb967 to your computer and use it in GitHub Desktop.
pdf legal conversion
import glob, fitz, pandas as pd
# Collect every PDF in the input folder and extract its plain text.
files = glob.glob('folder/*.pdf')
texts = [' '.join(page.get_text() for page in fitz.open(fn))
         for fn in files]

# One row per PDF: its path plus the space-joined text of all its pages.
df = pd.DataFrame(dict(file=files, text=texts))

# NOTE(review): remove_headandsentence and extract_fallo are not defined in
# this snippet; they must be defined or imported before this point, and are
# presumably called with the document text -- confirm their signatures.
# apply() needs the callable itself: a lambda that merely names the function
# (without calling it) would store the function object in every row.
df['cuerpo'] = df['text'].apply(remove_headandsentence)
df['fallo'] = df['text'].apply(extract_fallo)

df.to_csv('sentencias.csv', index=False)
# Total number of extracted characters across all documents.
print(sum(len(txt) for txt in texts))
@sergiolucero
Copy link
Author

legalbert

@sergiolucero
Copy link
Author

import boto3
import fitz
from datetime import datetime

# boto3 client for the 'translate' service; created once at import time.
translate = boto3.client('translate')

# Language codes passed to every translate_text request.
source_language_code = 'en'
target_language_code = 'es'

# Number of characters sent per translate_text request.
# NOTE(review): presumably chosen to stay below the service's per-request
# text size limit -- confirm against the Amazon Translate documentation.
SIZE_LIMIT = 9500

def get_text(fn):
    """Return the text of every page of the document *fn*, joined by spaces.

    Args:
        fn: Path of a document that ``fitz.open`` can read.

    Returns:
        A single string with each page's text separated by one space.
    """
    # The context manager closes the document (and its file handle) even if
    # text extraction fails part-way through.
    with fitz.open(fn) as doc:
        return ' '.join(page.get_text() for page in doc)

def xl8(filename):
    """Translate the text of a document chunk by chunk.

    The text returned by ``get_text`` is cut into consecutive slices of at
    most ``SIZE_LIMIT`` characters. Each slice is sent to
    ``translate.translate_text`` and the translated slices are concatenated
    in their original order.

    Args:
        filename: Path of the document to read via ``get_text``.

    Returns:
        The concatenated translation of the whole document, or an empty
        string when the document contains no text.
    """
    document_text = get_text(filename)
    translated_chunks = []

    # Stepping by SIZE_LIMIT up to the text length also yields the final,
    # shorter slice, so the tail of the document (or a document smaller than
    # one chunk) is translated too.
    # NOTE(review): slices are measured in characters and cut at arbitrary
    # positions, possibly mid-word; the service limit is believed to be
    # expressed in bytes -- confirm 9500 characters is safe for non-ASCII text.
    for chunk, start in enumerate(range(0, len(document_text), SIZE_LIMIT)):
        chunk_text = document_text[start:start + SIZE_LIMIT]
        translated_chunk = translate.translate_text(
            Text=chunk_text,
            SourceLanguageCode=source_language_code,
            TargetLanguageCode=target_language_code,
        )['TranslatedText']
        # Accumulate every chunk, not just the last one.
        translated_chunks.append(translated_chunk)

        if chunk % 20 == 10:  # progress report on chunks 10, 30, 50, ...
            timestamp = datetime.now().strftime('%H:%M:%S')
            print('[Bloque %d](%s) %s' % (chunk, timestamp, translated_chunk[:200]))

    return ''.join(translated_chunks)

def pdf2spa(filename):
    """Translate a document with ``xl8`` and save the result as a text file.

    The output path is *filename* with its final extension replaced by
    ``.txt``.

    Args:
        filename: Path of the document to translate.
    """
    import os

    txt = xl8(filename)
    # Swap only the final extension. str.replace('.pdf', '.txt') would also
    # rewrite '.pdf' inside directory names and, for a name without a
    # lowercase '.pdf' (e.g. 'DOC.PDF'), would return the input path itself,
    # so the source document would be overwritten.
    out_path = os.path.splitext(filename)[0] + '.txt'
    # Explicit UTF-8 so accented characters are written identically on every
    # platform; the with-block guarantees the file is flushed and closed.
    with open(out_path, 'w', encoding='utf-8') as out:
        out.write(txt)

@sergiolucero
Copy link
Author

pip install PyMuPDF

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment