Skip to content

Instantly share code, notes, and snippets.

@mthh
Last active May 25, 2023 10:15
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save mthh/ed5426321a7066357ec4ca81165c40d9 to your computer and use it in GitHub Desktop.
Save mthh/ed5426321a7066357ec4ca81165c40d9 to your computer and use it in GitHub Desktop.
import PyPDF2
import os
def _list_pdf_files(directory: str) -> list:
    """Return the sorted names of the regular ``.pdf`` files in *directory*.

    Matching is done on the file extension (case-insensitive), so entries
    that merely contain "pdf" somewhere in their name, and sub-directories,
    are ignored.
    """
    return sorted(
        name
        for name in os.listdir(directory)
        if name.lower().endswith('.pdf')
        and os.path.isfile(os.path.join(directory, name))
    )


def _extract_pdf_text(pdf_path: str) -> str:
    """Return the concatenated text of every page of the PDF at *pdf_path*."""
    with open(pdf_path, 'rb') as stream:
        reader = PyPDF2.PdfReader(stream)
        # Guard against a falsy result (e.g. None) so the join cannot fail
        # on a page that yields no extractable text.
        return ''.join(page.extract_text() or '' for page in reader.pages)


def main(path_input: str, path_output: str) -> list:
    """Convert every PDF in *path_input* to a UTF-8 ``.txt`` file.

    Args:
        path_input: Folder containing the PDF files to convert.
        path_output: Folder receiving the text files; created if missing.

    Returns:
        The paths of the text files that were written, in processing order.

    Raises:
        OSError: If a folder or file cannot be read or written.
        Any exception raised by PyPDF2 while parsing a PDF propagates to the
        caller, so a broken input file is not silently skipped.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(path_output, exist_ok=True)

    written = []
    for file_name in _list_pdf_files(path_input):
        text = _extract_pdf_text(os.path.join(path_input, file_name))
        # splitext strips the real extension instead of assuming 4 characters.
        stem = os.path.splitext(file_name)[0]
        target = os.path.join(path_output, stem + '.txt')
        # Explicit encoding: extracted text routinely contains non-ASCII
        # characters and must not depend on the platform default.
        with open(target, 'w', encoding='utf-8') as out:
            out.write(text)
        written.append(target)
    return written


if __name__ == '__main__':
    # Folder that contains the PDF files.
    path_input = '/home/mthh/Téléchargements/'
    # Output folder that will receive the extracted .txt files.
    path_output = '/home/mthh/Téléchargements/output/'
    main(path_input, path_output)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment