Skip to content

Instantly share code, notes, and snippets.

@drvenabili
Created March 25, 2016 13:11
Show Gist options
  • Save drvenabili/34cdfe24805c13474d43 to your computer and use it in GitHub Desktop.
Save drvenabili/34cdfe24805c13474d43 to your computer and use it in GitHub Desktop.
#!/usr/bin/python
# -*- coding: utf-8 -*-
import codecs
import re
import io
import os,glob
path = '*' # change this according to where the script is and where the alto files are

# Matches one CONTENT="..." attribute and captures its value. The leading \b
# stops the pattern from also matching the tail of SUBS_CONTENT="...", which
# would otherwise be written out as if it were a word of the text.
CONTENT_RE = re.compile(r'\bCONTENT="(.*?)"')


def extract_words(lines):
    """Return the value of every CONTENT="..." attribute found in *lines*.

    *lines* is any iterable of text lines (a list or an open text file).
    Values are returned in the order they appear; when one line carries
    several CONTENT attributes, all of them are returned, not only the first.
    """
    words = []
    for line in lines:
        words.extend(CONTENT_RE.findall(line))
    return words


def convert_file(chemin):
    """Write the CONTENT values of the file *chemin* to a sibling .txt file.

    The output file has the same name as *chemin* with its extension replaced
    by ".txt". Every extracted value is followed by a single space. Both
    files are read and written as UTF-8.

    Returns the path of the text file that was written.
    """
    # The input is read completely and closed before the output is opened,
    # so a decoding error never leaves a half-written text file behind.
    with io.open(chemin, 'r', encoding='utf-8') as source:
        words = extract_words(source)

    target = os.path.splitext(chemin)[0] + '.txt'
    with io.open(target, 'w', encoding='utf-8') as out:
        out.write(u''.join(word + u' ' for word in words))
    return target


def main(pattern=None):
    """Convert every .xml file matching the glob *pattern*.

    *pattern* defaults to the module-level ``path`` setting. Directories are
    skipped, and so is every file whose name does not end in ".xml": this
    keeps the script from processing itself and, more importantly, from
    reopening a previously generated .txt file as its own output and
    truncating it.

    Returns the list of text files that were written.
    """
    if pattern is None:
        pattern = path

    written = []
    for chemin in sorted(glob.glob(pattern)):
        if not os.path.isfile(chemin):
            continue  # we only want the files, not the directories
        if not chemin.lower().endswith('.xml'):
            continue
        written.append(convert_file(chemin))
    return written


if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment