Skip to content

Instantly share code, notes, and snippets.

@kadster
Created March 5, 2018 14:05
Show Gist options
  • Save kadster/2c251fb33ddd4497f21600c82ce16acb to your computer and use it in GitHub Desktop.
Save kadster/2c251fb33ddd4497f21600c82ce16acb to your computer and use it in GitHub Desktop.
import os
from bs4 import BeautifulSoup
folder = '/Users/nt/Desktop/Darwin'
files = os.listdir(folder)
tagnames = ['persName', 'date', 'keywords', 'abstract', 'text']
files = [f for f in files[:] if 'xml' in f]
output_strs = []
for fname in files:
with open(os.path.join(folder, fname), "r") as infile:
file_data = []
content = infile.read()
soup = BeautifulSoup(content,'xml')
for tagname in tagnames:
tag_data = soup.find(tagname)
if tag_data is None:
tag_text="MISSING"
else:
transcription = soup.find(type="transcription")
temp = tag_data.get_text()
if transcription is not None:
tag_text = transcription.get_text()
else:
tag_text = tag_data.get_text()
if tag_text is "":
tag_text="TRANSCRIPTION MISSING"
file_data.append(",\"" + tag_text + "\"")
output_strs.append("{}\t{}\n".format(fname, "\t".join(file_data).replace('\n', ' ')))
print(len(output_strs))
with open('/Users/nt/Desktop/text_trying.csv', 'w') as outfile:
outfile.writelines(output_strs)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment