Skip to content

Instantly share code, notes, and snippets.

@MartinThoma
Last active August 29, 2015 14:20
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save MartinThoma/cc2f3e8a9a194ded4f45 to your computer and use it in GitHub Desktop.
Save MartinThoma/cc2f3e8a9a194ded4f45 to your computer and use it in GitHub Desktop.
Transform the format of a file to move www.saai.macbay.de/saai to kit.edu
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
Transform contents of
http://www.saai.macbay.de/saai/bestaende_aichele_manfred.html
into a new format so that it can be published on kit.edu
"""
from __future__ import unicode_literals
from six.moves.urllib.request import urlopen
from bs4 import BeautifulSoup
import sys
reload(sys)
sys.setdefaultencoding('utf-8')
import os
import logging
import sys
# Send every record from DEBUG upwards to standard output, prefixed with
# a timestamp and the level name.
logging.basicConfig(
    stream=sys.stdout,
    level=logging.DEBUG,
    format='%(asctime)s %(levelname)s %(message)s',
)
def get_content(filename):
    """Get text from a website / text file.

    Parameters
    ----------
    filename : string
        Either a URL starting with ``http://`` or ``https://`` or the path
        of a local file.

    Returns
    -------
    string :
        contents of file / webpage, exactly as returned by ``read()``
    """
    # Match the full scheme prefix: a local file that merely happens to be
    # called e.g. "http_notes.txt" must not be handed to urlopen.
    if filename.startswith(("http://", "https://")):  # Internet
        from contextlib import closing

        # closing() releases the connection even if read() raises, and does
        # not require the response object to be a context manager itself.
        with closing(urlopen(filename)) as response:
            return response.read()

    # local
    with open(filename) as f:
        return f.read()
def transform(text):
    """Transform the text from one HTML structure to another HTML structure.

    The element with id "a" is searched for elements of class "bestaende".
    The first two of them are treated as parallel columns (dates and
    descriptions) whose lines are separated by ``<br/>``. Consecutive
    description lines without a date of their own are attached to the
    previous dated row.

    Parameters
    ----------
    text : string
        HTML of the source page.

    Returns
    -------
    string
        HTML consisting of a "Kurzbiografie" table built from the parsed
        rows, followed by fixed "Werkauswahl" and "Literaturauswahl"
        tables.

    Raises
    ------
    ValueError
        If the page has no element with id "a" or that element contains
        fewer than two elements of class "bestaende".
    """
    # NOTE(review): no parser is named here, so BeautifulSoup picks the best
    # one installed. Kept as-is to avoid changing how pages are parsed.
    soup = BeautifulSoup(text)
    biography = soup.find(id="a")
    if biography is None:
        raise ValueError('The document has no element with id="a".')

    columns = []
    for tag in biography.find_all(class_="bestaende"):
        cells = ''.join([str(t) for t in tag.contents]).split("<br/>")
        columns.append([cell.strip() for cell in cells])
    if len(columns) < 2:
        raise ValueError('Expected at least two elements of class '
                         '"bestaende", found %d.' % len(columns))

    # Get correct date / text relationship
    table = []
    for date, line in zip(columns[0], columns[1]):
        # A non-empty date starts a new row. So does the very first line,
        # even when its date is empty, because there is no previous row it
        # could be attached to.
        if date != '' or not table:
            table.append({'date': date, 'text': []})
        table[-1]['text'].append(line)

    # create the new text format from the parsed data in `table`.
    parts = ["""<table cellpadding="1" cellspacing="1" style="width:562px">
\t<caption>
\t<p>Kurzbiografie</p>
\t</caption>
\t<tbody>\n"""]
    for el in table:
        parts.append('\t\t<tr>\n')
        parts.append('\t\t\t<td width="100">%s</td>\n' % el['date'])
        parts.append('\t\t\t<td>%s</td>\n'
                     % '<br />\n\t\t\t'.join(el['text']))
        parts.append("\t\t</tr>\n")
    parts.append("""\t</tbody>
</table>\n\n""")
    # TODO: Is this always the case?
    parts.append("""<table cellpadding="1" cellspacing="1" style="width:562px">
</table>
<table cellpadding="1" cellspacing="1" style="width:562px">
\t<caption>
\t<p>Werkauswahl</p>
\t</caption>
\t<tbody>
\t\t<tr>
\t\t\t<td>siehe Fiedler Aichele</td>
\t\t</tr>
\t</tbody>
</table>
<p></p>
<table cellpadding="1" cellspacing="1" style="width:562px">
\t<caption>Literaturauswahl</caption>
\t<tbody>
\t\t<tr>
\t\t\t<td>in Bearbeitung</td>
\t\t</tr>
\t</tbody>
</table>
<p></p>
""")
    return ''.join(parts)
def save(filename, contents):
    """Write contents to a file as UTF-8 text, replacing an existing file.

    Parameters
    ----------
    filename : string
        Path of the file to write.
    contents : string
        Text to store. A byte string is accepted as well and is interpreted
        as UTF-8.
    """
    import io

    if isinstance(contents, bytes):
        contents = contents.decode('utf-8')
    # Name the encoding explicitly so the output does not depend on the
    # platform's default encoding.
    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(contents)
def transform_one(url, output_folder):
    """Fetch one page, convert it and store the result as a text file.

    Parameters
    ----------
    url : string
        For example http://www.saai.macbay.de/saai/bestaende_aichele_manfred.html
    output_folder : string
        Folder in which the file ``<page name>.txt`` is written.
    """
    # The page name is the last path segment, cut at its first dot.
    last_segment = url.rsplit('/', 1)[-1]
    name = last_segment.partition('.')[0]

    converted = transform(get_content(url))

    target_filename = os.path.join(output_folder, name + '.txt')
    save(target_filename, converted)
    logging.info("Saved contents of '%s' to '%s'.", url, target_filename)
def is_valid_folder(parser, arg):
    """Return the absolute path of arg if it is an existing folder.

    Otherwise the problem is reported through ``parser.error``.
    """
    folder = os.path.abspath(arg)
    if os.path.isdir(folder):
        return folder
    parser.error("The folder %s does not exist!" % folder)
def get_parser():
    """Build the command line parser of this script.

    The parser has a required ``-u/--url`` option and an optional
    ``-o/--output`` option that defaults to the folder containing this
    script and is checked with ``is_valid_folder``.
    """
    import argparse

    parser = argparse.ArgumentParser(
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
        description=__doc__)

    url_example = ("e.g. http://www.saai.macbay.de/saai/"
                   "bestaende_aichele_manfred.html")
    parser.add_argument("-u", "--url",
                        metavar="URL",
                        required=True,
                        help=url_example,
                        dest="url")

    script_folder = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument("-o", "--output",
                        metavar="FOLDER",
                        default=script_folder,
                        type=lambda x: is_valid_folder(parser, x),
                        help="folder in which the transformed files will be put",
                        dest="output_folder")
    return parser
if __name__ == "__main__":
    parser = get_parser()
    # Called without any command line argument: show the usage text and
    # exit with a failure status instead of letting argparse complain.
    no_arguments_given = len(sys.argv) == 1
    if no_arguments_given:
        parser.print_help()
        sys.exit(1)
    args = parser.parse_args()
    transform_one(url=args.url, output_folder=args.output_folder)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment