Last active
August 29, 2015 14:20
-
-
Save MartinThoma/cc2f3e8a9a194ded4f45 to your computer and use it in GitHub Desktop.
Transform the format of a file to move www.saai.macbay.de/saai to kit.edu
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
# -*- coding: utf-8 -*- | |
""" | |
Transform contents of | |
http://www.saai.macbay.de/saai/bestaende_aichele_manfred.html | |
into a new format so that it get on kit.edu | |
""" | |
from __future__ import unicode_literals | |
from six.moves.urllib.request import urlopen | |
from bs4 import BeautifulSoup | |
import logging
import os
import sys

# Python 2 only: make UTF-8 the implicit str<->unicode codec so that calling
# str() on markup containing non-ASCII characters does not raise.
# Python 3 has neither the reload() builtin nor sys.setdefaultencoding() and
# already defaults to UTF-8, so the hack must be skipped there; running it
# unconditionally aborts the script with a NameError.
if sys.version_info[0] == 2:
    reload(sys)  # noqa: F821 -- builtin on Python 2
    sys.setdefaultencoding('utf-8')

# Log everything from DEBUG upwards to stdout, prefixed with timestamp/level.
logging.basicConfig(format='%(asctime)s %(levelname)s %(message)s',
                    level=logging.DEBUG,
                    stream=sys.stdout)
def get_content(filename):
    """Get text from a website / text file.

    Parameters
    ----------
    filename : string
        Anything starting with "http" is fetched with ``urlopen``; every
        other value is treated as the path of a local file.

    Returns
    -------
    string :
        contents of file / webpage. A web page is returned exactly as
        ``response.read()`` delivers it (not decoded here); a local file is
        decoded as UTF-8.
    """
    if filename.startswith("http"):  # Internet
        from contextlib import closing

        # closing() releases the connection even if read() raises, and also
        # works where the response object is not itself a context manager.
        with closing(urlopen(filename)) as response:
            return response.read()

    # local
    import io

    # Explicit encoding: the built-in open() would fall back to the locale's
    # preferred encoding, which differs between machines.
    with io.open(filename, encoding='utf-8') as f:
        return f.read()
def transform(text):
    """Transform the text from one HTML structure to another HTML structure.

    The element with ``id="a"`` is looked up in the input and every
    descendant with class ``bestaende`` is treated as one column whose rows
    are separated by ``<br/>``. The first column supplies the dates, the
    second the texts. A row with an empty date is appended to the entry of
    the closest preceding dated row.

    Parameters
    ----------
    text : string
        HTML source of the page to convert.

    Returns
    -------
    string
        The "Kurzbiografie" table built from the parsed rows, followed by
        the fixed "Werkauswahl" / "Literaturauswahl" boilerplate tables.

    Raises
    ------
    ValueError
        If the page has no element with ``id="a"`` or that element contains
        fewer than two ``bestaende`` columns.
    """
    # Name the parser explicitly: without it BeautifulSoup picks whichever
    # parser happens to be installed, so results can differ between machines.
    soup = BeautifulSoup(text, "html.parser")
    biography = soup.find(id="a")
    if biography is None:
        raise ValueError('No element with id="a" found in the document.')

    columns = []
    for tag in biography.find_all(class_="bestaende"):
        # Serialise the children back to markup so the <br/> separators can
        # be used to split the column into rows.
        cells = ''.join([str(t) for t in tag.contents]).split("<br/>")
        columns.append([cell.strip() for cell in cells])
    if len(columns) < 2:
        raise ValueError('Expected at least two elements with class '
                         '"bestaende" but found %d.' % len(columns))

    # Get correct date / text relationship
    table = []
    for date, line in zip(columns[0], columns[1]):
        # A non-empty date opens a new entry. An entry is also opened for a
        # leading row without a date, so its text is kept instead of
        # indexing into an empty list.
        if date != '' or not table:
            table.append({'date': date, 'text': []})
        table[-1]['text'].append(line)

    # create the new text format from the parsed data in `table`.
    parts = ["""<table cellpadding="1" cellspacing="1" style="width:562px">
\t<caption>
\t<p>Kurzbiografie</p>
\t</caption>
\t<tbody>\n"""]
    for row in table:
        parts.append('\t\t<tr>\n')
        parts.append('\t\t\t<td width="100">%s</td>\n' % row['date'])
        parts.append('\t\t\t<td>%s</td>\n'
                     % '<br />\n\t\t\t'.join(row['text']))
        parts.append("\t\t</tr>\n")
    parts.append("""\t</tbody>
</table>\n\n""")

    # TODO: Is this always the case?
    parts.append("""<table cellpadding="1" cellspacing="1" style="width:562px">
</table>
<table cellpadding="1" cellspacing="1" style="width:562px">
\t<caption>
\t<p>Werkauswahl</p>
\t</caption>
\t<tbody>
\t\t<tr>
\t\t\t<td>siehe Fiedler Aichele</td>
\t\t</tr>
\t</tbody>
</table>
<p></p>
<table cellpadding="1" cellspacing="1" style="width:562px">
\t<caption>Literaturauswahl</caption>
\t<tbody>
\t\t<tr>
\t\t\t<td>in Bearbeitung</td>
\t\t</tr>
\t</tbody>
</table>
<p></p>
""")
    return ''.join(parts)
def save(filename, contents):
    """Write `contents` to `filename`, replacing any existing file.

    Parameters
    ----------
    filename : string
        Path of the file to create or overwrite.
    contents : string
        Text to write; it is encoded as UTF-8.
    """
    import io

    # Explicit encoding: the built-in open() would use the locale's preferred
    # encoding, which can fail on the umlauts in the generated markup.
    with io.open(filename, 'w', encoding='utf-8') as f:
        f.write(contents)
def transform_one(url, output_folder):
    """Convert a single page and store the result as a ``.txt`` file.

    Parameters
    ----------
    url : string
        For example
        http://www.saai.macbay.de/saai/bestaende_aichele_manfred.html
    output_folder : string
        Folder in which the converted file is written. The file is named
        after the last path segment of `url`, cut at its first dot, with
        ``.txt`` appended.
    """
    last_segment = url.rsplit('/', 1)[-1]
    name = last_segment.partition('.')[0]
    target_filename = os.path.join(output_folder, '%s.txt' % name)
    save(target_filename, transform(get_content(url)))
    logging.info("Saved contents of '%s' to '%s'.", url, target_filename)
def is_valid_folder(parser, arg):
    """Return the absolute path of `arg` if it is an existing folder.

    Otherwise the problem is reported through ``parser.error``; if that call
    returns, nothing (``None``) is returned.
    """
    folder = os.path.abspath(arg)
    if os.path.isdir(folder):
        return folder
    parser.error("The folder %s does not exist!" % folder)
def get_parser(): | |
from argparse import ArgumentParser, ArgumentDefaultsHelpFormatter | |
parser = ArgumentParser(description=__doc__, | |
formatter_class=ArgumentDefaultsHelpFormatter) | |
parser.add_argument("-u", "--url", | |
dest="url", | |
help=("e.g. http://www.saai.macbay.de/saai/" | |
"bestaende_aichele_manfred.html"), | |
required=True, | |
metavar="URL") | |
parser.add_argument("-o", "--output", | |
dest="output_folder", | |
help=("folder in which the transformed files will be " | |
"put"), | |
type=lambda x: is_valid_folder(parser, x), | |
default=os.path.dirname(os.path.abspath(__file__)), | |
metavar="FOLDER") | |
return parser | |
if __name__ == "__main__":
    cli = get_parser()
    # Called without any argument: show the full help text and signal
    # failure through the exit status.
    if len(sys.argv) == 1:
        cli.print_help()
        sys.exit(1)
    options = cli.parse_args()
    transform_one(options.url, options.output_folder)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment