Last active
August 29, 2015 14:26
-
-
Save pepoluan/04f61a3e30dc78071548 to your computer and use it in GitHub Desktop.
This script pulls the timeline from the "Timeline of Canon Media" article on Wookieepedia and extracts the information into a CSV file that Excel can open directly (without using the Text Import Wizard).
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python | |
from __future__ import print_function, division, unicode_literals | |
__author__ = 'Pepoluan' | |
import sys | |
import codecs | |
import re | |
import datetime | |
import requests | |
from bs4 import BeautifulSoup | |
import bs4.element | |
# noinspection PyUnresolvedReferences | |
from html5lib import parse # Not actually used, just to indicate to user that html5lib needs to be installed | |
def _get_input(what, default=None): | |
print('\n{0} (default: "{1}")'.format(what, default)) | |
inp = raw_input('[Enter accepts default]: ').strip() | |
return inp or default | |
class G(object):
    """Run-time settings; each one is asked from the user while this class body executes."""

    # Address of the article that contains the timeline table
    URL = _get_input(
        'URL for "Timeline of Canon Media"',
        'http://starwars.wikia.com/wiki/Timeline_of_canon_media',
    )
    # Path of the CSV file to be written
    OutFile = _get_input('Destination CSV file', 'D:\\TimelineCanon.csv')
    # Separator placed between the fields of a CSV line
    CSV_Sep = _get_input('CSV file separator', ',')
class CanonMediaEntry(object):
    """
    One row of the timeline table, converted into text fields ready for CSV output

    Columns holds the CSV header names and is filled in by SetColumns().
    """

    # Footnote markers such as "[12]", which are removed from the cell text
    RE_Note = re.compile(r'\[[0-9]+\]')
    Columns = []

    # Used for every release-date component that is missing or not numeric.
    # 28 instead of 31 because Feb might be only 28 days long
    _DEFAULT_RELEASE = (9999, 12, 28)

    # (title prefix, asset label) pairs; _deduce_asset() checks them in this order
    _ASSET_PREFIXES = (
        ('The Clone Wars', 'TCW'),
        ('Star Wars Rebels', 'SWR'),
        ('Star Wars: Kanan', 'Kanan'),
        ('Star Wars: Princess Leia', 'Leia'),
        ('Star Wars: Darth Vader', 'Vader'),
    )

    def __init__(self, timeline_row, order_count):
        """
        Extracts the fields of one timeline row

        :param timeline_row: the row element; its first five <td> cells are read as
                             year, media type, title, writers and release date
        :type timeline_row: bs4.element.Tag
        :param order_count: ordinal of the row, emitted in the 'ChronOrd' column
        :type order_count: int
        """
        assert isinstance(timeline_row, bs4.element.Tag)
        self._timeline_row = timeline_row  # For debugging
        self.cells = timeline_row.find_all('td')
        assert isinstance(order_count, int)
        self.ord = str(order_count)
        self.year = self._getcleantext(0)
        self.mtype = self._getcleantext(1)
        self.title = self._getcleantext(2)
        self.writers = self._getcleantext(3)
        self.release = self._getcleantext(4)
        # These two are derived from the text fields above, so they must come last
        self.release_date = self._parse_release()
        self.asset = self._deduce_asset()

    @property
    def is_released(self):
        """'Yes' if the release date is today or earlier, 'No' otherwise"""
        return 'Yes' if datetime.date.today() >= self.release_date else 'No'

    def _getcleantext(self, index):
        """
        Returns the text of a cell, stripped, without footnote markers, and with
        en/em dashes replaced by a plain '-'

        :param index: position of the cell in self.cells
        :type index: int
        """
        t = self.cells[index].text.strip()
        t = CanonMediaEntry.RE_Note.sub('', t)
        t = t.replace('\u2013', '-').replace('\u2014', '-')
        return t

    def _deduce_asset(self):
        """
        Derives the 'Asset' label from the title (and, for movies, the media type)

        :return: the label, or '' if the title matches none of the known patterns
        """
        title = self.title
        for prefix, label in self._ASSET_PREFIXES:
            if title.startswith(prefix):
                return label
        if self.mtype == 'F' and 'Episode' in title:
            return 'Movies'
        return ''

    def _parse_release(self):
        """
        Converts self.release (expected as 'YYYY-MM-DD', possibly incomplete) into a date

        Components that are missing or not numeric take the matching value of
        9999-12-28, so an unknown release ends up far in the future. Anything after
        the third dash-separated component is ignored. If the components do not form
        a valid calendar date, the row is printed for debugging and 9999-12-28 is returned.

        :rtype: datetime.date
        """
        parts = list(self._DEFAULT_RELEASE)
        # Only the first three components can be year, month and day
        for pos, field in enumerate(self.release.split('-')[:3]):
            field = field.strip()
            if field.isdigit():
                parts[pos] = int(field)
        try:
            return datetime.date(*parts)
        except ValueError:
            # datetime.date() rejects impossible dates (e.g. Feb 30) with ValueError
            print(self._timeline_row)
            return datetime.date(*self._DEFAULT_RELEASE)

    # noinspection PyPep8Naming
    @classmethod
    def SetColumns(cls, lst):
        """
        Sets the CSV header names

        :param lst: header names of the table's own columns; they are placed between
                    the added 'ChronOrd'/'Asset' columns and the added 'Rls?' column
        :type lst: list
        """
        # IMPORTANT: Changing the headers here should be reflected in the to_list() method!
        cls.Columns = ['ChronOrd', 'Asset']
        cls.Columns.extend(lst)
        cls.Columns.append('Rls?')

    def to_list(self):
        """
        Returns the fields of this entry in column order, with every double quote doubled

        :rtype: list
        """
        # IMPORTANT: These should match the columns in SetColumns() classmethod!
        # That is why instead of directly building a list literal, we reflect how the statements in SetColumns()
        # are structured
        lst1 = [self.ord, self.asset]
        lst1.extend([self.year, self.mtype, self.title, self.writers, self.release])
        lst1.append(self.is_released)
        # A real list (not a lazy map object) so the result can be indexed and reused
        return [field.replace('"', '""') for field in lst1]
class _StringCleaner(object): | |
DefaultCleaners = [] | |
def __init__(self, s, cleaners=None): | |
""" | |
Initializes the StringCleaner Class | |
:param s: String to clean, may be str() or unicode() | |
:param cleaners: (optional) list of tuples(regex, regex_flags, replacement) | |
:type cleaners: list | |
""" | |
self.__contents = s | |
__cleaners = list(_StringCleaner.DefaultCleaners) | |
if cleaners: | |
assert isinstance(cleaners, list) | |
__cleaners.extend(cleaners) | |
for c in __cleaners: | |
rex = re.compile(*c[0:2]) | |
self.__contents = rex.sub(c[2], self.__contents) | |
@property | |
def text(self): | |
return self.__contents | |
def resub_if(self, cond, regex_param_tuple, replacement): | |
if cond: | |
rex = re.compile(*regex_param_tuple) | |
self.__contents = rex.sub(replacement, self.__contents) | |
def contains(self, s): | |
return s in self.__contents | |
def __str__(self): | |
return self.__contents.encode('utf-8') if isinstance(self.__contents, unicode) else self.__contents | |
def get_timeline_article(url, replace_br=True, replace_dash=True, timeout=60):
    """
    Downloads the "Timeline of Canon Media" article and returns its cleaned-up HTML

    Exits the program with status 1 if the downloaded page does not contain the
    text 'Episode IX', which is taken as a sign that the page is truncated.

    :param url: address of the article
    :param replace_br: if true, every <br> tag is replaced with a newline
    :param replace_dash: if true, en and em dashes are replaced with '-'
    :param timeout: seconds to wait for the server, passed on to requests.get();
                    without a timeout a stalled connection would block forever
    :return: the text of the page after the replacements
    """
    print('\nDownloading "Timeline of Canon Media" article from {0}...'.format(url), end='')
    # The message has no trailing newline, so flush it to make it visible while the download blocks
    sys.stdout.flush()
    resp = requests.get(url, timeout=timeout)
    the_page = _StringCleaner(resp.text)
    the_page.resub_if(replace_br, (r'<br\s*/?>', re.IGNORECASE), '\n')
    # Excel does not understand unicode dashes, so replace with 'standard' dash
    the_page.resub_if(replace_dash, (u'\u2013|\u2014', re.UNICODE), '-')
    if not the_page.contains('Episode IX'):
        print('ERROR!\nThe page seems to be truncated!')
        sys.exit(1)
    print(' done.')
    return the_page.text
def get_timeline_table(page):
    """
    Finds the Timeline table in the HTML of the article

    The table is recognised by its first row, whose markup has to contain
    'Year', 'Title' and 'Writer'. Exits the program with status 1 if no table matches.

    :param page: HTML text of the article
    :return: the first matching <table> element
    """
    print('Looking for the Timeline table...', end='')
    markers = ('Year', 'Title', 'Writer')
    found = None
    for candidate in BeautifulSoup(page, 'html5lib').find_all('table'):
        first_row = str(candidate.tr)
        if all(marker in first_row for marker in markers):
            found = candidate
            break
    if found is not None:
        print(' done.')
        return found
    print('ERROR!')
    print('I can\'t find the Timeline Table!')
    sys.exit(1)
def get_timeline_entries(table):
    """
    Converts the rows of the Timeline table into CanonMediaEntry objects

    The first <tr> is treated as the header row: the texts of its <th> cells become
    the CSV column names (an empty header becomes 'Col<position>'). Every other
    <tr> becomes one entry, numbered by its row position in the table.

    :param table: the table element; only its find_all() method is used here
    :return: the entries, in table order
    :rtype: list
    """
    print('Parsing the Timeline table...', end='')
    entries = []
    for linenum, row in enumerate(table.find_all('tr')):
        print('.', end='')
        if linenum == 0:
            cols = []
            for col, h in enumerate(row.find_all('th'), start=1):
                # get_text() rather than .string: .string is None for a cell that is
                # empty or has several child nodes, and None has no strip()
                head = h.get_text().strip()
                cols.append(head or 'Col{0}'.format(col))
            CanonMediaEntry.SetColumns(cols)
        else:
            entries.append(
                CanonMediaEntry(timeline_row=row, order_count=linenum)
            )
    # Count the entries only; the header row is not a record
    print('\nTotal {0} records processed.'.format(len(entries)))
    return entries
def join_enquote(lst):
    """
    Builds one CSV line: every item is wrapped in double quotes, and the quoted
    items are joined with G.CSV_Sep

    :param lst: the items of the line
    :return: the joined line, without a line terminator
    """
    quoted = []
    for item in lst:
        quoted.append('"{0}"'.format(item))
    return G.CSV_Sep.join(quoted)
def main():
    """
    Downloads the article, parses its Timeline table, and saves the entries as CSV

    The file G.OutFile is written as UTF-8 with a signature (BOM): first a header
    line, then one line per entry. Exits with status 1 on an IOError.
    """
    page = get_timeline_article(G.URL)
    table = get_timeline_table(page)
    entries = get_timeline_entries(table)
    try:
        with codecs.open(G.OutFile, 'w', encoding='utf-8-sig') as fout:
            print('Saving the parsed Timeline table into file "{0}"...'.format(G.OutFile), end='')
            written = 1  # The header line
            try:
                print(join_enquote(CanonMediaEntry.Columns), file=fout)
                # Entries are lines 2, 3, ...; after the loop 'written' is the total line count
                for written, entry in enumerate(entries, start=2):
                    assert isinstance(entry, CanonMediaEntry)
                    print(join_enquote(entry.to_list()), file=fout)
                print(' {0} lines.'.format(written))
            except:
                # Only finishes the pending progress line; the exception is re-raised unchanged
                print('ERROR!')
                raise
    except IOError:
        print('ERROR trying to open file "{0}"'.format(G.OutFile))
        print('Is it open somewhere?')
        sys.exit(1)
# Run the export only when this file is executed as a script
if __name__ == '__main__':
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment