Skip to content

Instantly share code, notes, and snippets.

@pepoluan
Last active August 29, 2015 14:26
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save pepoluan/04f61a3e30dc78071548 to your computer and use it in GitHub Desktop.
Save pepoluan/04f61a3e30dc78071548 to your computer and use it in GitHub Desktop.
This script pulls the timeline from the "Timeline of Canon Media" article on Wookieepedia and extracts the information into a CSV file that Excel can open directly (without using the Text Import Wizard).
#!/usr/bin/env python
from __future__ import print_function, division, unicode_literals
__author__ = 'Pepoluan'
import sys
import codecs
import re
import datetime
import requests
from bs4 import BeautifulSoup
import bs4.element
# noinspection PyUnresolvedReferences
from html5lib import parse # Not actually used, just to indicate to user that html5lib needs to be installed
def _get_input(what, default=None):
print('\n{0} (default: "{1}")'.format(what, default))
inp = raw_input('[Enter accepts default]: ').strip()
return inp or default
class G(object):
    """
    Script-wide settings.

    NOTE: every attribute is filled in by an interactive console prompt that
    runs while this class body is being executed.
    """
    # Address of the wiki article that holds the timeline table
    URL = _get_input(
        'URL for "Timeline of Canon Media"',
        'http://starwars.wikia.com/wiki/Timeline_of_canon_media'
    )
    # Path of the CSV file that main() writes
    OutFile = _get_input('Destination CSV file', 'D:\\TimelineCanon.csv')
    # Field separator used by join_enquote()
    CSV_Sep = _get_input('CSV file separator', ',')
class CanonMediaEntry(object):
    """
    One data row of the timeline table, converted to plain text fields.

    The first five ``<td>`` cells of the row are read as year, media type,
    title, writers and release date.  Two extra values are derived from them:
    ``asset`` (a short series label) and ``release_date`` (a ``datetime.date``).
    """
    # Footnote markers such as "[12]" that are embedded in the cell text
    RE_Note = re.compile(r'\[[0-9]+\]')
    # CSV header names; filled in by SetColumns()
    Columns = []
    # (title prefix, asset label) pairs, checked in this order by _deduce_asset()
    _ASSET_PREFIXES = (
        ('The Clone Wars', 'TCW'),
        ('Star Wars Rebels', 'SWR'),
        ('Star Wars: Kanan', 'Kanan'),
        ('Star Wars: Princess Leia', 'Leia'),
        ('Star Wars: Darth Vader', 'Vader'),
    )
    # Year/month/day used for any part of the release date that is missing.
    # 28 instead of 31 because Feb might be only 28 days long
    _UNKNOWN_DATE = (9999, 12, 28)

    def __init__(self, timeline_row, order_count):
        """
        :param timeline_row: The ``<tr>`` element of the row
        :type timeline_row: bs4.element.Tag
        :param order_count: Chronological position of the row in the table
        :type order_count: int
        """
        assert isinstance(timeline_row, bs4.element.Tag)
        self._timeline_row = timeline_row  # For debugging
        self.cells = timeline_row.find_all('td')
        assert isinstance(order_count, int)
        self.ord = str(order_count)
        self.year = self._getcleantext(0)
        self.mtype = self._getcleantext(1)
        self.title = self._getcleantext(2)
        self.writers = self._getcleantext(3)
        self.release = self._getcleantext(4)
        self.release_date = self._parse_release()
        self.asset = self._deduce_asset()

    @property
    def is_released(self):
        """'Yes' if the release date is today or earlier, otherwise 'No'."""
        release_date = self.release_date
        # A missing date cannot be compared; treat it as not yet released
        if release_date is None:
            return 'No'
        return 'Yes' if datetime.date.today() >= release_date else 'No'

    def _getcleantext(self, index):
        """Return the text of cell ``index`` without footnote markers or unicode dashes."""
        text = self.cells[index].text.strip()
        text = CanonMediaEntry.RE_Note.sub('', text)
        return text.replace('\u2013', '-').replace('\u2014', '-')

    def _deduce_asset(self):
        """Return a short series label deduced from the title, or '' if none applies."""
        title = self.title
        for prefix, label in self._ASSET_PREFIXES:
            if title.startswith(prefix):
                return label
        if self.mtype == 'F' and 'Episode' in title:
            return 'Movies'
        return ''

    def _parse_release(self):
        """
        Convert the release text ("YYYY", "YYYY-MM" or "YYYY-MM-DD") to a date.

        Missing or non-numeric parts are replaced by the matching part of
        9999-12-28.  If the resulting values do not form a valid date, the row
        is printed (for debugging) and 9999-12-28 is returned.

        :rtype: datetime.date
        """
        # Only the first three dash-separated fields can be year/month/day;
        # anything after them is ignored
        fields = [field.strip() for field in self.release.split('-')[:3]]
        fields.extend([''] * (3 - len(fields)))
        try:
            parts = [int(field) if field.isdigit() else fallback
                     for field, fallback in zip(fields, self._UNKNOWN_DATE)]
            return datetime.date(*parts)
        except (TypeError, ValueError, OverflowError):
            # datetime.date() raises ValueError for out-of-range values (month
            # 13, 31 February, year 0...), not TypeError
            print(self._timeline_row)
            return datetime.date(*self._UNKNOWN_DATE)

    # noinspection PyPep8Naming
    @classmethod
    def SetColumns(cls, lst):
        """
        Build the CSV header row around the column names taken from the table.

        :param lst: Header names of the five table columns
        """
        # IMPORTANT: Changing the headers here should be reflected in the to_list() method!
        cls.Columns = ['ChronOrd', 'Asset']
        cls.Columns.extend(lst)
        cls.Columns.append('Rls?')

    def to_list(self):
        """
        Return the field values in header order, with double quotes doubled.

        :rtype: list
        """
        # IMPORTANT: These should match the columns in SetColumns() classmethod!
        # That is why instead of directly building a list literal, we reflect how the statements in SetColumns()
        # are structured
        values = [self.ord, self.asset]
        values.extend([self.year, self.mtype, self.title, self.writers, self.release])
        values.append(self.is_released)
        # A real list, not map(): map() is a one-shot iterator on Python 3
        return [value.replace('"', '""') for value in values]
class _StringCleaner(object):
DefaultCleaners = []
def __init__(self, s, cleaners=None):
"""
Initializes the StringCleaner Class
:param s: String to clean, may be str() or unicode()
:param cleaners: (optional) list of tuples(regex, regex_flags, replacement)
:type cleaners: list
"""
self.__contents = s
__cleaners = list(_StringCleaner.DefaultCleaners)
if cleaners:
assert isinstance(cleaners, list)
__cleaners.extend(cleaners)
for c in __cleaners:
rex = re.compile(*c[0:2])
self.__contents = rex.sub(c[2], self.__contents)
@property
def text(self):
return self.__contents
def resub_if(self, cond, regex_param_tuple, replacement):
if cond:
rex = re.compile(*regex_param_tuple)
self.__contents = rex.sub(replacement, self.__contents)
def contains(self, s):
return s in self.__contents
def __str__(self):
return self.__contents.encode('utf-8') if isinstance(self.__contents, unicode) else self.__contents
def get_timeline_article(url, replace_br=True, replace_dash=True, timeout=30):
    """
    Download the article and return its HTML as text.

    The script exits with status 1 if the download fails or if the page does
    not contain the text 'Episode IX' (taken as a sign of truncation).

    :param url: Address of the article
    :param replace_br: If true, replace every ``<br>`` tag with a newline
    :param replace_dash: If true, replace en and em dashes with '-'
    :param timeout: Seconds to wait for the server before giving up
    :return: The (cleaned) HTML of the page
    """
    print('\nDownloading "Timeline of Canon Media" article from {0}...'.format(url), end='')
    try:
        # Without a timeout requests may wait forever on a stalled connection
        resp = requests.get(url, timeout=timeout)
        # Turn HTTP error statuses (404, 500...) into an exception, so that an
        # error page is not reported below as a truncated article
        resp.raise_for_status()
    except requests.RequestException as exc:
        print('ERROR!\nCould not download the page: {0}'.format(exc))
        sys.exit(1)
    the_page = _StringCleaner(resp.text)
    the_page.resub_if(replace_br, (r'<br\s*/?>', re.IGNORECASE), '\n')
    # Excel does not understand unicode dashes, so replace with 'standard' dash
    the_page.resub_if(replace_dash, (u'\u2013|\u2014', re.UNICODE), '-')
    if not the_page.contains('Episode IX'):
        print('ERROR!\nThe page seems to be truncated!')
        sys.exit(1)
    print(' done.')
    return the_page.text
def get_timeline_table(page):
    """
    Find the timeline table in the downloaded page.

    The table is recognised by its first row, whose text must contain 'Year',
    'Title' and 'Writer'.  The script exits with status 1 if no table matches.

    :param page: HTML of the article
    :return: The ``<table>`` element of the timeline
    """
    print('Looking for the Timeline table...', end='')
    soup = BeautifulSoup(page, 'html5lib')
    timeline_table = None
    for candidate in soup.find_all('table'):
        first_row = candidate.tr
        if first_row is None:
            continue
        # Compare against the row's text rather than str(row): on Python 2
        # str() of a tag is a UTF-8 byte string, and testing a unicode literal
        # against it fails as soon as the row contains a non-ASCII character
        header_text = first_row.get_text()
        if 'Year' in header_text and 'Title' in header_text and 'Writer' in header_text:
            timeline_table = candidate
            break
    if timeline_table is None:
        print('ERROR!')
        print('I can\'t find the Timeline Table!')
        sys.exit(1)
    print(' done.')
    return timeline_table
def get_timeline_entries(table):
    """
    Convert the rows of the timeline table into CanonMediaEntry objects.

    The first row supplies the column names (passed to
    CanonMediaEntry.SetColumns); every following row becomes one entry whose
    order number is its row index.

    :param table: The ``<table>`` element of the timeline
    :return: List of CanonMediaEntry, in table order
    """
    print('Parsing the Timeline table...', end='')
    entries = []
    rows = table.find_all('tr')
    for linenum, row in enumerate(rows):
        print('.', end='')
        if linenum == 0:
            cols = []
            for col, header in enumerate(row.find_all('th'), 1):
                # get_text() instead of .string: .string is None when the
                # cell has more than one child (e.g. text plus a link)
                head = header.get_text().strip()
                # An empty header gets a positional name
                cols.append(head if head else 'Col{0}'.format(col))
            CanonMediaEntry.SetColumns(cols)
        else:
            entries.append(
                CanonMediaEntry(timeline_row=row, order_count=linenum)
            )
    print('\nTotal {0} records processed.'.format(len(rows)))
    return entries
def join_enquote(lst):
    """
    Build one CSV line: every item wrapped in double quotes, joined by G.CSV_Sep.

    Quotes inside the items are not escaped here.

    :param lst: Iterable of the values of one line
    :return: The line, without a trailing newline
    """
    quoted = []
    for item in lst:
        quoted.append('"{0}"'.format(item))
    separator = G.CSV_Sep
    return separator.join(quoted)
def main():
    """
    Download the article, extract the timeline and save it as a CSV file.

    The file is written as UTF-8 with a BOM.  An IOError raised while the file
    is open is reported and ends the script with status 1.
    """
    page = get_timeline_article(G.URL)
    table = get_timeline_table(page)
    entries = get_timeline_entries(table)
    destination = G.OutFile
    try:
        with codecs.open(destination, 'w', encoding='utf-8-sig') as fout:
            print('Saving the parsed Timeline table into file "{0}"...'.format(destination), end='')
            written = 1  # the header line
            try:
                header_line = join_enquote(CanonMediaEntry.Columns)
                print(header_line, file=fout)
                for entry in entries:
                    assert isinstance(entry, CanonMediaEntry)
                    entry_line = join_enquote(entry.to_list())
                    print(entry_line, file=fout)
                    written += 1
                print(' {0} lines.'.format(written))
            except:
                # Only finishes the progress line; the error itself is re-raised
                print('ERROR!')
                raise
    except IOError:
        print('ERROR trying to open file "{0}"'.format(destination))
        print('Is it open somewhere?')
        sys.exit(1)
# Run the scraper only when this file is executed as a script
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment