# ict4eo/wiki_google2rst.py

## wiki_google2rst.py
"""
    Convert a wikipage in Google wiki format to reStructuredText
    Version 1.0.2
    19 March 2014
    Derek Hohls, Meraka, CSIR

    Handles
    =======
        * internal comments (#name - no space)
        * bold / italics text
        * inline code
        * code blocks
        * internal cross-reference [[example]]
        * bullets
        * numbered list
        * headers (up to 4 levels deep)
        * tables
        * standalone hyperlinks (these stay as is)

    TODO
    ====
        indented bullets
        external hyperlinks with embedded link: `Python web site <http://www.python.org>`__
        named internal cross-reference: [[FutureModuleRoadmap | Future Roadmap]]

"""
import pprint


def process_header(txt):
    """Convert a Google-wiki header line into a reStructuredText title.

    The header level is the number of leading ``=`` characters, capped at
    four.  Each level gets its own underline character: ``=``, ``-``,
    ``^`` and ``~`` for levels 1 to 4.

    Args:
        txt: one wiki line such as ``== Title ==``, with or without a
            trailing newline.

    Returns:
        A list of three strings: the title text terminated by a newline,
        an underline exactly as long as the title, and a single space
        (kept so callers that concatenate the three items still work).

    Raises:
        ValueError: if ``txt`` does not start with ``=``.
    """
    underline_chars = '=-^~'
    line = txt.strip()
    depth = len(line) - len(line.lstrip('='))
    if depth == 0:
        raise ValueError('not a wiki header: %r' % txt)
    level = min(depth, len(underline_chars))
    # Remove the opening and closing markers, however many were typed,
    # and then the padding around the title text itself.
    title = line.strip('=').strip()
    # reST rejects an underline shorter than its title, so size the
    # underline from the title instead of slicing a fixed-width string.
    underline = underline_chars[level - 1] * len(title)
    return [title + '\n', underline, ' ']


def process_table(rows):
    """Render Google-wiki table rows as a reStructuredText grid table.

    Args:
        rows: list of wiki lines of the form ``|| a || b ||``.  A trailing
            newline and surrounding blanks on each line are ignored.

    Returns:
        A list of strings without newlines: a border line, then for every
        input row the row line followed by a border line.  An empty list
        is returned when ``rows`` is empty.
    """
    items = []
    for raw in rows:
        # strip('|') drops the outer cell delimiters; the inner '||'
        # delimiters are what the line is split on.
        line = raw.strip('\n').strip(' ').strip('|')
        items.append(line.split('||'))
    if not items:
        return []

    # Size every column from all rows, so a row that is longer than the
    # first one cannot overflow the width table.
    n_cols = max(len(item) for item in items)
    widths = [0] * n_cols
    for item in items:
        for col, cell in enumerate(item):
            widths[col] = max(widths[col], len(cell))

    border = '+' + ''.join('-' * width + '+' for width in widths)
    results = [border]
    for item in items:
        # Pad short rows with empty cells so every row matches the border.
        padded = item + [''] * (n_cols - len(item))
        cells = ''.join(cell.ljust(width) + '|'
                        for cell, width in zip(padded, widths))
        results.append('|' + cells)
        results.append(border)
    return results


def process_bullet(text, text_prior):
    """Return a bullet line unchanged (placeholder, still TODO).

    ``text_prior`` is accepted but not used yet; presumably it is meant
    for the indented-bullet handling that has not been written.
    """
    bullet_line = text
    return bullet_line


def process(file_in, file_out=None):
    """Convert a Google-wiki page to reStructuredText.

    Args:
        file_in: path of the wiki file to read.
        file_out: path of the reStructuredText file to write.  When it is
            empty or None the converted lines are printed instead.

    Line handling, in order of precedence:
        * lines between ``{{{`` and ``}}}`` become a literal block and
          keep their own indentation;
        * lines starting with ``||`` are collected and rendered as one
          grid table as soon as the run of table lines ends;
        * lines starting with ``#`` not followed by a space become
          comments; ``# `` lines become ``#. `` numbered items;
        * lines starting with ``=`` become section titles;
        * every other line only gets the inline markup replacements.
    """
    # Read everything up front so the input handle is closed promptly and
    # the output file is not opened (and truncated) before reading ends.
    with open(file_in) as inputfile:
        text_in = inputfile.readlines()

    # defaults
    text_out = []
    table_rows = []
    is_code = False
    txt_prior = ''

    def flush_table():
        """Emit the table collected so far, if any, and reset it."""
        if table_rows:
            text_out.extend("%s\n" % row for row in process_table(table_rows))
            del table_rows[:]

    # process rows
    for txt in text_in:
        _txt = txt.strip(' ')
        if is_code and txt[:3] != '}}}':
            # Indent the raw line: stripping it first would flatten the
            # code's own nesting.
            text_out.append('    %s' % txt)
        elif txt[:3] == '{{{':  # code start
            flush_table()
            is_code = True
            # reST needs a blank line between '::' and the literal block.
            text_out.append('::\n\n')
        elif txt[:3] == '}}}':  # code end
            text_out.append('\n')
            is_code = False
        else:
            # A comment is '#name' with no space; '# item' at column 0 is
            # a numbered list entry and must not be commented out.
            is_comment = txt[:1] == '#' and txt[:2] != '# '
            if _txt[:2] == '# ':  # numbered list
                _txt = _txt.replace('# ', '#. ', 1)
            if txt[:1] == '*':  # bullets
                _txt = process_bullet(txt, txt_prior)
            _txt = _txt.replace('__', '**')  # bold / strong
            _txt = _txt.replace('_', '*')  # italics
            _txt = _txt.replace('`', '``')  # inline
            _txt = _txt.replace('{{', '``').replace('}}', '``')  # inline
            _txt = _txt.replace('[', '[[').replace(']', ']]')  # cross-ref
            if txt[:2] == '||':  # table
                table_rows.append(_txt)
            else:
                # Any non-table line ends the current table, so the table
                # is written before the line that follows it.
                flush_table()
                if is_comment:
                    text_out.append('.. .. %s' % _txt)
                elif txt[:1] == '=':  # header
                    text_out.extend(process_header(_txt))
                    text_out.append('\n')
                else:
                    text_out.append(_txt)
        txt_prior = txt
    # A table on the last lines of the file has no following line.
    flush_table()

    # write output
    if file_out:
        with open(file_out, 'w') as outputfile:
            outputfile.writelines(text_out)
    else:
        for text in text_out:
            # Single-argument call form works on Python 2 and Python 3.
            print(text.strip('\n'))


if __name__ == '__main__':
    # Convert the sample page only when run as a script, so importing this
    # module for its functions does not read or write any file.
    process('text.wiki', 'text.rst')
    #process('text.wiki')  # to screen
	"""
	Convert a wikipage in Google wiki format to reStructuredText
	Version 1.0.2
	19 March 2014
	Derek Hohls, Meraka, CSIR

	Handles
	=======
	* internal comments (#name - no space)
	* bold / italics text
	* inline code
	* code blocks
	* internal cross-reference [[example]]
	* bullets
	* numbered list
	* headers (up to 4 levels deep)
	* tables
	* standalone hyperlinks (these stay as is)

	TODO
	====
	indented bullets
	external hyperlinks with embedded link: `Python web site <http://www.python.org>`__
	named internal cross-reference: [[FutureModuleRoadmap | Future Roadmap]]

	"""
	import pprint


	def process_header(txt):
	    """Convert a Google-wiki header line into a reStructuredText title.

	    The header level is the number of leading ``=`` characters, capped
	    at four.  Each level gets its own underline character: ``=``,
	    ``-``, ``^`` and ``~`` for levels 1 to 4.

	    Args:
	        txt: one wiki line such as ``== Title ==``, with or without a
	            trailing newline.

	    Returns:
	        A list of three strings: the title text terminated by a
	        newline, an underline exactly as long as the title, and a
	        single space (kept so callers that concatenate the three
	        items still work).

	    Raises:
	        ValueError: if ``txt`` does not start with ``=``.
	    """
	    underline_chars = '=-^~'
	    line = txt.strip()
	    depth = len(line) - len(line.lstrip('='))
	    if depth == 0:
	        raise ValueError('not a wiki header: %r' % txt)
	    level = min(depth, len(underline_chars))
	    # Remove the opening and closing markers, however many were typed,
	    # and then the padding around the title text itself.
	    title = line.strip('=').strip()
	    # reST rejects an underline shorter than its title, so size the
	    # underline from the title instead of slicing a fixed-width string.
	    underline = underline_chars[level - 1] * len(title)
	    return [title + '\n', underline, ' ']


	def process_table(rows):
	    """Render Google-wiki table rows as a reStructuredText grid table.

	    Args:
	        rows: list of wiki lines of the form ``|| a || b ||``.  A
	            trailing newline and surrounding blanks are ignored.

	    Returns:
	        A list of strings without newlines: a border line, then for
	        every input row the row line followed by a border line.  An
	        empty list is returned when ``rows`` is empty.
	    """
	    items = []
	    for raw in rows:
	        # strip('|') drops the outer cell delimiters; the inner '||'
	        # delimiters are what the line is split on.
	        line = raw.strip('\n').strip(' ').strip('|')
	        items.append(line.split('||'))
	    if not items:
	        return []

	    # Size every column from all rows, so a row that is longer than
	    # the first one cannot overflow the width table.
	    n_cols = max(len(item) for item in items)
	    widths = [0] * n_cols
	    for item in items:
	        for col, cell in enumerate(item):
	            widths[col] = max(widths[col], len(cell))

	    border = '+' + ''.join('-' * width + '+' for width in widths)
	    results = [border]
	    for item in items:
	        # Pad short rows with empty cells so each row matches the border.
	        padded = item + [''] * (n_cols - len(item))
	        cells = ''.join(cell.ljust(width) + '|'
	                        for cell, width in zip(padded, widths))
	        results.append('|' + cells)
	        results.append(border)
	    return results


	def process_bullet(text, text_prior):
	    """Return a bullet line unchanged (placeholder, still TODO).

	    ``text_prior`` is accepted but not used yet; presumably it is meant
	    for the indented-bullet handling that has not been written.
	    """
	    return text


	def process(file_in, file_out=None):
	    """Convert a Google-wiki page to reStructuredText.

	    Args:
	        file_in: path of the wiki file to read.
	        file_out: path of the reStructuredText file to write.  When it
	            is empty or None the converted lines are printed instead.

	    Line handling, in order of precedence:
	        * lines between ``{{{`` and ``}}}`` become a literal block and
	          keep their own indentation;
	        * lines starting with ``||`` are collected and rendered as one
	          grid table as soon as the run of table lines ends;
	        * lines starting with ``#`` not followed by a space become
	          comments; ``# `` lines become ``#. `` numbered items;
	        * lines starting with ``=`` become section titles;
	        * every other line only gets the inline markup replacements.
	    """
	    # Read everything up front so the input handle is closed promptly
	    # and the output file is not truncated before reading ends.
	    with open(file_in) as inputfile:
	        text_in = inputfile.readlines()

	    # defaults
	    text_out = []
	    table_rows = []
	    is_code = False
	    txt_prior = ''

	    def flush_table():
	        """Emit the table collected so far, if any, and reset it."""
	        if table_rows:
	            text_out.extend("%s\n" % row for row in process_table(table_rows))
	            del table_rows[:]

	    # process rows
	    for txt in text_in:
	        _txt = txt.strip(' ')
	        if is_code and txt[:3] != '}}}':
	            # Indent the raw line: stripping it first would flatten
	            # the code's own nesting.
	            text_out.append('    %s' % txt)
	        elif txt[:3] == '{{{':  # code start
	            flush_table()
	            is_code = True
	            # reST needs a blank line between '::' and the block.
	            text_out.append('::\n\n')
	        elif txt[:3] == '}}}':  # code end
	            text_out.append('\n')
	            is_code = False
	        else:
	            # A comment is '#name' with no space; '# item' at column 0
	            # is a numbered list entry and must not be commented out.
	            is_comment = txt[:1] == '#' and txt[:2] != '# '
	            if _txt[:2] == '# ':  # numbered list
	                _txt = _txt.replace('# ', '#. ', 1)
	            if txt[:1] == '*':  # bullets
	                _txt = process_bullet(txt, txt_prior)
	            _txt = _txt.replace('__', '**')  # bold / strong
	            _txt = _txt.replace('_', '*')  # italics
	            _txt = _txt.replace('`', '``')  # inline
	            _txt = _txt.replace('{{', '``').replace('}}', '``')  # inline
	            _txt = _txt.replace('[', '[[').replace(']', ']]')  # cross-ref
	            if txt[:2] == '||':  # table
	                table_rows.append(_txt)
	            else:
	                # Any non-table line ends the current table, so the
	                # table is written before the line that follows it.
	                flush_table()
	                if is_comment:
	                    text_out.append('.. .. %s' % _txt)
	                elif txt[:1] == '=':  # header
	                    text_out.extend(process_header(_txt))
	                    text_out.append('\n')
	                else:
	                    text_out.append(_txt)
	        txt_prior = txt
	    # A table on the last lines of the file has no following line.
	    flush_table()

	    # write output
	    if file_out:
	        with open(file_out, 'w') as outputfile:
	            outputfile.writelines(text_out)
	    else:
	        for text in text_out:
	            # Single-argument call form works on Python 2 and 3.
	            print(text.strip('\n'))


	if __name__ == '__main__':
	    # Convert the sample page only when run as a script, so importing
	    # this module for its functions does not read or write any file.
	    process('text.wiki', 'text.rst')
	    #process('text.wiki') # to screen