Norod/stage.py

## stage.py
#Grab archived stories from stage.co.il using
#wayback_machine_downloader -c 8 -a http://stage.co.il/Stories
#
#This script reads a "stage.co.il" index.html file and prints the following text (HTML tags are stripped with a regex):
#Title
#Author name
#Story
#<|endoftext|>


import sys
from os import path
import time
import lxml.html
from html.parser import HTMLParser
import re

# Command-line validation: the script takes exactly one argument, the path of
# the HTML file to convert.
if len(sys.argv) != 2:
    print("Usage: "+ str(sys.argv[0]) + " input.html")
    # sys.exit() instead of the bare exit(): the latter is injected by the
    # `site` module for interactive use and is not guaranteed to exist
    # (e.g. under `python -S`). The exit status is unchanged.
    sys.exit(-1)

input_html_file = sys.argv[1]
#input_html_file = "./Stories/61407/index.html"

# Fail early, with a message, when the given path does not exist.
if not path.exists(input_html_file):
    print("Error: "+ str(sys.argv[1]) + " not found")
    sys.exit(-2)

# Matches one whole HTML comment or one single tag. re.DOTALL lets "." cross
# line breaks, so a multi-line comment that contains ">" is removed as a unit
# instead of being cut at that ">" by the tag alternative.
tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)', re.DOTALL)

def stringify_children(node, text_encoding):
    """Return the serialized children of ``node`` as tag-free, unescaped text.

    Every child element is serialized with ``lxml.etree.tostring``, the
    pieces are concatenated, comments and tags are removed with the
    module-level ``tag_re``, and character/entity references are decoded.

    Args:
        node: element whose children hold the text. It must provide
            ``getchildren()``; the script passes XPath results.
        text_encoding: charset name declared by the page. When it is exactly
            ``"ISO-8859-8"`` the whole cleaned string is reversed.

    Returns:
        The cleaned text, with CRLF pairs turned into LF and consecutive
        newlines collapsed into a single newline.
    """
    # html.unescape replaces HTMLParser().unescape, which was removed in
    # Python 3.9 and would raise AttributeError there.
    from html import unescape
    from lxml.etree import tostring

    # Only the serialized children contribute to the result; text sitting
    # directly inside `node` before its first child is not included.
    # NOTE(review): tostring() is expected to already carry each child's own
    # text and tail -- confirm that skipping node.text is intended.
    text = ''.join(
        tostring(child).decode('utf-8') for child in node.getchildren()
    )

    no_tags = tag_re.sub('', text)
    unescaped_text = unescape(no_tags)

    # Reverses the entire string (characters and therefore line order too).
    # NOTE(review): presumably because ISO-8859-8 pages store Hebrew in
    # visual order -- confirm.
    if text_encoding == "ISO-8859-8":
        unescaped_text = unescaped_text[::-1]

    # Looping matters: one pass can create a new match (e.g. "\r\r\n").
    while '\r\n' in unescaped_text:
        unescaped_text = unescaped_text.replace('\r\n', '\n')

    # Collapse any run of newlines down to a single newline.
    while '\n\n' in unescaped_text:
        unescaped_text = unescaped_text.replace('\n\n', '\n')

    return unescaped_text

def debug_print_all_elements(tree):
    """Debug helper: print one line per node yielded by ``tree.iter()``.

    Comment nodes are printed as ``<type> : <comment text>``; every other
    node is printed as ``<type> : <element path>`` using
    ``tree.getelementpath``.

    Args:
        tree: parsed document providing ``iter()`` and ``getelementpath()``
            (the script passes the result of ``lxml.html.parse``).
    """
    for element in tree.iter():
        if isinstance(element, lxml.html.HtmlComment):
            # Print the comment's text; referencing `text_content` without
            # calling it would only show the bound-method repr.
            print(str(type(element)) + " : " + str(element.text))
        else:
            print(str(type(element)) + " : " + tree.getelementpath(element))

# Parse the input page into an element tree.
tree = lxml.html.parse(input_html_file)
if tree is None:
    sys.exit(-3)
#debug_print_all_elements(tree)

#print("<input_html_file=\"" + str(input_html_file) + "\">\n")

# Accumulates the full output: header lines first, then the story text.
result = ""

# Look the <title> up directly. Walking every node and aborting on the first
# one without text would exit with -4 whenever a text-less element (for
# example a <meta>) precedes the title.
title_node = next(tree.iter("title"), None)
title_text = title_node.text if title_node is not None else None
if not title_text:
    # No usable title: same exit status as before.
    sys.exit(-4)

# The part after the last "|" is split on "/": the first piece is printed as
# the author and the second as the name of the work.
author_title = (title_text.split("|")[-1]).split("/")
if len(author_title) < 2:
    # Without a "/" there is no second piece; report it instead of raising
    # IndexError.
    print("<error=\"" + "Unexpected title format" + "\">\n")
    sys.exit(-4)
result = result + "שם היצירה: " + str(author_title[1]) + "\nמאת: " + str(author_title[0]) + "\n\n"

# Charset declared by the page; falls back to 'utf8' when none is found.
text_encoding = 'utf8'
meta_tags = tree.xpath('/html/head/meta')
for meta_tag in meta_tags:
    # .get() instead of ['content']: a <meta> without a content attribute
    # would otherwise raise KeyError.
    content_attrib = meta_tag.attrib.get('content')
    # Skip values with no "=": split('=')[1] would raise IndexError on them.
    if not content_attrib or '=' not in content_attrib:
        continue
    # Whatever follows the first "=" is taken as the encoding name.
    # NOTE(review): presumably the value looks like
    # "text/html; charset=..." -- any other "key=value" content is accepted
    # as well; confirm.
    updated_text_encoding = content_attrib.split('=')[1]
    if len(updated_text_encoding) > 0:
        text_encoding = updated_text_encoding
        break

#print("<encoding=\"" + text_encoding + "\">\n")

# Candidate locations of the story container, tried in order; the first
# XPath that matches at least one node wins.
story_xpaths = (
    '/html/body/center/table[2]/tr/td[2]/div[1]',
    '/html/body/center/table/tr/blockquote/blockquote/div[1]',
    '/html/body/center/table[2]/tr/td[2]/center[2]/table/tr/td/div',
)
teh_story = []
for story_xpath in story_xpaths:
    teh_story = tree.xpath(story_xpath)
    if teh_story:
        break

if not teh_story:
    print("<error=\"" + "Unable to extract textual content" + "\">\n")
    sys.exit(-7)

for node in teh_story:
    strings = stringify_children(node, text_encoding)
    if strings is None:
        print("<error=\"" + "No textual content" + "\">\n")
        sys.exit(-5)
    # Anything of 127 characters or fewer is rejected as too short.
    if len(strings) <= 127:
        print("<error=\"" + "No enough textual content" + "\">\n")
        sys.exit(-6)
    result = result + strings

# Terminate the record with the end-of-text marker and emit it on stdout.
result = result + "\n<|endoftext|>\n"
print(result)
	#Grab archived stories from stage.co.il using
	#wayback_machine_downloader -c 8 -a http://stage.co.il/Stories
	#
	#This script reads a "stage.co.il" index.html file and prints the following text (HTML tags are stripped with a regex):
	#Title
	#Author name
	#Story
	#<|endoftext|>


	import sys
	from os import path
	import time
	import lxml.html
	from html.parser import HTMLParser
	import re

	# Command-line validation: the script takes exactly one argument, the path
	# of the HTML file to convert. (Block structure restored: the bodies of the
	# two checks had lost their indentation.)
	if len(sys.argv) != 2:
	    print("Usage: "+ str(sys.argv[0]) + " input.html")
	    # sys.exit() instead of the bare exit(): the latter is injected by the
	    # `site` module for interactive use and is not guaranteed to exist.
	    sys.exit(-1)

	input_html_file = sys.argv[1]
	#input_html_file = "./Stories/61407/index.html"

	# Fail early, with a message, when the given path does not exist.
	if not path.exists(input_html_file):
	    print("Error: "+ str(sys.argv[1]) + " not found")
	    sys.exit(-2)

	# Matches one whole HTML comment or one single tag. The quantifiers and the
	# alternation bar are restored (the pattern had degraded to ".?", "\|" and
	# "[^>]>"), and re.DOTALL lets "." cross line breaks so a multi-line
	# comment containing ">" is removed as a unit.
	tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)', re.DOTALL)

	def stringify_children(node, text_encoding):
	    """Return the serialized children of ``node`` as tag-free, unescaped text.

	    Every child element is serialized with ``lxml.etree.tostring``, the
	    pieces are concatenated, comments and tags are removed with the
	    module-level ``tag_re``, and character/entity references are decoded.

	    Args:
	        node: element whose children hold the text. It must provide
	            ``getchildren()``; the script passes XPath results.
	        text_encoding: charset name declared by the page. When it is
	            exactly ``"ISO-8859-8"`` the whole cleaned string is reversed.

	    Returns:
	        The cleaned text, with CRLF pairs turned into LF and consecutive
	        newlines collapsed into a single newline.
	    """
	    # html.unescape replaces HTMLParser().unescape, which was removed in
	    # Python 3.9 and would raise AttributeError there.
	    from html import unescape
	    from lxml.etree import tostring

	    # Only the serialized children contribute to the result; text sitting
	    # directly inside `node` before its first child is not included.
	    # NOTE(review): tostring() is expected to already carry each child's
	    # own text and tail -- confirm that skipping node.text is intended.
	    text = ''.join(
	        tostring(child).decode('utf-8') for child in node.getchildren()
	    )

	    no_tags = tag_re.sub('', text)
	    unescaped_text = unescape(no_tags)

	    # Reverses the entire string (characters and therefore line order too).
	    # NOTE(review): presumably because ISO-8859-8 pages store Hebrew in
	    # visual order -- confirm.
	    if text_encoding == "ISO-8859-8":
	        unescaped_text = unescaped_text[::-1]

	    # Looping matters: one pass can create a new match (e.g. "\r\r\n").
	    while '\r\n' in unescaped_text:
	        unescaped_text = unescaped_text.replace('\r\n', '\n')

	    # Collapse any run of newlines down to a single newline.
	    while '\n\n' in unescaped_text:
	        unescaped_text = unescaped_text.replace('\n\n', '\n')

	    return unescaped_text

	def debug_print_all_elements(tree):
	    """Debug helper: print one line per node yielded by ``tree.iter()``.

	    Comment nodes are printed as ``<type> : <comment text>``; every other
	    node is printed as ``<type> : <element path>`` using
	    ``tree.getelementpath``.

	    Args:
	        tree: parsed document providing ``iter()`` and
	            ``getelementpath()`` (the script passes the result of
	            ``lxml.html.parse``).
	    """
	    for element in tree.iter():
	        if isinstance(element, lxml.html.HtmlComment):
	            # Print the comment's text; referencing `text_content` without
	            # calling it would only show the bound-method repr.
	            print(str(type(element)) + " : " + str(element.text))
	        else:
	            print(str(type(element)) + " : " + tree.getelementpath(element))

	# Parse the input page into an element tree.
	tree = lxml.html.parse(input_html_file)
	if tree is None:
	    sys.exit(-3)
	#debug_print_all_elements(tree)

	#print("<input_html_file=\"" + str(input_html_file) + "\">\n")

	# Accumulates the full output: header lines first, then the story text.
	result = ""

	# Look the <title> up directly. Walking every node and aborting on the
	# first one without text would exit with -4 whenever a text-less element
	# (for example a <meta>) precedes the title.
	title_node = next(tree.iter("title"), None)
	title_text = title_node.text if title_node is not None else None
	if not title_text:
	    # No usable title: same exit status as before.
	    sys.exit(-4)

	# The separator is a plain "|" (the escaped "\|" form would never match a
	# bare bar). The part after the last "|" is split on "/": the first piece
	# is printed as the author and the second as the name of the work.
	author_title = (title_text.split("|")[-1]).split("/")
	if len(author_title) < 2:
	    # Without a "/" there is no second piece; report it instead of raising
	    # IndexError.
	    print("<error=\"" + "Unexpected title format" + "\">\n")
	    sys.exit(-4)
	result = result + "שם היצירה: " + str(author_title[1]) + "\nמאת: " + str(author_title[0]) + "\n\n"

	# Charset declared by the page; falls back to 'utf8' when none is found.
	text_encoding = 'utf8'
	meta_tags = tree.xpath('/html/head/meta')
	for meta_tag in meta_tags:
	    # .get() instead of ['content']: a <meta> without a content attribute
	    # would otherwise raise KeyError.
	    content_attrib = meta_tag.attrib.get('content')
	    # Skip values with no "=": split('=')[1] would raise IndexError.
	    if not content_attrib or '=' not in content_attrib:
	        continue
	    # Whatever follows the first "=" is taken as the encoding name.
	    # NOTE(review): presumably the value looks like
	    # "text/html; charset=..." -- any other "key=value" content is
	    # accepted as well; confirm.
	    updated_text_encoding = content_attrib.split('=')[1]
	    if len(updated_text_encoding) > 0:
	        text_encoding = updated_text_encoding
	        break

	#print("<encoding=\"" + text_encoding + "\">\n")

	# Candidate locations of the story container, tried in order; the first
	# XPath that matches at least one node wins.
	story_xpaths = (
	    '/html/body/center/table[2]/tr/td[2]/div[1]',
	    '/html/body/center/table/tr/blockquote/blockquote/div[1]',
	    '/html/body/center/table[2]/tr/td[2]/center[2]/table/tr/td/div',
	)
	teh_story = []
	for story_xpath in story_xpaths:
	    teh_story = tree.xpath(story_xpath)
	    if teh_story:
	        break

	if not teh_story:
	    print("<error=\"" + "Unable to extract textual content" + "\">\n")
	    sys.exit(-7)

	for node in teh_story:
	    strings = stringify_children(node, text_encoding)
	    if strings is None:
	        print("<error=\"" + "No textual content" + "\">\n")
	        sys.exit(-5)
	    # Anything of 127 characters or fewer is rejected as too short.
	    if len(strings) <= 127:
	        print("<error=\"" + "No enough textual content" + "\">\n")
	        sys.exit(-6)
	    result = result + strings

	# Terminate the record with the end-of-text marker (written without
	# backslashes before the bars) and emit it on stdout.
	result = result + "\n<|endoftext|>\n"
	print(result)