Skip to content

Instantly share code, notes, and snippets.

@Norod
Last active August 2, 2020 16:01
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save Norod/09c12249b8e76bde8095e544ca0803aa to your computer and use it in GitHub Desktop.
Save Norod/09c12249b8e76bde8095e544ca0803aa to your computer and use it in GitHub Desktop.
Parse index.html files from the website stage.co.il
#Grab archived stories from stage.co.il using
#wayback_machine_downloader -c 8 -a http://stage.co.il/Stories
#
#This script reads a "stage.co.il" index.html file and outputs the following HTML-tag-dirty text:
#Title
#Author name
#Story
#<|endoftext|>
import sys
from os import path
import time
import lxml.html
from html.parser import HTMLParser
import re
# Validate the command line before doing any work: the script takes exactly
# one argument, the path of the index.html file to parse.
if len(sys.argv) != 2:
    print("Usage: "+ str(sys.argv[0]) + " input.html")
    # sys.exit() instead of the bare exit(): the latter is injected by the
    # `site` module for interactive use and is absent under "python -S".
    sys.exit(-1)
input_html_file = sys.argv[1]
#input_html_file = "./Stories/61407/index.html"
if not path.exists(input_html_file):
    print("Error: "+ str(sys.argv[1]) + " not found")
    sys.exit(-2)
# Matches HTML comments and tags so they can be removed from serialized
# markup. DOTALL lets the comment alternative span several lines; without it
# a multi-line comment containing ">" was only partially stripped.
tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)', re.DOTALL)
def stringify_children(node, text_encoding):
    """Return the text of *node*'s child elements with the markup removed.

    Every child element is serialized with lxml's ``tostring``; the pieces
    are concatenated, HTML comments and tags are stripped with the
    module-level ``tag_re`` pattern, and character references are unescaped.

    Args:
        node: Element whose children are serialized (anything that iterates
            over child elements accepted by ``lxml.etree.tostring``).
        text_encoding: Encoding label found in the page. When it is exactly
            ``"ISO-8859-8"`` the whole cleaned text is reversed.

    Returns:
        The cleaned text as ``str``, with ``\\r\\n`` turned into ``\\n`` and
        runs of newlines collapsed to a single one.
    """
    # html.unescape replaces HTMLParser().unescape, which was deprecated in
    # Python 3.4 and removed in 3.9.
    from html import unescape
    from lxml.etree import tostring

    # Only the serialized children contribute to the result. Text placed
    # directly inside *node* before its first child (node.text) is not added.
    # NOTE(review): confirm that dropping node.text is intended.
    markup = ''.join(tostring(child).decode('utf-8') for child in node)

    cleaned = unescape(tag_re.sub('', markup))

    if text_encoding == "ISO-8859-8":
        # The complete string is reversed, so line order is reversed as well.
        # NOTE(review): presumably these pages store Hebrew in visual order;
        # confirm that a whole-text reversal is the desired result.
        cleaned = cleaned[::-1]

    # Loops rather than single calls: one replace pass can create new matches
    # (e.g. "\r\r\n" -> "\r\n", "\n\n\n" -> "\n\n").
    while '\r\n' in cleaned:
        cleaned = cleaned.replace('\r\n', '\n')
    while '\n\n' in cleaned:
        cleaned = cleaned.replace('\n\n', '\n')
    return cleaned
def debug_print_all_elements(tree):
    """Print one line per node of *tree* as a debugging aid.

    Each line shows the node's Python type followed by " : " and either the
    comment text (for HTML comments) or the element path reported by
    ``tree.getelementpath`` (for everything else).

    Args:
        tree: Parsed document providing ``iter()`` and ``getelementpath()``.
    """
    for element in tree.iter():
        if isinstance(element, lxml.html.HtmlComment):
            # Print the comment's text; the previous code stringified the
            # bound ``text_content`` method itself instead of its result.
            print(str(type(element)) + " : " + str(element.text))
        else:
            print(str(type(element)) + " : " + tree.getelementpath(element))
# Parse the page once; the resulting tree is used by the title, encoding and
# story extraction steps.
tree = lxml.html.parse(input_html_file)
if tree is None:
    sys.exit(-3)
#debug_print_all_elements(tree)
#print("<input_html_file=\"" + str(input_html_file) + "\">\n")
result = ""
# Build the two header lines from the first <title> element. The part of the
# title after the last "|" is split on "/": the first piece is emitted as the
# author, the second as the name of the work.
for node in tree.iter("title"):
    node_text = node.text
    # Only the title itself has to carry text; checking every element that
    # precedes it would abort on the first text-less one.
    if not node_text:
        sys.exit(-4)
    author_title = (node_text.split("|")[-1]).split("/")
    if len(author_title) < 2:
        # Title without an "author/title" pair: exit with the same code as a
        # missing title instead of raising IndexError.
        sys.exit(-4)
    result = result + "שם היצירה: " + str(author_title[1]) + "\nמאת: " + str(author_title[0]) + "\n\n"
    break
# Encoding label handed to stringify_children(); stays 'utf8' unless a <meta>
# tag in the document head declares a charset.
text_encoding = 'utf8'
meta_tags = tree.xpath('/html/head/meta')
for meta_tag in meta_tags:
    # .get(): a <meta> tag without a content attribute must not raise KeyError.
    content_attrib = meta_tag.attrib.get('content')
    if not content_attrib:
        continue
    # Only a content value that declares a charset is relevant; other values
    # (descriptions, keywords, ...) may contain no "=" or an unrelated one.
    if 'charset' not in content_attrib.lower():
        continue
    pieces = content_attrib.split('=')
    if len(pieces) < 2:
        continue
    updated_text_encoding = pieces[1].strip()
    if updated_text_encoding:
        text_encoding = updated_text_encoding
        break
#print("<encoding=\"" + text_encoding + "\">\n")
# Known locations of the story container, tried in order; the first XPath
# that matches at least one element wins.
story_xpaths = (
    '/html/body/center/table[2]/tr/td[2]/div[1]',
    '/html/body/center/table/tr/blockquote/blockquote/div[1]',
    '/html/body/center/table[2]/tr/td[2]/center[2]/table/tr/td/div',
)
# A story fragment of this many characters or fewer is rejected.
min_story_length = 127

teh_story = []
for story_xpath in story_xpaths:
    teh_story = tree.xpath(story_xpath)
    if teh_story:
        break

if not teh_story:
    print("<error=\"" + "Unable to extract textual content" + "\">\n")
    sys.exit(-7)

# Append the cleaned text of every matched container to the header that was
# accumulated in `result`.
for node in teh_story:
    strings = stringify_children(node, text_encoding)
    if strings is None:
        print("<error=\"" + "No textual content" + "\">\n")
        sys.exit(-5)
    if len(strings) <= min_story_length:
        print("<error=\"" + "No enough textual content" + "\">\n")
        sys.exit(-6)
    result = result + strings

# Terminate the record with the end-of-text marker and emit it on stdout.
result = result + "\n<|endoftext|>\n"
print(result)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment