Last active
August 2, 2020 16:01
-
-
Save Norod/09c12249b8e76bde8095e544ca0803aa to your computer and use it in GitHub Desktop.
Parse index.html files from the website stage.co.il
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#Grab archived stories from stage.co.il using
#wayback_machine_downloader -c 8 -a http://stage.co.il/Stories
#
#This script reads a "stage.co.il" index.html file and prints the following as plain text (HTML tags stripped):
#Title
#Author name
#Story
#<|endoftext|>
import sys | |
from os import path | |
import time | |
import lxml.html | |
from html.parser import HTMLParser | |
import re | |
# Exactly one command-line argument is required: the path of the HTML file to parse.
# sys.exit is used instead of the site-provided exit(), which is meant for the
# interactive shell and is not available when the site module is disabled.
if len(sys.argv) != 2:
    print("Usage: " + str(sys.argv[0]) + " input.html")
    sys.exit(-1)

input_html_file = sys.argv[1]
#input_html_file = "./Stories/61407/index.html"

# Fail early with a distinct exit code when the input file does not exist.
if not path.exists(input_html_file):
    print("Error: " + str(sys.argv[1]) + " not found")
    sys.exit(-2)
# Matches an HTML comment or any single tag, so both can be stripped from markup.
tag_re = re.compile(r'(<!--.*?-->|<[^>]*>)')


def stringify_children(node, text_encoding):
    """Return the text of ``node``'s child elements with markup removed.

    Each child of ``node`` is serialized with ``lxml.etree.tostring``, the
    serialized markup is concatenated, comments and tags are stripped with
    ``tag_re`` and HTML character references are unescaped.

    Args:
        node: an lxml element whose children hold the text to extract.
        text_encoding: encoding name declared by the page; when it is exactly
            "ISO-8859-8" the whole extracted string is reversed.

    Returns:
        The extracted text as ``str``, with every "\\r\\n" turned into "\\n"
        and runs of consecutive "\\n" collapsed to a single "\\n".
    """
    # html.unescape replaces HTMLParser().unescape(), which was removed in
    # Python 3.9; imported locally like the other helpers of this function.
    from html import unescape
    from lxml.etree import tostring

    # Only the serialized children (bytes) contribute to the result; the plain
    # str values (node.text, child.text, child.tail, node.tail) were never
    # appended by the original loop either.
    # NOTE(review): text placed directly inside ``node`` before its first child
    # (node.text) is therefore not part of the output - confirm this is intended.
    text = ''.join(
        tostring(child).decode('utf-8')
        for child in node.getchildren()
    )

    no_tags = tag_re.sub('', text)
    unescaped_text = unescape(no_tags)

    if text_encoding == "ISO-8859-8":
        # NOTE(review): this reverses the entire string (line order included),
        # presumably to undo visually-ordered Hebrew - verify against real pages.
        unescaped_text = unescaped_text[::-1]

    # Loop rather than a single replace: a replacement can itself create a new
    # match (e.g. "\r\r\n" -> "\r\n", "\n\n\n" -> "\n\n").
    while '\r\n' in unescaped_text:
        unescaped_text = unescaped_text.replace('\r\n', '\n')
    while '\n\n' in unescaped_text:
        unescaped_text = unescaped_text.replace('\n\n', '\n')
    return unescaped_text
def debug_print_all_elements(tree):
    """Print one line per node of ``tree`` for debugging.

    For HTML comments the line shows the node type and the comment's text
    content; for every other node it shows the node type and the path
    returned by ``tree.getelementpath``.

    Args:
        tree: the parsed document; it must provide ``iter()`` and
            ``getelementpath()``.
    """
    for element in tree.iter():
        if isinstance(element, lxml.html.HtmlComment):
            # text_content is a method: it has to be called, otherwise the
            # repr of the bound method is printed instead of the comment text.
            print(str(type(element)) + " : " + str(element.text_content()))
        else:
            print(str(type(element)) + " : " + tree.getelementpath(element))
# Parse the input page into an lxml tree.
tree = lxml.html.parse(input_html_file)
if tree is None:
    sys.exit(-3)

#debug_print_all_elements(tree)
#print("<input_html_file=\"" + str(input_html_file) + "\">\n")

# --- Header: title and author, both taken from the <title> element ----------
result = ""
for node in tree.iter():
    # Only the <title> element matters here; other nodes (which frequently
    # have no text at all) are skipped instead of aborting the script.
    if node.tag != "title":
        continue
    node_text = node.text
    if not node_text:
        sys.exit(-4)
    # The part after the last "|" is split on "/": index 0 is printed as the
    # author and index 1 as the name of the work.
    author_title = (node_text.split("|")[-1]).split("/")
    if len(author_title) < 2:
        print("<error=\"" + "Unexpected title format" + "\">\n")
        sys.exit(-4)
    result = result + "שם היצירה: " + str(author_title[1]) + "\nמאת: " + str(author_title[0]) + "\n\n"
    break

# --- Encoding: first charset declared by a <meta content="..."> tag ---------
text_encoding = 'utf8'
_charset_re = re.compile(r'charset\s*=\s*["\']?([^\s;"\']+)', re.IGNORECASE)
for meta_tag in tree.xpath('/html/head/meta'):
    # Not every <meta> carries a "content" attribute, and not every content
    # value declares a charset; such tags are skipped rather than raising.
    content_attrib = meta_tag.attrib.get('content')
    if not content_attrib:
        continue
    charset_match = _charset_re.search(content_attrib)
    if charset_match:
        text_encoding = charset_match.group(1)
        break
#print("<encoding=\"" + text_encoding + "\">\n")

# --- Story: try the known page layouts in order until one matches -----------
_story_xpaths = (
    '/html/body/center/table[2]/tr/td[2]/div[1]',
    '/html/body/center/table/tr/blockquote/blockquote/div[1]',
    '/html/body/center/table[2]/tr/td[2]/center[2]/table/tr/td/div',
)
teh_story = []
for _story_xpath in _story_xpaths:
    teh_story = tree.xpath(_story_xpath)
    if teh_story:
        break

if not teh_story:
    print("<error=\"" + "Unable to extract textual content" + "\">\n")
    sys.exit(-7)

for node in teh_story:
    strings = stringify_children(node, text_encoding)
    if strings is None:
        print("<error=\"" + "No textual content" + "\">\n")
        sys.exit(-5)
    # Extracted text of 127 characters or fewer is rejected as too short.
    if len(strings) <= 127:
        print("<error=\"" + "No enough textual content" + "\">\n")
        sys.exit(-6)
    result = result + strings

# Terminate the record with the end-of-text marker and emit it on stdout.
result = result + "\n<|endoftext|>\n"
print(result)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment