adammichaelwood/rend.py

## rend.py
import sys
import re
from bs4 import BeautifulSoup
import urllib.request

# Substring identifying links that stay on the home site; every other
# link is made to open in a new tab.
home_link = "whoishostingthis"

# Path of the HTML file to process; it is rewritten in place below.
file_name = sys.argv[1]

# Read the raw bytes and let BeautifulSoup work out the encoding, instead of
# decoding with whatever the locale default happens to be.
with open(file_name, 'rb') as f:
    soup = BeautifulSoup(f, 'html.parser')

anchors = soup.find_all('a')

for link in anchors:
    href = link.get('href')
    if href is None:
        # Anchors without an href (e.g. <a name="...">) have nothing to
        # inspect or fetch; indexing link['href'] would raise KeyError.
        continue

    if home_link not in href:
        link['target'] = "_blank"

    # Anchor text of exactly "x" is a placeholder that gets replaced by the
    # title of the linked page.
    if link.string != 'x':
        continue

    print(href)
    with urllib.request.urlopen(href) as page:
        content = page.read()
    souped_content = BeautifulSoup(content, 'html.parser')

    if 'amzn' in href:
        # TODO: set the anchor text from the element with id "productTitle",
        # or fall back to the page's <h1>:
        #     link.string = souped_content.h1.string
        # The explicit `pass` keeps this branch syntactically valid until
        # that is implemented.
        pass
    else:
        title = souped_content.title
        # Pages without a usable <title> leave the placeholder untouched
        # rather than crashing on None.
        if title is not None and title.string:
            link.string = title.string
            print(link.string)
            # TODO: remove anything after a | or space-hyphen-space
            # TODO: set anchor text to fixed title

# prettify() with an encoding returns bytes, hence the binary write below.
html = soup.prettify("utf-8", formatter="html")

with open(file_name, "wb") as file:
    file.write(html)

# Opening-tag prefixes of block-level elements; each occurrence is moved to
# the start of its own line when the document is re-flowed below.
blocktags = '''\
<address    <article    <aside
<blockquote
<canvas
<dd    <div    <dl
<fieldset    <figcaption    <figure    <footer    <form
<h1    <h2    <h3    <h4    <h5    <h6    <header    <hgroup    <hr
<li
<main
<nav    <noscript
<ol    <output
<p    <pre
<section
<table    <tfoot
<ul
<video'''.split()

# The trailing \b stops a short prefix from matching a longer tag name
# (e.g. "<p" inside "<param", "<li" inside "<link").
pat = re.compile('(' + '|'.join(blocktags) + r')\b')

# The file was written as UTF-8 bytes, so decode it explicitly as UTF-8
# instead of relying on the locale default. All line breaks are dropped first.
# NOTE(review): this also removes line breaks inside <pre> content.
with open(file_name, 'r', encoding='utf-8') as f:
    html = f.read().replace('\n', '')

# Start a new line in front of every block-level opening tag.
html = pat.sub(r'\n\1', html)

# Collapse runs of spaces left over from the removed indentation.
html = re.sub(' +', ' ', html)
# Drop the single space that follows a tag ending in "p>".
# NOTE(review): this matches any "p> " (e.g. "</sup> "), not only <p>/</p>.
html = re.sub(r'p> ', 'p>', html)

with open(file_name, "w", encoding='utf-8') as file:
    file.write(html)
	import sys
	import re
	from bs4 import BeautifulSoup
	import urllib.request

	# NOTE(review): this section arrived with every line flattened to a single
	# tab, which destroyed the loop/branch nesting. The nesting below is
	# restored (tab-indented, matching this section's existing indent).

	# Substring identifying links that stay on the home site; every other
	# link is made to open in a new tab.
	home_link = "whoishostingthis"

	# Path of the HTML file to process; it is rewritten in place below.
	file_name = sys.argv[1]

	# Read raw bytes so BeautifulSoup detects the encoding itself.
	with open(file_name, 'rb') as f:
		soup = BeautifulSoup(f, 'html.parser')

	anchors = soup.find_all('a')

	for link in anchors:
		href = link.get('href')
		if href is None:
			# No href means nothing to inspect or fetch; link['href'] would
			# raise KeyError here.
			continue

		if home_link not in href:
			link['target'] = "_blank"

		# Anchor text of exactly "x" is a placeholder that gets replaced by
		# the title of the linked page.
		if link.string != 'x':
			continue

		print(href)
		with urllib.request.urlopen(href) as page:
			content = page.read()
		souped_content = BeautifulSoup(content, 'html.parser')

		if 'amzn' in href:
			# TODO: set the anchor text from the element with id
			# "productTitle", or fall back to the page's <h1>:
			#     link.string = souped_content.h1.string
			# `pass` keeps the branch valid until that is implemented.
			pass
		else:
			title = souped_content.title
			# Leave the placeholder alone when the page has no usable <title>.
			if title is not None and title.string:
				link.string = title.string
				print(link.string)
				# TODO: remove anything after a | or space-hyphen-space
				# TODO: set anchor text to fixed title

	# prettify() with an encoding returns bytes, hence the binary write below.
	html = soup.prettify("utf-8", formatter="html")

	with open(file_name, "wb") as file:
		file.write(html)

	# Opening-tag prefixes of block-level elements; each occurrence is moved to
	# the start of its own line when the document is re-flowed below.
	# (Leading whitespace inside the literal is discarded by split().)
	blocktags = '''\
	<address <article <aside
	<blockquote
	<canvas
	<dd <div <dl
	<fieldset <figcaption <figure <footer <form
	<h1 <h2 <h3 <h4 <h5 <h6 <header <hgroup <hr
	<li
	<main
	<nav <noscript
	<ol <output
	<p <pre
	<section
	<table <tfoot
	<ul
	<video'''.split()

	# Join with a bare "|" so the prefixes form an alternation; an escaped
	# "\|" would be a literal pipe and the pattern would never match. The
	# trailing \b stops a short prefix from matching a longer tag name
	# (e.g. "<p" inside "<param", "<li" inside "<link").
	pat = re.compile('(' + '|'.join(blocktags) + r')\b')

	# Decode explicitly as UTF-8 rather than with the locale default, and drop
	# all line breaks first.
	# NOTE(review): this also removes line breaks inside <pre> content.
	with open(file_name, 'r', encoding='utf-8') as f:
		html = f.read().replace('\n', '')

	# Start a new line in front of every block-level opening tag.
	html = pat.sub(r'\n\1', html)

	# Collapse runs of spaces left over from the removed indentation.
	html = re.sub(' +', ' ', html)
	# Drop the single space that follows a tag ending in "p>".
	# NOTE(review): this matches any "p> " (e.g. "</sup> "), not only <p>/</p>.
	html = re.sub(r'p> ', 'p>', html)

	with open(file_name, "w", encoding='utf-8') as file:
		file.write(html)