Skip to content

Instantly share code, notes, and snippets.

@adammichaelwood
Created March 31, 2016 03:39
Show Gist options
  • Save adammichaelwood/ad85f7e2b84faa663dce77042a5f6116 to your computer and use it in GitHub Desktop.
Save adammichaelwood/ad85f7e2b84faa663dce77042a5f6116 to your computer and use it in GitHub Desktop.
import sys
import re
from bs4 import BeautifulSoup
import urllib.request
# First pass: retarget external links and fill in placeholder anchor texts,
# then write the document back to the same file.

# Links whose href contains this substring are treated as internal and keep
# their current target; every other link is made to open in a new tab.
home_link = "whoishostingthis"

# Path of the HTML file to process; it is rewritten in place.
file_name = sys.argv[1]

with open(file_name) as f:
    soup = BeautifulSoup(f, 'html.parser')

anchors = soup.find_all('a')
for link in anchors:
    # Anchors without an href (e.g. <a name="...">) would raise KeyError on
    # link['href']; there is nothing to retarget or fetch for them.
    href = link.get('href')
    if href is None:
        continue

    if home_link not in href:
        link['target'] = "_blank"

    # Only anchors whose text is exactly 'x' get their text replaced with
    # the title of the page they point to.
    if link.string != 'x':
        continue

    print(href)
    # The context manager closes the HTTP response once the body is read.
    with urllib.request.urlopen(href) as page:
        content = page.read()
    souped_content = BeautifulSoup(content, 'html.parser')

    if 'amzn' in href:
        # TODO: Amazon links should take their text from the element with
        # id "productTitle"; until that exists the 'x' text is left as-is.
        continue

    # TODO: consider preferring the page's <h1> over its <title>.
    title = souped_content.title
    if title is None or title.string is None:
        # No usable <title> on the fetched page: keep the existing anchor
        # text instead of crashing half-way through the document.
        continue

    link.string = title.string
    print(link.string)
    # TODO: remove anything after a "|" or " - " in the fetched title
    # before using it as the anchor text.

# prettify("utf-8") returns bytes, hence the binary write.
html = soup.prettify("utf-8", formatter="html")
with open(file_name, "wb") as file:
    file.write(html)
# Second pass: put every block-level element on its own line.

# Opening-tag prefixes of the block-level elements that should each start
# on a new line in the output.
blocktags = '''\
<address <article <aside
<blockquote
<canvas
<dd <div <dl
<fieldset <figcaption <figure <footer <form
<h1 <h2 <h3 <h4 <h5 <h6 <header <hgroup <hr
<li
<main
<nav <noscript
<ol <output
<p <pre
<section
<table <tfoot
<ul
<video'''.split()

# The lookahead requires the tag name to end right after the prefix
# (whitespace, "/" or ">"), so "<li" no longer matches "<link" and "<p" no
# longer matches "<param", "<path" or "<progress".
pat = re.compile('(' + '|'.join(blocktags) + r')(?=[\s/>])')


def reflow_block_tags(markup):
    """Return *markup* with each block-level opening tag on its own line.

    All existing newlines are removed, a newline is inserted before every
    opening tag listed in ``blocktags``, runs of spaces are collapsed to a
    single space, and a space directly following ``p>`` is dropped.

    NOTE: the newline removal and space collapsing apply to the whole
    document, including the contents of ``<pre>`` elements.
    """
    flattened = markup.replace('\n', '')
    flattened = pat.sub(r'\n\1', flattened)
    flattened = re.sub(' +', ' ', flattened)
    # Raw string: the original 'p\> ' relied on an invalid string escape.
    return re.sub(r'p> ', 'p>', flattened)


# Guarded so the names above can be imported without touching any file;
# running the file as a script behaves as before.
if __name__ == "__main__":
    # file_name is set earlier in the script. The file was just written as
    # UTF-8 bytes, so read and write it with that encoding explicitly
    # instead of relying on the locale default.
    with open(file_name, 'r', encoding='utf-8') as f:
        html = reflow_block_tags(f.read())
    with open(file_name, "w", encoding='utf-8') as file:
        file.write(html)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment