Skip to content

Instantly share code, notes, and snippets.

@EdwardIII
Created October 12, 2012 13:12
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save EdwardIII/3879128 to your computer and use it in GitHub Desktop.
Save EdwardIII/3879128 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python
from bs4 import BeautifulSoup
import fnmatch
import os
import re
class Page(object):
    """An HTML file on disk that is parsed lazily and rewritten on demand.

    Attributes:
        filename: Path of the HTML file this object reads and writes.
        soup: The parsed document, or None until it is first needed.
        changed: True once an edit has modified the parsed document.
    """

    def __init__(self, filename):
        """Remember *filename*; the file is not opened until it is needed."""
        self.filename = filename
        self.soup = None
        self.changed = False

    def _soup(self):
        """Return the parsed document, reading and parsing the file once."""
        if self.soup is None:
            # `with` guarantees the handle is closed even if parsing fails.
            with open(self.filename, 'r') as f:
                self.soup = BeautifulSoup(f.read())
        return self.soup

    def remove_script_tag_containing(self, regex_str):
        """Delete every <script> tag whose text matches *regex_str*.

        Args:
            regex_str: Regular expression searched for (``re.search``) in
                the text of each <script> tag.

        Sets ``self.changed`` when at least one tag was removed.
        """
        pattern = re.compile(regex_str)
        # find_all returns a list, so removing tags while looping is safe.
        for script in self._soup().find_all('script'):
            if pattern.search(script.get_text()):
                print("Found match in %s" % script)
                script.decompose()
                self.changed = True

    def save(self, force_save = False):
        """Write the document back to ``self.filename``.

        Args:
            force_save: Write even when no change has been recorded.

        Nothing is written when the page is unchanged and *force_save* is
        false.
        """
        if not (self.changed or force_save):
            return
        # Serialise BEFORE opening for writing: mode 'w' truncates the file,
        # which would wipe a document that has not been read yet.
        # str() keeps the markup; get_text() would drop every tag.
        markup = str(self._soup())
        with open(self.filename, 'w') as f:
            f.write(markup)
# Visit every *.html file below the current directory and force a rewrite
# of each one through Page.
for root, dirnames, filenames in os.walk('.'):
    html_names = fnmatch.filter(filenames, '*.html')
    for filename in html_names:
        page = Page(os.path.join(root, filename))
        # The tag-removal step is currently disabled:
        #page.remove_script_tag_containing(r'_trackPageView')
        page.save(force_save=True)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment