Skip to content

Instantly share code, notes, and snippets.

@roseg43
Last active March 12, 2019 15:29
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save roseg43/7c9ed297421596ef02053ee118a4cfc9 to your computer and use it in GitHub Desktop.
Save roseg43/7c9ed297421596ef02053ee118a4cfc9 to your computer and use it in GitHub Desktop.
A small command-line script to remove version strings from filenames and filename references in directories. Useful for parsing wget scrapes.
import os, re, sys
# Start-up banner so the user can see on stdout that the run has begun.
print ("Asset Renamer v1 ::::::::::::::")
# Matches an href/src attribute value up to a percent-encoded "?" (%3F) and
# the version string that follows it.
#   group 1: the attribute name, optional opening quote and the URL path
#            before the encoded "?" -- this is what is kept.
#   rest:    "%3F..." up to the closing quote, ">" or whitespace -- dropped.
# The path part excludes a literal "?" so that a "%3F" which is merely part
# of a real query string (e.g. "?next=a%3Fb") is left alone.
_VERSIONED_ATTR_RE = re.compile(
    r"""((?:href|src)=["']?[^"'>\s?]*?)%3F[^"'>\s]*""",
    re.IGNORECASE,
)


def replaceVersionsInFile(file):
    """Strip percent-encoded version strings from href/src values in a file.

    Every ``href=`` / ``src=`` attribute whose value contains an encoded
    question mark (``%3F``) is cut off at that point, so
    ``href="style.css%3Fver=1.2"`` becomes ``href="style.css"``.  All such
    attributes on a line are handled, not only the first one.

    Values containing other percent escapes (``%20`` ...) or a literal ``?``
    query string are left untouched; only ``%3F`` marks a version string.

    Each modified line is echoed to stdout as an "Old line:" / "New Line:"
    pair.  The file is rewritten only when at least one line changed, and the
    new content is fully computed before the file is opened for writing, so a
    failure while processing cannot leave a truncated file behind.

    Args:
        file: Path of the text (HTML) file to update in place.

    Returns:
        None.
    """
    with open(file, 'r') as fo:
        originalLines = fo.readlines()

    updatedLines = []
    for line in originalLines:
        newLine = _VERSIONED_ATTR_RE.sub(r"\1", line)
        if newLine != line:
            print("Old line: " + line)
            print("New Line: " + newLine)
        updatedLines.append(newLine)

    # Leave files without version strings completely untouched.
    if updatedLines != originalLines:
        with open(file, 'w') as fo:
            fo.writelines(updatedLines)
# Work out which directory tree to process.
if len(sys.argv) > 1:
    # NOTE(review): realpath(__file__ + '/../') collapses to the script's own
    # directory, so the dirname() around it yields that directory's PARENT;
    # a relative argument is therefore resolved against the parent of the
    # script directory. Confirm this is intended.
    scrptPath = os.path.join(os.path.dirname(os.path.realpath(__file__ + '/../')), sys.argv[1])
    print (scrptPath)
else:
    # No argument: process the directory that contains this script.
    scrptPath = os.path.dirname(os.path.realpath(__file__))

# Loops through all directories and files in the script path, modifying html
# files and renaming any files whose name carries a version string.
for root, dirs, files in os.walk(scrptPath):
    for filename in files:
        oldPath = os.path.join(root, filename)
        if filename.endswith('.html'):
            replaceVersionsInFile(oldPath)
        if "?" in filename:
            # Keep everything before the FIRST "?" (or encoded "%3F"); the
            # version string is the remainder of the name.
            newName = re.split(r"\?|%3F", filename, maxsplit=1, flags=re.IGNORECASE)[0]
            newPath = os.path.join(root, newName)
            if not newName:
                # Name was nothing but a version string; renaming would
                # target the directory itself.
                print("Skipping (no name left after removing version string): " + oldPath)
            elif os.path.exists(newPath):
                # os.rename would silently overwrite on POSIX and raise on
                # Windows; do neither and report the collision instead.
                print("Skipping (target already exists): " + newPath)
            else:
                os.rename(oldPath, newPath)
print ("Asset Renamer v1 DONE ::::::::::::")
@roseg43
Copy link
Author

roseg43 commented Jan 5, 2017

Usage

Download the script and place it in the root directory of your scrape. Open a terminal at that location and run python versionStringRemover.py. Every line replaced in an HTML file is printed as an "Old line:" / "New Line:" pair.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment