Last active
March 12, 2019 15:29
-
-
Save roseg43/7c9ed297421596ef02053ee118a4cfc9 to your computer and use it in GitHub Desktop.
A small command-line script to remove version strings from filenames and filename references in directories. Useful for parsing wget scrapes.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os, re, sys | |
# Startup banner, printed once before any file is processed.
print("Asset Renamer v1 ::::::::::::::")
def replaceVersionsInFile(file):
    """Strip encoded version suffixes from href/src values in one file.

    Every ``href=``/``src=`` attribute value that contains an encoded
    question mark (``%3F``) is cut just before the last ``%3F`` it contains,
    e.g. ``href="style.css%3Fver=1.2"`` becomes ``href="style.css"``.
    Attribute values holding a literal ``?`` are left untouched.

    Each changed line is reported on stdout as an "Old line:"/"New Line:"
    pair. The file is rewritten in place, and only when at least one line
    actually changed.

    Args:
        file: Path of the file to rewrite. It is opened in text mode with the
            platform default encoding.

    Returns:
        None.
    """
    # Group 1 keeps the attribute name, the optional opening quote and the
    # part of the value in front of the version marker. The match can never
    # run past the attribute value because quotes, '>' and whitespace are
    # excluded, so surrounding markup (and the line ending) is preserved.
    # The prefix is greedy on purpose: with several "%3F" in one value, only
    # the text from the last one onwards is dropped.
    versioned_attr = re.compile(r"""((?:href|src)=["']?[^"'>\s]*)%3[Ff][^"'>\s]*""")

    with open(file, 'r+') as fo:
        original_lines = fo.readlines()

        # Compute the complete new content before touching the file, so a
        # failure while processing cannot leave a truncated file behind.
        cleaned_lines = []
        for line in original_lines:
            cleaned = versioned_attr.sub(r"\1", line)
            if cleaned != line:
                print("Old line: " + line)
                print("New Line: " + cleaned)
            cleaned_lines.append(cleaned)

        # Skip the rewrite entirely for files without version strings.
        if cleaned_lines != original_lines:
            fo.seek(0)
            fo.truncate()
            fo.writelines(cleaned_lines)
# Pick the directory tree to process: the first command-line argument joined
# onto a base directory derived from this script's location, or the script's
# own directory when no argument is given.
if len(sys.argv) > 1:
    # NOTE(review): this branch derives its base from
    # realpath(__file__ + '/../'), which is a different expression from the
    # one used in the no-argument branch - confirm which base directory is
    # intended before changing it.
    scrptPath = os.path.join(os.path.dirname(os.path.realpath(__file__ + '/../')), sys.argv[1])
    print (scrptPath)
else:
    scrptPath = os.path.dirname(os.path.realpath(__file__))

# Matches the leading part of a file name that should be kept: everything in
# front of the last "%3", or failing that everything in front of the last "?".
# Compiled once here instead of once per file.
versioned_name = re.compile(r"(.*(?=.*%3))|(.*(?=.*\?))")

# Walk every directory below scrptPath, rewriting .html files in place and
# renaming any file whose name contains a "?".
for root, _dirs, files in os.walk(scrptPath):
    for filename in files:
        source = os.path.join(root, filename)
        if filename.endswith('.html'):
            replaceVersionsInFile(source)
        if "?" in filename:
            m = versioned_name.search(filename)
            new_name = m.group(0) if m else ""
            if not new_name:
                # A name such as "?ver=1" leaves nothing to rename to;
                # renaming onto the directory itself would abort the run.
                print("Skipping rename (nothing left after stripping): " + source)
                continue
            os.rename(source, os.path.join(root, new_name))

print ("Asset Renamer v1 DONE ::::::::::::")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Usage
Download the script and place it in the root directory of your scrape. Open a terminal at that location and run `python versionStringRemover.py`. Every line that is changed in an HTML file is printed twice, first prefixed with "Old line:" and then with "New Line:".