pigeonflight/README.md

## README.md

      
    Raw
  

              README.md
            
          
    Background

This is a horribly ugly (and lazy) implementation. The goal is to remove images that are not referenced by either the css or html files
of a web project.
Additionally, it converts large PNG files (larger than 250000 bytes) into JPG files, since these are most often photographs. It then goes through each HTML and CSS file and replaces every reference to the PNG with a reference to the JPG file. Finally, it removes the old PNG files to reduce the overall site folder size. Externally hosted images (references starting with 'http') are skipped.
The script includes a 'sed-like' function adapted from a Stack Overflow answer (https://stackoverflow.com/questions/12714415/python-equivalent-to-sed/40843600#40843600).
Assumption

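All images are located in images/ and all CSS is located in css/, both directly inside the site folder; HTML files are read from the top level of the site folder.
The PNG-to-JPG conversion skips files whose names begin with '_'.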
Usage

To use this script, edit the CSS pattern (`css_file_pattern`) at the beginning of the file if needed.
By default, the script looks for CSS files whose names end with ".webflow.css". Then run:
python cleanimages.py site_folder


## cleanimages.py
import re
import os
import sys
import shutil
from tempfile import mkstemp
from PIL import Image

# PNG files larger than this many bytes are converted to JPG.
large_png = 250000
# Only CSS files whose names end with this suffix are scanned and rewritten.
css_file_pattern = '.webflow.css'

#---- no need to change anything below here ----#
# Exit with a usage hint instead of a bare IndexError when no folder is given.
if len(sys.argv) < 2:
    sys.exit('usage: python cleanimages.py site_folder')
# Root of the site; HTML files are read directly from this folder.
directory = sys.argv[1]
css_directory = '{}/css'.format(directory)
image_directory = '{}/images'.format(directory)
# Image references collected from the HTML and the CSS files respectively.
output = []
css_output = []

# this function is adapted from a Stack Overflow answer
# see https://stackoverflow.com/questions/12714415/python-equivalent-to-sed/40843600#40843600
def sed(pattern, replace, source, dest=None, count=0):
    """Copy a text file line by line, substituting a regex pattern.

    Every line of ``source`` is passed through ``re.subn``. The result is
    written to ``dest``; when ``dest`` is not given it is written to a
    temporary file which then replaces ``source``.

    Args:
        pattern (str): pattern to match (can be a compiled re pattern)
        replace (str): replacement str (``re.sub`` template syntax)
        source  (str): input filename
        dest (str):   destination filename, if not given, source will be
            over written.
        count (int): maximum number of occurrences to replace in the whole
            file; 0 (the default) replaces every occurrence.
    """
    limited = count > 0
    remaining = count
    tmp_name = None

    with open(source, 'r') as fin:
        if dest:
            fout = open(dest, 'w')
        else:
            # Wrap the descriptor returned by mkstemp so it is closed
            # together with the file object instead of being leaked.
            fd, tmp_name = mkstemp()
            fout = os.fdopen(fd, 'w')

        try:
            with fout:
                for line in fin:
                    if limited and remaining == 0:
                        # Quota used up: copy the remaining lines verbatim.
                        fout.write(line)
                        continue
                    out, hits = re.subn(pattern, replace, line,
                                        count=remaining if limited else 0)
                    if limited:
                        remaining -= hits
                    fout.write(out)
        except BaseException:
            # Do not leave a stray temporary file behind on failure.
            if tmp_name is not None:
                os.remove(tmp_name)
            raise

    if tmp_name is not None:
        # mkstemp creates the file with owner-only permissions; carry over
        # the permission bits of the file that is being replaced.
        shutil.copymode(source, tmp_name)
        shutil.move(tmp_name, source)

def replace_in_files(old,new):
    """Replace the literal text ``old`` with ``new`` in the site's files.

    Rewrites, in place, every ``.html`` file directly inside ``directory``
    and every file in ``css_directory`` whose name ends with
    ``css_file_pattern``.

    Args:
        old (str): text to look for (e.g. an image path); matched literally
        new (str): text to put in its place
    """
    # sed() treats its arguments as a regex and a replacement template, so
    # escape both: otherwise the '.' in 'photo.png' matches any character
    # and a backslash in ``new`` would be read as an escape sequence.
    pattern = re.escape(old)
    replacement = new.replace('\\', '\\\\')

    targets = [
        (directory, ".html"),
        (css_directory, css_file_pattern),
    ]
    for folder, suffix in targets:
        for filename in os.listdir(folder):
            if filename.endswith(suffix):
                sed(pattern, replacement, '{}/{}'.format(folder, filename))


# Scan every top-level HTML page and collect anything that looks like an
# image reference (a run of characters without quotes, '=' or whitespace
# that ends in a known image extension).
pattern = r'[^\"\'=\s]+\.(?:jpe?g|png|PNG|gif|ico)'
for filename in os.listdir(directory):
    if not filename.endswith(".html"):
        continue
    with open('{}/{}'.format(directory,filename)) as html:
        content = html.read()
    matches = re.findall(pattern, content)
    output.extend(matches)


# get css matches
# Accept the same image extensions as the HTML scan, and url() values that
# are single-quoted, double-quoted or unquoted: a reference missed here
# makes the image look unused, and unused images are deleted.
css_url_pattern = r'url\(\s*[\'"]?\.\./([^\'")]+?)\s*[\'"]?\s*\)'
css_image_extensions = ('.jpg', '.jpeg', '.png', '.PNG', '.gif', '.ico')
for filename in os.listdir(css_directory):
    if filename.endswith(css_file_pattern):
        with open('{}/{}'.format(css_directory,filename)) as css_file:
            content = css_file.read()
        # The capture group is the path relative to the site root,
        # e.g. "images/photo.png" for url('../images/photo.png').
        matches_ = re.findall(css_url_pattern, content)
        matches = [match for match in matches_ if match.endswith(css_image_extensions)]
        css_output.extend(matches)

# get images
# Everything referenced from either the HTML or the CSS files.
html = set(output)
css = set(css_output)
file_images = html.union(css)
# Regular files in images/, named the way the site refers to them
# ("images/<name>"). Sub-directories are left alone: os.remove() cannot
# delete a directory and would abort the run part-way through.
directory_images = set(
    "images/{}".format(item)
    for item in os.listdir(image_directory)
    if os.path.isfile('{}/{}'.format(image_directory, item))
)

# Delete every image file that no HTML or CSS file refers to.
unused_images = directory_images.difference(file_images)
for filename in unused_images:
    print("removing ",filename)
    os.remove('{}/{}'.format(directory,filename))

# Convert large referenced PNG files to JPG and repoint the references.
for filename in file_images:
    # The HTML scan also collects upper-case '.PNG' names.
    if not filename.lower().endswith('.png'):
        continue
    if filename.startswith('http') or os.path.basename(filename).startswith('_'):
        print('skipping {}'.format(filename))
        continue
    print('processing {}'.format(filename))
    filename_new = '{}.jpg'.format(os.path.splitext(filename)[0])
    file_path = "{}/{}".format(directory,filename)
    file_path_new = "{}/{}".format(directory,filename_new)
    # A reference does not always correspond to a file on disk (absolute
    # or protocol-relative URLs, stale links); skip it instead of letting
    # os.stat() raise and abort the whole run.
    if not os.path.isfile(file_path):
        print('skipping {} (file not found)'.format(filename))
        continue
    if os.stat(file_path).st_size <= large_png:
        continue
    # Never overwrite a different image that already uses the .jpg name.
    if os.path.exists(file_path_new):
        print('skipping {} ({} already exists)'.format(filename, filename_new))
        continue
    print('converting {} to jpg'.format(file_path))
    # Close the source image before deleting it; JPG has no alpha
    # channel, hence the conversion to RGB.
    with Image.open(file_path) as im:
        rgb_im = im.convert('RGB')
        rgb_im.save(file_path_new)
    os.remove(file_path)

    replace_in_files(filename, filename_new)
	import re
	import os
	import sys
	import shutil
	from tempfile import mkstemp
	from PIL import Image

	# PNG files larger than this many bytes are converted to JPG.
	large_png = 250000
	# Only CSS files whose names end with this suffix are scanned and rewritten.
	css_file_pattern = '.webflow.css'

	#---- no need to change anything below here ----#
	# Exit with a usage hint instead of a bare IndexError when no folder is given.
	if len(sys.argv) < 2:
	    sys.exit('usage: python cleanimages.py site_folder')
	# Root of the site; HTML files are read directly from this folder.
	directory = sys.argv[1]
	css_directory = '{}/css'.format(directory)
	image_directory = '{}/images'.format(directory)
	# Image references collected from the HTML and the CSS files respectively.
	output = []
	css_output = []

	# this function is adapted from a Stack Overflow answer
	# see https://stackoverflow.com/questions/12714415/python-equivalent-to-sed/40843600#40843600
	def sed(pattern, replace, source, dest=None, count=0):
	    """Copy a text file line by line, substituting a regex pattern.

	    Every line of ``source`` is passed through ``re.subn``. The result is
	    written to ``dest``; when ``dest`` is not given it is written to a
	    temporary file which then replaces ``source``.

	    Args:
	        pattern (str): pattern to match (can be a compiled re pattern)
	        replace (str): replacement str (``re.sub`` template syntax)
	        source (str): input filename
	        dest (str): destination filename, if not given, source will be
	            over written.
	        count (int): maximum number of occurrences to replace in the
	            whole file; 0 (the default) replaces every occurrence.
	    """
	    limited = count > 0
	    remaining = count
	    tmp_name = None

	    with open(source, 'r') as fin:
	        if dest:
	            fout = open(dest, 'w')
	        else:
	            # Wrap the descriptor returned by mkstemp so it is closed
	            # together with the file object instead of being leaked.
	            fd, tmp_name = mkstemp()
	            fout = os.fdopen(fd, 'w')

	        try:
	            with fout:
	                for line in fin:
	                    if limited and remaining == 0:
	                        # Quota used up: copy the remaining lines verbatim.
	                        fout.write(line)
	                        continue
	                    out, hits = re.subn(pattern, replace, line,
	                                        count=remaining if limited else 0)
	                    if limited:
	                        remaining -= hits
	                    fout.write(out)
	        except BaseException:
	            # Do not leave a stray temporary file behind on failure.
	            if tmp_name is not None:
	                os.remove(tmp_name)
	            raise

	    if tmp_name is not None:
	        # mkstemp creates the file with owner-only permissions; carry over
	        # the permission bits of the file that is being replaced.
	        shutil.copymode(source, tmp_name)
	        shutil.move(tmp_name, source)

	def replace_in_files(old,new):
	    """Replace the literal text ``old`` with ``new`` in the site's files.

	    Rewrites, in place, every ``.html`` file directly inside ``directory``
	    and every file in ``css_directory`` whose name ends with
	    ``css_file_pattern``.

	    Args:
	        old (str): text to look for (e.g. an image path); matched literally
	        new (str): text to put in its place
	    """
	    # sed() treats its arguments as a regex and a replacement template, so
	    # escape both: otherwise the '.' in 'photo.png' matches any character
	    # and a backslash in ``new`` would be read as an escape sequence.
	    pattern = re.escape(old)
	    replacement = new.replace('\\', '\\\\')

	    targets = [
	        (directory, ".html"),
	        (css_directory, css_file_pattern),
	    ]
	    for folder, suffix in targets:
	        for filename in os.listdir(folder):
	            if filename.endswith(suffix):
	                sed(pattern, replacement, '{}/{}'.format(folder, filename))


	# get html matches
	# Collect anything in the top-level HTML pages that looks like an image
	# reference. The alternation uses plain '|': an escaped '\|' would only
	# match a literal pipe character and find no images at all.
	pattern = r'[^\"\'=\s]+\.(?:jpe?g|png|PNG|gif|ico)'
	for filename in os.listdir(directory):
	    if filename.endswith(".html"):
	        with open('{}/{}'.format(directory,filename)) as html:
	            content = html.read()
	        matches = re.findall(pattern, content)
	        output.extend(matches)


	# get css matches
	# Accept the same image extensions as the HTML scan, and url() values that
	# are single-quoted, double-quoted or unquoted: a reference missed here
	# makes the image look unused, and unused images are deleted.
	css_url_pattern = r'url\(\s*[\'"]?\.\./([^\'")]+?)\s*[\'"]?\s*\)'
	css_image_extensions = ('.jpg', '.jpeg', '.png', '.PNG', '.gif', '.ico')
	for filename in os.listdir(css_directory):
	    if filename.endswith(css_file_pattern):
	        with open('{}/{}'.format(css_directory,filename)) as css_file:
	            content = css_file.read()
	        # The capture group is the path relative to the site root,
	        # e.g. "images/photo.png" for url('../images/photo.png').
	        matches_ = re.findall(css_url_pattern, content)
	        matches = [match for match in matches_ if match.endswith(css_image_extensions)]
	        css_output.extend(matches)

	# get images
	# Everything referenced from either the HTML or the CSS files.
	html = set(output)
	css = set(css_output)
	file_images = html.union(css)
	# Regular files in images/, named the way the site refers to them
	# ("images/<name>"). Sub-directories are left alone: os.remove() cannot
	# delete a directory and would abort the run part-way through.
	directory_images = set(
	    "images/{}".format(item)
	    for item in os.listdir(image_directory)
	    if os.path.isfile('{}/{}'.format(image_directory, item))
	)

	# Delete every image file that no HTML or CSS file refers to.
	unused_images = directory_images.difference(file_images)
	for filename in unused_images:
	    print("removing ",filename)
	    os.remove('{}/{}'.format(directory,filename))

	# Convert large referenced PNG files to JPG and repoint the references.
	for filename in file_images:
	    # The HTML scan also collects upper-case '.PNG' names.
	    if not filename.lower().endswith('.png'):
	        continue
	    if filename.startswith('http') or os.path.basename(filename).startswith('_'):
	        print('skipping {}'.format(filename))
	        continue
	    print('processing {}'.format(filename))
	    filename_new = '{}.jpg'.format(os.path.splitext(filename)[0])
	    file_path = "{}/{}".format(directory,filename)
	    file_path_new = "{}/{}".format(directory,filename_new)
	    # A reference does not always correspond to a file on disk (absolute
	    # or protocol-relative URLs, stale links); skip it instead of letting
	    # os.stat() raise and abort the whole run.
	    if not os.path.isfile(file_path):
	        print('skipping {} (file not found)'.format(filename))
	        continue
	    if os.stat(file_path).st_size <= large_png:
	        continue
	    # Never overwrite a different image that already uses the .jpg name.
	    if os.path.exists(file_path_new):
	        print('skipping {} ({} already exists)'.format(filename, filename_new))
	        continue
	    print('converting {} to jpg'.format(file_path))
	    # Close the source image before deleting it; JPG has no alpha
	    # channel, hence the conversion to RGB.
	    with Image.open(file_path) as im:
	        rgb_im = im.convert('RGB')
	        rgb_im.save(file_path_new)
	    os.remove(file_path)

	    replace_in_files(filename, filename_new)