@FurloSK
Forked from MrCheatEugene/extract_har.py
Last active November 20, 2023 10:36
Python 3 script to extract images from HTTP Archive (HAR) files (Tested & working on Python 3.11)
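A typical invocation might look like this (the file names below are placeholders; the output directory argument is optional and defaults to <input.har>_extract under the current working directory):

    python3 extract_har.py session.har extracted_images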
#!/usr/bin/env python3
# Python 3 script to extract images from HTTP Archive (HAR) files (Tested & working on Python 3.11)
#
# Original code: kafran
# https://gist.github.com/kafran/0257c13b3d0a79620695b73062334930
# Updated code: Lewiscowles1986
# https://gist.github.com/Lewiscowles1986/645e79295efa84698f4e45cd06d610ea
# This code from: MrCheatEugene
# https://gist.github.com/MrCheatEugene/46ad8173e83efb70cf6543cb36629403
# Updated & tweaked by: FurloSK [originally developed 2023-11-03]
# https://gist.github.com/FurloSK/0477e01024f701db42341fc3223a5d8c
#
# Changes / Release notes:
# 2023-11-20
# Prevents overwriting existing files by appending a number to the filename and displays a warning in such a case.
# 2023-11-03
# First working version.
import json
import base64
import os
import sys
# BEGIN config part
# Allowed mimetypes/extensions to be parsed:
# Determines which mimetypes will be saved when parsing the *.har file
mimetypes = {
    "image/webp": ".webp",
    "image/jpeg": ".jpeg",  # .jpg is also common, but .jpeg is the official extension and thus preferred
    "image/png": ".png",
    "image/svg+xml": ".svg"
}
# Output file path creation rules:
# Determines whether to create subfolders for domain/path when exporting files.
# When <outputPath> is True, <outputPathDepth> determines how many url path
# parts (delimited by /) will be used, and from which end of url to start.
outputDomain = True
outputPath = True
outputPathDepth = 0 # 0=all, 2=first two path parts , -2=last two path parts
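# Illustrative example (added for clarity, not part of the original config notes):
# with the defaults above (outputDomain=True, outputPath=True, outputPathDepth=0),
# an image downloaded from
#   https://example.com/assets/img/photo.png
# is written to
#   <output_dir>/example.com/assets/img/photo.png
# With outputPathDepth = 1 it goes to <output_dir>/example.com/assets/photo.png,
# and with outputPathDepth = -1 to <output_dir>/example.com/img/photo.png.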
# END config part
#=============================================================================
# Start program
#=============================================================================
# check cmd arguments
if len(sys.argv) < 2 or len(sys.argv) > 3:
    print('extract_har: A simple script to extract all image files from a *.har file.')
    print('Usage: extract_har.py <input.har> [<output_dir>]')
    sys.exit(0)
# get *.har file to work with
if not os.path.exists(sys.argv[1]):
    print('Specified *.har file (' + sys.argv[1] + ') does not exist, exiting...')
    sys.exit(1)
with open(sys.argv[1], "r", encoding="utf8") as f:
    print('Loading *.har file: ' + sys.argv[1])
    har = json.loads(f.read())
# get base directory for extraction
if len(sys.argv) == 3:
    folder = sys.argv[2]
else:
    folder = os.path.join(os.getcwd(), sys.argv[1] + '_extract')
baseFolder = os.path.basename(os.path.normpath(folder))
print('Destination folder: ' + baseFolder)
if not os.path.isdir(folder):
    print(' Creating folder structure for extraction...')
    os.makedirs(folder)
# save extensions list from allowed mimetypes
# (the .jpg alias is accepted as well, so existing *.jpg names do not get a second extension appended)
extensions = tuple(mimetypes.values()) + (".jpg",)
# print some info
print('\nExtraction output settings:')
if not outputDomain and not outputPath:
    print(' do not create any directory structure - extract images directly to base folder')
else:
    print(' create subfolders for domain: ' + str(outputDomain))
    if outputPath and outputPathDepth != 0:
        print(' create subfolders for URL path: ' + str(outputPath)
              + ' (only for ' + ('first ' if outputPathDepth > 0 else 'last ')
              + str(abs(outputPathDepth)) + ' parts)')
    else:
        print(' create subfolders for URL path: ' + str(outputPath))
# start parsing the individual entries
print('\nStarting extraction...')
entries = har["log"]["entries"]
count_total = 0
count_extracted = 0
for entry in entries:
    count_total += 1
    # only parse the entry if it is one of the wanted extensions
    mimetype = entry["response"]["content"]["mimeType"]
    if mimetype in mimetypes:
        count_extracted += 1
        # parse entry url
        url = entry["request"]["url"]
        urlProtocol, tmp = url.split("//", 1) # protocol
        tmp = tmp.split("/")
        urlDomain = tmp[0] # domain
        urlFullPath = tmp[1:]
        # extract file name and path
        urlPath = urlFullPath[:-1] # path
        urlFilename = urlFullPath[-1] # filename
        # if filename does not contain expected extension, append it
        if not urlFilename.endswith(extensions):
            urlFilename = urlFilename + mimetypes.get(mimetype)
        #print(str(urlProtocol), str(urlDomain), str(urlPath), str(urlFilename))
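        # Example (added for illustration): for url = "https://example.com/assets/img/photo"
        # with mimeType "image/png", urlDomain is "example.com", urlPath is
        # ["assets", "img"], and urlFilename ends up as "photo.png" because the
        # missing ".png" extension was appended just above.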
        # determine subfolders structure according to config settings
        pathStr = ''
        if outputDomain and outputPath:
            pathStr = os.path.join(urlDomain,
                *((urlPath[:outputPathDepth]) if outputPathDepth > 0 else (urlPath[outputPathDepth:])))
        elif outputDomain:
            pathStr = urlDomain
        elif outputPath:
            # the leading '' keeps os.path.join() from failing when the URL has no path parts
            pathStr = os.path.join('',
                *((urlPath[:outputPathDepth]) if outputPathDepth > 0 else (urlPath[outputPathDepth:])))
        # construct final file path to save extracted file
        if outputDomain or outputPath:
            subFolder = os.path.join(folder, pathStr)
        else:
            subFolder = folder
        outFile = os.path.join(subFolder, urlFilename)
        # prepare subfolder
        if not os.path.isdir(subFolder):
            os.makedirs(subFolder)
        # parse raw data to image data
        image64 = entry["response"]["content"]["text"]
        image = base64.b64decode(image64)
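        # Note (added for clarity): HAR files store binary response bodies
        # base64-encoded in content.text (with content.encoding set to "base64");
        # this script assumes that is the case for every matched image entry.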
        # check if file exists
        print(' ' + urlFilename + ' [' + str(len(image)) + ' bytes]:')
        print(' extracting to: ' + (pathStr if len(pathStr) else baseFolder))
        if os.path.exists(outFile):
            print(' ⚠️ WARNING: file ' + urlFilename + ' already exists!')
            fixFilename, fixExtension = os.path.splitext(urlFilename)
            counter = 1
            while os.path.exists(outFile):
                fixFile = fixFilename + " (" + str(counter) + ")" + fixExtension
                outFile = os.path.join(subFolder, fixFile)
                counter += 1
            print(' ℹ️ Creating file ' + fixFile + '...')
        # save data to extracted file
        with open(outFile, "wb") as f:
            f.write(image)
print('\nFinished extracting ' + str(count_extracted) + ' (out of total ' + str(count_total) + ') files.')
@MrCheatEugene

Hey, thanks for the update!

@Lewiscowles1986 commented Nov 4, 2023

Lol, I love you and this. It's basically a different program to the original fork now.

I just updated my fork to map more image mime-types to file extensions, using MDN and searching for image/

https://gist.github.com/Lewiscowles1986/645e79295efa84698f4e45cd06d610ea#file-extract_har-py-L9-L21
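For reference, a rough sketch (not copied from the linked fork, which may use a different set of types) of what such an extended mapping could look like in this script's mimetypes dict:

    mimetypes = {
        "image/webp": ".webp",
        "image/jpeg": ".jpeg",
        "image/png": ".png",
        "image/svg+xml": ".svg",
        "image/gif": ".gif",
        "image/avif": ".avif",
        "image/bmp": ".bmp",
        "image/tiff": ".tiff",
        "image/x-icon": ".ico"
    }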
