Skip to content

Instantly share code, notes, and snippets.

@BTBTravis
Last active March 16, 2018 21:54
Show Gist options
  • Save BTBTravis/9a3dc3a2da191e6cd5434eb1ce3ea810 to your computer and use it in GitHub Desktop.
Save BTBTravis/9a3dc3a2da191e6cd5434eb1ce3ea810 to your computer and use it in GitHub Desktop.
Python script to scrape image files out of a data dump, including images embedded as base64-encoded img src data URIs
#! /usr/bin/python
import re
# from base64 import decodestring
import base64
# import base64
f = open("dump.txt","r")
str = f.read()
# str = 'lalala <img src=""> lalala <img src="">'
# str = '<img src=""> '
# <img src=\"data:image/png;base64
# p = re.compile(r'src=\"data:image/([^;]+);base64,([^>]+)')
p = re.compile(r'src=\\"data:image\/([^;]+);base64,([^"]+)')
result = p.findall(str)
# m = re.search('scr="([^"]+)', str)
print(len(result))
# m.group(0)
# print(result)
def pad(data):
missing_padding = len(data) % 4
if missing_padding != 0:
data += b'='* (4 - missing_padding)
return data
# return base64.decodestring(data)
for k,v in enumerate(result):
# print(k)
# print(v)
t = v[0]
s = v[1][:-1]
s = s.replace('\r\n', '')
p = pad(s)
# print({
# 't': t,
# 's': s,
# 'pad': p
# })
fh = open("dump_" + `k` + "." + t, "wb")
fh.write(p.decode('base64'))
# SELECT ptc.`pm_task_comment` FROM `pm_tasks` pt
# LEFT JOIN `pm_tasks_comments` ptc ON ptc.`issue_id` = pt.`pm_task_id`
# WHERE
# pt.`pm_tasks_client_id` = 179
# AND ptc.`pm_task_comment` LIKE '%data%';
# LIMIT 1;
input()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment