Skip to content

Instantly share code, notes, and snippets.

@zcorpan zcorpan/fetch-img.py
Last active Aug 29, 2015

Embed
What would you like to do?
Exif research
import os
import re
import subprocess
from HTMLParser import HTMLParser
from urlparse import urljoin
# Attribute state of the most recently parsed start tag. These module-level
# globals are overwritten by MyHTMLParser.handle_starttag and read back by
# the main loop: `src` holds the tag's src value ('' when absent), while
# `width` / `height` record whether those attributes were present.
src, width, height = '', False, False
class MyHTMLParser(HTMLParser):
    """HTML parser that records src/width/height of the last start tag seen.

    Results are published through the module-level globals ``src``,
    ``width`` and ``height`` rather than instance attributes, so only the
    most recently handled start tag is visible to the caller.
    """

    def handle_starttag(self, tag, attrs):
        """Reset the globals, then fill them from this tag's attributes.

        tag   -- tag name (unused: every start tag is treated alike).
        attrs -- list of (name, value) pairs for the tag.

        Afterwards ``src`` is the first non-empty src value ('' if none),
        and ``width`` / ``height`` are True when the attribute is present.
        """
        global src
        global width
        global height
        src = ''
        width = False
        height = False
        for name, value in attrs:
            if name == 'src' and src == '':
                # HTMLParser passes None for an attribute written without
                # a value (e.g. <img src>). Keep src a string so callers
                # can always compare and strip() it.
                src = value or ''
            elif name == 'width':
                width = True
            elif name == 'height':
                height = True
def _read_orientation(url):
    """Return exiftool's report of the IFD0 Orientation tag(s) for *url*.

    Runs the pipeline ``curl -s URL | exiftool -a -IFD0:Orientation -``
    without a shell, so characters in a scraped URL are never interpreted
    as shell syntax. Returns exiftool's stdout with surrounding whitespace
    stripped ('' when nothing was reported). Raises OSError if either
    program cannot be started.
    """
    curl = subprocess.Popen(['curl', '-s', url], stdout=subprocess.PIPE)
    try:
        exiftool = subprocess.Popen(
            ['exiftool', '-a', '-IFD0:Orientation', '-'],
            stdin=curl.stdout, stdout=subprocess.PIPE)
    except OSError:
        curl.stdout.close()
        curl.wait()
        raise
    # Drop our copy of the pipe so curl sees a broken pipe (instead of
    # blocking forever) if exiftool exits before reading everything.
    curl.stdout.close()
    output = exiftool.communicate()[0]
    curl.wait()
    return output.strip()


# Each line of img.txt is expected to contain a "/xx/<host>_" path fragment
# (identifying the site) and the markup of one tag carrying a src attribute.
site_pattern = re.compile(r'/../([^_]+)_')
skipped_scheme_pattern = re.compile(r'^(ftp|file|javascript|data):')

parser = MyHTMLParser()
count = 0
sites = 0
last_site = ''
# Output files are line-buffered and opened for append so results survive
# an interrupted run; the with-statement closes all four files on exit.
with open('img.txt', 'rb') as f, \
        open('has-dup-orientation.txt', 'a', 1) as has_dup_orientation_output, \
        open('has-orientation.txt', 'a', 1) as has_orientation_output, \
        open('has-width-height.txt', 'a', 1) as has_width_height_output:
    for line in f:
        count += 1
        # handle_starttag only runs when the line contains a start tag, so
        # clear the shared state first; otherwise a line without one would
        # silently reuse the previous line's values.
        base = ''
        src = ''
        width = False
        height = False
        try:
            site = site_pattern.search(line)
            if not site:
                continue
            base = 'http://' + site.group(1)
            if base != last_site:
                sites += 1
                last_site = base
                print("at site " + str(sites) + ", image " + str(count))
            # Discard any unfinished markup left over from the previous
            # line so every line is parsed independently.
            parser.reset()
            parser.feed(line.decode('windows-1252', 'ignore'))
            src = (src or '').strip()
            if not src:
                continue
            if skipped_scheme_pattern.search(src):
                continue
            src = urljoin(base, src)
            output = _read_orientation(src)
            if output == "":
                continue
            # exiftool prints one line per Orientation tag found.
            num = output.count("\n")
            num_horizontal = output.count("Horizontal")
            record = str(count) + ": " + base + " " + src + "\n"
            # More than one Orientation tag and not all of them "Horizontal".
            if num > 0 and num + 1 != num_horizontal:
                has_dup_orientation_output.write(record)
            # Any "Horizontal (normal)" entry means nothing more to record.
            if num_horizontal > 0:
                continue
            has_orientation_output.write(record)
            if width and height:
                has_width_height_output.write(record)
        except Exception:
            # Best effort: report the failing line and carry on. %s copes
            # with non-string values, and KeyboardInterrupt / SystemExit
            # are no longer swallowed, so the run can be stopped.
            print("%s: error %s %s" % (count, base, src))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
You can’t perform that action at this time.