# zcorpan/fetch-img.py

## fetch-img.py
import re
from HTMLParser import HTMLParser
from urlparse import urljoin
import os

# Result of the most recent start tag seen by MyHTMLParser.  These are module
# globals because the scanning loop reads them directly after each feed().
src = ''        # value of the tag's first ``src`` attribute ('' if none/empty)
width = False   # True when the tag carries a ``width`` attribute
height = False  # True when the tag carries a ``height`` attribute


class MyHTMLParser(HTMLParser):
    """Publish the ``src``/``width``/``height`` of each start tag via globals."""

    def handle_starttag(self, tag, attrs):
        """Record the attributes of interest for the tag just parsed.

        The globals are cleared first, so after a feed() they describe only
        the last start tag that was parsed.

        tag   -- tag name (unused; every start tag is treated the same way).
        attrs -- list of (name, value) pairs; value is None for an attribute
                 written without a value, e.g. ``<img src>``.
        """
        global src
        global width
        global height
        src = ''
        width = False
        height = False
        # Only the first ``src`` counts, even when it is empty or valueless;
        # later duplicates are ignored.
        src_seen = False
        for name, value in attrs:
            if name == 'src':
                if not src_seen:
                    src_seen = True
                    # Normalise a valueless attribute (None) to '' so callers
                    # can always treat ``src`` as a string.
                    src = value or ''
            elif name == 'width':
                width = True
            elif name == 'height':
                height = True

# Walk img.txt line by line (presumably one crawled tag per line), download
# the image each line points at and log the ones whose EXIF IFD0 orientation
# is reported as something other than "Horizontal".
parser = MyHTMLParser()

# Imported here so this section carries its own dependency.  curl and exiftool
# are started without a shell, so a crawled URL is never parsed as shell
# syntax.
import subprocess

# Compiled once; both patterns are applied to every input line.
site_pattern = re.compile('/../([^_]+)_')
skipped_scheme_pattern = re.compile('^(ftp|file|javascript|data):')


def _read_orientation(url):
    """Return exiftool's stripped IFD0:Orientation output for the image at url.

    Runs the equivalent of ``curl -s URL | exiftool -a -IFD0:Orientation -``
    with argument lists instead of a shell string.  Returns '' when exiftool
    prints nothing.  Raises OSError if either program cannot be started.
    """
    # "--" ends curl's option parsing so a URL beginning with "-" is still
    # treated as a URL.
    curl = subprocess.Popen(['curl', '-s', '--', url], stdout=subprocess.PIPE)
    try:
        try:
            exiftool = subprocess.Popen(
                ['exiftool', '-a', '-IFD0:Orientation', '-'],
                stdin=curl.stdout, stdout=subprocess.PIPE)
        finally:
            # Drop our copy of the pipe so curl is not left blocked on a
            # reader that will never come if exiftool exits early or fails
            # to start.
            curl.stdout.close()
        return exiftool.communicate()[0].strip()
    finally:
        curl.wait()


f = open('img.txt', 'rb')
has_dup_orientation_output = open('has-dup-orientation.txt', 'a', 1)
has_orientation_output = open('has-orientation.txt', 'a', 1)
has_width_height_output = open('has-width-height.txt', 'a', 1)

count = 0
sites = 0
last_site = ''
try:
    for line in f:
        count += 1
        # Clear the per-line state up front: a line that triggers no start
        # tag must not inherit the previous line's src/width/height, and the
        # error message below must never show a stale site or URL.
        base = ''
        src = ''
        width = False
        height = False
        try:
            site = site_pattern.search(line)
            if not site:
                continue
            base = 'http://' + site.group(1)
            if base != last_site:
                sites += 1
                last_site = base
                print("at site " + str(sites) + ", image " + str(count))
            # Discard anything buffered from an incomplete tag on an earlier
            # line before parsing this one.
            parser.reset()
            parser.feed(line.decode('windows-1252', 'ignore'))
            # Covers both '' and None (valueless ``src`` attribute).
            if not src:
                continue
            src = src.strip()
            if skipped_scheme_pattern.search(src):
                continue
            src = urljoin(base, src)

            output = _read_orientation(src)
            if output == "":
                continue
            # num is the number of output lines minus one; with ``-a``
            # exiftool may print the tag more than once.
            num = output.count("\n")
            num_horizontal = output.count("Horizontal")
            if num > 0 and num + 1 != num_horizontal:
                has_dup_orientation_output.write(str(count) + ": " + base + " " + src + "\n")
            if num_horizontal > 0:
                continue
            has_orientation_output.write(str(count) + ": " + base + " " + src + "\n")
            if width and height:
                has_width_height_output.write(str(count) + ": " + base + " " + src + "\n")
        except Exception:
            # Best effort: report the failure and move on to the next line.
            # ``Exception`` (not a bare except) keeps Ctrl-C working, and
            # %-formatting cannot itself fail when src is None.
            print("%d: error %s %s" % (count, base, src))
finally:
    for handle in (f, has_dup_orientation_output, has_orientation_output,
                   has_width_height_output):
        handle.close()
	import re
	from HTMLParser import HTMLParser
	from urlparse import urljoin
	import os

	# Result of the most recent start tag seen by MyHTMLParser.  These are module
	# globals because the scanning loop reads them directly after each feed().
	src = ''        # value of the tag's first ``src`` attribute ('' if none/empty)
	width = False   # True when the tag carries a ``width`` attribute
	height = False  # True when the tag carries a ``height`` attribute


	class MyHTMLParser(HTMLParser):
	    """Publish the ``src``/``width``/``height`` of each start tag via globals."""

	    def handle_starttag(self, tag, attrs):
	        """Record the attributes of interest for the tag just parsed.

	        The globals are cleared first, so after a feed() they describe only
	        the last start tag that was parsed.

	        tag   -- tag name (unused; every start tag is treated the same way).
	        attrs -- list of (name, value) pairs; value is None for an attribute
	                 written without a value, e.g. ``<img src>``.
	        """
	        global src
	        global width
	        global height
	        src = ''
	        width = False
	        height = False
	        # Only the first ``src`` counts, even when it is empty or valueless;
	        # later duplicates are ignored.
	        src_seen = False
	        for name, value in attrs:
	            if name == 'src':
	                if not src_seen:
	                    src_seen = True
	                    # Normalise a valueless attribute (None) to '' so
	                    # callers can always treat ``src`` as a string.
	                    src = value or ''
	            elif name == 'width':
	                width = True
	            elif name == 'height':
	                height = True

	# Walk img.txt line by line (presumably one crawled tag per line), download
	# the image each line points at and log the ones whose EXIF IFD0 orientation
	# is reported as something other than "Horizontal".
	parser = MyHTMLParser()

	# Imported here so this section carries its own dependency.  curl and
	# exiftool are started without a shell, so a crawled URL is never parsed as
	# shell syntax.
	import subprocess

	# Compiled once; both patterns are applied to every input line.  The
	# alternation uses plain "|" -- an escaped "\|" would match a literal pipe
	# character instead of separating the alternatives.
	site_pattern = re.compile('/../([^_]+)_')
	skipped_scheme_pattern = re.compile('^(ftp|file|javascript|data):')


	def _read_orientation(url):
	    """Return exiftool's stripped IFD0:Orientation output for the image at url.

	    Runs the equivalent of ``curl -s URL | exiftool -a -IFD0:Orientation -``
	    with argument lists instead of a shell string.  Returns '' when exiftool
	    prints nothing.  Raises OSError if either program cannot be started.
	    """
	    # "--" ends curl's option parsing so a URL beginning with "-" is still
	    # treated as a URL.
	    curl = subprocess.Popen(['curl', '-s', '--', url], stdout=subprocess.PIPE)
	    try:
	        try:
	            exiftool = subprocess.Popen(
	                ['exiftool', '-a', '-IFD0:Orientation', '-'],
	                stdin=curl.stdout, stdout=subprocess.PIPE)
	        finally:
	            # Drop our copy of the pipe so curl is not left blocked on a
	            # reader that will never come if exiftool exits early or
	            # fails to start.
	            curl.stdout.close()
	        return exiftool.communicate()[0].strip()
	    finally:
	        curl.wait()


	f = open('img.txt', 'rb')
	has_dup_orientation_output = open('has-dup-orientation.txt', 'a', 1)
	has_orientation_output = open('has-orientation.txt', 'a', 1)
	has_width_height_output = open('has-width-height.txt', 'a', 1)

	count = 0
	sites = 0
	last_site = ''
	try:
	    for line in f:
	        count += 1
	        # Clear the per-line state up front: a line that triggers no start
	        # tag must not inherit the previous line's src/width/height, and
	        # the error message below must never show a stale site or URL.
	        base = ''
	        src = ''
	        width = False
	        height = False
	        try:
	            site = site_pattern.search(line)
	            if not site:
	                continue
	            base = 'http://' + site.group(1)
	            if base != last_site:
	                sites += 1
	                last_site = base
	                print("at site " + str(sites) + ", image " + str(count))
	            # Discard anything buffered from an incomplete tag on an
	            # earlier line before parsing this one.
	            parser.reset()
	            parser.feed(line.decode('windows-1252', 'ignore'))
	            # Covers both '' and None (valueless ``src`` attribute).
	            if not src:
	                continue
	            src = src.strip()
	            if skipped_scheme_pattern.search(src):
	                continue
	            src = urljoin(base, src)

	            output = _read_orientation(src)
	            if output == "":
	                continue
	            # num is the number of output lines minus one; with ``-a``
	            # exiftool may print the tag more than once.
	            num = output.count("\n")
	            num_horizontal = output.count("Horizontal")
	            if num > 0 and num + 1 != num_horizontal:
	                has_dup_orientation_output.write(str(count) + ": " + base + " " + src + "\n")
	            if num_horizontal > 0:
	                continue
	            has_orientation_output.write(str(count) + ": " + base + " " + src + "\n")
	            if width and height:
	                has_width_height_output.write(str(count) + ": " + base + " " + src + "\n")
	        except Exception:
	            # Best effort: report the failure and move on to the next
	            # line.  ``Exception`` (not a bare except) keeps Ctrl-C
	            # working, and %-formatting cannot itself fail when src is
	            # None.
	            print("%d: error %s %s" % (count, base, src))
	finally:
	    for handle in (f, has_dup_orientation_output, has_orientation_output,
	                   has_width_height_output):
	        handle.close()