whoeverest/gist:4065917

## gistfile1.txt
import urllib2
import bs4 as BeautifulSoup
from PIL import Image
from cStringIO import StringIO

import tempfile, subprocess

# Base URL of the scraped page; each image's `src` is appended to it.
URL = "http://www.skopjefinki.ekoinformatika.mk"
# Colour written by remove_black(); its channel sum is also the reference
# value that TRESHOLD is applied to.
WHITE = (255, 255, 255)
# RGBA value get_line() compares pixels against when scanning a column.
BLACK = (0, 0, 0, 255)
# Box handed to Image.crop() in get_max_values(), i.e. (left, upper, right,
# lower).  Presumably the area holding the printed maximum -- confirm.
MAX_BOX = (5, 45, 50, 65)
# Share of sum(WHITE) below which remove_black() whitens a pixel.  The name is
# a misspelling of "THRESHOLD", kept because other code refers to it.
TRESHOLD = 0.7


def get_scalar_value(line, name="n", max=100, min=0):
    """Convert a one-pixel-wide strip of a graph into a scaled reading.

    The darkest pixel in column 0 of *line* is taken as the plotted point.
    Its height above the bottom of the strip, as a fraction of the strip's
    height, is mapped linearly onto the range ``min``..``max``.  Only rows
    whose mean channel value is at most 200 are considered; when several rows
    tie for darkest the lowest one (largest ``y``) wins, and when no row
    qualifies the reading is ``min``.

    As a side effect the strip is saved as ``"<name>_(<value>).png"`` in the
    current directory.

    :param line: image-like object providing ``size``, ``load()`` and
        ``save(path)`` (a PIL image in this file).
    :param name: label used in the file name of the saved strip.
    :param max: reading that corresponds to the full height of the strip.
    :param min: reading that corresponds to zero height.
    :returns: the reading as a string with two decimals.
    :raises ValueError: if the strip has no rows.
    """
    height = line.size[1]
    if height <= 0:
        raise ValueError("cannot read a value from an empty line image")
    pixels = line.load()
    darkest, rows_from_bottom = 200, 0
    for y in range(height):
        pixel = pixels[0, y]
        brightness = sum(pixel) / float(len(pixel))
        # "<=" on purpose: on a tie the lower row replaces the higher one.
        if brightness <= darkest:
            darkest, rows_from_bottom = brightness, height - y
    fraction = rows_from_bottom / float(height)
    value = "%.2f" % (min + fraction * (max - min))
    line.save("%s_(%s).png" % (name, value))
    return value

def get_max_values(tags):
    """Download every graph in *tags*, read it, and build the OCR strip.

    For each tag the image at ``URL + src`` is fetched.  Its plotted value is
    measured with get_line() / get_scalar_value() and stored under the tag's
    ``alt`` text.  The ``MAX_BOX`` region of the image is cleaned with
    remove_black() and pasted, left to right, into one wide image;
    ``comma.png`` is then pasted at the same offset.  The wide image is
    written to ``max_values.png`` after each tag, so a failure part-way
    through still leaves the tags processed so far on disk.

    :param tags: sequence of tag objects exposing ``get("src")`` and
        ``get("alt")``.
    :returns: dict mapping each ``alt`` text to its reading (also printed).
    """
    box_width = MAX_BOX[2] - MAX_BOX[0]
    box_height = MAX_BOX[3] - MAX_BOX[1]
    max_values = Image.new("RGBA", (len(tags) * box_width, box_height), None)
    values = {}
    for tag_no, tag in enumerate(tags):
        name = tag.get("alt")
        response = urllib2.urlopen(URL + tag.get("src"))
        try:
            image_data = StringIO(response.read())
        finally:
            # Release the connection even if the download fails half-way.
            response.close()
        image = Image.open(image_data)
        values[name] = get_scalar_value(get_line(image), name)
        max_value = image.crop(MAX_BOX)
        offset = (tag_no * max_value.size[0], 0)
        max_values.paste(remove_black(max_value), offset)
        max_values.paste(Image.open('comma.png'), offset)
        # One save per tag is enough; the earlier intermediate save was
        # overwritten immediately.
        max_values.save("max_values.png")

    print(values)
    return values

def remove_black(image):
    """Replace the dark pixels of *image* with white, in place.

    A pixel counts as dark when the sum of its channel values is below
    ``TRESHOLD`` times the channel sum of ``WHITE``.  Every channel returned
    by ``getpixel`` takes part in the sum, so an alpha channel, if present,
    counts too.

    :param image: image whose pixels are tuples; it is modified in place.
    :returns: the same image object.
    """
    cutoff = sum(WHITE) * TRESHOLD  # loop-invariant, so compute it once
    width, height = image.size
    for x in range(width):
        for y in range(height):
            if sum(image.getpixel((x, y))) < cutoff:
                image.putpixel((x, y), WHITE)
    return image

def get_line(image):
    """Cut out the one-pixel-wide strip that get_scalar_value() reads.

    1. On the middle row, locate the right-most pixel whose channel values
       sum to 255 (column 0 if there is none).
    2. Walk down that column: the first ``BLACK`` row becomes the top marker
       and every later ``BLACK`` row moves the bottom marker.
    3. Return the column two pixels to the left of it, from the row below the
       top marker down to (not including) the bottom marker.

    NOTE: 0 doubles as "top marker not found yet", so a ``BLACK`` pixel on
    row 0 is superseded by the next ``BLACK`` row.

    :param image: PIL image of a graph.
    :returns: the cropped strip as a new image.
    """
    width, height = image.size
    middle_row = height // 2

    column = 0
    for x in range(width):
        if sum(image.getpixel((x, middle_row))) == 255:
            column = x

    top = bottom = 0
    for y in range(height):
        if image.getpixel((column, y)) != BLACK:
            continue
        if top == 0:
            top = y
        else:
            bottom = y

    return image.crop((column - 2, top + 1, column - 1, bottom))

def image_to_text(image):
    """Send *image* to the App Engine Conversion API for text extraction.

    The image is encoded in memory as PNG, matching the ``image/png`` type
    the asset is declared with, and a conversion to ``text/plain`` is
    requested.

    :param image: PIL image to convert.
    :returns: whatever ``conversion.convert()`` returns for the request.
    """
    # Imported here so the rest of the module works outside App Engine.
    from google.appengine.api import conversion

    # The file imports StringIO directly from cStringIO, so it is called as
    # StringIO(), not StringIO.StringIO().
    output = StringIO()
    try:
        image.save(output, format="PNG")
        contents = output.getvalue()
    finally:
        output.close()

    asset = conversion.Asset("image/png", contents, "max_values.txt")
    conversion_obj = conversion.Conversion(asset, "text/plain")
    return conversion.convert(conversion_obj)

def ocr(image):
    """Run the ``tesseract`` command on an image file and return its text.

    Tesseract is invoked with ``-psm 7 digits`` and writes its result to
    ``<output base>.txt``; a temporary file supplies a unique output base.
    Both the temporary file and the result file are removed before returning.

    :param image: path of the image file to recognise.
    :returns: the contents of the text file tesseract produced.
    :raises IOError: if tesseract did not produce the result file.
    :raises OSError: if the ``tesseract`` executable cannot be started.
    """
    import os  # local import: only needed for the temp-file cleanup below

    placeholder = tempfile.NamedTemporaryFile(delete=False)
    # Only the unique name is needed; tesseract writes to "<name>.txt".
    placeholder.close()
    text_path = placeholder.name + '.txt'
    try:
        process = subprocess.Popen(
            ['tesseract', image, placeholder.name, '-psm', '7', 'digits'],
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        process.communicate()

        with open(text_path, 'r') as handle:
            return handle.read()
    finally:
        for path in (placeholder.name, text_path):
            if os.path.exists(path):
                os.remove(path)

def main():
    """Fetch the page at ``URL``, process its graphs and print the OCR text.

    All ``<img>`` tags in the page body except the last are passed to
    get_max_values(), which writes ``max_values.png``; that file is then run
    through ocr() and the recognised text is printed.
    """
    page = urllib2.urlopen(URL)
    try:
        html = page.read()
    finally:
        # Close the connection even when the read fails.
        page.close()
    soup = BeautifulSoup.BeautifulSoup(html)
    # The last image is skipped -- presumably it is not a graph; confirm.
    tags = soup.body.findAll('img')[:-1]
    get_max_values(tags)
    print(ocr('max_values.png'))

# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
	import urllib2
	import bs4 as BeautifulSoup
	from PIL import Image
	from cStringIO import StringIO

	import tempfile, subprocess

	# Base URL of the scraped page; each image's `src` is appended to it.
	URL = "http://www.skopjefinki.ekoinformatika.mk"
	# Colour written by remove_black(); its channel sum is also the reference
	# value that TRESHOLD is applied to.
	WHITE = (255, 255, 255)
	# RGBA value get_line() compares pixels against when scanning a column.
	BLACK = (0, 0, 0, 255)
	# Box handed to Image.crop() in get_max_values(), i.e. (left, upper, right,
	# lower).  Presumably the area holding the printed maximum -- confirm.
	MAX_BOX = (5, 45, 50, 65)
	# Share of sum(WHITE) below which remove_black() whitens a pixel.  The name
	# is a misspelling of "THRESHOLD", kept because other code refers to it.
	TRESHOLD = 0.7


	def get_scalar_value(line, name="n", max=100, min=0):
		"""Convert a one-pixel-wide strip of a graph into a scaled reading.

		The darkest pixel in column 0 of *line* is taken as the plotted point.
		Its height above the bottom of the strip, as a fraction of the strip's
		height, is mapped linearly onto the range ``min``..``max``.  Only rows
		whose mean channel value is at most 200 are considered; when several
		rows tie for darkest the lowest one (largest ``y``) wins, and when no
		row qualifies the reading is ``min``.

		As a side effect the strip is saved as ``"<name>_(<value>).png"`` in
		the current directory.

		:param line: image-like object providing ``size``, ``load()`` and
			``save(path)`` (a PIL image in this file).
		:param name: label used in the file name of the saved strip.
		:param max: reading that corresponds to the full height of the strip.
		:param min: reading that corresponds to zero height.
		:returns: the reading as a string with two decimals.
		:raises ValueError: if the strip has no rows.
		"""
		height = line.size[1]
		if height <= 0:
			raise ValueError("cannot read a value from an empty line image")
		pixels = line.load()
		darkest, rows_from_bottom = 200, 0
		for y in range(height):
			pixel = pixels[0, y]
			brightness = sum(pixel) / float(len(pixel))
			# "<=" on purpose: on a tie the lower row replaces the higher one.
			if brightness <= darkest:
				darkest, rows_from_bottom = brightness, height - y
		fraction = rows_from_bottom / float(height)
		value = "%.2f" % (min + fraction * (max - min))
		line.save("%s_(%s).png" % (name, value))
		return value

	def get_max_values(tags):
		"""Download every graph in *tags*, read it, and build the OCR strip.

		For each tag the image at ``URL + src`` is fetched.  Its plotted value
		is measured with get_line() / get_scalar_value() and stored under the
		tag's ``alt`` text.  The ``MAX_BOX`` region of the image is cleaned
		with remove_black() and pasted, left to right, into one wide image;
		``comma.png`` is then pasted at the same offset.  The wide image is
		written to ``max_values.png`` after each tag, so a failure part-way
		through still leaves the tags processed so far on disk.

		:param tags: sequence of tag objects exposing ``get("src")`` and
			``get("alt")``.
		:returns: dict mapping each ``alt`` text to its reading (also printed).
		"""
		box_width = MAX_BOX[2] - MAX_BOX[0]
		box_height = MAX_BOX[3] - MAX_BOX[1]
		max_values = Image.new("RGBA", (len(tags) * box_width, box_height), None)
		values = {}
		for tag_no, tag in enumerate(tags):
			name = tag.get("alt")
			response = urllib2.urlopen(URL + tag.get("src"))
			try:
				image_data = StringIO(response.read())
			finally:
				# Release the connection even if the download fails half-way.
				response.close()
			image = Image.open(image_data)
			values[name] = get_scalar_value(get_line(image), name)
			max_value = image.crop(MAX_BOX)
			offset = (tag_no * max_value.size[0], 0)
			max_values.paste(remove_black(max_value), offset)
			max_values.paste(Image.open('comma.png'), offset)
			# One save per tag is enough; the earlier intermediate save was
			# overwritten immediately.
			max_values.save("max_values.png")

		print(values)
		return values

	def remove_black(image):
		"""Replace the dark pixels of *image* with white, in place.

		A pixel counts as dark when the sum of its channel values is below
		``TRESHOLD`` times the channel sum of ``WHITE``.  Every channel
		returned by ``getpixel`` takes part in the sum, so an alpha channel,
		if present, counts too.

		:param image: image whose pixels are tuples; it is modified in place.
		:returns: the same image object.
		"""
		cutoff = sum(WHITE) * TRESHOLD  # loop-invariant, so compute it once
		width, height = image.size
		for x in range(width):
			for y in range(height):
				if sum(image.getpixel((x, y))) < cutoff:
					image.putpixel((x, y), WHITE)
		return image

	def get_line(image):
		"""Cut out the one-pixel-wide strip that get_scalar_value() reads.

		1. On the middle row, locate the right-most pixel whose channel values
		   sum to 255 (column 0 if there is none).
		2. Walk down that column: the first ``BLACK`` row becomes the top
		   marker and every later ``BLACK`` row moves the bottom marker.
		3. Return the column two pixels to the left of it, from the row below
		   the top marker down to (not including) the bottom marker.

		NOTE: 0 doubles as "top marker not found yet", so a ``BLACK`` pixel on
		row 0 is superseded by the next ``BLACK`` row.

		:param image: PIL image of a graph.
		:returns: the cropped strip as a new image.
		"""
		width, height = image.size
		middle_row = height // 2

		column = 0
		for x in range(width):
			if sum(image.getpixel((x, middle_row))) == 255:
				column = x

		top = bottom = 0
		for y in range(height):
			if image.getpixel((column, y)) != BLACK:
				continue
			if top == 0:
				top = y
			else:
				bottom = y

		return image.crop((column - 2, top + 1, column - 1, bottom))

	def image_to_text(image):
		"""Send *image* to the App Engine Conversion API for text extraction.

		The image is encoded in memory as PNG, matching the ``image/png`` type
		the asset is declared with, and a conversion to ``text/plain`` is
		requested.

		:param image: PIL image to convert.
		:returns: whatever ``conversion.convert()`` returns for the request.
		"""
		# Imported here so the rest of the module works outside App Engine.
		from google.appengine.api import conversion

		# The file imports StringIO directly from cStringIO, so it is called
		# as StringIO(), not StringIO.StringIO().
		output = StringIO()
		try:
			image.save(output, format="PNG")
			contents = output.getvalue()
		finally:
			output.close()

		asset = conversion.Asset("image/png", contents, "max_values.txt")
		conversion_obj = conversion.Conversion(asset, "text/plain")
		return conversion.convert(conversion_obj)

	def ocr(image):
		"""Run the ``tesseract`` command on an image file and return its text.

		Tesseract is invoked with ``-psm 7 digits`` and writes its result to
		``<output base>.txt``; a temporary file supplies a unique output base.
		Both the temporary file and the result file are removed before
		returning.

		:param image: path of the image file to recognise.
		:returns: the contents of the text file tesseract produced.
		:raises IOError: if tesseract did not produce the result file.
		:raises OSError: if the ``tesseract`` executable cannot be started.
		"""
		import os  # local import: only needed for the temp-file cleanup below

		placeholder = tempfile.NamedTemporaryFile(delete=False)
		# Only the unique name is needed; tesseract writes to "<name>.txt".
		placeholder.close()
		text_path = placeholder.name + '.txt'
		try:
			process = subprocess.Popen(
				['tesseract', image, placeholder.name, '-psm', '7', 'digits'],
				stdout=subprocess.PIPE,
				stdin=subprocess.PIPE,
				stderr=subprocess.STDOUT)
			process.communicate()

			with open(text_path, 'r') as handle:
				return handle.read()
		finally:
			for path in (placeholder.name, text_path):
				if os.path.exists(path):
					os.remove(path)

	def main():
		"""Fetch the page at ``URL``, process its graphs and print the OCR text.

		All ``<img>`` tags in the page body except the last are passed to
		get_max_values(), which writes ``max_values.png``; that file is then
		run through ocr() and the recognised text is printed.
		"""
		page = urllib2.urlopen(URL)
		try:
			html = page.read()
		finally:
			# Close the connection even when the read fails.
			page.close()
		soup = BeautifulSoup.BeautifulSoup(html)
		# The last image is skipped -- presumably it is not a graph; confirm.
		tags = soup.body.findAll('img')[:-1]
		get_max_values(tags)
		print(ocr('max_values.png'))

	# Run the scraper only when this file is executed as a script.
	if __name__ == '__main__':
		main()