Skip to content

Instantly share code, notes, and snippets.

@whoeverest
Created November 13, 2012 14:04
Show Gist options
  • Save whoeverest/4065917 to your computer and use it in GitHub Desktop.
Save whoeverest/4065917 to your computer and use it in GitHub Desktop.
import urllib2
import bs4 as BeautifulSoup
from PIL import Image
from cStringIO import StringIO
import tempfile, subprocess
# Base address of the scraped site; each <img> "src" is appended to it.
URL = "http://www.skopjefinki.ekoinformatika.mk"
# Replacement colour written by remove_black(); its channel sum is also the
# reference for the darkness cut-off.
WHITE = (255, 255, 255)
# Exact pixel value (4 channels) that get_line() compares against when it
# scans a column for marker pixels.
BLACK = (0, 0, 0, 255)
# (left, upper, right, lower) box handed to Image.crop() for every graph; its
# width and height also size the tiles of the max_values canvas.
MAX_BOX = (5, 45, 50, 65)
# Fraction of sum(WHITE) below which remove_black() treats a pixel as dark.
# (Spelling "TRESHOLD" is kept because other code refers to this name.)
TRESHOLD = 0.7
def get_scalar_value(line, name="n", max=100, min=0):
    """Read a scalar off a one-pixel-wide vertical strip of a graph.

    The strip is scanned from top to bottom (column 0 only).  The darkest
    pixel whose mean channel value is at most 200 marks the reading; on ties
    the lowest such pixel wins.  Its distance from the bottom of the strip,
    as a fraction of the strip height, is mapped onto the ``[min, max]``
    scale.  If no pixel is dark enough the reading is ``min``.

    Side effect: the strip is saved as ``"<name>_(<value>).png"`` in the
    current working directory.

    Args:
        line: image-like object providing ``load()``, ``size`` and ``save()``.
        name: label used in the saved file name.
        max: scale value that corresponds to the top of the strip.
        min: scale value that corresponds to the bottom of the strip.
            (``max``/``min`` shadow the builtins but are part of the public
            signature, so the names are kept.)

    Returns:
        The reading formatted as a string with two decimals, e.g. ``"75.00"``.

    Raises:
        ZeroDivisionError: if the strip has zero height.
    """
    pixels = line.load()
    height = line.size[1]
    darkest, rows_from_bottom = 200, 0
    for y in range(height):
        channels = pixels[0, y]
        brightness = round(sum(channels)) / float(len(channels))
        # "<=" on purpose: among equally dark pixels the last (lowest) wins.
        if brightness <= darkest:
            darkest, rows_from_bottom = brightness, height - y
    fraction = rows_from_bottom / float(height)
    # Honour the lower bound too; with the default min=0 this is exactly the
    # former ``fraction * max``.
    value = "%.2f" % (min + fraction * (max - min))
    line.save("%s_(%s).png" % (name, value))
    return value
def get_max_values(tags):
    """Download every graph image, read its value and collect the MAX_BOX crops.

    For each tag the "src" attribute is appended to URL and fetched, and the
    "alt" attribute is used as the reading's name.  The reading itself comes
    from get_scalar_value(get_line(image), name).  The MAX_BOX region of each
    graph is passed through remove_black() and pasted side by side onto one
    RGBA canvas, which is written to "max_values.png".

    Side effects: network access, reads "comma.png", writes "max_values.png"
    (plus the per-strip files written by get_scalar_value), and prints the
    dict of readings.  Returns None.

    @param tags: sequence of tag objects exposing .get("src") / .get("alt").
    """
    # One MAX_BOX-sized tile per tag, laid out horizontally.
    max_values = Image.new("RGBA",
                           (len(tags) * (MAX_BOX[2] - MAX_BOX[0]),
                            (MAX_BOX[3] - MAX_BOX[1])), None)
    values = {}
    for tag_no, tag in enumerate(tags):
        url = tag.get("src")
        name = tag.get("alt")
        image_file = urllib2.urlopen(URL + url)
        # Buffer the whole response in memory so PIL gets a seekable file object.
        image_data = StringIO(image_file.read())
        image = Image.open(image_data)
        values[name] = get_scalar_value(get_line(image), name)
        max_value = image.crop(MAX_BOX)
        # Tile number tag_no starts at x = tag_no * tile width.
        max_values.paste(remove_black(max_value),
                         (tag_no * max_value.size[0], 0))
        # NOTE(review): this file's indentation was lost; placing the next
        # three statements inside the loop is a reconstruction (they use the
        # loop variables tag_no / max_value) -- confirm against the original.
        max_values.save("max_values.png")
        # comma.png is pasted at the same offset as the tile just written.
        max_values.paste(Image.open('comma.png'), (tag_no * max_value.size[0], 0))
        max_values.save("max_values.png")
    print values
def remove_black(image):
    """Whiten every dark pixel of the image in place and return it.

    A pixel counts as dark when the sum of its channel values is below
    sum(WHITE) * TRESHOLD; such pixels are overwritten with WHITE.  All
    other pixels are left untouched.

    @param image: the image to clean; it is modified, not copied.
    @return: the same image object that was passed in.
    """
    cutoff = sum(WHITE) * TRESHOLD
    columns, rows = image.size
    # Column-major walk: every row of column 0, then column 1, and so on.
    coordinates = ((col, row) for col in range(columns) for row in range(rows))
    for point in coordinates:
        if sum(image.getpixel(point)) < cutoff:
            image.putpixel(point, WHITE)
    return image
def get_line(image):
    """Crop a one-pixel-wide vertical strip next to the right-most marker column.

    Step 1: along the middle row, find the largest x whose pixel channels sum
    to exactly 255 (for example opaque black, (0, 0, 0, 255)); 0 if none.
    Step 2: walk down that column and record the first and the last row whose
    pixel equals BLACK.
    Step 3: return the crop (x - 2, first + 1, x - 1, last), i.e. the single
    column two pixels to the left, between the two rows.

    @param image: PIL-style image providing size, getpixel() and crop().
    @return: whatever image.crop() returns for that box.
    """
    width, height = image.size
    # Floor division keeps the row index an integer on Python 3 (and under
    # "from __future__ import division"); "/" would produce a float there,
    # which is not a valid pixel coordinate.
    middle_row = height // 2

    max_x = 0
    for x in range(width):
        if sum(image.getpixel((x, middle_row))) == 255 and x > max_x:
            max_x = x

    start, end = 0, 0
    for y in range(height):
        if image.getpixel((max_x, y)) != BLACK:
            continue
        # NOTE(review): 0 doubles as "not found yet", so a BLACK pixel in
        # row 0 is not latched as the start; the next BLACK row is used
        # instead.  Kept as-is because downstream readings depend on it.
        if start == 0:
            start = y
        else:
            end = y
    return image.crop((max_x - 2, start + 1, max_x - 1, end))
def image_to_text(image):
    """Send the image to the App Engine conversion service and return its reply.

    The image is serialised in memory, wrapped in a conversion.Asset declared
    as "image/png" and submitted for conversion to "text/plain".

    @param image: PIL-style image providing save(fileobj, format=...).
    @return: the object returned by conversion.convert(); its exact shape is
             defined by the google.appengine.api.conversion module.
    """
    # Imported lazily so the rest of the script works without the App Engine SDK.
    from google.appengine.api import conversion

    # StringIO is already the factory imported from cStringIO at file level,
    # so it is called directly; "StringIO.StringIO" raises AttributeError.
    output = StringIO()
    try:
        # Encode as PNG so the payload really is what the "image/png" MIME
        # type below announces (it used to be written as GIF).
        image.save(output, format="PNG")
        contents = output.getvalue()
    finally:
        output.close()
    asset = conversion.Asset("image/png", contents, "max_values.txt")
    conversion_obj = conversion.Conversion(asset, "text/plain")
    return conversion.convert(conversion_obj)
def ocr(image):
    """Run the tesseract command-line tool on an image file and return its text.

    tesseract is given a temporary output base name and writes its result to
    "<base>.txt"; that file is read back and returned.  Both the temporary
    file and the ".txt" output are removed afterwards.

    @param image: path of the image file, passed to tesseract as-is.
    @return: the contents of tesseract's output file as a string.
    @raise IOError: if tesseract produced no output file (OSError if the
                    tesseract executable itself cannot be started).
    """
    import os  # local import: only this function needs it

    # Only the unique name is needed; close the handle right away so the
    # descriptor is not leaked (delete=False keeps the name reserved on disk).
    temp_file = tempfile.NamedTemporaryFile(delete=False)
    temp_file.close()
    output_path = temp_file.name + '.txt'
    try:
        # "-psm 7" selects a page segmentation mode and "digits" a config
        # file, per the tesseract CLI; newer releases may spell the flag
        # "--psm" -- NOTE(review): confirm against the installed version.
        process = subprocess.Popen(
            ['tesseract', image, temp_file.name, '-psm', '7', 'digits'],
            stdout=subprocess.PIPE,
            stdin=subprocess.PIPE,
            stderr=subprocess.STDOUT)
        # Wait for completion; tesseract's console output is discarded.
        process.communicate()
        with open(output_path, 'r') as handle:
            return handle.read()
    finally:
        for path in (temp_file.name, output_path):
            try:
                os.remove(path)
            except OSError:
                # Best-effort cleanup: the file may never have been created.
                pass
def main():
    """Fetch the page at URL, process its graph images and print the OCR result.

    Downloads the HTML, collects the <img> tags of the body, hands them to
    get_max_values() (which writes "max_values.png") and prints the text that
    ocr() extracts from that file.
    """
    page = urllib2.urlopen(URL)
    try:
        html = page.read()
    finally:
        # Release the connection even when the read fails part-way.
        page.close()
    soup = BeautifulSoup.BeautifulSoup(html)
    # The last <img> of the body is deliberately left out.
    # NOTE(review): presumably it is not a graph -- confirm against the page.
    tags = soup.body.findAll('img')[:-1]
    get_max_values(tags)
    # Parenthesised single-argument form prints the same on Python 2 and 3.
    print(ocr('max_values.png'))
# Run the scrape-and-OCR pipeline only when executed as a script, not on import.
if __name__ == '__main__':
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment