Created
June 13, 2011 12:33
-
-
Save idlecool/1022698 to your computer and use it in GitHub Desktop.
Old s3ocr.py which has been moved to s3pdf.py - SahanaEden
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# -*- coding: utf-8 -*- | |
""" Sahana Optical Character Recognision Utility (s3ocr) | |
@author: Suryajith Chillara <suryajith1987[at]gmail.com> | |
@author: Shiv Deepak <idlecool[at]gmail.com> | |
@copyright: 2009-2011 (c) Sahana Software Foundation | |
@license: MIT | |
Permission is hereby granted, free of charge, to any person | |
obtaining a copy of this software and associated documentation | |
files (the "Software"), to deal in the Software without | |
restriction, including without limitation the rights to use, | |
copy, modify, merge, publish, distribute, sublicense, and/or sell | |
copies of the Software, and to permit persons to whom the | |
Software is furnished to do so, subject to the following | |
conditions: | |
The above copyright notice and this permission notice shall be | |
included in all copies or substantial portions of the Software. | |
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |
EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES | |
OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |
NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT | |
HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, | |
WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING | |
FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR | |
OTHER DEALINGS IN THE SOFTWARE. | |
""" | |
__all__ = ["S3OCR"] | |
#========================== import section ==================================== | |
# Generic stuff | |
import os | |
import sys | |
import re | |
import uuid | |
import Image | |
import ImageOps | |
import ImageStat | |
import math | |
from StringIO import StringIO | |
from htmlentitydefs import name2codepoint | |
from lxml import etree | |
# Importing reportlab stuff | |
try: | |
from reportlab.pdfgen.canvas import Canvas | |
from reportlab.lib.pagesizes import A4 | |
from reportlab.graphics.barcode import code128 | |
# for adding more fonts | |
from reportlab.pdfbase import pdfmetrics | |
from reportlab.pdfbase.ttfonts import TTFont | |
import reportlab | |
reportlab.rl_config.warnOnMissingFontGlyphs = 0 | |
except(ImportError): | |
print >> sys.stderr, "S3 Debug: WARNING: S3OCR: reportlab has not been installed." | |
from gluon.storage import Storage | |
from s3rest import S3Method | |
from s3cfg import S3Config | |
#========================================================================== | |
#================================= OCR API ================================ | |
#========================================================================== | |
class S3OCR(S3Method): | |
""" | |
Generate XForms and PDFs the s3 way | |
""" | |
def apply_method(self,
                 r,
                 **attr):
    """
        S3Method's abstract method: serve an OCR request.

        Reads the optional "s3ocr_config" dict from attr, prepares the
        per-request settings on self and then dispatches on the HTTP
        method and representation of the request:

            GET  xml                     -> s3ocr XML (string)
            GET  pdf, _operation=putpdf  -> output of s3ocr_parser()
            GET  pdf                     -> generated PDF form
            anything else                -> r.error(501, ...)

        @param r: the request object
        @param attr: controller attributes, may contain "s3ocr_config"
    """
    xml = self.manager.xml
    self.r = r

    # s3ocr_config - dict which stores the ocr configuration
    # settings for a resource
    ocr_settings = attr.get("s3ocr_config", {})

    # localised names of components
    self.rheader_tabs = ocr_settings.get("tabs", [])
    # custom pdf title (if any)
    self.pdftitle = ocr_settings.get("pdftitle", None)
    # components which have to be excluded
    self.exclude_component_list = ocr_settings.get("exclude_components", [])
    # individual field specific properties, keyed by
    # "%s_%s__%s" % (prefix, resourcename, fieldname)
    self.custom_field_properties = ocr_settings.get("field_properties", {})

    # individual fieldtype specific properties, keyed by fieldtype
    s3config = S3Config(globals(), self.T)
    self.custom_fieldtype_properties = \
        s3config.get_s3ocr_fieldtype_properties()

    # field type convention mapping from resource to pdf forms
    self.generic_ocr_field_type = {
        "string": "string",
        "text": "textbox",
        "boolean" : "boolean",
        "double": "double",
        "date": "date",
        "datetime": "datetime",
        "integer": "integer",
        "list:integer": "multiselect",
        "list:string": "multiselect",
        "list:double": "multiselect",
        "list:text": "multiselect",
        }

    # text for localisation
    T = self.T
    self.l10n = {
        "datetime_hint": {
            "date": T("fill in order: day(2) month(2) year(4)"),
            "datetime": T("fill in order: hour(2) min(2) day(2) month(2) year(4)"),
            },
        "ocr_inst": {
            "inst1": T("1. Fill the necessary fields in BLOCK CAPITAL letters."),
            "inst2": T("2. Always use one box per letter and leave one box space to separate words."),
            "inst3": T("3. Fill in the circles completely."),
            },
        "boolean": {
            "yes": T("Yes"),
            "no": T("No"),
            },
        "select": {
            "multiselect": T("Select one or more option(s) that apply"),
            "singleselect": T("Select any one option that apply"),
            },
        }

    # debug mode is requested with ?_debug=1
    self.debug = (r.vars.get("_debug", False) == "1")
    content_disposition = "inline" if self.debug else "attachment"

    # serve the request
    representation = r.representation
    headers = self.response.headers

    if r.http == "GET":
        if representation == "xml":
            output = self.s3ocr_etree()
            self.response.view = "xml.html"
            headers["Content-Type"] = "application/xml"
            return xml.tostring(output, pretty_print=True)

        if representation == "pdf":
            if r.vars.get("_operation", False) == "putpdf":
                output = self.s3ocr_parser()
                self.response.view = None
                headers["Content-Type"] = "text/plain"
                return output

            # book a revision, render the form and store its layout
            form_uuid = uuid.uuid1()
            form_revision = self.__book_revision(form_uuid)
            output, layout_etree = self.pdf_manager(form_uuid, form_revision)
            self.__update_dbmeta(layout_xml=etree.tostring(layout_etree),
                                 form_uuid=form_uuid,
                                 revision=form_revision)
            self.response.view = None
            headers["Content-Type"] = "application/pdf"
            headers["Content-disposition"] = \
                "%s; filename=\"%s.pdf\"" % (content_disposition,
                                             self.tablename)
            return output

        r.error(501, self.manager.ERROR.BAD_FORMAT)

    elif r.http in ("POST","PUT"):
        if representation in ("xml", "pdf"):
            r.error(501, self.manager.ERROR.NOT_IMPLEMENTED)
        else:
            r.error(501, self.manager.ERROR.BAD_FORMAT)

    else:
        r.error(501, self.manager.ERROR.BAD_METHOD)
def s3ocr_etree(self):
    """
        Optimise & modify the s3xml etree to produce the s3ocr etree.

        The structure tree of the resource is regrouped into one
        <resource> element per (main or component) resource below a
        common <s3ocr> root. Field types are mapped to their OCR
        equivalents, "lines"/"boxes"/"default" attributes are added,
        custom settings are applied and fields which are neither
        readable nor writable are dropped.

        @return: the <s3ocr> root element
    """
    s3xml_etree = self.resource.struct(options=True,
                                       references=True,
                                       stylesheet=None,
                                       as_json=False,
                                       as_tree=True)

    # xml attribute names
    ITEXT = "label"
    TYPE = "type"
    HASOPTIONS = "has_options"
    LINES = "lines"
    BOXES = "boxes"

    # Converting s3xml to s3ocr_xml (nicer to traverse)
    s3xml_root = s3xml_etree.getroot()
    resource_element = s3xml_root.getchildren()[0]
    s3ocr_root = etree.Element("s3ocr")

    if self.r.component: # if it is a component
        component_sequence, components_l10n_dict = \
            self.__rheader_tabs_sequence(self.r.tablename)
        resource_element.set(ITEXT,
                             components_l10n_dict.get(None,
                                                      self.resource.tablename))
        s3ocr_root.append(resource_element)

    else: # if it is main resource
        componentetrees = []
        # mres is main resource etree
        mres = etree.Element("resource")
        for attr in resource_element.attrib.keys():
            mres.set(attr, resource_element.attrib.get(attr))
        # Iterate over a snapshot: append() moves the child out of
        # resource_element, which would disturb a live iteration and
        # make it skip elements.
        for field_element in list(resource_element):
            if field_element.tag == "field": # main resource fields
                mres.append(field_element)
            elif field_element.tag == "resource": # component resource
                componentetrees.append(field_element)

        # Serialisation of Component List and l10n
        component_sequence, components_l10n_dict = \
            self.__rheader_tabs_sequence(self.r.tablename)
        mres.set(ITEXT, components_l10n_dict.get(None,
                                                 self.resource.tablename))

        if component_sequence:
            serialised_component_etrees = []
            for eachcomponent in component_sequence:
                component_table = "%s_%s" % (self.prefix, eachcomponent)
                for eachtree in componentetrees:
                    if eachtree.attrib.get("name", None) == component_table:
                        # l10n strings are added and sequencing is done here
                        eachtree.set(ITEXT,
                                     components_l10n_dict.get(eachcomponent,
                                                              component_table))
                        serialised_component_etrees.append(eachtree)
        else:
            serialised_component_etrees = componentetrees

        # create s3ocr tree
        s3ocr_root.append(mres)
        for res in serialised_component_etrees:
            s3ocr_root.append(res)

    FIELD_TYPE_LINES = { # mapping types with number of lines
        "string": 2,
        "textbox": 4,
        "integer": 1,
        "double": 1,
        "date": 1,
        "datetime": 1,
        }
    FIELD_TYPE_BOXES = { # mapping type with numboxes
        "integer": 9,
        "double": 16,
        }
    ocr_field_types = self.generic_ocr_field_type.values()

    # Snapshots (list(...)) are iterated wherever elements may be
    # removed from the parent inside the loop.
    for eachresource in list(s3ocr_root):
        resourcetablename = eachresource.attrib.get("name")

        if resourcetablename in self.exclude_component_list:
            # excluded components are removed
            s3ocr_root.remove(eachresource)
            continue

        for eachfield in list(eachresource):
            fieldname = eachfield.attrib.get("name")
            fieldtype = eachfield.attrib.get(TYPE)

            # loading ocr specific fieldtypes
            ocrfieldtype = self.generic_ocr_field_type.get(fieldtype,
                                                           None)
            if ocrfieldtype is not None:
                eachfield.set(TYPE, ocrfieldtype)
                # refresh fieldtype after update
                fieldtype = eachfield.attrib.get(TYPE)

            # set num boxes and lines
            if eachfield.attrib.get(HASOPTIONS) == "False":
                eachfield.set(LINES,
                              str(FIELD_TYPE_LINES.get(fieldtype, 1)))
                if fieldtype in FIELD_TYPE_BOXES:
                    eachfield.set(BOXES,
                                  str(FIELD_TYPE_BOXES.get(fieldtype)))

            # if field is readable but not writable set default value
            if eachfield.attrib.get("readable", "False") == "True" and \
               eachfield.attrib.get("writable", "False") == "False":
                # NOTE(review): the default is always looked up in the
                # table of the request's resource, also for fields of
                # component resources -- confirm this is intended.
                try:
                    fielddefault = self.r.resource.table[fieldname].default
                except KeyError:
                    fielddefault = "None"
                eachfield.set("default", str(fielddefault))

            # load custom fieldtype specific settings
            if fieldtype not in ocr_field_types \
               and fieldtype in self.custom_fieldtype_properties:
                self.__update_custom_fieldtype_settings(eachfield)
                # refresh fieldtype after update
                fieldtype = eachfield.attrib.get(TYPE)

            # unknown field types are rendered as plain strings
            if fieldtype not in ocr_field_types:
                eachfield.set(TYPE, "string")
                eachfield.set(HASOPTIONS, "False")
                eachfield.set(LINES, "2")
                # refresh fieldtype after update
                fieldtype = eachfield.attrib.get(TYPE)

            # loading custom field specific settings
            self.__update_custom_field_settings(eachfield,
                                                resourcetablename,
                                                fieldname)

            # in ocr boolean fields should be shown as options
            if fieldtype == "boolean":
                eachfield.set(HASOPTIONS, "True")

            # fields removed which need not be displayed
            if eachfield.attrib.get("readable", "False") == "False" and \
               eachfield.attrib.get("writable", "False") == "False":
                eachresource.remove(eachfield)
                continue

            # drop options without any text
            if eachfield.attrib.get(HASOPTIONS, "False") == "True" and \
               eachfield.attrib.get(TYPE) != "boolean":
                option_holders = eachfield.getchildren()
                if option_holders:
                    s3ocrselect = option_holders[0]
                    for eachoption in list(s3ocrselect):
                        if eachoption.text == "" or eachoption.text is None:
                            s3ocrselect.remove(eachoption)

    return s3ocr_root
def pdf_manager(self, form_uuid, form_revision):
    """
        Produces OCR compatible PDF forms.

        Walks the s3ocr etree and prints every resource and field onto
        a Form. For every circle / row of boxes drawn, the location
        returned by the Form is recorded in a parallel <s3ocrlayout>
        etree.

        @param form_uuid: uuid of the form, passed on to Form
        @param form_revision: revision of the form, passed on to Form

        @return: tuple (result of form.save(), layout etree)
    """

    def record_box(parent, tag, loc_info, size_key, text):
        """
            Append a layout element describing one drawn item

            @param parent: the <field> element of the layout etree
            @param tag: "optionbox" or "textbox"
            @param loc_info: location dict returned by the Form
            @param size_key: "radius" (optionbox) or "side" (textbox)
            @param text: text content of the new element
        """
        element = etree.SubElement(parent, tag)
        for key in ("x", "y", size_key, "boxes", "page"):
            element.set(key, str(loc_info[key]))
        element.text = text
        return element

    s3ocr_root = self.s3ocr_etree() # get element s3xml
    s3ocr_layout_etree = etree.Element("s3ocrlayout")

    # define font size
    titlefontsize = 18
    sectionfontsize = 15
    regularfontsize = 13
    hintfontsize = 10

    # etree labels
    ITEXT = "label"
    HINT = "comment"
    TYPE = "type"
    HASOPTIONS = "has_options"

    # l10n
    l10n = self.l10n

    # get pdf title
    if self.pdftitle is None or self.pdftitle == "":
        try:
            pdftitle = self.manager.s3.crud_strings[\
                self.tablename].subtitle_list.decode("utf-8")
        except Exception:
            # no usable crud string: fall back to the table name
            pdftitle = self.resource.tablename
    else:
        pdftitle = self.pdftitle

    # prepare pdf
    form = Form(form_uuid=form_uuid,
                form_revision=form_revision,
                form_resourcename="%s_%s" % (self.prefix,
                                             self.resource.name))
    form.decorate()

    # set header
    form.canvas.setTitle(pdftitle) # set pdf meta title
    form.print_text([pdftitle,],
                    fontsize=titlefontsize,
                    style="center") # set pdf header title
    instructions = l10n.get("ocr_inst")
    form.print_text(
        [unicode(instructions.get(key).decode("utf-8"))
         for key in ("inst1", "inst2", "inst3")],
        fontsize=regularfontsize,
        gray=0)
    form.linespace(3)

    # printing the etree
    for eachresource in s3ocr_root:
        # create resource element of ocr layout xml
        resource_layout = etree.SubElement(s3ocr_layout_etree,
                                           "resource",
                                           name=eachresource.attrib.get("name"))

        # section header
        form.draw_line()
        form.print_text([eachresource.attrib.get(ITEXT,
                                                 eachresource.attrib.get("name"))],
                        fontsize=sectionfontsize)
        form.draw_line(nextline=1)
        form.linespace(12) # line spacing between each field

        for eachfield in eachresource.iterchildren():
            # create field element of ocr layout xml
            field_layout = etree.SubElement(resource_layout,
                                            "field",
                                            name=eachfield.attrib.get("name"),
                                            type=eachfield.attrib.get("type"))
            fieldtype = eachfield.attrib.get(TYPE)

            # field label, followed by the hint (if any)
            fieldlabel = eachfield.attrib.get(ITEXT)
            spacing = " " * 5
            fieldhint = self.__trim(eachfield.attrib.get(HINT))
            if fieldhint != "" and fieldhint is not None:
                form.print_text(["%s%s( %s )" % (fieldlabel,
                                                 spacing,
                                                 fieldhint)],
                                fontsize=regularfontsize)
            else:
                form.print_text([fieldlabel],
                                fontsize=regularfontsize)

            if eachfield.attrib.get("readable", "False") == "True" and \
               eachfield.attrib.get("writable", "False") == "False":
                # read-only field: just print its default value
                form.print_text(
                    [eachfield.attrib.get("default", "No default Value")],
                    seek=10,
                    )

            elif eachfield.attrib.get(HASOPTIONS) == "True":
                # the field has to be shown with options
                if fieldtype == "boolean":
                    form.nextline()
                    form.resetx()
                    bool_text = l10n.get("boolean")
                    # (l10n key, seek of the label); the key is also
                    # the value stored in the layout xml
                    for answer, labelseek in (("yes", 3), ("no", 10)):
                        form.print_text(
                            [bool_text.get(answer).decode("utf-8")],
                            continuetext=1,
                            seek=labelseek,
                            )
                        loc_info = form.draw_circle(
                            boxes=1,
                            continuetext=1,
                            gray=0.9,
                            seek=10,
                            fontsize=12,
                            )
                        record_box(field_layout, "optionbox",
                                   loc_info, "radius", answer)
                else:
                    if fieldtype == "multiselect":
                        option_hint = l10n.get("select").get("multiselect")
                    else:
                        option_hint = l10n.get("select").get("singleselect")
                    form.print_text(
                        [option_hint.decode("utf-8")],
                        fontsize=hintfontsize,
                        gray=0.4,
                        seek=3,
                        )
                    s3ocrselect = eachfield.getchildren()[0]
                    form.nextline(regularfontsize)
                    form.resetx() # move cursor to the front
                    optionseek = 10
                    # widen the side margin while the options are printed
                    formmargin = form.marginsides
                    form.marginsides = optionseek + formmargin
                    for eachoption in s3ocrselect.iterchildren():
                        form.print_text(
                            [eachoption.text],
                            continuetext=1,
                            fontsize=regularfontsize,
                            seek=10,
                            )
                        loc_info = form.draw_circle(
                            boxes=1,
                            continuetext=1,
                            gray=0.9,
                            seek=10,
                            fontsize=12,
                            )
                        record_box(field_layout, "optionbox",
                                   loc_info, "radius",
                                   eachoption.attrib.get("value"))
                    # restoring original margin
                    form.marginsides = formmargin

            else:
                # it is a text field
                if fieldtype in ("string", "textbox"):
                    form.linespace(3)
                    num_lines = int(eachfield.attrib.get("lines", 1))
                    for eachline in xrange(num_lines):
                        loc_info = form.draw_check_boxes(
                            completeline=1,
                            gray=0.9,
                            seek=3,
                            )
                        record_box(field_layout, "textbox",
                                   loc_info, "side", " ")

                elif fieldtype in ("integer", "double"):
                    num_boxes = int(eachfield.attrib.get("boxes", 9))
                    form.linespace(3)
                    loc_info = form.draw_check_boxes(
                        boxes=num_boxes,
                        gray=0.9,
                        seek=3,
                        )
                    record_box(field_layout, "textbox",
                               loc_info, "side", " ")

                elif fieldtype in ("date", "datetime"):
                    # print hint
                    hinttext = \
                        l10n.get("datetime_hint").get(fieldtype).decode("utf-8")
                    form.print_text(
                        [hinttext],
                        fontsize=hintfontsize,
                        gray=0.4,
                        seek=3,
                        )
                    form.linespace(8)

                    # (layout text, arguments of draw_check_boxes)
                    box_groups = []
                    if fieldtype == "datetime":
                        # hour and minute come first, so the day boxes
                        # continue on the same line
                        box_groups.append(("HH", dict(boxes=2, seek=3)))
                        box_groups.append(("MM", dict(boxes=2,
                                                      continuetext=1,
                                                      seek=4)))
                        datetime_continuetext = 1
                        datetime_seek = 6
                    else:
                        datetime_continuetext = 0
                        datetime_seek = 3
                    box_groups.append(("DD", dict(boxes=2,
                                                  continuetext=datetime_continuetext,
                                                  seek=datetime_seek)))
                    box_groups.append(("MO", dict(boxes=2,
                                                  continuetext=1,
                                                  seek=4)))
                    box_groups.append(("YYYY", dict(boxes=4,
                                                    continuetext=1,
                                                    seek=4)))
                    for layout_text, box_args in box_groups:
                        loc_info = form.draw_check_boxes(gray=0.9,
                                                         **box_args)
                        record_box(field_layout, "textbox",
                                   loc_info, "side", layout_text)

                else:
                    # Log before reporting the error, so the message is
                    # written even if r.error() does not return.
                    sys.stderr.write("%s :invalid field type: %s\n" % \
                                     (eachfield.attrib.get("name"),
                                      fieldtype))
                    # NOTE(review): was self.manager.PARSE_ERROR; all
                    # other error codes in this class are read from
                    # self.manager.ERROR -- confirm PARSE_ERROR exists.
                    self.r.error(501, self.manager.ERROR.PARSE_ERROR)

    return form.save(), s3ocr_layout_etree
def __update_custom_fieldtype_settings(self, | |
eachfield, #field etree | |
): | |
""" | |
Update custom fieldtype specific settings into the etree | |
""" | |
# xml attributes | |
TYPE = "type" | |
READABLE = "readable" | |
WRITABLE = "writable" | |
LABEL = "label" | |
HINT = "comment" | |
DEFAULT = "default" | |
LINES = "lines" | |
BOXES = "boxes" | |
HASOPTIONS = "has_options" | |
fieldtype = eachfield.attrib.get(TYPE) | |
field_property = self.custom_fieldtype_properties.get(fieldtype, {}) | |
cust_fieldtype = fieldtype_property.get("fieldtype", None) | |
cust_readable = fieldtype_property.get("readable", None) | |
cust_writable = fieldtype_property.get("writable", None) | |
cust_label = fieldtype_property.get("label", None) | |
cust_hint = fieldtype_property.get("hint", None) | |
cust_default = fieldtype_property.get("default", None) | |
cust_lines = fieldtype_property.get("lines", None) | |
cust_boxes = fieldtype_property.get("boxes", None) | |
cust_has_options = fieldtype_property.get("has_options", None) | |
cust_options = fieldtype_property.get("options", None) | |
if cust_fieldtype: | |
if cust_fieldtype != None: | |
eachfield.set(TYPE, cust_fieldtype) | |
if cust_readable != None: | |
eachfield.set(READABLE, cust_readable) | |
if cust_writable != None: | |
eachfield.set(WRITABLE, cust_writable) | |
if cust_label != None: | |
eachfield.set(LABEL, cust_label) | |
if cust_hint != None: | |
eachfield.set(HINT, cust_hint) | |
if cust_default != None: | |
eachfield.set(DEFAULT, cust_default) | |
if cust_lines != None: | |
eachfield.set(LINES, cust_lines) | |
if cust_boxes != None: | |
eachfield.set(BOXES, cust_boxes) | |
if cust_has_options != None: | |
eachfield.set(HASOPTIONS, cust_has_options) | |
if cust_options != None: | |
opt_available = eachfield.getchildren() | |
if len(opt_available) == 0: | |
eachfield.append(cust_options) | |
elif len(opt_available) == 1: | |
eachfield.remove(opt_available[0]) | |
eachfield.append(cust_options) | |
def __update_custom_field_settings(self, | |
eachfield, #field etree | |
resourcetablename, | |
fieldname | |
): | |
""" | |
Update custom field specific settings into the etree | |
""" | |
# xml attributes | |
TYPE = "type" | |
READABLE = "readable" | |
WRITABLE = "writable" | |
LABEL = "label" | |
HINT = "comment" | |
DEFAULT = "default" | |
LINES = "lines" | |
BOXES = "boxes" | |
HASOPTIONS = "has_options" | |
unikey = "%s__%s" % (resourcetablename, fieldname) | |
field_property = self.custom_field_properties.get(unikey, {}) | |
cust_fieldtype = field_property.get("fieldtype", None) | |
cust_readable = field_property.get("readable", None) | |
cust_writable = field_property.get("writable", None) | |
cust_label = field_property.get("label", None) | |
cust_hint = field_property.get("hint", None) | |
cust_default = field_property.get("default", None) | |
cust_lines = field_property.get("lines", None) | |
cust_boxes = field_property.get("boxes", None) | |
cust_has_options = field_property.get("has_options", None) | |
cust_options = field_property.get("options", None) | |
if cust_fieldtype: | |
if cust_fieldtype != None: | |
eachfield.set(TYPE, cust_fieldtype) | |
if cust_readable != None: | |
eachfield.set(READABLE, cust_readable) | |
if cust_writable != None: | |
eachfield.set(WRITABLE, cust_writable) | |
if cust_label != None: | |
eachfield.set(LABEL, cust_label) | |
if cust_hint != None: | |
eachfield.set(HINT, cust_hint) | |
if cust_default != None: | |
eachfield.set(DEFAULT, cust_default) | |
if cust_lines != None: | |
eachfield.set(LINES, cust_lines) | |
if cust_boxes != None: | |
eachfield.set(BOXES, cust_boxes) | |
if cust_has_options != None: | |
eachfield.set(HASOPTIONS, cust_has_options) | |
if cust_options != None: | |
opt_available = eachfield.getchildren() | |
if len(opt_available) == 0: | |
eachfield.append(cust_options) | |
elif len(opt_available) == 1: | |
eachfield.remove(opt_available[0]) | |
eachfield.append(cust_options) | |
def __rheader_tabs_sequence(self, resourcename): | |
""" | |
Sequence of components is returned as a list | |
""" | |
component_seq = [] | |
component_l10n_dict = {} | |
rtabs = self.rheader_tabs | |
for eachel in rtabs: | |
if eachel[1] != None: | |
component_seq.append(eachel[1]) | |
component_l10n_dict[eachel[1]] = eachel[0].decode("utf-8") | |
return component_seq, component_l10n_dict | |
def __trim(self, text): | |
""" | |
Helper to trim off any enclosing paranthesis | |
""" | |
if isinstance(text, str) and \ | |
text[0] == "(" and \ | |
text[-1] == ")": | |
text = text[1:-1] | |
return text | |
def __update_dbmeta(self, **kwargs):
    """
        Store the PDF layout information into the database/disk.

        The layout xml is stored through the "layout_file" field of
        the ocr_meta row which has the given form_uuid.

        @keyword form_uuid: uuid of the form
        @keyword layout_xml: serialised layout etree
        @keyword revision: accepted, but not used here

        @raise IndexError: if ocr_meta has no row for form_uuid
    """
    form_uuid = kwargs.get("form_uuid", None)
    layout_xml = kwargs.get("layout_xml", None)

    layout_file_stream = StringIO(layout_xml)
    layout_file_name = "%s_xml" % form_uuid

    db = self.db
    table = db["ocr_meta"]

    row = db(table["form_uuid"] == form_uuid).select().first()
    if row is None:
        # same exception type as the former rows[0], but it names the cause
        raise IndexError("ocr_meta has no row for form_uuid %s" % form_uuid)
    row.update_record(layout_file=table["layout_file"].store(
                                        layout_file_stream,
                                        layout_file_name))
def __book_revision(self, form_uuid):
    """
        Books a revision number for current operation in ocr_meta

        The new revision is one above the highest revision stored for
        this resource, or 0 if there is none yet; a row with the
        form_uuid, the resource name and that revision is inserted.

        @param form_uuid: uuid of the form
        @return: the booked revision number
    """
    db = self.db
    ocr_meta = db["ocr_meta"]
    resource_name = "%s_%s" % (self.prefix, self.resource.name)

    # determining revision
    highest = ocr_meta["revision"].max()
    latest = db(ocr_meta["resource_name"] == resource_name).select(highest).first()
    if latest[highest] == None:
        revision = 0
    else:
        revision = latest[highest] + 1

    ocr_meta.insert(form_uuid=form_uuid,
                    resource_name=resource_name,
                    revision=revision)
    return revision
def s3ocr_parser(self, **kwargs):
    """
        performs OCR on a given set of pages

        Every page image is converted to binary, rotated according to
        its markers and scaled; then the stored layout xml of the form
        is walked and the region of every optionbox / textbox is
        cropped out of its page and handed to __ocrIt(). The results
        are only printed.

        NOTE(review): the "debug only" section below overrides the
        keyword arguments with a hard-coded fixture and loads the page
        images from a hard-coded directory; it is kept unchanged.

        @keyword pages: overridden by the debug section
        @keyword form_uuid: overridden by the debug section
        @keyword revision: overridden by the debug section
        @keyword resourcename: overridden by the debug section

        @return: the layout etree as pretty printed xml string
    """
    pages = kwargs.get("pages", None)
    raw_images = {}
    images = {}
    form_uuid = kwargs.get("form_uuid", None)
    revision = kwargs.get("revision", None)
    resourcename = kwargs.get("resourcename", None)

    # <debug only>
    for i in xrange(0, 11):
        print("page %s" % i)
        raw_images[i+1] = Image.open(os.path.join("/home/idlecool/",
                                                  "pr_person-%s.png" % i))
    pages = 1
    form_uuid = "4ab7c932-8fdb-11e0-bf41-533e0c24f0a"
    revision = 14
    resourcename = "pr_person"
    # </debug only>

    # transform image
    for each_img_index in raw_images.keys():
        print(each_img_index)
        page = {}
        page["image"] = self.__convertImage2binary(raw_images[each_img_index])
        page["markers"] = self.__getMarkers(page["image"])
        page["orientation"] = self.__getOrientation(page["markers"])
        if page["orientation"] != 0.0:
            # straighten the page and locate the markers again
            page["image"] = page["image"].rotate(page["orientation"])
            page["markers"] = self.__getMarkers(page["image"])
            page["orientation"] = self.__getOrientation(page["markers"])
        page["scalefactor"] = self.__scaleFactor(page["markers"])
        images[each_img_index] = page

    # get layout file, convert it to etree
    db = self.db
    ocr_meta = db["ocr_meta"]
    # DAL conditions have to be combined with "&": Python's "and"
    # evaluates to just one of its operands, so only the revision
    # was filtered on.
    query = (ocr_meta["form_uuid"] == form_uuid) & \
            (ocr_meta["resource_name"] == resourcename) & \
            (ocr_meta["revision"] == revision)
    layout_row = db(query).select(ocr_meta["layout_file"]).first()
    if layout_row is None:
        self.r.error(404, self.T("OCR layout of the form not found"))

    layout_file = open(os.path.join("./applications/",
                                    self.request.application,
                                    'uploads/ocr_meta/',
                                    layout_row["layout_file"]),
                       'rb')
    try:
        layout_xml = layout_file.read()
    finally:
        layout_file.close()
    layout_etree = etree.fromstring(layout_xml)

    for eachresource in layout_etree:
        for eachfield in eachresource:
            components = eachfield.getchildren()
            if len(components) == 0:
                continue
            component_type = components[0].tag

            if component_type == "optionbox":
                linenum = 0
                for eachcomponent in components:
                    comp_x = float(eachcomponent.attrib.get("x"))
                    comp_y = float(eachcomponent.attrib.get("y"))
                    comp_radius = float(eachcomponent.attrib.get("radius"))
                    comp_page = int(eachcomponent.attrib.get("page"))
                    comp_value = str(eachcomponent.text)
                    try:
                        page_origin = images[comp_page]["markers"]
                    except KeyError:
                        self.r.error(501,
                                     self.T("insufficient number of pages provided"))
                    print(eachcomponent.tag)
                    scale_x = images[comp_page]["scalefactor"]["x"]
                    scale_y = images[comp_page]["scalefactor"]["y"]
                    origin_x = page_origin[0][0]
                    origin_y = page_origin[0][1]
                    # square of 2*radius around the centre of the circle
                    crop_box = (
                        int(origin_x + (comp_x * scale_x) - comp_radius * scale_x),
                        int(origin_y + (comp_y * scale_y) - comp_radius * scale_y),
                        int(origin_x + (comp_x * scale_x) + comp_radius * scale_x),
                        int(origin_y + (comp_y * scale_y) + comp_radius * scale_y),
                        )
                    print(eachcomponent.tag)
                    cropped_image = images[comp_page]["image"].crop(crop_box)
                    result = self.__ocrIt(cropped_image,
                                          form_uuid,
                                          resourcename,
                                          linenum,
                                          content_type="optionbox")
                    if result:
                        print("TRUE: %s" % comp_value)
                    linenum += 1

            elif component_type == "textbox":
                linenum = 1
                for eachcomponent in components:
                    comp_x = float(eachcomponent.attrib.get("x"))
                    comp_y = float(eachcomponent.attrib.get("y"))
                    comp_boxes = int(eachcomponent.attrib.get("boxes"))
                    comp_side = float(eachcomponent.attrib.get("side"))
                    comp_page = int(eachcomponent.attrib.get("page"))
                    try:
                        page_origin = images[comp_page]["markers"]
                    except KeyError:
                        self.r.error(501,
                                     self.T("insufficient number of pages provided"))
                    print(eachcomponent.tag)
                    scale_x = images[comp_page]["scalefactor"]["x"]
                    scale_y = images[comp_page]["scalefactor"]["y"]
                    origin_x = page_origin[0][0]
                    origin_y = page_origin[0][1]
                    # one row of comp_boxes boxes, starting at (x, y)
                    crop_box = (
                        int(origin_x + (comp_x * scale_x)),
                        int(origin_y + (comp_y * scale_y)),
                        int(origin_x + (comp_x * scale_x) + \
                            comp_side * comp_boxes * scale_x),
                        int(origin_y + (comp_y * scale_y) + \
                            comp_side * scale_y),
                        )
                    cropped_image = images[comp_page]["image"].crop(crop_box)
                    output = self.__ocrIt(cropped_image,
                                          form_uuid,
                                          resourcename,
                                          linenum)
                    print(output)
                    linenum += 1

            # any other component type is ignored

    output = etree.tostring(layout_etree, pretty_print=True)
    return output
def __ocrIt(self,
            image,
            form_uuid,
            resourcename,
            linenum,
            content_type="textbox"):
    """
        Put Tesseract to work, the actual OCRing is done here

        @param image: cropped image of a single form component
        @param form_uuid: form uuid, only used to name the temporary files
        @param resourcename: resource name, only used to name the
                             temporary files
        @param linenum: component counter, only used to name the
                        temporary files
        @param content_type: "optionbox" or "textbox"

        @return: "optionbox": True if the mean of the first image band is
                 below 96 (box counted as filled), otherwise False
                 "textbox": the text written by tesseract, newlines
                 replaced by spaces
                 any other content_type: None
    """
    # local import, this is the only place which spawns a process
    import subprocess

    if content_type == "optionbox":
        stat = ImageStat.Stat(image)
        return stat.mean[0] < 96

    if content_type != "textbox":
        return None

    # uuid1 keeps the file names of concurrent requests apart
    basename = "%s_%s_%s_%s" % (uuid.uuid1(),
                                form_uuid,
                                resourcename,
                                linenum)
    # absolute paths instead of os.chdir(): the working directory is shared
    # by the whole process and must not be changed per request
    tempdir = os.path.join(os.getcwd(),
                           "applications/eden/uploads",
                           "ocr_temp")
    try:
        os.mkdir(tempdir)
    except OSError:
        # folder is already there
        pass

    inputfilename = os.path.join(tempdir, "%s.tif" % basename)
    outputfilename = os.path.join(tempdir, "%s_text" % basename)
    textfilename = "%s.txt" % outputfilename

    try:
        image.save(inputfilename)
        # argument list instead of a shell string, so that quotes or shell
        # metacharacters in the names cannot alter the command
        devnull = open(os.devnull, "w")
        try:
            try:
                success = subprocess.call(["tesseract",
                                           inputfilename,
                                           outputfilename,
                                           "-psm", "7"],
                                          stdout=devnull)
            except OSError:
                # executable could not be started at all
                success = -1
        finally:
            devnull.close()
        if success != 0:
            # NOTE(review): presumably self.r.error() aborts the request
            # by raising - confirm
            self.r.error(501, self.T("Tesseract not installed"))

        outputfile = open(textfilename)
        try:
            outputtext = outputfile.read()
        finally:
            outputfile.close()
    finally:
        # clean up even if tesseract failed
        for leftover in (inputfilename, textfilename):
            try:
                os.remove(leftover)
            except OSError:
                pass
        try:
            os.rmdir(tempdir)
        except OSError:
            # not empty: another request is still using the folder
            pass

    return outputtext.replace("\n", " ")
def __convertImage2binary(self, image, threshold = 180):
    """
        Converts the image into binary (pure black and white)

        @param image: the image to convert, it is left unmodified
        @param threshold: gray values below it become 0 (black), all
                          other values become 255 (white)

        @return: a new grayscale image containing only 0 and 255
    """
    image = ImageOps.grayscale(image)
    # point() maps every gray value in one pass, which replaces the
    # getpixel/putpixel loop over all pixels
    return image.point(lambda value: 0 if value < threshold else 255)
def __findRegions(self, im):
    """
        Return the list of regions of connected black pixels.

        -----------------------------------------------------------
        Raster Scanning Algorithm for Connected Component Analysis:
        -----------------------------------------------------------
        On the first pass:
        =================
        1. Iterate through each pixel column by column (Raster Scanning)
        2. If the pixel is black (value 0 after conversion to grayscale)
            1. Look at the labels of the pixels at (x-1, y) and (x, y-1)
            2. If neither is labelled, give the pixel a new label
            3. Otherwise give it the smallest label of its neighbours
            4. Store the equivalence between the neighbouring labels
        On the second pass:
        ===================
        1. Iterate through each pixel again
        2. If the pixel is labelled
            1. Put it into the region of its lowest equivalent label

        The equivalences are kept in a union-find forest, so labels which
        are only linked through a third label still end up in one region.

        ( source: http://en.wikipedia.org/wiki/Connected_Component_Labeling )

        @param im: the image to analyse
        @return: list of region objects, in no particular order
    """
    width, height = im.size
    im = im.convert("L")
    pixels = im.load()

    # labels[x][y] is the provisional label of the pixel, 0 is background
    labels = [[0] * height for column in range(width)]
    # parent[label] is the union-find forest, index 0 is never used
    parent = [0]

    def find(label):
        """ Return the lowest equivalent label, compressing the path """
        root = label
        while parent[root] != root:
            root = parent[root]
        while parent[label] != root:
            parent[label], label = root, parent[label]
        return root

    # first pass. find regions.
    for x in range(width):
        current = labels[x]
        previous = labels[x - 1] if x > 0 else None
        for y in range(height):
            if pixels[x, y] != 0:
                # not BLACK
                continue
            beside = previous[y] if previous is not None else 0
            above = current[y - 1] if y > 0 else 0
            if beside and above:
                root_a = find(beside)
                root_b = find(above)
                lowest = min(root_a, root_b)
                # always link the higher root to the lower one, so that a
                # root is the lowest label of its region
                parent[max(root_a, root_b)] = lowest
                current[y] = lowest
            elif beside or above:
                current[y] = beside or above
            else:
                parent.append(len(parent))
                current[y] = len(parent) - 1

    # second pass. collect the pixels of all equivalent labels.
    regions = {}
    for x in range(width):
        for y in range(height):
            label = labels[x][y]
            if label:
                root = find(label)
                if root in regions:
                    regions[root].add(x, y)
                else:
                    regions[root] = self.__Region(x, y)
    return list(regions.values())
def __getOrientation(self, markers):
    """
        Returns the orientation of the sheet in degrees

        The angle is the negated arctangent of dx/dy between markers[0]
        and markers[2]; if both have the same y the slope is replaced by
        a very large number, which gives (almost exactly) -90.
    """
    start_x, start_y = markers[0]
    end_x, end_y = markers[2]
    delta_x = (end_x - start_x) * 1.0
    delta_y = (end_y - start_y) * 1.0
    if delta_y == 0:
        slope = 999999999999999999999999999
    else:
        slope = delta_x / delta_y
    return -math.degrees(math.atan(slope))
def __scaleFactor(self, markers):
    """
        Returns the scale factors lengthwise and breadthwise

        @param markers: marker list, markers 0, 2 and 6 are used
        @return: {"x": ..., "y": ...}, the distance between markers 6 and 2
                 divided by 596 - 60 and the distance between markers 0
                 and 2 divided by 842 - 60
    """
    std_width = 596 - 60
    std_height = 842 - 60
    measured_height = self.__distance([markers[0], markers[2]])
    measured_width = self.__distance([markers[6], markers[2]])
    return {"x": measured_width / std_width,
            "y": measured_height / std_height}
def __distance(self, li):
    """
        Returns the euclidean distance between two points

        @param li: [(x1, y1), (x2, y2)], every coordinate is converted
                   with int() before it is used
    """
    first, second = li[0], li[1]
    delta_x = float(int(second[0]) - int(first[0]))
    delta_y = float(int(second[1]) - int(first[1]))
    return math.sqrt(delta_x ** 2 + delta_y ** 2)
def __getMarkers(self, image):
    """
        Gets the markers on the OCR image

        A region counts as a marker if its area is above 320 and its
        aspect ratio lies between 0.67 and 1.5 (both exclusive).

        @return: centroids of the markers: the first three of the sorted
                 centroids ordered by y, then the fourth, then the fifth
                 to seventh ordered by y
    """
    centers = []
    for region in self.__findRegions(image):
        if region.area <= 320:
            continue
        if 0.67 < region.aspectratio() < 1.5:
            centers.append(region.centroid())

    # This is the list of all the markers on the form.
    centers.sort()

    def by_y(point):
        return point[1]

    markers = sorted(centers[0:3], key=by_y)
    markers += centers[3:4]
    markers += sorted(centers[4:7], key=by_y)
    return markers
class __Region():
    """
        A set of pixels together with its bounding box

        area is the number of pixels which have been added.
    """

    def __init__(self, x, y):
        """ Initialize the region with its first pixel """
        self._pixels = [(x, y)]
        self._min_x = x
        self._max_x = x
        self._min_y = y
        self._max_y = y
        self.area = 1

    def add(self, x, y):
        """ Add a pixel to the region and grow the bounding box """
        self._pixels.append((x, y))
        self.area += 1
        self._min_x = min(self._min_x, x)
        self._max_x = max(self._max_x, x)
        self._min_y = min(self._min_y, y)
        self._max_y = max(self._max_y, y)

    def centroid(self):
        """ Returns the centre of the bounding box as an (x, y) tuple """
        # floor division keeps integer pixel coordinates integral
        return ((self._min_x + self._max_x) // 2,
                (self._min_y + self._max_y) // 2)

    def box(self):
        """ Returns the bounding box as [(min_x, min_y), (max_x, max_y)] """
        return [(self._min_x, self._min_y), (self._max_x, self._max_y)]

    def aspectratio(self):
        """
            Returns (max_x - min_x) / (max_y - min_y) as a float

            A region with zero vertical extent has no finite ratio:
            infinity is returned for it, or 1.0 if the horizontal extent
            is zero as well.
        """
        width = self._max_x - self._min_x
        length = self._max_y - self._min_y
        if length == 0:
            # a single row of pixels must not raise ZeroDivisionError
            if width == 0:
                return 1.0
            return float("inf")
        return float(width) / float(length)
#==============================================================================
#==================== unicode support to reportlab ============================
#==============================================================================
fonts_directory = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                               "../../static/fonts")

# Every group below is optional: if anything goes wrong while registering
# its fonts or importing its character maps, the maps of the whole group are
# left empty and the group is skipped when fontchecksequence is built.
# "except Exception" is used instead of a bare except, so that
# KeyboardInterrupt and SystemExit are not swallowed.

#------------------------------------------------------------------------------
# unifont - considered to be an allrounder
#------------------------------------------------------------------------------
try:
    pdfmetrics.registerFont(TTFont("unifont",
                                   os.path.join(fonts_directory,
                                                "unifont/unifont.ttf")))
    unifont_map = [
        (0, 65536),
    ]
except Exception:
    unifont_map = []
    sys.stderr.write("S3 Debug: s3ocr: unifont not found, run static/fonts/setfonts.py\n")

#------------------------------------------------------------------------------
# Arabic fonts
#------------------------------------------------------------------------------
try:
    pdfmetrics.registerFont(TTFont("AlMateen-Bold",
                                   os.path.join(fonts_directory,
                                                "arabic/ae_AlMateen-Bold.ttf")))
    from fontmap.AlMateenBold import AlMateenBold_map
    pdfmetrics.registerFont(TTFont("AlMohanad",
                                   os.path.join(fonts_directory,
                                                "arabic/ae_AlMohanad.ttf")))
    from fontmap.AlMohanad import AlMohanad_map
except Exception:
    AlMateenBold_map = []
    AlMohanad_map = []
    sys.stderr.write("S3 Debug: s3ocr: arabic fonts not found, run static/fonts/setfonts.py\n")

#------------------------------------------------------------------------------
# japanese fonts
#------------------------------------------------------------------------------
try:
    pdfmetrics.registerFont(TTFont("SazanamiGothic",
                                   os.path.join(fonts_directory,
                                                "japanese/sazanami-gothic.ttf")))
    from fontmap.SazanamiGothic import SazanamiGothic_map
    pdfmetrics.registerFont(TTFont("SazanamiMincho",
                                   os.path.join(fonts_directory,
                                                "japanese/sazanami-mincho.ttf")))
    from fontmap.SazanamiMincho import SazanamiMincho_map
except Exception:
    SazanamiGothic_map = []
    SazanamiMincho_map = []
    sys.stderr.write("S3 Debug: s3ocr: japanese fonts not found, run static/fonts/setfonts.py\n")

#--------------------------------------------------------------------------
# Standard fonts
#--------------------------------------------------------------------------
Helvetica = "Helvetica"
# (first, last) character codes; the lookup in Form.selectfont treats the
# second value as exclusive
Helvetica_map = [
    (32, 127),
    (160, 161),
    (173, 173),
]

# Fonts
#Courier = "Courier"
#Helvetica_Bold = "Helvetica-Bold"
#Helvetica_Bold_Oblique = "Helvetica-BoldOblique"
#Helvetica_Oblique = "Helvetica-Oblique"

#--------------------------------------------------------------------------
# some global variables
#--------------------------------------------------------------------------
# fonts in the order in which they are tried for a character
fontlist = [
    "Helvetica",        # english and latin english fonts
    "AlMateen-Bold",    # arabic fonts
    "AlMohanad",        # arabic fonts
    "SazanamiGothic",   # japanese fonts
    "SazanamiMincho",   # japanese fonts
    "unifont",          # unifont should be always at the last
]

fontmapping = {
    "Helvetica": Helvetica_map,
    "AlMateen-Bold": AlMateenBold_map,
    "AlMohanad": AlMohanad_map,
    "SazanamiGothic": SazanamiGothic_map,
    "SazanamiMincho": SazanamiMincho_map,
    "unifont": unifont_map,
}

# only the fonts which have a non-empty character map, in fontlist order
fontchecksequence = [eachfont for eachfont in fontlist
                     if len(fontmapping[eachfont]) != 0]
#========================================================================== | |
#=============== internal Class Definitions and functions ================= | |
#========================================================================== | |
#======================== pdf layout from xform =========================== | |
class Form(object):
    """
        Form class to use reportlab to generate pdf

        The form keeps a text cursor (self.x, self.y) which the drawing
        methods move; self.pagebegin is 1 as long as nothing has been put
        on the current page, the first line feed on a page is skipped.
    """

    def __init__(self, pdfname="ocrform.pdf", margintop=65, marginsides=50,
                 **kw):
        """
            Form initialization

            @param pdfname: default for the "pdfpath" keyword
            @param margintop: top margin of the writable area
            @param marginsides: left and right margin of the writable area
            @keyword pdfpath, verbose, linespacing, typeface, fontsize,
                     form_uuid, form_revision, form_resourcename: optional
                     settings, see the defaults below
        """
        self.pdfpath = kw.get("pdfpath", pdfname)
        self.verbose = kw.get("verbose", 0)
        self.linespacing = kw.get("linespacing", 4)
        self.font = kw.get("typeface", "Helvetica")
        self.fontsize = kw.get("fontsize", 13)
        self.IObuffer = StringIO()
        self.canvas = Canvas(self.IObuffer, pagesize = A4)
        self.width, self.height = A4
        self.x = marginsides
        self.lastx = marginsides
        self.marginsides = marginsides
        self.margintop = margintop
        self.y = self.height - margintop
        self.lasty = self.height - margintop
        self.num = 1
        self.gray = 0
        self.pagebegin = 1
        # same value as set by decorate(), so that draw_check_boxes() and
        # draw_circle() also work before decorate() has been called
        self.origin = {"x": 29, "y": 29}
        self.form_uuid = kw.get("form_uuid" ,"")
        self.form_revision = kw.get("form_revision" ,"")
        self.form_resourcename = kw.get("form_resourcename" ,"")
        self.put_page_num()
        self.put_metainfo()

    def barcode(self, uuid):
        """ Generate barcode of uuid at (lastx, lasty), moves y 20 down """
        barcode = code128.Code128(str(uuid), barWidth=1, barHeight=20)
        barcode.drawOn(self.canvas, self.lastx, self.lasty)
        self.lasty = self.lasty - 20
        self.y = self.lasty

    def decorate(self):
        """
            Decorates the form with the seven markers needed to align the
            form later
        """
        c = self.canvas
        c.rect(20, 20, 20, 20, fill=1)                              # bt lf
        c.rect(self.width - 40, 20, 20, 20, fill=1)                 # bt rt
        c.rect(20, self.height - 40, 20, 20, fill=1)                # tp lf
        c.rect(self.width/2 - 10, 20, 20, 20, fill=1)               # bt md
        c.rect(20, self.height/2 - 10, 20, 20, fill=1)              # md lf
        c.rect(self.width - 40, self.height - 40, 20, 20, fill=1)   # tp rt
        c.rect(self.width - 40, self.height/2 - 10, 20, 20, fill=1) # md rt
        self.origin = {"x": 29, "y": 29} # location of top left marker

    def print_text(self,
                   lines,
                   fontsize=13,
                   gray=0,
                   seek=0,
                   continuetext=0,
                   style="default"):
        """
            Give the lines to be printed as a list,
            set the font and grey level

            @param lines: list of lines, html entities are unescaped
            @param fontsize: font size, becomes the current font size
            @param gray: gray level, becomes the current gray level
            @param seek: move the x cursor by this amount first
            @param continuetext: if true continue at the cursor instead of
                                 starting a new line
            @param style: "center" or "right" to align a line, only used
                          without continuetext
        """
        self.fontsize = fontsize
        self.gray = gray
        if not continuetext and not self.pagebegin:
            self.resetx()
            self.nextline()
        self.pagebegin = 0
        if seek:
            self.resetx(seek=seek)
        numlines = len(lines)
        loopcounter = 0
        for line in lines:
            loopcounter += 1
            line = self.__html_unescape(unicode(line))

            # alignment, half the font size is taken as character width
            if not continuetext:
                if style == "center":
                    self.x = \
                        (self.width - (len(line) * (self.fontsize / 2)))/2
                elif style == "right":
                    self.x = \
                        ((self.width - self.marginsides) -\
                             ((len(line)+3) * (self.fontsize / 2)))
            if continuetext:
                # wrapping multiline options
                if (self.width - self.marginsides - self.x) < 100:
                    self.resetx()
                    self.nextline()
            if (self.y - self.fontsize) < 50:
                self.set_new_page()
            for char in line:
                t = self.writechar(char)
                self.x = t.getX()
                self.y = t.getY()
                # text wrapping -> TODO: word wrapping
                if self.x > (self.width - self.marginsides - self.fontsize):
                    self.writechar("-")
                    self.nextline()
                    self.resetx(self.fontsize)
            if not continuetext and loopcounter != numlines:
                self.nextline()
                self.resetx()

    def writechar(self, char=" "):
        """
            Writes one character on canvas at the cursor

            @return: the text object, its getX()/getY() give the position
                     after the character
        """
        font = self.selectfont(char)
        t = self.canvas.beginText(self.x, self.y)
        t.setFont(font, self.fontsize)
        t.setFillGray(self.gray)
        t.textOut(char)
        self.canvas.drawText(t)
        return t

    def nextline(self, fontsize=0):
        """
            Moves the y cursor down one line

            @param fontsize: new current font size, 0 keeps the current one
        """
        if fontsize != 0:
            self.fontsize = fontsize
        if self.pagebegin == 0:
            self.y = self.y - (self.fontsize + self.linespacing)
            if self.y < self.margintop:
                self.set_new_page()
        self.pagebegin = 0

    def resetx(self, offset=0, seek=None):
        """
            Moves the x cursor with offset

            @param offset: distance from the left margin, used if seek is
                           None
            @param seek: if given, move the cursor relative to its current
                         position; a cursor which runs over the right
                         margin is wrapped onto the following lines
        """
        if seek is None:
            self.x = self.marginsides + offset
        else:
            self.x += seek
            lastvalidx = self.width - (self.marginsides + (self.fontsize / 2))
            writablex = self.width - (2 * self.marginsides)
            if self.x > lastvalidx:
                currentx = self.x - self.marginsides
                remx = currentx % writablex
                self.x = remx + self.marginsides
                numlines = int(currentx / writablex)
                for line in range(numlines):
                    self.nextline()

    def __html_unescape(self, text):
        """
            Helper function, unescape any html special characters
        """
        return re.sub("&(%s);" % "|".join(name2codepoint),
                      lambda m: unichr(name2codepoint[m.group(1)]),
                      text)

    def linespace(self, spacing=2):
        """
            Moves the y cursor down by given units
        """
        if self.pagebegin == 0:
            self.y -= spacing
        self.pagebegin = 0

    def selectfont(self, char):
        """
            Select font according to the input character

            @return: the first font of fontchecksequence which has a range
                     (first, last) with first <= code of char < last
        """
        charcode = ord(char)
        for font in fontchecksequence:
            for fontrange in fontmapping[font]:
                # plain comparison instead of "in xrange(...)", which walks
                # through the whole range for every character
                if fontrange[0] <= charcode < fontrange[1]:
                    return font
        return "Helvetica"  # fallback, if no thirdparty font is installed

    def draw_check_boxes(self,
                         boxes=1,
                         completeline=0,
                         lines=0,
                         seek=0,
                         continuetext=0,
                         fontsize=15,
                         gray=0,
                         style="",
                         ):
        """
            Function to draw check boxes default no of boxes = 1

            @param boxes: number of boxes
            @param completeline: if 1, boxes is replaced by
                                 int(page width / font size)
            @param lines: not used
            @param seek: move the x cursor by this amount first, ignored
                         if it is larger than the writable width
            @param continuetext: if true continue at the cursor instead of
                                 starting a new line
            @param fontsize: side of a box, becomes the current font size;
                             0 keeps the current font size
            @param gray: gray level of the box outline
            @param style: "center" or "right" to set the start position

            @return: dict with x, y (relative to self.origin), side, boxes
                     and page of the first box
        """
        if not continuetext and not self.pagebegin:
            self.resetx()
            self.nextline()
        self.pagebegin = 0
        # 0 means "keep the current size", it must not overwrite it
        if fontsize != 0:
            self.fontsize = fontsize
        c = self.canvas
        c.setLineWidth(0.90)
        c.setStrokeGray(gray)
        if style == "center":
            self.x = self.width / 2
        elif style == "right":
            self.x = self.width - self.marginsides - self.fontsize
        if seek > (self.width - (self.marginsides + self.fontsize)):
            seek = 0
        if (self.y - self.fontsize) < 40:
            self.set_new_page()
        if seek != 0:
            self.x = self.x + seek
        if completeline == 1:
            boxes = int(self.width / self.fontsize)
        # NOTE(review): "boxes" is the requested number, the loop below may
        # stop earlier at the right margin - confirm that this is intended
        box_startpx = {
            "x": self.x - self.origin["x"],
            "y": (842-self.y-self.fontsize) - self.origin["y"],
            "side": self.fontsize - 1,
            "boxes": boxes,
            "page": self.num
            }
        for i in range(boxes):
            c.rect(self.x, self.y, self.fontsize, self.fontsize)
            self.x = self.x + self.fontsize
            if self.x > (self.width - (self.marginsides + self.fontsize)):
                break
        self.lastx = self.x
        return box_startpx

    def draw_circle(self,
                    boxes=1,
                    completeline=0,
                    lines=0,
                    seek=0,
                    continuetext=0,
                    fontsize=0,
                    gray=0,
                    style=""):
        """
            Draw circles on the form

            @param boxes: number of circles
            @param seek: distance by which the x cursor is moved before the
                         first and after every circle
            @param gray: gray level of the outline
            @param completeline, lines, continuetext, fontsize, style:
                   not used, the current font size gives the diameter

            @return: dict with x, y (relative to self.origin), radius,
                     boxes and page of the first circle
        """
        c = self.canvas
        c.setLineWidth(0.90)
        c.setStrokeGray(gray)
        self.resetx(seek=seek)
        circle_center = {
            "x": (self.x + self.fontsize/2) - self.origin["x"],
            "y": (842 - self.y - self.fontsize/2) - self.origin["y"],
            "radius": self.fontsize/2,
            "boxes" : boxes,
            "page": self.num
            }
        for eachcircle in range(boxes):
            c.circle(self.x + self.fontsize/2, self.y + self.fontsize/2,
                     self.fontsize/2, fill = 0)
            self.resetx(seek=self.fontsize)
            self.resetx(seek=seek)
        return circle_center

    def draw_line(self, gray=0, nextline=0):
        """
            Function to draw a straight line from margin to margin

            Sets the current font size to 4.

            @param gray: gray level of the line
            @param nextline: if true move one line down first, otherwise
                             8 units
        """
        self.fontsize = 4
        if nextline:
            self.nextline()
        else:
            self.linespace(8)
        self.resetx()
        c = self.canvas
        c.setStrokeGray(gray)
        c.setLineWidth(1)
        c.line(self.x, self.y, self.width - self.x, self.y)
        self.y = self.y + (self.linespacing)

    def set_new_page(self):
        """
            All changes are forgotten when a showPage() has been executed.
            They have to be set again.
        """
        self.num += 1
        c = self.canvas
        c.showPage()
        self.decorate()
        self.x = self.marginsides
        self.lastx = self.marginsides
        self.y = self.height - self.margintop
        self.put_page_num()
        self.put_metainfo()
        self.pagebegin = 1

    def _write_footer_text(self, text, x):
        """ Writes text character by character, starting at (x, 25) """
        self.x = x
        self.y = 25
        for char in text:
            t = self.writechar(char)
            self.x = t.getX()
            self.y = t.getY()

    def put_metainfo(self):
        """
            Writes uuid, revision and resource name of the form at the
            bottom of the page, cursor and font size are left unchanged
        """
        # preserve state
        x, y = self.x, self.y
        fontsize = self.fontsize
        # do the job
        self.fontsize = 10
        uuid_text = "UUID: %s" % self.form_uuid
        rest_text = "Revision: %s Resource: %s" % (self.form_revision,
                                                   self.form_resourcename)
        self._write_footer_text(uuid_text, self.marginsides)
        self._write_footer_text(rest_text, (self.width/2) + 20)
        # restore state
        self.fontsize = fontsize
        self.x, self.y = x, y

    def put_page_num(self):
        """
            Writes the page number at the bottom right of the page, cursor
            and font size are left unchanged
        """
        # preserve state
        x, y = self.x, self.y
        fontsize = self.fontsize
        # do the job
        self.fontsize = 10
        text = "page%s" % self.num
        self._write_footer_text(
            text,
            self.width - \
                (((len(text)+2)*(self.fontsize/2)) + self.marginsides))
        # restore state
        self.fontsize = fontsize
        self.x, self.y = x, y

    def set_title(self, title = "FORM"):
        """ Sets the title of the pdf. """
        self.canvas.setTitle(title)

    def save(self):
        """
            Saves the form

            @return: the content of the pdf, the buffer is closed afterwards
        """
        self.canvas.save()
        pdf = self.IObuffer.getvalue()
        self.IObuffer.close()
        return pdf
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment