Skip to content

Instantly share code, notes, and snippets.

@Ahanmr
Created March 29, 2020 20:51
Show Gist options
  • Save Ahanmr/f8ecfb6278d1f235d0efd72861713895 to your computer and use it in GitHub Desktop.
Save Ahanmr/f8ecfb6278d1f235d0efd72861713895 to your computer and use it in GitHub Desktop.
class TableFinder(object):
    """
    Given a PDF page, finds table structures.

    Construction runs the whole pipeline and stores each stage on the
    instance:

    - ``settings``: ``DEFAULT_TABLE_SETTINGS`` overlaid with the caller's
      overrides, with per-axis tolerances resolved.
    - ``edges``: the result of :meth:`get_edges`.
    - ``intersections``: ``edges_to_intersections(edges, ...)``.
    - ``cells``: ``intersections_to_cells(intersections)``.
    - ``tables``: one ``Table`` per group returned by ``cells_to_tables``.
    """

    def __init__(self, page, settings=None):
        """
        Build the finder for ``page`` and immediately compute its tables.

        :param page: page object; this class reads ``page.bbox``,
            ``page.edges`` and calls ``page.extract_words(...)``.
        :param settings: optional mapping of overrides; every key must
            already exist in ``DEFAULT_TABLE_SETTINGS``.
        :raises ValueError: if ``settings`` contains an unknown key, or if
            :meth:`get_edges` rejects the strategy configuration.
        """
        # A ``None`` sentinel avoids sharing one mutable dict between calls.
        if settings is None:
            settings = {}

        for key in settings:
            if key not in DEFAULT_TABLE_SETTINGS:
                raise ValueError(
                    "Unrecognized table setting: '{0}'".format(key)
                )

        self.page = page
        self.settings = dict(DEFAULT_TABLE_SETTINGS)
        self.settings.update(settings)

        # Axis-specific tolerances left as None inherit the generic value.
        for var, fallback in [
            ("text_x_tolerance", "text_tolerance"),
            ("text_y_tolerance", "text_tolerance"),
            ("intersection_x_tolerance", "intersection_tolerance"),
            ("intersection_y_tolerance", "intersection_tolerance"),
        ]:
            if self.settings[var] is None:
                self.settings[var] = self.settings[fallback]

        self.edges = self.get_edges()
        self.intersections = edges_to_intersections(
            self.edges,
            self.settings["intersection_x_tolerance"],
            self.settings["intersection_y_tolerance"],
        )
        self.cells = intersections_to_cells(self.intersections)
        self.tables = [
            Table(self.page, t) for t in cells_to_tables(self.cells)
        ]

    def get_edges(self):
        """
        Collect the vertical and horizontal edges used to detect tables.

        For each orientation the base edges come from the configured
        strategy ("lines", "lines_strict", "text" or "explicit") and the
        explicit lines from the settings are always appended. The combined
        edges are merged when either ``snap_tolerance`` or
        ``join_tolerance`` is positive, then filtered by
        ``edge_min_length``.

        :returns: the result of ``utils.filter_edges`` on the final edges.
        :raises ValueError: if a strategy is not in ``TABLE_STRATEGIES``, or
            if a strategy is "explicit" but fewer than two explicit lines
            were supplied for that orientation.
        """
        settings = self.settings
        self._check_strategies(settings)

        v_strat = settings["vertical_strategy"]
        h_strat = settings["horizontal_strategy"]

        # Words are only needed (and only extracted) for a "text" strategy.
        if v_strat == "text" or h_strat == "text":
            xt = settings["text_x_tolerance"]
            if xt is None:
                xt = settings["text_tolerance"]
            yt = settings["text_y_tolerance"]
            if yt is None:
                yt = settings["text_tolerance"]
            words = self.page.extract_words(
                x_tolerance=xt,
                y_tolerance=yt,
                keep_blank_chars=settings["keep_blank_chars"],
            )

        v_explicit = [
            self._vertical_edge(desc)
            for desc in settings["explicit_vertical_lines"]
        ]
        if v_strat == "lines":
            v_base = utils.filter_edges(self.page.edges, "v")
        elif v_strat == "lines_strict":
            v_base = utils.filter_edges(
                self.page.edges, "v", edge_type="lines"
            )
        elif v_strat == "text":
            v_base = words_to_edges_v(
                words, word_threshold=settings["min_words_vertical"]
            )
        elif v_strat == "explicit":
            v_base = []
        v = v_base + v_explicit

        h_explicit = [
            self._horizontal_edge(desc)
            for desc in settings["explicit_horizontal_lines"]
        ]
        if h_strat == "lines":
            h_base = utils.filter_edges(self.page.edges, "h")
        elif h_strat == "lines_strict":
            h_base = utils.filter_edges(
                self.page.edges, "h", edge_type="lines"
            )
        elif h_strat == "text":
            h_base = words_to_edges_h(
                words, word_threshold=settings["min_words_horizontal"]
            )
        elif h_strat == "explicit":
            h_base = []
        h = h_base + h_explicit

        edges = list(v) + list(h)

        if settings["snap_tolerance"] > 0 or settings["join_tolerance"] > 0:
            edges = merge_edges(
                edges,
                snap_tolerance=settings["snap_tolerance"],
                join_tolerance=settings["join_tolerance"],
            )
        return utils.filter_edges(
            edges, min_length=settings["edge_min_length"]
        )

    @staticmethod
    def _check_strategies(settings):
        """Validate both ``*_strategy`` settings; raise ValueError if bad."""
        for name in ["vertical", "horizontal"]:
            strategy = settings[name + "_strategy"]
            if strategy not in TABLE_STRATEGIES:
                raise ValueError("{0} must be one of {{{1}}}".format(
                    name + "_strategy",
                    ",".join(TABLE_STRATEGIES)
                ))
            if strategy == "explicit":
                if len(settings["explicit_" + name + "_lines"]) < 2:
                    # Report the orientation-specific setting name
                    # ("vertical_strategy" / "horizontal_strategy"), not
                    # the strategy value ("explicit_strategy").
                    raise ValueError("If {0} == 'explicit', {1} must be specified as list/tuple of two or more floats/ints.".format(
                        name + "_strategy",
                        "explicit_" + name + "_lines",
                    ))

    def _vertical_edge(self, desc):
        """Turn an explicit vertical line (dict or bare x value) into an edge."""
        if isinstance(desc, dict):
            # "x" serves as the fallback for both x0 and x1; missing
            # top/bottom default to page.bbox[1] / page.bbox[3].
            edge = {
                "x0": desc.get("x0", desc.get("x")),
                "x1": desc.get("x1", desc.get("x")),
                "top": desc.get("top", self.page.bbox[1]),
                "bottom": desc.get("bottom", self.page.bbox[3]),
            }
        else:
            # A bare value is an x position spanning bbox[1]..bbox[3].
            edge = {
                "x0": desc,
                "x1": desc,
                "top": self.page.bbox[1],
                "bottom": self.page.bbox[3],
            }
        edge["height"] = edge["bottom"] - edge["top"]
        edge["orientation"] = "v"
        return edge

    def _horizontal_edge(self, desc):
        """Turn an explicit horizontal line (dict or bare value) into an edge."""
        if isinstance(desc, dict):
            # "top" and "bottom" fall back to each other; missing x0/x1
            # default to page.bbox[0] / page.bbox[2].
            edge = {
                "x0": desc.get("x0", self.page.bbox[0]),
                "x1": desc.get("x1", self.page.bbox[2]),
                "top": desc.get("top", desc.get("bottom")),
                "bottom": desc.get("bottom", desc.get("top")),
            }
        else:
            # A bare value is used as both top and bottom, spanning
            # bbox[0]..bbox[2].
            edge = {
                "x0": self.page.bbox[0],
                "x1": self.page.bbox[2],
                "top": desc,
                "bottom": desc,
            }
        edge["width"] = edge["x1"] - edge["x0"]
        edge["orientation"] = "h"
        return edge
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment