Skip to content

Instantly share code, notes, and snippets.

View jbaiter's full-sized avatar

Johannes Baiter jbaiter

View GitHub Profile

archiscribe-corpus

This is the corpus repository for https://archiscribe.jbaiter.de.

The goal is to have as much diverse OCR ground truth for 19th Century German prints as possible.

Currently the corpus contains 123 lines from 3 works published across 3 years. Detailed statistics are available below.

@jbaiter
jbaiter / dta_aligner.py
Last active September 28, 2017 13:51
Dependencies: `pip install click requests editdistance kraken lxml pillow-simd sickle`
from __future__ import division
import json
import logging
import os
import re
from collections import OrderedDict
from io import BytesIO
import click
package main
import (
"archive/tar"
"bytes"
"compress/gzip"
"encoding/json"
"errors"
"flag"
"fmt"
This file has been truncated, but you can view the full file.
<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN" "http://www.w3.org/TR/html4/loose.dtd">
<html>
<head>
<title>OCR Output</title>
<meta http-equiv='content-type' content='text/html; charset=utf-8'>
<meta http-equiv='content-style-type' content='text/css'>
<meta name='ocr-capabilities' content='ocr_page ocr_par ocr_cinfo ocr_line'>
<meta name='ocr-system' content=' '>
<meta name='ocr-number-of-pages' content='518'>
<meta name='DC.title' content='Popular Tales of the West Highlands'>
def align(truth_lines, ocr_lines):
nonaligned = []
aligned = []
align_idx = 0
for truth_line in truth_lines:
best_error = 1.0
best_align = 0
for idx, ocr_line in enumerate(ocr_lines[align_idx+1:],
align_idx+1):
total_error = levenshtein(truth_line, ocr_line)
package org.apache.solr.highlight;
import com.google.common.primitives.Longs;
import java.io.IOException;
import java.lang.invoke.MethodHandles;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import asyncio
import time
from threading import Thread
import aiohttp
import janus
# Q: Do I really have to pass the loop from the main thread around?
async def fetch(url, loop):
async with aiohttp.ClientSession(loop=loop) as session:
@jbaiter
jbaiter / example.py
Created December 25, 2016 22:19
threaded version
def fetch_from_remote(url):
    """Issue an HTTP GET for ``url`` and return the response's ``content``."""
    response = requests.get(url)
    return response.content
def content_generator(urls):
    """Yield the fetched content of each URL as its download completes.

    Every URL is submitted to a pool of four worker threads. Results are
    yielded in completion order, which is not necessarily the order of
    ``urls``.

    Args:
        urls: Iterable of URLs, each passed to ``fetch_from_remote``.

    Yields:
        The return value of ``fetch_from_remote`` for each URL.

    Raises:
        Exception: Whatever ``fetch_from_remote`` raised for a URL is
            re-raised here when that URL's result is retrieved.
    """
    # The executor class is ``ThreadPoolExecutor``; the previous spelling
    # ("ThreadPoolExectur") raised AttributeError on first iteration.
    with concurrent.futures.ThreadPoolExecutor(max_workers=4) as pool:
        futs = [pool.submit(fetch_from_remote, url) for url in urls]
        # as_completed hands back futures as soon as each one finishes.
        for fut in concurrent.futures.as_completed(futs):
            yield fut.result()
@jbaiter
jbaiter / manifest.json
Last active November 9, 2016 08:55
Manifest with absolute links in Metadata (see .metadata[1])
{
"attribution": "Bayerische Staatsbibliothek",
"license": "https://creativecommons.org/licenses/by/3.0",
"logo": "https://www.bsb-muenchen.de/fileadmin/templates/images/bsb_logo.png",
"related": "http://daten.digitale-sammlungen.de/~db/0008/bsb00083115/images",
"seeAlso": [
{
"@id": "http://daten.digitale-sammlungen.de/~db/0008/bsb00083115/images"
},
{