Christian Boulanger cboulanger

## signals.rb
signal_phrases = [[
                    "see", "cf.", "e.g.", "compare", "on", "for example",
                    "on the contrary", "on the other hand",
                    "generally", "similarly", "alternatively",
                    "the discussion in", "regarding", "on this, see",
                    "also", "although", "as discussed in", "detailed", "described", "cited in", "by",
                    "first published as", "reprinted in"
                  ], [
                    "siehe", "s.", "vgl.", "vgl",
                    "vgl. nur", "vgl", "zum ganzen", "so z.b.", "so auch", "bei",

## refs-annotated.txt
<text>1 Ober 50% der Deutschen verbringen jeden Abend und über 70% jedes Wochenende zu Hause, vgl. </text><ref>Michael Andritzky/Gert Selle (Hrsg.)  , Lembereich Wohnen Band 1, Reinbek 1979, S. 13</ref>.
<text>2 Der Anteil der Mietwohnungen betrug 1978 63 % (</text><ref>Lothar Herberger und Mitarbeiter, Bestand und Struktur der Gebäude und Wohnungen, in: Wirtschaft und Statistik 1980, S. 283—291, 286</ref><text>), in Großstädten sind sogar mehr als 80 % der Wohnungen vermietet (</text><ref>Rudi Ulbrich, a.a.O., Anm. 3, S. 18</ref><text>).               ' </text>
<text>3 Die 1 %-Wohnungsstichprobe 1978 hat ergeben, daß einer Zahl von 24,3 Mio. Haushalten nur 23,4 Mio. Wohnungen gegenüberstehen. Hiervon stehen knapp 700000 leer und etwa 200000 dienen als Zweitwohnungen, vgl. hierzu ausführlich </text><ref>Rudi Ulbrich, Die Wohnungsversorgung im Spiegel der Statistik, in: Joachim Brech (Hrsg.), Wohnen zur Miete, Weinheim 1981, S. 16-21</ref>

<text>6 </text><ref>Bericht der Bundesregierung über die Auswirkunge

## anystyle2neo4j.rb
require 'anystyle'
require 'active_graph'
require 'serrano'

# Connect to Neo4j.
#
# The connection settings are read from the environment so that no secret is
# stored in the script:
#   NEO4J_URL      - database URL (defaults to the instance used so far)
#   NEO4J_USER     - user name (defaults to 'neo4j')
#   NEO4J_PASSWORD - password (required; ENV.fetch raises KeyError if unset)
# NOTE(review): the password that was previously committed in this file should
# be treated as compromised and rotated.
url = ENV.fetch('NEO4J_URL', 'neo4j+s://4dcc21ca.databases.neo4j.io')
auth = Neo4j::Driver::AuthTokens.basic(ENV.fetch('NEO4J_USER', 'neo4j'), ENV.fetch('NEO4J_PASSWORD'))
# NOTE(review): the 'neo4j+s' scheme requests an encrypted connection while
# `encryption: false` asks for the opposite — confirm which one the driver honours.
ActiveGraph::Base.driver = Neo4j::Driver::GraphDatabase.driver(url, auth, encryption: false)

# setup models

## generate-sep-template.py
from gensim.models.phrases import Phrases, Phraser, ENGLISH_CONNECTOR_WORDS
from gensim.parsing.preprocessing import (
    preprocess_string,
    remove_stopwords,
    strip_numeric,
    strip_punctuation,
    strip_tags,
)
import regex as re
import pandas as pd


# Load the two pickled corpora; both dataframes have "title", "abstract",
# "published" columns.
# NOTE(review): pd.read_pickle can execute arbitrary code while unpickling, so
# these files must come from a trusted source.
df_ngram = pd.read_pickle("journal-ngram-corpus.pkl")
df_analysis = pd.read_pickle("journal-analysis.pkl")

## pocoweb.ts
import {default as fetch} from 'node-fetch';
const { pdf } = require("pdf-to-img");
import {tmpdir} from "os";
import {createWriteStream, createReadStream} from 'fs';
import * as fsp from 'fs/promises'
import * as archiver from 'archiver';
import {ArchiverError} from "archiver";
import * as path from "path";
import {Parser, Builder} from "xml2js";

## create-corpus-from-best-ocr-result.sh
#! /usr/bin/env bash

# OCR quality scoring follows the approach described in
# https://ryanfb.github.io/etc/2015/03/16/automatic_evaluation_of_ocr_quality.html
# and uses https://github.com/saffsd/langid.py
# Setup: run `pip install langid`, then place the scorelines.sh and ocrquality.rb
# scripts from the blog entry in the same directory (presumably next to this script).

# Glob selecting the PDF source files, whose names start with a DOI; adapt this for your case.
# NOTE(review): the glob is not expanded by this assignment; it only expands where
# $FILE_SELECTOR is later used unquoted — confirm against the rest of the script.
FILE_SELECTOR=/path/to/source/dir/*.pdf
# The path to the directory to which the selected documents should be copied
TARGET=/path/to/target/dir

## 10.1515_zfrs-1980-0101.xml.page.xml
<?xml version="1.0" encoding="UTF-8" standalone="no"?>
<PcGts xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
       xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"
       xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15">
    <Metadata>
        <Creator>ABBYY FineReader Engine 12</Creator>
        <Created>1970-01-01T00:00:00</Created>
        <LastChange>1970-01-01T00:00:00</LastChange>
        <Comments/>
    </Metadata>

## error.txt
docker run --rm -it -v "$PWD":/data ubma/ocr-fileformat ocr-transform abbyy page 10.1515_zfrs-1980-0101.xml 10.1515_zfrs-1980-0101.page.xml

org.xml.sax.SAXParseException; lineNumber: 1; columnNumber: 1; Premature end of file.
	at com.sun.org.apache.xerces.internal.util.ErrorHandlerWrapper.createSAXParseException(ErrorHandlerWrapper.java:203)
	at com.sun.org.apache.xerces.internal.util.ErrorHandlerWrapper.fatalError(ErrorHandlerWrapper.java:177)
	at com.sun.org.apache.xerces.internal.impl.XMLErrorReporter.reportError(XMLErrorReporter.java:400)
	at com.sun.org.apache.xerces.internal.impl.XMLErrorReporter.reportError(XMLErrorReporter.java:327)
	at com.sun.org.apache.xerces.internal.impl.XMLScanner.reportFatalError(XMLScanner.java:1472)
	at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$PrologDriver.next(XMLDocumentScannerImpl.java:1014)
	at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(XMLDocumentScannerImpl.java:602)

## 10.1515_zfrs-1980-0101.xml
<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<document xmlns="http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml" version="1.0" producer="ABBYY FineReader Engine 12" languages="" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml">
  <documentData>
    <paragraphStyles>
      <paragraphStyle id="{93BCCE1C-1547-4388-AA39-146AEF764F40}" name="Body text|1" mainFontStyleId="{DCF0C577-94EF-48F5-929F-C9402CFAA588}" role="text" align="Left" startIndent="0" leftIndent="0" rightIndent="0" lineSpacing="1197" lineSpacingRatio="1.1000000238418579" fixedLineSpacing="0">
        <fontStyle id="{DCF0C577-94EF-48F5-929F-C9402CFAA588}" baseFont="1" ff="Times New Roman" fs="9.5" backgroundColor="4278190079"/>
        <fontStyle id="{BBC2CAB0-2ACF-4618-9DCB-1E0FA6775B73}" ff="Times New Roman" fs="10." backgroundColor="4278190079"/>
        <fontStyle id="{C50

## DatabaseMaintenanceController.php
<?php

namespace app\controllers;

use yii\console\ExitCode;
use Yii;

/**
 * This is a Yii2 console controller class which can also be used standalone to run the
 * actionUpdateEncoding() method if you remove the Yii2 stuff.
	signal_phrases = [[
	"see", "cf.", "e.g.", "compare", "on", "for example",
	"on the contrary", "on the other hand",
	"generally", "similarly", "alternatively",
	"the discussion in", "regarding", "on this, see",
	"also", "although", "as discussed in", "detailed", "described", "cited in", "by",
	"first published as", "reprinted in"
	], [
	"siehe", "s.", "vgl.", "vgl",
	"vgl. nur", "vgl", "zum ganzen", "so z.b.", "so auch", "bei",
	<text>1 Ober 50% der Deutschen verbringen jeden Abend und über 70% jedes Wochenende zu Hause, vgl. </text><ref>Michael Andritzky/Gert Selle (Hrsg.) , Lembereich Wohnen Band 1, Reinbek 1979, S. 13</ref>.
	<text>2 Der Anteil der Mietwohnungen betrug 1978 63 % (</text><ref>Lothar Herberger und Mitarbeiter, Bestand und Struktur der Gebäude und Wohnungen, in: Wirtschaft und Statistik 1980, S. 283—291, 286</ref><text>), in Großstädten sind sogar mehr als 80 % der Wohnungen vermietet (</text><ref>Rudi Ulbrich, a.a.O., Anm. 3, S. 18</ref><text>). ' </text>
	<text>3 Die 1 %-Wohnungsstichprobe 1978 hat ergeben, daß einer Zahl von 24,3 Mio. Haushalten nur 23,4 Mio. Wohnungen gegenüberstehen. Hiervon stehen knapp 700000 leer und etwa 200000 dienen als Zweitwohnungen, vgl. hierzu ausführlich </text><ref>Rudi Ulbrich, Die Wohnungsversorgung im Spiegel der Statistik, in: Joachim Brech (Hrsg.), Wohnen zur Miete, Weinheim 1981, S. 16-21</ref>

	<text>6 </text><ref>Bericht der Bundesregierung über die Auswirkunge
	require 'anystyle'
	require 'active_graph'
	require 'serrano'

	# Connect to Neo4j.
	#
	# The connection settings are read from the environment so that no secret is
	# stored in the script:
	#   NEO4J_URL      - database URL (defaults to the instance used so far)
	#   NEO4J_USER     - user name (defaults to 'neo4j')
	#   NEO4J_PASSWORD - password (required; ENV.fetch raises KeyError if unset)
	# NOTE(review): the password that was previously committed in this file should
	# be treated as compromised and rotated.
	url = ENV.fetch('NEO4J_URL', 'neo4j+s://4dcc21ca.databases.neo4j.io')
	auth = Neo4j::Driver::AuthTokens.basic(ENV.fetch('NEO4J_USER', 'neo4j'), ENV.fetch('NEO4J_PASSWORD'))
	# NOTE(review): the 'neo4j+s' scheme requests an encrypted connection while
	# `encryption: false` asks for the opposite — confirm which one the driver honours.
	ActiveGraph::Base.driver = Neo4j::Driver::GraphDatabase.driver(url, auth, encryption: false)

	# setup models
	from gensim.models.phrases import Phrases, Phraser, ENGLISH_CONNECTOR_WORDS
	from gensim.parsing.preprocessing import remove_stopwords, preprocess_string, strip_tags, strip_punctuation, strip_numeric, remove_stopwords
	import regex as re
	import pandas as pd


	# both dataframes have "title", "abstract", "published" columns
	df_ngram = pd.read_pickle("journal-ngram-corpus.pkl")
	df_analysis = pd.read_pickle("journal-analysis.pkl")
	import {default as fetch} from 'node-fetch';
	const { pdf } = require("pdf-to-img");
	import {tmpdir} from "os";
	import {createWriteStream, createReadStream} from 'fs';
	import * as fsp from 'fs/promises'
	import * as archiver from 'archiver';
	import {ArchiverError} from "archiver";
	import * as path from "path";
	import {Parser, Builder} from "xml2js";
	#! /usr/bin/env bash

	# see https://ryanfb.github.io/etc/2015/03/16/automatic_evaluation_of_ocr_quality.html
	# using https://github.com/saffsd/langid.py
	# install with pip install langid and add the scorelines.sh & ocrquality.rb scripts from the blog entry in the same directory

	# The PDF source files, which start with a DOI, adapt this for your case
	FILE_SELECTOR=/path/to/source/dir/*.pdf
	# The path to the directory to which the selected documents should be copied
	TARGET=/path/to/target/dir
	<?xml version="1.0" encoding="UTF-8" standalone="no"?>
	<PcGts xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
	xsi:schemaLocation="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15 http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15/pagecontent.xsd"
	xmlns="http://schema.primaresearch.org/PAGE/gts/pagecontent/2019-07-15">
	<Metadata>
	<Creator>ABBYY FineReader Engine 12</Creator>
	<Created>1970-01-01T00:00:00</Created>
	<LastChange>1970-01-01T00:00:00</LastChange>
	<Comments/>
	</Metadata>
	docker run --rm -it -v "$PWD":/data ubma/ocr-fileformat ocr-transform abbyy page 10.1515_zfrs-1980-0101.xml 10.1515_zfrs-1980-0101.page.xml

	org.xml.sax.SAXParseException; lineNumber: 1; columnNumber: 1; Premature end of file.
	at com.sun.org.apache.xerces.internal.util.ErrorHandlerWrapper.createSAXParseException(ErrorHandlerWrapper.java:203)
	at com.sun.org.apache.xerces.internal.util.ErrorHandlerWrapper.fatalError(ErrorHandlerWrapper.java:177)
	at com.sun.org.apache.xerces.internal.impl.XMLErrorReporter.reportError(XMLErrorReporter.java:400)
	at com.sun.org.apache.xerces.internal.impl.XMLErrorReporter.reportError(XMLErrorReporter.java:327)
	at com.sun.org.apache.xerces.internal.impl.XMLScanner.reportFatalError(XMLScanner.java:1472)
	at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl$PrologDriver.next(XMLDocumentScannerImpl.java:1014)
	at com.sun.org.apache.xerces.internal.impl.XMLDocumentScannerImpl.next(XMLDocumentScannerImpl.java:602)
	<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
	<document xmlns="http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml" version="1.0" producer="ABBYY FineReader Engine 12" languages="" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml http://www.abbyy.com/FineReader_xml/FineReader10-schema-v1.xml">
	<documentData>
	<paragraphStyles>
	<paragraphStyle id="{93BCCE1C-1547-4388-AA39-146AEF764F40}" name="Body text\|1" mainFontStyleId="{DCF0C577-94EF-48F5-929F-C9402CFAA588}" role="text" align="Left" startIndent="0" leftIndent="0" rightIndent="0" lineSpacing="1197" lineSpacingRatio="1.1000000238418579" fixedLineSpacing="0">
	<fontStyle id="{DCF0C577-94EF-48F5-929F-C9402CFAA588}" baseFont="1" ff="Times New Roman" fs="9.5" backgroundColor="4278190079"/>
	<fontStyle id="{BBC2CAB0-2ACF-4618-9DCB-1E0FA6775B73}" ff="Times New Roman" fs="10." backgroundColor="4278190079"/>
	<fontStyle id="{C50
	<?php

	namespace app\controllers;

	use yii\console\ExitCode;
	use Yii;

	/**
	* This is a Yii2 console controller class which can also be used standalone to run the
	* actionUpdateEncoding() method if you remove the Yii2 stuff.