D-K-E/Epigraphic_Interoperability_Caf_Ephe.tex

## Epigraphic_Interoperability_Caf_Ephe.tex
\documentclass{beamer}
\mode<presentation>
\setbeameroption{show notes}
\usepackage{fontspec}
\usetheme{Warsaw}
\usecolortheme{beaver}
\usecolortheme{whale}
\usecolortheme{lily}
\usepackage[bibstyle=verbose,
citestyle=verbose-trad1,
isbn=false,
backend=biber]{biblatex}

\usepackage{color}

\usepackage{listingsutf8} %Xml'i latexe aktarmak için

\usepackage{bidi}%Soldan sağa, sağdan sola metin yazmak için
\addbibresource{EncodageDariusI.bib}

\setmainfont{FreeSerif}
\newfontfamily\AnaMetin{FreeSerif}
\DeclareTextFontCommand{\AnaMtnYz}{\AnaMetin}
\newfontfamily\HebrewFont{Aramaic Imperial Yeb}
\newenvironment{HebrewText}{\HebrewFont}{\par}

% From the answer of lockstep in
% -----------------------------------------------------
% SO:
% http://tex.stackexchange.com/questions/5852/beamer-footnote-text-collides-with-navigation-symbols
% Date: 2010-11-23
% Visit Date: 2017-01-26
\addtobeamertemplate{footnote}{\vspace{-6pt}\advance\hsize-0.5cm}{\vspace{6pt}}
\makeatletter
% Alternative A: footnote rule
\renewcommand*{\footnoterule}{\kern -3pt \hrule \@width 2in \kern 8.6pt}
% Alternative B: no footnote rule
% \renewcommand*{\footnoterule}{\kern 6pt}
\makeatother
% ---------------------------------------------
\title[Epigraphic Interoperability of Ancient Texts]{On the Use of Existing
  Resources for Ensuring Epigraphic Interoperability of Ancient Texts}

\subtitle{What do we do? How do we do it? How can we make it better ?}

\date[2017]{January 2017 - Caf'Ephe}
\author[D.K. Eraslan]{Doğu Kaan Eraslan \newline \texttt{dogu-kaan.eraslan@etu.ephe.fr}}
\institute[EPHE-PSL]{EA 4519 - Ecole Pratique des Hautes Etudes - Paris Science
  \& Lettres}
\logo{
  {\includegraphics[scale=0.25]{./logo-ephe-r.png}}
  {\includegraphics[scale=0.20]{./logo-psl.png}}
}


\begin{document}

\frame{\maketitle}

\section{Introduction:Encoding Ancient Languages}

\subsection{Definitions}

\begin{frame}{Definitions}

  \begin{itemize}

    {\item{\textbf{Character/Abstract Character}: The minimal semantic unit of a
        text, ex. ``e'' in ``letter''}}

    {\item{\textbf{Glyph}: Visual representation of the letter: Ex.
        {\textbf{a}}, {\textit{a}}}}
    {\note[item]{They are of the same letter but they are different
        glyphs.}}

    {\item{\textbf{Encoding/Character Encoding}: Any \emph{act} of mapping a
        character to something else}} % Encoding is an act, not the result of an
    % action. See Unicode Technical Report 17 for
    % further reading.

    {\note[item]{Mostly what you map to is bytes. Unicode calls Encoding/Character Encoding, character mapping.
        This is related to the peculiarities introduced by the uni code to the act of encoding.}}

    {\item{\textbf{Encoding Scheme}: A standard defining how the mapping
        would take place}}
    {\note[item]{This is rather complex and have several elements in
        Unicode due the abstraction level required by the code points.}}

    {\item{\textbf{(Coded) Character Set}: A set of encoded, meaning mapped, characters.}}
    {\note[item]{This used to be thought as a set of characters that are mapped to
        bytes. This constitutes the middle ground between the letters as we
        know it and what machine, computer, reads. Unicode added other
        abstraction layers to it.}}

    {\item{\textbf{Epigraphic Interoperability}: Ability to transform the
        encoding scheme of an encoded epigraphic phenomenon to another encoding
        scheme losslessly}}

  \end{itemize}


\end{frame}


\subsection{Common Historical Elements}

\begin{frame}{A very very very brief overview}

  \begin{itemize}

    {\item{Most of the work started around the end of 80s and during 90s: CAL Code
        (1987), MdC (1988), Cuneiform Encoding (1999-2000)}}

    {\item{All of the teams had specialists from computer science and their
        particular field.}}
    {\item{All of the encoding schemes are still partially used.}}

  \end{itemize}

\end{frame}


\subsection{Encoding Schemes for Ancient Languages}

\subsubsection{C-ATF}

\begin{frame}{C-ATF}

  \begin{itemize}

    {\item{What does it store ?}}

    \begin{itemize}

      {\item{Cuneiform texts: akkadian, elamite, ugarite, etc.}}
      {\item{Ex.}}
      \begin{figure}

        {
          \includegraphics[scale=0.7]{./resimler/C-ATF-P383090-d-Parça.png}
        }
        {
          \caption{CDLI no: P383090{\footnote[frame]{\cite{cdli_p383090:_2017}}}; PF 404{\footnote[frame]{\cite[p., 163]{hallock_pft:_1969}}}}
        }

      \end{figure}

      \begin{itemize}
          {\item{Hallock Transliteration: 20
              ZÍD.DA.lg kur-min}}
          {\item{C-ATF:  2(u) \_zi3-da-mesz\_ kur-min2}}

        \end{itemize}

    \end{itemize}

  \end{itemize}

\end{frame}

\begin{frame}{C-ATF}

  \begin{itemize}

    {\item{How does it store it?}}
          \begin{itemize}
            {\item{ASCII.}}
            \begin{itemize}
            {\item{zi3 instead of zíd or mesz instead of lg which stands for me{\v{s}} in elamite.}}
            \end{itemize}
            {\item{Special characters for indicating sign functions.}}
            \begin{itemize}
            {\item{``\_'' for indicating logograms in non sumerian texts.}}
            \end{itemize}
          {\item{Encoding store the readings of the glyphs, no effort to map the
              glyph to byte directly}}
          {\note[item]{this was done so by the unicode block of cuneiform signs
              later on.}}

          \end{itemize}

    \end{itemize}

  \end{frame}

  \subsubsection{Bavant XML-Elamite Standard}

  \begin{frame}[containsverbatim, squeeze]{Bavant XML-Elamite Standard}
    \begin{columns}[T]
      \begin{column}{4cm}
    \begin{itemize}

      {\item{What does it store ?}}

      \begin{itemize}

        {\item{Elamite texts.}}
        \begin{figure}[h]
          {\includegraphics[scale=.25]{./resimler/elamBavant1.png}}
          {\includegraphics[scale=.25]{./resimler/elamBavant2.png}}
          {
            \caption{
              DNa §1,1\footnote[frame]{
                \cite[plate 32]{schmidt_persepolis_1970}%
              }%
            }%
          }%
         \end{figure}

      \end{itemize}

    \end{itemize}
  \end{column}
  \begin{column}{10cm}
    \begin{lstlisting}
      <inscription name="DNa">
      <metadata>
      <king dates="522-486" name="Darius I" />
      <source></source>
      <url>http://www.um.es/ipoa/cuneiforme/elamita/archivosreales/dario1/dario_i_dna.htm</url>
      </metadata><notes/>
      <section id='DNa:§1'>
      <T1>(1) d.na-ap ir-šá-ir-ra</T1>
      <T2>(1) Ahuramazda es el gran dios</T2>
      <T3>d.na-ap ir-šá-ir-ra</T3>
      <T4>nap iršara</T4></section>
      </inscription>
    \end{lstlisting}
  \end{column}
\end{columns}

  \end{frame}

  \begin{frame}{Bavant XML-Elamite Standard}


    \begin{itemize}
      {\item{How does it store it ?}}

      \begin{itemize}
        {\item{XML with UTF-8 encoding: ir-šá-ir-ra}}

        {\item{Traditional conventions of elamology for glyph functions: d. for
            DINGIR as determinative.}}
        {\item{Encoding store the readings of the glyphs, no effort to map the
            glyph to byte directly.}}
        {\note[item]{There is no Unicode block for elamite cuneiform signs yet.}}

      \end{itemize}

    \end{itemize}

  \end{frame}


  \subsubsection{MdC}

  \begin{frame}{MdC}

        \begin{itemize}
          {\item{What does it store ?}}

          \begin{itemize}

            {\item{Ancient Egyptian hieroglyphic texts. In theory it can be extended
                to hieratic.}}
       \begin{figure}
          {\includegraphics[scale=.90]{./resimler/MdC-example.eps}}
          {\caption{Jansen-Winkeln, no. 57103, line 4\footnote[frame]{
                \cite[p., 460]{jansen-winkeln_26._2014-1}
              }%
            }%
          }%
        \end{figure}

        {\item{Normal Transliteration: {\textsuperscript{4}
              [---] (.w) jn.n xr⸗tw nH.t(?) pn rd(i) xr⸗tw}}}
        {\item{MdC:{\detokenize{ |4
                -[[-+b...+s-]]_-.:3_-W25:N35_-Aa1:D21_-X1-G43_-G21&"(?)"_}
              \newline
              \detokenize{-X1_-Q3:N35_-D21:D36*Z1_-Aa1:D21_-X1:G43_-!}}
          }}
          \end{itemize}

        \end{itemize}


  \end{frame}

  \begin{frame}{MdC}
    \begin{itemize}

      {\item{How does it store it ?}}

      \begin{itemize}

        {\item{ASCII: W25, D21, Aa1, etc.}}
        {\item{Special characters for indicating relative glyph positions: ``:''
          for vertical grouping, ``*'' for horizontal grouping, etc.}}
        {\item{Encoding store the glyphs and makes an effort to give the state
            of conservation about the individual glyphs of the signs: each glyph
            has a specific denomination like W25, Aa1, etc.}}

      \end{itemize}

    \end{itemize}

  \end{frame}


  \subsubsection{CAL Code}

  \begin{frame}{CAL Code}

    \begin{itemize}

      {\item{What does it store ?}}

      \begin{itemize}

        {\item{Aramaic texts, Syriac texts, Hebrew texts, etc.}}
        \begin{RTL}
          \begin{HebrewText}
            {\LRE{
                {\AnaMetin{TAD C 3-7: Col A:Recto, 2:01\footnote[frame]{
                      \cite{cal_porten_tad_c3-7}
                    }
                  }
                }
              }} מנדתא זי גבי מנה וע[ביד ע]ל בית מלכא
            \end{HebrewText}

          \end{RTL}
        {\item{Cal Code: mndt) zy gby mnh w([byd (]l byt mlk)}}

      \end{itemize}

    \end{itemize}

  \end{frame}
  \begin{frame}{CAL Code}
    \begin{itemize}
      {\item{How does it store it?}}
      \begin{itemize}

        {\item{ASCII}}
        {\item{Special characters not reserved to a specific area}}
        {\item{Encoding stores the characters and makes an effort to give the
            state of conservation about the individual characters of the
            signs.}}

      \end{itemize}
    \end{itemize}
  \end{frame}

  \subsubsection{EpiDoc}

  \begin{frame}
    \frametitle{EpiDoc}
    \begin{columns}[T]

      \begin{column}{5cm}

        \begin{itemize}

          {\item{What does it store ?}}
          \begin{itemize}
            {\item{Desires to be applicable to all the ancient texts but
                currently the major use case is greco-roman corpora}}\newline
            \AnaMtnYz{
        Ἀπόλλω- \newline
        νει θεῷ\footnote[frame]{
          \cite{crowthe_mamaXI_314:_2012}
         }
        }

          \end{itemize}
        \end{itemize}

  \end{column}

  \begin{column}[t]{13cm}
    \begin{semiverbatim}
      {\AnaMtnYz{
      <div type="edition"><head>edition</head>\newline
      <ab><lb xml:id="line-1" n="1"/> \newline
      <persName type="divine"> \newline
      <name nymRef="Ἀπόλλων">Ἀπόλλω- \newline
      <lb xml:id="line-2" n="2"/>\newline
      νει</name>\newline
      <w lemma="θεός">θεῷ</w>\newline
					</ab></div>
      }}
\end{semiverbatim}
   \end{column}

  \end{columns}

  \end{frame}

  \begin{frame}
    \begin{itemize}
      {\item{How does it store it?}}
      \begin{itemize}

        {\item{Utf-8}}
        {\item{XML mark-up language in TEI flavor.}}
        {\item{Degree of conservation depends on the project's goals. Anything
            goes as long as it conforms to the schema.}}

      \end{itemize}
    \end{itemize}
  \end{frame}

  \subsection{A Comparison of the Encoding Schemes}

  \subsubsection{Major Trends/Patterns and Similarities}

  \begin{frame}{Major Trends/Patterns and Similarities}
    \begin{itemize}
      {\item{Characters are mapped to an intermediary element instead of
          bytes.}}
      \begin{itemize}
      {\item{They are mapped to:}}
      \begin{itemize}
      {\item{Either semantic intermediary element: A transliteration: disz,
          mesz, mlk(, etc.}}
      {\item{Or non semantic intermediary element: An arbitrary code: W23, Aa1,
          F54, etc.}}
    \end{itemize}
  \end{itemize}
    {\item{Semantic treatment of the epigraphic material.}}
    {\note[item]{
THE major trend, except maybe MdC and EpiDoc, these encoding schemes are designed
to restore text as it is observed by humans.
The semantic hierarchy imposed by the observer creates substantial differences
in the retention rate of the data held by the original document, meaning since
we wanted conserve what the text said so much we conceived systems that conserve
in detail how what is said is said by the text and gave less attention to the
actual state or rather factual state of the document in question.
This translates as less precision about the state of conservation of a document and detailed precision about the semantics of the conserved sections of the document.}}

  \end{itemize}


  \end{frame}


  \subsubsection{Differences}

  \begin{frame}{Differences}

    \begin{itemize}

      {\item{The nature of the intermediary element}}
      {\note[item]{Whether it is semantic or not.}}
      {\item{Character treatment}}
      \begin{itemize}
        {\item{Character-only treatment vs semi-glyphtic treatment}}
        \end{itemize}
      {\note[item]{a clear difference is that some encoding schemes stores information about the glyphs of the signs and some store sign readings. MdC, or rather its Jsesh variant, with its {\detokenize{\shadingNUMBER}} mark, imprecise as it is, attempts, at least to give us a rough idea about the damage at the glyphtic level of a sign. This is not the case for example for C-ATF. In c-atf you would have no possibility of expressing which vertical of SUNKI is damaged.}}
    \end{itemize}


  \end{frame}


  \section{The Interoperability Problem}

  \begin{frame}{The Interoperability Problem}
    \begin{itemize}
      {\item{Semantic treatment of visual phenomenons}}
      {\note[item]{In one sentence: the above mentioned encoding schemes
        try to represent visual phenomenons, as damage or sign positions, by
        using semantic values, characters, letters, signs as a base. Whereas
        encoding of visual phenomenons require glyph level encoding, if it is to
      be reliable on a multilingual document. MdC being the exception. Arbitrary
    codes made it possible to store glyph variations in more or less reliable
    reliable fashion. The encoding schemes don't posses a unified visual
    decomposibility level for their characters. Ex. MdC supposes that every sign
  is visually decomposable to 4 parts, whereas c-atf doesn't admit such level.
  This results in distinct representations of the same document throughout
  different encoding schemes. Formal difference in representation is not a problem per
  se, but the amount of information regarding to the state of conservation of
  the document would be different which would lead to incompatible epigraphic
  assessment of the document. Practically speaking we would not be able to
  understand whether the document is well preserved in overall or not by looking
  at the encoding.}}
\only<1>{
\begin{figure}
{\includegraphics[scale=0.6]{./resimler/sunki.png}}
{\caption{SUNKI\footnote[frame]{
      \cite{bavant_svg_tool:_2014}
    }
    ?}
  }
\end{figure}
}
\only<2>{
\begin{itemize}

{\item{\o : the letter o is damaged (Cal Code, C-atf, Bavant-Elamite, EpiDoc)}}
{\item{\o : the letter o is divided into half (MdC, EpiDoc)}}
{\item{\o : a shape that can be attributed to ``o'', U+006F, with a 97\% precision intersects with a shape ``/''
    (x,y)-(x,y) points}}
\end{itemize}
}
    \end{itemize}


  \end{frame}


  \section{Possible Solutions with Existing Resources}

  \subsection{Unicode} % This introduced textness to machine by mapping semantic
  % units to bytes through ad-hoc code points.

  \begin{frame}{Unicode}
    \begin{itemize}
      {\item{What is it?}}

      \begin{itemize}
        {\item{A system to map characters to bytes through code points}}
        {\note[item]{Characters are interpreted as semantic units.}}
        {\note[item]{Code points are arbitrary constructions}}
        {\note[item]{The actual mapping of characters to code points, and code
            points to bytes require two different reversible procedures that
            make the system rather complex.}}

      \end{itemize}

      {\item{How can it be used ?}}
      \begin{itemize}
        {\item{Stable class identifiers for glyphs}}
        {\note[item]{Unicode depends on a semantic interpretation of characters
            in the attribution of code points, in other words, Unicode code
            points define when does what we read become understandable by the
            machine. It helps us to compute what we understand by the glyphs.
            Thus making them very stable as class identifiers for attributing
            glyphs to.}}

      \end{itemize}
    \end{itemize}

  \end{frame}


  \subsection{EpiDoc} % This provide the general framework for the conservation.

  \begin{frame}
    \frametitle{EpiDoc}
    \begin{itemize}
      {\item{What can it offer as of now ?}}
      \begin{itemize}
        {\item{Syntactic influences between contemporary languages}}
        {\note[item]{This could even be used for dating documents relatively.}}
        {\item{Extraction of phonetic data from different languages for a
            relative lemma}}
        {\item{All the advantages and disadvantages of using an XML based encoding scheme}}
      \end{itemize}

    \end{itemize}

  \end{frame}
  \begin{frame}
    \frametitle{EpiDoc - Disadvantages}

    \begin{itemize}
      {\item{Time consuming when done by hand}}
      {\item{The hierarchy of the elements are sometimes too strict to express
          what is happening accurately.}}
      {\note[item]{Ex. OP paragraph Elamite Paragraph.}}
      {\item{Requires a lot of planning on the "namespace" and "tag usage" for a reliable outcome}}
      {\item{The number of attributes of the <w> element}}
      {\note[item]{The number of attributes that a <w> element can take is not
          enough for very fine tuned linguistic analysis but this can be
          overcome by other methods without infringing the EpiDoc grammar.}}
      {\item{No Graphical User Interface}}
    \end{itemize}

  \end{frame}

  \begin{frame}{EpiDoc - Advantages}


    \begin{itemize}

      {\item{Text based format = easily exchangeable within peers}}
      {\item{XML based format = easily exchangeable with outside circles, such as
          computer scientist for help in certain issues}}
      {\item{LOTS of tools are available for parsing in all major computer
          languages}}
            {\note[item]{Giving a lot of control to researcher on how and what to
          search for in the encoded text}}
      {\item{Human readable to some extent}}
      {\item{Style Sheets}}
      {\note[item]{With the help of style sheets, the researcher can convert its
          encoded text into any form for presentation purposes}}
      {\item{Ease of reuse of the already encoded material}}
      {\item{No Graphical User Interface}}

    \end{itemize}

  \end{frame}

  \subsection{SVG} % Visual content representation easy integration to EpiDoc.
  % Possible to map to unicode points.

  \begin{frame}{SVG}

    \begin{itemize}
      {\item{What is it?}}
      \begin{itemize}
        {\item{A language to describe 2d graphics with XML-format}}
        {\note[item]{Scalable Vector Graphics are basically XML elements
            describing how a graphic should be.
          It is a joint product of web developer community and design
          community, making it extremely reliable for expressing any type of
          graphic that require cross platform interchange.}}

      \end{itemize}

      {\item{How can it be used ?}}
      \begin{itemize}
        {\item{Consistent glyph representation}}
        {\note[item]{Scalable Vector Graphics are: Scalable, that is whether you
          project it to a building or display it in your phone, the quality of
          the image would stay the same; Vectors, that is geometric objects,
          paths, curves, etc, this concerns how the elements that constitute the
        graph is stored. With popular raster formats, such as, png, jpg, the
        elements that constitute the graph are stored in tiny squares called
        pixels. The more pixel there is the more constitutive element shall be
        for the graph, which in turn reflects upon the quality of the image. The
      vectoral nature of the constitutive elements stored in svgs are what makes
    it scalable. Raster images are like books about a subject. A book about
    flowers contains some information on flowers. The more there is on flowers
    in the book, the thicker and better the book gets. Vector graphs are like
    triangles, there is nothing more to it then what is already available. It is
    in a sense complete. SVG also permit mixing of both.}}
{\note[item]{By consistent in this case, i understand the capacity to represent
    all of the glyphs with the same unit by use of SVGs, making them decomposable at the same
    level, since SVGs would give us a possibility to express the damages and
    the glyphs with the same measure no matter what the language of the
    character is. This is the key to ensure the epigraphic interoperability,
    that is expressing the visual phenomenon without any reference to its semantic
    correspondent}}

\end{itemize}

    \end{itemize}


  \end{frame}


  \section{Concluding Remarks on Long term Textual Data Conservation}
  % 1 Textual Data = Visual Data
  % 2 We should not opt for solution that is durable but for a solution
  % that is easy to convert without loss of information
  % 3 Data outlives the software, thus encoding schemes should be as much software
  % independant as possible.

  \begin{frame}{Some Concluding Remarks}
    \begin{itemize}
      {\item{Textual Data \emph{is} visual data.}}
      {\note[item]{Can't read without letters there to see. Any oral citation is
          a performance.}}
      {\item{Huge classic: Data outlives software.}}
      {\note[item]{Software Independence is a must, this was an important
          problem in Egyptology for example.}}
      {\item{No One Yes º}}
      {\note[item]{Planning an encoding project which involves 1-n, n-1, n-n
          relations is like planning a database, thus all the good practices of
          modelling a database is implied in the encoding project. A good
          practice for any database is to have unique identifiers for each row
          in a column. Whether these identifiers are semantic values or not is
          not relevant as long as they are unique, but it is also true that it
          is rather difficult to create unique labels with semantic values in a
          consistent manner, thus using abstract values such as integers work
          rather well for identifying rows. This is what basically Unicode did
          to all the letters of all the languages.}}

    \end{itemize}


  \end{frame}

  \begin{frame}[squeeze, shrink=40]{Further Reading}

    \begin{itemize}

    {\item{\cite{gippert_multilingual_encoding:_1999}}}
    {\item{\cite{tinney_cdli_atf:_2014}}}
    {\item{\cite{anderson_ice_report_1:_2000}}}
    {\item{\cite{feuerheim_unicode_proposal:_2004}}}
    {\item{\cite{kaufman_cal_manual:_1987}}}
    {\item{\cite{nederhof_manuel_2013}}}
    {\item{\cite{bodard_epidoc_guidelines:_2016}}}
    {\item{\cite{whistler_unicode_character_encoding_model:_2008}}}
    {\item{\cite{niccolucci_cidoc_crm_epigraphy:_2016}}}

  \end{itemize}

  \end{frame}


\end{document}

%%% Local Variables:
%%% mode: latex
%%% TeX-engine: xetex
%%% TeX-auto-save: t
%%% ispell-local-dictionary: "english"
%%% coding: utf-8
%%% TeX-master: t
%%% End: