Skip to content

Instantly share code, notes, and snippets.

@h20y6m
Last active May 6, 2022 11:56
Show Gist options
  • Save h20y6m/6449d1d5d29a71620d19f29f881a0549 to your computer and use it in GitHub Desktop.
Save h20y6m/6449d1d5d29a71620d19f29f881a0549 to your computer and use it in GitHub Desktop.
l3str-convert with (u)pLaTeX
%#!platex -no-guess-input-enc -kanji=utf8
% -*- coding: utf-8 -*-
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% l3names
%
\ExplSyntaxOn
% \tex_toucs:D is already provided by expl3 as of release 2022-04-10,
% so the manual alias below is no longer needed and stays commented out.
%\tex_global:D \tex_let:D \tex_toucs:D \toucs
\ExplSyntaxOff
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% l3str-convert
%
\ExplSyntaxOn
% Hook (predicate): decides whether the code point #1 takes the
% "extended character" path in \__str_encode_utf_viii_char:n below.
% The default always answers false, so nothing is treated as extended.
\cs_new:Npn \__str_encode_if_extended_char_p:n #1 { \c_false_bool }
% Hook: what to emit for a code point accepted by the predicate above.
% The default emits nothing.
\cs_new:Npn \__str_encode_extended_char:n #1 { }
% Globally redefine the per-character UTF-8 encoder so that it consults
% the two hooks first.  #1 is the code point, given as a number.
% NOTE(review): the \else: branch is presumably a copy of the stock
% l3str-convert definition -- confirm against the kernel source when
% updating expl3.
\cs_gset:Npn \__str_encode_utf_viii_char:n #1
{
% Extended character: delegate entirely to the hook.
\if_predicate:w \__str_encode_if_extended_char_p:n {#1}
\__str_encode_extended_char:n {#1}
\else:
% Ordinary character: hand #1 to the UTF-8 loop together with four
% brace pairs, delimited by \s__str_stop.
\__str_encode_utf_viii_loop:wwnnw #1 ; - 1 + 0 * ;
{ 128 } { 0 }
{ 32 } { 192 }
{ 16 } { 224 }
{ 8 } { 240 }
\s__str_stop
\fi:
}
% Hook (predicate): decides whether the token #1 takes the "extended
% character" path in \__str_decode_utf_viii_start:N below.
% The default always answers false.
\cs_new:Npn \__str_decode_if_extended_char_p:N #1 { \c_false_bool }
% Hook: what to emit (after \s__str) for a token accepted by the
% predicate above.  The default emits nothing.
\cs_new:Npn \__str_decode_extended_char:N #1 { }
% Globally redefine the UTF-8 decoder entry point so that it consults
% the two hooks before the byte-oriented logic.  #1 is one token of the
% string being decoded.
\cs_gset:Npn \__str_decode_utf_viii_start:N #1
{
% #1 is always emitted first, whichever branch is taken afterwards.
#1
\if_predicate:w \__str_decode_if_extended_char_p:N #1
% Extended character: emit \s__str followed by whatever the hook
% produces for #1.
\s__str
\__str_decode_extended_char:N #1
\else:
% Character code below "C0: not a UTF-8 lead byte.
\if_int_compare:w `#1 < "C0 \exp_stop_f:
\s__str
\if_int_compare:w `#1 < "80 \exp_stop_f:
% Below "80: the character code itself is the result.
\int_value:w `#1
\else:
% "80--"BF where a new character should start: raise the str_extra
% and str_error flags and emit the replacement character.
\flag_raise:n { str_extra }
\flag_raise:n { str_error }
\int_use:N \c__str_replacement_char_int
\fi:
\else:
% "C0 and above: pass (`#1 - "C0) on to the continuation function.
\exp_after:wN \__str_decode_utf_viii_continuation:wwN
\int_value:w \int_eval:n { `#1 - "C0 } \exp_after:wN
\fi:
% NOTE(review): this extra \exp_after:wN appears to mirror the one on
% the branch above, needed because the wrapper adds a second \fi: to
% hop over; the original author marked it as uncertain ("???").
\exp_after:wN % ???
\fi:
\s__str
\__str_use_none_delimit_by_s_stop:w {"80} {"800} {"10000} {"110000} \s__str_stop
% Continue with the next token of the string.
\__str_decode_utf_viii_start:N
}
\ExplSyntaxOff
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% l3ptex
%
\ExplSyntaxOn
% Scratch macro; declared empty here and given a temporary meaning
% inside the group below.
\cs_new_protected:Npn \__ptex_tmp:w { }
% (upTeX only) Declare one implicit kanji-character token for each
% kcatcode value 16--19, for use in \if_catcode:w comparisons.
\sys_if_engine_uptex:T
{
\group_begin:
% \__ptex_tmp:w <control sequence> {<kcatcode>}
% Defines #1 (with \cs_new_eq:NN) as the character token that
% \tex_Ucharcat:D produces for character code (\tex_jis:D "2121) with
% kcatcode #2.  The \exp_after:wN pair expands \tex_Ucharcat:D before
% \cs_new_eq:NN reads its second argument.
\cs_set:Npn \__ptex_tmp:w #1#2
{
\exp_after:wN \cs_new_eq:NN \exp_after:wN #1
\tex_Ucharcat:D
\int_eval:n { \tex_jis:D "2121 } ~
\int_eval:n {#2} \scan_stop:
}
\__ptex_tmp:w \c_kcatcode_kanji_token { 16 }
\__ptex_tmp:w \c_kcatcode_kana_token { 17 }
\__ptex_tmp:w \c_kcatcode_symbol_token { 18 }
\__ptex_tmp:w \c_kcatcode_hangul_token { 19 }
% Closing the group discards the temporary meaning of \__ptex_tmp:w.
\group_end:
}
% (upTeX only) Get the kcatcode of a token.
% \__ptex_char_kcatcode:N <token> expands to 16, 17, 18 or 19 according
% to which of the \c_kcatcode_..._token constants <token> matches under
% \if_catcode:w, and to 15 when it matches none of them.
\sys_if_engine_uptex:T
{
\cs_new:Npn \__ptex_char_kcatcode:N #1
{
% \exp_not:N stops #1 from being expanded by \if_catcode:w.
\if_catcode:w \exp_not:N #1 \c_kcatcode_kanji_token
16
\else:
\if_catcode:w \exp_not:N #1 \c_kcatcode_kana_token
17
\else:
\if_catcode:w \exp_not:N #1 \c_kcatcode_symbol_token
18
\else:
\if_catcode:w \exp_not:N #1 \c_kcatcode_hangul_token
19
\else:
% No kanji-type token matched.
15
\fi:
\fi:
\fi:
\fi:
}
}
% Definition of the "ptex+utf8" encoding.
% Encoder: run the utf8 encoder with the two extended-character hooks
% bound to the kanji-aware implementations.  The bindings are local to
% the group, so the hooks revert to their previous meaning afterwards.
\cs_new_protected:cpn { __str_convert_encode_ptex+utf8: }
  {
    \group_begin:
      \cs_set_eq:NN \__str_encode_extended_char:n
        \__str_encode_kanji_char:n
      \cs_set_eq:NN \__str_encode_if_extended_char_p:n
        \__str_encode_if_kanji_char_p:n
      \use:c { __str_convert_encode_utf8: }
    \group_end:
  }
% Predicate \__str_encode_if_kanji_char_p:n {<code point>}:
% is the Unicode code point #1 to be handled as a Japanese character?
% The definition depends on the internal kanji encoding, detected by
% checking whether \tex_jis:D "2121 yields "3000.
\int_compare:nNnTF { \tex_jis:D "2121 } = { "3000 }
  {
    % upTeX with Unicode as internal encoding.
    \prg_new_conditional:Npnn \__str_encode_if_kanji_char:n #1 { p }
      {
        % \kcatcode of 0--127 is not guaranteed to be 15, but ASCII
        % (0--127) is never a Japanese character, so reject it first.
        \int_compare:nNnTF {#1} < { "80 }
          { \prg_return_false: }
          {
            % kcatcode 15 means "not a Japanese character".
            \int_compare:nNnTF { \tex_kcatcode:D #1 } = { 15 }
              { \prg_return_false: }
              { \prg_return_true: }
          }
      }
  }
  {
    % pTeX, or upTeX with EUC/SJIS as internal encoding.
    \prg_new_conditional:Npnn \__str_encode_if_kanji_char:n #1 { p }
      {
        % \ucs returns -1 when the code point cannot be converted to
        % the internal kanji code.
        \int_compare:nNnTF { \tex_ucs:D #1 } < { 0 }
          { \prg_return_false: }
          {
            % Only upTeX can report kcatcode 15 here; on pTeX this
            % comparison is always false.
            \int_compare:nNnTF { \tex_kcatcode:D \tex_ucs:D #1 } = { 15 }
              { \prg_return_false: }
              { \prg_return_true: }
          }
      }
  }
% \__str_encode_kanji_char:n {<code point>}:
% expands to the Japanese character token for Unicode code point #1.
% The definition depends on the internal kanji encoding, detected by
% checking whether \tex_jis:D "2121 yields "3000.
\int_compare:nNnTF { \tex_jis:D "2121 } = { "3000 }
  {
    % upTeX with Unicode as internal encoding.
    \cs_new:Npn \__str_encode_kanji_char:n #1
      {
        % \Uchar always yields a Latin character token for codes
        % 128--255, whereas a Japanese character token is required, so
        % \Ucharcat is used with an explicit kcatcode instead.
        \int_compare:nNnTF { \tex_kcatcode:D #1 } > { 15 }
          {
            % Keep the kcatcode currently assigned to #1.
            \tex_Ucharcat:D #1 \exp_stop_f: \tex_kcatcode:D #1 \exp_stop_f:
          }
          {
            % Otherwise fall back to kcatcode 18.
            \tex_Ucharcat:D #1 \exp_stop_f: 18 \exp_stop_f:
          }
      }
  }
  {
    % pTeX, or upTeX with EUC/SJIS as internal encoding: convert the
    % code point with \ucs and build the token with \Uchar.
    \cs_new:Npn \__str_encode_kanji_char:n #1
      { \tex_Uchar:D \tex_ucs:D #1 \exp_stop_f: \exp_stop_f: }
  }
% Decoder for the "ptex+utf8" encoding: run the utf8 decoder with the
% two extended-character hooks bound to the kanji-aware
% implementations.  The bindings are local to the group, so the hooks
% revert to their previous meaning afterwards.
\cs_new_protected:cpn { __str_convert_decode_ptex+utf8: }
  {
    \group_begin:
      \cs_set_eq:NN \__str_decode_extended_char:N
        \__str_decode_kanji_char:N
      \cs_set_eq:NN \__str_decode_if_extended_char_p:N
        \__str_decode_if_kanji_char_p:N
      \use:c { __str_convert_decode_utf8: }
    \group_end:
  }
% Predicate \__str_decode_if_kanji_char_p:N <token>:
% is the token #1 a Japanese character?
% The definition depends on the internal kanji encoding, detected by
% checking whether \tex_jis:D "2121 yields "3000.
\int_compare:nNnTF { \tex_jis:D "2121 } = { "3000 }
  {
    % upTeX with Unicode as internal encoding: a token is Japanese
    % when \__ptex_char_kcatcode:N reports a value above 15.
    \prg_new_conditional:Npnn \__str_decode_if_kanji_char:N #1 { p }
      {
        \int_compare:nNnTF { \__ptex_char_kcatcode:N #1 } > { 15 }
          { \prg_return_true: }
          { \prg_return_false: }
      }
  }
  {
    % pTeX, or upTeX with EUC/SJIS as internal encoding: a token is
    % Japanese when its character code is "100 or larger.
    \prg_new_conditional:Npnn \__str_decode_if_kanji_char:N #1 { p }
      {
        \int_compare:nNnTF { `#1 } < { "100 }
          { \prg_return_false: }
          { \prg_return_true: }
      }
  }
% Convert the Japanese character token #1 to its Unicode code point.
% The definition depends on the internal kanji encoding, detected by
% checking whether \tex_jis:D "2121 yields "3000.
% Neither variant terminates the number it starts; in
% \__str_decode_utf_viii_start:N the call is followed directly by
% \else:, so the caller's context ends the number.
\int_compare:nNnTF { \tex_jis:D "2121 } = { "3000 }
{
% upTeX with unicode: the character code is used as is.
\cs_new:Npn \__str_decode_kanji_char:N #1
{ \int_value:w `#1 }
}
{
% pTeX or upTeX with euc/sjis: the character code is passed through
% \toucs.
\cs_new:Npn \__str_decode_kanji_char:N #1
{ \tex_toucs:D `#1 }
}
% Register two aliases for the ptex+utf8 encoding, in this order:
%   ptex    - alternative (short) encoding name;
%   default - makes ptex+utf8 the default encoding.
\clist_map_inline:nn { ptex , default }
  { \prop_gput:Nnn \g__str_alias_prop {#1} { ptex+utf8 } }
\ExplSyntaxOff
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
\ExplSyntaxOn
% Round-trip demonstration: each sample is converted from the default
% encoding to utf16/hex (and back), and the result is written to the
% terminal.  The "% =>" lines record the output noted by the author.
%
% Sample 1, default -> utf16/hex.
\tl_set:Nn \l_tmpa_tl {ABC~äëïöü~αβγ~あいうえお~日本語~☀☁☂☃}
\exp_args:NNV \str_set_convert:Nnnn \l_tmpa_str \l_tmpa_tl { default } { utf16/hex }
\iow_term:x { \l_tmpa_str }
% => FEFF004100420043002000E400EB00EF00F600FC002003B103B203B300203042304430463048304A002065E5672C8A9E00202600260126022603
% Sample 1, utf16/hex -> default.
\tl_set:Nn \l_tmpa_tl { FEFF004100420043002000E400EB00EF00F600FC002003B103B203B300203042304430463048304A002065E5672C8A9E00202600260126022603 }
\exp_args:NNV \str_set_convert:Nnnn \l_tmpa_str \l_tmpa_tl { utf16/hex } { default }
\iow_term:x { \l_tmpa_str }
% Two alternative outputs are recorded for this step.
% NOTE(review): presumably they belong to different engines or internal
% encodings (characters that cannot become kanji tokens stay as UTF-8
% bytes) -- confirm which line belongs to which setup.
% => ABC ^^c3^^a4^^c3^^ab^^c3^^af^^c3^^b6^^c3^^bc αβγ あいうえお 日本語 ☀☁☂☃
% => ABC ^^c3^^a4^^c3^^ab^^c3^^af^^c3^^b6^^c3^^bc αβγ あいうえお 日本語 ^^e2^^98^^80^^e2^^98^^81^^e2^^98^^82^^e2^^98^^83
% Sample 2 (characters from the range U+00A7--U+00F7), default -> utf16/hex.
\tl_set:Nn \l_tmpa_tl {§¨°±´¶×÷}
\exp_args:NNV \str_set_convert:Nnnn \l_tmpa_str \l_tmpa_tl { default } { utf16/hex }
\iow_term:x { \l_tmpa_str }
% => FEFF00A700A800B000B100B400B600D700F7
% Sample 2, utf16/hex -> default.
\tl_set:Nn \l_tmpa_tl { FEFF00A700A800B000B100B400B600D700F7 }
\exp_args:NNV \str_set_convert:Nnnn \l_tmpa_str \l_tmpa_tl { utf16/hex } { default }
\iow_term:x { \l_tmpa_str }
% => §¨°±´¶×÷
\ExplSyntaxOff
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Declare the document metadata with the dvipdfmx backend, using
% whichever interface the installed LaTeX provides.
\ifdefined\DocumentMetadata
  % LaTeX2e 2022-06-01: \DocumentMetadata is available directly.
  \DocumentMetadata{backend = dvipdfmx}
\else
  % LaTeX2e 2021-11-15: load the testphase package and use its command.
  \RequirePackage{pdfmanagement-testphase}
  \DeclareDocumentMetadata{backend = dvipdfmx}
\fi
\ExplSyntaxOn
% pdfmanagement-testphase resets the "default" alias to utf8 (the
% original author did not know why), so point it back at ptex+utf8.
\prop_gput:Nnn \g__str_alias_prop { default } { ptex+utf8 }
\ExplSyntaxOff
% Choose the document class from the internal kanji encoding:
% \jis"2121 yields "3000 only when the internal encoding is Unicode.
\ifnum\jis"2121="3000
% upLaTeX with unicode
\documentclass[dvipdfmx]{ujarticle}
\else
% pLaTeX or upLaTeX with euc/sjis
\documentclass[dvipdfmx]{jarticle}
\fi
% hyperref is loaded first, then pxjahyper.
\usepackage{hyperref}
\usepackage{pxjahyper}
% The PDF title mixes ASCII, accented Latin, Greek, kana, kanji and
% symbol characters, like the first sample of the conversion demo.
\hypersetup{pdftitle = {ABC äëïöü αβγ あいうえお 日本語 ☀☁☂☃}}
\begin{document}
\section{はじめに}
あいうえお。
\section{つぎに}
かきくけこ。
\section{さいごに}
わをん。
\end{document}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment