Skip to content

Instantly share code, notes, and snippets.

@rbitr
Created March 10, 2024 19:38
Show Gist options
  • Save rbitr/3caffae3fcd4e7b116a04629621adb57 to your computer and use it in GitHub Desktop.
Save rbitr/3caffae3fcd4e7b116a04629621adb57 to your computer and use it in GitHub Desktop.
Encode and decode Unicode text using a byte-level encoding
!> Demo driver: builds the byte-to-symbol table, encodes a sample string
!> byte by byte, prints the encoded form, and decodes it again.
program pretokenize
  ! NOTE(review): `implicit none` is not enabled at program scope because it
  ! would also apply to the internal procedures below through host
  ! association; turn it on once each of them declares all of its locals.
  character(len=:), allocatable :: result, orig
  ! Byte-level symbol table shared with the internal procedures:
  ! element b+1 is the (blank-padded) symbol for byte value b.
  character(len=2), allocatable :: c_encoding(:)

  c_encoding = make_encoding()
  result = pre_tokenize('Andy99아마')
  print *, result
  ! decode prints its own diagnostics and the decoded text.
  orig = decode(result)
contains
!> Build the 256-entry byte-to-symbol table.
!>
!> Element b+1 of the result holds the symbol for byte value b, stored as
!> UTF-8 and blank-padded to two characters:
!>   * bytes 33..126 map to themselves (a single ASCII character);
!>   * bytes 161..172 and 174..255 map to the code point of the same number;
!>   * every other byte (0..32, 127..160, 173) maps to code point 256+n,
!>     where n counts those bytes in ascending order starting at 0.
!> The table is generated arithmetically, so it does not depend on how this
!> source file is encoded and always has exactly 256 entries.
function make_encoding() result(output)
  implicit none
  character(len=2) :: output(256)
  integer :: byte_val, code_point, n_shifted

  n_shifted = 0
  do byte_val = 0, 255
    if (byte_val >= 33 .and. byte_val <= 126) then
      ! Printable ASCII stands for itself; assignment pads with one blank.
      output(byte_val + 1) = achar(byte_val)
      cycle
    end if

    if ((byte_val >= 161 .and. byte_val <= 172) .or. byte_val >= 174) then
      code_point = byte_val
    else
      code_point = 256 + n_shifted
      n_shifted = n_shifted + 1
    end if

    ! All remaining code points lie in 128..2047, i.e. the two-byte UTF-8
    ! form 110xxxxx 10xxxxxx.
    output(byte_val + 1) = char(192 + code_point / 64) // &
                           char(128 + mod(code_point, 64))
  end do

  ! Diagnostic: total number of bytes used by all symbols
  ! (94 one-byte + 162 two-byte symbols = 418).
  print *, sum(len_trim(output))
end function make_encoding
!> Decode a byte-level encoded string back into the bytes it represents.
!>
!> r       : encoded text made of symbols from the host table c_encoding;
!>           trailing blanks are ignored.
!> tmp_str : one character per decoded symbol, whose byte value is the
!>           symbol's position in c_encoding minus one.
!>
!> Stops with an error if r ends in the middle of a two-byte symbol or
!> contains a symbol that is not in c_encoding.
!> Prints each symbol, its table index, the signed byte values and the
!> decoded string as diagnostics.
function decode(r) result (tmp_str)
  use, intrinsic :: iso_fortran_env, only: int8
  implicit none
  character(len=*), intent(in) :: r
  character(len=:), allocatable :: tmp_str
  character(len=:), allocatable :: buffer
  character(len=2) :: next
  integer(int8), allocatable :: bytes(:)
  integer :: i, j, k, ind, n_in

  n_in = len_trim(r)
  ! A symbol occupies at least one input byte, so n_in bounds the output.
  allocate(bytes(n_in))
  allocate(character(len=n_in) :: buffer)

  i = 1
  j = 0
  do while (i <= n_in)
    ! A first byte below 128 is a one-byte (ASCII) symbol; every other
    ! symbol in the table is a two-byte sequence.
    if (iand(ichar(r(i:i)), 255) < 128) then
      next = r(i:i)
      i = i + 1
    else
      if (i == n_in) error stop "decode: truncated two-byte symbol at end of input"
      next = r(i:i+1)
      i = i + 2
    end if
    print *, next

    ! Locate the symbol; ind stays 0 when it is not in the table.
    ind = 0
    do k = 1, size(c_encoding)
      if (c_encoding(k) == next) then
        ind = k
        exit
      end if
    end do
    if (ind == 0) error stop "decode: symbol not present in encoding table"
    print *, ind

    j = j + 1
    ! Byte value is ind-1 (0..255); store it as a signed 8-bit integer,
    ! so values 128..255 wrap to -128..-1.
    bytes(j) = int(merge(ind - 1, ind - 257, ind <= 128), kind=int8)
    buffer(j:j) = char(ind - 1)
  end do

  print *, bytes(1:j)
  tmp_str = buffer(1:j)
  print *, tmp_str
end function decode
!> Encode the raw bytes of s with the byte-level table c_encoding.
!>
!> s : input text; it is processed one byte (one default character) at a
!>     time, so a multi-byte character contributes one symbol per byte.
!> r : the concatenated symbols, blank-padded to len(s)*2 characters
!>     (each symbol in the table is at most two characters long).
!>
!> Prints len(s) and the signed value of every input byte as diagnostics.
function pre_tokenize(s) result (r)
  use, intrinsic :: iso_fortran_env, only: int8
  implicit none
  character(len=*), intent(in) :: s
  character(len=len(s)*2) :: r
  character(len=2) :: sym
  integer(int8) :: t
  integer :: i, pos, sym_len

  r = ""
  print *, len(s)
  pos = 0
  do i = 1, len(s)
    ! Reinterpret the character's single byte as a signed 8-bit integer.
    t = transfer(s(i:i), t)
    ! Table index is the unsigned byte value plus one.
    sym = c_encoding(iand(int(t), 255) + 1)
    sym_len = len_trim(sym)
    ! Append at the running write position instead of re-trimming r.
    r(pos+1:pos+sym_len) = sym(1:sym_len)
    pos = pos + sym_len
    print *, t
  end do
end function pre_tokenize
end program
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment