Skip to content

Instantly share code, notes, and snippets.

@gingerbeardman
Last active April 12, 2024 20:04
Show Gist options
  • Star 1 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save gingerbeardman/4a3b66236e018b72b32ca17953474e12 to your computer and use it in GitHub Desktop.
Save gingerbeardman/4a3b66236e018b72b32ca17953474e12 to your computer and use it in GitHub Desktop.
Convert legacy text encodings to Unicode (UTF-8)
#!/opt/local/bin/port-tclsh
set ver {2021.12.02}
set author {Matt Sephton @gingerbeardman}
package require cmdline
set parameters {
{encoding.arg macJapan "source encoding, default:"}
{list "list encodings"}
{version "show version info"}
}
set usage {[-encoding value] ...
Convert legacy text encodings to Unicode (UTF-8)
... single or multiple files, directories, wildcards, or omit to use stdin
}
if {[catch {array set options [cmdline::getoptions ::argv $parameters $usage]}]} {
puts [cmdline::usage $parameters $usage]
}
proc convertTextToUnicode {enc f} {
if {[file isdirectory $f]} {
foreach g [glob -nocomplain [file join $f *]] {
convertTextToUnicode $enc $g
}
} else {
# open
if {$f == "stdin"} {
set in stdin
set out stdout
} else {
set in [open $f rb]
set out [open $f.new w]
set datestamp [file mtime $f]
}
# make sure input is treated as binary
fconfigure $in -translation binary
# do the encoding conversion
puts $out [encoding convertfrom $enc [read $in]]
# close and cleanup
if {$in != "stdin"} {
close $in
close $out
file rename -force $f.new $f
file mtime $f $datestamp
}
}
}
# show version
if {[array size options] > 0 && $options(version) == 1} {
puts "convert2unicode ($ver) by $author"
exit
}
# list encodings
if {[array size options] > 0 && $options(list) == 1} {
puts "List of known encodings:\n"
set lst [lsort -dictionary [split [encoding names] " "]]
foreach word $lst {
puts $word
}
exit
}
# default source encoding
if {[array size options] > 0 && $options(encoding) != ""} {
set enc $options(encoding)
}
# no files passed
if {$argc == 0 || ($argc == 2 && [array size argv] == 0)} {
convertTextToUnicode $enc stdin
} else {
# process each command-line argument
foreach f $argv {
if {[file isfile $f] || [file isdirectory $f]} {
convertTextToUnicode $enc $f
} else {
puts stderr "error: cannot find \"$f\""
}
}
}
@gingerbeardman
Copy link
Author

gingerbeardman commented Nov 8, 2021

Usage: convert2unicode.tcl -encoding value ...

Compared to earlier revisions this now includes such niceties as -help to show usage hints, -list to display all known encodings, and proper optional command line arguments. It will also work on single or multiple files, directories full of files, wildcards, and stdin.

2012-12-02: remembers source file date/time information

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment