Skip to content

Instantly share code, notes, and snippets.

Created July 30, 2014 20:52
Show Gist options
  • Star 0 You must be signed in to star a gist
  • Fork 0 You must be signed in to fork a gist
  • Save anonymous/bfc53cd886b204ea22d8 to your computer and use it in GitHub Desktop.
Save anonymous/bfc53cd886b204ea22d8 to your computer and use it in GitHub Desktop.
diff -ruwb hatari-orig/src/gemdos.c hatari/src/gemdos.c
--- hatari-orig/src/gemdos.c 2014-07-24 01:19:28.713746178 +0200
+++ hatari/src/gemdos.c 2014-07-30 22:51:22.533733503 +0200
@@ -988,6 +988,7 @@
{
while ((entry = readdir(dir)))
{
+ Str_DecomposedToPrecomposedUtf8(entry->d_name, entry->d_name); /* for OSX */
if (fsfirst_match(name, entry->d_name))
{
match = strdup(entry->d_name);
@@ -999,6 +1000,7 @@
{
while ((entry = readdir(dir)))
{
+ Str_DecomposedToPrecomposedUtf8(entry->d_name, entry->d_name); /* for OSX */
if (strcasecmp(name, entry->d_name) == 0)
{
match = strdup(entry->d_name);
@@ -1229,8 +1231,12 @@
void GemDOS_CreateHardDriveFileName(int Drive, const char *pszFileName,
char *pszDestName, int nDestNameLen)
{
- const char *s, *filename = pszFileName;
+ const char *s, *filename;
int minlen;
+ char pszFileNameHost[FILENAME_MAX];
+
+ Str_AtariToHost(pszFileName, pszFileNameHost, FILENAME_MAX, INVALID_CHAR);
+ filename = pszFileName = pszFileNameHost;
/* Is it a valid hard drive? */
if (Drive < 2)
@@ -2666,6 +2672,7 @@
j = 0;
for (i=0; i < count; i++)
{
+ Str_DecomposedToPrecomposedUtf8(files[i]->d_name, files[i]->d_name); /* for OSX */
if (fsfirst_match(dirmask, files[i]->d_name))
{
InternalDTAs[DTAIndex].found[j] = files[i];
diff -ruwb hatari-orig/src/includes/str.h hatari/src/includes/str.h
--- hatari-orig/src/includes/str.h 2014-07-24 01:19:28.733746178 +0200
+++ hatari/src/includes/str.h 2014-07-30 21:56:29.037360000 +0200
@@ -31,4 +31,10 @@
extern void Str_Filename2TOSname(const char *src, char *dst);
extern void Str_Dump_Hex_Ascii ( char *p , int Len , int Width , const char *Suffix , FILE *pFile );
+/* Interface of character set conversions */
+/* Convert the 0-terminated AtariST string 'source' to the host encoding.
+ * 'destLen' is the number of bytes available in 'dest'. 'replacementChar'
+ * is only used by the locale based conversion (WIN32 / USE_LOCALE_CHARSET);
+ * the UTF-8 conversion does not need it. */
+extern void Str_AtariToHost(const char *source, char *dest, int destLen, char replacementChar);
+/* Convert the 0-terminated host string 'source' to the AtariST character
+ * set. Characters without an AtariST equivalent become 'replacementChar'.
+ * No destination length is passed: every output byte consumes at least one
+ * input byte, so 'dest' must be at least as large as 'source'. */
+extern void Str_HostToAtari(const char *source, char *dest, char replacementChar);
+/* Replace "letter + combining mark" pairs in the UTF-8 string 'source' by
+ * the precomposed character. gemdos.c calls this with source == dest. */
+extern void Str_DecomposedToPrecomposedUtf8(const char *source, char *dest);
+
+
#endif /* HATARI_STR_H */
Only in hatari/src: ._str.c
diff -ruwb hatari-orig/src/str.c hatari/src/str.c
--- hatari-orig/src/str.c 2014-07-24 01:19:28.750746178 +0200
+++ hatari/src/str.c 2014-07-30 19:36:36.493690845 +0200
@@ -12,6 +12,7 @@
#include <ctype.h>
#include <stdbool.h>
#include <stdlib.h>
+#include <locale.h>
#include <SDL_types.h>
#include "configuration.h"
#include "str.h"
@@ -136,6 +137,9 @@
int len;
src = strdup(source); /* dup so that it can be modified */
+
+ /* convert host string encoding to AtariST character set */
+ Str_HostToAtari(source, src, INVALID_CHAR);
len = strlen(src);
/* does filename have an extension? */
@@ -164,7 +168,8 @@
/* upcase and replace rest of invalid characters */
for (tmp = dst; *tmp; tmp++)
{
- if (*tmp < 33 || *tmp > 126)
+ /* invalid characters above 0x80 have already been replaced */
+ if (((unsigned char)*tmp) < 32 || *tmp == 127)
*tmp = INVALID_CHAR;
else
{
@@ -180,6 +185,7 @@
*tmp = INVALID_CHAR;
break;
default:
+ if (((unsigned char)*tmp) < 128)
*tmp = toupper((unsigned char)*tmp);
}
}
@@ -187,6 +193,330 @@
}
+/* ---------------------------------------------------------------------- */
+
+/* Implementation of character set conversions */
+
+/* Maps AtariST characters 0x80..0xFF to unicode code points
+ * see http://www.unicode.org/Public/MAPPINGS/VENDORS/MISC/ATARIST.TXT
+ */
+static int mapAtariToUnicode[128] =
+{
+ 0x00C7, 0x00FC, 0x00E9, 0x00E2, 0x00E4, 0x00E0, 0x00E5, 0x00E7,
+ 0x00EA, 0x00EB, 0x00E8, 0x00EF, 0x00EE, 0x00EC, 0x00C4, 0x00C5,
+ 0x00C9, 0x00E6, 0x00C6, 0x00F4, 0x00F6, 0x00F2, 0x00FB, 0x00F9,
+ 0x00FF, 0x00D6, 0x00DC, 0x00A2, 0x00A3, 0x00A5, 0x00DF, 0x0192,
+ 0x00E1, 0x00ED, 0x00F3, 0x00FA, 0x00F1, 0x00D1, 0x00AA, 0x00BA,
+ 0x00BF, 0x2310, 0x00AC, 0x00BD, 0x00BC, 0x00A1, 0x00AB, 0x00BB,
+ 0x00E3, 0x00F5, 0x00D8, 0x00F8, 0x0153, 0x0152, 0x00C0, 0x00C3,
+ 0x00D5, 0x00A8, 0x00B4, 0x2020, 0x00B6, 0x00A9, 0x00AE, 0x2122,
+ 0x0133, 0x0132, 0x05D0, 0x05D1, 0x05D2, 0x05D3, 0x05D4, 0x05D5,
+ 0x05D6, 0x05D7, 0x05D8, 0x05D9, 0x05DB, 0x05DC, 0x05DE, 0x05E0,
+ 0x05E1, 0x05E2, 0x05E4, 0x05E6, 0x05E7, 0x05E8, 0x05E9, 0x05EA,
+ 0x05DF, 0x05DA, 0x05DD, 0x05E3, 0x05E5, 0x00A7, 0x2227, 0x221E,
+ 0x03B1, 0x03B2, 0x0393, 0x03C0, 0x03A3, 0x03C3, 0x00B5, 0x03C4,
+ 0x03A6, 0x0398, 0x03A9, 0x03B4, 0x222E, 0x03C6, 0x2208, 0x2229,
+ 0x2261, 0x00B1, 0x2265, 0x2264, 0x2320, 0x2321, 0x00F7, 0x2248,
+ 0x00B0, 0x2219, 0x00B7, 0x221A, 0x207F, 0x00B2, 0x00B3, 0x00AF
+};
+
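+/* Hashtable which maps unicode code points to AtariST characters 0x80..0xFF.
+ * The lowest 9 bits of the unicode code point are used as the hash value;
+ * for the code points listed in mapAtariToUnicode[] this hash function
+ * has no collisions. Each slot holds the index into mapAtariToUnicode[],
+ * i.e. the AtariST character code minus 0x80.
+ */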
+static char mapUnicodeToAtari[512];
+static bool characterMappingsInitialized = false;
+
+/* Define this only for an old Linux system which does not store
+ * pathnames in UTF-8. If this is defined, pathnames are converted
+ * to the host character set as defined by the locale.
+ * Do not define this for OSX, as the unicode pathnames then won't
+ * be converted from the decomposed to the precomposed form.
+ */
+/* #define USE_LOCALE_CHARSET 1 */
+
+
+/**
+ * Build the reverse lookup table mapUnicodeToAtari[] from
+ * mapAtariToUnicode[] and mark the character mappings as initialized.
+ *
+ * Each slot is addressed by the lowest 9 bits of the unicode code point and
+ * receives the index (0..127) of the matching mapAtariToUnicode[] entry.
+ *
+ * When the locale based conversion is compiled in (WIN32 or
+ * USE_LOCALE_CHARSET), the character handling locale is taken from the
+ * environment. Only LC_CTYPE is switched: this is the category used by
+ * wctomb()/mbtowc(), and leaving the other categories (e.g. LC_NUMERIC)
+ * untouched keeps number parsing and formatting elsewhere in the program
+ * independent of the user's locale.
+ */
+static void initCharacterMappings(void)
+{
+	int i;
+
+	for (i = 0; i < 128; i++)
+	{
+		mapUnicodeToAtari[mapAtariToUnicode[i] & 511] = i;
+	}
+	characterMappingsInitialized = true;
+
+#if defined(WIN32) || defined(USE_LOCALE_CHARSET)
+	setlocale(LC_CTYPE, "");
+#endif
+}
+
+/**
+ * Convert a 0-terminated string in the AtariST character set to a
+ * 0-terminated UTF-8 encoded string.
+ *
+ * source:  0-terminated string in the AtariST character set.
+ * dest:    receives the UTF-8 string; it is always 0-terminated when
+ *          destLen is at least 1.
+ * destLen: number of available bytes in dest[], including the byte for
+ *          the terminating 0. Nothing is written when destLen <= 0.
+ *
+ * A single character of the AtariST charset takes up to 3 bytes in UTF-8.
+ * The conversion stops at the first character that does not fit completely
+ * in front of the terminating 0, so a too small buffer yields a cleanly
+ * truncated prefix and never a string with characters missing in between.
+ */
+static void Str_AtariToUtf8(const char *source, char *dest, int destLen)
+{
+	if (destLen <= 0)
+	{
+		return;		/* not even room for the terminating 0 */
+	}
+
+	while (*source)
+	{
+		int c = *source & 255;
+		int need;
+
+		if (c >= 128)
+		{
+			c = mapAtariToUnicode[c & 127];
+		}
+		need = (c < 128) ? 1 : (c < 2048) ? 2 : 3;
+
+		/* keep one byte in reserve for the terminating 0 */
+		if (destLen <= need)
+		{
+			break;
+		}
+		source++;
+
+		if (need == 1)
+		{
+			*dest++ = c;				/* 0xxxxxxx */
+		}
+		else if (need == 2)
+		{
+			*dest++ = (c >> 6) | 192;		/* 110xxxxx */
+			*dest++ = (c & 63) | 128;		/* 10xxxxxx */
+		}
+		else
+		{
+			*dest++ = (c >> 12) | 224;		/* 1110xxxx */
+			*dest++ = ((c >> 6) & 63) | 128;	/* 10xxxxxx */
+			*dest++ = (c & 63) | 128;		/* 10xxxxxx */
+		}
+		destLen -= need;
+	}
+	*dest = 0;
+}
+
+/**
+ * Convert a 0-terminated UTF-8 encoded string to a 0-terminated string
+ * in the AtariST character set.
+ *
+ * source:          0-terminated UTF-8 string.
+ * dest:            receives the AtariST string. Every output byte consumes
+ *                  at least one input byte, so a buffer as large as the
+ *                  source string (including its terminating 0) suffices.
+ * replacementChar: written for every code point without AtariST mapping
+ *                  and for every malformed UTF-8 sequence.
+ *
+ * Continuation bytes are validated before they are consumed. A sequence
+ * that is cut short (including one cut short by the end of the string)
+ * yields a single replacementChar and the scan resumes at the offending
+ * byte, so the terminating 0 is never stepped over. 4-byte sequences are
+ * decoded as well; they have no AtariST mapping and yield replacementChar.
+ */
+static void Str_Utf8ToAtari(const char *source, char *dest, char replacementChar)
+{
+	if (!characterMappingsInitialized)
+	{
+		initCharacterMappings();
+	}
+
+	while (*source)
+	{
+		int c = *source++ & 255;
+		int extra, n, i;
+
+		if (c < 128)			/* single-byte utf-8 code (0xxxxxxx) */
+		{
+			*dest++ = c;
+			continue;
+		}
+		if (c < 192 || c >= 248)	/* stray 10xxxxxx or invalid lead byte */
+		{
+			*dest++ = replacementChar;
+			continue;
+		}
+
+		if (c < 224)			/* 110xxxxx + 1 continuation byte */
+		{
+			extra = 1;
+			c &= 31;
+		}
+		else if (c < 240)		/* 1110xxxx + 2 continuation bytes */
+		{
+			extra = 2;
+			c &= 15;
+		}
+		else				/* 11110xxx + 3 continuation bytes */
+		{
+			extra = 3;
+			c &= 7;
+		}
+
+		/* The terminating 0 is not of the form 10xxxxxx, so this loop
+		 * can never run past the end of the string. */
+		for (n = 0; n < extra && (source[n] & 0xC0) == 0x80; n++)
+		{
+			c = (c << 6) | (source[n] & 63);
+		}
+		source += n;
+		if (n < extra)			/* sequence was cut short */
+		{
+			*dest++ = replacementChar;
+			continue;
+		}
+
+		/* find AtariST character code for unicode code point c */
+		i = mapUnicodeToAtari[c & 511];
+		*dest++ = (mapAtariToUnicode[i] == c ? i + 128 : replacementChar);
+	}
+	*dest = 0;
+}
+
+
+/**
+ * Translate a 0-terminated AtariST string into the multibyte encoding of
+ * the current locale.
+ *
+ * Each AtariST character is first turned into its unicode code point and
+ * then handed to wctomb(). If wctomb() reports that the code point cannot
+ * be represented, replacementChar is stored instead.
+ * Conversion continues only while more than MB_CUR_MAX bytes are left in
+ * dest[], which keeps room for one full multibyte character plus the
+ * terminating 0. destLen is the number of available bytes in dest[].
+ */
+static void Str_AtariToLocal(const char *source, char *dest, int destLen, char replacementChar)
+{
+	if (!characterMappingsInitialized)
+	{
+		initCharacterMappings();
+	}
+
+	for (; *source != '\0' && destLen > (int)MB_CUR_MAX; source++)
+	{
+		int codePoint = *source & 255;
+		int written;
+
+		/* AtariST codes 0x80..0xFF go through the mapping table */
+		if (codePoint >= 128)
+		{
+			codePoint = mapAtariToUnicode[codePoint - 128];
+		}
+
+		/* encode the unicode code point in the current locale */
+		written = wctomb(dest, codePoint);
+		if (written < 0)
+		{
+			*dest = replacementChar;
+			written = 1;
+		}
+		dest += written;
+		destLen -= written;
+	}
+	*dest = 0;
+}
+
+/**
+ * Convert a 0-terminated string from the multibyte encoding of the current
+ * host locale into the AtariST character set.
+ *
+ * source:          0-terminated string in the encoding of the locale.
+ * dest:            receives the AtariST string. Every output byte consumes
+ *                  at least one input byte, so a buffer as large as the
+ *                  source string (including its terminating 0) suffices.
+ * replacementChar: written verbatim for every byte that mbtowc() rejects
+ *                  and for every character which does not exist in the
+ *                  AtariST character set.
+ */
+static void Str_LocalToAtari(const char *source, char *dest, char replacementChar)
+{
+	if (!characterMappingsInitialized)
+	{
+		initCharacterMappings();
+	}
+
+	while (*source)
+	{
+		wchar_t c;
+		int i;
+
+		/* convert a character from the current locale into an unicode
+		 * code point; MB_CUR_MAX is the longest sequence the locale
+		 * can produce */
+		i = mbtowc(&c, source, MB_CUR_MAX);
+		if (i < 0)
+		{
+			/* undecodable byte: skip it and emit the replacement as is,
+			 * without passing it through the unicode lookup */
+			source++;
+			*dest++ = replacementChar;
+			continue;
+		}
+		source += i;
+
+		if (c >= 128)
+		{
+			/* find AtariST character code for unicode code point c */
+			i = mapUnicodeToAtari[c & 511];
+			*dest++ = (mapAtariToUnicode[i] == c ? i + 128 : replacementChar);
+		}
+		else
+		{
+			*dest++ = c;
+		}
+	}
+	*dest = 0;
+}
+
+
+/**
+ * Convert a 0-terminated string from the AtariST character set to the
+ * encoding used for host strings.
+ *
+ * If WIN32 or USE_LOCALE_CHARSET is defined, the string is converted to
+ * the character set of the current locale (Str_AtariToLocal). Otherwise
+ * it is converted to UTF-8 (Str_AtariToUtf8); replacementChar is not used
+ * in that case.
+ * destLen is the number of available bytes in dest[].
+ */
+void Str_AtariToHost(const char *source, char *dest, int destLen, char replacementChar)
+{
+#if defined(WIN32) || defined(USE_LOCALE_CHARSET)
+ Str_AtariToLocal(source, dest, destLen, replacementChar);
+#else
+ Str_AtariToUtf8(source, dest, destLen);
+#endif
+}
+
+/**
+ * Convert a 0-terminated host string to the AtariST character set.
+ *
+ * If WIN32 or USE_LOCALE_CHARSET is defined, the source is interpreted in
+ * the character set of the current locale (Str_LocalToAtari). Otherwise
+ * it is interpreted as UTF-8 (Str_Utf8ToAtari).
+ * replacementChar is passed on to the selected conversion, which stores it
+ * for characters that have no AtariST mapping.
+ * No length of dest[] is passed; the caller has to provide the buffer.
+ */
+void Str_HostToAtari(const char *source, char *dest, char replacementChar)
+{
+#if defined(WIN32) || defined(USE_LOCALE_CHARSET)
+ Str_LocalToAtari(source, dest, replacementChar);
+#else
+ Str_Utf8ToAtari(source, dest, replacementChar);
+#endif
+}
+
+
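+/* This table is needed to convert the UTF-8 representation of paths with
+ * diacritical marks from the decomposed form (as returned by OSX) into the
+ * precomposed form. Combining unicode characters are 0x0300..0x036F.
+ * Each entry is a triple: base letter, combining character, precomposed
+ * unicode code point. A single 0 terminates the table.
+ * The table covers accented Latin letters in the range 0x00C0..0x00FF.
+ * This includes all accented letters of the AtariST character set, plus
+ * some letters (e.g. 0x00C1) which have no AtariST equivalent.
+ */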
+static int mapDecomposedPrecomposed[] =
+{
+ 'A', 0x0300, 0xC0,
+ 'A', 0x0301, 0xC1,
+ 'A', 0x0302, 0xC2,
+ 'A', 0x0303, 0xC3,
+ 'A', 0x0308, 0xC4,
+ 'A', 0x030A, 0xC5,
+ 'C', 0x0327, 0xC7,
+ 'E', 0x0300, 0xC8,
+ 'E', 0x0301, 0xC9,
+ 'E', 0x0302, 0xCA,
+ 'E', 0x0308, 0xCB,
+ 'I', 0x0300, 0xCC,
+ 'I', 0x0301, 0xCD,
+ 'I', 0x0302, 0xCE,
+ 'I', 0x0308, 0xCF,
+ 'N', 0x0303, 0xD1,
+ 'O', 0x0300, 0xD2,
+ 'O', 0x0301, 0xD3,
+ 'O', 0x0302, 0xD4,
+ 'O', 0x0303, 0xD5,
+ 'O', 0x0308, 0xD6,
+ 'U', 0x0300, 0xD9,
+ 'U', 0x0301, 0xDA,
+ 'U', 0x0302, 0xDB,
+ 'U', 0x0308, 0xDC,
+ 'Y', 0x0301, 0xDD,
+ 'a', 0x0300, 0xE0,
+ 'a', 0x0301, 0xE1,
+ 'a', 0x0302, 0xE2,
+ 'a', 0x0303, 0xE3,
+ 'a', 0x0308, 0xE4,
+ 'a', 0x030A, 0xE5,
+ 'c', 0x0327, 0xE7,
+ 'e', 0x0300, 0xE8,
+ 'e', 0x0301, 0xE9,
+ 'e', 0x0302, 0xEA,
+ 'e', 0x0308, 0xEB,
+ 'i', 0x0300, 0xEC,
+ 'i', 0x0301, 0xED,
+ 'i', 0x0302, 0xEE,
+ 'i', 0x0308, 0xEF,
+ 'n', 0x0303, 0xF1,
+ 'o', 0x0300, 0xF2,
+ 'o', 0x0301, 0xF3,
+ 'o', 0x0302, 0xF4,
+ 'o', 0x0303, 0xF5,
+ 'o', 0x0308, 0xF6,
+ 'u', 0x0300, 0xF9,
+ 'u', 0x0301, 0xFA,
+ 'u', 0x0302, 0xFB,
+ 'u', 0x0308, 0xFC,
+ 'y', 0x0301, 0xFD,
+ 'y', 0x0308, 0xFF,
+ 0
+};
+
+/**
+ * Rewrite decomposed unicode characters (a letter followed by a combining
+ * character) in an UTF-8 encoded string into their precomposed UTF-8 form.
+ * Only the letter/combining character pairs listed in
+ * mapDecomposedPrecomposed[] are converted; everything else is copied
+ * byte by byte.
+ * This is needed for OSX, which returns filesystem paths in the decomposed
+ * form (NFD).
+ * The result is never longer than the input. gemdos.c calls this function
+ * with source and dest pointing to the same buffer.
+ */
+void Str_DecomposedToPrecomposedUtf8(const char *source, char *dest)
+{
+	while (*source != '\0')
+	{
+		int ch = *source++ & 255;
+
+		/* A combining character 0x03XX is encoded in UTF-8 as
+		 * 110011xx 10xxxxxx. Is one following the current byte? */
+		if ((source[0] & 0xFC) == 0xCC)
+		{
+			const int *entry;
+			int mark = ((source[0] & 31) << 6) | (source[1] & 63);
+
+			for (entry = mapDecomposedPrecomposed; entry[0] != 0; entry += 3)
+			{
+				if (entry[0] != ch || entry[1] != mark)
+				{
+					continue;
+				}
+				/* entry[2] is the precomposed unicode code point */
+				*dest++ = 0xC0 | (entry[2] >> 6);	/* UTF-8 first byte: 110xxxxx */
+				ch = 0x80 + (entry[2] & 63);		/* UTF-8 second byte: 10xxxxxx */
+				source += 2;				/* skip the combining character */
+				break;
+			}
+		}
+		*dest++ = ch;
+	}
+	*dest = 0;
+}
+
+/* ---------------------------------------------------------------------- */
+
+
+
/**
* Print an Hex/Ascii dump of Len bytes located at *p
* Each line consists of Width bytes, printed as an hexa value and as a char
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment