Moved utf8codepoint() out of platform_windows.c and into the core, with hooks for the GUI plugins, so I can clean up the unicode FIXMEs in the stdio and ncurses targets.
icculus · Mar 2, 2008 · f06a586 · f06a586
1 parent f088468
commit f06a586
Show file tree

Hide file tree

Showing 4 changed files with 162 additions and 148 deletions.
diff --git a/gui.h b/gui.h
@@ -249,6 +249,11 @@ const MojoGui *MOJOGUI_ENTRY_POINT(int rev, const MojoSetupEntryPoints *e) \
 #endif
 #define ticks() entry->ticks()
 
+// Make utf8codepoint() calls in this file go through the function pointer
+//  of the same name in the `entry` table (same pattern as ticks() above).
+//  Any earlier macro definition of the name is dropped first so the
+//  redefinition below is clean.
+#ifdef utf8codepoint
+#undef utf8codepoint
+#endif
+#define utf8codepoint(x) entry->utf8codepoint(x)
+
 #endif
 
 

diff --git a/mojosetup.c b/mojosetup.c
@@ -39,6 +39,7 @@ MojoSetupEntryPoints GEntryPoints =
     format,
     numstr,
     MojoPlatform_ticks,
+    utf8codepoint,
 };
 
 int GArgc = 0;
@@ -629,6 +630,153 @@ char *xstrncpy(char *dst, const char *src, size_t len)
 } // xstrncpy
 
 
+// Decode one UTF-8 sequence starting at *_str and return its codepoint.
+//
+//  _str: in/out pointer to the current read position in a null-terminated
+//        UTF-8 string.
+//
+//  Returns 0 at the null terminator (and leaves *_str alone, so repeated
+//   calls keep returning 0), the decoded codepoint for a well-formed
+//   sequence (with *_str moved past it), or UNICODE_BOGUS_CHAR_VALUE for
+//   anything malformed, overlong, out of range, or otherwise illegal.
+//
+//  Every non-zero return moves *_str forward by at least one byte, so a
+//   caller that loops until it sees 0 always terminates. When a continuation
+//   byte is missing, only the lead byte is consumed: the offending byte
+//   (which may be the null terminator) gets examined by the next call
+//   instead of being skipped.
+uint32 utf8codepoint(const char **_str)
+{
+    const char *str = *_str;
+    uint32 retval = 0;
+    uint32 octet = (uint32) ((uint8) *str);
+    uint32 octet2, octet3, octet4;
+
+    if (octet == 0)  // null terminator, end of string.
+        return 0;
+
+    else if (octet < 128)  // one octet char: 0 to 127
+    {
+        (*_str)++;  // skip to next possible start of codepoint.
+        return octet;
+    } // else if
+
+    else if ((octet > 127) && (octet < 192))  // bad (starts with 10xxxxxx).
+    {
+        // Apparently each of these is supposed to be flagged as a bogus
+        //  char, instead of just resyncing to the next valid codepoint.
+        (*_str)++;  // skip to next possible start of codepoint.
+        return UNICODE_BOGUS_CHAR_VALUE;
+    } // else if
+
+    else if (octet < 224)  // two octets
+    {
+        (*_str)++;  // advance at least one byte in case of an error.
+        octet -= (128+64);
+        octet2 = (uint32) ((uint8) *(++str));
+        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        *_str += 1;  // skip to next possible start of codepoint.
+        retval = ((octet << 6) | (octet2 - 128));
+        if ((retval >= 0x80) && (retval <= 0x7FF))  // reject overlong forms.
+            return retval;
+    } // else if
+
+    else if (octet < 240)  // three octets
+    {
+        (*_str)++;  // advance at least one byte in case of an error.
+        octet -= (128+64+32);
+        octet2 = (uint32) ((uint8) *(++str));
+        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet3 = (uint32) ((uint8) *(++str));
+        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        *_str += 2;  // skip to next possible start of codepoint.
+        retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
+
+        // The whole UTF-16 surrogate range is illegal in UTF-8 (rfc3629),
+        //  not just a handful of values at its edges.
+        if ((retval >= 0xD800) && (retval <= 0xDFFF))
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        // 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge.
+        if ((retval >= 0x800) && (retval <= 0xFFFD))
+            return retval;
+    } // else if
+
+    else if (octet < 248)  // four octets
+    {
+        (*_str)++;  // advance at least one byte in case of an error.
+        octet -= (128+64+32+16);
+        octet2 = (uint32) ((uint8) *(++str));
+        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet3 = (uint32) ((uint8) *(++str));
+        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        octet4 = (uint32) ((uint8) *(++str));
+        if ((octet4 & (128+64)) != 128)  // Format isn't 10xxxxxx?
+            return UNICODE_BOGUS_CHAR_VALUE;
+
+        *_str += 3;  // skip to next possible start of codepoint.
+        retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
+                   ((octet3 - 128) << 6) | ((octet4 - 128)) );
+        if ((retval >= 0x10000) && (retval <= 0x10FFFF))
+            return retval;
+    } // else if
+
+    else  // five or six octets
+    {
+        // Five and six octet sequences became illegal in rfc3629.
+        //  We throw the codepoint away, but parse them to make sure we move
+        //  ahead the right number of bytes and don't overflow the buffer.
+        const int trailing = (octet < 252) ? 4 : 5;
+        int i;
+
+        (*_str)++;  // advance at least one byte in case of an error.
+        for (i = 0; i < trailing; i++)
+        {
+            octet = (uint32) ((uint8) *(++str));
+            if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
+                return UNICODE_BOGUS_CHAR_VALUE;
+        } // for
+
+        *_str += trailing;  // skip to next possible start of codepoint.
+        return UNICODE_BOGUS_CHAR_VALUE;
+    } // else
+
+    // Well-formed byte pattern, but overlong or outside the legal range.
+    return UNICODE_BOGUS_CHAR_VALUE;
+} // utf8codepoint
+
+
 static void outOfMemory(void)
 {
     // Try to translate "out of memory", but not if it causes recursion.

diff --git a/platform_windows.c b/platform_windows.c
@@ -71,154 +71,7 @@ static uint32 startupTime = 0;
 #endif
 
 
-#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
-#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
-
-static uint32 utf8codepoint(const char **_str)
-{
-    const char *str = *_str;
-    uint32 retval = 0;
-    uint32 octet = (uint32) ((uint8) *str);
-    uint32 octet2, octet3, octet4;
-
-    if (octet == 0)  // null terminator, end of string.
-        return 0;
-
-    else if (octet < 128)  // one octet char: 0 to 127
-    {
-        (*_str)++;  // skip to next possible start of codepoint.
-        return octet;
-    } // else if
-
-    else if ((octet > 127) && (octet < 192))  // bad (starts with 10xxxxxx).
-    {
-        // Apparently each of these is supposed to be flagged as a bogus
-        //  char, instead of just resyncing to the next valid codepoint.
-        (*_str)++;  // skip to next possible start of codepoint.
-        return UNICODE_BOGUS_CHAR_VALUE;
-    } // else if
-
-    else if (octet < 224)  // two octets
-    {
-        octet -= (128+64);
-        octet2 = (uint32) ((uint8) *(++str));
-        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 2;  // skip to next possible start of codepoint.
-        retval = ((octet << 6) | (octet2 - 128));
-        if ((retval >= 0x80) && (retval <= 0x7FF))
-            return retval;
-    } // else if
-
-    else if (octet < 240)  // three octets
-    {
-        octet -= (128+64+32);
-        octet2 = (uint32) ((uint8) *(++str));
-        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet3 = (uint32) ((uint8) *(++str));
-        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 3;  // skip to next possible start of codepoint.
-        retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );
-
-        // There are seven "UTF-16 surrogates" that are illegal in UTF-8.
-        switch (retval)
-        {
-            case 0xD800:
-            case 0xDB7F:
-            case 0xDB80:
-            case 0xDBFF:
-            case 0xDC00:
-            case 0xDF80:
-            case 0xDFFF:
-                return UNICODE_BOGUS_CHAR_VALUE;
-        } // switch
-
-        // 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge.
-        if ((retval >= 0x800) && (retval <= 0xFFFD))
-            return retval;
-    } // else if
-
-    else if (octet < 248)  // four octets
-    {
-        octet -= (128+64+32+16);
-        octet2 = (uint32) ((uint8) *(++str));
-        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet3 = (uint32) ((uint8) *(++str));
-        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet4 = (uint32) ((uint8) *(++str));
-        if ((octet4 & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 4;  // skip to next possible start of codepoint.
-        retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
-                   ((octet3 - 128) << 6) | ((octet4 - 128)) );
-        if ((retval >= 0x10000) && (retval <= 0x10FFFF))
-            return retval;
-    } // else if
-
-    // Five and six octet sequences became illegal in rfc3629.
-    //  We throw the codepoint away, but parse them to make sure we move
-    //  ahead the right number of bytes and don't overflow the buffer.
-
-    else if (octet < 252)  // five octets
-    {
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 5;  // skip to next possible start of codepoint.
-        return UNICODE_BOGUS_CHAR_VALUE;
-    } // else if
-
-    else  // six octets
-    {
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        octet = (uint32) ((uint8) *(++str));
-        if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
-            return UNICODE_BOGUS_CHAR_VALUE;
-
-        *_str += 6;  // skip to next possible start of codepoint.
-        return UNICODE_BOGUS_CHAR_VALUE;
-    } // else if
-
-    return UNICODE_BOGUS_CHAR_VALUE;
-} // utf8codepoint
+// these utf-8 functions may move to mojosetup.c some day...
 
 void utf8ToUcs2(const char *src, uint16 *dst, uint64 len)
 {

diff --git a/universal.h b/universal.h
@@ -96,6 +96,13 @@ extern const char *GBuildVer;
 // Static, non-stack memory for scratch work...not thread safe!
 extern uint8 scratchbuf_128k[128 * 1024];
 
+
+// Value utf8codepoint() returns when the bytes at the read position do not
+//  form a codepoint it accepts.
+#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
+// NOTE(review): presumably the character callers substitute when they get
+//  UNICODE_BOGUS_CHAR_VALUE back -- confirm against the callers.
+#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
+// Decode the UTF-8 sequence at (*_str) and return its codepoint.
+//  Returns 0, without moving (*_str), when (*_str) points at a null
+//  terminator. For a sequence it accepts, returns the codepoint and moves
+//  (*_str) past that sequence. Returns UNICODE_BOGUS_CHAR_VALUE for input
+//  it rejects.
+uint32 utf8codepoint(const char **_str);
+
+
 // Format a string, sort of (but not exactly!) like sprintf().
 //  The only formatters accepted are %0 through %9 (and %%), which do not
 //  have to appear in order in the string, but match the varargs passed to the
@@ -347,6 +354,7 @@ typedef struct MojoSetupEntryPoints
     char *(*format)(const char *fmt, ...);
     const char *(*numstr)(int val);
     uint32 (*ticks)(void);
+    uint32 (*utf8codepoint)(const char **_str);
 } MojoSetupEntryPoints;
 extern MojoSetupEntryPoints GEntryPoints;