Skip to content

Commit

Permalink
Moved utf8codepoint() out of platform_windows.c and into the core, wi…
Browse files Browse the repository at this point in the history
…th hooks

 for the GUI plugins, so I can clean up the unicode FIXMEs in the stdio and
 ncurses targets.
  • Loading branch information
icculus committed Mar 2, 2008
1 parent f088468 commit f06a586
Show file tree
Hide file tree
Showing 4 changed files with 162 additions and 148 deletions.
5 changes: 5 additions & 0 deletions gui.h
Expand Up @@ -249,6 +249,11 @@ const MojoGui *MOJOGUI_ENTRY_POINT(int rev, const MojoSetupEntryPoints *e) \
#endif
#define ticks() entry->ticks()

// Route utf8codepoint() through the entry point table: drop any macro of
// that name that is already defined, then forward every call to the
// utf8codepoint function pointer reached through `entry`.
#ifdef utf8codepoint
#undef utf8codepoint
#endif
#define utf8codepoint(x) entry->utf8codepoint(x)

#endif


Expand Down
148 changes: 148 additions & 0 deletions mojosetup.c
Expand Up @@ -39,6 +39,7 @@ MojoSetupEntryPoints GEntryPoints =
format,
numstr,
MojoPlatform_ticks,
utf8codepoint,
};

int GArgc = 0;
Expand Down Expand Up @@ -629,6 +630,153 @@ char *xstrncpy(char *dst, const char *src, size_t len)
} // xstrncpy


// Decode one Unicode codepoint from a UTF-8 encoded, null-terminated string.
//
//  _str: address of a pointer to the first byte of the next sequence. The
//        pointer is moved forward past whatever was consumed, so this
//        function can be called repeatedly to walk a whole string.
//
// Returns 0 at the null terminator (the pointer is NOT advanced, so further
//  calls keep returning 0), the decoded codepoint for a well-formed
//  sequence, or UNICODE_BOGUS_CHAR_VALUE for anything else (stray
//  continuation bytes, truncated or overlong sequences, UTF-16 surrogates,
//  0xFFFE/0xFFFF, values above 0x10FFFF, five and six octet sequences).
//
// Every non-zero return advances the pointer by at least one byte, so a
//  caller looping until 0 always makes progress, even on garbage input. When
//  a sequence is cut short by a bad continuation byte, only the lead byte is
//  consumed; the offending byte is examined again on the next call. Bytes
//  past the null terminator are never read.
uint32 utf8codepoint(const char **_str)
{
    const char *str = *_str;
    uint32 retval = 0;
    uint32 octet = (uint32) ((uint8) *str);
    uint32 octet2, octet3, octet4;

    if (octet == 0)  // null terminator, end of string.
        return 0;

    else if (octet < 128)  // one octet char: 0 to 127
    {
        (*_str)++;  // skip to next possible start of codepoint.
        return octet;
    } // else if

    else if (octet < 192)  // bad (starts with 10xxxxxx).
    {
        // Apparently each of these is supposed to be flagged as a bogus
        //  char, instead of just resyncing to the next valid codepoint.
        (*_str)++;  // skip to next possible start of codepoint.
        return UNICODE_BOGUS_CHAR_VALUE;
    } // else if

    else if (octet < 224)  // two octets
    {
        (*_str)++;  // advance at least one byte in case of an error.
        octet -= (128+64);
        octet2 = (uint32) ((uint8) *(++str));
        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        *_str += 1;  // skip to next possible start of codepoint.
        retval = ((octet << 6) | (octet2 - 128));
        if ((retval >= 0x80) && (retval <= 0x7FF))  // reject overlong forms.
            return retval;
    } // else if

    else if (octet < 240)  // three octets
    {
        (*_str)++;  // advance at least one byte in case of an error.
        octet -= (128+64+32);
        octet2 = (uint32) ((uint8) *(++str));
        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        octet3 = (uint32) ((uint8) *(++str));
        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        *_str += 2;  // skip to next possible start of codepoint.
        retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );

        // The whole UTF-16 surrogate range is illegal in UTF-8, not just
        //  a handful of sample values from it.
        if ((retval >= 0xD800) && (retval <= 0xDFFF))
            return UNICODE_BOGUS_CHAR_VALUE;

        // 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge.
        if ((retval >= 0x800) && (retval <= 0xFFFD))
            return retval;
    } // else if

    else if (octet < 248)  // four octets
    {
        (*_str)++;  // advance at least one byte in case of an error.
        octet -= (128+64+32+16);
        octet2 = (uint32) ((uint8) *(++str));
        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        octet3 = (uint32) ((uint8) *(++str));
        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        octet4 = (uint32) ((uint8) *(++str));
        if ((octet4 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        *_str += 3;  // skip to next possible start of codepoint.
        retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
                   ((octet3 - 128) << 6) | ((octet4 - 128)) );
        if ((retval >= 0x10000) && (retval <= 0x10FFFF))
            return retval;
    } // else if

    // Five and six octet sequences became illegal in rfc3629.
    //  We throw the codepoint away, but parse them to make sure we move
    //  ahead the right number of bytes and don't overflow the buffer.
    else
    {
        // Lead bytes below 252 announce five octets, the rest six.
        const int trailing = (octet < 252) ? 4 : 5;
        int i;

        (*_str)++;  // advance at least one byte in case of an error.
        for (i = 0; i < trailing; i++)
        {
            octet = (uint32) ((uint8) *(++str));
            if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
                return UNICODE_BOGUS_CHAR_VALUE;
        } // for

        *_str += trailing;  // skip to next possible start of codepoint.
        return UNICODE_BOGUS_CHAR_VALUE;
    } // else

    // Well-formed bit pattern, but overlong or outside the legal range.
    return UNICODE_BOGUS_CHAR_VALUE;
} // utf8codepoint


static void outOfMemory(void)
{
// Try to translate "out of memory", but not if it causes recursion.
Expand Down
149 changes: 1 addition & 148 deletions platform_windows.c
Expand Up @@ -71,154 +71,7 @@ static uint32 startupTime = 0;
#endif


// Sentinel that utf8codepoint() returns for input it rejects.
#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
// NOTE(review): presumably the character substituted for a bogus value when
//  converting text; confirm against the callers.
#define UNICODE_BOGUS_CHAR_CODEPOINT '?'

// Decode one Unicode codepoint from a UTF-8 encoded, null-terminated string.
//
//  _str: address of a pointer to the first byte of the next sequence. The
//        pointer is moved forward past whatever was consumed, so this
//        function can be called repeatedly to walk a whole string.
//
// Returns 0 at the null terminator (the pointer is NOT advanced, so further
//  calls keep returning 0), the decoded codepoint for a well-formed
//  sequence, or UNICODE_BOGUS_CHAR_VALUE for anything else (stray
//  continuation bytes, truncated or overlong sequences, UTF-16 surrogates,
//  0xFFFE/0xFFFF, values above 0x10FFFF, five and six octet sequences).
//
// Every non-zero return advances the pointer by at least one byte, so a
//  caller looping until 0 always makes progress, even on garbage input. When
//  a sequence is cut short by a bad continuation byte, only the lead byte is
//  consumed; the offending byte is examined again on the next call. Bytes
//  past the null terminator are never read.
static uint32 utf8codepoint(const char **_str)
{
    const char *str = *_str;
    uint32 retval = 0;
    uint32 octet = (uint32) ((uint8) *str);
    uint32 octet2, octet3, octet4;

    if (octet == 0)  // null terminator, end of string.
        return 0;

    else if (octet < 128)  // one octet char: 0 to 127
    {
        (*_str)++;  // skip to next possible start of codepoint.
        return octet;
    } // else if

    else if (octet < 192)  // bad (starts with 10xxxxxx).
    {
        // Apparently each of these is supposed to be flagged as a bogus
        //  char, instead of just resyncing to the next valid codepoint.
        (*_str)++;  // skip to next possible start of codepoint.
        return UNICODE_BOGUS_CHAR_VALUE;
    } // else if

    else if (octet < 224)  // two octets
    {
        (*_str)++;  // advance at least one byte in case of an error.
        octet -= (128+64);
        octet2 = (uint32) ((uint8) *(++str));
        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        *_str += 1;  // skip to next possible start of codepoint.
        retval = ((octet << 6) | (octet2 - 128));
        if ((retval >= 0x80) && (retval <= 0x7FF))  // reject overlong forms.
            return retval;
    } // else if

    else if (octet < 240)  // three octets
    {
        (*_str)++;  // advance at least one byte in case of an error.
        octet -= (128+64+32);
        octet2 = (uint32) ((uint8) *(++str));
        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        octet3 = (uint32) ((uint8) *(++str));
        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        *_str += 2;  // skip to next possible start of codepoint.
        retval = ( ((octet << 12)) | ((octet2-128) << 6) | ((octet3-128)) );

        // The whole UTF-16 surrogate range is illegal in UTF-8, not just
        //  a handful of sample values from it.
        if ((retval >= 0xD800) && (retval <= 0xDFFF))
            return UNICODE_BOGUS_CHAR_VALUE;

        // 0xFFFE and 0xFFFF are illegal, too, so we check them at the edge.
        if ((retval >= 0x800) && (retval <= 0xFFFD))
            return retval;
    } // else if

    else if (octet < 248)  // four octets
    {
        (*_str)++;  // advance at least one byte in case of an error.
        octet -= (128+64+32+16);
        octet2 = (uint32) ((uint8) *(++str));
        if ((octet2 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        octet3 = (uint32) ((uint8) *(++str));
        if ((octet3 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        octet4 = (uint32) ((uint8) *(++str));
        if ((octet4 & (128+64)) != 128)  // Format isn't 10xxxxxx?
            return UNICODE_BOGUS_CHAR_VALUE;

        *_str += 3;  // skip to next possible start of codepoint.
        retval = ( ((octet << 18)) | ((octet2 - 128) << 12) |
                   ((octet3 - 128) << 6) | ((octet4 - 128)) );
        if ((retval >= 0x10000) && (retval <= 0x10FFFF))
            return retval;
    } // else if

    // Five and six octet sequences became illegal in rfc3629.
    //  We throw the codepoint away, but parse them to make sure we move
    //  ahead the right number of bytes and don't overflow the buffer.
    else
    {
        // Lead bytes below 252 announce five octets, the rest six.
        const int trailing = (octet < 252) ? 4 : 5;
        int i;

        (*_str)++;  // advance at least one byte in case of an error.
        for (i = 0; i < trailing; i++)
        {
            octet = (uint32) ((uint8) *(++str));
            if ((octet & (128+64)) != 128)  // Format isn't 10xxxxxx?
                return UNICODE_BOGUS_CHAR_VALUE;
        } // for

        *_str += trailing;  // skip to next possible start of codepoint.
        return UNICODE_BOGUS_CHAR_VALUE;
    } // else

    // Well-formed bit pattern, but overlong or outside the legal range.
    return UNICODE_BOGUS_CHAR_VALUE;
} // utf8codepoint
// these utf-8 functions may move to mojosetup.c some day...

void utf8ToUcs2(const char *src, uint16 *dst, uint64 len)
{
Expand Down
8 changes: 8 additions & 0 deletions universal.h
Expand Up @@ -96,6 +96,13 @@ extern const char *GBuildVer;
// Static, non-stack memory for scratch work...not thread safe!
extern uint8 scratchbuf_128k[128 * 1024];


// Sentinel that utf8codepoint() returns for input it rejects.
#define UNICODE_BOGUS_CHAR_VALUE 0xFFFFFFFF
// NOTE(review): presumably the character substituted for a bogus value when
//  converting text; confirm against the callers.
#define UNICODE_BOGUS_CHAR_CODEPOINT '?'
// Decode one codepoint from the UTF-8 string that (*_str) points to.
//  Returns 0 at the null terminator, leaving (*_str) where it was. For a
//  well-formed sequence, returns the codepoint and moves (*_str) past it.
//  Returns UNICODE_BOGUS_CHAR_VALUE for sequences it rejects: stray or
//  missing continuation bytes, overlong or out-of-range values, and all
//  five and six octet sequences.
uint32 utf8codepoint(const char **_str);


// Format a string, sort of (but not exactly!) like sprintf().
// The only formatters accepted are %0 through %9 (and %%), which do not
// have to appear in order in the string, but match the varargs passed to the
Expand Down Expand Up @@ -347,6 +354,7 @@ typedef struct MojoSetupEntryPoints
char *(*format)(const char *fmt, ...);
const char *(*numstr)(int val);
uint32 (*ticks)(void);
uint32 (*utf8codepoint)(const char **_str);
} MojoSetupEntryPoints;
extern MojoSetupEntryPoints GEntryPoints;

Expand Down

0 comments on commit f06a586

Please sign in to comment.