From 7fcc071ca879a2638e24ab42d2dda4411b505bd2 Mon Sep 17 00:00:00 2001
From: "Ryan C. Gordon" <icculus@icculus.org>
Date: Sun, 5 Nov 2006 11:10:14 +0000
Subject: [PATCH] Initial Unicode work.

---
 CHANGELOG               |   2 +-
 Makefile.am.newautomake |   1 +
 Makefile.am.oldautomake |   1 +
 makeos2.cmd             |   6 ++
 physfs.dsp              |   4 ++
 physfs.h                | 156 ++++++++++++++++++++++++++++++++++++++++
 physfs.vcproj           |   3 +
 physfsMPW.make          |   3 +
 physfs_static.dsp       |   4 ++
 9 files changed, 179 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG b/CHANGELOG
index 02ca4caa..be8d19db 100644
--- a/CHANGELOG
+++ b/CHANGELOG
@@ -2,7 +2,7 @@
  * CHANGELOG.
  */
 
-11052006 - More 7zip archiver work (thanks, Dennis!).
+11052006 - More 7zip archiver work (thanks, Dennis!). Initial Unicode work.
 09272006 - Reworked 7zip archiver (thanks, Dennis!).
 09232006 - Fixed typo in doxygen comment.
 04112006 - Added LZMA archiver...7zip support (thanks, Dennis!).
diff --git a/Makefile.am.newautomake b/Makefile.am.newautomake
index af9afbbe..7121ddd2 100644
--- a/Makefile.am.newautomake
+++ b/Makefile.am.newautomake
@@ -110,6 +110,7 @@ SUBDIRS = platform archivers zlib123 lzma . test extras
 libphysfs_la_SOURCES =	\
 	physfs.c	\
 	physfs_internal.h	\
+	physfs_unicode.c	\
 	physfs_byteorder.c
 
 if BUILD_ZLIB
diff --git a/Makefile.am.oldautomake b/Makefile.am.oldautomake
index 44e9e5cf..5bdcaf7a 100644
--- a/Makefile.am.oldautomake
+++ b/Makefile.am.oldautomake
@@ -9,6 +9,7 @@ libphysfsinclude_HEADERS =	\
 libphysfs_la_SOURCES =	\
 	physfs.c	\
 	physfs_internal.h	\
+	physfs_unicode.c	\
 	physfs_byteorder.c
 
 if BUILD_ZLIB
diff --git a/makeos2.cmd b/makeos2.cmd
index fc3a4e95..da39b87c 100644
--- a/makeos2.cmd
+++ b/makeos2.cmd
@@ -110,6 +110,11 @@ rem goto :dolinking
 @echo  "PHYSFS_getCdRomDirsCallback" >> bin\physfs.def
 @echo  "PHYSFS_getSearchPathCallback" >> bin\physfs.def
 @echo  "PHYSFS_enumerateFilesCallback" >> bin\physfs.def
+@echo  "PHYSFS_utf8toucs2" >> bin\physfs.def
+@echo  "PHYSFS_utf8fromucs2" >> bin\physfs.def
+@echo  "PHYSFS_utf8toucs4" >> bin\physfs.def
+@echo  "PHYSFS_utf8fromucs4" >> bin\physfs.def
+@echo  "PHYSFS_utf8fromlatin1" >> bin\physfs.def
 
 @echo Building export library...
 emximp -o bin/physfs.lib bin/physfs.def
@@ -118,6 +123,7 @@ emximp -o bin/physfs.lib bin/physfs.def
 @echo on
 gcc %CFLAGS% -o bin/physfs.obj physfs.c
 gcc %CFLAGS% -o bin/physfs_byteorder.obj physfs_byteorder.c
+gcc %CFLAGS% -o bin/physfs_unicode.obj physfs_unicode.c
 gcc %CFLAGS% -o bin/os2.obj platform/os2.c
 gcc %CFLAGS% -o bin/dir.obj archivers/dir.c
 gcc %CFLAGS% -o bin/grp.obj archivers/grp.c
diff --git a/physfs.dsp b/physfs.dsp
index 53fc7a12..27007f86 100644
--- a/physfs.dsp
+++ b/physfs.dsp
@@ -149,6 +149,10 @@ SOURCE=.\physfs_byteorder.c
 # End Source File
 # Begin Source File
 
+SOURCE=.\physfs_unicode.c
+# End Source File
+# Begin Source File
+
 SOURCE=.\archivers\qpak.c
 # End Source File
 # Begin Source File
diff --git a/physfs.h b/physfs.h
index c39972d7..2320b57d 100644
--- a/physfs.h
+++ b/physfs.h
@@ -147,6 +147,40 @@
  *   - .WAD (DOOM engine archives)
  *   - .MIX (Older Westwood games archives)
  *
+ *
+ * String policy for PhysicsFS 2.0 and later:
+ *
+ * PhysicsFS 1.0 dealt with null-terminated ASCII strings. All high ASCII
+ *  chars resulted in undefined behaviour, and there was no Unicode support.
+ *
+ * All strings passed through PhysicsFS are in null-terminated UTF-8 format.
+ *  This means that if all you care about is English (ASCII characters <= 127)
+ *  then you just use regular C strings. If you care about Unicode (and you
+ *  should!) then you need to figure out what your platform wants, needs, and
+ *  offers. If you are on Windows and build with Unicode support, your TCHAR
+ *  strings are two bytes per character (this is called "UCS-2 encoding"). You
+ *  should convert them to UTF-8 before handing them to PhysicsFS with
+ *  PHYSFS_utf8fromucs2(). If you're using Unix or Mac OS X, your wchar_t
+ *  strings are four bytes per character ("UCS-4 encoding"). Use
+ *  PHYSFS_utf8fromucs4(). Mac OS X can give you UTF-8 directly from a CFString,
+ *  and many Unixes generally give you C strings in UTF-8 format everywhere.
+ *  If you have a single-byte high ASCII charset, like so many European
+ *  "codepages" you may be out of luck. We'll convert from "Latin1" to UTF-8
+ *  only, and never back to Latin1. If you're above ASCII 127, all bets are
+ *  off: move to Unicode or use your platform's facilities. Passing a C string
+ *  with high-ASCII data that isn't UTF-8 encoded will NOT do what you expect!
+ *
+ * Naturally, there's also PHYSFS_utf8toucs2() and PHYSFS_utf8toucs4() to get
+ *  data back into a format you like. Behind the scenes, PhysicsFS will use
+ *  Unicode where possible: the UTF-8 strings on Windows will be converted
+ *  and used with the wide-character (Unicode) Windows APIs, for example.
+ *
+ * PhysicsFS offers basic encoding conversion support, but not a whole string
+ *  library. Get your stuff into whatever format you can work with.
+ *
+ *
+ * Other stuff:
+ *
  * Please see the file LICENSE in the source's root directory for licensing
  *  and redistribution rights.
  *
@@ -1989,6 +2023,128 @@ __EXPORT__ void PHYSFS_enumerateFilesCallback(const char *dir,
                                               PHYSFS_EnumFilesCallback c,
                                               void *d);
 
+/**
+ * \fn void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst, PHYSFS_uint64 len)
+ * \brief Convert a UCS-4 string to a UTF-8 string.
+ *
+ * UCS-4 strings are 32-bits per character: \c wchar_t on Unix.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is the same size as the source buffer. UTF-8
+ *  never uses more than 32-bits per character, so while it may shrink a UCS-4
+ *  string, it will never expand it.
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UTF-8
+ *  sequence at the end.
+ *
+ *   \param src Null-terminated source string in UCS-4 format.
+ *   \param dst Buffer to store converted UTF-8 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8fromucs4(const PHYSFS_uint32 *src, char *dst,
+                                    PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst, PHYSFS_uint64 len)
+ * \brief Convert a UTF-8 string to a UCS-4 string.
+ *
+ * UCS-4 strings are 32-bits per character: \c wchar_t on Unix.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is four times the size of the source buffer.
+ *  UTF-8 uses from one to four bytes per character, but UCS-4 always uses
+ *  four, so an entirely low-ASCII string will quadruple in size!
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UCS-4
+ *  sequence at the end.
+ *
+ *   \param src Null-terminated source string in UTF-8 format.
+ *   \param dst Buffer to store converted UCS-4 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8toucs4(const char *src, PHYSFS_uint32 *dst,
+                                  PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst, PHYSFS_uint64 len)
+ * \brief Convert a UCS-2 string to a UTF-8 string.
+ *
+ * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building
+ *  with Unicode support.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is double the size of the source buffer.
+ *  UTF-8 never uses more than 32-bits per character, so while it may shrink
+ *  a UCS-2 string, it may also expand it.
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UTF-8
+ *  sequence at the end.
+ *
+ * Please note that UCS-2 is not UTF-16; we do not support the "surrogate"
+ *  values at this time.
+ *
+ *   \param src Null-terminated source string in UCS-2 format.
+ *   \param dst Buffer to store converted UTF-8 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8fromucs2(const PHYSFS_uint16 *src, char *dst,
+                                    PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst, PHYSFS_uint64 len)
+ * \brief Convert a UTF-8 string to a UCS-2 string.
+ *
+ * UCS-2 strings are 16-bits per character: \c TCHAR on Windows, when building
+ *  with Unicode support.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is double the size of the source buffer.
+ *  UTF-8 uses from one to four bytes per character, but UCS-2 always uses
+ *  two, so an entirely low-ASCII string will double in size!
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UCS-2
+ *  sequence at the end.
+ *
+ * Please note that UCS-2 is not UTF-16; we do not support the "surrogate"
+ *  values at this time.
+ *
+ *   \param src Null-terminated source string in UTF-8 format.
+ *   \param dst Buffer to store converted UCS-2 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8toucs2(const char *src, PHYSFS_uint16 *dst,
+                                  PHYSFS_uint64 len);
+
+/**
+ * \fn void PHYSFS_utf8fromlatin1(const char *src, char *dst, PHYSFS_uint64 len)
+ * \brief Convert a Latin1 string to a UTF-8 string.
+ *
+ * Latin1 strings are 8-bits per character: a popular "high ASCII"
+ *  encoding.
+ *
+ * To ensure that the destination buffer is large enough for the conversion,
+ *  please allocate a buffer that is double the size of the source buffer.
+ *  UTF-8 expands latin1 codepoints over 127 from 1 to 2 bytes, so the string
+ *  may grow in some cases.
+ *
+ * Strings that don't fit in the destination buffer will be truncated, but
+ *  will always be null-terminated and never have an incomplete UTF-8
+ *  sequence at the end.
+ *
+ * Please note that we do not supply a UTF-8 to Latin1 converter, since Latin1
+ *  can't express most Unicode codepoints. It's a legacy encoding; you should
+ *  be converting away from it at all times.
+ *
+ *   \param src Null-terminated source string in Latin1 format.
+ *   \param dst Buffer to store converted UTF-8 string.
+ *   \param len Size, in bytes, of destination buffer.
+ */
+__EXPORT__ void PHYSFS_utf8fromlatin1(const char *src, char *dst,
+                                  PHYSFS_uint64 len);
 
 /* Everything above this line is part of the PhysicsFS 2.0 API. */
 
diff --git a/physfs.vcproj b/physfs.vcproj
index a51fdae3..3403c46d 100644
--- a/physfs.vcproj
+++ b/physfs.vcproj
@@ -193,6 +193,9 @@
 			<File
 				RelativePath=".\physfs_byteorder.c">
 			</File>
+			<File
+				RelativePath=".\physfs_unicode.c">
+			</File>
 			<File
 				RelativePath="archivers\qpak.c">
 			</File>
diff --git a/physfsMPW.make b/physfsMPW.make
index 7ed6bf45..a8dae50f 100644
--- a/physfsMPW.make
+++ b/physfsMPW.make
@@ -27,6 +27,7 @@ SrcFiles        =  
 				  :archivers:wad.c ¶
 				  :archivers:zip.c ¶
 				  physfs.c ¶
+				  physfs_unicode.c ¶
 				  physfs_byteorder.c ¶
 				  :platform:macclassic.c ¶
 				  :zlib123:adler32.c ¶
@@ -56,6 +57,7 @@ ObjFiles-PPC    =  
 				  "{ObjDir}zip.c.x" ¶
 				  "{ObjDir}physfs.c.x" ¶
 				  "{ObjDir}physfs_byteorder.c.x" ¶
+				  "{ObjDir}physfs_unicode.c.x" ¶
 				  "{ObjDir}macclassic.c.x" ¶
 				  "{ObjDir}adler32.c.x" ¶
 				  "{ObjDir}compress.c.x" ¶
@@ -115,6 +117,7 @@ PhysicsFS  
 "{ObjDir}zip.c.x"  Ä  :archivers:zip.c
 "{ObjDir}physfs.c.x"  Ä  physfs.c
 "{ObjDir}physfs_byteorder.c.x"  Ä  physfs_byteorder.c
+"{ObjDir}physfs_unicode.c.x"  Ä  physfs_unicode.c
 "{ObjDir}macclassic.c.x"  Ä  :platform:macclassic.c
 "{ObjDir}adler32.c.x"  Ä  :zlib123:adler32.c
 "{ObjDir}compress.c.x"  Ä  :zlib123:compress.c
diff --git a/physfs_static.dsp b/physfs_static.dsp
index 9a72024e..c22c0dc0 100644
--- a/physfs_static.dsp
+++ b/physfs_static.dsp
@@ -159,6 +159,10 @@ SOURCE=.\physfs_byteorder.c
 # End Source File
 # Begin Source File
 
+SOURCE=.\physfs_unicode.c
+# End Source File
+# Begin Source File
+
 SOURCE=.\archivers\qpak.c
 # End Source File
 # Begin Source File