So I decided to do something exciting. All over Pango, and in other libraries that use UTF-8 as their internal encoding, I have seen code that has to hardcode the UTF-8 byte sequence of a character. For example, from the Pango sources:
/* First try using a specific ellipsis character in the best matching font
*/
/* Each string literal below spells out, as octal escapes, the UTF-8 bytes of
 * the code point named in the comment next to it. */
if (state->ellipsis_is_cjk)
ellipsis_text = "\342\213\257"; /* U+22EF: MIDLINE HORIZONTAL ELLIPSIS, used for CJK */
else
ellipsis_text = "\342\200\246"; /* U+2026: HORIZONTAL ELLIPSIS */
char ellipsis_text[] = UNICODE_TO_UTF8(0x22EF);
#ifndef _STATIC_UTF8_LONG_H
#define _STATIC_UTF8_LONG_H

/* Continuation octet: the six bits of Char that start at bit Shift, tagged
 * with the 10xxxxxx marker. */
#define UNICHAR_TO_UTF8_TAIL_(Char, Shift) \
  ((((Char) >> (Shift)) & 0x3F) | 0x80)

/*
 * UNICHAR_TO_UTF8(Char)
 *
 * Expands to a compound literal of type const char[7] holding the UTF-8
 * encoding of the code point Char (one to six octets, chosen by the range
 * Char falls in) followed by zero bytes up to the end of the array, so the
 * result is always a null-terminated string.
 *
 * Char is expanded many times; pass an expression without side effects.
 */
#define UNICHAR_TO_UTF8(Char) \
  (const char []) \
  { \
    /* octet 1: lead byte, or Char itself when it fits in seven bits */ \
    (Char) >= 0x04000000 ? ((Char) >> 30) | 0xFC : \
    (Char) >= 0x00200000 ? ((Char) >> 24) | 0xF8 : \
    (Char) >= 0x00010000 ? ((Char) >> 18) | 0xF0 : \
    (Char) >= 0x00000800 ? ((Char) >> 12) | 0xE0 : \
    (Char) >= 0x00000080 ? ((Char) >> 6) | 0xC0 : \
    (Char), \
    /* octet 2 */ \
    (Char) >= 0x04000000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 24) : \
    (Char) >= 0x00200000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 18) : \
    (Char) >= 0x00010000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 12) : \
    (Char) >= 0x00000800 ? UNICHAR_TO_UTF8_TAIL_ (Char, 6) : \
    (Char) >= 0x00000080 ? UNICHAR_TO_UTF8_TAIL_ (Char, 0) : \
    0 /* null-terminator */, \
    /* octet 3 */ \
    (Char) >= 0x04000000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 18) : \
    (Char) >= 0x00200000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 12) : \
    (Char) >= 0x00010000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 6) : \
    (Char) >= 0x00000800 ? UNICHAR_TO_UTF8_TAIL_ (Char, 0) : \
    0 /* null-terminator */, \
    /* octet 4 */ \
    (Char) >= 0x04000000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 12) : \
    (Char) >= 0x00200000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 6) : \
    (Char) >= 0x00010000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 0) : \
    0 /* null-terminator */, \
    /* octet 5 */ \
    (Char) >= 0x04000000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 6) : \
    (Char) >= 0x00200000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 0) : \
    0 /* null-terminator */, \
    /* octet 6 */ \
    (Char) >= 0x04000000 ? UNICHAR_TO_UTF8_TAIL_ (Char, 0) : \
    0 /* null-terminator */, \
    /* final null-terminator, needed when all six octets are used */ \
    0 \
  }

#endif /* !_STATIC_UTF8_LONG_H */
#include <stdio.h>
#include "static-utf8-long.h"
/* Demo for static-utf8-long.h: print U+06CC as UTF-8, followed by a newline. */
int
main()
{
  const char *encoded = UNICHAR_TO_UTF8 (0x06CC);

  printf ("%s\n", encoded);
  return 0;
}
I compiled it with gcc under different optimization options, and no wonder: none of them kicked the trailing zero bytes out. So I needed to continue. Good, it was only 6 by now. For sure I needed to use preprocessor conditionals. But then, in preprocessor conditionals, you can only use preprocessor symbols. The rest is obvious now:
#ifndef Char
# error Char undefined
#else
/*
 * Expression template: expands to a compound literal of type const char[]
 * holding the UTF-8 encoding of the code point in the preprocessor symbol
 * Char, followed by one null terminator.
 *
 * Each octet sits behind its own #if on Char, so octets the encoding does
 * not need are never emitted and the array has no padding.  Because #if is
 * evaluated by the preprocessor, Char must be a macro that expands to an
 * integer constant expression.
 *
 * Char is #undef'd at the end, so it has to be defined again before every
 * inclusion.
 */
(const char [])
{
/* Holds for every non-negative Char: the first octet is always present. */
#if Char >= 0x00000000
/* first octet */
(Char) < 0x00000080 ? (Char) :
(Char) < 0x00000800 ? ((Char) >> 6) | 0xC0 :
(Char) < 0x00010000 ? ((Char) >> 12) | 0xE0 :
(Char) < 0x00200000 ? ((Char) >> 18) | 0xF0 :
(Char) < 0x04000000 ? ((Char) >> 24) | 0xF8 :
((Char) >> 30) | 0xFC,
#endif
/* Two or more octets are needed from 0x80 upwards. */
#if Char >= 0x00000080
/* second octet */
(Char) < 0x00000800 ? ((Char) & 0x3F) | 0x80 :
(Char) < 0x00010000 ? (((Char) >> 6) & 0x3F) | 0x80 :
(Char) < 0x00200000 ? (((Char) >> 12) & 0x3F) | 0x80 :
(Char) < 0x04000000 ? (((Char) >> 18) & 0x3F) | 0x80 :
(((Char) >> 24) & 0x3F) | 0x80,
#endif
/* Three or more octets are needed from 0x800 upwards. */
#if Char >= 0x00000800
/* third octet */
(Char) < 0x00010000 ? ((Char) & 0x3F) | 0x80 :
(Char) < 0x00200000 ? (((Char) >> 6) & 0x3F) | 0x80 :
(Char) < 0x04000000 ? (((Char) >> 12) & 0x3F) | 0x80 :
(((Char) >> 18) & 0x3F) | 0x80,
#endif
/* Four or more octets are needed from 0x10000 upwards. */
#if Char >= 0x00010000
/* fourth octet */
(Char) < 0x00200000 ? ((Char) & 0x3F) | 0x80 :
(Char) < 0x04000000 ? (((Char) >> 6) & 0x3F) | 0x80 :
(((Char) >> 12) & 0x3F) | 0x80,
#endif
/* Five or more octets are needed from 0x200000 upwards. */
#if Char >= 0x00200000
/* fifth octet */
(Char) < 0x04000000 ? ((Char) & 0x3F) | 0x80 :
(((Char) >> 6) & 0x3F) | 0x80,
#endif
/* Six octets are needed from 0x4000000 upwards. */
#if Char >= 0x04000000
/* sixth octet */
((Char) & 0x3F) | 0x80,
#endif
0 /* null-terminator */
}
/* Consume the argument so a later inclusion cannot reuse it by accident. */
#undef Char
#endif
#include <stdio.h>
/* Demo for static-utf8-short.h: print U+06CC as UTF-8, followed by a newline.
 * The header is an expression template, so it is included in the middle of
 * an initializer, with its argument passed through the Char macro. */
int
main()
{
# define Char 0x06CC
  const char *encoded =
# include "static-utf8-short.h"
    ;

  printf ("%s\n", encoded);
  return 0;
}