Lhogho  0.0.028
 All Data Structures Files Functions Variables Typedefs Macros Pages
unicode.h File Reference

Go to the source code of this file.

Converters

Used to convert one string to another. In ASCII mode all conversions are like identities - i.e. no conversion is actually done.

#define TEXT(a)   L##a
 fix string constants More...
 
#define FILENAME(x)   UTF16_to_ASCII(x)
 fix file names More...
 
#define UNFILENAME(x)   ASCII_to_UTF16(x)
 unfix file names More...
 

Format strings

#define NULL_CHAR   TEXT('\0')
 
#define STR   "%S"
 
#define CHR   "%C"
 

String functions

They are used to maintain dual UNICODE/ASCII processing. This is needed because the multibyte and widechar versions of these functions have different names.

#define NO_MORE   WEOF
 
#define PUTCHAR(x, y)   putwc(x,y)
 
#define GETCHAR(x)   getwc(x)
 
#define STRLEN(x)   wcslen(x)
 
#define STRNCPY(x, y, z)   wcsncpy(x,y,z)
 
#define STRCMP(x, y)   wcscmp(x,y)
 
#define STRTOD(x, y)   wcstod(x,y)
 
#define STRTOL(x, y)   wcstoll(x,y,0)
 
#define TOUPPER(x)   towupper(x)
 
#define TOLOWER(x)   towlower(x)
 
#define ISDIGIT(x)   iswdigit(x)
 
#define STRCHR(x, y)   wcschr(x,y)
 
#define PRINT(x,...)   printf(x,__VA_ARGS__)
 
#define STRFTIME(x, y, z, t)   wcsftime(x,y,z,t)
 
#define SPRINTF(x, n, y, z)   swprintf(x,n,y,z)
 
#define SPRINT(x, n, y,...)   swprintf(x,n,y,__VA_ARGS__)
 

Functions

chars_t UTF8_to_UTF16 (byte_t *source)
 converts UTF-8 to UTF-16 More...
 
byte_t * UTF16_to_UTF8 (chars_t source)
 
char * UTF16_to_ASCII (chars_t ws)
 converts UTF-16 to ASCII More...
 
chars_t ASCII_to_UTF16 (const char *s)
 converts ASCII to UTF-16 More...
 
chars_t ASCII_to_ASCII (const char *s)
 converts ASCII to ASCII More...
 
void * load_file (chars_t wfilename, int *filesize)
 load text file into a word atom More...
 

Macro Definition Documentation

#define TEXT (   a)    L##a

Definition at line 74 of file unicode.h.

#define FILENAME (   x)    UTF16_to_ASCII(x)

Definition at line 75 of file unicode.h.

#define UNFILENAME (   x)    ASCII_to_UTF16(x)

Definition at line 76 of file unicode.h.

#define NULL_CHAR   TEXT('\0')

Definition at line 89 of file unicode.h.

#define STR   "%S"

Definition at line 91 of file unicode.h.

#define CHR   "%C"

Definition at line 92 of file unicode.h.

#define NO_MORE   WEOF

Definition at line 109 of file unicode.h.

#define PUTCHAR (   x,
  y
)    putwc(x,y)

Definition at line 110 of file unicode.h.

#define GETCHAR (   x)    getwc(x)

Definition at line 111 of file unicode.h.

#define STRLEN (   x)    wcslen(x)

Definition at line 112 of file unicode.h.

#define STRNCPY (   x,
  y,
  z
)    wcsncpy(x,y,z)

Definition at line 113 of file unicode.h.

#define STRCMP (   x,
  y
)    wcscmp(x,y)

Definition at line 115 of file unicode.h.

#define STRTOD (   x,
  y
)    wcstod(x,y)

Definition at line 116 of file unicode.h.

#define STRTOL (   x,
  y
)    wcstoll(x,y,0)

Definition at line 117 of file unicode.h.

#define TOUPPER (   x)    towupper(x)

Definition at line 118 of file unicode.h.

#define TOLOWER (   x)    towlower(x)

Definition at line 119 of file unicode.h.

#define ISDIGIT (   x)    iswdigit(x)

Definition at line 120 of file unicode.h.

#define STRCHR (   x,
  y
)    wcschr(x,y)

Definition at line 121 of file unicode.h.

#define PRINT (   x,
  ... 
)    printf(x,__VA_ARGS__)

Definition at line 122 of file unicode.h.

#define STRFTIME (   x,
  y,
  z,
  t
)    wcsftime(x,y,z,t)

Definition at line 123 of file unicode.h.

#define SPRINTF (   x,
  n,
  y,
  z
)    swprintf(x,n,y,z)

Definition at line 128 of file unicode.h.

#define SPRINT (   x,
  n,
  y,
  ... 
)    swprintf(x,n,y,__VA_ARGS__)

Definition at line 129 of file unicode.h.

Function Documentation

chars_t UTF8_to_UTF16 ( byte_t *  source)
Parameters
source: characters to convert
Returns
converted string
Note
defined only if UNICODE_CHARS symbol is defined

Converts string of multibyte UTF-8 encoding to widechar UTF-16LE encoding.

Definition at line 230 of file unicode.c.

231 {
232  int len = strlen ((char*)source);
233  wchar_t *buffer = alloca( CHAR_SIZE*(len+1) );
234  wchar_t *buf = buffer;
235  unsigned long wc;
236 
237  while( len>0 )
238  {
239  if( (*source & 0x80)==0x00 )
240  { // 00-7F [0zzz-zzzz]
241  wc = (byte_t)*source++;
242  len -= 1;
243  }
244  else if( (*source & 0xE0)==0xC0 )
245  { // 080-7FF [110y-yyyy] [10zz-zzzz]
246  wc = *source & 0x1F;
247  source++;
248  wc = (wc<<6) + (*source & 0x3F);
249  source++;
250  len -= 2;
251  }
252  else if( ((byte_t)*source & 0xF0)==0xE0 )
253  { // 0800-FFFF [1110-xxxx] [10yy-yyyy] [10zz-zzzz]
254  wc = *source & 0x1F;
255  source++;
256  wc = (wc<<6) + (*source & 0x3F);
257  source++;
258  wc = (wc<<6) + (*source & 0x3F);
259  source++;
260  len -= 3;
261  }
262  else
263  { // 01000-10FFFF [1111-wwww] [10xx-xxxx] [10yy-yyyy] [10zz-zzzz]
264  wc = *source & 0x1F;
265  source++;
266  wc = (wc<<6) + (*source & 0x3F);
267  source++;
268  wc = (wc<<6) + (*source & 0x3F);
269  source++;
270  wc = (wc<<6) + (*source & 0x3F);
271  source++;
272  len -= 4;
273  }
274  *buf = (unsigned short)wc;
275  buf++;
276  }
277 
278  *buf = L'\0';
279 
280  len = CHAR_SIZE*(buf-buffer+1);
281  buf = ALLOC( len );
282  memcpy( buf, buffer, len );
283 
284  return buf;
285 }
byte_t* UTF16_to_UTF8 ( chars_t  source)

Definition at line 186 of file unicode.c.

187 {
188  int len = STRLEN( source );
189  byte_t *buffer = alloca( len+1 );
190  byte_t *buf = buffer;
191 
192  while( len>0 )
193  {
194  char_t wc = *source;
195  //printf("\nCODE=%4x|",wc);
196  if( wc < 0x0080 )
197  { // 0000-007F
198  // from: [0xxxxxxx]
199  // to: [0xxxxxxx]
200  *buf++ = (byte_t)wc;
201  }
202  else if( wc < 0x0800 )
203  { // 0080-07FF
204  // from: [00000yyy yyxxxxxx]
205  // to: [110yyyyy] [10xxxxxx]
206  *buf++ = 0xC0 | (byte_t)(wc >> 6);
207  *buf++ = 0x80 | (byte_t)(wc & 0x3F);
208  }
209  else
210  { // 0800-FFFF
211  // from: [zzzzyyyy yyxxxxxx]
212  // to: [1110zzzz] [10yyyyyy] [10xxxxxx]
213  *buf++ = 0xE0 | (byte_t)(wc >> 12);
214  *buf++ = 0x80 | (byte_t)((wc >> 6) & 0x3F);
215  *buf++ = 0x80 | (byte_t)(wc & 0x3F);
216  }
217  len--;
218  source++;
219  }
220 
221  *buf = '\0';
222 
223  len = buf-buffer+1;
224  buf = ALLOC( len );
225  memcpy( buf, buffer, len );
226 
227  return buf;
228 }
char* UTF16_to_ASCII ( chars_t  ws)

Converts string of widechar UTF-16LE encoding to ASCII encoding. The input string is not freed.

Parameters
ws: characters to convert
Returns
converted string
Note
function defined only if UNICODE_CHARS symbol is defined

Definition at line 76 of file unicode.c.

77 {
78  size_t len = wcslen( ws );
79  char* buffer = alloca( 4*(len+1) ); // assume one utf16 can expand to 4 bytes max
80  char* buf = buffer;
81  mbstate_t state;
82  size_t nbytes;
83 
84  memset (&state, '\0', sizeof (state));
85  while (len>0)
86  {
87  nbytes = wcrtomb (buf, *ws, &state);
88  buf += nbytes;
89  len -= 1;
90  ws += 1;
91  }
92  *buf = '\0';
93 
94  len = buf-buffer+1;
95  buf = ALLOC( len );
96  memcpy( buf, buffer, len );
97  return buf;
98 }
chars_t ASCII_to_UTF16 ( const char *  s)

Converts string of ASCII encoding to widechar UTF-16LE encoding.

Parameters
s: characters to convert
Returns
converted string
Note
defined only if UNICODE_CHARS symbol is defined

Definition at line 116 of file unicode.c.

117 {
118  //printf("===%s===\n",s);
119 
120  size_t len = strlen(s);
121  wchar_t *buffer = ALLOC( CHAR_SIZE*(len+1) );
122  wchar_t *buf = buffer;
123  mbstate_t state;
124  size_t nbytes;
125  memset (&state, '\0', sizeof (state));
126  while (len>0)
127  {
128  nbytes = mbrtowc (buf, s, len, &state);
129  buf++;
130  len -= nbytes;
131  s += nbytes;
132  }
133  *buf = L'\0';
134 
135  len = CHAR_SIZE*(buf-buffer+1);
136  buf = ALLOC( len );
137  memcpy( buf, buffer, len );
138  DEALLOC( buffer );
139  return buf;
140 }
chars_t ASCII_to_ASCII ( const char *  s)
Parameters
s: characters to convert
Returns
converted string
Note
defined only if UNICODE_CHARS symbol is not defined

Converts string of ASCII encoding to ASCII. Actually does not convert anything. This function is used because it uses the ALLOC() macro which helps tracing memory allocation.

Definition at line 159 of file unicode.c.

160 {
161 #ifndef UNICODE_CHARS
162  size_t len = STRLEN(s)+1;
163  char* buf = ALLOC( len );
164  memcpy( buf, s, len );
165  return buf;
166 #else
167  return (chars_t)0;
168 #endif //UNICODE_CHARS
169 }
void* load_file ( chars_t  wfilename,
int *  filesize 
)
Parameters
wfilename: file name
filesize: file size
Returns
word atom

Loads a text file which can be in ASCII, multibyte UTF-8 or widechar UTF-16LE encoding. The size of the file is returned in filesize so that the caller can append a null character if needed.

Definition at line 303 of file unicode.c.

304 {
305  FILE* file; // file stream
306  void* buffer; // file buffer
307  struct stat st_info; // file attributes
308 
309  // convert filename to multibyte
310  char* filename = FILENAME(wfilename);
311 
312  errno = 0;
313 
314  // open file
315  file = fopen( filename, "rb" );
316  if( errno )
317  {
318  // searching failed, try again looking in subfolder
319  // lib of the folder where the compiler is
320  char buf[PATH_MAX+1];
322  char* path = dirname(buf);
323  int pathlen = strlen(path);
324  strncpy(buf,path,pathlen);
325 
326  int filelen = strlen(filename);
327  if( pathlen+filelen+6 > PATH_MAX ) filelen=0;
328 #ifdef WINDOWS
329  strncpy(buf+pathlen,"\\lib\\",5);
330 #else
331  strncpy(buf+pathlen,"/lib/",5);
332 #endif
333  strncpy(buf+5+pathlen,filename,filelen);
334  buf[pathlen+5+filelen] = '\0';
335 
336  //printf("failed %s, try '%s' while compiler is %s\n",filename,buf,option_compiler_filename);
337  errno = 0;
338  file = fopen( buf, "rb" );
339  if( errno )
340  {
341  //printf("failed again\n");
342  return NULL;
343  }
344  }
345 
346  // get file size
347  fstat( fileno(file), &st_info );
348  *filesize = st_info.st_size;
349 
350  // file name is not needed any more
351  #ifdef UNICODE_CHARS
352  DEALLOC( filename );
353  #endif
354 
355  // allocate buffer
356  buffer = ALLOC(*filesize+1);
357  #ifdef SAFEMODE
358  if( !buffer )
359  {
360  fclose( file );
361  errno = ENOMEM;
362  return NULL;
363  }
364  #endif //SAFEMODE
365 
366  // read file into the buffer
367  if( *filesize && !fread(buffer,1,*filesize,file ) )
368  {
369  #ifdef SAFEMODE
370  DEALLOC( buffer );
371  return NULL;
372  #endif //SAFEMODE
373  }
374 
375  fclose( file );
376  #ifdef SAFEMODE
377  if( errno )
378  {
379  free( buffer );
380  return NULL;
381  }
382  #endif //SAFEMODE
383 
384  return buffer;
385 }

[ HOME | INDEX | ATOMS | VARS | REFERENCE ]
Lhogho Developer's Documentation
Wed Jul 10 2013