Main Page | Modules | Alphabetical List | Data Structures | File List | Data Fields | Globals | Related Pages

charconvert.c

Go to the documentation of this file.
00001 
00008 /* {{{ Initial headers */
00009 /*
00010  * $LastChangedDate: 2004-04-01 18:34:17 +0200 (Thu, 01 Apr 2004) $
00011  * $LastChangedRevision: 50 $
00012  * $LastChangedBy: ckruse $
00013  *
00014  */
00015 /* }}} */
00016 
00017 /* {{{ Includes */
00018 #include "config.h"
00019 #include "defines.h"
00020 
00021 #include <errno.h>
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <iconv.h>
00025 #include <string.h>
00026 
00027 #include "entitytable.h"
00028 
00029 #include "utils.h"
00030 /* }}} */
00031 
/*
 * Returns: int (number of bytes consumed, 1..6, on success; EILSEQ on failure)
 * Parameters:
 *   - const u_char *s   pointer to the first byte of the sequence to decode
 *   - size_t n          number of bytes that may be read starting at s
 *   - u_int32_t *num    receives the decoded code point on success
 *
 * this function decodes exactly one UTF-8 sequence. It accepts the original
 * ISO 10646 forms of up to six bytes and rejects truncated sequences, bad
 * continuation bytes and overlong encodings.
 *
 * NOTE: the failure value EILSEQ is a positive errno constant, so callers
 * have to compare the result with EILSEQ; a "<= 0" test never detects failure.
 */
int utf8_to_unicode(const u_char *s,size_t n,u_int32_t *num) {
  u_char c;

  /* nothing readable: do not touch s[0] */
  if(s == NULL || num == NULL || n == 0) return EILSEQ;

  c = s[0];

  if(c < 0x80) {
    /* plain ASCII */
    *num = c;
    return 1;
  }
  else if(c < 0xc2) return EILSEQ; /* stray continuation byte or overlong 2-byte lead (0xc0, 0xc1) */
  else if(c < 0xe0) {
    if(n < 2) return EILSEQ;

    /* (x ^ 0x80) < 0x40 is true exactly for continuation bytes 0x80..0xbf */
    if(!((s[1] ^ 0x80) < 0x40)) return EILSEQ;
    *num = ((u_int32_t)(c & 0x1f) << 6) | (u_int32_t)(s[1] ^ 0x80);
    return 2;
  }
  else if(c < 0xf0) {
    if(n < 3) return EILSEQ;
    /* the last clause rejects overlong forms (lead 0xe0 needs s[1] >= 0xa0) */
    if(!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (c >= 0xe1 || s[1] >= 0xa0))) return EILSEQ;

    *num = ((u_int32_t)(c & 0x0f) << 12) | ((u_int32_t)(s[1] ^ 0x80) << 6) | (u_int32_t)(s[2] ^ 0x80);
    return 3;
  }
  else if(c < 0xf8) {
    if(n < 4) return EILSEQ;
    if(!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90))) return EILSEQ;

    *num = ((u_int32_t)(c & 0x07) << 18) | ((u_int32_t)(s[1] ^ 0x80) << 12) | ((u_int32_t)(s[2] ^ 0x80) << 6) | (u_int32_t)(s[3] ^ 0x80);
    return 4;
  }
  else if(c < 0xfc) {
    if(n < 5) return EILSEQ;
    if(!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (c >= 0xf9 || s[1] >= 0x88))) return EILSEQ;

    *num = ((u_int32_t)(c & 0x03) << 24) | ((u_int32_t)(s[1] ^ 0x80) << 18) | ((u_int32_t)(s[2] ^ 0x80) << 12) | ((u_int32_t)(s[3] ^ 0x80) << 6) | (u_int32_t)(s[4] ^ 0x80);
    return 5;
  }
  else if(c < 0xfe) {
    if(n < 6) return EILSEQ;
    if(!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (s[5] ^ 0x80) < 0x40 && (c >= 0xfd || s[1] >= 0x84))) return EILSEQ;

    *num = ((u_int32_t)(c & 0x01) << 30) | ((u_int32_t)(s[1] ^ 0x80) << 24) | ((u_int32_t)(s[2] ^ 0x80) << 18) | ((u_int32_t)(s[3] ^ 0x80) << 12) | ((u_int32_t)(s[4] ^ 0x80) << 6) | (u_int32_t)(s[5] ^ 0x80);
    return 6;
  }

  /* 0xfe and 0xff never start a sequence */
  return EILSEQ;
}
00078 
/*
 * Returns: int (0 if the checked bytes are valid UTF-8, -1 otherwise)
 * Parameters:
 *   - const u_char *str  the bytes to check
 *   - size_t len         the maximum number of bytes to check
 *
 * this function decodes str sequence by sequence with utf8_to_unicode().
 * Checking stops after len bytes or at the first NUL byte, whichever comes
 * first. A NULL str is reported as invalid.
 */
int is_valid_utf8_string(const u_char *str,size_t len) {
  const u_char *ptr = str;
  u_int32_t cp; /* decoded value is not needed, only the verdict */
  int ret;

  if(str == NULL) return -1;

  /* test len first so that str[len] is never read */
  while(len > 0 && *ptr) {
    ret = utf8_to_unicode(ptr,len,&cp);

    /* EILSEQ is positive, so it must be compared explicitly */
    if(ret == EILSEQ || ret <= 0 || (size_t)ret > len) {
      return -1;
    }

    ptr += ret;
    len -= (size_t)ret;
  }

  return 0;
}
00095 
00096 /*
00097  * Returns: u_char * (NULL on failure, a string-array on success)
00098  * Parameters:
00099  *   - const u_char *toencode     the string which has to be encoded
00100  *   - size_t in_len              the length of the string to convert
00101  *   - const u_char *from_charset the starting charset
00102  *   - const u_char *to_charset   the target charset
00103  *   - size_t *out_len_p          pointer to a variable; will be filled with the length of the output
00104  *
00105  * this function tries to convert a string from one charset to another charset
00106  *
00107  */
00108 u_char *charset_convert(const u_char *toencode,size_t in_len,const u_char *from_charset,const u_char *to_charset,size_t *out_len_p) {
00109   iconv_t cd;
00110   size_t in_left, out_size, out_left;
00111   u_char *out_p, *out_buf, *tmp_buf;
00112   size_t bsz, result = 0;
00113 
00114   cd = iconv_open(to_charset,from_charset);
00115 
00116   if(cd == (iconv_t)(-1)) {
00117     return NULL;
00118   }
00119 
00120   in_left  = in_len;
00121   out_left = in_len + 32; /* avoids realloc() in most cases */
00122   out_size = 0;
00123   bsz      = out_left;
00124   out_buf  = fo_alloc(NULL,bsz+1,1,FO_ALLOC_MALLOC);
00125   out_p    = out_buf;
00126 
00127   while(in_left > 0) {
00128     result = iconv(cd,(u_char **)&toencode,&in_left,(u_char **)&out_p,&out_left);
00129     out_size = bsz - out_left;
00130     if(result == (size_t)(-1)) {
00131       if(errno == E2BIG && in_left > 0) {
00132         /* converted string is longer than out buffer */
00133         bsz += in_len;
00134 
00135         /* tmp_buf cannot be NULL because if memory allocation failes, fo_alloc calls exit() */
00136         tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00137 
00138         out_buf  = tmp_buf;
00139         out_p    = out_buf + out_size;
00140         out_left = bsz - out_size;
00141         continue;
00142       }
00143     }
00144 
00145     break;
00146   }
00147 
00148   iconv_close(cd);
00149 
00150   if(result == (size_t)(-1)) {
00151     free(out_buf);
00152     return NULL;
00153   }
00154 
00155   *out_p = '\0';
00156   if(out_len_p) *out_len_p = (size_t)(out_p - out_buf);
00157   return out_buf;
00158 }
00159 
00160 /*
00161  * Returns: u_char * (NULL on failure, a string-array on success)
00162  * Parameters:
00163  *   - const u_char *string  the string which has to be encoded
00164  *
00165  * this function converts HTML named characters to their entities
00166  *
00167  */
00168 u_char *htmlentities(const u_char *string,int sq) {
00169   register u_char *ptr;
00170   t_string new_str;
00171 
00172   str_init(&new_str);
00173 
00174   if(!string) {
00175     return NULL;
00176   }
00177 
00178   for(ptr=(u_char *)string;*ptr;ptr++) {
00179     switch(*ptr) {
00180       case '>':
00181         str_chars_append(&new_str,"&gt;",4);
00182         break;
00183       case '<':
00184         str_chars_append(&new_str,"&lt;",4);
00185         break;
00186       case '&':
00187         str_chars_append(&new_str,"&amp;",5);
00188         break;
00189       case '"':
00190         str_chars_append(&new_str,"&quot;",6);
00191         break;
00192       case '\'':
00193         if(sq) {
00194           str_chars_append(&new_str,"&#39;",5);
00195         }
00196         else {
00197           str_char_append(&new_str,*ptr);
00198         }
00199         break;
00200       default:
00201         str_char_append(&new_str,*ptr);
00202         break;
00203     }
00204   }
00205 
00206   return fo_alloc(new_str.content,new_str.len+1,1,FO_ALLOC_REALLOC);
00207 }
00208 
/*
 * Returns: size_t (the number of bytes successfully written to handle)
 * Parameters:
 *   - const u_char *string  the NUL-terminated string to print
 *   - int sq                if non-zero, the single quote is printed as &#39;
 *   - FILE *handle          the stream to write to
 *
 * this function writes string to handle and replaces ", &, < and > (and the
 * single quote, if sq is set) by their HTML entities. Output stops at the
 * first failed or short write; the bytes written up to that point are
 * reported. A NULL string or handle writes nothing and returns 0.
 */
size_t print_htmlentities_encoded(const u_char *string,int sq,FILE *handle) {
  const u_char *ptr;
  const char *entity;
  size_t written = 0, want, got;

  if(!string || !handle) return 0;

  for(ptr=string;*ptr;ptr++) {
    switch(*ptr) {
      case '"':
        entity = "&quot;";
        break;
      case '&':
        entity = "&amp;";
        break;
      case '\'':
        entity = sq ? "&#39;" : NULL;
        break;
      case '<':
        entity = "&lt;";
        break;
      case '>':
        entity = "&gt;";
        break;
      default:
        entity = NULL;
        break;
    }

    if(entity) {
      want = strlen(entity);
      got  = fwrite(entity,1,want,handle);
    }
    else {
      /* fputc() reports failure with EOF; do not count a byte that was not written */
      want = 1;
      got  = (fputc(*ptr,handle) == EOF) ? 0 : 1;
    }

    written += got;

    /* a short write means the stream is in error; stop here */
    if(got < want) break;
  }

  return written;
}
00252 
00253 /*
00254  * This function converts a string between to charsets and encodes it
00255  * as html (this means, " to &quot;, < to &lt;, > to &gt; and & to &amp;). If a
00256  * sequence cannot be converted to the target charset, it will be converted to
00257  * a named entity (if given) or a unicode entity (&#<number>;)
00258  */
00259 u_char *htmlentities_charset_convert(const u_char *toencode, const u_char *from, const u_char *to,size_t *outlen,int sq) {
00260   register u_char *ptr;
00261   u_char *in_ptr,*entity,buff[15];
00262   t_string new_str;
00263 
00264   iconv_t cd;
00265   size_t in_left, out_size, out_left,in_len,elen;
00266   u_char *out_p, *out_buf, *tmp_buf;
00267   size_t bsz, result = 0;
00268   int unicode,ret;
00269 
00270   cd = iconv_open(to,from);
00271 
00272   if(cd == (iconv_t)(-1)) {
00273     return NULL;
00274   }
00275 
00276   /* first phase: encode html active characters */
00277   str_init(&new_str);
00278 
00279   for(ptr=(u_char *)toencode;*ptr;ptr++) {
00280     switch(*ptr) {
00281       case '>':
00282         str_chars_append(&new_str,"&gt;",4);
00283         break;
00284       case '<':
00285         str_chars_append(&new_str,"&lt;",4);
00286         break;
00287       case '&':
00288         str_chars_append(&new_str,"&amp;",5);
00289         break;
00290       case '"':
00291         str_chars_append(&new_str,"&quot;",6);
00292         break;
00293       case '\'':
00294         if(sq) {
00295           str_chars_append(&new_str,"&#39;",5);
00296         }
00297         else {
00298           str_char_append(&new_str,*ptr);
00299         }
00300         break;
00301       default:
00302         str_char_append(&new_str,*ptr);
00303         break;
00304     }
00305   }
00306 
00307   /* second phase: convert string to charset */
00308   in_len   = new_str.len;
00309   in_ptr   = new_str.content;
00310   in_left  = new_str.len;
00311   out_left = in_len + 32; /* avoids realloc() in most cases */
00312   out_size = 0;
00313   bsz      = out_left;
00314   out_buf  = fo_alloc(NULL,bsz+1,1,FO_ALLOC_MALLOC);
00315   out_p    = out_buf;
00316 
00317   while(in_left > 0) {
00318     result = iconv(cd,&in_ptr,&in_left,(u_char **)&out_p,&out_left);
00319     out_size = bsz - out_left;
00320     if(result == (size_t)(-1)) {
00321       if(errno == E2BIG && in_left > 0) {
00322         /* converted string is longer than out buffer */
00323         bsz += in_len;
00324 
00325         /* tmp_buf cannot be NULL because if memory allocation failes, fo_alloc calls exit() */
00326         tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00327 
00328         out_buf  = tmp_buf;
00329         out_p    = out_buf + out_size;
00330         out_left = bsz - out_size;
00331         continue;
00332       }
00333       else if(errno == EILSEQ) {
00334         /* ok, we got an illegal sequence... lets convert it to an entity */
00335         if((ret = utf8_to_unicode(in_ptr,in_left,&unicode)) <= 0) {
00336           str_cleanup(&new_str);
00337           free(out_buf);
00338           return NULL;
00339         }
00340 
00341         /* longest entity is about 19 bytes; we need more space if buffer is shorter */
00342         if(out_left < 20) {
00343           bsz += in_len;
00344           /* tmp_buf cannot be NULL because if memory allocation failes, fo_alloc calls exit() */
00345           tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00346 
00347           out_buf  = tmp_buf;
00348           out_p    = out_buf + out_size;
00349           out_left = bsz - out_size;
00350         }
00351 
00352         /* get named enity (if available) */
00353         if((entity = (u_char *)entity_lookup(unicode)) == NULL) {
00354           elen = snprintf(buff,15,"&#%d;",unicode);
00355         }
00356         else {
00357           elen = snprintf(buff,15,"&%s;",entity);
00358         }
00359 
00360         /* copy entity to buffer */
00361         strncpy(out_p,buff,elen);
00362 
00363         /* go to next sequence */
00364         in_left -= ret;
00365         in_ptr += ret;
00366 
00367         /* go to free space */
00368         out_p    += elen;
00369         out_left -= elen;
00370 
00371         if(in_left <= 0) {
00372           result = 0;
00373           break;
00374         }
00375 
00376         continue;
00377       }
00378     }
00379 
00380     break;
00381   }
00382 
00383   iconv_close(cd);
00384   str_cleanup(&new_str);
00385 
00386   if(result == (size_t)(-1)) {
00387     free(out_buf);
00388     return NULL;
00389   }
00390 
00391   *out_p = '\0';
00392   if(outlen) *outlen = (size_t)(out_p - out_buf);
00393   return out_buf;
00394 }
00395 
00396 /*
00397  * This function converts a string between to charsets; every entity which cannot be shown
00398  * in the corresponding charset will be converted to HTML entities (named or UTF8-reference)
00399  */
00400 u_char *charset_convert_entities(const u_char *toencode, size_t in_len,const u_char *from, const u_char *to,size_t *outlen) {
00401   u_char *in_ptr,*entity,buff[15];
00402 
00403   iconv_t cd;
00404   size_t in_left, out_size, out_left,elen;
00405   u_char *out_p, *out_buf, *tmp_buf;
00406   size_t bsz, result = 0;
00407   int unicode,ret;
00408 
00409   cd = iconv_open(to,from);
00410 
00411   if(cd == (iconv_t)(-1)) {
00412     return NULL;
00413   }
00414 
00415   /* second phase: convert string to charset */
00416   in_ptr   = (u_char *)toencode;
00417   in_left  = in_len;
00418   out_left = in_len + 32; /* avoids realloc() in most cases */
00419   out_size = 0;
00420   bsz      = out_left;
00421   out_buf  = fo_alloc(NULL,bsz+1,1,FO_ALLOC_MALLOC);
00422   out_p    = out_buf;
00423 
00424   while(in_left > 0) {
00425     result = iconv(cd,&in_ptr,&in_left,(u_char **)&out_p,&out_left);
00426     out_size = bsz - out_left;
00427     if(result == (size_t)(-1)) {
00428       if(errno == E2BIG && in_left > 0) {
00429         /* converted string is longer than out buffer */
00430         bsz += in_len;
00431 
00432         /* tmp_buf cannot be NULL because if memory allocation failes, fo_alloc calls exit() */
00433         tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00434 
00435         out_buf  = tmp_buf;
00436         out_p    = out_buf + out_size;
00437         out_left = bsz - out_size;
00438         continue;
00439       }
00440       else if(errno == EILSEQ) {
00441         /* ok, we got an illegal sequence... lets convert it to an entity */
00442         if((ret = utf8_to_unicode(in_ptr,in_left,&unicode)) <= 0) {
00443           free(out_buf);
00444           return NULL;
00445         }
00446 
00447         /* longest entity is about 19 bytes; we need more space if buffer is shorter */
00448         if(out_left < 20) {
00449           bsz += in_len;
00450           /* tmp_buf cannot be NULL because if memory allocation failes, fo_alloc calls exit() */
00451           tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00452 
00453           out_buf  = tmp_buf;
00454           out_p    = out_buf + out_size;
00455           out_left = bsz - out_size;
00456         }
00457 
00458         /* get named enity (if available) */
00459         if((entity = (u_char *)entity_lookup(unicode)) == NULL) {
00460           elen = snprintf(buff,15,"&#%d;",unicode);
00461         }
00462         else {
00463           elen = snprintf(buff,15,"&%s;",entity);
00464         }
00465 
00466         /* copy entity to buffer */
00467         strncpy(out_p,buff,elen);
00468 
00469         /* go to next sequence */
00470         in_left -= ret;
00471         in_ptr += ret;
00472 
00473         /* go to free space */
00474         out_p    += elen;
00475         out_left -= elen;
00476 
00477         if(in_left <= 0) {
00478           result = 0;
00479           break;
00480         }
00481 
00482         continue;
00483       }
00484     }
00485 
00486     break;
00487   }
00488 
00489   iconv_close(cd);
00490 
00491   if(result == (size_t)(-1)) {
00492     free(out_buf);
00493     return NULL;
00494   }
00495 
00496   *out_p = '\0';
00497   if(outlen) *outlen = (size_t)(out_p - out_buf);
00498   return out_buf;
00499 }
00500 
00501 /* eof */

Generated on Sun Apr 25 16:37:37 2004 for Classic Forum by doxygen 1.3.5