00001
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018 #include "config.h"
00019 #include "defines.h"
00020
00021 #include <errno.h>
00022 #include <stdlib.h>
00023 #include <stdio.h>
00024 #include <iconv.h>
00025 #include <string.h>
00026
00027 #include "entitytable.h"
00028
00029 #include "utils.h"
00030
00031
/**
 * Decode one UTF-8 encoded character into its numeric value.
 *
 * The legacy 1..6 byte forms (values up to 31 bits) are accepted. Overlong
 * encodings, stray continuation bytes and truncated sequences are rejected.
 * Surrogates and values above U+10FFFF are NOT rejected.
 *
 * @param s   First byte of the sequence.
 * @param n   Number of bytes readable at s.
 * @param num Receives the decoded value; only written on success.
 *
 * @return The number of bytes consumed (1..6) on success, or EILSEQ when the
 *         sequence is malformed or needs more than n bytes. EILSEQ is a
 *         positive value, so callers have to compare against EILSEQ
 *         explicitly; a "<= 0" test never detects the error.
 */
int utf8_to_unicode(const u_char *s,size_t n,u_int32_t *num) {
  u_char c;

  /* an empty buffer contains no character; do not touch s[0] */
  if(n == 0) return EILSEQ;

  c = s[0];

  if(c < 0x80) {
    /* plain ASCII */
    *num = c;
    return 1;
  }
  else if(c < 0xc2) {
    /* 0x80..0xbf: continuation byte as lead; 0xc0/0xc1: overlong 2-byte form */
    return EILSEQ;
  }
  else if(c < 0xe0) {
    if(n < 2) return EILSEQ;

    if(!((s[1] ^ 0x80) < 0x40)) return EILSEQ;
    *num = ((u_int32_t)(c & 0x1f) << 6) | (u_int32_t)(s[1] ^ 0x80);
    return 2;
  }
  else if(c < 0xf0) {
    if(n < 3) return EILSEQ;
    /* last clause rejects overlong forms (lead 0xe0 needs s[1] >= 0xa0) */
    if(!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (c >= 0xe1 || s[1] >= 0xa0))) return EILSEQ;

    *num = ((u_int32_t)(c & 0x0f) << 12) | ((u_int32_t)(s[1] ^ 0x80) << 6) | (u_int32_t)(s[2] ^ 0x80);
    return 3;
  }
  else if(c < 0xf8) {
    if(n < 4) return EILSEQ;
    if(!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (c >= 0xf1 || s[1] >= 0x90))) return EILSEQ;

    *num = ((u_int32_t)(c & 0x07) << 18) | ((u_int32_t)(s[1] ^ 0x80) << 12) | ((u_int32_t)(s[2] ^ 0x80) << 6) | (u_int32_t)(s[3] ^ 0x80);
    return 4;
  }
  else if(c < 0xfc) {
    if(n < 5) return EILSEQ;
    if(!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (c >= 0xf9 || s[1] >= 0x88))) return EILSEQ;

    *num = ((u_int32_t)(c & 0x03) << 24) | ((u_int32_t)(s[1] ^ 0x80) << 18) | ((u_int32_t)(s[2] ^ 0x80) << 12) | ((u_int32_t)(s[3] ^ 0x80) << 6) | (u_int32_t)(s[4] ^ 0x80);
    return 5;
  }
  else if(c < 0xfe) {
    if(n < 6) return EILSEQ;
    if(!((s[1] ^ 0x80) < 0x40 && (s[2] ^ 0x80) < 0x40 && (s[3] ^ 0x80) < 0x40 && (s[4] ^ 0x80) < 0x40 && (s[5] ^ 0x80) < 0x40 && (c >= 0xfd || s[1] >= 0x84))) return EILSEQ;

    *num = ((u_int32_t)(c & 0x01) << 30) | ((u_int32_t)(s[1] ^ 0x80) << 24) | ((u_int32_t)(s[2] ^ 0x80) << 18) | ((u_int32_t)(s[3] ^ 0x80) << 12) | ((u_int32_t)(s[4] ^ 0x80) << 6) | (u_int32_t)(s[5] ^ 0x80);
    return 6;
  }

  /* 0xfe and 0xff never start a sequence */
  return EILSEQ;
}
00078
/**
 * Check whether a byte buffer consists only of well-formed UTF-8 sequences
 * (as accepted by utf8_to_unicode()).
 *
 * Scanning stops at the first NUL byte or after len bytes, whichever comes
 * first.
 *
 * @param str Buffer to check.
 * @param len Number of bytes readable at str.
 *
 * @return 0 if every sequence decoded cleanly, -1 on the first bad sequence.
 */
int is_valid_utf8_string(const u_char *str,size_t len) {
  const u_char *ptr = str;
  u_int32_t cp; /* decoded value, not used further */
  int ret;

  /* test len first so *ptr is never read past the end of the buffer */
  while(len > 0 && *ptr) {
    if((ret = utf8_to_unicode(ptr,len,&cp)) == EILSEQ) {
      return -1;
    }

    /* ret is the sequence length; utf8_to_unicode() never returns more than len */
    ptr += ret;
    len -= (size_t)ret;
  }

  return 0;
}
00095
00096
00097
00098
00099
00100
00101
00102
00103
00104
00105
00106
00107
00108 u_char *charset_convert(const u_char *toencode,size_t in_len,const u_char *from_charset,const u_char *to_charset,size_t *out_len_p) {
00109 iconv_t cd;
00110 size_t in_left, out_size, out_left;
00111 u_char *out_p, *out_buf, *tmp_buf;
00112 size_t bsz, result = 0;
00113
00114 cd = iconv_open(to_charset,from_charset);
00115
00116 if(cd == (iconv_t)(-1)) {
00117 return NULL;
00118 }
00119
00120 in_left = in_len;
00121 out_left = in_len + 32;
00122 out_size = 0;
00123 bsz = out_left;
00124 out_buf = fo_alloc(NULL,bsz+1,1,FO_ALLOC_MALLOC);
00125 out_p = out_buf;
00126
00127 while(in_left > 0) {
00128 result = iconv(cd,(u_char **)&toencode,&in_left,(u_char **)&out_p,&out_left);
00129 out_size = bsz - out_left;
00130 if(result == (size_t)(-1)) {
00131 if(errno == E2BIG && in_left > 0) {
00132
00133 bsz += in_len;
00134
00135
00136 tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00137
00138 out_buf = tmp_buf;
00139 out_p = out_buf + out_size;
00140 out_left = bsz - out_size;
00141 continue;
00142 }
00143 }
00144
00145 break;
00146 }
00147
00148 iconv_close(cd);
00149
00150 if(result == (size_t)(-1)) {
00151 free(out_buf);
00152 return NULL;
00153 }
00154
00155 *out_p = '\0';
00156 if(out_len_p) *out_len_p = (size_t)(out_p - out_buf);
00157 return out_buf;
00158 }
00159
00160
00161
00162
00163
00164
00165
00166
00167
00168 u_char *htmlentities(const u_char *string,int sq) {
00169 register u_char *ptr;
00170 t_string new_str;
00171
00172 str_init(&new_str);
00173
00174 if(!string) {
00175 return NULL;
00176 }
00177
00178 for(ptr=(u_char *)string;*ptr;ptr++) {
00179 switch(*ptr) {
00180 case '>':
00181 str_chars_append(&new_str,">",4);
00182 break;
00183 case '<':
00184 str_chars_append(&new_str,"<",4);
00185 break;
00186 case '&':
00187 str_chars_append(&new_str,"&",5);
00188 break;
00189 case '"':
00190 str_chars_append(&new_str,""",6);
00191 break;
00192 case '\'':
00193 if(sq) {
00194 str_chars_append(&new_str,"'",5);
00195 }
00196 else {
00197 str_char_append(&new_str,*ptr);
00198 }
00199 break;
00200 default:
00201 str_char_append(&new_str,*ptr);
00202 break;
00203 }
00204 }
00205
00206 return fo_alloc(new_str.content,new_str.len+1,1,FO_ALLOC_REALLOC);
00207 }
00208
/**
 * Write a string to a stream, replacing the HTML special characters
 * '"' '&' '<' '>' and, if requested, '\'' by character references.
 *
 * @param string NUL-terminated input; may be NULL.
 * @param sq     If non-zero, single quotes are written as "&#39;".
 * @param handle Stream to write to.
 *
 * @return The number of bytes written. Output stops at the first failed or
 *         short write, so on error the value is smaller than the full
 *         encoded length. Returns 0 if string is NULL.
 */
size_t print_htmlentities_encoded(const u_char *string,int sq,FILE *handle) {
  const u_char *ptr;
  const char *rep;
  size_t written = 0, want, got;

  if(!string) return 0;

  for(ptr=string;*ptr;ptr++) {
    /* pick the replacement text, if the character needs one */
    switch(*ptr) {
      case '"':
        rep = "&quot;";
        break;
      case '&':
        rep = "&amp;";
        break;
      case '\'':
        rep = sq ? "&#39;" : NULL;
        break;
      case '<':
        rep = "&lt;";
        break;
      case '>':
        rep = "&gt;";
        break;
      default:
        rep = NULL;
        break;
    }

    if(rep) {
      want = strlen(rep);
      got  = fwrite(rep,1,want,handle);
    }
    else {
      want = 1;
      got  = (fputc(*ptr,handle) == EOF) ? 0 : 1;
    }

    written += got;

    /* a short count means the stream failed; report what got through */
    if(got < want) {
      return written;
    }
  }

  return written;
}
00252
00253
00254
00255
00256
00257
00258
00259 u_char *htmlentities_charset_convert(const u_char *toencode, const u_char *from, const u_char *to,size_t *outlen,int sq) {
00260 register u_char *ptr;
00261 u_char *in_ptr,*entity,buff[15];
00262 t_string new_str;
00263
00264 iconv_t cd;
00265 size_t in_left, out_size, out_left,in_len,elen;
00266 u_char *out_p, *out_buf, *tmp_buf;
00267 size_t bsz, result = 0;
00268 int unicode,ret;
00269
00270 cd = iconv_open(to,from);
00271
00272 if(cd == (iconv_t)(-1)) {
00273 return NULL;
00274 }
00275
00276
00277 str_init(&new_str);
00278
00279 for(ptr=(u_char *)toencode;*ptr;ptr++) {
00280 switch(*ptr) {
00281 case '>':
00282 str_chars_append(&new_str,">",4);
00283 break;
00284 case '<':
00285 str_chars_append(&new_str,"<",4);
00286 break;
00287 case '&':
00288 str_chars_append(&new_str,"&",5);
00289 break;
00290 case '"':
00291 str_chars_append(&new_str,""",6);
00292 break;
00293 case '\'':
00294 if(sq) {
00295 str_chars_append(&new_str,"'",5);
00296 }
00297 else {
00298 str_char_append(&new_str,*ptr);
00299 }
00300 break;
00301 default:
00302 str_char_append(&new_str,*ptr);
00303 break;
00304 }
00305 }
00306
00307
00308 in_len = new_str.len;
00309 in_ptr = new_str.content;
00310 in_left = new_str.len;
00311 out_left = in_len + 32;
00312 out_size = 0;
00313 bsz = out_left;
00314 out_buf = fo_alloc(NULL,bsz+1,1,FO_ALLOC_MALLOC);
00315 out_p = out_buf;
00316
00317 while(in_left > 0) {
00318 result = iconv(cd,&in_ptr,&in_left,(u_char **)&out_p,&out_left);
00319 out_size = bsz - out_left;
00320 if(result == (size_t)(-1)) {
00321 if(errno == E2BIG && in_left > 0) {
00322
00323 bsz += in_len;
00324
00325
00326 tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00327
00328 out_buf = tmp_buf;
00329 out_p = out_buf + out_size;
00330 out_left = bsz - out_size;
00331 continue;
00332 }
00333 else if(errno == EILSEQ) {
00334
00335 if((ret = utf8_to_unicode(in_ptr,in_left,&unicode)) <= 0) {
00336 str_cleanup(&new_str);
00337 free(out_buf);
00338 return NULL;
00339 }
00340
00341
00342 if(out_left < 20) {
00343 bsz += in_len;
00344
00345 tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00346
00347 out_buf = tmp_buf;
00348 out_p = out_buf + out_size;
00349 out_left = bsz - out_size;
00350 }
00351
00352
00353 if((entity = (u_char *)entity_lookup(unicode)) == NULL) {
00354 elen = snprintf(buff,15,"&#%d;",unicode);
00355 }
00356 else {
00357 elen = snprintf(buff,15,"&%s;",entity);
00358 }
00359
00360
00361 strncpy(out_p,buff,elen);
00362
00363
00364 in_left -= ret;
00365 in_ptr += ret;
00366
00367
00368 out_p += elen;
00369 out_left -= elen;
00370
00371 if(in_left <= 0) {
00372 result = 0;
00373 break;
00374 }
00375
00376 continue;
00377 }
00378 }
00379
00380 break;
00381 }
00382
00383 iconv_close(cd);
00384 str_cleanup(&new_str);
00385
00386 if(result == (size_t)(-1)) {
00387 free(out_buf);
00388 return NULL;
00389 }
00390
00391 *out_p = '\0';
00392 if(outlen) *outlen = (size_t)(out_p - out_buf);
00393 return out_buf;
00394 }
00395
00396
00397
00398
00399
00400 u_char *charset_convert_entities(const u_char *toencode, size_t in_len,const u_char *from, const u_char *to,size_t *outlen) {
00401 u_char *in_ptr,*entity,buff[15];
00402
00403 iconv_t cd;
00404 size_t in_left, out_size, out_left,elen;
00405 u_char *out_p, *out_buf, *tmp_buf;
00406 size_t bsz, result = 0;
00407 int unicode,ret;
00408
00409 cd = iconv_open(to,from);
00410
00411 if(cd == (iconv_t)(-1)) {
00412 return NULL;
00413 }
00414
00415
00416 in_ptr = (u_char *)toencode;
00417 in_left = in_len;
00418 out_left = in_len + 32;
00419 out_size = 0;
00420 bsz = out_left;
00421 out_buf = fo_alloc(NULL,bsz+1,1,FO_ALLOC_MALLOC);
00422 out_p = out_buf;
00423
00424 while(in_left > 0) {
00425 result = iconv(cd,&in_ptr,&in_left,(u_char **)&out_p,&out_left);
00426 out_size = bsz - out_left;
00427 if(result == (size_t)(-1)) {
00428 if(errno == E2BIG && in_left > 0) {
00429
00430 bsz += in_len;
00431
00432
00433 tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00434
00435 out_buf = tmp_buf;
00436 out_p = out_buf + out_size;
00437 out_left = bsz - out_size;
00438 continue;
00439 }
00440 else if(errno == EILSEQ) {
00441
00442 if((ret = utf8_to_unicode(in_ptr,in_left,&unicode)) <= 0) {
00443 free(out_buf);
00444 return NULL;
00445 }
00446
00447
00448 if(out_left < 20) {
00449 bsz += in_len;
00450
00451 tmp_buf = (u_char *)fo_alloc(out_buf, bsz+1,1,FO_ALLOC_REALLOC);
00452
00453 out_buf = tmp_buf;
00454 out_p = out_buf + out_size;
00455 out_left = bsz - out_size;
00456 }
00457
00458
00459 if((entity = (u_char *)entity_lookup(unicode)) == NULL) {
00460 elen = snprintf(buff,15,"&#%d;",unicode);
00461 }
00462 else {
00463 elen = snprintf(buff,15,"&%s;",entity);
00464 }
00465
00466
00467 strncpy(out_p,buff,elen);
00468
00469
00470 in_left -= ret;
00471 in_ptr += ret;
00472
00473
00474 out_p += elen;
00475 out_left -= elen;
00476
00477 if(in_left <= 0) {
00478 result = 0;
00479 break;
00480 }
00481
00482 continue;
00483 }
00484 }
00485
00486 break;
00487 }
00488
00489 iconv_close(cd);
00490
00491 if(result == (size_t)(-1)) {
00492 free(out_buf);
00493 return NULL;
00494 }
00495
00496 *out_p = '\0';
00497 if(outlen) *outlen = (size_t)(out_p - out_buf);
00498 return out_buf;
00499 }
00500
00501