Changeset 985:f6acd0400f18 for charset.c

Show
Ignore:
Timestamp:
1999-03-30 15:50:33 (10 years ago)
Author:
Thomas Roessler <roessler@…>
Branch:
HEAD
Message:

This patch removes at least some of the horrible utf-8 kluges in
charset.c. The new DECODER framework is currently only used in
handler.c, and there in a horribly inefficient manner. We should
use larger blocks of data, which would be much more efficient than
what we are currently doing.

Most of the other charset-related code still uses the old
mutt_display_char() & friends interface, which is actually ok as long
as you don't try to handle multibyte character sets.

The most notable change should be the one to mutt_get_translation():
It will delay the loading and parsing of character set information
files until it's really needed, catching a huge amount of standard
cases. As a side effect, this will make "iso tagged as ascii"
"work" again, as long as both sides use the same iso character set.

Files:
1 modified

Legend:

Unmodified
Added
Removed
  • charset.c

    r935 r985  
    683683  if(!_from || !_to) 
    684684    return NULL; 
    685    
    686   init_charsets(); 
    687685 
    688686  canonical_charset(from_canon, sizeof(from_canon), _from); 
    689687  canonical_charset(to_canon, sizeof(to_canon), _to); 
     688 
     689  /* quick check for some trivial cases.  Doing this before 
     690   * we actually call the initialization routine delays character 
     691   * set loading until it's _really_ needed. 
     692   */ 
     693 
     694  if(!strcmp(from_canon, to_canon) 
     695     || (!strcmp (from_canon, "us-ascii") && !strncmp (to_canon, "iso-8859", 8))) 
     696    return NULL; 
     697 
     698  init_charsets(); 
    690699 
    691700  if(!CharsetAliases || !(from = hash_find(CharsetAliases, from_canon))) 
     
    695704   
    696705  /* quick check for the identity mapping */ 
    697   if((from == to) || ((*from == *to) && !mutt_strcmp(from, to))) 
     706  if((from == to) || !mutt_strcmp(from, to)) 
    698707    return NULL; 
    699    
     708 
    700709  snprintf(key, sizeof(key), "%s %s", from, to); 
    701710  if((map = hash_find(Translations, key)) == NULL) 
     
    813822static CHARSET *Unicode = NULL; 
    814823 
     824static int unicode_init (void) 
     825{ 
     826  if (!Unicode) 
     827  { 
     828    if (load_charset ("ISO_10646", &Unicode, 1) == -1) 
     829      Unicode = NULL; 
     830  } 
     831   
     832  return (Unicode == NULL ? -1 : 0); 
     833} 
     834 
    815835void mutt_decode_utf8_string(char *str, CHARSET *chs) 
    816836{ 
     
    819839  int ch; 
    820840 
    821   /* Hack */ 
    822    
    823   if (!Unicode) 
    824   { 
    825     if (load_charset ("ISO_10646", &Unicode, 1) == -1) 
    826       Unicode = NULL; 
    827   } 
     841  (void) unicode_init (); 
    828842   
    829843  for (s = t = str; *t; s++) 
     
    845859} 
    846860 
    847 /* internal use only */ 
    848  
    849 struct utf8_state 
    850 { 
    851   char *buffer; 
    852   size_t blen; 
    853   size_t bp; 
    854 }; 
    855  
    856 static struct utf8_state *new_utf8_state (void) 
    857 { 
    858   return safe_calloc (1, sizeof (struct utf8_state)); 
    859 } 
    860  
    861 static void free_utf8_state (struct utf8_state **sp) 
    862 { 
    863   if (!sp || !*sp) return; 
    864   safe_free ((void **) &(*sp)->buffer); 
    865   safe_free ((void **) sp); 
    866 } 
    867  
    868 static void _state_utf8_flush(STATE *s, CHARSET *chs, struct utf8_state *sfu) 
    869 { 
    870   char *t; 
    871   if(!sfu->buffer || !sfu->bp) 
     861 
     862 
     863 
     864/************************************************************* 
     865 * General decoder framework 
     866 */ 
     867 
     868 
     869 
     870#define MIN(a,b) (((a) <= (b)) ? (a): (b)) 
     871 
     872DECODER *mutt_open_decoder (const char *src, const char *dest) 
     873{ 
     874  DECODER *d = safe_calloc (1, sizeof (DECODER));; 
     875 
     876  d->in.size = DECODER_BUFFSIZE; 
     877  d->out.size = DECODER_BUFFSIZE; 
     878 
     879  if (!src || !dest || mutt_is_utf8 (dest)) 
     880  { 
     881    d->just_take_id = 1; 
     882    return d; 
     883  } 
     884   
     885  if (mutt_is_utf8 (src)) 
     886  { 
     887    if (!(d->chs = mutt_get_charset (dest)) || unicode_init () == -1) 
     888    { 
     889      d->just_take_id = 1; 
     890      return d; 
     891    } 
     892     
     893    d->src_is_utf8 = 1; 
     894    return d; 
     895  } 
     896   
     897  if (!(d->chm = mutt_get_translation (src, dest))) 
     898    d->just_take_id = 1; 
     899   
     900  return d; 
     901} 
     902 
     903void mutt_free_decoder (DECODER **dpp) 
     904{ 
     905  safe_free ((void **) dpp); 
     906} 
     907 
     908static void _process_data (DECODER *, short); 
     909 
     910void mutt_decoder_push (DECODER *d, void *_buff, size_t blen, size_t *taken) 
     911{ 
     912  if (!_buff || !blen) 
     913  { 
     914    _process_data (d, 1); 
    872915    return; 
    873    
    874   sfu->buffer[sfu->bp] = '\0'; 
    875    
    876   mutt_decode_utf8_string(sfu->buffer, chs); 
    877   for(t = sfu->buffer; *t; t++) 
    878   { 
    879     /* This may lead to funny-looking output if  
    880      * there are embedded CRs, NLs or similar things 
    881      * - but these would constitute illegal  
    882      * UTF8 encoding anyways, so we don't care. 
    883      */ 
    884  
    885     state_prefix_putc(*t, s); 
    886   } 
    887   sfu->bp = 0; 
    888 } 
     916  } 
     917 
     918  if ((*taken = MIN(blen, d->in.size - d->in.used))) 
     919  { 
     920    memcpy (d->in.buff + d->in.used, _buff, *taken); 
     921    d->in.used += *taken; 
     922  } 
     923} 
     924 
     925 
     926void mutt_decoder_pop (DECODER *d, void *_buff, size_t blen, size_t *popped) 
     927{ 
     928  unsigned char *buff = _buff; 
     929 
     930  _process_data (d, 0); 
     931   
     932  if ((*popped = MIN (blen, d->out.used))) 
     933  { 
     934    memcpy (buff, d->out.buff, *popped); 
     935    memmove (d->out.buff, d->out.buff + *popped, d->out.used - *popped); 
     936    d->out.used -= *popped; 
     937  } 
     938} 
     939 
     940void mutt_decoder_pop_to_state (DECODER *d, STATE *s) 
     941{ 
     942  char tmp[DECODER_BUFFSIZE]; 
     943  size_t i, l; 
     944   
     945  do  
     946  { 
     947    mutt_decoder_pop (d, tmp, sizeof (tmp), &l); 
     948    for (i = 0; i < l; i++) 
     949      state_prefix_putc (tmp[i], s); 
     950  } 
     951  while (l > 0); 
     952} 
     953 
     954/* this is where things actually happen */ 
     955 
     956static void _process_data_8bit (DECODER *d) 
     957{ 
     958  size_t i; 
     959   
     960  for (i = 0; i < d->in.used && d->out.used < d->out.size; i++) 
     961    d->out.buff[d->out.used++] = mutt_display_char (d->in.buff[i], d->chm); 
     962   
     963  memmove (d->in.buff, d->in.buff + i, d->in.used - i); 
     964  d->in.used -= i; 
     965} 
     966 
     967static void _process_data_utf8 (DECODER *d) 
     968{ 
     969  size_t i, j; 
     970  CHARDESC *cd; 
     971   
     972  for (i = 0, j = 0; i < d->in.used && d->out.used < d->out.size;) 
     973  { 
     974    while (((d->in.buff[j] & 0x80) == 0) && (j < d->in.used) && (d->out.used < d->out.size)) 
     975      d->out.buff[d->out.used++] = d->in.buff[j++]; 
     976    i = j; 
     977 
     978    while ((d->in.buff[j] & 0x80) && j < d->in.used && 
     979           (d->forced || j + 6 < d->in.used) && d->out.used < d->out.size) 
     980    { 
     981      int ch; 
     982      char *c = utf_to_unicode (&ch, &d->in.buff[j]); 
     983       
     984      j = c - d->in.buff; 
     985 
     986      if (0 <= ch && ch < 128) 
     987        d->out.buff[d->out.used] = ch; 
     988      else if ((cd = repr2descr (ch, Unicode)) && (ch = translate_character (d->chs, cd->symbol)) != -1) 
     989        d->out.buff[d->out.used] = ch; 
     990      else 
     991        d->out.buff[d->out.used] = '?'; 
     992       
     993      if(!d->out.buff[d->out.used])  
     994        d->out.buff[d->out.used] = '?'; 
     995       
     996      d->out.used++; 
     997    } 
    889998     
    890 static void state_fput_utf8(STATE *st, char u, CHARSET *chs, struct utf8_state *sfu) 
    891 { 
    892   if((u & 0x80) == 0 || (sfu->bp && (u & IIOOOOOO) != IOOOOOOO)) 
    893     _state_utf8_flush(st, chs, sfu); 
    894       
    895   if((u & 0x80) == 0) 
    896   { 
    897     if(u) state_prefix_putc(u, st); 
    898   } 
     999    i = j; 
     1000     
     1001    if (d->in.buff[j] & 0x80) 
     1002      break; 
     1003  } 
     1004 
     1005  memmove (d->in.buff, d->in.buff + i, d->in.used - i); 
     1006  d->in.used -= i; 
     1007} 
     1008 
     1009static void _process_data (DECODER *d, short force) 
     1010{ 
     1011  if (force) d->forced = 1; 
     1012   
     1013  if (d->just_take_id) 
     1014  { 
     1015    size_t l = MIN (d->out.size - d->out.used, d->in.used); 
     1016    memmove (d->out.buff + d->out.used, d->in.buff, l); 
     1017    memmove (d->in.buff, d->in.buff + l, d->in.used - l); 
     1018    d->in.used -= l; 
     1019    d->out.used += l; 
     1020  } 
     1021  else if (d->src_is_utf8) 
     1022    _process_data_utf8 (d); 
    8991023  else 
    900   { 
    901     if(sfu->bp + 1 >= sfu->blen) 
    902     { 
    903       sfu->blen = (sfu->blen + 80) * 2; 
    904       safe_realloc((void **) &sfu->buffer, sfu->blen + 1); 
    905     } 
    906     sfu->buffer[sfu->bp++] = u; 
    907   } 
    908 } 
    909  
    910 /* a nicer interface for decoding */ 
    911  
    912 DECODER *mutt_open_decoder (STATE *s, BODY *b, int istext) 
    913 { 
    914   DECODER *dp = safe_calloc (1, sizeof (DECODER)); 
    915    
    916   dp->s = s; 
    917    
    918   if (istext && (s->flags & M_CHARCONV)) 
    919   { 
    920     char *charset = mutt_get_parameter ("charset", b->parameter); 
    921     dp->is_utf8 = mutt_is_utf8 (charset) && !mutt_is_utf8 (Charset); 
    922      
    923     if (dp->is_utf8) 
    924     { 
    925       dp->sfu = new_utf8_state (); 
    926       dp->chs = mutt_get_charset (Charset); 
    927     } 
    928     else 
    929       dp->map = mutt_get_translation (charset, Charset); 
    930   } 
    931    
    932   return dp; 
    933 } 
    934  
    935 void mutt_close_decoder (DECODER **dpp) 
    936 { 
    937   if (!dpp || !*dpp) 
    938     return; 
    939    
    940   if ((*dpp)->is_utf8) 
    941   { 
    942     _state_utf8_flush ((*dpp)->s, (*dpp)->chs, (*dpp)->sfu); 
    943     free_utf8_state (&(*dpp)->sfu); 
    944   } 
    945  
    946   safe_free ((void **) dpp); 
    947 } 
    948  
    949 void mutt_decoder_putc (DECODER *dp, char c) 
    950 { 
    951   if (dp->is_utf8) 
    952     state_fput_utf8 (dp->s, c, dp->chs, dp->sfu); 
    953   else 
    954     state_prefix_putc (mutt_display_char ((unsigned char) c, dp->map), dp->s); 
    955 } 
    956  
    957 /* FIXME: utf-8 support */ 
     1024    _process_data_8bit (d); 
     1025} 
     1026 
     1027/* This one is currently lacking utf-8 support */ 
    9581028 
    9591029int mutt_recode_file (const char *fname, const char *src, const char *dest) 
     
    9611031  FILE *fp, *tmpfp; 
    9621032  char tempfile[_POSIX_PATH_MAX]; 
     1033  char buffer[1024]; 
     1034  char tmp[1024]; 
    9631035  int c; 
    9641036  int rv = -1; 
    965    
    966   CHARSET_MAP *map; 
    967  
    968   if (mutt_is_utf8 (dest) ^ mutt_is_utf8(src)) 
     1037 
     1038  size_t lf, lpu, lpo; 
     1039  char *t; 
     1040  DECODER *dec; 
     1041 
     1042  if (mutt_is_utf8 (dest) && !mutt_is_utf8 (src)) 
    9691043  { 
    9701044    mutt_error (_("We can't currently handle utf-8 at this point.")); 
     
    9861060  } 
    9871061 
    988   map = mutt_get_translation (src, dest); 
    989    
    990   while ((c = fgetc (fp)) != EOF) 
    991     if (fputc (mutt_display_char ((unsigned char) c, map), tmpfp) == EOF) 
    992       goto bail; 
     1062  dec = mutt_open_decoder (src, dest); 
     1063   
     1064  while ((lf = fread (buffer, 1, sizeof (buffer), fp)) > 0) 
     1065  { 
     1066    for (t = buffer; lf; t += lpu) 
     1067    { 
     1068      mutt_decoder_push (dec, t, lf, &lpu); 
     1069      lf -= lpu; 
     1070       
     1071      do 
     1072      { 
     1073        mutt_decoder_pop (dec, tmp, sizeof (tmp), &lpo); 
     1074        if (lpo) 
     1075          fwrite (tmp, lpo, 1, tmpfp); 
     1076      }  
     1077      while (lpo); 
     1078    } 
     1079  } 
     1080 
     1081  mutt_decoder_push (dec, NULL, 0, NULL); 
     1082  do  
     1083  { 
     1084    mutt_decoder_pop (dec, tmp, sizeof (tmp), &lpo); 
     1085    if (lpo) fwrite (tmp, lpo, 1, tmpfp); 
     1086  } 
     1087  while (lpo); 
     1088 
     1089  mutt_free_decoder (&dec); 
    9931090 
    9941091  fclose (fp); fp = NULL;