mandoc/preconv.c - diff

Return to preconv.c CVS log

Up to [cvsweb.bsd.lv] / mandoc

Diff for /mandoc/preconv.c between version 1.11 and 1.17

-version 1.11, 2014/11/01 04:08:43
+version 1.17, 2018/12/13 11:55:47
 Line 19
 Line 19
 Line 19
  #include <sys/types.h>
+ #include <assert.h>
  #include <stdio.h>
  #include <string.h>
  #include "mandoc.h"
+ #include "roff.h"
+ #include "mandoc_parse.h"
  #include "libmandoc.h"
  int
- preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
+ preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
      int *filenc)
  {
-         size_t           i;
+         const unsigned char     *cu;
-         const long       one = 1L;
+         int                      nby;
-         int              state, be;
+         unsigned int             accum;
-         unsigned int     accum;
-         unsigned char    cu;
+         cu = (const unsigned char *)ib->buf + *ii;
+         assert(*cu & 0x80);
          if ( ! (*filenc & MPARSE_UTF8))
                  goto latin;
-         state = 0;
+         nby = 1;
-         accum = 0U;
+         while (nby < 5 && *cu & (1 << (7 - nby)))
-         be = 0;
+                 nby++;
-         /* Quick test for big-endian value. */
+         switch (nby) {
+         case 2:
+                 accum = *cu & 0x1f;
+                 if (accum < 0x02)  /* Obfuscated ASCII. */
+                         goto latin;
+                 break;
+         case 3:
+                 accum = *cu & 0x0f;
+                 break;
+         case 4:
+                 accum = *cu & 0x07;
+                 if (accum > 0x04) /* Beyond Unicode. */
+                         goto latin;
+                 break;
+         default:  /* Bad sequence header. */
+                 goto latin;
+         }
-         if ( ! (*((const char *)(&one))))
+         cu++;
-                 be = 1;
+         switch (nby) {
+         case 3:
+                 if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Use 2-byte. */
+                     (accum == 0x0d && *cu & 0x20))  /* Surrogates. */
+                         goto latin;
+                 break;
+         case 4:
+                 if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Use 3-byte. */
+                     (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode. */
+                         goto latin;
+                 break;
+         default:
+                 break;
+         }
-         for (i = *ii; i < ib->sz; i++) {
+         while (--nby) {
-                 cu = ib->buf[i];
+                 if ((*cu & 0xc0) != 0x80)  /* Invalid continuation. */
-                 if (state) {
+                         goto latin;
-                         if ( ! (cu & 128) || (cu & 64)) {
+                 accum <<= 6;
-                                 /* Bad sequence header. */
+                 accum += *cu & 0x3f;
-                                 break;
+                 cu++;
-                         }
-                         /* Accept only legitimate bit patterns. */
-                         if (cu > 191 || cu < 128) {
-                                 /* Bad in-sequence bits. */
-                                 break;
-                         }
-                         accum |= (cu & 63) << --state * 6;
-                         if (state)
-                                 continue;
-                         /*
-                          * Accum is held in little-endian order as
-                          * stipulated by the UTF-8 sequence coding.  We
-                          * need to convert to a native big-endian if our
-                          * architecture requires it.
-                          */
-                         if (be)
-                                 accum = (accum >> 24) |
-                                         ((accum << 8) & 0x00FF0000) |
-                                         ((accum >> 8) & 0x0000FF00) |
-                                         (accum << 24);
-                         if (accum < 0x80)
-                                 ob->buf[(*oi)++] = accum;
-                         else
-                                 *oi += snprintf(ob->buf + *oi,
-                                     11, "\\[u%.4X]", accum);
-                         *ii = i + 1;
-                         *filenc &= ~MPARSE_LATIN1;
-                         return(1);
-                 } else {
-                         /*
-                          * Entering a UTF-8 state:  if we encounter a
-                          * UTF-8 bitmask, calculate the expected UTF-8
-                          * state from it.
-                          */
-                         for (state = 0; state < 7; state++)
-                                 if ( ! (cu & (1 << (7 - state))))
-                                         break;
-                         /* Accept only legitimate bit patterns. */
-                         switch (state--) {
-                         case (4):
-                                 if (cu <= 244 && cu >= 240) {
-                                         accum = (cu & 7) << 18;
-                                         continue;
-                                 }
-                                 /* Bad 4-sequence start bits. */
-                                 break;
-                         case (3):
-                                 if (cu <= 239 && cu >= 224) {
-                                         accum = (cu & 15) << 12;
-                                         continue;
-                                 }
-                                 /* Bad 3-sequence start bits. */
-                                 break;
-                         case (2):
-                                 if (cu <= 223 && cu >= 194) {
-                                         accum = (cu & 31) << 6;
-                                         continue;
-                                 }
-                                 /* Bad 2-sequence start bits. */
-                                 break;
-                         default:
-                                 /* Bad sequence bit mask. */
-                                 break;
-                         }
-                         break;
-                 }
          }
-         /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+         assert(accum > 0x7f);
+         assert(accum < 0x110000);
+         assert(accum < 0xd800 || accum > 0xdfff);
+         *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
+         *ii = (const char *)cu - ib->buf;
+         *filenc &= ~MPARSE_LATIN1;
+         return 1;
  latin:
          if ( ! (*filenc & MPARSE_LATIN1))
-                 return(0);
+                 return 0;
          *oi += snprintf(ob->buf + *oi, 11,
              "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
          *filenc &= ~MPARSE_UTF8;
-         return(1);
+         return 1;
  }
  int
-Line 158  preconv_cue(const struct buf *b, size_t offset)
+Line 124  preconv_cue(const struct buf *b, size_t offset)
 Line 158  preconv_cue(const struct buf *b, size_t offset)
 Line 124  preconv_cue(const struct buf *b, size_t offset)
          /* Check if we have the correct header/trailer. */
          if ((sz = (size_t)(eoln - ln)) < 10 ||
              memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
-                 return(MPARSE_UTF8 | MPARSE_LATIN1);
+                 return MPARSE_UTF8 | MPARSE_LATIN1;
          /* Move after the header and adjust for the trailer. */
-Line 189  preconv_cue(const struct buf *b, size_t offset)
+Line 155  preconv_cue(const struct buf *b, size_t offset)
 Line 189  preconv_cue(const struct buf *b, size_t offset)
 Line 155  preconv_cue(const struct buf *b, size_t offset)
                          sz -= phsz;
                          ln += phsz;
                          continue;
                  }
                  sz -= 7;
                  ln += 7;
-Line 199  preconv_cue(const struct buf *b, size_t offset)
+Line 165  preconv_cue(const struct buf *b, size_t offset)
 Line 199  preconv_cue(const struct buf *b, size_t offset)
 Line 165  preconv_cue(const struct buf *b, size_t offset)
                          sz--;
                  }
                  if (0 == sz)
-                         return(0);
+                         return 0;
                  /* Check us against known encodings. */
                  if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
-                         return(MPARSE_UTF8);
+                         return MPARSE_UTF8;
                  if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
-                         return(MPARSE_LATIN1);
+                         return MPARSE_LATIN1;
-                 return(0);
+                 return 0;
          }
-         return(MPARSE_UTF8 | MPARSE_LATIN1);
+         return MPARSE_UTF8 | MPARSE_LATIN1;
  }

CVSweb