krb5 commit: Replace UCS-2 conversions with UTF-16

Greg Hudson ghudson at mit.edu
Fri Apr 21 16:39:14 EDT 2017


https://github.com/krb5/krb5/commit/89ce6420832858950271858e7c6e1a2eefebc683
commit 89ce6420832858950271858e7c6e1a2eefebc683
Author: Greg Hudson <ghudson at mit.edu>
Date:   Tue Apr 18 14:01:06 2017 -0400

    Replace UCS-2 conversions with UTF-16
    
    Where we convert between UTF-8 and UCS-2 (RC4 string-to-key and PAC
    client info), use UTF-16 instead of UCS-2.  Add a test program for
    the conversion functions.
    
    ticket: 8577 (new)

 .gitignore                                    |    1 +
 src/include/k5-utf8.h                         |   14 ++--
 src/lib/crypto/krb/s2k_rc4.c                  |    2 +-
 src/lib/krb5/krb/pac.c                        |    2 +-
 src/lib/krb5/krb/pac_sign.c                   |   20 ++--
 src/util/support/Makefile.in                  |   10 ++-
 src/util/support/deps                         |    8 ++-
 src/util/support/libkrb5support-fixed.exports |    4 +-
 src/util/support/t_utf16.c                    |  117 +++++++++++++++++++++++++
 src/util/support/utf8.c                       |    2 +-
 src/util/support/utf8_conv.c                  |  108 +++++++++++++++++------
 11 files changed, 236 insertions(+), 52 deletions(-)

diff --git a/.gitignore b/.gitignore
index 815c67d..862a87a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -523,6 +523,7 @@ local.properties
 /src/util/support/t_path_win
 /src/util/support/t_unal
 /src/util/support/t_utf8
+/src/util/support/t_utf16
 
 /src/util/verto/rename.h
 
diff --git a/src/include/k5-utf8.h b/src/include/k5-utf8.h
index 4b7415e..e2f20d4 100644
--- a/src/include/k5-utf8.h
+++ b/src/include/k5-utf8.h
@@ -73,8 +73,6 @@
 typedef uint16_t krb5_ucs2;
 typedef uint32_t krb5_ucs4;
 
-#define KRB5_MAX_UTF8_LEN   (sizeof(krb5_ucs2) * 3/2)
-
 int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out);
 size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf);
 
@@ -82,21 +80,21 @@ int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out);
 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf);
 
 /*
- * Convert a little-endian UCS-2 string to an allocated null-terminated UTF-8
+ * Convert a little-endian UTF-16 string to an allocated null-terminated UTF-8
  * string.  nbytes is the length of ucs2bytes in bytes, and must be an even
  * number.  Return EINVAL on invalid input, ENOMEM on out of memory, or 0 on
  * success.
  */
-int k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes,
-                      char **utf8_out);
+int k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes,
+                       char **utf8_out);
 
 /*
- * Convert a UTF-8 string to an allocated little-endian UCS-2 string.  The
+ * Convert a UTF-8 string to an allocated little-endian UTF-16 string.  The
  * resulting length is in bytes and will always be even.  Return EINVAL on
  * invalid input, ENOMEM on out of memory, or 0 on success.
  */
-int k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out,
-                      size_t *nbytes_out);
+int k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out,
+                       size_t *nbytes_out);
 
 /* returns the number of bytes in the UTF-8 string */
 size_t krb5int_utf8_bytes(const char *);
diff --git a/src/lib/crypto/krb/s2k_rc4.c b/src/lib/crypto/krb/s2k_rc4.c
index fb41b26..081a912 100644
--- a/src/lib/crypto/krb/s2k_rc4.c
+++ b/src/lib/crypto/krb/s2k_rc4.c
@@ -24,7 +24,7 @@ krb5int_arcfour_string_to_key(const struct krb5_keytypes *ktp,
     utf8 = k5memdup0(string->data, string->length, &err);
     if (utf8 == NULL)
         return err;
-    err = k5_utf8_to_ucs2le(utf8, &copystr, &copystrlen);
+    err = k5_utf8_to_utf16le(utf8, &copystr, &copystrlen);
     free(utf8);
     if (err)
         return err;
diff --git a/src/lib/krb5/krb/pac.c b/src/lib/krb5/krb/pac.c
index 485a0f7..d1662b9 100644
--- a/src/lib/krb5/krb/pac.c
+++ b/src/lib/krb5/krb/pac.c
@@ -436,7 +436,7 @@ k5_pac_validate_client(krb5_context context,
         pac_princname_length % 2)
         return ERANGE;
 
-    ret = k5_ucs2le_to_utf8(p, pac_princname_length, &pac_princname);
+    ret = k5_utf16le_to_utf8(p, pac_princname_length, &pac_princname);
     if (ret != 0)
         return ret;
 
diff --git a/src/lib/krb5/krb/pac_sign.c b/src/lib/krb5/krb/pac_sign.c
index c6eee76..c94899c 100644
--- a/src/lib/krb5/krb/pac_sign.c
+++ b/src/lib/krb5/krb/pac_sign.c
@@ -38,8 +38,8 @@ k5_insert_client_info(krb5_context context,
     krb5_error_code ret;
     krb5_data client_info;
     char *princ_name_utf8 = NULL;
-    unsigned char *princ_name_ucs2 = NULL, *p;
-    size_t princ_name_ucs2_len = 0;
+    unsigned char *princ_name_utf16 = NULL, *p;
+    size_t princ_name_utf16_len = 0;
     uint64_t nt_authtime;
 
     /* If we already have a CLIENT_INFO buffer, then just validate it */
@@ -54,12 +54,12 @@ k5_insert_client_info(krb5_context context,
     if (ret != 0)
         goto cleanup;
 
-    ret = k5_utf8_to_ucs2le(princ_name_utf8, &princ_name_ucs2,
-                            &princ_name_ucs2_len);
+    ret = k5_utf8_to_utf16le(princ_name_utf8, &princ_name_utf16,
+                             &princ_name_utf16_len);
     if (ret != 0)
         goto cleanup;
 
-    client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_ucs2_len;
+    client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_utf16_len;
     client_info.data = NULL;
 
     ret = k5_pac_add_buffer(context, pac, KRB5_PAC_CLIENT_INFO,
@@ -74,16 +74,16 @@ k5_insert_client_info(krb5_context context,
     store_64_le(nt_authtime, p);
     p += 8;
 
-    /* copy in number of UCS-2 characters in principal name */
-    store_16_le(princ_name_ucs2_len, p);
+    /* copy in number of UTF-16 bytes in principal name */
+    store_16_le(princ_name_utf16_len, p);
     p += 2;
 
     /* copy in principal name */
-    memcpy(p, princ_name_ucs2, princ_name_ucs2_len);
+    memcpy(p, princ_name_utf16, princ_name_utf16_len);
 
 cleanup:
-    if (princ_name_ucs2 != NULL)
-        free(princ_name_ucs2);
+    if (princ_name_utf16 != NULL)
+        free(princ_name_utf16);
     krb5_free_unparsed_name(context, princ_name_utf8);
 
     return ret;
diff --git a/src/util/support/Makefile.in b/src/util/support/Makefile.in
index 6239e41..0bf0b7a 100644
--- a/src/util/support/Makefile.in
+++ b/src/util/support/Makefile.in
@@ -143,6 +143,7 @@ SRCS=\
 	$(srcdir)/bcmp.c \
 	$(srcdir)/strerror_r.c \
 	$(srcdir)/t_utf8.c \
+	$(srcdir)/t_utf16.c \
 	$(srcdir)/getopt.c \
 	$(srcdir)/getopt_long.c
 
@@ -220,7 +221,12 @@ t_unal: t_unal.o
 t_utf8: t_utf8.o utf8.o
 	$(CC_LINK) -o t_utf8 t_utf8.o utf8.o
 
-TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8
+T_UTF16_OBJS= t_utf16.o utf8_conv.o utf8.o k5buf.o $(PRINTF_ST_OBJ)
+
+t_utf16: $(T_UTF16_OBJS)
+	$(CC_LINK) -o $@ $(T_UTF16_OBJS)
+
+TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8 t_utf16
 
 check-unix: $(TEST_PROGS)
 	./t_k5buf
@@ -230,11 +236,13 @@ check-unix: $(TEST_PROGS)
 	./t_json
 	./t_unal
 	./t_utf8
+	./t_utf16
 
 clean:
 	$(RM) t_k5buf.o t_k5buf t_unal.o t_unal path_win.o path_win
 	$(RM) t_path_win.o t_path_win t_path.o t_path t_base64.o t_base64
 	$(RM) t_json.o t_json libkrb5support.exports t_utf8.o t_utf8
+	$(RM) t_utf16.o t_utf16
 
 @lib_frag@
 @libobj_frag@
diff --git a/src/util/support/deps b/src/util/support/deps
index a95d2ad..34d8a88 100644
--- a/src/util/support/deps
+++ b/src/util/support/deps
@@ -34,8 +34,9 @@ utf8.so utf8.po $(OUTPRE)utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
   $(top_srcdir)/include/k5-utf8.h supp-int.h utf8.c
 utf8_conv.so utf8_conv.po $(OUTPRE)utf8_conv.$(OBJEXT): \
   $(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-buf.h \
-  $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
-  $(top_srcdir)/include/k5-utf8.h supp-int.h utf8_conv.c
+  $(top_srcdir)/include/k5-input.h $(top_srcdir)/include/k5-platform.h \
+  $(top_srcdir)/include/k5-thread.h $(top_srcdir)/include/k5-utf8.h \
+  supp-int.h utf8_conv.c
 gettimeofday.so gettimeofday.po $(OUTPRE)gettimeofday.$(OBJEXT): \
   $(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-platform.h \
   $(top_srcdir)/include/k5-thread.h gettimeofday.c
@@ -84,6 +85,9 @@ strerror_r.so strerror_r.po $(OUTPRE)strerror_r.$(OBJEXT): \
 t_utf8.so t_utf8.po $(OUTPRE)t_utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
   $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
   $(top_srcdir)/include/k5-utf8.h t_utf8.c
+t_utf16.so t_utf16.po $(OUTPRE)t_utf16.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
+  $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
+  $(top_srcdir)/include/k5-utf8.h t_utf16.c
 getopt.so getopt.po $(OUTPRE)getopt.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
   $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
   getopt.c
diff --git a/src/util/support/libkrb5support-fixed.exports b/src/util/support/libkrb5support-fixed.exports
index 750dc24..fd74a18 100644
--- a/src/util/support/libkrb5support-fixed.exports
+++ b/src/util/support/libkrb5support-fixed.exports
@@ -52,8 +52,8 @@ k5_path_isabs
 k5_path_join
 k5_path_split
 k5_strerror_r
-k5_utf8_to_ucs2le
-k5_ucs2le_to_utf8
+k5_utf8_to_utf16le
+k5_utf16le_to_utf8
 krb5int_key_register
 krb5int_key_delete
 krb5int_getspecific
diff --git a/src/util/support/t_utf16.c b/src/util/support/t_utf16.c
new file mode 100644
index 0000000..bc3390a
--- /dev/null
+++ b/src/util/support/t_utf16.c
@@ -0,0 +1,117 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/* util/support/t_utf16.c - test UTF-16 conversion functions */
+/*
+ * Copyright (C) 2017 by the Massachusetts Institute of Technology.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ *   notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ *   notice, this list of conditions and the following disclaimer in
+ *   the documentation and/or other materials provided with the
+ *   distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This program tests conversions between UTF-8 and little-endian UTF-16, with
+ * an eye mainly towards covering UTF-16 edge cases and UTF-8 decoding results
+ * which we detect as invalid in utf8_conv.c.  t_utf8.c covers more UTF-8 edge
+ * cases.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "k5-platform.h"
+#include "k5-utf8.h"
+
+struct test {
+    const char *utf8;
+    const char *utf16;
+    size_t utf16len;
+} tests[] = {
+    { "", "", 0 },
+    { "abcd", "a\0b\0c\0d\0", 8 },
+    /* From RFC 2781 (tests code point 0x12345 and some ASCII) */
+    { "\xF0\x92\x8D\x85=Ra", "\x08\xD8\x45\xDF=\0R\0a\0", 10 },
+    /* Lowest and highest Supplementary Plane code points */
+    { "\xF0\x90\x80\x80 \xF4\x8F\xBF\xBF",
+      "\x00\xD8\x00\xDC \0\xFF\xDB\xFF\xDF", 10 },
+    /* Basic Multilingual Plane code points near and above surrogate range */
+    { "\xED\x9F\xBF", "\xFF\xD7", 2 },
+    { "\xEE\x80\x80 \xEE\xBF\xBF", "\x00\xE0 \0\xFF\xEF", 6 },
+    /* Invalid UTF-8: decodes to value in surrogate pair range */
+    { "\xED\xA0\x80", NULL, 0 }, /* 0xD800 */
+    { "\xED\xAF\xBF", NULL, 0 }, /* 0xDBFF */
+    { "\xED\xB0\x80", NULL, 0 }, /* 0xDC00 */
+    { "\xED\xBF\xBF", NULL, 0 }, /* 0xDFFF */
+    /* Invalid UTF-8: decodes to value above Unicode range */
+    { "\xF4\x90\x80\x80", NULL, 0 },
+    { "\xF4\xBF\xBF\xBF", NULL, 0 },
+    { "\xF5\x80\x80\x80", NULL, 0 }, /* thrown out early due to first byte */
+    /* Invalid UTF-16: odd numbers of UTF-16 bytes */
+    { NULL, "\x00", 1 },
+    { NULL, "\x01\x00\x02", 3 },
+    /* Invalid UTF-16: high surrogate without a following low surrogate */
+    { NULL, "\x00\xD8\x00\x00", 4 },
+    { NULL, "\x00\xD8\xFF\xDB", 4 },
+    { NULL, "\xFF\xDB", 2 },
+    /* Invalid UTF-16: low surrogate without a preceding high surrogate */
+    { NULL, "\x61\x00\x00\xDC", 4 },
+    { NULL, "\xFF\xDF\xFF\xDB", 4 },
+};
+
+int
+main(int argc, char **argv)
+{
+    int ret;
+    struct test *t;
+    size_t i, utf16len;
+    uint8_t *utf16;
+    char *utf8;
+
+    for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) {
+        t = &tests[i];
+        if (t->utf8 != NULL) {
+            ret = k5_utf8_to_utf16le(t->utf8, &utf16, &utf16len);
+            if (t->utf16 == NULL) {
+                assert(ret == EINVAL);
+            } else {
+                assert(ret == 0);
+                assert(t->utf16len == utf16len);
+                assert(memcmp(t->utf16, utf16, utf16len) == 0);
+                free(utf16);
+            }
+        }
+
+        if (t->utf16 != NULL) {
+            ret = k5_utf16le_to_utf8((uint8_t *)t->utf16, t->utf16len, &utf8);
+            if (t->utf8 == NULL) {
+                assert(ret == EINVAL);
+            } else {
+                assert(ret == 0);
+                assert(strcmp(t->utf8, utf8) == 0);
+                free(utf8);
+            }
+        }
+    }
+    return 0;
+}
diff --git a/src/util/support/utf8.c b/src/util/support/utf8.c
index e42c0c7..34e2b6a 100644
--- a/src/util/support/utf8.c
+++ b/src/util/support/utf8.c
@@ -205,7 +205,7 @@ int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
     return 0;
 }
 
-/* conv UCS-2 to UTF-8, not used */
+/* conv UCS-4 to UTF-8 */
 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
 {
     size_t len = 0;
diff --git a/src/util/support/utf8_conv.c b/src/util/support/utf8_conv.c
index 5f279c3..5cfc2c5 100644
--- a/src/util/support/utf8_conv.c
+++ b/src/util/support/utf8_conv.c
@@ -1,7 +1,7 @@
 /* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
 /* util/support/utf8_conv.c */
 /*
- * Copyright 2008 by the Massachusetts Institute of Technology.
+ * Copyright 2008, 2017 by the Massachusetts Institute of Technology.
  * All Rights Reserved.
  *
  * Export of this software from the United States of America may
@@ -47,34 +47,56 @@
  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
  */
 
-/* This work is part of OpenLDAP Software <http://www.openldap.org/>. */
+/* This work is based on OpenLDAP Software <http://www.openldap.org/>. */
 
 /*
- * UTF-8 Conversion Routines
- *
- * These routines convert between Wide Character and UTF-8,
- * or between MultiByte and UTF-8 encodings.
- *
- * Both single character and string versions of the functions are provided.
- * All functions return -1 if the character or string cannot be converted.
+ * These routines convert between UTF-16 and UTF-8.  UTF-16 encodes a Unicode
+ * character in either two or four bytes.  Characters in the Basic Multilingual
+ * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
+ * Characters in the Supplementary Planes (10000..10FFFF) are split into a high
+ * surrogate and a low surrogate, each containing ten bits of the character
+ * value, and encoded in four bytes.
  */
 
 #include "k5-platform.h"
 #include "k5-utf8.h"
 #include "k5-buf.h"
+#include "k5-input.h"
 #include "supp-int.h"
 
 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
 
+/* A high surrogate is ten bits masked with 0xD800. */
+#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)
+
+/* A low surrogate is ten bits masked with 0xDC00. */
+#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF)
+
+/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate
+ * value. */
+#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF)
+#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c))
+
+/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a
+ * surrogate value. */
+#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c))
+
+/* Characters in the Supplementary Planes have a base value subtracted from
+ * their code points to form a 20-bit value; ten bits go in each surrogate. */
+#define BASE 0x10000
+#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10))
+#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF))
+#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))
+
 int
-k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
+k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
 {
     struct k5buf buf;
-    krb5_ucs2 ch;
+    krb5_ucs4 ch;
     size_t chlen, i;
-    void *p;
+    uint8_t *p;
 
-    *ucs2_out = NULL;
+    *utf16_out = NULL;
     *nbytes_out = 0;
 
     k5_buf_init_dynamic(&buf);
@@ -83,11 +105,11 @@ k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
     while (*utf8 != '\0') {
         /* Get UTF-8 sequence length from first byte. */
         chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
-        if (chlen == 0 || chlen > KRB5_MAX_UTF8_LEN)
+        if (chlen == 0)
             goto invalid;
 
         /* First byte minus length tag */
-        ch = (krb5_ucs2)(utf8[0] & mask[chlen]);
+        ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
 
         for (i = 1; i < chlen; i++) {
             /* Subsequent bytes must start with 10. */
@@ -96,19 +118,30 @@ k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
 
             /* 6 bits of data in each subsequent byte */
             ch <<= 6;
-            ch |= (krb5_ucs2)(utf8[i] & 0x3f);
+            ch |= (krb5_ucs4)(utf8[i] & 0x3f);
         }
+        if (!IS_VALID_UNICODE(ch))
+            goto invalid;
 
-        p = k5_buf_get_space(&buf, 2);
+        /* Characters in the basic multilingual plane are encoded using two
+         * bytes; other characters are encoded using four bytes. */
+        p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4);
         if (p == NULL)
             return ENOMEM;
-        store_16_le(ch, p);
+        if (IS_BMP(ch)) {
+            store_16_le(ch, p);
+        } else {
+            /* 0x10000 is subtracted from ch; then the high ten bits plus
+             * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
+            store_16_le(HIGH_SURROGATE(ch), p);
+            store_16_le(LOW_SURROGATE(ch), p + 2);
+        }
 
         /* Move to next UTF-8 character. */
         utf8 += chlen;
     }
 
-    *ucs2_out = buf.data;
+    *utf16_out = buf.data;
     *nbytes_out = buf.len;
     return 0;
 
@@ -118,11 +151,13 @@ invalid:
 }
 
 int
-k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out)
+k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
 {
     struct k5buf buf;
-    krb5_ucs2 ch;
-    size_t chlen, i;
+    struct k5input in;
+    uint16_t ch1, ch2;
+    krb5_ucs4 ch;
+    size_t chlen;
     void *p;
 
     *utf8_out = NULL;
@@ -131,16 +166,37 @@ k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out)
         return EINVAL;
 
     k5_buf_init_dynamic(&buf);
+    k5_input_init(&in, utf16bytes, nbytes);
+    while (!in.status && in.len > 0) {
+        /* Get the next character or high surrogate.  A low surrogate without a
+         * preceding high surrogate is invalid. */
+        ch1 = k5_input_get_uint16_le(&in);
+        if (IS_LOW_SURROGATE(ch1))
+            goto invalid;
+        if (IS_HIGH_SURROGATE(ch1)) {
+            /* Get the low surrogate and combine the pair. */
+            ch2 = k5_input_get_uint16_le(&in);
+            if (!IS_LOW_SURROGATE(ch2))
+                goto invalid;
+            ch = COMPOSE(ch1, ch2);
+        } else {
+            ch = ch1;
+        }
 
-    for (i = 0; i < nbytes; i += 2) {
-        ch = load_16_le(&ucs2bytes[i]);
-        chlen = krb5int_ucs2_to_utf8(ch, NULL);
+        chlen = krb5int_ucs4_to_utf8(ch, NULL);
         p = k5_buf_get_space(&buf, chlen);
         if (p == NULL)
             return ENOMEM;
-        (void)krb5int_ucs2_to_utf8(ch, p);
+        (void)krb5int_ucs4_to_utf8(ch, p);
     }
 
+    if (in.status)
+        goto invalid;
+
     *utf8_out = buf.data;
     return 0;
+
+invalid:
+    k5_buf_free(&buf);
+    return EINVAL;
 }


More information about the cvs-krb5 mailing list