krb5 commit: Replace UCS-2 conversions with UTF-16
Greg Hudson
ghudson at mit.edu
Fri Apr 21 16:39:14 EDT 2017
https://github.com/krb5/krb5/commit/89ce6420832858950271858e7c6e1a2eefebc683
commit 89ce6420832858950271858e7c6e1a2eefebc683
Author: Greg Hudson <ghudson at mit.edu>
Date: Tue Apr 18 14:01:06 2017 -0400
Replace UCS-2 conversions with UTF-16
Where we convert between UTF-8 and UCS-2 (RC4 string-to-key and PAC
client info), use UTF-16 instead of UCS-2. Add a test program for
the conversion functions.
ticket: 8577 (new)
.gitignore | 1 +
src/include/k5-utf8.h | 14 ++--
src/lib/crypto/krb/s2k_rc4.c | 2 +-
src/lib/krb5/krb/pac.c | 2 +-
src/lib/krb5/krb/pac_sign.c | 20 ++--
src/util/support/Makefile.in | 10 ++-
src/util/support/deps | 8 ++-
src/util/support/libkrb5support-fixed.exports | 4 +-
src/util/support/t_utf16.c | 117 +++++++++++++++++++++++++
src/util/support/utf8.c | 2 +-
src/util/support/utf8_conv.c | 108 +++++++++++++++++------
11 files changed, 236 insertions(+), 52 deletions(-)
diff --git a/.gitignore b/.gitignore
index 815c67d..862a87a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -523,6 +523,7 @@ local.properties
/src/util/support/t_path_win
/src/util/support/t_unal
/src/util/support/t_utf8
+/src/util/support/t_utf16
/src/util/verto/rename.h
diff --git a/src/include/k5-utf8.h b/src/include/k5-utf8.h
index 4b7415e..e2f20d4 100644
--- a/src/include/k5-utf8.h
+++ b/src/include/k5-utf8.h
@@ -73,8 +73,6 @@
typedef uint16_t krb5_ucs2;
typedef uint32_t krb5_ucs4;
-#define KRB5_MAX_UTF8_LEN (sizeof(krb5_ucs2) * 3/2)
-
int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out);
size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf);
@@ -82,21 +80,21 @@ int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out);
size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf);
/*
- * Convert a little-endian UCS-2 string to an allocated null-terminated UTF-8
+ * Convert a little-endian UTF-16 string to an allocated null-terminated UTF-8
* string. nbytes is the length of ucs2bytes in bytes, and must be an even
* number. Return EINVAL on invalid input, ENOMEM on out of memory, or 0 on
* success.
*/
-int k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes,
- char **utf8_out);
+int k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes,
+ char **utf8_out);
/*
- * Convert a UTF-8 string to an allocated little-endian UCS-2 string. The
+ * Convert a UTF-8 string to an allocated little-endian UTF-16 string. The
* resulting length is in bytes and will always be even. Return EINVAL on
* invalid input, ENOMEM on out of memory, or 0 on success.
*/
-int k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out,
- size_t *nbytes_out);
+int k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out,
+ size_t *nbytes_out);
/* returns the number of bytes in the UTF-8 string */
size_t krb5int_utf8_bytes(const char *);
diff --git a/src/lib/crypto/krb/s2k_rc4.c b/src/lib/crypto/krb/s2k_rc4.c
index fb41b26..081a912 100644
--- a/src/lib/crypto/krb/s2k_rc4.c
+++ b/src/lib/crypto/krb/s2k_rc4.c
@@ -24,7 +24,7 @@ krb5int_arcfour_string_to_key(const struct krb5_keytypes *ktp,
utf8 = k5memdup0(string->data, string->length, &err);
if (utf8 == NULL)
return err;
- err = k5_utf8_to_ucs2le(utf8, &copystr, &copystrlen);
+ err = k5_utf8_to_utf16le(utf8, &copystr, &copystrlen);
free(utf8);
if (err)
return err;
diff --git a/src/lib/krb5/krb/pac.c b/src/lib/krb5/krb/pac.c
index 485a0f7..d1662b9 100644
--- a/src/lib/krb5/krb/pac.c
+++ b/src/lib/krb5/krb/pac.c
@@ -436,7 +436,7 @@ k5_pac_validate_client(krb5_context context,
pac_princname_length % 2)
return ERANGE;
- ret = k5_ucs2le_to_utf8(p, pac_princname_length, &pac_princname);
+ ret = k5_utf16le_to_utf8(p, pac_princname_length, &pac_princname);
if (ret != 0)
return ret;
diff --git a/src/lib/krb5/krb/pac_sign.c b/src/lib/krb5/krb/pac_sign.c
index c6eee76..c94899c 100644
--- a/src/lib/krb5/krb/pac_sign.c
+++ b/src/lib/krb5/krb/pac_sign.c
@@ -38,8 +38,8 @@ k5_insert_client_info(krb5_context context,
krb5_error_code ret;
krb5_data client_info;
char *princ_name_utf8 = NULL;
- unsigned char *princ_name_ucs2 = NULL, *p;
- size_t princ_name_ucs2_len = 0;
+ unsigned char *princ_name_utf16 = NULL, *p;
+ size_t princ_name_utf16_len = 0;
uint64_t nt_authtime;
/* If we already have a CLIENT_INFO buffer, then just validate it */
@@ -54,12 +54,12 @@ k5_insert_client_info(krb5_context context,
if (ret != 0)
goto cleanup;
- ret = k5_utf8_to_ucs2le(princ_name_utf8, &princ_name_ucs2,
- &princ_name_ucs2_len);
+ ret = k5_utf8_to_utf16le(princ_name_utf8, &princ_name_utf16,
+ &princ_name_utf16_len);
if (ret != 0)
goto cleanup;
- client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_ucs2_len;
+ client_info.length = PAC_CLIENT_INFO_LENGTH + princ_name_utf16_len;
client_info.data = NULL;
ret = k5_pac_add_buffer(context, pac, KRB5_PAC_CLIENT_INFO,
@@ -74,16 +74,16 @@ k5_insert_client_info(krb5_context context,
store_64_le(nt_authtime, p);
p += 8;
- /* copy in number of UCS-2 characters in principal name */
- store_16_le(princ_name_ucs2_len, p);
+ /* copy in number of UTF-16 bytes in principal name */
+ store_16_le(princ_name_utf16_len, p);
p += 2;
/* copy in principal name */
- memcpy(p, princ_name_ucs2, princ_name_ucs2_len);
+ memcpy(p, princ_name_utf16, princ_name_utf16_len);
cleanup:
- if (princ_name_ucs2 != NULL)
- free(princ_name_ucs2);
+ if (princ_name_utf16 != NULL)
+ free(princ_name_utf16);
krb5_free_unparsed_name(context, princ_name_utf8);
return ret;
diff --git a/src/util/support/Makefile.in b/src/util/support/Makefile.in
index 6239e41..0bf0b7a 100644
--- a/src/util/support/Makefile.in
+++ b/src/util/support/Makefile.in
@@ -143,6 +143,7 @@ SRCS=\
$(srcdir)/bcmp.c \
$(srcdir)/strerror_r.c \
$(srcdir)/t_utf8.c \
+ $(srcdir)/t_utf16.c \
$(srcdir)/getopt.c \
$(srcdir)/getopt_long.c
@@ -220,7 +221,12 @@ t_unal: t_unal.o
t_utf8: t_utf8.o utf8.o
$(CC_LINK) -o t_utf8 t_utf8.o utf8.o
-TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8
+T_UTF16_OBJS= t_utf16.o utf8_conv.o utf8.o k5buf.o $(PRINTF_ST_OBJ)
+
+t_utf16: $(T_UTF16_OBJS)
+ $(CC_LINK) -o $@ $(T_UTF16_OBJS)
+
+TEST_PROGS= t_k5buf t_path t_path_win t_base64 t_json t_unal t_utf8 t_utf16
check-unix: $(TEST_PROGS)
./t_k5buf
@@ -230,11 +236,13 @@ check-unix: $(TEST_PROGS)
./t_json
./t_unal
./t_utf8
+ ./t_utf16
clean:
$(RM) t_k5buf.o t_k5buf t_unal.o t_unal path_win.o path_win
$(RM) t_path_win.o t_path_win t_path.o t_path t_base64.o t_base64
$(RM) t_json.o t_json libkrb5support.exports t_utf8.o t_utf8
+ $(RM) t_utf16.o t_utf16
@lib_frag@
@libobj_frag@
diff --git a/src/util/support/deps b/src/util/support/deps
index a95d2ad..34d8a88 100644
--- a/src/util/support/deps
+++ b/src/util/support/deps
@@ -34,8 +34,9 @@ utf8.so utf8.po $(OUTPRE)utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
$(top_srcdir)/include/k5-utf8.h supp-int.h utf8.c
utf8_conv.so utf8_conv.po $(OUTPRE)utf8_conv.$(OBJEXT): \
$(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-buf.h \
- $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
- $(top_srcdir)/include/k5-utf8.h supp-int.h utf8_conv.c
+ $(top_srcdir)/include/k5-input.h $(top_srcdir)/include/k5-platform.h \
+ $(top_srcdir)/include/k5-thread.h $(top_srcdir)/include/k5-utf8.h \
+ supp-int.h utf8_conv.c
gettimeofday.so gettimeofday.po $(OUTPRE)gettimeofday.$(OBJEXT): \
$(BUILDTOP)/include/autoconf.h $(top_srcdir)/include/k5-platform.h \
$(top_srcdir)/include/k5-thread.h gettimeofday.c
@@ -84,6 +85,9 @@ strerror_r.so strerror_r.po $(OUTPRE)strerror_r.$(OBJEXT): \
t_utf8.so t_utf8.po $(OUTPRE)t_utf8.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
$(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
$(top_srcdir)/include/k5-utf8.h t_utf8.c
+t_utf16.so t_utf16.po $(OUTPRE)t_utf16.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
+ $(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
+ $(top_srcdir)/include/k5-utf8.h t_utf16.c
getopt.so getopt.po $(OUTPRE)getopt.$(OBJEXT): $(BUILDTOP)/include/autoconf.h \
$(top_srcdir)/include/k5-platform.h $(top_srcdir)/include/k5-thread.h \
getopt.c
diff --git a/src/util/support/libkrb5support-fixed.exports b/src/util/support/libkrb5support-fixed.exports
index 750dc24..fd74a18 100644
--- a/src/util/support/libkrb5support-fixed.exports
+++ b/src/util/support/libkrb5support-fixed.exports
@@ -52,8 +52,8 @@ k5_path_isabs
k5_path_join
k5_path_split
k5_strerror_r
-k5_utf8_to_ucs2le
-k5_ucs2le_to_utf8
+k5_utf8_to_utf16le
+k5_utf16le_to_utf8
krb5int_key_register
krb5int_key_delete
krb5int_getspecific
diff --git a/src/util/support/t_utf16.c b/src/util/support/t_utf16.c
new file mode 100644
index 0000000..bc3390a
--- /dev/null
+++ b/src/util/support/t_utf16.c
@@ -0,0 +1,117 @@
+/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
+/* util/support/t_utf16.c - test UTF-16 conversion functions */
+/*
+ * Copyright (C) 2017 by the Massachusetts Institute of Technology.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ * * Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ *
+ * * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in
+ * the documentation and/or other materials provided with the
+ * distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
+ * FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE
+ * COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT,
+ * INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+ * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT,
+ * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED
+ * OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+/*
+ * This program tests conversions between UTF-8 and little-endian UTF-16, with
+ * an eye mainly towards covering UTF-16 edge cases and UTF-8 decoding results
+ * which we detect as invalid in utf8_conv.c. t_utf8.c covers more UTF-8 edge
+ * cases.
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "k5-platform.h"
+#include "k5-utf8.h"
+
+struct test {
+ const char *utf8;
+ const char *utf16;
+ size_t utf16len;
+} tests[] = {
+ { "", "", 0 },
+ { "abcd", "a\0b\0c\0d\0", 8 },
+ /* From RFC 2781 (tests code point 0x12345 and some ASCII) */
+ { "\xF0\x92\x8D\x85=Ra", "\x08\xD8\x45\xDF=\0R\0a\0", 10 },
+ /* Lowest and highest Supplementary Plane code points */
+ { "\xF0\x90\x80\x80 \xF4\x8F\xBF\xBF",
+ "\x00\xD8\x00\xDC \0\xFF\xDB\xFF\xDF", 10 },
+ /* Basic Multilingual Plane code points near and above surrogate range */
+ { "\xED\x9F\xBF", "\xFF\xD7", 2 },
+ { "\xEE\x80\x80 \xEE\xBF\xBF", "\x00\xE0 \0\xFF\xEF", 6 },
+ /* Invalid UTF-8: decodes to value in surrogate pair range */
+ { "\xED\xA0\x80", NULL, 0 }, /* 0xD800 */
+ { "\xED\xAF\xBF", NULL, 0 }, /* 0xDBFF */
+ { "\xED\xB0\x80", NULL, 0 }, /* 0xDC00 */
+ { "\xED\xBF\xBF", NULL, 0 }, /* 0xDFFF */
+ /* Invalid UTF-8: decodes to value above Unicode range */
+ { "\xF4\x90\x80\x80", NULL, 0 },
+ { "\xF4\xBF\xBF\xBF", NULL, 0 },
+ { "\xF5\x80\x80\x80", NULL, 0 }, /* thrown out early due to first byte */
+ /* Invalid UTF-16: odd numbers of UTF-16 bytes */
+ { NULL, "\x00", 1 },
+ { NULL, "\x01\x00\x02", 3 },
+ /* Invalid UTF-16: high surrogate without a following low surrogate */
+ { NULL, "\x00\xD8\x00\x00", 4 },
+ { NULL, "\x00\xD8\xFF\xDB", 4 },
+ { NULL, "\xFF\xDB", 2 },
+ /* Invalid UTF-16: low surrogate without a preceding high surrogate */
+ { NULL, "\x61\x00\x00\xDC", 4 },
+ { NULL, "\xFF\xDF\xFF\xDB", 4 },
+};
+
+int
+main(int argc, char **argv)
+{
+ int ret;
+ struct test *t;
+ size_t i, utf16len;
+ uint8_t *utf16;
+ char *utf8;
+
+ for (i = 0; i < sizeof(tests) / sizeof(*tests); i++) {
+ t = &tests[i];
+ if (t->utf8 != NULL) {
+ ret = k5_utf8_to_utf16le(t->utf8, &utf16, &utf16len);
+ if (t->utf16 == NULL) {
+ assert(ret == EINVAL);
+ } else {
+ assert(ret == 0);
+ assert(t->utf16len == utf16len);
+ assert(memcmp(t->utf16, utf16, utf16len) == 0);
+ free(utf16);
+ }
+ }
+
+ if (t->utf16 != NULL) {
+ ret = k5_utf16le_to_utf8((uint8_t *)t->utf16, t->utf16len, &utf8);
+ if (t->utf8 == NULL) {
+ assert(ret == EINVAL);
+ } else {
+ assert(ret == 0);
+ assert(strcmp(t->utf8, utf8) == 0);
+ free(utf8);
+ }
+ }
+ }
+ return 0;
+}
diff --git a/src/util/support/utf8.c b/src/util/support/utf8.c
index e42c0c7..34e2b6a 100644
--- a/src/util/support/utf8.c
+++ b/src/util/support/utf8.c
@@ -205,7 +205,7 @@ int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
return 0;
}
-/* conv UCS-2 to UTF-8, not used */
+/* conv UCS-4 to UTF-8 */
size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
{
size_t len = 0;
diff --git a/src/util/support/utf8_conv.c b/src/util/support/utf8_conv.c
index 5f279c3..5cfc2c5 100644
--- a/src/util/support/utf8_conv.c
+++ b/src/util/support/utf8_conv.c
@@ -1,7 +1,7 @@
/* -*- mode: c; c-basic-offset: 4; indent-tabs-mode: nil -*- */
/* util/support/utf8_conv.c */
/*
- * Copyright 2008 by the Massachusetts Institute of Technology.
+ * Copyright 2008, 2017 by the Massachusetts Institute of Technology.
* All Rights Reserved.
*
* Export of this software from the United States of America may
@@ -47,34 +47,56 @@
* THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
*/
-/* This work is part of OpenLDAP Software <http://www.openldap.org/>. */
+/* This work is based on OpenLDAP Software <http://www.openldap.org/>. */
/*
- * UTF-8 Conversion Routines
- *
- * These routines convert between Wide Character and UTF-8,
- * or between MultiByte and UTF-8 encodings.
- *
- * Both single character and string versions of the functions are provided.
- * All functions return -1 if the character or string cannot be converted.
+ * These routines convert between UTF-16 and UTF-8. UTF-16 encodes a Unicode
+ * character in either two or four bytes. Characters in the Basic Multilingual
+ * Plane (hex 0..D7FF and E000..FFFF) are encoded as-is in two bytes.
+ * Characters in the Supplementary Planes (10000..10FFFF) are split into a high
+ * surrogate and a low surrogate, each containing ten bits of the character
+ * value, and encoded in four bytes.
*/
#include "k5-platform.h"
#include "k5-utf8.h"
#include "k5-buf.h"
+#include "k5-input.h"
#include "supp-int.h"
static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
+/* A high surrogate is ten bits masked with 0xD800. */
+#define IS_HIGH_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDBFF)
+
+/* A low surrogate is ten bits masked with 0xDC00. */
+#define IS_LOW_SURROGATE(c) ((c) >= 0xDC00 && (c) <= 0xDFFF)
+
+/* A valid Unicode code point is in the range 0..10FFFF and is not a surrogate
+ * value. */
+#define IS_SURROGATE(c) ((c) >= 0xD800 && (c) <= 0xDFFF)
+#define IS_VALID_UNICODE(c) ((c) <= 0x10FFFF && !IS_SURROGATE(c))
+
+/* A Basic Multilingual Plane character is in the range 0..FFFF and is not a
+ * surrogate value. */
+#define IS_BMP(c) ((c) <= 0xFFFF && !IS_SURROGATE(c))
+
+/* Characters in the Supplementary Planes have a base value subtracted from
+ * their code points to form a 20-bit value; ten bits go in each surrogate. */
+#define BASE 0x10000
+#define HIGH_SURROGATE(c) (0xD800 | (((c) - BASE) >> 10))
+#define LOW_SURROGATE(c) (0xDC00 | (((c) - BASE) & 0x3FF))
+#define COMPOSE(c1, c2) (BASE + ((((c1) & 0x3FF) << 10) | ((c2) & 0x3FF)))
+
int
-k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
+k5_utf8_to_utf16le(const char *utf8, uint8_t **utf16_out, size_t *nbytes_out)
{
struct k5buf buf;
- krb5_ucs2 ch;
+ krb5_ucs4 ch;
size_t chlen, i;
- void *p;
+ uint8_t *p;
- *ucs2_out = NULL;
+ *utf16_out = NULL;
*nbytes_out = 0;
k5_buf_init_dynamic(&buf);
@@ -83,11 +105,11 @@ k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
while (*utf8 != '\0') {
/* Get UTF-8 sequence length from first byte. */
chlen = KRB5_UTF8_CHARLEN2(utf8, chlen);
- if (chlen == 0 || chlen > KRB5_MAX_UTF8_LEN)
+ if (chlen == 0)
goto invalid;
/* First byte minus length tag */
- ch = (krb5_ucs2)(utf8[0] & mask[chlen]);
+ ch = (krb5_ucs4)(utf8[0] & mask[chlen]);
for (i = 1; i < chlen; i++) {
/* Subsequent bytes must start with 10. */
@@ -96,19 +118,30 @@ k5_utf8_to_ucs2le(const char *utf8, uint8_t **ucs2_out, size_t *nbytes_out)
/* 6 bits of data in each subsequent byte */
ch <<= 6;
- ch |= (krb5_ucs2)(utf8[i] & 0x3f);
+ ch |= (krb5_ucs4)(utf8[i] & 0x3f);
}
+ if (!IS_VALID_UNICODE(ch))
+ goto invalid;
- p = k5_buf_get_space(&buf, 2);
+ /* Characters in the basic multilingual plane are encoded using two
+ * bytes; other characters are encoded using four bytes. */
+ p = k5_buf_get_space(&buf, IS_BMP(ch) ? 2 : 4);
if (p == NULL)
return ENOMEM;
- store_16_le(ch, p);
+ if (IS_BMP(ch)) {
+ store_16_le(ch, p);
+ } else {
+ /* 0x10000 is subtracted from ch; then the high ten bits plus
+ * 0xD800 and the low ten bits plus 0xDC00 are the surrogates. */
+ store_16_le(HIGH_SURROGATE(ch), p);
+ store_16_le(LOW_SURROGATE(ch), p + 2);
+ }
/* Move to next UTF-8 character. */
utf8 += chlen;
}
- *ucs2_out = buf.data;
+ *utf16_out = buf.data;
*nbytes_out = buf.len;
return 0;
@@ -118,11 +151,13 @@ invalid:
}
int
-k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out)
+k5_utf16le_to_utf8(const uint8_t *utf16bytes, size_t nbytes, char **utf8_out)
{
struct k5buf buf;
- krb5_ucs2 ch;
- size_t chlen, i;
+ struct k5input in;
+ uint16_t ch1, ch2;
+ krb5_ucs4 ch;
+ size_t chlen;
void *p;
*utf8_out = NULL;
@@ -131,16 +166,37 @@ k5_ucs2le_to_utf8(const uint8_t *ucs2bytes, size_t nbytes, char **utf8_out)
return EINVAL;
k5_buf_init_dynamic(&buf);
+ k5_input_init(&in, utf16bytes, nbytes);
+ while (!in.status && in.len > 0) {
+ /* Get the next character or high surrogate. A low surrogate without a
+ * preceding high surrogate is invalid. */
+ ch1 = k5_input_get_uint16_le(&in);
+ if (IS_LOW_SURROGATE(ch1))
+ goto invalid;
+ if (IS_HIGH_SURROGATE(ch1)) {
+ /* Get the low surrogate and combine the pair. */
+ ch2 = k5_input_get_uint16_le(&in);
+ if (!IS_LOW_SURROGATE(ch2))
+ goto invalid;
+ ch = COMPOSE(ch1, ch2);
+ } else {
+ ch = ch1;
+ }
- for (i = 0; i < nbytes; i += 2) {
- ch = load_16_le(&ucs2bytes[i]);
- chlen = krb5int_ucs2_to_utf8(ch, NULL);
+ chlen = krb5int_ucs4_to_utf8(ch, NULL);
p = k5_buf_get_space(&buf, chlen);
if (p == NULL)
return ENOMEM;
- (void)krb5int_ucs2_to_utf8(ch, p);
+ (void)krb5int_ucs4_to_utf8(ch, p);
}
+ if (in.status)
+ goto invalid;
+
*utf8_out = buf.data;
return 0;
+
+invalid:
+ k5_buf_free(&buf);
+ return EINVAL;
}
More information about the cvs-krb5
mailing list