ksmbd: add support for surrogate pair conversion

Message ID	20231021120540.28732-1-linkinjeon@kernel.org (mailing list archive)
State	New, archived
Headers	show Return-Path: <linux-cifs-owner@vger.kernel.org> From: Namjae Jeon <linkinjeon@kernel.org> To: linux-cifs@vger.kernel.org Cc: smfrench@gmail.com, senozhatsky@chromium.org, tom@talpey.com, atteh.mailbox@gmail.com, Namjae Jeon <linkinjeon@kernel.org>, Marios Makassikis <mmakassikis@freebox.fr> Subject: [PATCH] ksmbd: add support for surrogate pair conversion Date: Sat, 21 Oct 2023 21:05:40 +0900 Message-Id: <20231021120540.28732-1-linkinjeon@kernel.org> MIME-Version: 1.0 Content-Transfer-Encoding: 8bit Precedence: bulk
Series	ksmbd: add support for surrogate pair conversion \| expand ksmbd: add support for surrogate pair conversion

diff --git a/fs/smb/server/unicode.c b/fs/smb/server/unicode.c index 393dd4a7432b..43ed29ee44ea 100644 --- a/fs/smb/server/unicode.c +++ b/fs/smb/server/unicode.c @@ -13,46 +13,10 @@ #include "unicode.h" #include "smb_common.h" -/* - * smb_utf16_bytes() - how long will a string be after conversion? - * @from: pointer to input string - * @maxbytes: don't go past this many bytes of input string - * @codepage: destination codepage - * - * Walk a utf16le string and return the number of bytes that the string will - * be after being converted to the given charset, not including any null - * termination required. Don't walk past maxbytes in the source buffer. - * - * Return: string length after conversion - */ -static int smb_utf16_bytes(const __le16 *from, int maxbytes, - const struct nls_table *codepage) -{ - int i; - int charlen, outlen = 0; - int maxwords = maxbytes / 2; - char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; - - for (i = 0; i < maxwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) - break; - - charlen = codepage->uni2char(ftmp, tmp, NLS_MAX_CHARSET_SIZE); - if (charlen > 0) - outlen += charlen; - else - outlen++; - } - - return outlen; -} - /* * cifs_mapchar() - convert a host-endian char to proper char in codepage * @target: where converted character should be copied - * @src_char: 2 byte host-endian source character + * @from: host-endian source string * @cp: codepage to which character should be converted * @mapchar: should character be mapped according to mapchars mount option? * @@ -63,10 +27,13 @@ static int smb_utf16_bytes(const __le16 *from, int maxbytes, * Return: string length after conversion */ static int -cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, +cifs_mapchar(char *target, const __u16 *from, const struct nls_table *cp, bool mapchar) { int len = 1; + __u16 src_char; + + src_char = *from; if (!mapchar) goto cp_convert; @@ -104,12 +71,66 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, cp_convert: len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); - if (len <= 0) { - *target = '?'; - len = 1; - } + if (len <= 0) + goto surrogate_pair; goto out; + +surrogate_pair: + /* convert SURROGATE_PAIR and IVS */ + if (strcmp(cp->charset, "utf8")) + goto unknown; + len = utf16s_to_utf8s(from, 3, UTF16_LITTLE_ENDIAN, target, 6); + if (len <= 0) + goto unknown; + return len; + +unknown: + *target = '?'; + len = 1; + goto out; +} + +/* + * smb_utf16_bytes() - compute converted string length + * @from: pointer to input string + * @maxbytes: input string length + * @codepage: destination codepage + * + * Walk a utf16le string and return the number of bytes that the string will + * be after being converted to the given charset, not including any null + * termination required. Don't walk past maxbytes in the source buffer. + * + * Return: string length after conversion + */ +static int smb_utf16_bytes(const __le16 *from, int maxbytes, + const struct nls_table *codepage) +{ + int i, j; + int charlen, outlen = 0; + int maxwords = maxbytes / 2; + char tmp[NLS_MAX_CHARSET_SIZE]; + __u16 ftmp[3]; + + for (i = 0; i < maxwords; i++) { + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) + break; + for (j = 1; j <= 2; j++) { + if (i + j < maxwords) + ftmp[j] = get_unaligned_le16(&from[i + j]); + else + ftmp[j] = 0; + } + + charlen = cifs_mapchar(tmp, ftmp, codepage, 0); + if (charlen > 0) + outlen += charlen; + else + outlen++; + } + + return outlen; } /* @@ -139,12 +160,12 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, const struct nls_table *codepage, bool mapchar) { - int i, charlen, safelen; + int i, j, charlen, safelen; int outlen = 0; int nullsize = nls_nullsize(codepage); int fromwords = fromlen / 2; char tmp[NLS_MAX_CHARSET_SIZE]; - __u16 ftmp; + __u16 ftmp[3]; /* ftmp[3] = 3array x 2bytes = 6bytes UTF-16 */ /* * because the chars can be of varying widths, we need to take care @@ -155,9 +176,15 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, safelen = tolen - (NLS_MAX_CHARSET_SIZE + nullsize); for (i = 0; i < fromwords; i++) { - ftmp = get_unaligned_le16(&from[i]); - if (ftmp == 0) + ftmp[0] = get_unaligned_le16(&from[i]); + if (ftmp[0] == 0) break; + for (j = 1; j <= 2; j++) { + if (i + j < fromwords) + ftmp[j] = get_unaligned_le16(&from[i + j]); + else + ftmp[j] = 0; + } /* * check to see if converting this character might make the @@ -172,6 +199,19 @@ static int smb_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, /* put converted char into 'to' buffer */ charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); outlen += charlen; + + /* + * charlen (=bytes of UTF-8 for 1 character) + * 4bytes UTF-8(surrogate pair) is charlen=4 + * (4bytes UTF-16 code) + * 7-8bytes UTF-8(IVS) is charlen=3+4 or 4+4 + * (2 UTF-8 pairs divided to 2 UTF-16 pairs) + */ + if (charlen == 4) + i++; + else if (charlen >= 5) + /* 5-6bytes UTF-8 */ + i += 2; } /* properly null-terminate string */ @@ -306,6 +346,9 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, char src_char; __le16 dst_char; wchar_t tmp; + wchar_t wchar_to[6]; /* UTF-16 */ + int ret; + unicode_t u; if (!mapchars) return smb_strtoUTF16(target, source, srclen, cp); @@ -348,11 +391,57 @@ int smbConvertToUTF16(__le16 *target, const char *source, int srclen, * if no match, use question mark, which at least in * some cases serves as wild card */ - if (charlen < 1) { - dst_char = cpu_to_le16(0x003f); - charlen = 1; + if (charlen > 0) + goto ctoUTF16; + + /* convert SURROGATE_PAIR */ + if (strcmp(cp->charset, "utf8")) + goto unknown; + if (*(source + i) & 0x80) { + charlen = utf8_to_utf32(source + i, 6, &u); + if (charlen < 0) + goto unknown; + } else + goto unknown; + ret = utf8s_to_utf16s(source + i, charlen, + UTF16_LITTLE_ENDIAN, + wchar_to, 6); + if (ret < 0) + goto unknown; + + i += charlen; + dst_char = cpu_to_le16(*wchar_to); + if (charlen <= 3) + /* 1-3bytes UTF-8 to 2bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + else if (charlen == 4) { + /* + * 4bytes UTF-8(surrogate pair) to 4bytes UTF-16 + * 7-8bytes UTF-8(IVS) divided to 2 UTF-16 + * (charlen=3+4 or 4+4) + */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + } else if (charlen >= 5) { + /* 5-6bytes UTF-8 to 6bytes UTF-16 */ + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 1)); + j++; + put_unaligned(dst_char, &target[j]); + dst_char = cpu_to_le16(*(wchar_to + 2)); + j++; + put_unaligned(dst_char, &target[j]); } + continue; + +unknown: + dst_char = cpu_to_le16(0x003f); + charlen = 1; } + +ctoUTF16: /* * character may take more than one byte in the source string, * but will take exactly two bytes in the target string

ksmbd: add support for surrogate pair conversion

Commit Message

Patch