NLS: update handling of Unicode

author Alan Stern <stern@rowland.harvard.edu>
Thu, 30 Apr 2009 14:08:18 +0000 (10:08 -0400)
committer Greg Kroah-Hartman <gregkh@suse.de>
Tue, 16 Jun 2009 04:44:43 +0000 (21:44 -0700)

diff --git a/drivers/usb/core/message.c b/drivers/usb/core/message.c

index e98f928c08ea557bc5ba5ca3f7070c47d0337bab..9bd26dec7599762bdc15f476da58112b9bedb2cb 100644 (file)
--- a/drivers/usb/core/message.c
+++ b/drivers/usb/core/message.c
@@ -780,14 +780,13 @@ int usb_string(struct usb_device *dev, int index, char *buf, size_t size)
  {
         unsigned char *tbuf;
         int err;
-       unsigned int u;
  
         if (dev->state == USB_STATE_SUSPENDED)
                 return -EHOSTUNREACH;
         if (size <= 0 || !buf || !index)
                 return -EINVAL;
         buf[0] = 0;
-       tbuf = kmalloc(256 + 2, GFP_NOIO);
+       tbuf = kmalloc(256, GFP_NOIO);
         if (!tbuf)
                 return -ENOMEM;
  
@@ -814,12 +813,9 @@ int usb_string(struct usb_device *dev, int index, char *buf, size_t size)
         if (err < 0)
                 goto errout;
  
-       for (u = 2; u < err; u += 2)
-               le16_to_cpus((u16 *)&tbuf[u]);
-       tbuf[u] = 0;
-       tbuf[u + 1] = 0;
         size--;         /* leave room for trailing NULL char in output buffer */
-       err = utf8_wcstombs(buf, (u16 *)&tbuf[2], size);
+       err = utf16s_to_utf8s((wchar_t *) &tbuf[2], (err - 2) / 2,
+                       UTF16_LITTLE_ENDIAN, buf, size);
         buf[err] = 0;
  
         if (tbuf[1] != USB_DT_STRING)
diff --git a/fs/befs/linuxvfs.c b/fs/befs/linuxvfs.c

index 9367b6297d84c819c1c42abef581ed39c0093b78..89cd2deeb4aff144d1ee4355e6e57b4c9ec3aaeb 100644 (file)
--- a/fs/befs/linuxvfs.c
+++ b/fs/befs/linuxvfs.c
@@ -513,7 +513,7 @@ befs_utf2nls(struct super_block *sb, const char *in,
  {
         struct nls_table *nls = BEFS_SB(sb)->nls;
         int i, o;
-       wchar_t uni;
+       unicode_t uni;
         int unilen, utflen;
         char *result;
         /* The utf8->nls conversion won't make the final nls string bigger
@@ -539,16 +539,16 @@ befs_utf2nls(struct super_block *sb, const char *in,
         for (i = o = 0; i < in_len; i += utflen, o += unilen) {
  
                 /* convert from UTF-8 to Unicode */
-               utflen = utf8_mbtowc(&uni, &in[i], in_len - i);
-               if (utflen < 0) {
+               utflen = utf8_to_utf32(&in[i], in_len - i, &uni);
+               if (utflen < 0)
                         goto conv_err;
-               }
  
                 /* convert from Unicode to nls */
+               if (uni > MAX_WCHAR_T)
+                       goto conv_err;
                 unilen = nls->uni2char(uni, &result[o], in_len - o);
-               if (unilen < 0) {
+               if (unilen < 0)
                         goto conv_err;
-               }
         }
         result[o] = '\0';
         *out_len = o;
@@ -619,15 +619,13 @@ befs_nls2utf(struct super_block *sb, const char *in,
  
                 /* convert from nls to unicode */
                 unilen = nls->char2uni(&in[i], in_len - i, &uni);
-               if (unilen < 0) {
+               if (unilen < 0)
                         goto conv_err;
-               }
  
                 /* convert from unicode to UTF-8 */
-               utflen = utf8_wctomb(&result[o], uni, 3);
-               if (utflen <= 0) {
+               utflen = utf32_to_utf8(uni, &result[o], 3);
+               if (utflen <= 0)
                         goto conv_err;
-               }
         }
  
         result[o] = '\0';
diff --git a/fs/fat/dir.c b/fs/fat/dir.c

index f3500294eec583a04b908e8013880a177a849f76..7c14c8cbbabaabd2ecb52220b21cfe0c6fa8dcbd 100644 (file)
--- a/fs/fat/dir.c
+++ b/fs/fat/dir.c
@@ -22,6 +22,19 @@
  #include <asm/uaccess.h>
  #include "fat.h"
  
+/*
+ * Maximum buffer size of short name.
+ * [(MSDOS_NAME + '.') * max one char + nul]
+ * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
+ */
+#define FAT_MAX_SHORT_SIZE     ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
+/*
+ * Maximum buffer size of unicode chars from slots.
+ * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
+ */
+#define FAT_MAX_UNI_CHARS      ((MSDOS_SLOTS - 1) * 13 + 1)
+#define FAT_MAX_UNI_SIZE       (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
+
  static inline loff_t fat_make_i_pos(struct super_block *sb,
                                     struct buffer_head *bh,
                                     struct msdos_dir_entry *de)
@@ -171,7 +184,8 @@ static inline int fat_uni_to_x8(struct msdos_sb_info *sbi, const wchar_t *uni,
                                 unsigned char *buf, int size)
  {
         if (sbi->options.utf8)
-               return utf8_wcstombs(buf, uni, size);
+               return utf16s_to_utf8s(uni, FAT_MAX_UNI_CHARS,
+                               UTF16_HOST_ENDIAN, buf, size);
         else
                 return uni16_to_x8(buf, uni, size, sbi->options.unicode_xlate,
                                    sbi->nls_io);
@@ -324,19 +338,6 @@ parse_long:
         return 0;
  }
  
-/*
- * Maximum buffer size of short name.
- * [(MSDOS_NAME + '.') * max one char + nul]
- * For msdos style, ['.' (hidden) + MSDOS_NAME + '.' + nul]
- */
-#define FAT_MAX_SHORT_SIZE     ((MSDOS_NAME + 1) * NLS_MAX_CHARSET_SIZE + 1)
-/*
- * Maximum buffer size of unicode chars from slots.
- * [(max longname slots * 13 (size in a slot) + nul) * sizeof(wchar_t)]
- */
-#define FAT_MAX_UNI_CHARS      ((MSDOS_SLOTS - 1) * 13 + 1)
-#define FAT_MAX_UNI_SIZE       (FAT_MAX_UNI_CHARS * sizeof(wchar_t))
-
  /*
   * Return values: negative -> error, 0 -> not found, positive -> found,
   * value is the total amount of slots, including the shortname entry.
diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c

index b50ecbe97f83d09a412390c96a750facfcf603e9..f92ad9995356a5be88a260698bc72b09f1ce6401 100644 (file)
--- a/fs/fat/namei_vfat.c
+++ b/fs/fat/namei_vfat.c
@@ -502,11 +502,11 @@ xlate_to_uni(const unsigned char *name, int len, unsigned char *outname,
         if (utf8) {
                 int name_len = strlen(name);
  
-               *outlen = utf8_mbstowcs((wchar_t *)outname, name, PATH_MAX);
+               *outlen = utf8s_to_utf16s(name, PATH_MAX, (wchar_t *) outname);
  
                 /*
                  * We stripped '.'s before and set len appropriately,
-                * but utf8_mbstowcs doesn't care about len
+                * but utf8s_to_utf16s doesn't care about len
                  */
                 *outlen -= (name_len - len);
  
diff --git a/fs/isofs/joliet.c b/fs/isofs/joliet.c

index 92c14b850e9cadeaf2179ca012130cfa6b744204..a048de81c09318ae5ccd092f5675649a50a7025b 100644 (file)
--- a/fs/isofs/joliet.c
+++ b/fs/isofs/joliet.c
@@ -37,37 +37,6 @@ uni16_to_x8(unsigned char *ascii, __be16 *uni, int len, struct nls_table *nls)
         return (op - ascii);
  }
  
-/* Convert big endian wide character string to utf8 */
-static int
-wcsntombs_be(__u8 *s, const __u8 *pwcs, int inlen, int maxlen)
-{
-       const __u8 *ip;
-       __u8 *op;
-       int size;
-       __u16 c;
-
-       op = s;
-       ip = pwcs;
-       while ((*ip || ip[1]) && (maxlen > 0) && (inlen > 0)) {
-               c = (*ip << 8) | ip[1];
-               if (c > 0x7f) {
-                       size = utf8_wctomb(op, c, maxlen);
-                       if (size == -1) {
-                               /* Ignore character and move on */
-                               maxlen--;
-                       } else {
-                               op += size;
-                               maxlen -= size;
-                       }
-               } else {
-                       *op++ = (__u8) c;
-               }
-               ip += 2;
-               inlen--;
-       }
-       return (op - s);
-}
-
  int
  get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, struct inode * inode)
  {
@@ -79,8 +48,9 @@ get_joliet_filename(struct iso_directory_record * de, unsigned char *outname, st
         nls = ISOFS_SB(inode->i_sb)->s_nls_iocharset;
  
         if (utf8) {
-               len = wcsntombs_be(outname, de->name,
-                               de->name_len[0] >> 1, PAGE_SIZE);
+               len = utf16s_to_utf8s((const wchar_t *) de->name,
+                               de->name_len[0] >> 1, UTF16_BIG_ENDIAN,
+                               outname, PAGE_SIZE);
         } else {
                 len = uni16_to_x8(outname, (__be16 *) de->name,
                                 de->name_len[0] >> 1, nls);
diff --git a/fs/ncpfs/ncplib_kernel.c b/fs/ncpfs/ncplib_kernel.c

index 97645f112114e0c2f1aa4304a1f85046ec66ebb6..0ec6237a5970f162e0ed0d72b33fa9f1e15a7b86 100644 (file)
--- a/fs/ncpfs/ncplib_kernel.c
+++ b/fs/ncpfs/ncplib_kernel.c
@@ -1113,11 +1113,13 @@ ncp__io2vol(struct ncp_server *server, unsigned char *vname, unsigned int *vlen,
  
                 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
                         int k;
+                       unicode_t u;
  
-                       k = utf8_mbtowc(&ec, iname, iname_end - iname);
-                       if (k < 0)
+                       k = utf8_to_utf32(iname, iname_end - iname, &u);
+                       if (k < 0 || u > MAX_WCHAR_T)
                                 return -EINVAL;
                         iname += k;
+                       ec = u;
                 } else {
                         if (*iname == NCP_ESC) {
                                 int k;
@@ -1214,7 +1216,7 @@ ncp__vol2io(struct ncp_server *server, unsigned char *iname, unsigned int *ilen,
                 if (NCP_IS_FLAG(server, NCP_FLAG_UTF8)) {
                         int k;
  
-                       k = utf8_wctomb(iname, ec, iname_end - iname);
+                       k = utf32_to_utf8(ec, iname, iname_end - iname);
                         if (k < 0) {
                                 err = -ENAMETOOLONG;
                                 goto quit;
diff --git a/fs/nls/nls_base.c b/fs/nls/nls_base.c

index 750abf211e2608267d25174531c73e74d058ca6f..477d37d83b316367e1ac04fb31ba98e375a37b1a 100644 (file)
--- a/fs/nls/nls_base.c
+++ b/fs/nls/nls_base.c
@@ -15,6 +15,7 @@
  #include <linux/errno.h>
  #include <linux/kmod.h>
  #include <linux/spinlock.h>
+#include <asm/byteorder.h>
  
  static struct nls_table default_table;
  static struct nls_table *tables = &default_table;
@@ -43,10 +44,17 @@ static const struct utf8_table utf8_table[] =
      {0,                                                       /* end of table    */}
  };
  
-int
-utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
+#define UNICODE_MAX    0x0010ffff
+#define PLANE_SIZE     0x00010000
+
+#define SURROGATE_MASK 0xfffff800
+#define SURROGATE_PAIR 0x0000d800
+#define SURROGATE_LOW  0x00000400
+#define SURROGATE_BITS 0x000003ff
+
+int utf8_to_utf32(const u8 *s, int len, unicode_t *pu)
  {
-       long l;
+       unsigned long l;
         int c0, c, nc;
         const struct utf8_table *t;
    
@@ -57,12 +65,13 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
                 nc++;
                 if ((c0 & t->cmask) == t->cval) {
                         l &= t->lmask;
-                       if (l < t->lval)
+                       if (l < t->lval || l > UNICODE_MAX ||
+                                       (l & SURROGATE_MASK) == SURROGATE_PAIR)
                                 return -1;
-                       *p = l;
+                       *pu = (unicode_t) l;
                         return nc;
                 }
-               if (n <= nc)
+               if (len <= nc)
                         return -1;
                 s++;
                 c = (*s ^ 0x80) & 0xFF;
@@ -72,76 +81,119 @@ utf8_mbtowc(wchar_t *p, const __u8 *s, int n)
         }
         return -1;
  }
+EXPORT_SYMBOL(utf8_to_utf32);
  
-int
-utf8_mbstowcs(wchar_t *pwcs, const __u8 *s, int n)
+int utf32_to_utf8(unicode_t u, u8 *s, int maxlen)
  {
-       __u16 *op;
-       const __u8 *ip;
-       int size;
-
-       op = pwcs;
-       ip = s;
-       while (*ip && n > 0) {
-               if (*ip & 0x80) {
-                       size = utf8_mbtowc(op, ip, n);
-                       if (size == -1) {
-                               /* Ignore character and move on */
-                               ip++;
-                               n--;
-                       } else {
-                               op++;
-                               ip += size;
-                               n -= size;
-                       }
-               } else {
-                       *op++ = *ip++;
-                       n--;
-               }
-       }
-       return (op - pwcs);
-}
-
-int
-utf8_wctomb(__u8 *s, wchar_t wc, int maxlen)
-{
-       long l;
+       unsigned long l;
         int c, nc;
         const struct utf8_table *t;
-  
+
         if (!s)
                 return 0;
-  
-       l = wc;
+
+       l = u;
+       if (l > UNICODE_MAX || (l & SURROGATE_MASK) == SURROGATE_PAIR)
+               return -1;
+
         nc = 0;
         for (t = utf8_table; t->cmask && maxlen; t++, maxlen--) {
                 nc++;
                 if (l <= t->lmask) {
                         c = t->shift;
-                       *s = t->cval | (l >> c);
+                       *s = (u8) (t->cval | (l >> c));
                         while (c > 0) {
                                 c -= 6;
                                 s++;
-                               *s = 0x80 | ((l >> c) & 0x3F);
+                               *s = (u8) (0x80 | ((l >> c) & 0x3F));
                         }
                         return nc;
                 }
         }
         return -1;
  }
+EXPORT_SYMBOL(utf32_to_utf8);
  
-int
-utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
+int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs)
  {
-       const __u16 *ip;
-       __u8 *op;
+       u16 *op;
         int size;
+       unicode_t u;
+
+       op = pwcs;
+       while (*s && len > 0) {
+               if (*s & 0x80) {
+                       size = utf8_to_utf32(s, len, &u);
+                       if (size < 0) {
+                               /* Ignore character and move on */
+                               size = 1;
+                       } else if (u >= PLANE_SIZE) {
+                               u -= PLANE_SIZE;
+                               *op++ = (wchar_t) (SURROGATE_PAIR |
+                                               ((u >> 10) & SURROGATE_BITS));
+                               *op++ = (wchar_t) (SURROGATE_PAIR |
+                                               SURROGATE_LOW |
+                                               (u & SURROGATE_BITS));
+                       } else {
+                               *op++ = (wchar_t) u;
+                       }
+                       s += size;
+                       len -= size;
+               } else {
+                       *op++ = *s++;
+                       len--;
+               }
+       }
+       return op - pwcs;
+}
+EXPORT_SYMBOL(utf8s_to_utf16s);
+
+static inline unsigned long get_utf16(unsigned c, enum utf16_endian endian)
+{
+       switch (endian) {
+       default:
+               return c;
+       case UTF16_LITTLE_ENDIAN:
+               return __le16_to_cpu(c);
+       case UTF16_BIG_ENDIAN:
+               return __be16_to_cpu(c);
+       }
+}
+
+int utf16s_to_utf8s(const wchar_t *pwcs, int len, enum utf16_endian endian,
+               u8 *s, int maxlen)
+{
+       u8 *op;
+       int size;
+       unsigned long u, v;
  
         op = s;
-       ip = pwcs;
-       while (*ip && maxlen > 0) {
-               if (*ip > 0x7f) {
-                       size = utf8_wctomb(op, *ip, maxlen);
+       while (len > 0 && maxlen > 0) {
+               u = get_utf16(*pwcs, endian);
+               if (!u)
+                       break;
+               pwcs++;
+               len--;
+               if (u > 0x7f) {
+                       if ((u & SURROGATE_MASK) == SURROGATE_PAIR) {
+                               if (u & SURROGATE_LOW) {
+                                       /* Ignore character and move on */
+                                       continue;
+                               }
+                               if (len <= 0)
+                                       break;
+                               v = get_utf16(*pwcs, endian);
+                               if ((v & SURROGATE_MASK) != SURROGATE_PAIR ||
+                                               !(v & SURROGATE_LOW)) {
+                                       /* Ignore character and move on */
+                                       continue;
+                               }
+                               u = PLANE_SIZE + ((u & SURROGATE_BITS) << 10)
+                                               + (v & SURROGATE_BITS);
+                               pwcs++;
+                               len--;
+                       }
+                       size = utf32_to_utf8(u, op, maxlen);
                         if (size == -1) {
                                 /* Ignore character and move on */
                         } else {
@@ -149,13 +201,13 @@ utf8_wcstombs(__u8 *s, const wchar_t *pwcs, int maxlen)
                                 maxlen -= size;
                         }
                 } else {
-                       *op++ = (__u8) *ip;
+                       *op++ = (u8) u;
                         maxlen--;
                 }
-               ip++;
         }
-       return (op - s);
+       return op - s;
  }
+EXPORT_SYMBOL(utf16s_to_utf8s);
  
  int register_nls(struct nls_table * nls)
  {
@@ -467,9 +519,5 @@ EXPORT_SYMBOL(unregister_nls);
  EXPORT_SYMBOL(unload_nls);
  EXPORT_SYMBOL(load_nls);
  EXPORT_SYMBOL(load_nls_default);
-EXPORT_SYMBOL(utf8_mbtowc);
-EXPORT_SYMBOL(utf8_mbstowcs);
-EXPORT_SYMBOL(utf8_wctomb);
-EXPORT_SYMBOL(utf8_wcstombs);
  
  MODULE_LICENSE("Dual BSD/GPL");
diff --git a/fs/nls/nls_utf8.c b/fs/nls/nls_utf8.c

index aa2c42fdd977d8ed481e9e303382e608d1f6a9d8..0d60a44acacd42b7eee10349a672a9ec9a74240a 100644 (file)
--- a/fs/nls/nls_utf8.c
+++ b/fs/nls/nls_utf8.c
@@ -15,7 +15,11 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
  {
         int n;
  
-       if ( (n = utf8_wctomb(out, uni, boundlen)) == -1) {
+       if (boundlen <= 0)
+               return -ENAMETOOLONG;
+
+       n = utf32_to_utf8(uni, out, boundlen);
+       if (n < 0) {
                 *out = '?';
                 return -EINVAL;
         }
@@ -25,11 +29,14 @@ static int uni2char(wchar_t uni, unsigned char *out, int boundlen)
  static int char2uni(const unsigned char *rawstring, int boundlen, wchar_t *uni)
  {
         int n;
+       unicode_t u;
  
-       if ( (n = utf8_mbtowc(uni, rawstring, boundlen)) == -1) {
+       n = utf8_to_utf32(rawstring, boundlen, &u);
+       if (n < 0 || u > MAX_WCHAR_T) {
                 *uni = 0x003f;  /* ? */
-               n = -EINVAL;
+               return -EINVAL;
         }
+       *uni = (wchar_t) u;
         return n;
  }
  
diff --git a/include/linux/nls.h b/include/linux/nls.h

index 52b1a76c1b431520773888b0861d5a61163d408f..d47beef08dfdd59374c9d4b41309d99ba9682fd0 100644 (file)
--- a/include/linux/nls.h
+++ b/include/linux/nls.h
@@ -3,8 +3,23 @@
  
  #include <linux/init.h>
  
-/* unicode character */
-typedef __u16 wchar_t;
+/* Unicode has changed over the years.  Unicode code points no longer
+ * fit into 16 bits; as of Unicode 5 valid code points range from 0
+ * to 0x10ffff (17 planes, where each plane holds 65536 code points).
+ *
+ * The original decision to represent Unicode characters as 16-bit
+ * wchar_t values is now outdated.  But plane 0 still includes the
+ * most commonly used characters, so we will retain it.  The newer
+ * 32-bit unicode_t type can be used when it is necessary to
+ * represent the full Unicode character set.
+ */
+
+/* Plane-0 Unicode character */
+typedef u16 wchar_t;
+#define MAX_WCHAR_T    0xffff
+
+/* Arbitrary Unicode character */
+typedef u32 unicode_t;
  
  struct nls_table {
         const char *charset;
@@ -21,6 +36,13 @@ struct nls_table {
  /* this value hold the maximum octet of charset */
  #define NLS_MAX_CHARSET_SIZE 6 /* for UTF-8 */
  
+/* Byte order for UTF-16 strings */
+enum utf16_endian {
+       UTF16_HOST_ENDIAN,
+       UTF16_LITTLE_ENDIAN,
+       UTF16_BIG_ENDIAN
+};
+
  /* nls.c */
  extern int register_nls(struct nls_table *);
  extern int unregister_nls(struct nls_table *);
@@ -28,10 +50,11 @@ extern struct nls_table *load_nls(char *);
  extern void unload_nls(struct nls_table *);
  extern struct nls_table *load_nls_default(void);
  
-extern int utf8_mbtowc(wchar_t *, const __u8 *, int);
-extern int utf8_mbstowcs(wchar_t *, const __u8 *, int);
-extern int utf8_wctomb(__u8 *, wchar_t, int);
-extern int utf8_wcstombs(__u8 *, const wchar_t *, int);
+extern int utf8_to_utf32(const u8 *s, int len, unicode_t *pu);
+extern int utf32_to_utf8(unicode_t u, u8 *s, int maxlen);
+extern int utf8s_to_utf16s(const u8 *s, int len, wchar_t *pwcs);
+extern int utf16s_to_utf8s(const wchar_t *pwcs, int len,
+               enum utf16_endian endian, u8 *s, int maxlen);
  
  static inline unsigned char nls_tolower(struct nls_table *t, unsigned char c)
  {
author	Alan Stern <stern@rowland.harvard.edu>
	Thu, 30 Apr 2009 14:08:18 +0000 (10:08 -0400)
committer	Greg Kroah-Hartman <gregkh@suse.de>
	Tue, 16 Jun 2009 04:44:43 +0000 (21:44 -0700)
drivers/usb/core/message.c
fs/befs/linuxvfs.c
fs/fat/dir.c
fs/fat/namei_vfat.c
fs/isofs/joliet.c
fs/ncpfs/ncplib_kernel.c
fs/nls/nls_base.c
fs/nls/nls_utf8.c
include/linux/nls.h