[PATCH 01 of 12] Win32: non-ASCII names support in autoindex (ticket #458)

Sergey Kandaurov pluknet at nginx.com
Wed Feb 22 15:39:13 UTC 2023


> On 19 Feb 2023, at 21:17, Maxim Dounin <mdounin at mdounin.ru> wrote:
> 
> Hello!
> 
> On Fri, Feb 17, 2023 at 06:38:54PM +0400, Sergey Kandaurov wrote:
> 
>>> On 13 Jan 2023, at 01:35, Maxim Dounin <mdounin at mdounin.ru> wrote:
>>> 
>>> # HG changeset patch
>>> # User Maxim Dounin <mdounin at mdounin.ru>
>>> # Date 1673548890 -10800
>>> #      Thu Jan 12 21:41:30 2023 +0300
>>> # Node ID 60d845f9505fe1b97c1e04b680523b790e29fdb1
>>> # Parent  07b0bee87f32be91a33210bc06973e07c4c1dac9
>>> Win32: non-ASCII names support in autoindex (ticket #458).
>>> 
>>> Notably, ngx_open_dir() now supports opening directories with non-ASCII
>>> characters, and directory entries returned by ngx_read_dir() are properly
>>> converted to UTF-8.
>>> 
>>> diff -r 07b0bee87f32 -r 60d845f9505f src/os/win32/ngx_files.c
>>> --- a/src/os/win32/ngx_files.c	Wed Dec 21 14:53:27 2022 +0300
>>> +++ b/src/os/win32/ngx_files.c	Thu Jan 12 21:41:30 2023 +0300
>>> @@ -13,7 +13,11 @@
>>> 
>>> static ngx_int_t ngx_win32_check_filename(u_char *name, u_short *u,
>>>    size_t len);
>>> -static u_short *ngx_utf8_to_utf16(u_short *utf16, u_char *utf8, size_t *len);
>>> +static u_short *ngx_utf8_to_utf16(u_short *utf16, u_char *utf8, size_t *len,
>>> +    size_t reserved);
>>> +static u_char *ngx_utf16_to_utf8(u_char *utf8, u_short *utf16, size_t *len,
>>> +    size_t *allocated);
>>> +uint32_t ngx_utf16_decode(u_short **u, size_t n);
>>> 
>>> 
>>> /* FILE_FLAG_BACKUP_SEMANTICS allows to obtain a handle to a directory */
>>> @@ -28,7 +32,7 @@ ngx_open_file(u_char *name, u_long mode,
>>>    u_short     utf16[NGX_UTF16_BUFLEN];
>>> 
>>>    len = NGX_UTF16_BUFLEN;
>>> -    u = ngx_utf8_to_utf16(utf16, name, &len);
>>> +    u = ngx_utf8_to_utf16(utf16, name, &len, 0);
>>> 
>>>    if (u == NULL) {
>>>        return INVALID_HANDLE_VALUE;
>>> @@ -269,7 +273,7 @@ ngx_file_info(u_char *file, ngx_file_inf
>>> 
>>>    len = NGX_UTF16_BUFLEN;
>>> 
>>> -    u = ngx_utf8_to_utf16(utf16, file, &len);
>>> +    u = ngx_utf8_to_utf16(utf16, file, &len, 0);
>>> 
>>>    if (u == NULL) {
>>>        return NGX_FILE_ERROR;
>>> @@ -427,49 +431,51 @@ ngx_realpath(u_char *path, u_char *resol
>>> ngx_int_t
>>> ngx_open_dir(ngx_str_t *name, ngx_dir_t *dir)
>>> {
>>> -    u_char     *pattern, *p;
>>> +    size_t      len;
>>> +    u_short    *u, *p;
>>>    ngx_err_t   err;
>>> +    u_short     utf16[NGX_UTF16_BUFLEN];
>>> 
>>> -    pattern = malloc(name->len + 3);
>>> -    if (pattern == NULL) {
>>> +    len = NGX_UTF16_BUFLEN - 2;
>>> +    u = ngx_utf8_to_utf16(utf16, name->data, &len, 2);
>>> +
>>> +    if (u == NULL) {
>>>        return NGX_ERROR;
>>>    }
>>> 
>>> -    p = ngx_cpymem(pattern, name->data, name->len);
>>> +    if (ngx_win32_check_filename(name->data, u, len) != NGX_OK) {
>>> +        goto failed;
>>> +    }
>>> +
>>> +    p = &u[len - 1];
>>> 
>>>    *p++ = '/';
>>>    *p++ = '*';
>>>    *p = '\0';
>>> 
>>> -    dir->dir = FindFirstFile((const char *) pattern, &dir->finddata);
>>> +    dir->dir = FindFirstFileW(u, &dir->finddata);
>>> 
>>>    if (dir->dir == INVALID_HANDLE_VALUE) {
>>> -        err = ngx_errno;
>>> -        ngx_free(pattern);
>>> -        ngx_set_errno(err);
>>> -        return NGX_ERROR;
>>> +        goto failed;
>>>    }
>>> 
>>> -    ngx_free(pattern);
>>> +    if (u != utf16) {
>>> +        ngx_free(u);
>>> +    }
>>> 
>>>    dir->valid_info = 1;
>>>    dir->ready = 1;
>>> +    dir->name = NULL;
>>> +    dir->allocated = 0;
>>> 
>>>    return NGX_OK;
>>> -}
>>> 
>>> +failed:
>>> 
>>> -ngx_int_t
>>> -ngx_read_dir(ngx_dir_t *dir)
>>> -{
>>> -    if (dir->ready) {
>>> -        dir->ready = 0;
>>> -        return NGX_OK;
>>> -    }
>>> -
>>> -    if (FindNextFile(dir->dir, &dir->finddata) != 0) {
>>> -        dir->type = 1;
>>> -        return NGX_OK;
>>> +    if (u != utf16) {
>>> +        err = ngx_errno;
>>> +        ngx_free(u);
>>> +        ngx_set_errno(err);
>>>    }
>>> 
>>>    return NGX_ERROR;
>>> @@ -477,8 +483,56 @@ ngx_read_dir(ngx_dir_t *dir)
>>> 
>>> 
>>> ngx_int_t
>>> +ngx_read_dir(ngx_dir_t *dir)
>>> +{
>>> +    u_char  *name;
>>> +    size_t   len, allocated;
>>> +
>>> +    if (dir->ready) {
>>> +        dir->ready = 0;
>>> +        goto convert;
>>> +    }
>>> +
>>> +    if (FindNextFileW(dir->dir, &dir->finddata) != 0) {
>>> +        dir->type = 1;
>>> +        goto convert;
>>> +    }
>>> +
>>> +    return NGX_ERROR;
>>> +
>>> +convert:
>>> +
>>> +    name = dir->name;
>>> +    len = dir->allocated;
>>> +
>>> +    name = ngx_utf16_to_utf8(name, dir->finddata.cFileName, &len, &allocated);
>>> +    if (name == NULL) {
>> 
>> Note that you just return on NULL, but see below.
>> 
>>> +        return NGX_ERROR;
>>> +    }
>>> +
>>> +    if (name != dir->name) {
>>> +
>>> +        if (dir->name) {
>>> +            ngx_free(dir->name);
>>> +        }
>>> +
>>> +        dir->name = name;
>>> +        dir->allocated = allocated;
>>> +    }
>>> +
>>> +    dir->namelen = len - 1;
>>> +
>>> +    return NGX_OK;
>>> +}
>>> +
>>> +
>>> +ngx_int_t
>>> ngx_close_dir(ngx_dir_t *dir)
>>> {
>>> +    if (dir->name) {
>>> +        ngx_free(dir->name);
>>> +    }
>>> +
>>>    if (FindClose(dir->dir) == 0) {
>>>        return NGX_ERROR;
>>>    }
>>> @@ -816,7 +870,7 @@ failed:
>>> 
>>> 
>>> static u_short *
>>> -ngx_utf8_to_utf16(u_short *utf16, u_char *utf8, size_t *len)
>>> +ngx_utf8_to_utf16(u_short *utf16, u_char *utf8, size_t *len, size_t reserved)
>>> {
>>>    u_char    *p;
>>>    u_short   *u, *last;
>>> @@ -865,7 +919,7 @@ ngx_utf8_to_utf16(u_short *utf16, u_char
>>> 
>>>    /* the given buffer is not enough, allocate a new one */
>>> 
>>> -    u = malloc(((p - utf8) + ngx_strlen(p) + 1) * sizeof(u_short));
>>> +    u = malloc(((p - utf8) + ngx_strlen(p) + 1 + reserved) * sizeof(u_short));
>>>    if (u == NULL) {
>>>        return NULL;
>>>    }
>>> @@ -910,3 +964,170 @@ ngx_utf8_to_utf16(u_short *utf16, u_char
>>> 
>>>    /* unreachable */
>>> }
>>> +
>>> +
>>> +static u_char *
>>> +ngx_utf16_to_utf8(u_char *utf8, u_short *utf16, size_t *len, size_t *allocated)
>>> +{
>>> +    u_char    *p, *last;
>>> +    u_short   *u, *j;
>>> +    uint32_t   n;
>>> +
>>> +    u = utf16;
>>> +    p = utf8;
>>> +    last = utf8 + *len;
>>> +
>>> +    while (p < last) {
>>> +
>>> +        if (*u < 0x80) {
>>> +            *p++ = (u_char) *u;
>>> +
>>> +            if (*u == 0) {
>>> +                *len = p - utf8;
>>> +                return utf8;
>>> +            }
>>> +
>>> +            u++;
>>> +
>>> +            continue;
>>> +        }
>>> +
>>> +        if (p >= last - 4) {
>>> +            *len = p - utf8;
>>> +            break;
>>> +        }
>>> +
>>> +        n = ngx_utf16_decode(&u, 2);
>>> +
>>> +        if (n > 0x10ffff) {
>>> +            ngx_free(utf8);
>>> +            ngx_set_errno(NGX_EILSEQ);
>> 
>> This can result in double-free on a subsequent ngx_close_dir() call.
> 
> Thanks for catching, that's slipped in from the second loop.
> Removed ngx_free() here.

I believe it should be fine now.

> 
>> 
>>> +            return NULL;
>>> +        }
>>> +
>>> +        if (n >= 0x10000) {
>>> +            *p++ = (u_char) (0xf0 + (n >> 18));
>>> +            *p++ = (u_char) (0x80 + ((n >> 12) & 0x3f));
>>> +            *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f));
>>> +            *p++ = (u_char) (0x80 + (n & 0x3f));
>>> +            continue;
>>> +            
>> 
>> stray ws-only line (one more below)
> 
> Fixed both, thanks.
> 
>>> +        }
>>> +
>>> +        if (n >= 0x0800) {
>>> +            *p++ = (u_char) (0xe0 + (n >> 12));
>>> +            *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f));
>>> +            *p++ = (u_char) (0x80 + (n & 0x3f));
>>> +            continue;
>>> +        }
>>> +
>>> +        *p++ = (u_char) (0xc0 + (n >> 6));
>>> +        *p++ = (u_char) (0x80 + (n & 0x3f));
>>> +    }
>>> +
>>> +    /* the given buffer is not enough, allocate a new one */
>>> +
>>> +    for (j = u; *j; j++) { /* void */ }
>>> +
>>> +    p = malloc((j - utf16) * 4 + 1);
>> 
>> [Note that pointer arithmetics is in u_short (2 bytes) units.]
>> It seems you are allocating memory assuming that utf16 -> utf8
>> conversion can result in memory expansion up to 4 bytes per unit.
>> At first glance that's true, because UTF-8 symbol may take 4 bytes.
>> IIUC, 4-byte UTF-8 can result only from UTF-16 surrogate pairs,
>> this gives 2x growth (2-paired UTF-16 to 4-bytes UTF-8).
>> Otherwise, 1/2/3-byte encoded UTF-8 can result from 2-byte UTF-16,
>> which gives from 1x to 3x growth per unit.
>> So, "malloc((j - utf16) * 3 + 1)" should be enough there.
> 
> That's correct, though this depends on not-directly-visible 
> encoding details.  I would rather prefer to keep 4 here, since it 
> can be easily verified to be enough from the loop itself.
> 
> I don't think that a minor optimization of memory usage by going 
> from 4x to 3x of the particular name is important here.  Further, 
> extra allocated bytes are likely to be used as a preallocation for 
> other names.
> 

Ok, it makes sense then.

>>> +    if (p == NULL) {
>>> +        return NULL;
>>> +    }
>> 
>> Old memory is still there, allocated and referenced by dir->name,
>> see above.  It seems ok, it should be freed by ngx_close_dir() call.
>> If we happen to change callers to not call ngx_close_dir() on error,
>> this could turn into memory leaks, though.
> 
> The ngx_close_dir() is certainly required (including for other 
> reasons), so this looks perfectly fine.
> 
>>> +
>>> +    if (allocated) {
>>> +        *allocated = (j - utf16) * 4 + 1;
>>> +    }
>>> +
>>> +    ngx_memcpy(p, utf8, *len);
>>> +
>>> +    utf8 = p;
>>> +    p += *len;
>>> +
>>> +    for ( ;; ) {
>>> +
>>> +        if (*u < 0x80) {
>>> +            *p++ = (u_char) *u;
>>> +
>>> +            if (*u == 0) {
>>> +                *len = p - utf8;
>>> +                return utf8;
>>> +            }
>>> +
>>> +            u++;
>>> +
>>> +            continue;
>>> +        }
>>> +
>>> +        n = ngx_utf16_decode(&u, 2);
>>> +
>>> +        if (n > 0x10ffff) {
>>> +            ngx_free(utf8);
>>> +            ngx_set_errno(NGX_EILSEQ);
>>> +            return NULL;
>>> +        }
>>> +
>>> +        if (n >= 0x10000) {
>>> +            *p++ = (u_char) (0xf0 + (n >> 18));
>>> +            *p++ = (u_char) (0x80 + ((n >> 12) & 0x3f));
>>> +            *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f));
>>> +            *p++ = (u_char) (0x80 + (n & 0x3f));
>>> +            continue;
>>> +            
>>> +        }
>>> +
>>> +        if (n >= 0x0800) {
>>> +            *p++ = (u_char) (0xe0 + (n >> 12));
>>> +            *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f));
>>> +            *p++ = (u_char) (0x80 + (n & 0x3f));
>>> +            continue;
>>> +        }
>>> +
>>> +        *p++ = (u_char) (0xc0 + (n >> 6));
>>> +        *p++ = (u_char) (0x80 + (n & 0x3f));
>>> +    }
>>> +
>>> +    /* unreachable */
>>> +}
>>> +
>>> +
>>> +/*
>>> + * ngx_utf16_decode() decodes one or two UTF-16 code units
>>> + * the return values:
>>> + *    0x80 - 0x10ffff         valid character
>>> + *    0x110000 - 0xfffffffd   invalid sequence
>>> + *    0xfffffffe              incomplete sequence
>>> + *    0xffffffff              error
>>> + */
>>> +
>>> +uint32_t
>>> +ngx_utf16_decode(u_short **u, size_t n)
>>> +{
>>> +    uint32_t  k, m;
>>> +
>>> +    k = **u;
>>> +
>>> +    if (k < 0xd800 || k > 0xdfff) {
>>> +        (*u)++;
>>> +        return k;
>>> +    }
>>> +
>>> +    if (k > 0xdbff) {
>>> +        (*u)++;
>>> +        return 0xffffffff;
>>> +    }
>>> +
>>> +    if (n < 2) {
>>> +        return 0xfffffffe;
>>> +    }
>>> +
>>> +    (*u)++;
>>> +
>>> +    m = *(*u)++;
>>> +
>>> +    if (m < 0xdc00 || m > 0xdfff) {
>>> +        return 0xffffffff;
>>> +
>>> +    }
>>> +
>>> +    return 0x10000 + ((k - 0xd800) << 10) + (m - 0xdc00);
>>> +}
>>> diff -r 07b0bee87f32 -r 60d845f9505f src/os/win32/ngx_files.h
>>> --- a/src/os/win32/ngx_files.h	Wed Dec 21 14:53:27 2022 +0300
>>> +++ b/src/os/win32/ngx_files.h	Thu Jan 12 21:41:30 2023 +0300
>>> @@ -30,7 +30,11 @@ typedef struct {
>>> 
>>> typedef struct {
>>>    HANDLE                          dir;
>>> -    WIN32_FIND_DATA                 finddata;
>>> +    WIN32_FIND_DATAW                finddata;
>>> +
>>> +    u_char                         *name;
>>> +    size_t                          namelen;
>>> +    size_t                          allocated;
>>> 
>>>    unsigned                        valid_info:1;
>>>    unsigned                        type:1;
>>> @@ -205,8 +209,8 @@ ngx_int_t ngx_close_dir(ngx_dir_t *dir);
>>> #define ngx_dir_access(a)           (a)
>>> 
>>> 
>>> -#define ngx_de_name(dir)            ((u_char *) (dir)->finddata.cFileName)
>>> -#define ngx_de_namelen(dir)         ngx_strlen((dir)->finddata.cFileName)
>>> +#define ngx_de_name(dir)            (dir)->name
>>> +#define ngx_de_namelen(dir)         (dir)->namelen
>>> 
>>> ngx_int_t ngx_de_info(u_char *name, ngx_dir_t *dir);
>>> #define ngx_de_info_n               "dummy()"
>> 
>> Otherwise, looks good.
> 
> Fixes:
> 
> diff -r d7521ae9e138 src/os/win32/ngx_files.c
> --- a/src/os/win32/ngx_files.c	Sat Feb 18 12:08:25 2023 +0300
> +++ b/src/os/win32/ngx_files.c	Sat Feb 18 12:57:02 2023 +0300
> @@ -506,6 +506,7 @@ convert:
>     len = dir->allocated;
> 
>     name = ngx_utf16_to_utf8(name, dir->finddata.cFileName, &len, &allocated);
> +
>     if (name == NULL) {
>         return NGX_ERROR;
>     }
> @@ -1000,7 +1000,6 @@ ngx_utf16_to_utf8(u_char *utf8, u_short 
>         n = ngx_utf16_decode(&u, 2);
> 
>         if (n > 0x10ffff) {
> -            ngx_free(utf8);
>             ngx_set_errno(NGX_EILSEQ);
>             return NULL;
>         }
> @@ -1011,7 +1010,6 @@ ngx_utf16_to_utf8(u_char *utf8, u_short 
>             *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f));
>             *p++ = (u_char) (0x80 + (n & 0x3f));
>             continue;
> -            
>         }
> 
>         if (n >= 0x0800) {
> @@ -1072,7 +1070,6 @@ ngx_utf16_to_utf8(u_char *utf8, u_short 
>             *p++ = (u_char) (0x80 + ((n >> 6) & 0x3f));
>             *p++ = (u_char) (0x80 + (n & 0x3f));
>             continue;
> -            
>         }
> 
>         if (n >= 0x0800) {
> 

Looks good.

-- 
Sergey Kandaurov


More information about the nginx-devel mailing list