A gopher client for your terminal.
git clone git://git.skec.site/pub/sr71.git
log | files | refs | readme | license

uri.c (13555B)

      1
      2
      3
      4
      5
      6
      7
      8
      9
     10
     11
     12
     13
     14
     15
     16
     17
     18
     19
     20
     21
     22
     23
     24
     25
     26
     27
     28
     29
     30
     31
     32
     33
     34
     35
     36
     37
     38
     39
     40
     41
     42
     43
     44
     45
     46
     47
     48
     49
     50
     51
     52
     53
     54
     55
     56
     57
     58
     59
     60
     61
     62
     63
     64
     65
     66
     67
     68
     69
     70
     71
     72
     73
     74
     75
     76
     77
     78
     79
     80
     81
     82
     83
     84
     85
     86
     87
     88
     89
     90
     91
     92
     93
     94
     95
     96
     97
     98
     99
    100
    101
    102
    103
    104
    105
    106
    107
    108
    109
    110
    111
    112
    113
    114
    115
    116
    117
    118
    119
    120
    121
    122
    123
    124
    125
    126
    127
    128
    129
    130
    131
    132
    133
    134
    135
    136
    137
    138
    139
    140
    141
    142
    143
    144
    145
    146
    147
    148
    149
    150
    151
    152
    153
    154
    155
    156
    157
    158
    159
    160
    161
    162
    163
    164
    165
    166
    167
    168
    169
    170
    171
    172
    173
    174
    175
    176
    177
    178
    179
    180
    181
    182
    183
    184
    185
    186
    187
    188
    189
    190
    191
    192
    193
    194
    195
    196
    197
    198
    199
    200
    201
    202
    203
    204
    205
    206
    207
    208
    209
    210
    211
    212
    213
    214
    215
    216
    217
    218
    219
    220
    221
    222
    223
    224
    225
    226
    227
    228
    229
    230
    231
    232
    233
    234
    235
    236
    237
    238
    239
    240
    241
    242
    243
    244
    245
    246
    247
    248
    249
    250
    251
    252
    253
    254
    255
    256
    257
    258
    259
    260
    261
    262
    263
    264
    265
    266
    267
    268
    269
    270
    271
    272
    273
    274
    275
    276
    277
    278
    279
    280
    281
    282
    283
    284
    285
    286
    287
    288
    289
    290
    291
    292
    293
    294
    295
    296
    297
    298
    299
    300
    301
    302
    303
    304
    305
    306
    307
    308
    309
    310
    311
    312
    313
    314
    315
    316
    317
    318
    319
    320
    321
    322
    323
    324
    325
    326
    327
    328
    329
    330
    331
    332
    333
    334
    335
    336
    337
    338
    339
    340
    341
    342
    343
    344
    345
    346
    347
    348
    349
    350
    351
    352
    353
    354
    355
    356
    357
    358
    359
    360
    361
    362
    363
    364
    365
    366
    367
    368
    369
    370
    371
    372
    373
    374
    375
    376
    377
    378
    379
    380
    381
    382
    383
    384
    385
    386
    387
    388
    389
    390
    391
    392
    393
    394
    395
    396
    397
    398
    399
    400
    401
    402
    403
    404
    405
    406
    407
#include "pch.h"
#include "str.h"
#include "uri.h"
#include "util.h"

/*
 * Parse the first uristr_len characters of uristr into a struct uri.
 *
 * The returned structure starts zero-initialised; the following fields are
 * then filled in where the corresponding component is found:
 *
 *   protocol_str / protocol -- the scheme text before the first colon and
 *                              the result of uri_protocol_lookup() on it
 *   flags                   -- URI_NO_PROTOCOL_SLASHES_BIT when the scheme's
 *                              colon is not followed by a '/'
 *   port                    -- digits following the next colon after the
 *                              scheme, if any
 *   host                    -- text after the scheme (and its slashes), not
 *                              extracted for PROTOCOL_FILE
 *   path                    -- see the table in the body
 *   query                   -- text after '?', up to '#' or the end
 *
 * The fragment ('#' and everything after it) is never stored.  Every
 * component is copied with a size clamped to the destination array, so
 * over-long components are truncated (after tripping an ASSERT).
 *
 * All direct indexing of uristr in this function is bounded by uristr_len.
 * str_ichr() is treated as returning an index >= the given length when the
 * character is absent -- NOTE(review): confirm against str.h.
 */
struct uri
uri_parse(const char *uristr, int uristr_len)
{
    struct uri uri = { 0 };

    int colon = 0,
        hostname_start = 0,
        hostname_len = 0,
        protocol_name_len = 0,
        path_start = 0,
        path_len = 0,
        query = 0;

    /*
     * Look for a colon.  We expect a maximum of two colons in a URI:
     *
     *   1 colon  -- gopher://example.com/
     *   1 colon  -- example.com:70/
     *   2 colons -- gopher://example.com:70/
     *
     * Note that we consider the second case to be an invalid URI, as no
     * protocol is specified, we cannot make any assumption of what protocol to
     * use.
     */
    colon = str_ichr(uristr, ':', uristr_len);

    if (colon < uristr_len)
    {
        char nextchar;

        if (colon + 1 < uristr_len)
            nextchar = uristr[colon + 1];
        else
            nextchar = 0;

        /* Check if this is a port on a URI without a specified scheme.
         * The cast keeps isdigit() defined for bytes >= 0x80 on platforms
         * where plain char is signed. */
        if (isdigit((unsigned char)nextchar))
        {
            /* TODO how should we handle this? */
            ASSERT(0 && "port parse without scheme is unimplemented");
        }
        else /* this is a scheme */
        {
            int protocol_name_size;
            const char *protocol_name;

            protocol_name = uristr;
            protocol_name_len = colon;
            protocol_name_size = colon + 1; /* +1 for null-terminator */

            /* Set the URI protocol */
            ASSERT(protocol_name_size < sizeof(uri.protocol_str) &&
                   "big protocol name");

            /* Truncate protocol name */
            protocol_name_size = min(sizeof(uri.protocol_str),
                                     protocol_name_size);
            /* TODO: log truncation */

            str_copy(uri.protocol_str, protocol_name, protocol_name_size);
            uri.protocol = uri_protocol_lookup(protocol_name,
                                               protocol_name_len);

            /*
             * We handle two types of schemes.  Those with two slashes '//' and
             * those without.  For example:
             *
             *   No slashes  -- mailto:info@example.com
             *   No slashes  -- internal:about
             *   Two slashes -- gopher://example.com/
             */
            if (nextchar != '/') /* no slashes */
                uri.flags |= URI_NO_PROTOCOL_SLASHES_BIT;
        }

        /* Look for a second colon (for optional port).
         *
         * NOTE(review): the search covers the whole remainder of the string,
         * so a colon followed by digits inside the path or query would also
         * be taken as the port -- confirm whether that is acceptable. */
        if (uri.port == 0)
        {
            int colon2, port_start, port_len;

            ++colon;
            colon2 = colon + str_ichr(uristr + colon, ':', uristr_len - colon);

            port_start = colon2 + 1;

            /* Count the run of digits directly after the second colon */
            for (port_len = 0;
                 (port_start + port_len < uristr_len) &&
                     isdigit((unsigned char)uristr[port_start + port_len]);
                 ++port_len)
                ;

            if (port_len > 0)
                uri.port = str_tol(uristr + port_start, port_len);
        }
    }


    /* Determine hostname, which can appear only in non-relative URIs.
     * We assume that the hostname can only appear when a protocol is
     * specified, e.g. the following URI is considered invalid:
     *
     *   example.com/index.html
     *
     * But the following is valid and has a hostname of example.com:
     *
     *   gopher://example.com/index.html
     *
     * Note that the LOCAL protocols (file://) are a special case.  There is
     * never a hostname for these protocols and instead we always assume that
     * anything following the scheme is the path.
     */
    if (uri.protocol != PROTOCOL_FILE &&
           (protocol_name_len > 0 &&
            protocol_name_len + 1 < uristr_len)) /* check +1 for colon */
    {
        char c;
        int hostname_size;

        ASSERT(uristr[protocol_name_len] == ':');

        /* Hostname starts after the scheme.  We start just past the colon and
         * skip over any slashes */
        for (hostname_start = protocol_name_len + 1;
             hostname_start < uristr_len &&
                (uristr[hostname_start] == '/');
             ++hostname_start)
            ;

        /* Hostname can end at either a colon, a slash (the beginning of the
         * path), the query or fragment, or the end of the string */
        for (hostname_len = 0;
             hostname_start + hostname_len < uristr_len;
             ++hostname_len)
        {
            c = uristr[hostname_len + hostname_start];

            if (c == '\0' || c == ':' || c == '/' || c == '?' || c == '#')
                break;
        }

        hostname_size = hostname_len + 1; /* +1 for null-terminator */
        ASSERT(hostname_size < sizeof(uri.host) && "big hostname");
        hostname_size = min(sizeof(uri.host), hostname_size); /* truncate */
        /* TODO: log truncation */

        str_copy(uri.host, uristr + hostname_start, hostname_size);
    }

    /* Determine path.  For URIs with a scheme and hostname, this path should
     * always begin with a leading slash.  For relative URIs, the path actually
     * starts from the beginning of the string.  For local schemes (file://)
     * this is anything that follows the scheme.
     *
     * e.g.:
     *      URI                        PATH
     *   0. gopher://example.com       -->  (empty)
     *   1. gopher://example.com:70/   -->  /
     *   2. gopher://example.com/index -->  /index
     *   3. /index                     -->  /index
     *   4. index                      -->  index
     *   5. ../index                   -->  ../index
     *   6. file://index               -->  index
     *   7. file:///index              -->  /index
     *
     * Note that in this parsing function for examples 4, 5, and 6 the paths
     * are stored in the relative form as shown.  These paths should be
     * normalised with the current URI at some point to compute the real path,
     * perhaps when the page is displayed to the user.
     *
     *      URI                        ACTUAL PATH
     *      index                      -->  <current path>/index
     *      ../index                   -->  <current path>/../index
     *      file://index               -->  <current path>/index
     */
    if (uri.protocol == PROTOCOL_FILE) /* local URI */
    {
        int n_slashes, path_size;

        ASSERT(protocol_name_len > 0);
        ASSERT(uristr[protocol_name_len] == ':');

        /* Everything after scheme is the path.  We move over the colon and
         * *maximum of two* slashes (to allow absolute paths).  The scan must
         * stop at the first non-slash character, otherwise the start of the
         * path itself would be consumed while hunting for more slashes
         * (e.g. "file:index" would end up with an empty path).
         *
         * NOTE(review): "file:/abs" has its single slash consumed here and
         * so parses as the relative path "abs" -- confirm that is intended. */
        for (path_start = protocol_name_len + 1, n_slashes = 0;
             path_start < uristr_len &&
                 n_slashes < 2 &&
                 uristr[path_start] == '/';
             ++path_start, ++n_slashes)
            ;

        /* Find length of path.  Generally ends at the end of the string. */
        for (path_len = 0;
             (path_start + path_len < uristr_len) &&
                 (uristr[path_start + path_len] != '\0') &&
                 (uristr[path_start + path_len] != '?') &&
                 (uristr[path_start + path_len] != '#'); /* ignore fragment */
             ++path_len)
            ;

        path_size = path_len + 1; /* +1 for null-terminator */
        ASSERT(path_size < sizeof(uri.path) && "big path");
        path_size = min(sizeof(uri.path), path_size); /* truncate */
        /* TODO: log truncation */

        /* Copy path */
        str_copy(uri.path, uristr + path_start, path_size);
    }
    else if (hostname_len > 0) /* URI with a hostname */
    {
        int path_size;

        /* Path may start at the end of hostname and after port, at the first
         * slash.  If we encounter the query (?) or fragment (#), we leave the
         * "path start" point there, even though the path length will be 0, so
         * that we can still find the query later on.
         *
         * A ":port" suffix needs no special handling: neither the colon nor
         * the digits are terminators, so this scan steps straight over
         * them. */
        for (path_start = hostname_start + hostname_len;
             path_start < uristr_len &&
                 (uristr[path_start] != '\0') &&
                 (uristr[path_start] != '/') &&
                 (uristr[path_start] != '?') &&
                 (uristr[path_start] != '#'); /* ignore fragment */
             ++path_start)
            ;

        /* Length is to end of the string or to the query. */
        for (path_len = 0;
             path_start + path_len < uristr_len &&
                 (uristr[path_start + path_len] != '\0') &&
                 (uristr[path_start + path_len] != '?') &&
                 (uristr[path_start + path_len] != '#'); /* ignore fragment */
             ++path_len)
             ;

        path_size = path_len + 1; /* +1 for null-terminator */
        ASSERT(path_size < sizeof(uri.path) && "big path");
        path_size = min(sizeof(uri.path), path_size); /* truncate */
        /* TODO: log truncation */

        str_copy(uri.path, uristr + path_start, path_size);
    }
    else /* relative URI--we use whole beginning section as path. */
    {
        int path_size;

        /* NOTE(review): a URI that has a scheme but an empty hostname (for
         * example "gopher://") also reaches this branch and trips the
         * assertion below -- confirm how such input should be handled. */
        ASSERT(protocol_name_len == 0);

        /* Find length of path.  Goes until the end of the string or at
         * query or fragment.  path_start is still 0 here, so the path begins
         * at the start of the string. */
        for (path_len = 0;
             (path_start + path_len < uristr_len) &&
                 (uristr[path_start + path_len] != '\0') &&
                 (uristr[path_start + path_len] != '?') &&
                 (uristr[path_start + path_len] != '#'); /* ignore fragment */
             ++path_len)
            ;

        path_size = path_len + 1; /* +1 for null-terminator */
        ASSERT(path_size < sizeof(uri.path) && "big path");
        path_size = min(sizeof(uri.path), path_size); /* truncate */
        /* TODO: log truncation */

        /* Copy path */
        str_copy(uri.path, uristr + path_start, path_size);
    }

    /* Find query, which always appears after the path.  The search begins
     * where the path ended; the result is converted back into an index into
     * uristr that points just past the '?'. */
    query = str_ichr(uristr + path_start + path_len, '?',
                     uristr_len - path_start - path_len) +
                     path_start + path_len + 1 /* +1 to read after the '?' */;
    if (query < uristr_len)
    {
        int query_len, query_size;

        /* Find length of query.  Goes until the end of the string or at
         * fragment. */
        for (query_len = 0;
             (query + query_len < uristr_len) &&
                 (uristr[query + query_len] != '\0') &&
                 (uristr[query + query_len] != '#'); /* ignore fragment */
             ++query_len)
            ;

        ASSERT(query_len > 0);

        query_size = query_len + 1; /* +1 for null-terminator */
        ASSERT(query_size < sizeof(uri.query) && "big query");
        query_size = min(sizeof(uri.query), query_size);
        /* TODO: log truncation */

        /* Copy query */
        str_copy(uri.query, uristr + query, query_size);
    }

    return uri;
}

/*
 * Write the textual form of a URI into buffer.
 *
 * buffer       -- destination; must be non-NULL
 * u            -- URI to format; must be non-NULL
 * buffer_size  -- capacity of buffer in bytes; must be > 0
 * flags        -- bitmask of:
 *                   URISTR_NO_PORT_BIT  omit the ":port" component
 *                   URISTR_NOSLASH_BIT  drop one trailing '/' after the path
 *                   URISTR_NOQUERY_BIT  omit the "?query" component
 *
 * Components are emitted in the order: scheme, ':', optional "//", host,
 * port, path, query.  Empty components are skipped.
 *
 * Returns the running write position after the last component.
 * NOTE(review): this assumes str_copy() returns the number of characters it
 * wrote, excluding the terminator -- confirm against str.h.
 */
size_t
uri_str(char *buffer,
        const struct uri *const u,
        size_t buffer_size,
        uint32_t flags)
{
    ASSERT(u);
    ASSERT(buffer);
    ASSERT(buffer_size > 0);

    int pos = 0;
    buffer[pos] = '\0';

    /* Write the scheme first */
    if (u->protocol != PROTOCOL_NONE)
    {
        ASSERT(u->protocol_str[0] && "uri has no protocol");

        int protocol_len;

        /* Write protocol string */
        protocol_len = (int)str_copy(buffer, u->protocol_str, buffer_size);

        /* Write the colon following protocol */
        protocol_len += (int)str_copy(buffer + protocol_len, ":",
                                      buffer_size - protocol_len);

        /* Write slashes if the protocol has them */
        if (!(u->flags & URI_NO_PROTOCOL_SLASHES_BIT))
        {
            protocol_len += (int)str_copy(buffer + protocol_len, "//",
                                          buffer_size - protocol_len);
        }

        pos += protocol_len;
    }

    /* Write the hostname */
    if (u->host[0] != '\0')
    {
        pos += (int)str_copy(buffer + pos, u->host, buffer_size - pos);
    }

    /* Write the port */
    if (u->port > 0 && !(flags & URISTR_NO_PORT_BIT))
    {
        size_t remaining = buffer_size - (size_t)pos;
        int written = snprintf(buffer + pos, remaining, ":%d", u->port);

        /* snprintf() reports the length the output *would* have had (or a
         * negative value on error), not what actually fit.  Advance only by
         * the characters really stored so that pos never moves past the end
         * of the buffer. */
        if (written > 0)
        {
            if ((size_t)written >= remaining)
                written = (remaining > 0) ? (int)(remaining - 1) : 0;

            pos += written;
        }
    }

    /* Write path */
    if (u->path[0] != '\0')
    {
        pos += (int)str_copy(buffer + pos, u->path, buffer_size - pos);

        /* trim trailing slash */
        if ((flags & URISTR_NOSLASH_BIT) && pos > 0 && buffer[pos - 1] == '/')
            buffer[--pos] = '\0';
    }

    /* Write query */
    if (u->query[0] != '\0' && !(flags & URISTR_NOQUERY_BIT))
    {
        /* Write the leading '?' followed by query */
        pos += (int)str_copy(buffer + pos, "?", buffer_size - pos);
        pos += (int)str_copy(buffer + pos, u->query, buffer_size - pos);
    }

    return pos;
}

/*
 * Compare two URIs for equality.
 *
 * u1, u2  -- the URIs to compare (passed by value)
 * flags   -- bitmask of:
 *              URIEQ_IGNORE_TRAILING_SLASH_BIT  treat a single trailing '/'
 *                                               on either path as absent
 *              URIEQ_IGNORE_QUERY_BIT           do not compare the queries
 *
 * Returns true when the protocol, host and path (and, unless ignored, the
 * query) all match.  The port is not compared.
 *
 * NOTE(review): strz_len() is assumed to return the length of a
 * null-terminated string -- confirm against str.h.
 */
bool
uri_equal(struct uri u1, struct uri u2, uint32_t flags)
{
    size_t l1 = strz_len(u1.path);
    size_t l2 = strz_len(u2.path);

    /* Strip trailing path slashes.  The length checks matter: with an empty
     * path, "l - 1" would wrap around and index far outside the array. */
    if (flags & URIEQ_IGNORE_TRAILING_SLASH_BIT)
    {
        if (l1 > 0 && u1.path[l1 - 1] == '/') --l1;
        if (l2 > 0 && u2.path[l2 - 1] == '/') --l2;
    }

    /* Check that query matches */
    if (!(flags & URIEQ_IGNORE_QUERY_BIT) &&
        !str_equal(u1.query, u2.query, sizeof(u1.query)))
    {
        return false;
    }

    /* Paths are compared only over the (possibly slash-trimmed) length, and
     * only once both trimmed lengths are known to be equal. */
    return l1 == l2 &&
           str_equal(u1.host, u2.host, sizeof(u1.host)) &&
           str_equal(u1.path, u2.path, l1) &&
           u1.protocol == u2.protocol;
}