uri.c - sr71.git - A gopher client for your terminal.

uri.c (13555B)
#include "pch.h"
#include "str.h"
#include "uri.h"
#include "util.h"

/*
 * Parse the first uristr_len bytes of uristr into a struct uri.
 *
 * The result starts out zeroed; fields are then filled in when the
 * corresponding component is present:
 *
 *   protocol_str, protocol -- scheme text before the first colon and the
 *                             result of uri_protocol_lookup() on it
 *   flags                  -- URI_NO_PROTOCOL_SLASHES_BIT when the scheme's
 *                             colon is not followed by a slash
 *   host, port             -- authority following the scheme (never set for
 *                             PROTOCOL_FILE)
 *   path                   -- path; kept in relative form for relative URIs
 *   query                  -- text between '?' and '#' (or end of input)
 *
 * Fragments ('#...') are ignored.  A component whose size does not fit its
 * destination buffer trips an ASSERT; the copy size is clamped to the buffer
 * size regardless.
 */
struct uri
uri_parse(const char *uristr, int uristr_len)
{
    struct uri uri = { 0 };

    int colon = 0,
        hostname_start = 0,
        hostname_len = 0,
        protocol_name_len = 0,
        path_start = 0,
        path_len = 0,
        query = 0;

    /*
     * Look for a colon.  We expect a maximum of two colons in the scheme and
     * authority part of a URI:
     *
     *   1 colon  -- gopher://example.com/
     *   1 colon  -- example.com:70/
     *   2 colons -- gopher://example.com:70/
     *
     * Note that we consider the second case to be an invalid URI, as no
     * protocol is specified, we cannot make any assumption of what protocol to
     * use.
     */
    colon = str_ichr(uristr, ':', uristr_len);

    if (colon < uristr_len)
    {
        char nextchar;

        if (colon + 1 < uristr_len)
            nextchar = uristr[colon + 1];
        else
            nextchar = 0;

        /* Check if this is a port on a URI without a specified scheme.
         * The cast keeps isdigit() defined for bytes >= 0x80 on platforms
         * where plain char is signed. */
        if (isdigit((unsigned char)nextchar))
        {
            /* TODO how should we handle this? */
            ASSERT(0 && "port parse without scheme is unimplemented");
        }
        else /* this is a scheme */
        {
            int protocol_name_size;
            const char *protocol_name;

            protocol_name = uristr;
            protocol_name_len = colon;
            protocol_name_size = colon + 1; /* +1 for null-terminator */

            /* Set the URI protocol */
            ASSERT(protocol_name_size < sizeof(uri.protocol_str) &&
                   "big protocol name");

            /* Truncate protocol name */
            protocol_name_size = min(sizeof(uri.protocol_str),
                                     protocol_name_size);
            /* TODO: log truncation */

            str_copy(uri.protocol_str, protocol_name, protocol_name_size);
            uri.protocol = uri_protocol_lookup(protocol_name,
                                               protocol_name_len);

            /*
             * We handle two types of schemes.  Those with two slashes '//' and
             * those without.  For example:
             *
             *   No slashes  -- mailto:info@example.com
             *   No slashes  -- internal:about
             *   Two slashes -- gopher://example.com/
             */
            if (nextchar != '/') /* no slashes */
                uri.flags |= URI_NO_PROTOCOL_SLASHES_BIT;
        }
    }


    /* Determine hostname, which can appear only in non-relative URIs.
     * We assume that the hostname can only appear when a protocol is
     * specified, e.g. the following URI is considered invalid:
     *
     *   example.com/index.html
     *
     * But the following is valid and has a hostname of example.com:
     *
     *   gopher://example.com/index.html
     *
     * Note that the LOCAL protocols (file://) are a special case.  There is
     * never a hostname for these protocols and instead we always assume that
     * anything following the scheme is the path.
     */
    if (uri.protocol != PROTOCOL_FILE &&
           (protocol_name_len > 0 &&
            protocol_name_len + 1 < uristr_len)) /* check +1 for colon */
    {
        char c;
        int hostname_size;
        int port_start;

        ASSERT(uristr[protocol_name_len] == ':');

        /* Hostname starts after the scheme.  We start just past the colon and
         * skip over any slashes */
        for (hostname_start = protocol_name_len + 1;
             hostname_start < uristr_len &&
                (uristr[hostname_start] == '/');
             ++hostname_start)
            ;

        /* Hostname can end at either a colon, a slash (the beginning of the
         * path), or the end of the string */
        for (hostname_len = 0;
             hostname_start + hostname_len < uristr_len;
             ++hostname_len)
        {
            c = uristr[hostname_len + hostname_start];

            if (c == '\0' || c == ':' || c == '/' || c == '?' || c == '#')
                break;
        }

        hostname_size = hostname_len + 1;
        ASSERT(hostname_size < sizeof(uri.host) && "big hostname");
        hostname_size = min(sizeof(uri.host), hostname_size); /* truncate */
        /* TODO: log truncation */

        str_copy(uri.host, uristr + hostname_start, hostname_size);

        /* An optional port follows the hostname directly as ":<digits>".
         * Only this position is examined, so a colon that appears later in
         * the path or query (e.g. "/0/time/12:30") is never mistaken for a
         * port. */
        port_start = hostname_start + hostname_len;
        if (port_start < uristr_len && uristr[port_start] == ':')
        {
            int port_len;

            ++port_start; /* step over the ':' */

            for (port_len = 0;
                 (port_start + port_len < uristr_len) &&
                     isdigit((unsigned char)uristr[port_start + port_len]);
                 ++port_len)
                ;

            if (port_len > 0)
                uri.port = str_tol(uristr + port_start, port_len);
        }
    }

    /* Determine path.  For URIs with a scheme and hostname, this path should
     * always begin with a leading slash.  For relative URIs, the path actually
     * starts from the beginning of the string.  For local schemes (file://)
     * this is anything that follows the scheme.
     *
     * e.g.:
     *      URI                        PATH
     *   0. gopher://example.com       -->  (empty)
     *   1. gopher://example.com:70/   -->  /
     *   2. gopher://example.com/index -->  /index
     *   3. /index                     -->  /index
     *   4. index                      -->  index
     *   5. ../index                   -->  ../index
     *   6. file://index               -->  index
     *   7. file:///index              -->  /index
     *
     * Note that in this parsing function for examples 4, 5, and 6 the paths
     * are stored in the relative form as shown.  These paths should be
     * normalised with the current URI at some point to compute the real path,
     * perhaps when the page is displayed to the user.
     *
     *      URI                        ACTUAL PATH
     *      index                      -->  <current path>/index
     *      ../index                   -->  <current path>/../index
     *      file://index               -->  <current path>/index
     */
    if (uri.protocol == PROTOCOL_FILE) /* local URI */
    {
        int n_slashes, path_size;

        ASSERT(protocol_name_len > 0);
        ASSERT(uristr[protocol_name_len] == ':');

        /* Everything after scheme is the path.  We move over the colon and
         * *maximum of two* slashes (to allow absolute paths) */
        n_slashes = 0;
        for (path_start = protocol_name_len + 1;
             path_start < uristr_len && n_slashes < 2;
             ++path_start)
        {
            if (uristr[path_start] == '/')
                ++n_slashes;
        }

        /* Find length of path.  Generally ends at the end of the string. */
        for (path_len = 0;
             (path_start + path_len < uristr_len) &&
                 (uristr[path_start + path_len] != '\0') &&
                 (uristr[path_start + path_len] != '?') &&
                 (uristr[path_start + path_len] != '#'); /* ignore fragment */
             ++path_len)
            ;

        path_size = path_len + 1;
        ASSERT(path_size < sizeof(uri.path) && "big path");
        path_size = min(sizeof(uri.path), path_size); /* truncate */
        /* TODO: log truncation */

        /* Copy path */
        str_copy(uri.path, uristr + path_start, path_size);
    }
    else if (hostname_len > 0) /* URI with a hostname */
    {
        int path_size;

        /* Path may start at the end of hostname and after port, at the first
         * slash.  Walking forward to the first '/', '?' or '#' also steps
         * over any ":port" text.  If we encounter the query (?) or fragment
         * (#), we leave the "path start" point there, even though the path
         * length will be 0, so that we can still find the query later on. */
        for (path_start = hostname_start + hostname_len;
             path_start < uristr_len &&
                 (uristr[path_start] != '\0') &&
                 (uristr[path_start] != '/') &&
                 (uristr[path_start] != '?') &&
                 (uristr[path_start] != '#'); /* ignore fragment */
             ++path_start)
            ;

        /* Length is to end of the string or to the query. */
        for (path_len = 0;
             path_start + path_len < uristr_len &&
                 (uristr[path_start + path_len] != '\0') &&
                 (uristr[path_start + path_len] != '?') &&
                 (uristr[path_start + path_len] != '#'); /* ignore fragment */
             ++path_len)
             ;

        path_size = path_len + 1;
        ASSERT(path_size < sizeof(uri.path) && "big path");
        path_size = min(sizeof(uri.path), path_size); /* truncate */
        /* TODO: log truncation */

        str_copy(uri.path, uristr + path_start, path_size);
    }
    else /* relative URI--we use whole beginning section as path. */
    {
        int path_size;

        /* NOTE(review): a URI with a scheme but an empty host (for example
         * "gopher://") also reaches this branch and trips this assertion. */
        ASSERT(protocol_name_len == 0);

        /* Find length of path.  Goes until the end of the string or at
         * query or fragment. */
        for (path_len = 0;
             (path_start + path_len < uristr_len) &&
                 (uristr[path_start + path_len] != '\0') &&
                 (uristr[path_start + path_len] != '?') &&
                 (uristr[path_start + path_len] != '#'); /* ignore fragment */
             ++path_len)
            ;

        path_size = path_len + 1;
        ASSERT(path_size < sizeof(uri.path) && "big path");
        path_size = min(sizeof(uri.path), path_size); /* truncate */
        /* TODO: log truncation */

        /* Copy path */
        str_copy(uri.path, uristr + path_start, path_size);
    }

    /* Find query, which always appears directly after the path.  Every path
     * scan above stops at the first '?', '#', NUL or end of input, so a query
     * is present only when the character that ended the path is '?'.  A '?'
     * that appears inside the fragment is therefore not treated as a query. */
    query = path_start + path_len;
    if (query < uristr_len && uristr[query] == '?')
    {
        int query_len, query_size;

        ++query; /* read after the '?' */

        /* Find length of query.  Goes until the end of the string or the
         * fragment. */
        for (query_len = 0;
             (query + query_len < uristr_len) &&
                 (uristr[query + query_len] != '\0') &&
                 (uristr[query + query_len] != '#'); /* ignore fragment */
             ++query_len)
            ;

        /* An empty query ("a?" or "a?#frag") leaves uri.query zeroed. */
        if (query_len > 0)
        {
            query_size = query_len + 1;
            ASSERT(query_size < sizeof(uri.query) && "big query");
            query_size = min(sizeof(uri.query), query_size);
            /* TODO: log truncation */

            /* Copy query */
            str_copy(uri.query, uristr + query, query_size);
        }
    }

    return uri;
}

/*
 * Format a URI as text into buffer.
 *
 * buffer       destination; must be non-NULL
 * u            URI to format; must be non-NULL
 * buffer_size  capacity of buffer in bytes; must be greater than zero
 * flags        URISTR_* bits:
 *                URISTR_NO_PORT_BIT  -- omit ":<port>"
 *                URISTR_NOSLASH_BIT  -- drop one trailing '/' after the path
 *                URISTR_NOQUERY_BIT  -- omit "?<query>"
 *
 * Components are written in the order scheme, "//" (unless the URI carries
 * URI_NO_PROTOCOL_SLASHES_BIT), host, port, path, query.
 *
 * Returns the running total of characters appended to buffer.
 * NOTE(review): this assumes str_copy() returns the number of characters it
 * stored, excluding the terminator -- confirm against str.h.
 */
size_t
uri_str(char *buffer,
        const struct uri *const u,
        size_t buffer_size,
        uint32_t flags)
{
    ASSERT(u);
    ASSERT(buffer);
    ASSERT(buffer_size > 0);

    int pos = 0;
    buffer[pos] = '\0';

    /* Write the scheme first */
    if (u->protocol != PROTOCOL_NONE)
    {
        ASSERT(u->protocol_str[0] && "uri has no protocol");

        int protocol_len;

        /* Write protocol string */
        protocol_len = (int)str_copy(buffer, u->protocol_str, buffer_size);

        /* Write the colon following protocol */
        protocol_len += (int)str_copy(buffer + protocol_len, ":",
                                      buffer_size - protocol_len);

        /* Write slashes if the protocol has them */
        if (!(u->flags & URI_NO_PROTOCOL_SLASHES_BIT))
        {
            protocol_len += (int)str_copy(buffer + protocol_len, "//",
                                          buffer_size - protocol_len);
        }

        pos += protocol_len;
    }

    /* Write the hostname */
    if (u->host[0] != '\0')
    {
        pos += (int)str_copy(buffer + pos, u->host, buffer_size - pos);
    }

    /* Write the port */
    if (u->port > 0 && !(flags & URISTR_NO_PORT_BIT))
    {
        size_t avail = buffer_size - pos;
        int written = snprintf(buffer + pos, avail, ":%d", u->port);

        /* snprintf() returns the length the output would have had without
         * truncation, which can exceed what was stored.  Advance only by the
         * characters that are really in the buffer so that pos never moves
         * past the terminator and later "buffer_size - pos" cannot wrap. */
        if (written > 0 && avail > 0)
        {
            if ((size_t)written >= avail)
                written = (int)(avail - 1);

            pos += written;
        }
    }

    /* Write path */
    if (u->path[0] != '\0')
    {
        pos += (int)str_copy(buffer + pos, u->path, buffer_size - pos);

        /* trim trailing slash */
        if ((flags & URISTR_NOSLASH_BIT) && pos > 0 && buffer[pos - 1] == '/')
            buffer[--pos] = '\0';
    }

    /* Write query */
    if (u->query[0] != '\0' && !(flags & URISTR_NOQUERY_BIT))
    {
        /* Write the leading '?' followed by query */
        pos += (int)str_copy(buffer + pos, "?", buffer_size - pos);
        pos += (int)str_copy(buffer + pos, u->query, buffer_size - pos);
    }

    return (size_t)pos;
}

/*
 * Compare two URIs for equality.
 *
 * u1, u2  URIs to compare (passed by value)
 * flags   URIEQ_* bits:
 *           URIEQ_IGNORE_TRAILING_SLASH_BIT -- one trailing '/' on either
 *                                              path is not significant
 *           URIEQ_IGNORE_QUERY_BIT          -- do not compare the queries
 *
 * Returns true when the (possibly slash-trimmed) path lengths match and the
 * host, path and protocol are equal -- and, unless the query is ignored, the
 * query as well.  The port is not compared.
 */
bool
uri_equal(struct uri u1, struct uri u2, uint32_t flags)
{
    size_t l1 = strz_len(u1.path);
    size_t l2 = strz_len(u2.path);

    /* Strip trailing path slashes.  The length guards matter: for an empty
     * path, "l - 1" would wrap around (size_t is unsigned) and index far
     * outside the path buffer. */
    if (flags & URIEQ_IGNORE_TRAILING_SLASH_BIT)
    {
        if (l1 > 0 && u1.path[l1 - 1] == '/') --l1;
        if (l2 > 0 && u2.path[l2 - 1] == '/') --l2;
    }

    /* Check that query matches */
    if (!(flags & URIEQ_IGNORE_QUERY_BIT) &&
        !str_equal(u1.query, u2.query, sizeof(u1.query)))
    {
        return false;
    }

    return l1 == l2 &&
           str_equal(u1.host, u2.host, sizeof(u1.host)) &&
           str_equal(u1.path, u2.path, l1) &&
           u1.protocol == u2.protocol;
}
index	:	sr71.git
		A gopher client for your terminal.
		git clone git://git.skec.site/pub/sr71.git
		log \| files \| refs \| readme \| license