diff mbox series

[v6,2/3] gweb: Refactor 'parse_request_and_proxy_urls'.

Message ID F91C98F6-5CFF-4A53-B364-5A56C77D1BA4@nuovations.com (mailing list archive)
State Not Applicable, archived
Headers show
Series gweb: refactor parse_url for IPv6 addresses. | expand

Commit Message

Grant Erickson Nov. 11, 2023, 5:47 p.m. UTC
Prior to this change, 'parse_request_and_proxy_urls' failed to correctly
handle RFC 2732-compliant URLs with bracketed IPv6 addresses such as:

    http://[2001:db8:4006:812::200e]:8080/online/status.html

Such bracketing is necessary when using IPv6 addresses to disambiguate
the host component from the port component, since IPv6 addresses
themselves contain colons (':'). Prior to this change, such URLs
resulted in the brackets and the IPv6 address being passed together to
GResolv which, unsurprisingly, failed to forward resolve them since the
resulting host was neither a valid host name nor a valid IPv6 address.

This change adds support for such RFC 2732-compliant bracketed IPv6
addresses by refactoring the previously-monolithic
'parse_request_and_proxy_urls' into several focused functions:

    * parse_request_and_proxy_urls
        - parse_request_url
            o parse_url_components
                + parse_url_scheme
                + parse_url_host_and_port
                    * parse_url_host
                    * parse_url_port
                + parse_url_path
        - parse_proxy_url

In particular, 'parse_url_host' is the new function responsible for
parsing the host and correctly handling one of seven possible
combinations of host and port, two of which include bracketed IPv6
addresses.

In addition, 'parse_url_host' now returns an error on an empty or
missing host and 'parse_url_port' now returns an error on invalid or
out-of-range ports.
---
 gweb/gweb.c | 455 ++++++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 390 insertions(+), 65 deletions(-)
diff mbox series

Patch

diff --git a/gweb/gweb.c b/gweb/gweb.c
index b68ec123438f..16980b0e6a4e 100644
--- a/gweb/gweb.c
+++ b/gweb/gweb.c
@@ -1110,101 +1110,426 @@  static int create_transport(struct web_session *session)
 	return 0;
 }
 
-static int parse_request_and_proxy_urls(struct web_session *session,
-				const char *url, const char *proxy)
+/*
+ * Extract the optional scheme (the text preceding "://") from a URL.
+ *
+ * 'url' and 'url_length' describe the whole URL and '*cursor' is the
+ * current parse position within it. If the "://" delimiter is found
+ * at or after the cursor, '*scheme' (when 'scheme' is non-NULL)
+ * receives a g_strndup() copy of the text between the cursor and the
+ * delimiter, and the cursor is advanced past the delimiter. If no
+ * delimiter is found, '*scheme' is set to NULL and the cursor is left
+ * where it was.
+ *
+ * Returns 0 whether or not a scheme was present, or -EINVAL if 'url'
+ * is NULL, 'url_length' is zero, or 'cursor' is NULL.
+ */
+static int parse_url_scheme(const char *url, size_t url_length,
+						const char **cursor,
+						char **scheme)
 {
-	char *scheme, *host, *port, *path;
+	static const char * const scheme_delimiter = "://";
+	static const size_t scheme_delimiter_length = 3;
+	const char *result;
+	size_t remaining_length;
+	size_t scheme_length = 0;
 
-	scheme = g_strdup(url);
-	if (!scheme)
+	if (!url || !url_length || !cursor)
 		return -EINVAL;
 
-	host = strstr(scheme, "://");
-	if (host) {
-		*host = '\0';
-		host += 3;
+	/* Number of characters from the cursor to the end of the URL. */
+	remaining_length = url_length - (size_t)(*cursor - url);
+	if (remaining_length) {
+		/*
+		 * The delimiter is searched for in everything that remains,
+		 * not only at the start of the URL.
+		 */
+		result = memmem(*cursor,
+					remaining_length,
+					scheme_delimiter,
+					scheme_delimiter_length);
+		if (result) {
+			scheme_length = (size_t)(result - *cursor);
 
-		if (strcasecmp(scheme, "https") == 0) {
-			session->port = 443;
-			session->flags |= SESSION_FLAG_USE_TLS;
-		} else if (strcasecmp(scheme, "http") == 0) {
-			session->port = 80;
-		} else {
-			g_free(scheme);
+			if (scheme)
+				*scheme = g_strndup(*cursor, scheme_length);
+
+			/* Leave the cursor on the first character after "://". */
+			*cursor += scheme_length + scheme_delimiter_length;
+		} else if (scheme)
+			*scheme = NULL;
+	} else if (scheme)
+		*scheme = NULL;
+
+	return 0;
+}
+
+/*
+ * Extract the host component of a URL, starting at the cursor.
+ *
+ * The host is either a bracketed IPv6 literal ("[...]", RFC 2732) or
+ * an IPv4 address / host name. In both cases it is confined to the
+ * text preceding the first path delimiter ('/'), so that brackets or
+ * colons appearing later in the path are never mistaken for part of
+ * the host.
+ *
+ * On success, '*host' (when 'host' is non-NULL) receives a
+ * g_strndup() copy of the host, without brackets, and the cursor is
+ * advanced to the first character after the host, that is, after the
+ * closing bracket for an IPv6 literal.
+ *
+ * Returns 0 on success, or -EINVAL if an argument is invalid, an
+ * opening bracket has no closing peer, or the host is empty.
+ */
+static int parse_url_host(const char *url, size_t url_length,
+						const char **cursor,
+						char **host)
+{
+	static char port_delimiter = ':';
+	static char path_delimiter = '/';
+	size_t remaining_length;
+	size_t authority_length;
+	size_t host_length	= 0;
+	size_t consumed_length = 0;
+	const char *result;
+	const char *path_start;
+	const char *closing_bracket;
+	int err = 0;
+
+	if (!url || !url_length || !cursor || !*cursor)
+		return -EINVAL;
+
+	remaining_length = url_length - (size_t)(*cursor - url);
+
+	/*
+	 * The host and port live before the first path delimiter, if
+	 * any. Bound every search below to that span.
+	 */
+	path_start = memchr(*cursor, path_delimiter, remaining_length);
+	if (path_start)
+		authority_length = (size_t)(path_start - *cursor);
+	else
+		authority_length = remaining_length;
+
+	if (authority_length && **cursor == '[') {
+		/*
+		 * The host starts with an opening bracket, so it must be a
+		 * bracketed IPv6 address. Search for the peer closing
+		 * bracket.
+		 */
+		closing_bracket = memchr(*cursor, ']', authority_length);
+		if (!closing_bracket)
 			return -EINVAL;
-		}
+
+		/*
+		 * The host is everything between, but not including, the
+		 * brackets. The cursor must nonetheless skip both brackets.
+		 */
+		host_length = (size_t)(closing_bracket - *cursor) - 1;
+		consumed_length = host_length + 2;
+
+		if (host_length && host)
+			*host = g_strndup(*cursor + 1, host_length);
 	} else {
-		host = scheme;
-		session->port = 80;
-	}
+		/*
+		 * At this point, we either have an IPv4 address or a host
+		 * name. It ends at the port delimiter, ':', if there is one
+		 * before the path; otherwise, it ends at the path delimiter
+		 * or at the end of the string.
+		 */
+		result = memchr(*cursor, port_delimiter, authority_length);
+		if (result)
+			host_length = (size_t)(result - *cursor);
+		else
+			host_length = authority_length;
+
+		consumed_length = host_length;
 
-	path = strchr(host, '/');
-	if (path)
-		*(path++) = '\0';
+		if (host_length && host)
+			*host = g_strndup(*cursor, host_length);
+	}
 
-	if (!proxy)
-		session->request = g_strdup_printf("/%s", path ? path : "");
+	/* An empty host is an error; leave the cursor where it was. */
+	if (!host_length)
+		err = -EINVAL;
 	else
-		session->request = g_strdup(url);
+		*cursor += consumed_length;
+
+	return err;
+}
+
+/*
+ * Extract the optional port component of a URL, starting at the
+ * cursor.
+ *
+ * The port delimiter (':') is only searched for ahead of the first
+ * path delimiter ('/'), so a colon inside the path is not mistaken
+ * for a port. The port must be one or more decimal digits, no larger
+ * than UINT16_MAX, and must be followed by the end of the string or
+ * by '/', '?', or '#'.
+ *
+ * On success with a port present, the cursor is moved to the first
+ * character after the port digits. '*port' (when 'port' is non-NULL)
+ * receives the port, or -1 if the URL has no port.
+ *
+ * NOTE(review): the port is stored through an 'int16_t', so values
+ * above INT16_MAX come back negative and 65535 is indistinguishable
+ * from the -1 "no port" value; widening the type requires changing
+ * the callers as well.
+ *
+ * Returns 0 on success, -ERANGE if the port is out of range, or
+ * -EINVAL if an argument is invalid or the port is malformed.
+ */
+static int parse_url_port(const char *url, size_t url_length,
+						const char **cursor,
+						int16_t *port)
+{
+	static char port_delimiter = ':';
+	static char path_delimiter = '/';
+	static const size_t port_delimiter_length = 1;
+	const char *result;
+	const char *path_start;
+	const char *digits;
+	size_t remaining_length;
+	size_t authority_length;
+	char *end;
+	unsigned long tmp_port;
+
+	if (!url || !url_length || !cursor || !*cursor)
+		return -EINVAL;
+
+	remaining_length = url_length - (size_t)(*cursor - url);
+
+	/* Only look for the port ahead of the path, if there is one. */
+	path_start = memchr(*cursor, path_delimiter, remaining_length);
+	if (path_start)
+		authority_length = (size_t)(path_start - *cursor);
+	else
+		authority_length = remaining_length;
+
+	result = memchr(*cursor, port_delimiter, authority_length);
+	if (result) {
+		digits = result + port_delimiter_length;
+
+		/*
+		 * strtoul() skips leading white space and accepts a sign;
+		 * neither is valid in a port, so insist on a leading digit.
+		 */
+		if (*digits < '0' || *digits > '9')
+			return -EINVAL;
+
+		errno = 0;
+		tmp_port = strtoul(digits, &end, 10);
+		if (errno == ERANGE || tmp_port > UINT16_MAX)
+			return -ERANGE;
 
-	port = strrchr(host, ':');
-	if (port) {
-		char *end;
-		int tmp = strtol(port + 1, &end, 10);
+		/* Nothing but the rest of the URL may follow the digits. */
+		if (*end != '\0' && *end != path_delimiter &&
+					*end != '?' && *end != '#')
+			return -EINVAL;
 
-		if (*end == '\0') {
-			*port = '\0';
-			session->port = tmp;
+		*cursor = end;
+	} else
+		tmp_port = -1;
+
+	if (port)
+		*port = (int16_t)tmp_port;
+
+	return 0;
+}
+
+/*
+ * Parse the host and then the port component of a URL, starting at
+ * the cursor.
+ *
+ * The host is only handed to the caller through '*host' (when 'host'
+ * is non-NULL) once both components have parsed successfully; on any
+ * failure the intermediate host copy is released automatically.
+ *
+ * Returns 0 on success, -EINVAL if 'url' is NULL, 'url_length' is
+ * zero, or 'cursor' is NULL, or the error returned by
+ * 'parse_url_host' or 'parse_url_port'.
+ */
+static int parse_url_host_and_port(const char *url, size_t url_length,
+						const char **cursor,
+						char **host,
+						int16_t *port)
+{
+	g_autofree char *parsed_host = NULL;
+	int status;
+
+	if (!url || !url_length || !cursor)
+		return -EINVAL;
+
+	/* The host comes first... */
+
+	status = parse_url_host(url, url_length, cursor, &parsed_host);
+	if (status != 0)
+		return status;
+
+	/* ...followed by the port. */
+
+	status = parse_url_port(url, url_length, cursor, port);
+	if (status != 0)
+		return status;
+
+	/* Both succeeded; transfer ownership of the host to the caller. */
+
+	if (host)
+		*host = g_steal_pointer(&parsed_host);
+
+	return 0;
+}
+
+/*
+ * Extract the optional path component of a URL, starting at the
+ * cursor.
+ *
+ * The path is everything after the first path delimiter ('/') found
+ * at or after the cursor, up to the end of the URL; the delimiter
+ * itself is not included. When a path is present, '*path' (when
+ * 'path' is non-NULL) receives a g_strndup() copy of it and the
+ * cursor is moved to the end of the URL. Otherwise, '*path' is set
+ * to NULL and the cursor is left where it was.
+ *
+ * Returns 0 whether or not a path was present, or -EINVAL if an
+ * argument is invalid.
+ */
+static int parse_url_path(const char *url, size_t url_length,
+						const char **cursor,
+						char **path)
+{
+	static char path_delimiter = '/';
+	static const size_t path_delimiter_length = 1;
+	const char *result;
+	const char *path_start;
+	size_t remaining_length;
+	size_t path_length = 0;
+
+	if (!url || !url_length || !cursor || !*cursor)
+		return -EINVAL;
+
+	remaining_length = url_length - (size_t)(*cursor - url);
+
+	result = memchr(*cursor, path_delimiter, remaining_length);
+	if (result) {
+		path_start = result + path_delimiter_length;
+		path_length = url_length - (size_t)(path_start - url);
+
+		if (path)
+			*path = g_strndup(path_start, path_length);
+
+		/*
+		 * The path runs to the end of the URL, so that is where the
+		 * cursor belongs, regardless of how far ahead of the
+		 * delimiter it started.
+		 */
+		*cursor = url + url_length;
+	} else if (path)
+		*path = NULL;
+
+	return 0;
+}
+
+/*
+ * Split a URL into its scheme, host, port, and path components.
+ *
+ * Leading white space is skipped, then the scheme, the host and
+ * port, and the path are parsed in that order. '*scheme' and '*host'
+ * (when those pointers are non-NULL) are only assigned once every
+ * component has parsed successfully; 'port' and 'path' are passed
+ * straight through to the component parsers.
+ *
+ * Returns 0 on success, -EINVAL if 'url' is NULL or empty, or the
+ * error returned by the first component parser that fails.
+ */
+static int parse_url_components(const char *url,
+						char **scheme,
+						char **host,
+						int16_t *port,
+						char **path)
+{
+	g_autofree char *parsed_scheme = NULL;
+	g_autofree char *parsed_host = NULL;
+	const char *cursor = url;
+	size_t url_length;
+	int status;
+
+	if (!url)
+		return -EINVAL;
+
+	url_length = strlen(url);
+	if (url_length == 0)
+		return -EINVAL;
+
+	/* Step over any leading white space. */
+
+	for (; g_ascii_isspace(*cursor); cursor++)
+		;
+
+	/* Scheme first... */
+
+	status = parse_url_scheme(url, url_length, &cursor, &parsed_scheme);
+	if (status != 0)
+		return status;
+
+	/* ...then the host and port... */
+
+	status = parse_url_host_and_port(url, url_length, &cursor,
+						&parsed_host, port);
+	if (status != 0)
+		return status;
+
+	/* ...and finally the path. */
+
+	status = parse_url_path(url, url_length, &cursor, path);
+	if (status != 0)
+		return status;
+
+	/* Everything parsed; hand the results over to the caller. */
+
+	if (scheme)
+		*scheme = g_steal_pointer(&parsed_scheme);
+
+	if (host)
+		*host = g_steal_pointer(&parsed_host);
+
+	return 0;
+}
+
+/*
+ * Parse the request URL and apply the result to the session.
+ *
+ * The scheme selects the default port (443 for "https", which also
+ * enables TLS for the session; 80 for "http" or when the scheme is
+ * absent); any other scheme is rejected. An explicit port in the URL
+ * overrides the default.
+ *
+ * Without a proxy, 'session->host' is the bare host and
+ * 'session->request' is the URL path prefixed with '/'. With a
+ * proxy, 'session->host' is "host:port" when the URL has an explicit
+ * port and 'session->request' is the whole request URL.
+ *
+ * Returns 0 on success, -EINVAL if an argument is NULL or the scheme
+ * is unsupported, or the error returned by 'parse_url_components'.
+ */
+static int parse_request_url(struct web_session *session,
+				const char *request_url, bool has_proxy_url)
+{
+	g_autofree char *scheme = NULL;
+	g_autofree char *host = NULL;
+	g_autofree char *path = NULL;
+	int16_t port = -1;
+	int err = 0;
+
+	if (!session || !request_url)
+		return -EINVAL;
+
+	/* Parse the request URL components. */
+
+	err = parse_url_components(request_url,
+			&scheme,
+			&host,
+			&port,
+			&path);
+	if (err != 0)
+		goto done;
+
+	/*
+	 * Handle the URL scheme, if any, for the session, defaulting to
+	 * the "http" scheme and port 80. As before this refactoring,
+	 * "https" must also mark the session as using TLS.
+	 */
+	if (scheme) {
+		if (g_ascii_strcasecmp(scheme, "https") == 0) {
+			session->port = 443;
+			session->flags |= SESSION_FLAG_USE_TLS;
+		} else if (g_ascii_strcasecmp(scheme, "http") == 0) {
+			session->port = 80;
+		} else {
+			err = -EINVAL;
+			goto done;
 		}
+	} else
+		session->port = 80;
+
+	/* Handle the URL host and port, if any, for the session. */
 
-		if (!proxy)
+	/*
+	 * The port travels through an 'int16_t', so ports above
+	 * INT16_MAX are negative here; convert back through 'uint16_t'
+	 * before storing or formatting the value.
+	 *
+	 * NOTE(review): port 65535 is indistinguishable from the -1 "no
+	 * port" value and is therefore treated as absent.
+	 */
+	if (port != -1) {
+		session->port = (uint16_t)port;
+
+		if (!has_proxy_url)
 			session->host = g_strdup(host);
 		else
-			session->host = g_strdup_printf("%s:%u", host, tmp);
+			session->host = g_strdup_printf("%s:%u", host,
+						(unsigned int)(uint16_t)port);
 	} else
 		session->host = g_strdup(host);
 
-	g_free(scheme);
+	/* Handle the URL path, if any, for the session. */
 
-	if (!proxy)
-		return 0;
+	if (!has_proxy_url)
+		session->request = g_strdup_printf("/%s", path ? path : "");
+	else
+		session->request = g_strdup(request_url);
+
+done:
+	return err;
+}
 
-	scheme = g_strdup(proxy);
-	if (!scheme)
+/*
+ * Parse the proxy URL and apply the result to the session.
+ *
+ * Only the scheme, host, and port of the proxy URL are parsed. The
+ * scheme, when present, must be "http". On success,
+ * 'session->address' takes ownership of the parsed proxy host and
+ * 'session->port' is overridden by the proxy port, if one was given.
+ *
+ * Returns 0 on success, -EINVAL if an argument is NULL or the scheme
+ * is not "http", or the error returned by 'parse_url_scheme' or
+ * 'parse_url_host_and_port'.
+ */
+static int parse_proxy_url(struct web_session *session, const char *proxy_url)
+{
+	const char *p;
+	size_t proxy_length;
+	g_autofree char *scheme = NULL;
+	g_autofree char *host = NULL;
+	int16_t port = -1;
+	int err = 0;
+
+	if (!session || !proxy_url)
 		return -EINVAL;
 
-	host = strstr(proxy, "://");
-	if (host) {
-		*host = '\0';
-		host += 3;
+	/*
+	 * Parse the proxy URL scheme, host, and port, the only three
+	 * components we care about.
+	 */
+	p = proxy_url;
+	proxy_length = strlen(p);
+
+	err = parse_url_scheme(proxy_url,
+			proxy_length,
+			&p,
+			&scheme);
+	if (err != 0)
+		goto done;
+
+	err = parse_url_host_and_port(proxy_url,
+			proxy_length,
+			&p,
+			&host,
+			&port);
+	if (err != 0)
+		goto done;
+
+	/*
+	 * Handle the proxy URL scheme, if any, for the session. Only
+	 * "http" is allowed.
+	 */
+	if (scheme && g_ascii_strcasecmp(scheme, "http") != 0) {
+		err = -EINVAL;
+		goto done;
+	}
 
-		if (strcasecmp(scheme, "http") != 0) {
-			g_free(scheme);
-			return -EINVAL;
-		}
-	} else
-		host = scheme;
+	/*
+	 * Handle the proxy URL host and port for the session.
+	 *
+	 * 'host' is released automatically when this function returns,
+	 * so ownership must be transferred to the session rather than
+	 * merely copying the pointer.
+	 */
+	if (host)
+		session->address = g_steal_pointer(&host);
 
-	path = strchr(host, '/');
-	if (path)
-		*(path++) = '\0';
+	/*
+	 * The port travels through an 'int16_t'; convert it back through
+	 * 'uint16_t' so that ports above INT16_MAX are stored correctly.
+	 */
+	if (port != -1)
+		session->port = (uint16_t)port;
 
-	port = strrchr(host, ':');
-	if (port) {
-		char *end;
-		int tmp = strtol(port + 1, &end, 10);
+done:
+	return err;
+}
 
-		if (*end == '\0') {
-			*port = '\0';
-			session->port = tmp;
-		}
-	}
+/*
+ * Parse the request URL and, when one is supplied, the proxy URL,
+ * applying the results to the session.
+ *
+ * 'proxy' may be NULL, in which case only the request URL is parsed.
+ * The request URL is always parsed first; the proxy URL is parsed
+ * only if that succeeded.
+ *
+ * Returns 0 on success, -EINVAL if 'session' or 'url' is NULL, or
+ * the error returned by 'parse_request_url' or 'parse_proxy_url'.
+ */
+static int parse_request_and_proxy_urls(struct web_session *session,
+				const char *url, const char *proxy)
+{
+	const bool has_proxy_url = (proxy != NULL);
+	int err = 0;
 
-	session->address = g_strdup(host);
+	if (!session || !url)
+		return -EINVAL;
 
-	g_free(scheme);
+	/* Parse and handle the request URL */
 
-	return 0;
+	err = parse_request_url(session, url, has_proxy_url);
+	if (err != 0)
+		goto done;
+
+	/* Without a proxy there is nothing more to do. */
+	if (!has_proxy_url)
+		goto done;
+
+	/* Parse and handle the proxy URL */
+
+	err = parse_proxy_url(session, proxy);
+	if (err != 0)
+		goto done;
+
+done:
+	return err;
 }
 
 static void handle_resolved_address(struct web_session *session)