Message ID | 13b81b8aa06cfd63a5fd9d1acbaf21a8b388ff47.1714343461.git.gitgitgadget@gmail.com (mailing list archive) |
---|---|
State | New, archived |
Headers | show |
Series | builtin: implement, document and test url-parse | expand |
On Sun, 28 Apr 2024, Matheus Afonso Martins Moreira via GitGitGadget <gitgitgadget@gmail.com> wrote: > From: Matheus Afonso Martins Moreira <matheus@matheusmoreira.com> > > Define general parsing function that supports all Git URLs > including scp style URLs such as hostname:~user/repo. > Has the same interface as the URL normalization function > and uses the same data structures, facilitating its use. > It's adapted from the algorithm used to process URLs in connect.c, > so it should support the same inputs. > > Signed-off-by: Matheus Afonso Martins Moreira <matheus@matheusmoreira.com> > --- > urlmatch.c | 90 ++++++++++++++++++++++++++++++++++++++++++++++++++++++ > urlmatch.h | 1 + > 2 files changed, 91 insertions(+) > > diff --git a/urlmatch.c b/urlmatch.c > index 1d0254abacb..5a442e31fa2 100644 > --- a/urlmatch.c > +++ b/urlmatch.c > @@ -3,6 +3,7 @@ > #include "hex-ll.h" > #include "strbuf.h" > #include "urlmatch.h" > +#include "url.h" > > #define URL_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" > #define URL_DIGIT "0123456789" > @@ -438,6 +439,95 @@ char *url_normalize(const char *url, struct url_info *out_info) > return url_normalize_1(url, out_info, 0); > } > > +enum protocol { > + PROTO_UNKNOWN = 0, > + PROTO_LOCAL, > + PROTO_FILE, > + PROTO_SSH, > + PROTO_GIT, > +}; > + > +static enum protocol url_get_protocol(const char *name, size_t n) > +{ > + if (!strncmp(name, "ssh", n)) > + return PROTO_SSH; > + if (!strncmp(name, "git", n)) > + return PROTO_GIT; > + if (!strncmp(name, "git+ssh", n)) /* deprecated - do not use */ > + return PROTO_SSH; > + if (!strncmp(name, "ssh+git", n)) /* deprecated - do not use */ > + return PROTO_SSH; > + if (!strncmp(name, "file", n)) > + return PROTO_FILE; > + return PROTO_UNKNOWN; > +} > + > +char *url_parse(const char *url_orig, struct url_info *out_info) > +{ > + struct strbuf url; > + char *host, *separator; > + char *detached, *normalized; > + enum protocol protocol = PROTO_LOCAL; > + struct url_info 
local_info; > + struct url_info *info = out_info? out_info : &local_info; > + bool scp_syntax = false; > + > + if (is_url(url_orig)) { > + url_orig = url_decode(url_orig); > + } else { > + url_orig = xstrdup(url_orig); > + } > + > + strbuf_init(&url, strlen(url_orig) + sizeof("ssh://")); > + strbuf_addstr(&url, url_orig); > + > + host = strstr(url.buf, "://"); > + if (host) { > + protocol = url_get_protocol(url.buf, host - url.buf); > + host += 3; > + } else { > + if (!url_is_local_not_ssh(url.buf)) { > + scp_syntax = true; > + protocol = PROTO_SSH; > + strbuf_insertstr(&url, 0, "ssh://"); > + host = url.buf + 6; > + } > + } Interesting. ` $ ./git url-parse -c protocol file:/test/test ssh ` It seems that having only a single slash after the 'protocol:' always prints 'ssh' (I think this may not even be a valid URL). After this 'else' block, the URL turns into 'ssh://file/test/test'. I will examine the details later. This is not your code's doing, but rather the result of url_is_local_not_ssh(). I just wanted to point this out and ask whether this should error out, or whether it is intended behavior that I can't figure out. Thanks.
[] > Interesting. > > ` > $ ./git url-parse -c protocol file:/test/test > ssh > ` > > seems like only having a single slash after the 'protocol:' prints > 'ssh' always (I think this may not even be a valid url). After this 'else' > block, the url turns into 'ssh://file/test/test'. Will examine the details > later. Not that it's your code's doing, and rather the result of > url_is_local_not_ssh(). But just wanted to point this out and ask if this > should error out or is this an intended behavior that I can't figure out. ssh is the correct answer; try something like `git clone localhost:/home/myself/project/git.git`. This is the scp syntax, which Git supports as well. From `man scp`: "scp copies files between hosts on a network. [] The source and target may be specified as a local pathname, a remote host with optional path in the form [user@]host:[path], or a URI in the form scp://[user@]host[:port][/path]. Local file names can be made explicit using absolute or relative pathnames to avoid scp treating file names containing ‘:’ as host specifiers." So yes, scp and Git share the same ambiguity: in the short form, a ':' can mean different things, so 'file:/test/test' is read as host 'file' with path '/test/test'.
diff --git a/urlmatch.c b/urlmatch.c index 1d0254abacb..5a442e31fa2 100644 --- a/urlmatch.c +++ b/urlmatch.c @@ -3,6 +3,7 @@ #include "hex-ll.h" #include "strbuf.h" #include "urlmatch.h" +#include "url.h" #define URL_ALPHA "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz" #define URL_DIGIT "0123456789" @@ -438,6 +439,95 @@ char *url_normalize(const char *url, struct url_info *out_info) return url_normalize_1(url, out_info, 0); } +enum protocol { + PROTO_UNKNOWN = 0, + PROTO_LOCAL, + PROTO_FILE, + PROTO_SSH, + PROTO_GIT, +}; + +static enum protocol url_get_protocol(const char *name, size_t n) +{ + if (!strncmp(name, "ssh", n)) + return PROTO_SSH; + if (!strncmp(name, "git", n)) + return PROTO_GIT; + if (!strncmp(name, "git+ssh", n)) /* deprecated - do not use */ + return PROTO_SSH; + if (!strncmp(name, "ssh+git", n)) /* deprecated - do not use */ + return PROTO_SSH; + if (!strncmp(name, "file", n)) + return PROTO_FILE; + return PROTO_UNKNOWN; +} + +char *url_parse(const char *url_orig, struct url_info *out_info) +{ + struct strbuf url; + char *host, *separator; + char *detached, *normalized; + enum protocol protocol = PROTO_LOCAL; + struct url_info local_info; + struct url_info *info = out_info? 
out_info : &local_info; + bool scp_syntax = false; + + if (is_url(url_orig)) { + url_orig = url_decode(url_orig); + } else { + url_orig = xstrdup(url_orig); + } + + strbuf_init(&url, strlen(url_orig) + sizeof("ssh://")); + strbuf_addstr(&url, url_orig); + + host = strstr(url.buf, "://"); + if (host) { + protocol = url_get_protocol(url.buf, host - url.buf); + host += 3; + } else { + if (!url_is_local_not_ssh(url.buf)) { + scp_syntax = true; + protocol = PROTO_SSH; + strbuf_insertstr(&url, 0, "ssh://"); + host = url.buf + 6; + } + } + + /* path starts after ':' in scp style SSH URLs */ + if (scp_syntax) { + separator = strchr(host, ':'); + if (separator) { + if (separator[1] == '/') + strbuf_remove(&url, separator - url.buf, 1); + else + *separator = '/'; + } + } + + detached = strbuf_detach(&url, NULL); + normalized = url_normalize(detached, info); + free(detached); + + if (!normalized) { + return NULL; + } + + /* point path to ~ for URL's like this: + * + * ssh://host.xz/~user/repo + * git://host.xz/~user/repo + * host.xz:~user/repo + * + */ + if (protocol == PROTO_GIT || protocol == PROTO_SSH) { + if (normalized[info->path_off + 1] == '~') + info->path_off++; + } + + return normalized; +} + static size_t url_match_prefix(const char *url, const char *url_prefix, size_t url_prefix_len) diff --git a/urlmatch.h b/urlmatch.h index 5ba85cea139..6b3ce428582 100644 --- a/urlmatch.h +++ b/urlmatch.h @@ -35,6 +35,7 @@ struct url_info { }; char *url_normalize(const char *, struct url_info *); +char *url_parse(const char *, struct url_info *); struct urlmatch_item { size_t hostmatch_len;