diff mbox series

[v4,10/15] bugreport: add config values from safelist

Message ID 20191213004312.169753-11-emilyshaffer@google.com (mailing list archive)
State New, archived
Headers show
Series [v4,01/15] bugreport: add tool to generate debugging info | expand

Commit Message

Emily Shaffer Dec. 13, 2019, 12:43 a.m. UTC
Teach bugreport to gather the values of config options which are present
in 'bugreport-config-safelist.h'.

Many config options are sensitive, and many Git add-ons use config
options which git-core does not know about; it is better only to gather
config options which we know to be safe, rather than excluding options
which we know to be unsafe.

Taking the build-time generated array and putting it into a set saves us
time - since git_config_bugreport() is called for every option the user
has configured, performing option lookup in constant time is a useful
optimization.

Signed-off-by: Emily Shaffer <emilyshaffer@google.com>
---
 bugreport.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 1 file changed, 71 insertions(+), 1 deletion(-)

Comments

Junio C Hamano Dec. 13, 2019, 9:45 p.m. UTC | #1
Emily Shaffer <emilyshaffer@google.com> writes:

> Teach bugreport to gather the values of config options which are present
> in 'bugreport-config-safelist.h'.
>
> Many config options are sensitive, and many Git add-ons use config
> options which git-core does not know about; it is better only to gather
> config options which we know to be safe, rather than excluding options
> which we know to be unsafe.
>
> Taking the build-time generated array and putting it into a set saves us
> time - since git_config_bugreport() is called for every option the user
> has configured, performing option lookup in constant time is a useful
> optimization.

Interesting.  I actually was expecting the look-up to go the other
way around.  We know the safe keys, so iterate over them and grab
their values, if defined.  No need for hashes or anything, but just
a simple linear list of safe stuff.

But that is too simple-minded.  If we wanted to safelist foo.*.bar,
where '*' can be anything, walking on the list of safe variables
would not work.  We must have a hash table that knows "foo.*.bar" is
allowed, and while walking all the configuration keys, when we see
foo.a.bar, we consult "foo.*.bar" as well as "foo.a.bar" to see if
it is whitelisted, or something like that.

But then I am not sure if this implementation does something like
this for three-level names.  If not, I do not see much point in use
of the hash there either.

Puzzled.

>
> Signed-off-by: Emily Shaffer <emilyshaffer@google.com>
> ---
>  bugreport.c | 72 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 71 insertions(+), 1 deletion(-)
>
> diff --git a/bugreport.c b/bugreport.c
> index 759cc0b0f8..1fca28f0b9 100644
> --- a/bugreport.c
> +++ b/bugreport.c
> @@ -6,6 +6,9 @@
>  #include "help.h"
>  #include <gnu/libc-version.h>
>  #include "run-command.h"
> +#include "config.h"
> +#include "bugreport-config-safelist.h"
> +#include "khash.h"
>  
>  static void get_http_version_info(struct strbuf *http_info)
>  {
> @@ -18,6 +21,41 @@ static void get_http_version_info(struct strbuf *http_info)
>  	    strbuf_addstr(http_info, "'git-http-fetch -V' not supported\n");
>  }
>  
> +KHASH_INIT(cfg_set, const char*, int, 0, kh_str_hash_func, kh_str_hash_equal);
> +
> +struct cfgset {
> +	kh_cfg_set_t set;
> +};
> +
> +struct cfgset safelist;
> +
> +static void cfgset_init(struct cfgset *set, size_t initial_size)
> +{
> +	memset(&set->set, 0, sizeof(set->set));
> +	if (initial_size)
> +		kh_resize_cfg_set(&set->set, initial_size);
> +}
> +
> +static int cfgset_insert(struct cfgset *set, const char *cfg_key)
> +{
> +	int added;
> +	kh_put_cfg_set(&set->set, cfg_key, &added);
> +	printf("ESS: added %s\n", cfg_key);
> +	return !added;
> +}
> +
> +static int cfgset_contains(struct cfgset *set, const char *cfg_key)
> +{
> +	khiter_t pos = kh_get_cfg_set(&set->set, cfg_key);
> +	return pos != kh_end(&set->set);
> +}
> +
> +static void cfgset_clear(struct cfgset *set)
> +{
> +	kh_release_cfg_set(&set->set);
> +	cfgset_init(set, 0);
> +}
> +
>  static void get_system_info(struct strbuf *sys_info)
>  {
>  	struct strbuf version_info = STRBUF_INIT;
> @@ -53,6 +91,36 @@ static void get_system_info(struct strbuf *sys_info)
>  	strbuf_complete_line(sys_info);
>  }
>  
> +static void gather_safelist()
> +{
> +	int index;
> +	int safelist_len = sizeof(bugreport_config_safelist) / sizeof(const char *);
> +	cfgset_init(&safelist, safelist_len);
> +	for (index = 0; index < safelist_len; index++)
> +		cfgset_insert(&safelist, bugreport_config_safelist[index]);
> +
> +}
> +
> +static int git_config_bugreport(const char *var, const char *value, void *cb)
> +{
> +	struct strbuf *config_info = (struct strbuf *)cb;
> +
> +	if (cfgset_contains(&safelist, var))
> +		strbuf_addf(config_info,
> +			    "%s (%s) : %s\n",
> +			    var, config_scope_to_string(current_config_scope()),
> +			    value);
> +
> +	return 0;
> +}
> +
> +static void get_safelisted_config(struct strbuf *config_info)
> +{
> +	gather_safelist();
> +	git_config(git_config_bugreport, config_info);
> +	cfgset_clear(&safelist);
> +}
> +
>  static const char * const bugreport_usage[] = {
>  	N_("git bugreport [-o|--output <file>]"),
>  	NULL
> @@ -114,10 +182,12 @@ int cmd_main(int argc, const char **argv)
>  
>  	get_bug_template(&buffer);
>  
> -	// add other contents
>  	get_header(&buffer, "System Info");
>  	get_system_info(&buffer);
>  
> +	get_header(&buffer, "Safelisted Config Info");
> +	get_safelisted_config(&buffer);
> +
>  	report = fopen_for_writing(report_path.buf);
>  	strbuf_write(&buffer, report);
>  	fclose(report);
Emily Shaffer Dec. 16, 2019, 11:40 p.m. UTC | #2
On Fri, Dec 13, 2019 at 01:45:47PM -0800, Junio C Hamano wrote:
> Emily Shaffer <emilyshaffer@google.com> writes:
> 
> > Teach bugreport to gather the values of config options which are present
> > in 'bugreport-config-safelist.h'.
> >
> > Many config options are sensitive, and many Git add-ons use config
> > options which git-core does not know about; it is better only to gather
> > config options which we know to be safe, rather than excluding options
> > which we know to be unsafe.
> >
> > Taking the build-time generated array and putting it into a set saves us
> > time - since git_config_bugreport() is called for every option the user
> > has configured, performing option lookup in constant time is a useful
> > optimization.
> 
> Interesting.  I actually was expecting the look-up to go the other
> way around.  We know the safe keys, so iterate over them and grab
> their values, if defined.  No need for hashes or anything, but just
> a simple linear list of safe stuff.

Hm. Without looking at the code, I said to myself,
"repo_config_get_value() will open all the available config files to
find the resolved value, so I don't want to do n*4 file open/closes, I
only want to do 4 total."

Now I look at the code and see that the configs are already being read
into a hashset before now. So you're right that it doesn't make sense
for me to do it this way....

> 
> But that is too simple-minded.  If we wanted to safelist foo.*.bar,
> where '*' can be anything, walking on the list of safe variables
> would not work.  We must have a hash table that knows "foo.*.bar" is
> allowed, and while walking all the configuration keys, when we see
> foo.a.bar, we consult "foo.*.bar" as well as "foo.a.bar" to see if
> it is whitelisted, or something like that.

...unless we want to use wildcards like you suggest.

But I'm not sure it's a good idea. I envision someone writing another
Git add-on, which offers someone to specify "user.password" and
automagically do some Bitbucket interaction, or something. (An extreme
and misguided example, but I hope the point remains.) Then if I say,
"all the user.* configs I can see in 'git help config' are safe, so I
will just safelist user.*," I'm not accounting for this other tool's
known configs which are a superset of what Git knows and expects.

The point of this safelist-generation exercise is to avoid accidentally
printing some sensitive user config, and the point of using a safelist
instead of a blocklist is to avoid printing a third-party config we
didn't know about (or messing up our pattern matching). So I suggest we
avoid pattern matching entirely.

> 
> But then I am not sure if this implementation does something like
> this for three-level names.  If not, I do not see much point in use
> of the hash there either.
> 
> Puzzled.

I'd prefer to solve it by iterating over the compile-time-generated
safelist using get_config_value_multi(), which I clock at O(n) for n =
safelist length: performing n calls to an O(1) hashset lookup. That
saves us the O(n*log(n)) hashset population, and the implementation of
Yet Another Set in the source.

Thanks.

 - Emily
Junio C Hamano Dec. 17, 2019, 5:43 p.m. UTC | #3
Emily Shaffer <emilyshaffer@google.com> writes:

>> But that is too simple-minded.  If we wanted to safelist foo.*.bar,
>> where '*' can be anything, walking on the list of safe variables
>> would not work.  We must have a hash table that knows "foo.*.bar" is
>> allowed, and while walking all the configuration keys, when we see
>> foo.a.bar, we consult "foo.*.bar" as well as "foo.a.bar" to see if
>> it is whitelisted, or something like that.
>
> ...unless we want to use wildcards like you suggest.
>
> But I'm not sure it's a good idea. I envision someone writing another
> Git add-on, which offers someone to specify "user.password" ...

Wildcarding the leaf level of two (or for that matter three) level
names like "user.*" in your example is of course a nonsense way to
use the safelist.  But think about three-level names where the
second level is designed to be used for user-supplied token to give
things of similar nature a name the user can use to distinguish
things.  If remote.origin.url is worth reporting, wouldn't
remote.upstream.url also be?  Shouldn't remote.*.url be the way to
say "the URL field for each and every remote is a safe thing to
report" in the safelist?
Johannes Schindelin Dec. 29, 2019, 8:17 p.m. UTC | #4
Hi Emily,

On Thu, 12 Dec 2019, Emily Shaffer wrote:

> diff --git a/bugreport.c b/bugreport.c
> index 759cc0b0f8..1fca28f0b9 100644
> --- a/bugreport.c
> +++ b/bugreport.c
> @@ -6,6 +6,9 @@
>  #include "help.h"
>  #include <gnu/libc-version.h>
>  #include "run-command.h"
> +#include "config.h"
> +#include "bugreport-config-safelist.h"
> +#include "khash.h"

Seems that this patch makes things fail in the CI build
(https://dev.azure.com/gitgitgadget/git/_build/results?buildId=23830&view=results):

-- snipsnap --
bugreport.c:10:10: fatal error: 'bugreport-config-safelist.h' file not found
#include "bugreport-config-safelist.h"
         ^~~~~~~~~~~~~~~~~~~~~~~~~~~~~
1 error generated.
Makefile:2395: recipe for target 'bugreport.o' failed
make: *** [bugreport.o] Error 1
make: *** Waiting for unfinished jobs....
Emily Shaffer Jan. 24, 2020, 3:29 a.m. UTC | #5
On Tue, Dec 17, 2019 at 09:43:23AM -0800, Junio C Hamano wrote:
> Emily Shaffer <emilyshaffer@google.com> writes:
> 
> >> But that is too simple-minded.  If we wanted to safelist foo.*.bar,
> >> where '*' can be anything, walking on the list of safe variables
> >> would not work.  We must have a hash table that knows "foo.*.bar" is
> >> allowed, and while walking all the configuration keys, when we see
> >> foo.a.bar, we consult "foo.*.bar" as well as "foo.a.bar" to see if
> >> it is whitelisted, or something like that.
> >
> > ...unless we want to use wildcards like you suggest.
> >
> > But I'm not sure it's a good idea. I envision someone writing another
> > Git add-on, which offers someone to specify "user.password" ...
> 
> Wildcarding the leaf level of two (or for that matter three) level
> names like "user.*" in your example is of course a nonsense way to
> use the safelist.  But think about three-level names where the
> second level is designed to be used for user-supplied token to give
> things of similar nature a name the user can use to distinguish
> things.  If remote.origin.url is worth reporting, wouldn't
> remote.upstream.url also be?  Shouldn't remote.*.url be the way to
> say "the URL field for each and every remote is a safe thing to
> report" in the safelist?

Bah, somehow this mail slipped through the cracks for me.

When you put it that way, I see how it can be useful. Although I admit
to fatigue on this patchset and now I'm hesitant about feature creep. :)

I'm going to send the reroll for v5 as I have it, not having seen this
mail, and think more about how I'd approach wildcarding in the middle
like this. I want to take a look at the other configs which use a middle
level as a user token and try to reason about whether we ought to be
collecting them. I'll hope to report back sooner (or later).

 - Emily
diff mbox series

Patch

diff --git a/bugreport.c b/bugreport.c
index 759cc0b0f8..1fca28f0b9 100644
--- a/bugreport.c
+++ b/bugreport.c
@@ -6,6 +6,9 @@ 
 #include "help.h"
 #include <gnu/libc-version.h>
 #include "run-command.h"
+#include "config.h"
+#include "bugreport-config-safelist.h"
+#include "khash.h"
 
 static void get_http_version_info(struct strbuf *http_info)
 {
@@ -18,6 +21,41 @@  static void get_http_version_info(struct strbuf *http_info)
 	    strbuf_addstr(http_info, "'git-http-fetch -V' not supported\n");
 }
 
+KHASH_INIT(cfg_set, const char*, int, 0, kh_str_hash_func, kh_str_hash_equal);
+
+struct cfgset {
+	kh_cfg_set_t set;
+};
+
+struct cfgset safelist;
+
+static void cfgset_init(struct cfgset *set, size_t initial_size)
+{
+	memset(&set->set, 0, sizeof(set->set));
+	if (initial_size)
+		kh_resize_cfg_set(&set->set, initial_size);
+}
+
+static int cfgset_insert(struct cfgset *set, const char *cfg_key)
+{
+	int added;
+	kh_put_cfg_set(&set->set, cfg_key, &added);
+	printf("ESS: added %s\n", cfg_key);
+	return !added;
+}
+
+static int cfgset_contains(struct cfgset *set, const char *cfg_key)
+{
+	khiter_t pos = kh_get_cfg_set(&set->set, cfg_key);
+	return pos != kh_end(&set->set);
+}
+
+static void cfgset_clear(struct cfgset *set)
+{
+	kh_release_cfg_set(&set->set);
+	cfgset_init(set, 0);
+}
+
 static void get_system_info(struct strbuf *sys_info)
 {
 	struct strbuf version_info = STRBUF_INIT;
@@ -53,6 +91,36 @@  static void get_system_info(struct strbuf *sys_info)
 	strbuf_complete_line(sys_info);
 }
 
+static void gather_safelist()
+{
+	int index;
+	int safelist_len = sizeof(bugreport_config_safelist) / sizeof(const char *);
+	cfgset_init(&safelist, safelist_len);
+	for (index = 0; index < safelist_len; index++)
+		cfgset_insert(&safelist, bugreport_config_safelist[index]);
+
+}
+
+static int git_config_bugreport(const char *var, const char *value, void *cb)
+{
+	struct strbuf *config_info = (struct strbuf *)cb;
+
+	if (cfgset_contains(&safelist, var))
+		strbuf_addf(config_info,
+			    "%s (%s) : %s\n",
+			    var, config_scope_to_string(current_config_scope()),
+			    value);
+
+	return 0;
+}
+
+static void get_safelisted_config(struct strbuf *config_info)
+{
+	gather_safelist();
+	git_config(git_config_bugreport, config_info);
+	cfgset_clear(&safelist);
+}
+
 static const char * const bugreport_usage[] = {
 	N_("git bugreport [-o|--output <file>]"),
 	NULL
@@ -114,10 +182,12 @@  int cmd_main(int argc, const char **argv)
 
 	get_bug_template(&buffer);
 
-	// add other contents
 	get_header(&buffer, "System Info");
 	get_system_info(&buffer);
 
+	get_header(&buffer, "Safelisted Config Info");
+	get_safelisted_config(&buffer);
+
 	report = fopen_for_writing(report_path.buf);
 	strbuf_write(&buffer, report);
 	fclose(report);