Web lists-archives.com

[PATCH 10/10] ls-partial: created command to list missing blobs




From: Jeff Hostetler <git@xxxxxxxxxxxxxxxxx>

Added a command to list the missing blobs for a commit.
This can be used after a partial clone or fetch to list
the omitted blobs that the client would need in order to
check out the given commit/branch, optionally respecting
or ignoring the current sparse-checkout definition.

This command prints a simple list of blob SHAs.  It is
expected that this would be piped into another command
with knowledge of the transport and/or blob store.

Signed-off-by: Jeff Hostetler <jeffhost@xxxxxxxxxxxxx>
---
 Makefile             |   2 +
 builtin.h            |   1 +
 builtin/ls-partial.c | 110 ++++++++++++++++++++
 git.c                |   1 +
 partial-utils.c      | 279 +++++++++++++++++++++++++++++++++++++++++++++++++++
 partial-utils.h      |  93 +++++++++++++++++
 6 files changed, 486 insertions(+)
 create mode 100644 builtin/ls-partial.c
 create mode 100644 partial-utils.c
 create mode 100644 partial-utils.h

diff --git a/Makefile b/Makefile
index 9ec6065..96e9e1e 100644
--- a/Makefile
+++ b/Makefile
@@ -791,6 +791,7 @@ LIB_OBJS += pack-write.o
 LIB_OBJS += pager.o
 LIB_OBJS += parse-options.o
 LIB_OBJS += parse-options-cb.o
+LIB_OBJS += partial-utils.o
 LIB_OBJS += patch-delta.o
 LIB_OBJS += patch-ids.o
 LIB_OBJS += path.o
@@ -908,6 +909,7 @@ BUILTIN_OBJS += builtin/init-db.o
 BUILTIN_OBJS += builtin/interpret-trailers.o
 BUILTIN_OBJS += builtin/log.o
 BUILTIN_OBJS += builtin/ls-files.o
+BUILTIN_OBJS += builtin/ls-partial.o
 BUILTIN_OBJS += builtin/ls-remote.o
 BUILTIN_OBJS += builtin/ls-tree.o
 BUILTIN_OBJS += builtin/mailinfo.o
diff --git a/builtin.h b/builtin.h
index 9e4a898..df00c4b 100644
--- a/builtin.h
+++ b/builtin.h
@@ -79,6 +79,7 @@ extern int cmd_interpret_trailers(int argc, const char **argv, const char *prefi
 extern int cmd_log(int argc, const char **argv, const char *prefix);
 extern int cmd_log_reflog(int argc, const char **argv, const char *prefix);
 extern int cmd_ls_files(int argc, const char **argv, const char *prefix);
+extern int cmd_ls_partial(int argc, const char **argv, const char *prefix);
 extern int cmd_ls_tree(int argc, const char **argv, const char *prefix);
 extern int cmd_ls_remote(int argc, const char **argv, const char *prefix);
 extern int cmd_mailinfo(int argc, const char **argv, const char *prefix);
diff --git a/builtin/ls-partial.c b/builtin/ls-partial.c
new file mode 100644
index 0000000..8ebf045
--- /dev/null
+++ b/builtin/ls-partial.c
@@ -0,0 +1,110 @@
+#include "cache.h"
+#include "blob.h"
+#include "tree.h"
+#include "commit.h"
+#include "quote.h"
+#include "builtin.h"
+#include "parse-options.h"
+#include "pathspec.h"
+#include "dir.h"
+#include "partial-utils.h"
+
+/* Trace key for this builtin's trace_printf_key() output. */
+static struct trace_key trace_partial = TRACE_KEY_INIT(PARTIAL);
+
+static int verbose;       /* set by the verbose option */
+static int ignore_sparse; /* set by --ignore-sparse */
+
+static const char * const ls_partial_usage[] = {
+	N_("git ls-partial [<options>] <tree-ish> [<path>...]"),
+	NULL
+};
+
+/*
+ * Resolve the <tree-ish> argument to an object name and return the
+ * tree it leads to.  Dies when the name cannot be resolved or does not
+ * lead to a tree.  In verbose mode the name and argument are printed.
+ */
+static struct tree *lookup_tree_from_treeish(const char *arg)
+{
+	unsigned char hash[20];
+	struct tree *root;
+
+	if (get_sha1(arg, hash) != 0)
+		die("not a valid object name '%s'", arg);
+
+	trace_printf_key(&trace_partial,
+			 "ls-partial: treeish '%s' '%s'\n",
+			 arg, sha1_to_hex(hash));
+	if (verbose) {
+		printf("commit\t%s\n", sha1_to_hex(hash));
+		printf("branch\t%s\n", arg);
+	}
+
+	root = parse_tree_indirect(hash);
+	if (root == NULL)
+		die("not a tree object '%s'", arg);
+
+	return root;
+}
+
+/* Print the object name of each row in vec, one per line. */
+static void print_results(const struct pu_vec *vec)
+{
+	unsigned int i;
+	for (i = 0; i < vec->data_nr; i++)
+		printf("%s\n", oid_to_hex(&vec->data[i]->oid));
+}
+
+/* Print "<oid> TAB <full path>" for each row in vec; TODO consider -z. */
+static void print_results_verbose(const struct pu_vec *vec)
+{
+	unsigned int i;
+	for (i = 0; i < vec->data_nr; i++) {
+		const struct pu_row *row = vec->data[i];
+		printf("%s\t%s\n", oid_to_hex(&row->oid), row->fullpath.buf);
+	}
+}
+
+/*
+ * Entry point for "git ls-partial": enumerate the tree named by argv[0]
+ * (limited by any remaining pathspec arguments), optionally narrow it by
+ * the sparse-checkout patterns, and print the entries missing locally.
+ */
+int cmd_ls_partial(int argc, const char **argv, const char *prefix)
+{
+	struct exclude_list el;
+	struct pu_vec *candidates;
+	struct pu_vec *missing;
+	const struct option ls_partial_options[] = {
+		OPT__VERBOSE(&verbose, N_("show verbose blob details")),
+		OPT_BOOL(0, "ignore-sparse", &ignore_sparse,
+				 N_("ignore sparse-checkout settings (scan whole tree)")),
+		OPT_END()
+	};
+
+	git_config(git_default_config, NULL);
+	argc = parse_options(argc, argv, prefix,
+			     ls_partial_options, ls_partial_usage, 0);
+	if (argc < 1)
+		usage_with_options(ls_partial_usage, ls_partial_options);
+
+	/* Start from every entry of the tree that matches the pathspec. */
+	candidates = pu_vec_ls_tree(
+		lookup_tree_from_treeish(argv[0]), prefix, argv + 1);
+
+	/*
+	 * Narrow to the sparse-checkout unless told not to; a negative
+	 * return from the loader means "use the whole tree".
+	 */
+	if (!ignore_sparse &&
+	    pu_load_sparse_definitions("info/sparse-checkout", &el) >= 0)
+		candidates = pu_vec_filter_sparse(candidates, &el);
+
+	missing = pu_vec_filter_missing(candidates);
+	if (verbose)
+		print_results_verbose(missing);
+	else
+		print_results(missing);
+	return 0;
+}
diff --git a/git.c b/git.c
index 33f52ac..ef1e019 100644
--- a/git.c
+++ b/git.c
@@ -444,6 +444,7 @@ static struct cmd_struct commands[] = {
 	{ "interpret-trailers", cmd_interpret_trailers, RUN_SETUP_GENTLY },
 	{ "log", cmd_log, RUN_SETUP },
 	{ "ls-files", cmd_ls_files, RUN_SETUP | SUPPORT_SUPER_PREFIX },
+	{ "ls-partial", cmd_ls_partial, RUN_SETUP },
 	{ "ls-remote", cmd_ls_remote, RUN_SETUP_GENTLY },
 	{ "ls-tree", cmd_ls_tree, RUN_SETUP },
 	{ "mailinfo", cmd_mailinfo, RUN_SETUP_GENTLY },
diff --git a/partial-utils.c b/partial-utils.c
new file mode 100644
index 0000000..b75e91e
--- /dev/null
+++ b/partial-utils.c
@@ -0,0 +1,279 @@
+#include "cache.h"
+#include "blob.h"
+#include "tree.h"
+#include "commit.h"
+#include "quote.h"
+#include "builtin.h"
+#include "parse-options.h"
+#include "pathspec.h"
+#include "dir.h"
+#include "partial-utils.h"
+
+static struct trace_key trace_partial_utils = TRACE_KEY_INIT(PARTIAL_UTILS);
+
+/*
+ * Write "<label>: <octal mode> <oid> <full path>" for the given row to
+ * the partial-utils trace key.
+ */
+void pu_row_trace(const struct pu_row *row, const char *label)
+{
+	const struct strbuf *path = &row->fullpath;
+
+	trace_printf_key(&trace_partial_utils,
+			 "%s: %06o %s %.*s\n",
+			 label, row->mode, oid_to_hex(&row->oid),
+			 (int)path->len, path->buf);
+}
+
+/*
+ * Allocate a row for the tree entry <entryname> found under <base>:
+ * it records the object name, the mode and the concatenated full path.
+ */
+struct pu_row *pu_row_alloc(const unsigned char *sha1, const struct strbuf *base,
+			    const char *entryname, unsigned mode)
+{
+	struct pu_row *row = xcalloc(1, sizeof(*row));
+
+	row->mode = mode;
+	row->entryname_offset = base->len; /* entry name starts after base */
+	hashcpy(row->oid.hash, sha1);
+	strbuf_init(&row->fullpath, base->len + strlen(entryname) + 1);
+	if (base->len)
+		strbuf_addbuf(&row->fullpath, base);
+	strbuf_addstr(&row->fullpath, entryname);
+
+	pu_row_trace(row, "alloc");
+	return row;
+}
+
+/* Create an empty vector with room reserved for nr_pre_alloc row pointers. */
+struct pu_vec *pu_vec_alloc(unsigned int nr_pre_alloc)
+{
+	struct pu_vec *vec;
+
+	vec = xcalloc(1, sizeof(*vec));
+	vec->data_alloc = nr_pre_alloc;
+	vec->data = xcalloc(vec->data_alloc, sizeof(*vec->data));
+	return vec;
+}
+
+/* Append row to the end of vec, growing the pointer array when needed. */
+void pu_vec_append(struct pu_vec *vec, struct pu_row *row)
+{
+	ALLOC_GROW(vec->data, vec->data_nr + 1, vec->data_alloc);
+	vec->data[vec->data_nr] = row;
+	vec->data_nr++;
+}
+
+/*
+ * read_tree_recursive() callback: record every entry except submodules
+ * in the vector passed as context, and ask to descend into directories.
+ */
+static int ls_tree_cb(const unsigned char *sha1, struct strbuf *base,
+		      const char *pathname, unsigned mode,
+		      int stage, void *context)
+{
+	struct pu_vec *vec = context;
+	struct pu_row *row;
+
+	/* omit submodules */
+	if (S_ISGITLINK(mode))
+		return 0;
+
+	row = pu_row_alloc(sha1, base, pathname, mode);
+	pu_vec_append(vec, row);
+
+	/* Directories are recorded too; tell the walker to recurse. */
+	return S_ISDIR(mode) ? READ_TREE_RECURSIVE : 0;
+}
+
+/*
+ * Walk <tree> recursively, limited to the pathspec built from <prefix>
+ * and <argv>, and return a new vector of rows.  Dies on read failure.
+ */
+struct pu_vec *pu_vec_ls_tree(struct tree *tree, const char *prefix,
+			      const char **argv)
+{
+	struct pu_vec *vec = pu_vec_alloc(PU_VEC_DEFAULT_SIZE);
+	struct pathspec pathspec;
+	int k;
+
+	parse_pathspec(
+		&pathspec, PATHSPEC_GLOB | PATHSPEC_ICASE | PATHSPEC_EXCLUDE,
+		PATHSPEC_PREFER_CWD, prefix, argv);
+	for (k = 0; k < pathspec.nr; k++)
+		pathspec.items[k].nowildcard_len = pathspec.items[k].len;
+	pathspec.has_wildcard = 0; /* items carry no wildcard part */
+
+	if (read_tree_recursive(tree, "", 0, 0, &pathspec, ls_tree_cb, vec) != 0)
+		die("Could not read tree");
+	clear_pathspec(&pathspec); /* was leaked: release the parsed pathspec */
+	return vec;
+}
+
+/* Load the pattern file <path> (resolved with git_pathdup()) into *pel. */
+/* Returns whatever add_excludes_from_file_to_list() returns. */
+int pu_load_sparse_definitions(const char *path, struct exclude_list *pel)
+{
+	int result;
+	char *sparse = git_pathdup("%s", path); /* honor the caller's path */
+	memset(pel, 0, sizeof(*pel));
+	result = add_excludes_from_file_to_list(sparse, "", 0, pel, 0);
+	free(sparse);
+	return result;
+}
+
+/* Map a tree-entry mode to the matching DT_* directory-entry type. */
+static int mode_to_dtype(unsigned mode)
+{
+	int is_dir = S_ISDIR(mode) || S_ISGITLINK(mode);
+
+	return S_ISREG(mode) ? DT_REG :
+	       is_dir ? DT_DIR :
+	       S_ISLNK(mode) ? DT_LNK :
+	       DT_UNKNOWN;
+}
+
+static int apply_excludes_1( /* forward declaration: called by apply_excludes_dir() below */
+	struct pu_row **subset,
+	unsigned int nr,
+	struct strbuf *prefix,
+	struct exclude_list *pel,
+	int defval,
+	struct pu_vec *vec_out);
+
+/* Apply directory rules to the directory named by prefix, then to the rows under it; returns the number of rows handled. Based on clear_ce_flags_dir(). */
+static int apply_excludes_dir(
+	struct pu_row **subset,
+	unsigned int nr,
+	struct strbuf *prefix,
+	char *basename,
+	struct exclude_list *pel,
+	int defval,
+	struct pu_vec *vec_out)
+{
+	struct pu_row **subset_end;
+	int dtype = DT_DIR;
+	int ret = is_excluded_from_list(
+		prefix->buf, prefix->len, basename, &dtype, pel);
+	int rc;
+
+	strbuf_addch(prefix, '/'); /* prefix now ends with '/' for the comparisons below */
+	/* A negative (undecided) result inherits the caller's default. */
+	if (ret < 0)
+		ret = defval;
+	/* Find the end of the leading run of rows whose path starts with prefix. */
+	for (subset_end = subset; subset_end != subset + nr; subset_end++) {
+		struct pu_row *row = *subset_end;
+		if (strncmp(row->fullpath.buf, prefix->buf, prefix->len))
+			break;
+	}
+	/* Process that run, using this directory's result as its default. */
+	rc = apply_excludes_1(
+		subset, subset_end - subset,
+		prefix, pel, ret,
+		vec_out);
+	strbuf_setlen(prefix, prefix->len - 1); /* drop the '/' appended above */
+	return rc;
+}
+
+/* Apply sparse rules to subset[0..nr), appending selected rows to vec_out; returns the number of rows advanced past. Based on clear_ce_flags_1(). */
+static int apply_excludes_1(
+	struct pu_row **subset,
+	unsigned int nr,
+	struct strbuf *prefix,
+	struct exclude_list *pel,
+	int defval,
+	struct pu_vec *vec_out)
+{
+	struct pu_row **subset_end = subset + nr;
+
+	while (subset != subset_end) {
+		struct pu_row *row = *subset;
+		const char *name, *slash;
+		int len, dtype, val;
+		/* Stop at the first row whose path does not start with prefix. */
+		if (prefix->len && strncmp(row->fullpath.buf, prefix->buf, prefix->len))
+			break;
+
+		name = row->fullpath.buf + prefix->len;
+		slash = strchr(name, '/');
+		/* A '/' in the remainder means the row lies in a subdirectory. */
+		if (slash) {
+			int processed;
+
+			len = slash - name;
+			strbuf_add(prefix, name, len); /* extend prefix by the directory name */
+
+			processed = apply_excludes_dir(
+				subset, subset_end - subset,
+				prefix, prefix->buf + prefix->len - len,
+				pel, defval,
+				vec_out);
+			/* Non-zero: the directory pass handled that many rows. */
+			if (processed) {
+				subset += processed;
+				strbuf_setlen(prefix, prefix->len - len);
+				continue;
+			}
+			/* Otherwise recurse into the directory directly. */
+			strbuf_addch(prefix, '/');
+			subset += apply_excludes_1(
+				subset, subset_end - subset,
+				prefix, pel, defval,
+				vec_out);
+			strbuf_setlen(prefix, prefix->len - len - 1); /* undo name and '/' */
+			continue;
+		}
+		/* Entry directly under prefix: test it against the patterns. */
+		dtype = mode_to_dtype(row->mode);
+		val = is_excluded_from_list(
+			row->fullpath.buf, row->fullpath.len, name, &dtype, pel);
+		if (val < 0)
+			val = defval; /* undecided: use the inherited default */
+		if (val > 0) { /* positive result: keep the row */
+			pu_row_trace(row, "sparse");
+			pu_vec_append(vec_out, row);
+		}
+		subset++;
+	}
+	/* subset has advanced from its initial value; report by how many rows. */
+	return nr - (subset_end - subset);
+}
+
+/*
+ * Return a new vector with the rows of vec_in selected by the patterns
+ * in pel.  Rows are borrowed from vec_in, not copied.
+ */
+struct pu_vec *pu_vec_filter_sparse(const struct pu_vec *vec_in,
+				    struct exclude_list *pel)
+{
+	struct pu_vec *vec_out = pu_vec_alloc(vec_in->data_nr);
+	struct strbuf prefix = STRBUF_INIT;
+	int defval = 0; /* top-level default: undecided rows are left out */
+
+	apply_excludes_1(vec_in->data, vec_in->data_nr,
+			 &prefix, pel, defval, vec_out);
+	/* apply_excludes_1() appends to prefix while descending; free it. */
+	strbuf_release(&prefix);
+	return vec_out;
+}
+
+/*
+ * Return a new vector holding the rows of vec_in whose object is not
+ * found by has_sha1_file().  Rows are borrowed from vec_in.
+ */
+struct pu_vec *pu_vec_filter_missing(const struct pu_vec *vec_in)
+{
+	struct pu_vec *vec_out = pu_vec_alloc(vec_in->data_nr);
+	unsigned int i;
+
+	for (i = 0; i < vec_in->data_nr; i++) {
+		struct pu_row *row = vec_in->data[i];
+		if (has_sha1_file(row->oid.hash))
+			continue; /* object is available; not missing */
+		pu_row_trace(row, "missing");
+		pu_vec_append(vec_out, row);
+	}
+	return vec_out;
+}
diff --git a/partial-utils.h b/partial-utils.h
new file mode 100644
index 0000000..3bdf2e4
--- /dev/null
+++ b/partial-utils.h
@@ -0,0 +1,93 @@
+#ifndef PARTIAL_UTILS_H
+#define PARTIAL_UTILS_H
+
+/*
+ * A 'partial-utils row' represents a single item in the tree.
+ * This is conceptually equivalent to a cache_entry, but does
+ * not require an index_state and lets us operate on any commit
+ * and not be tied to the current worktree.
+ */
+struct pu_row
+{
+	struct strbuf fullpath;    /* base path followed by the entry name */
+	struct object_id oid;      /* object name of the entry */
+	unsigned mode;             /* mode reported for the entry */
+	unsigned entryname_offset; /* length of the base path within fullpath */
+};
+
+/*
+ * A 'partial-utils vec' represents a vector of 'pu row'
+ * values using the normal vector machinery.
+ */
+struct pu_vec
+{
+	struct pu_row **data;    /* array of row pointers */
+	unsigned int data_nr;    /* number of slots in use */
+	unsigned int data_alloc; /* number of slots allocated */
+};
+
+#define PU_VEC_DEFAULT_SIZE (1024*1024)
+
+
+void pu_row_trace(
+	const struct pu_row *row,
+	const char *label);
+
+struct pu_row *pu_row_alloc(
+	const unsigned char *sha1,
+	const struct strbuf *base,
+	const char *entryname,
+	unsigned mode);
+
+struct pu_vec *pu_vec_alloc(
+	unsigned int nr_pre_alloc);
+
+/*
+ * Append the given row onto the vector WITHOUT
+ * assuming ownership of the pointer.
+ */
+void pu_vec_append(
+	struct pu_vec *vec,
+	struct pu_row *row);
+
+/*
+ * Enumerate the contents of the tree (recursively) into
+ * a vector of rows.  This is essentially "ls-tree -r -t"
+ * into a vector.
+ */ 
+struct pu_vec *pu_vec_ls_tree(
+	struct tree *tree,
+	const char *prefix,
+	const char **argv);
+
+/*
+ * Load a sparse-checkout file into (*pel).
+ * Returns -1 if none or error.
+ */
+int pu_load_sparse_definitions(
+	const char *path,
+	struct exclude_list *pel);
+
+/*
+ * Filter the given vector using the sparse-checkout
+ * definitions and return new vector of just the paths
+ * that WOULD BE populated.
+ *
+ * The returned vector BORROWS rows from the input vector.
+ *
+ * This is loosely based upon clear_ce_flags() in unpack-trees.c
+ */
+struct pu_vec *pu_vec_filter_sparse(
+	const struct pu_vec *vec_in,
+	struct exclude_list *pel);
+
+/*
+ * Filter the given vector and return the list of blobs
+ * missing from the local ODB.
+ *
+ * The returned vector BORROWS rows from the input vector.
+ */
+struct pu_vec *pu_vec_filter_missing(
+	const struct pu_vec *vec_in);
+
+#endif /* PARTIAL_UTILS_H */
-- 
2.7.4