Web lists-archives.com

[PATCH/RFC] diff/read-cache: don't assume empty files will filter to empty




Hey guys, please see the below patch. (please read commit message
first and then the following:)
In particular I'd like to hear comments about the changes I made to
how smudging works.
I am not confident on whether it is a good way to permanently fix the problem.
I'm wondering if there's a better way, and I am also wondering if this
will have an effect
when a user upgrades git on their system, and they already had a
repository checked out.

I know there were other approaches to smudging cache entries, for
example JGIT had
this slightly funny method:
https://github.com/eclipse/jgit/commit/c98d97731b87417b196341fa63a50fffea4e123c#diff-2c773508260c1ab8eb01b5444c3395c8L320
but since then they have reverted to the c-git way of doing things.

Any comments?

Andrei

>From 296a83046e77c1d49db8d324f298280265dfb6dc Mon Sep 17 00:00:00 2001
From: Purdea Andrei <andrei.purdea@xxxxxxxxxxxx>
Date: Tue, 11 Jul 2017 23:00:15 +0300
Subject: [PATCH] diff/read-cache: don't assume empty files will filter to
 empty

There are some projects that are based on git smudge/clean filters that won't
necessarily produce empty files when clean-ing empty files. An example of such
a project is git-fat, which stores binary files elsewhere, but uses git to
store only a reference to that binary file. The reference to the file consists
of a hash and the file size. So an empty will will produce a non-empty file
after passing it through the git-fat clean filter.

Current GIT, when it sees an empty file on git diff, it interprets it specially,
and it won't pass it through the clean filter. But in other conditions it will.
For example you can create a commit containing an empty git-fat managed file,
and in that case the file will be passed through the clean filter, and the git
repository will contain the expanded hash of an empty file. If you try to run
git diff on such a repository, it will always show this file as changed, because
git diff will compare the expanded hash that is already in the repository, to
the empty file that has not been passed through the clean filter. This will
result in files that will always show up as modified. The "changes" cannot be
committed, or checked-out to the working tree, and git will not allow us to
perform more complex operations such as rebase / merge. The only action is to
delete the file.

The patch fixes the issue in two places:

diff.c: send the file through git-clean filter even if it is empty.

read-cache.c: read-cache uses a magic number of zero-sized files to smudge
cache entries that have the same timestamp as other file modifications.
(Note: the word smudge here is unrelated to the "smudge filter" concept)
Smudged entries will have a zero file size, but the stored hash will not equal
the unique, known hash of an empty file. In the absence of clean filters that
can produce non-zero files from zero-sized files, this is a good way to
detectably corrupt a cache entry. However in the presence of git clean filters
that do this, the file will be considered smudged, and it will keep showing up
as touched by the user. That is because the stored hash is actually the hash
of the file after it has passed to the clean filter, and it's not empty anymore.
This change fixes the condition by using a different magic number of smudged
entries. It uses -1 which is the biggest possible file size stored.

Signed-off-by: Purdea Andrei <andrei@xxxxxxxxx>
---
 diff.c       | 18 ++++++++++--------
 read-cache.c |  7 +++----
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/diff.c b/diff.c
index 00b4c8669..76bb536ce 100644
--- a/diff.c
+++ b/diff.c
@@ -2858,8 +2858,6 @@ int diff_populate_filespec(struct diff_filespec
*s, unsigned int flags)
  }
  }
  s->size = xsize_t(st.st_size);
- if (!s->size)
- goto empty;
  if (S_ISLNK(st.st_mode)) {
  struct strbuf sb = STRBUF_INIT;

@@ -2894,12 +2892,16 @@ int diff_populate_filespec(struct
diff_filespec *s, unsigned int flags)
  s->is_binary = 1;
  return 0;
  }
- fd = open(s->path, O_RDONLY);
- if (fd < 0)
- goto err_empty;
- s->data = xmmap(NULL, s->size, PROT_READ, MAP_PRIVATE, fd, 0);
- close(fd);
- s->should_munmap = 1;
+ if (!s->size) {
+ s->data = "";
+ } else {
+ fd = open(s->path, O_RDONLY);
+ if (fd < 0)
+ goto err_empty;
+ s->data = xmmap(NULL, s->size, PROT_READ, MAP_PRIVATE, fd, 0);
+ close(fd);
+ s->should_munmap = 1;
+ }

  /*
  * Convert from working tree format to canonical git format
diff --git a/read-cache.c b/read-cache.c
index 2121b6e7b..ca306993c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -262,9 +262,8 @@ static int ce_match_stat_basic(const struct
cache_entry *ce, struct stat *st)
  changed |= match_stat_data(&ce->ce_stat_data, st);

  /* Racily smudged entry? */
- if (!ce->ce_stat_data.sd_size) {
- if (!is_empty_blob_sha1(ce->oid.hash))
- changed |= DATA_CHANGED;
+ if (ce->ce_stat_data.sd_size == (unsigned int)-1) {
+ changed |= DATA_CHANGED;
  }

  return changed;
@@ -2028,7 +2027,7 @@ static void ce_smudge_racily_clean_entry(struct
cache_entry *ce)
  * file, and never calls us, so the cached size information
  * for "frotz" stays 6 which does not match the filesystem.
  */
- ce->ce_stat_data.sd_size = 0;
+ ce->ce_stat_data.sd_size = (unsigned int)-1;
  }
 }

-- 
2.11.0
From 296a83046e77c1d49db8d324f298280265dfb6dc Mon Sep 17 00:00:00 2001
From: Purdea Andrei <andrei.purdea@xxxxxxxxxxxx>
Date: Tue, 11 Jul 2017 23:00:15 +0300
Subject: [PATCH] diff/read-cache: don't assume empty files will filter to
 empty

There are some projects that are based on git smudge/clean filters that won't
necessarily produce empty files when clean-ing empty files. An example of such
a project is git-fat, which stores binary files elsewhere, but uses git to
store only a reference to that binary file. The reference to the file consists
of a hash and the file size. So an empty will will produce a non-empty file
after passing it through the git-fat clean filter.

Current GIT, when it sees an empty file on git diff, it interprets it specially,
and it won't pass it through the clean filter. But in other conditions it will.
For example you can create a commit containing an empty git-fat managed file,
and in that case the file will be passed through the clean filter, and the git
repository will contain the expanded hash of an empty file. If you try to run
git diff on such a repository, it will always show this file as changed, because
git diff will compare the expanded hash that is already in the repository, to
the empty file that has not been passed through the clean filter. This will
result in files that will always show up as modified. The "changes" cannot be
committed, or checked-out to the working tree, and git will not allow us to
perform more complex operations such as rebase / merge. The only action is to
delete the file.

The patch fixes the issue in two places:

diff.c: send the file through git-clean filter even if it is empty.

read-cache.c: read-cache uses a magic number of zero-sized files to smudge
cache entries that have the same timestamp as other file modifications.
(Note: the word smudge here is unrelated to the "smudge filter" concept)
Smudged entries will have a zero file size, but the stored hash will not equal
the unique, known hash of an empty file. In the absence of clean filters that
can produce non-zero files from zero-sized files, this is a good way to
detectably corrupt a cache entry. However in the presence of git clean filters
that do this, the file will be considered smudged, and it will keep showing up
as touched by the user. That is because the stored hash is actually the hash
of the file after it has passed to the clean filter, and it's not empty anymore.
This change fixes the condition by using a different magic number of smudged
entries. It uses -1 which is the biggest possible file size stored.

Signed-off-by: Purdea Andrei <andrei@xxxxxxxxx>
---
 diff.c       | 18 ++++++++++--------
 read-cache.c |  7 +++----
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/diff.c b/diff.c
index 00b4c8669..76bb536ce 100644
--- a/diff.c
+++ b/diff.c
@@ -2858,8 +2858,6 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 			}
 		}
 		s->size = xsize_t(st.st_size);
-		if (!s->size)
-			goto empty;
 		if (S_ISLNK(st.st_mode)) {
 			struct strbuf sb = STRBUF_INIT;
 
@@ -2894,12 +2892,16 @@ int diff_populate_filespec(struct diff_filespec *s, unsigned int flags)
 			s->is_binary = 1;
 			return 0;
 		}
-		fd = open(s->path, O_RDONLY);
-		if (fd < 0)
-			goto err_empty;
-		s->data = xmmap(NULL, s->size, PROT_READ, MAP_PRIVATE, fd, 0);
-		close(fd);
-		s->should_munmap = 1;
+		if (!s->size) {
+			s->data = "";
+		} else {
+			fd = open(s->path, O_RDONLY);
+			if (fd < 0)
+				goto err_empty;
+			s->data = xmmap(NULL, s->size, PROT_READ, MAP_PRIVATE, fd, 0);
+			close(fd);
+			s->should_munmap = 1;
+		}
 
 		/*
 		 * Convert from working tree format to canonical git format
diff --git a/read-cache.c b/read-cache.c
index 2121b6e7b..ca306993c 100644
--- a/read-cache.c
+++ b/read-cache.c
@@ -262,9 +262,8 @@ static int ce_match_stat_basic(const struct cache_entry *ce, struct stat *st)
 	changed |= match_stat_data(&ce->ce_stat_data, st);
 
 	/* Racily smudged entry? */
-	if (!ce->ce_stat_data.sd_size) {
-		if (!is_empty_blob_sha1(ce->oid.hash))
-			changed |= DATA_CHANGED;
+	if (ce->ce_stat_data.sd_size == (unsigned int)-1) {
+		changed |= DATA_CHANGED;
 	}
 
 	return changed;
@@ -2028,7 +2027,7 @@ static void ce_smudge_racily_clean_entry(struct cache_entry *ce)
 		 * file, and never calls us, so the cached size information
 		 * for "frotz" stays 6 which does not match the filesystem.
 		 */
-		ce->ce_stat_data.sd_size = 0;
+		ce->ce_stat_data.sd_size = (unsigned int)-1;
 	}
 }
 
-- 
2.11.0