9front - general discussion about 9front
 help / color / mirror / Atom feed
From: ori@eigenstate.org
To: 9front@9front.org
Subject: [9front] git: more speedups
Date: Sun, 22 May 2022 20:24:48 -0400	[thread overview]
Message-ID: <FDA711C4CA5D982FF4B233350AA5BC44@eigenstate.org> (raw)

Inspired by some changes made in game of trees, I've
implemented a number of speedups in git9.

First, hashing the chunks during deltification with
murmurhash instead of sha1 speeds up the delta search
significantly.

The stretch function was micro-optimized a bit as well,
since that was taking a large portion of the time when
chunking.

Finally, the full path is not stored. We only care about
grouping files with the same name and path. We don't care
about the ordering. Therefore, only the hash of the path
xored with the hash of the diretory is kept, which saves
a bunch of mallocs and string munging.

This reduces the time spent repacking some test repos
significantly.

9front:
	% time git/repack
	deltifying 97473 objects: 100%
	writing 97473 objects: 100%
	indexing 97473 objects: 100%
	58.85u 1.39s 61.82r 	 git/repack

	% time /sys/src/cmd/git/6.repack
	deltifying 97473 objects: 100%
	writing 97473 objects: 100%
	indexing 97473 objects: 100%
	43.86u 1.29s 47.51r 	 /sys/src/cmd/git/6.repack

openbsd:

	% time git/repack
	deltifying 2092325 objects: 100%
	writing 2092325 objects: 100%
	indexing 2092325 objects: 100%
	1589.48u 45.03s 1729.18r 	 git/repack

	% time /sys/src/cmd/git/6.repack
	deltifying 2092325 objects: 100%
	writing 2092325 objects: 100%
	indexing 2092325 objects: 100%
	1238.68u 41.49s 1373.15r 	 /sys/src/cmd/git/6.repack

go:	
	% time git/repack
	deltifying 529507 objects: 100%
	writing 529507 objects: 100%
	indexing 529507 objects: 100%
	345.32u 7.71s 369.25r     git/repack

	% time /sys/src/cmd/git/6.repack
	deltifying 529507 objects: 100%
	writing 529507 objects: 100%
	indexing 529507 objects: 100%
	248.07u 4.47s 257.59r 	 /sys/src/cmd/git/6.repack

diff 6fbb1acc8fa0b6655b14e8c46240a4a8d2d8c672 uncommitted
--- a/sys/src/cmd/git/delta.c
+++ b/sys/src/cmd/git/delta.c
@@ -7,7 +7,6 @@
 	Minchunk	= 128,
 	Maxchunk	= 8192,
 	Splitmask	= (1<<8)-1,
-	
 };
 
 static u32int geartab[] = {
@@ -48,9 +47,7 @@
 static u64int
 hash(void *p, int n)
 {
-	uchar buf[SHA1dlen];
-	sha1((uchar*)p, n, buf, nil);
-	return GETBE64(buf);
+	return murmurhash2(p, n);
 }
 
 static void
@@ -172,14 +169,18 @@
 static int
 stretch(Dtab *dt, Dblock *b, uchar *s, uchar *e, int n)
 {
-	uchar *p, *q, *eb;
+	uchar *p0, *p, *q, *eb;
 
 	if(b == nil)
 		return n;
 	p = s + n;
 	q = dt->base + b->off + n;
-	eb = dt->base + dt->nbase;
-	while(n < (1<<24)-1){
+	p0 = p;
+	if(dt->nbase < (1<<24)-1)
+		eb = dt->base + dt->nbase;
+	else
+		eb = dt->base + (1<<24)-1;
+	while(1){
 		if(p == e || q == eb)
 			break;
 		if(*p != *q)
@@ -186,9 +187,8 @@
 			break;
 		p++;
 		q++;
-		n++;
 	}
-	return n;
+	return n + (p - p0);
 }
 
 Delta*
--- /dev/null
+++ /dev/null
--- a/sys/src/cmd/git/git.h
+++ b/sys/src/cmd/git/git.h
@@ -303,6 +303,7 @@
 char	*strip(char *);
 int	findrepo(char *, int);
 int	showprogress(int, int);
+u64int	murmurhash2(void*, usize);
 
 /* packing */
 void	dtinit(Dtab *, Object*);
--- /dev/null
+++ /dev/null
--- a/sys/src/cmd/git/pack.c
+++ b/sys/src/cmd/git/pack.c
@@ -20,7 +20,7 @@
 
 struct Meta {
 	Object	*obj;
-	char	*path;
+	vlong	path;
 	vlong	mtime;
 
 	/* The best delta we picked */
@@ -1284,17 +1284,18 @@
 deltaordercmp(void *pa, void *pb)
 {
 	Meta *a, *b;
-	int cmp;
+	vlong cmp;
 
 	a = *(Meta**)pa;
 	b = *(Meta**)pb;
 	if(a->obj->type != b->obj->type)
 		return a->obj->type - b->obj->type;
-	cmp = strcmp(a->path, b->path);
+	cmp = (b->path - a->path);
 	if(cmp != 0)
-		return cmp;
-	if(a->mtime != b->mtime)
-		return a->mtime - b->mtime;
+		return (cmp < 0) ? -1 : 1;
+	cmp = a->mtime - b->mtime;
+	if(cmp != 0)
+		return (cmp < 0) ? -1 : 1;
 	return memcmp(a->obj->hash.h, b->obj->hash.h, sizeof(a->obj->hash.h));
 }
 
@@ -1317,7 +1318,7 @@
 }
 
 static void
-addmeta(Metavec *v, Objset *has, Object *o, char *path, vlong mtime)
+addmeta(Metavec *v, Objset *has, Object *o, vlong pathid, vlong mtime)
 {
 	Meta *m;
 
@@ -1328,7 +1329,7 @@
 		return;
 	m = emalloc(sizeof(Meta));
 	m->obj = o;
-	m->path = estrdup(path);
+	m->path = pathid;
 	m->mtime = mtime;
 
 	if(v->nmeta == v->metasz){
@@ -1342,7 +1343,6 @@
 freemeta(Meta *m)
 {
 	free(m->delta);
-	free(m->path);
 	free(m);
 }
 
@@ -1351,8 +1351,9 @@
 {
 	Object *t, *o;
 	Dirent *e;
+	vlong dh, eh;
+	int i, k, r;
 	char *p;
-	int i, k;
 
 	if(oshas(has, tree))
 		return 0;
@@ -1363,7 +1364,8 @@
 		unref(t);
 		return 0;
 	}
-	addmeta(v, has, t, dpath, mtime);
+	dh = murmurhash2(dpath, strlen(dpath));
+	addmeta(v, has, t, dh, mtime);
 	for(i = 0; i < t->tree->nent; i++){
 		e = &t->tree->ent[i];
 		if(oshas(has, e->h))
@@ -1372,14 +1374,16 @@
 			continue;
 		k = (e->mode & DMDIR) ? GTree : GBlob;
 		o = clearedobject(e->h, k);
-		p = smprint("%s/%s", dpath, e->name);
-		if(k == GBlob)
-			addmeta(v, has, o, p, mtime);
-		else if(loadtree(v, has, e->h, p, mtime) == -1){
+		if(k == GTree){
+			p = smprint("%s/%s", dpath, e->name);
+			r = loadtree(v, has, e->h, p, mtime);
 			free(p);
-			return -1;
+			if(r == -1)
+				return -1;
+		}else{
+			eh = murmurhash2(e->name, strlen(e->name));
+			addmeta(v, has, o, dh^eh, mtime);
 		}
-		free(p);
 	}
 	unref(t);
 	return 0;
@@ -1400,7 +1404,7 @@
 		unref(c);
 		return 0;
 	}
-	addmeta(v, has, c, "", c->commit->ctime);
+	addmeta(v, has, c, 0, c->commit->ctime);
 	r = loadtree(v, has, c->commit->tree, "", c->commit->ctime);
 	unref(c);
 	return r;
--- a/sys/src/cmd/git/util.c
+++ b/sys/src/cmd/git/util.c
@@ -10,6 +10,10 @@
 int chattygit;
 int interactive = 1;
 
+enum {
+	Seed		= 2928213749ULL
+};
+
 Object*
 emptydir(void)
 {
@@ -390,4 +394,51 @@
 		i = m;
 	}
 	return 1;
+}
+
+u64int
+murmurhash2(void *pp, usize n)
+{
+	u32int m = 0x5bd1e995;
+	u32int r = 24;
+	u32int h, k;
+	u32int *w, *e;
+	uchar *p;
+	
+	h = Seed ^ n;
+	e = pp;
+	e += (n / 4);
+	for (w = pp; w != e; w++) {
+		/*
+		 * NB: this is endian dependent.
+		 * This is fine for use in git, since the
+		 * hashes computed here are only ever used
+		 * for in memory data structures.
+		 *
+		 * Pack files will differ when packed on
+		 * machines with different endianness,
+		 * but the results will still be correct.
+		 */
+		k = *w;
+		k *= m;
+		k ^= k >> r;
+		k *= m;
+
+		h *= m;
+		h ^= k;
+	}
+
+	p = (uchar*)w;
+	switch (n & 0x3) {
+	case 3:	h ^= p[2] << 16;
+	case 2:	h ^= p[1] << 8;
+	case 1:	h ^= p[0] << 0;
+		h *= m;
+	}
+
+	h ^= h >> 13;
+	h *= m;
+	h ^= h >> 15;
+
+	return h;
 }


                 reply	other threads:[~2022-05-23  0:26 UTC|newest]

Thread overview: [no followups] expand[flat|nested]  mbox.gz  Atom feed

Reply instructions:

You may reply publicly to this message via plain-text email
using any one of the following methods:

* Save the following mbox file, import it into your mail client,
  and reply-to-all from there: mbox

  Avoid top-posting and favor interleaved quoting:
  https://en.wikipedia.org/wiki/Posting_style#Interleaved_style

* Reply using the --to, --cc, and --in-reply-to
  switches of git-send-email(1):

  git send-email \
    --in-reply-to=FDA711C4CA5D982FF4B233350AA5BC44@eigenstate.org \
    --to=ori@eigenstate.org \
    --cc=9front@9front.org \
    /path/to/YOUR_REPLY

  https://kernel.org/pub/software/scm/git/docs/git-send-email.html

* If your mail client supports setting the In-Reply-To header
  via mailto: links, try the mailto: link
Be sure your reply has a Subject: header at the top and a blank line before the message body.
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox;
as well as URLs for NNTP newsgroup(s).