diff options
| author | Jakob Unterwurzacher | 2017-06-09 21:52:26 +0200 | 
|---|---|---|
| committer | Jakob Unterwurzacher | 2017-06-09 22:05:14 +0200 | 
| commit | 80516ed3351477793eec882508969b6b29b69b0a (patch) | |
| tree | c461bd49e79fd6d8bf7f5dc8f28058faf2ba3078 /internal | |
| parent | da1bd742461e397abefc814bb0c0a21a6d8ec3d6 (diff) | |
cryptocore: prefetch nonces in 512-byte blocks
On my machine, reading from /dev/urandom in 512-byte blocks
(the getentropy syscall gives the same results) yields much higher
throughput than reading in 16-byte blocks:
Blocksize    Throughput
 16          28.18 MB/s
512          83.75 MB/s
For a single-threaded streaming write, this drops the CPU usage of
nonceGenerator.Get to a bit more than 1/3 of its previous value
(0.35s -> 0.13s cumulative):
        flat  flat%   sum%        cum   cum%
Before     0     0% 95.08%      0.35s  2.92%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get
After  0.01s 0.092% 92.34%      0.13s  1.20%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get
This change serializes nonce reads: all callers now share one
prefetch buffer protected by a single mutex, which may hurt
massively-parallel writes.
Diffstat (limited to 'internal')
| -rw-r--r-- | internal/cryptocore/nonce.go | 3 | ||||
| -rw-r--r-- | internal/cryptocore/randprefetch.go | 50 | ||||
| -rw-r--r-- | internal/cryptocore/randprefetch_test.go | 40 | 
3 files changed, 91 insertions, 2 deletions
| diff --git a/internal/cryptocore/nonce.go b/internal/cryptocore/nonce.go index 412cdbb..9df094c 100644 --- a/internal/cryptocore/nonce.go +++ b/internal/cryptocore/nonce.go @@ -28,6 +28,5 @@ type nonceGenerator struct {  // Get a random "nonceLen"-byte nonce  func (n *nonceGenerator) Get() []byte { -	nonce := RandBytes(n.nonceLen) -	return nonce +	return randPrefetcher.read(n.nonceLen)  } diff --git a/internal/cryptocore/randprefetch.go b/internal/cryptocore/randprefetch.go new file mode 100644 index 0000000..8825a05 --- /dev/null +++ b/internal/cryptocore/randprefetch.go @@ -0,0 +1,50 @@ +package cryptocore + +import ( +	"bytes" +	"log" +	"sync" +) + +/* +Number of bytes to prefetch. + +512 looks like a good compromise between throughput and latency: +Benchmark16-2      	 3000000	       567 ns/op	  28.18 MB/s +Benchmark64-2      	 5000000	       293 ns/op	  54.51 MB/s +Benchmark128-2     	10000000	       220 ns/op	  72.48 MB/s +Benchmark256-2     	10000000	       210 ns/op	  76.17 MB/s +Benchmark512-2     	10000000	       191 ns/op	  83.75 MB/s +Benchmark1024-2    	10000000	       171 ns/op	  93.48 MB/s +Benchmark2048-2    	10000000	       165 ns/op	  96.45 MB/s +Benchmark4096-2    	10000000	       165 ns/op	  96.58 MB/s +Benchmark40960-2   	10000000	       147 ns/op	 108.82 MB/s +*/ +const prefetchN = 512 + +type randPrefetcherT struct { +	sync.Mutex +	buf bytes.Buffer +} + +func (r *randPrefetcherT) read(want int) (out []byte) { +	out = make([]byte, want) +	r.Lock() +	// Note: don't use defer, it slows us down! 
+	have, err := r.buf.Read(out) +	if have == want && err == nil { +		r.Unlock() +		return out +	} +	// Buffer was empty -> re-fill +	r.buf.Reset() +	r.buf.Write(RandBytes(prefetchN)) +	have, err = r.buf.Read(out) +	if have != want || err != nil { +		log.Panicf("randPrefetcher could not satisfy read: have=%d want=%d err=%v", have, want, err) +	} +	r.Unlock() +	return out +} + +var randPrefetcher randPrefetcherT diff --git a/internal/cryptocore/randprefetch_test.go b/internal/cryptocore/randprefetch_test.go new file mode 100644 index 0000000..2a568f3 --- /dev/null +++ b/internal/cryptocore/randprefetch_test.go @@ -0,0 +1,40 @@ +package cryptocore + +import ( +	"bytes" +	"compress/flate" +	"runtime" +	"sync" +	"testing" +) + +// TestRandPrefetch hammers the randPrefetcher with 100 goroutines and verifies +// that the result is incompressible +func TestRandPrefetch(t *testing.T) { +	runtime.GOMAXPROCS(10) +	p := 100 +	l := 200 +	vec := make([][]byte, p) +	var wg sync.WaitGroup +	for i := 0; i < p; i++ { +		wg.Add(1) +		go func(i int) { +			var tmp []byte +			for x := 0; x < l; x++ { +				tmp = append(tmp, randPrefetcher.read(l)...) +			} +			vec[i] = tmp +			wg.Done() +		}(i) +	} +	wg.Wait() +	var b bytes.Buffer +	fw, _ := flate.NewWriter(&b, flate.BestCompression) +	for _, v := range vec { +		fw.Write(v) +	} +	fw.Close() +	if b.Len() < p*l*l { +		t.Errorf("random data should be incompressible, but: in=%d compressed=%d\n", p*l*l, b.Len()) +	} +} | 
