summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author: Jakob Unterwurzacher, 2017-06-09 21:52:26 +0200
committer: Jakob Unterwurzacher, 2017-06-09 22:05:14 +0200
commit: 80516ed3351477793eec882508969b6b29b69b0a (patch)
tree: c461bd49e79fd6d8bf7f5dc8f28058faf2ba3078
parent: da1bd742461e397abefc814bb0c0a21a6d8ec3d6 (diff)
cryptocore: prefetch nonces in 512-byte blocks
On my machine, reading 512-byte blocks from /dev/urandom (same via the getentropy syscall) is a lot faster in terms of throughput:

Blocksize  Throughput
16         28.18 MB/s
512        83.75 MB/s

For a single-threaded streaming write, this drops the CPU usage of nonceGenerator.Get to almost 1/3:

        flat   flat%   sum%    cum    cum%
Before  0      0%      95.08%  0.35s  2.92%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get
After   0.01s  0.092%  92.34%  0.13s  1.20%  github.com/rfjakob/gocryptfs/internal/cryptocore.(*nonceGenerator).Get

This change makes the nonce reading single-threaded, which may hurt massively-parallel writes.
-rw-r--r--  internal/cryptocore/nonce.go  3
-rw-r--r--  internal/cryptocore/randprefetch.go  50
-rw-r--r--  internal/cryptocore/randprefetch_test.go  40
3 files changed, 91 insertions, 2 deletions
diff --git a/internal/cryptocore/nonce.go b/internal/cryptocore/nonce.go
index 412cdbb..9df094c 100644
--- a/internal/cryptocore/nonce.go
+++ b/internal/cryptocore/nonce.go
@@ -28,6 +28,5 @@ type nonceGenerator struct {
// Get a random "nonceLen"-byte nonce
func (n *nonceGenerator) Get() []byte {
- nonce := RandBytes(n.nonceLen)
- return nonce
+ // Serve the nonce from the package-level randPrefetcher, which hands out
+ // bytes from a buffer that is filled prefetchN bytes at a time, instead
+ // of calling RandBytes for every single nonce.
+ return randPrefetcher.read(n.nonceLen)
}
diff --git a/internal/cryptocore/randprefetch.go b/internal/cryptocore/randprefetch.go
new file mode 100644
index 0000000..8825a05
--- /dev/null
+++ b/internal/cryptocore/randprefetch.go
@@ -0,0 +1,50 @@
+package cryptocore
+
+import (
+ "bytes"
+ "log"
+ "sync"
+)
+
+/*
+Number of bytes to prefetch.
+
+512 looks like a good compromise between throughput and latency:
+(the number directly after "Benchmark" is the block size in bytes that is
+read from the random source per call)
+Benchmark16-2 3000000 567 ns/op 28.18 MB/s
+Benchmark64-2 5000000 293 ns/op 54.51 MB/s
+Benchmark128-2 10000000 220 ns/op 72.48 MB/s
+Benchmark256-2 10000000 210 ns/op 76.17 MB/s
+Benchmark512-2 10000000 191 ns/op 83.75 MB/s
+Benchmark1024-2 10000000 171 ns/op 93.48 MB/s
+Benchmark2048-2 10000000 165 ns/op 96.45 MB/s
+Benchmark4096-2 10000000 165 ns/op 96.58 MB/s
+Benchmark40960-2 10000000 147 ns/op 108.82 MB/s
+*/
+const prefetchN = 512
+
+// randPrefetcherT hands out random bytes from an in-memory buffer that is
+// filled in chunks of prefetchN bytes obtained from RandBytes.
+type randPrefetcherT struct {
+ // Embedded mutex; read() holds it while it accesses buf.
+ sync.Mutex
+ // Prefetched random bytes that have not been handed out yet.
+ buf bytes.Buffer
+}
+
+// read returns a freshly allocated slice holding "want" random bytes.
+//
+// Requests of up to prefetchN bytes are served from the prefetch buffer.
+// Whenever the buffer cannot satisfy a request in full, it is thrown away
+// (including any leftover bytes that were too few for the request) and
+// re-filled with prefetchN fresh bytes from RandBytes.
+//
+// Requests larger than prefetchN could never be satisfied by a single
+// re-fill, so they bypass the buffer and go straight to RandBytes.
+//
+// All accesses to the buffer happen with the mutex held.
+func (r *randPrefetcherT) read(want int) (out []byte) {
+	if want > prefetchN {
+		// The buffer is not involved here, so the lock is not taken.
+		return RandBytes(want)
+	}
+	out = make([]byte, want)
+	r.Lock()
+	// Note: don't use defer, it slows us down!
+	have, err := r.buf.Read(out)
+	if have == want && err == nil {
+		r.Unlock()
+		return out
+	}
+	// Buffer was empty or held fewer than "want" bytes -> re-fill
+	r.buf.Reset()
+	r.buf.Write(RandBytes(prefetchN))
+	have, err = r.buf.Read(out)
+	// Unlock before the sanity check: a panic that gets recovered further up
+	// must not leave the mutex locked forever.
+	r.Unlock()
+	if have != want || err != nil {
+		log.Panicf("randPrefetcher could not satisfy read: have=%d want=%d err=%v", have, want, err)
+	}
+	return out
+}
+
+// randPrefetcher is the single package-level prefetcher instance;
+// nonceGenerator.Get reads its nonces from it.
+var randPrefetcher randPrefetcherT
diff --git a/internal/cryptocore/randprefetch_test.go b/internal/cryptocore/randprefetch_test.go
new file mode 100644
index 0000000..2a568f3
--- /dev/null
+++ b/internal/cryptocore/randprefetch_test.go
@@ -0,0 +1,40 @@
+package cryptocore
+
+import (
+ "bytes"
+ "compress/flate"
+ "runtime"
+ "sync"
+ "testing"
+)
+
+// TestRandPrefetch hammers the randPrefetcher with 100 goroutines and verifies
+// that the result is incompressible.
+//
+// Each goroutine performs 200 reads of 200 bytes. The concatenated output of
+// all goroutines is deflate-compressed at the highest level; the test fails
+// if the compressed stream is smaller than the input.
+func TestRandPrefetch(t *testing.T) {
+	// GOMAXPROCS is a process-wide setting: restore the previous value so
+	// that tests running after this one are not affected.
+	prev := runtime.GOMAXPROCS(10)
+	defer runtime.GOMAXPROCS(prev)
+	p := 100
+	l := 200
+	vec := make([][]byte, p)
+	var wg sync.WaitGroup
+	for i := 0; i < p; i++ {
+		wg.Add(1)
+		go func(i int) {
+			defer wg.Done()
+			// The final size is known: l reads of l bytes each.
+			tmp := make([]byte, 0, l*l)
+			for x := 0; x < l; x++ {
+				tmp = append(tmp, randPrefetcher.read(l)...)
+			}
+			vec[i] = tmp
+		}(i)
+	}
+	wg.Wait()
+	var b bytes.Buffer
+	fw, err := flate.NewWriter(&b, flate.BestCompression)
+	if err != nil {
+		t.Fatal(err)
+	}
+	for _, v := range vec {
+		if _, err := fw.Write(v); err != nil {
+			t.Fatal(err)
+		}
+	}
+	// Close flushes the remaining compressed data into b.
+	if err := fw.Close(); err != nil {
+		t.Fatal(err)
+	}
+	if b.Len() < p*l*l {
+		t.Errorf("random data should be incompressible, but: in=%d compressed=%d\n", p*l*l, b.Len())
+	}
+}