diff options
author Jakob Unterwurzacher 2020-06-21 13:25:12 +0200
committer Jakob Unterwurzacher 2020-06-21 13:25:12 +0200
commit f6ded09e36a679695354f4b9bc74242ef399be09 (patch)
parent 74a4accf0cc1fd3265abd8fa53b0721cd72c2158 (diff)
v2api: implement Create
6 files changed, 920 insertions, 0 deletions
diff --git a/internal/fusefrontend/file2.go b/internal/fusefrontend/file2.go
new file mode 100644
index 0000000..2882732
--- /dev/null
+++ b/internal/fusefrontend/file2.go
@@ -0,0 +1,476 @@
+package fusefrontend
+// FUSE operations on file handles
+import (
+ "bytes"
+ "encoding/hex"
+ "fmt"
+ "io"
+ "log"
+ "os"
+ "sync"
+ "syscall"
+ "time"
+ ""
+ ""
+ ""
+ ""
+ ""
+ ""
+ ""
+ ""
+ ""
+var _ nodefs.File = &File2{} // Verify that File2 implements the nodefs.File interface.
+// File2 - based on loopbackFile in go-fuse/fuse/nodefs/files.go
+type File2 struct {
+ fd *os.File
+ // Has Release() already been called on this file? This also means that the
+ // open file table entry has been unregistered, so let's not crash trying to access it.
+ // Due to concurrency, Release can overtake other operations. These will
+ // return EBADF in that case.
+ released bool
+ // fdLock prevents the fd from being closed while we are in the middle of
+ // an operation.
+ // Every FUSE entrypoint should RLock(). The only user of Lock() is
+ // Release(), which closes the fd and sets "released" to true.
+ fdLock sync.RWMutex
+ // Content encryption helper
+ contentEnc *contentenc.ContentEnc
+ // Device and inode number uniquely identify the backing file
+ qIno inomap.QIno
+ // Entry in the open file table
+ fileTableEntry *openfiletable.Entry
+ // Store where the last byte was written
+ lastWrittenOffset int64
+ // The opCount is used to judge whether "lastWrittenOffset" is still
+ // guaranteed to be correct.
+ lastOpCount uint64
+ // Parent filesystem
+ rootNode *RootNode
+ // We embed a nodefs.NewDefaultFile() that returns ENOSYS for every operation we
+ // have not implemented. This prevents build breakage when the go-fuse library
+ // adds new methods to the nodefs.File interface.
+ nodefs.File
+// NewFile2 returns a new File2 instance wrapping the already-open backing file "fd".
+func NewFile2(fd *os.File, rn *RootNode, st *syscall.Stat_t) *File2 {
+ qi := inomap.QInoFromStat(st)
+ e := openfiletable.Register(qi)
+ return &File2{
+ fd: fd,
+ contentEnc: rn.contentEnc,
+ qIno: qi,
+ fileTableEntry: e,
+ rootNode: rn,
+ File: nodefs.NewDefaultFile(),
+ }
+// intFd - return the backing file descriptor as an integer.
+func (f *File2) intFd() int {
+ return int(f.fd.Fd())
+// readFileID loads the file header from disk and extracts the file ID.
+// Returns io.EOF if the file is empty.
+func (f *File2) readFileID() ([]byte, error) {
+ // We read +1 byte to determine if the file has actual content
+ // and not only the header. A header-only file will be considered empty.
+ // This makes File ID poisoning more difficult.
+ readLen := contentenc.HeaderLen + 1
+ buf := make([]byte, readLen)
+ n, err := f.fd.ReadAt(buf, 0)
+ if err != nil {
+ if err == io.EOF && n != 0 {
+ tlog.Warn.Printf("readFileID %d: incomplete file, got %d instead of %d bytes",
+ f.qIno.Ino, n, readLen)
+ f.rootNode.reportMitigatedCorruption(fmt.Sprint(f.qIno.Ino))
+ }
+ return nil, err
+ }
+ buf = buf[:contentenc.HeaderLen]
+ h, err := contentenc.ParseHeader(buf)
+ if err != nil {
+ return nil, err
+ }
+ return h.ID, nil
+// createHeader creates a new random header and writes it to disk.
+// Returns the new file ID.
+// The caller must have exclusively locked fileTableEntry.ContentLock.
+func (f *File2) createHeader() (fileID []byte, err error) {
+ h := contentenc.RandomHeader()
+ buf := h.Pack()
+ // Prevent partially written (=corrupt) header by preallocating the space beforehand
+ if !f.rootNode.args.NoPrealloc {
+ err = syscallcompat.EnospcPrealloc(f.intFd(), 0, contentenc.HeaderLen)
+ if err != nil {
+ if !syscallcompat.IsENOSPC(err) {
+ tlog.Warn.Printf("ino%d: createHeader: prealloc failed: %s\n", f.qIno.Ino, err.Error())
+ }
+ return nil, err
+ }
+ }
+ // Actually write header
+ _, err = f.fd.WriteAt(buf, 0)
+ if err != nil {
+ return nil, err
+ }
+ return h.ID, err
+// doRead - read "length" plaintext bytes from plaintext offset "off" and append
+// to "dst".
+// Arguments "length" and "off" do not have to be block-aligned.
+// doRead reads the corresponding ciphertext blocks from disk, decrypts them and
+// returns the requested part of the plaintext.
+// Called by Read() for normal reading,
+// by Write() and Truncate() via doWrite() for Read-Modify-Write.
+func (f *File2) doRead(dst []byte, off uint64, length uint64) ([]byte, fuse.Status) {
+ // Get the file ID, either from the open file table, or from disk.
+ var fileID []byte
+ f.fileTableEntry.IDLock.Lock()
+ if f.fileTableEntry.ID != nil {
+ // Use the cached value in the file table
+ fileID = f.fileTableEntry.ID
+ } else {
+ // Not cached, we have to read it from disk.
+ var err error
+ fileID, err = f.readFileID()
+ if err != nil {
+ f.fileTableEntry.IDLock.Unlock()
+ if err == io.EOF {
+ // Empty file
+ return nil, fuse.OK
+ }
+ buf := make([]byte, 100)
+ n, _ := f.fd.ReadAt(buf, 0)
+ buf = buf[:n]
+ hexdump := hex.EncodeToString(buf)
+ tlog.Warn.Printf("doRead %d: corrupt header: %v\nFile hexdump (%d bytes): %s",
+ f.qIno.Ino, err, n, hexdump)
+ return nil, fuse.EIO
+ }
+ // Save into the file table
+ f.fileTableEntry.ID = fileID
+ }
+ f.fileTableEntry.IDLock.Unlock()
+ if fileID == nil {
+ log.Panicf("fileID=%v", fileID)
+ }
+ // Read the backing ciphertext in one go
+ blocks := f.contentEnc.ExplodePlainRange(off, length)
+ alignedOffset, alignedLength := blocks[0].JointCiphertextRange(blocks)
+ skip := blocks[0].Skip
+ tlog.Debug.Printf("doRead: off=%d len=%d -> off=%d len=%d skip=%d\n",
+ off, length, alignedOffset, alignedLength, skip)
+ ciphertext := f.rootNode.contentEnc.CReqPool.Get()
+ ciphertext = ciphertext[:int(alignedLength)]
+ n, err := f.fd.ReadAt(ciphertext, int64(alignedOffset))
+ if err != nil && err != io.EOF {
+ tlog.Warn.Printf("read: ReadAt: %s", err.Error())
+ return nil, fuse.ToStatus(err)
+ }
+ // The ReadAt came back empty. We can skip all the decryption and return early.
+ if n == 0 {
+ f.rootNode.contentEnc.CReqPool.Put(ciphertext)
+ return dst, fuse.OK
+ }
+ // Truncate ciphertext buffer down to actually read bytes
+ ciphertext = ciphertext[0:n]
+ firstBlockNo := blocks[0].BlockNo
+ tlog.Debug.Printf("ReadAt offset=%d bytes (%d blocks), want=%d, got=%d", alignedOffset, firstBlockNo, alignedLength, n)
+ // Decrypt it
+ plaintext, err := f.contentEnc.DecryptBlocks(ciphertext, firstBlockNo, fileID)
+ f.rootNode.contentEnc.CReqPool.Put(ciphertext)
+ if err != nil {
+ if f.rootNode.args.ForceDecode && err == stupidgcm.ErrAuth {
+ // We do not have the information which block was corrupt here anymore,
+ // but DecryptBlocks() has already logged it anyway.
+ tlog.Warn.Printf("doRead %d: off=%d len=%d: returning corrupt data due to forcedecode",
+ f.qIno.Ino, off, length)
+ } else {
+ curruptBlockNo := firstBlockNo + f.contentEnc.PlainOffToBlockNo(uint64(len(plaintext)))
+ tlog.Warn.Printf("doRead %d: corrupt block #%d: %v", f.qIno.Ino, curruptBlockNo, err)
+ return nil, fuse.EIO
+ }
+ }
+ // Crop down to the relevant part
+ var out []byte
+ lenHave := len(plaintext)
+ lenWant := int(skip + length)
+ if lenHave > lenWant {
+ out = plaintext[skip:lenWant]
+ } else if lenHave > int(skip) {
+ out = plaintext[skip:lenHave]
+ }
+ // else: out stays empty, file was smaller than the requested offset
+ out = append(dst, out...)
+ f.rootNode.contentEnc.PReqPool.Put(plaintext)
+ return out, fuse.OK
+// Read - FUSE call
+func (f *File2) Read(buf []byte, off int64) (resultData fuse.ReadResult, code fuse.Status) {
+ if len(buf) > fuse.MAX_KERNEL_WRITE {
+ // This would crash us due to our fixed-size buffer pool
+ tlog.Warn.Printf("Read: rejecting oversized request with EMSGSIZE, len=%d", len(buf))
+ return nil, fuse.Status(syscall.EMSGSIZE)
+ }
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ f.fileTableEntry.ContentLock.RLock()
+ defer f.fileTableEntry.ContentLock.RUnlock()
+ tlog.Debug.Printf("ino%d: FUSE Read: offset=%d length=%d", f.qIno.Ino, off, len(buf))
+ if f.rootNode.args.SerializeReads {
+ serialize_reads.Wait(off, len(buf))
+ }
+ out, status := f.doRead(buf[:0], uint64(off), uint64(len(buf)))
+ if f.rootNode.args.SerializeReads {
+ serialize_reads.Done()
+ }
+ if status != fuse.OK {
+ return nil, status
+ }
+ tlog.Debug.Printf("ino%d: Read: status %v, returning %d bytes", f.qIno.Ino, status, len(out))
+ return fuse.ReadResultData(out), status
+// doWrite - encrypt "data" and write it to plaintext offset "off"
+// Arguments do not have to be block-aligned, read-modify-write is
+// performed internally as necessary
+// Called by Write() for normal writing,
+// and by Truncate() to rewrite the last file block.
+// Empty writes do nothing and are allowed.
+func (f *File2) doWrite(data []byte, off int64) (uint32, fuse.Status) {
+ fileWasEmpty := false
+ // Get the file ID, create a new one if it does not exist yet.
+ var fileID []byte
+ // The caller has exclusively locked ContentLock, which blocks all other
+ // readers and writers. No need to take IDLock.
+ if f.fileTableEntry.ID != nil {
+ fileID = f.fileTableEntry.ID
+ } else {
+ // If the file ID is not cached, read it from disk
+ var err error
+ fileID, err = f.readFileID()
+ // Write a new file header if the file is empty
+ if err == io.EOF {
+ fileID, err = f.createHeader()
+ fileWasEmpty = true
+ }
+ if err != nil {
+ return 0, fuse.ToStatus(err)
+ }
+ f.fileTableEntry.ID = fileID
+ }
+ // Handle payload data
+ dataBuf := bytes.NewBuffer(data)
+ blocks := f.contentEnc.ExplodePlainRange(uint64(off), uint64(len(data)))
+ toEncrypt := make([][]byte, len(blocks))
+ for i, b := range blocks {
+ blockData := dataBuf.Next(int(b.Length))
+ // Incomplete block -> Read-Modify-Write
+ if b.IsPartial() {
+ // Read
+ oldData, status := f.doRead(nil, b.BlockPlainOff(), f.contentEnc.PlainBS())
+ if status != fuse.OK {
+ tlog.Warn.Printf("ino%d fh%d: RMW read failed: %s", f.qIno.Ino, f.intFd(), status.String())
+ return 0, status
+ }
+ // Modify
+ blockData = f.contentEnc.MergeBlocks(oldData, blockData, int(b.Skip))
+ tlog.Debug.Printf("len(oldData)=%d len(blockData)=%d", len(oldData), len(blockData))
+ }
+ tlog.Debug.Printf("ino%d: Writing %d bytes to block #%d",
+ f.qIno.Ino, len(blockData), b.BlockNo)
+ // Write into the to-encrypt list
+ toEncrypt[i] = blockData
+ }
+ // Encrypt all blocks
+ ciphertext := f.contentEnc.EncryptBlocks(toEncrypt, blocks[0].BlockNo, f.fileTableEntry.ID)
+ // Preallocate so we cannot run out of space in the middle of the write.
+ // This prevents partially written (=corrupt) blocks.
+ var err error
+ cOff := int64(blocks[0].BlockCipherOff())
+ if !f.rootNode.args.NoPrealloc {
+ err = syscallcompat.EnospcPrealloc(f.intFd(), cOff, int64(len(ciphertext)))
+ if err != nil {
+ if !syscallcompat.IsENOSPC(err) {
+ tlog.Warn.Printf("ino%d fh%d: doWrite: prealloc failed: %v", f.qIno.Ino, f.intFd(), err)
+ }
+ if fileWasEmpty {
+ // Kill the file header again
+ f.fileTableEntry.ID = nil
+ err2 := syscall.Ftruncate(f.intFd(), 0)
+ if err2 != nil {
+ tlog.Warn.Printf("ino%d fh%d: doWrite: rollback failed: %v", f.qIno.Ino, f.intFd(), err2)
+ }
+ }
+ return 0, fuse.ToStatus(err)
+ }
+ }
+ // Write
+ _, err = f.fd.WriteAt(ciphertext, cOff)
+ // Return memory to CReqPool
+ f.rootNode.contentEnc.CReqPool.Put(ciphertext)
+ if err != nil {
+ tlog.Warn.Printf("ino%d fh%d: doWrite: WriteAt off=%d len=%d failed: %v",
+ f.qIno.Ino, f.intFd(), cOff, len(ciphertext), err)
+ return 0, fuse.ToStatus(err)
+ }
+ return uint32(len(data)), fuse.OK
+// isConsecutiveWrite returns true if the current write
+// directly (in time and space) follows the last write.
+// This is an optimisation for streaming writes on NFS where a
+// Stat() call is very expensive.
+// The caller must hold fileTableEntry.ContentLock exclusively, otherwise this check would be racy.
+func (f *File2) isConsecutiveWrite(off int64) bool {
+ opCount := openfiletable.WriteOpCount()
+ return opCount == f.lastOpCount+1 && off == f.lastWrittenOffset+1
+// Write - FUSE call
+// If the write creates a hole, pads the file to the next block boundary.
+func (f *File2) Write(data []byte, off int64) (uint32, fuse.Status) {
+ if len(data) > fuse.MAX_KERNEL_WRITE {
+ // This would crash us due to our fixed-size buffer pool
+ tlog.Warn.Printf("Write: rejecting oversized request with EMSGSIZE, len=%d", len(data))
+ return 0, fuse.Status(syscall.EMSGSIZE)
+ }
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ if f.released {
+ // The file descriptor has been closed concurrently
+ tlog.Warn.Printf("ino%d fh%d: Write on released file", f.qIno.Ino, f.intFd())
+ return 0, fuse.EBADF
+ }
+ f.fileTableEntry.ContentLock.Lock()
+ defer f.fileTableEntry.ContentLock.Unlock()
+ tlog.Debug.Printf("ino%d: FUSE Write: offset=%d length=%d", f.qIno.Ino, off, len(data))
+ // If the write creates a file hole, we have to zero-pad the last block.
+ // But if the write directly follows an earlier write, it cannot create a
+ // hole, and we can save one Stat() call.
+ if !f.isConsecutiveWrite(off) {
+ status := f.writePadHole(off)
+ if !status.Ok() {
+ return 0, status
+ }
+ }
+ n, status := f.doWrite(data, off)
+ if status.Ok() {
+ f.lastOpCount = openfiletable.WriteOpCount()
+ f.lastWrittenOffset = off + int64(len(data)) - 1
+ }
+ return n, status
+// Release - FUSE call, close file
+func (f *File2) Release() {
+ f.fdLock.Lock()
+ if f.released {
+ log.Panicf("ino%d fh%d: double release", f.qIno.Ino, f.intFd())
+ }
+ f.released = true
+ openfiletable.Unregister(f.qIno)
+ f.fd.Close()
+ f.fdLock.Unlock()
+// Flush - FUSE call
+func (f *File2) Flush() fuse.Status {
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ // Since Flush() may be called for each dup'd fd, we don't
+ // want to really close the file, we just want to flush. This
+ // is achieved by closing a dup'd fd.
+ newFd, err := syscall.Dup(f.intFd())
+ if err != nil {
+ return fuse.ToStatus(err)
+ }
+ err = syscall.Close(newFd)
+ return fuse.ToStatus(err)
+// Fsync FUSE call
+func (f *File2) Fsync(flags int) (code fuse.Status) {
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ return fuse.ToStatus(syscall.Fsync(f.intFd()))
+// Chmod FUSE call
+func (f *File2) Chmod(mode uint32) fuse.Status {
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ // os.File.Chmod goes through the "syscallMode" translation function that messes
+ // up the suid and sgid bits. So use syscall.Fchmod directly.
+ err := syscall.Fchmod(f.intFd(), mode)
+ return fuse.ToStatus(err)
+// Chown FUSE call
+func (f *File2) Chown(uid uint32, gid uint32) fuse.Status {
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ return fuse.ToStatus(f.fd.Chown(int(uid), int(gid)))
+// GetAttr FUSE call (like stat)
+func (f *File2) GetAttr(a *fuse.Attr) fuse.Status {
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ tlog.Debug.Printf("file.GetAttr()")
+ st := syscall.Stat_t{}
+ err := syscall.Fstat(f.intFd(), &st)
+ if err != nil {
+ return fuse.ToStatus(err)
+ }
+ f.rootNode.inoMap.TranslateStat(&st)
+ a.FromStat(&st)
+ a.Size = f.contentEnc.CipherSizeToPlainSize(a.Size)
+ if f.rootNode.args.ForceOwner != nil {
+ a.Owner = *f.rootNode.args.ForceOwner
+ }
+ return fuse.OK
+// Utimens FUSE call
+func (f *File2) Utimens(a *time.Time, m *time.Time) fuse.Status {
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ err := syscallcompat.FutimesNano(f.intFd(), a, m)
+ return fuse.ToStatus(err)
diff --git a/internal/fusefrontend/file2_allocate_truncate.go b/internal/fusefrontend/file2_allocate_truncate.go
new file mode 100644
index 0000000..f799a3e
--- /dev/null
+++ b/internal/fusefrontend/file2_allocate_truncate.go
@@ -0,0 +1,217 @@
+package fusefrontend
+// FUSE operations Truncate and Allocate on file handles
+// i.e. ftruncate and fallocate
+import (
+ "log"
+ "syscall"
+ ""
+ ""
+ ""
+// Allocate - FUSE call for fallocate(2)
+// mode=FALLOC_FL_KEEP_SIZE is implemented directly.
+// mode=FALLOC_DEFAULT is implemented as a two-step process:
+// (1) Allocate the space using FALLOC_FL_KEEP_SIZE
+// (2) Set the file size using ftruncate (via truncateGrowFile)
+// This allows us to reuse the file grow mechanics from Truncate as they are
+// complicated and hard to get right.
+// Other modes (hole punching, zeroing) are not supported.
+func (f *File2) Allocate(off uint64, sz uint64, mode uint32) fuse.Status {
+ if mode != FALLOC_DEFAULT && mode != FALLOC_FL_KEEP_SIZE {
+ f := func() {
+ tlog.Info.Printf("fallocate: only mode 0 (default) and 1 (keep size) are supported")
+ }
+ allocateWarnOnce.Do(f)
+ return fuse.Status(syscall.EOPNOTSUPP)
+ }
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ if f.released {
+ return fuse.EBADF
+ }
+ f.fileTableEntry.ContentLock.Lock()
+ defer f.fileTableEntry.ContentLock.Unlock()
+ blocks := f.contentEnc.ExplodePlainRange(off, sz)
+ firstBlock := blocks[0]
+ lastBlock := blocks[len(blocks)-1]
+ // Step (1): Allocate the space the user wants using FALLOC_FL_KEEP_SIZE.
+ // This will fill file holes and/or allocate additional space past the end of
+ // the file.
+ cipherOff := firstBlock.BlockCipherOff()
+ cipherSz := lastBlock.BlockCipherOff() - cipherOff +
+ f.contentEnc.BlockOverhead() + lastBlock.Skip + lastBlock.Length
+ err := syscallcompat.Fallocate(f.intFd(), FALLOC_FL_KEEP_SIZE, int64(cipherOff), int64(cipherSz))
+ tlog.Debug.Printf("Allocate off=%d sz=%d mode=%x cipherOff=%d cipherSz=%d\n",
+ off, sz, mode, cipherOff, cipherSz)
+ if err != nil {
+ return fuse.ToStatus(err)
+ }
+ if mode == FALLOC_FL_KEEP_SIZE {
+ // The user did not want to change the apparent size. We are done.
+ return fuse.OK
+ }
+ // Step (2): Grow the apparent file size
+ // We need the old file size to determine if we are growing the file at all.
+ newPlainSz := off + sz
+ oldPlainSz, err := f.statPlainSize()
+ if err != nil {
+ return fuse.ToStatus(err)
+ }
+ if newPlainSz <= oldPlainSz {
+ // The new size is smaller (or equal). Fallocate with mode = 0 never
+ // truncates a file, so we are done.
+ return fuse.OK
+ }
+ // The file grows. The space has already been allocated in (1), so what is
+ // left to do is to pad the first and last block and call truncate.
+ // truncateGrowFile does just that.
+ return f.truncateGrowFile(oldPlainSz, newPlainSz)
+// Truncate - FUSE call. Sets the plaintext size of the file to newSize.
+//
+// Truncating to zero cuts the backing file to zero bytes, which also removes
+// the file header, so the cached file ID is reset. Growing is delegated to
+// truncateGrowFile. Shrinking reads the remaining part of the new last block,
+// cuts the ciphertext at that block's start and re-writes the partial block
+// through doWrite so it is re-encrypted with the correct length.
+//
+// Returns EBADF if the file has already been released, otherwise the status
+// of the underlying stat/ftruncate/read/write operation.
+func (f *File2) Truncate(newSize uint64) fuse.Status {
+ f.fdLock.RLock()
+ defer f.fdLock.RUnlock()
+ if f.released {
+ // The file descriptor has been closed concurrently.
+ tlog.Warn.Printf("ino%d fh%d: Truncate on released file", f.qIno.Ino, f.intFd())
+ return fuse.EBADF
+ }
+ f.fileTableEntry.ContentLock.Lock()
+ defer f.fileTableEntry.ContentLock.Unlock()
+ var err error
+ // Common case first: Truncate to zero
+ if newSize == 0 {
+ err = syscall.Ftruncate(int(f.fd.Fd()), 0)
+ if err != nil {
+ tlog.Warn.Printf("ino%d fh%d: Ftruncate(fd, 0) returned error: %v", f.qIno.Ino, f.intFd(), err)
+ return fuse.ToStatus(err)
+ }
+ // Truncate to zero kills the file header
+ f.fileTableEntry.ID = nil
+ return fuse.OK
+ }
+ // We need the old file size to determine if we are growing or shrinking
+ // the file
+ oldSize, err := f.statPlainSize()
+ if err != nil {
+ return fuse.ToStatus(err)
+ }
+ oldB := float32(oldSize) / float32(f.contentEnc.PlainBS())
+ newB := float32(newSize) / float32(f.contentEnc.PlainBS())
+ tlog.Debug.Printf("ino%d: FUSE Truncate from %.2f to %.2f blocks (%d to %d bytes)", f.qIno.Ino, oldB, newB, oldSize, newSize)
+ // File size stays the same - nothing to do
+ if newSize == oldSize {
+ return fuse.OK
+ }
+ // File grows
+ if newSize > oldSize {
+ return f.truncateGrowFile(oldSize, newSize)
+ }
+ // File shrinks
+ blockNo := f.contentEnc.PlainOffToBlockNo(newSize)
+ cipherOff := f.contentEnc.BlockNoToCipherOff(blockNo)
+ plainOff := f.contentEnc.BlockNoToPlainOff(blockNo)
+ lastBlockLen := newSize - plainOff
+ var data []byte
+ if lastBlockLen > 0 {
+ var status fuse.Status
+ data, status = f.doRead(nil, plainOff, lastBlockLen)
+ if status != fuse.OK {
+ // Log the status: "err" is always nil at this point, the failure
+ // reason is only carried in "status".
+ tlog.Warn.Printf("Truncate: shrink doRead returned error: %v", status)
+ return status
+ }
+ }
+ // Truncate down to the last complete block
+ err = syscall.Ftruncate(int(f.fd.Fd()), int64(cipherOff))
+ if err != nil {
+ tlog.Warn.Printf("Truncate: shrink Ftruncate returned error: %v", err)
+ return fuse.ToStatus(err)
+ }
+ // Append partial block
+ if lastBlockLen > 0 {
+ _, status := f.doWrite(data, int64(plainOff))
+ return status
+ }
+ return fuse.OK
+// statPlainSize stats the file and returns the plaintext size
+func (f *File2) statPlainSize() (uint64, error) {
+ fi, err := f.fd.Stat()
+ if err != nil {
+ tlog.Warn.Printf("ino%d fh%d: statPlainSize: %v", f.qIno.Ino, f.intFd(), err)
+ return 0, err
+ }
+ cipherSz := uint64(fi.Size())
+ plainSz := uint64(f.contentEnc.CipherSizeToPlainSize(cipherSz))
+ return plainSz, nil
+// truncateGrowFile extends a file using seeking or ftruncate performing RMW on
+// the first and last block as necessary. New blocks in the middle become
+// file holes unless they have been fallocate()'d beforehand.
+func (f *File2) truncateGrowFile(oldPlainSz uint64, newPlainSz uint64) fuse.Status {
+ if newPlainSz <= oldPlainSz {
+ log.Panicf("BUG: newSize=%d <= oldSize=%d", newPlainSz, oldPlainSz)
+ }
+ newEOFOffset := newPlainSz - 1
+ if oldPlainSz > 0 {
+ n1 := f.contentEnc.PlainOffToBlockNo(oldPlainSz - 1)
+ n2 := f.contentEnc.PlainOffToBlockNo(newEOFOffset)
+ // The file is grown within one block, no need to pad anything.
+ // Write a single zero to the last byte and let doWrite figure out the RMW.
+ if n1 == n2 {
+ buf := make([]byte, 1)
+ _, status := f.doWrite(buf, int64(newEOFOffset))
+ return status
+ }
+ }
+ // The truncate creates at least one new block.
+ //
+ // Make sure the old last block is padded to the block boundary. This call
+ // is a no-op if it is already block-aligned.
+ status := f.zeroPad(oldPlainSz)
+ if !status.Ok() {
+ return status
+ }
+ // The new size is block-aligned. In this case we can do everything ourselves
+ // and avoid the call to doWrite.
+ if newPlainSz%f.contentEnc.PlainBS() == 0 {
+ // The file was empty, so it did not have a header. Create one.
+ if oldPlainSz == 0 {
+ id, err := f.createHeader()
+ if err != nil {
+ return fuse.ToStatus(err)
+ }
+ f.fileTableEntry.ID = id
+ }
+ cSz := int64(f.contentEnc.PlainSizeToCipherSize(newPlainSz))
+ err := syscall.Ftruncate(f.intFd(), cSz)
+ if err != nil {
+ tlog.Warn.Printf("Truncate: grow Ftruncate returned error: %v", err)
+ }
+ return fuse.ToStatus(err)
+ }
+ // The new size is NOT aligned, so we need to write a partial block.
+ // Write a single zero to the last byte and let doWrite figure it out.
+ buf := make([]byte, 1)
+ _, status = f.doWrite(buf, int64(newEOFOffset))
+ return status
diff --git a/internal/fusefrontend/file2_holes.go b/internal/fusefrontend/file2_holes.go
new file mode 100644
index 0000000..5e06981
--- /dev/null
+++ b/internal/fusefrontend/file2_holes.go
@@ -0,0 +1,92 @@
+package fusefrontend
+// Helper functions for sparse files (files with holes)
+import (
+ "runtime"
+ "syscall"
+ ""
+ ""
+// Will a write to plaintext offset "targetOff" create a file hole in the
+// ciphertext? If yes, zero-pad the last ciphertext block.
+func (f *File2) writePadHole(targetOff int64) fuse.Status {
+ // Get the current file size.
+ fi, err := f.fd.Stat()
+ if err != nil {
+ tlog.Warn.Printf("checkAndPadHole: Fstat failed: %v", err)
+ return fuse.ToStatus(err)
+ }
+ plainSize := f.contentEnc.CipherSizeToPlainSize(uint64(fi.Size()))
+ // Appending a single byte to the file (equivalent to writing to
+ // offset=plainSize) would write to "nextBlock".
+ nextBlock := f.contentEnc.PlainOffToBlockNo(plainSize)
+ // targetBlock is the block the user wants to write to.
+ targetBlock := f.contentEnc.PlainOffToBlockNo(uint64(targetOff))
+ // The write goes into an existing block or (if the last block was full)
+ // starts a new one directly after the last block. Nothing to do.
+ if targetBlock <= nextBlock {
+ return fuse.OK
+ }
+ // The write goes past the next block. nextBlock has
+ // to be zero-padded to the block boundary and (at least) nextBlock+1
+ // will contain a file hole in the ciphertext.
+ status := f.zeroPad(plainSize)
+ if status != fuse.OK {
+ return status
+ }
+ return fuse.OK
+// Zero-pad the file of size plainSize to the next block boundary. This is a no-op
+// if the file is already block-aligned.
+func (f *File2) zeroPad(plainSize uint64) fuse.Status {
+ lastBlockLen := plainSize % f.contentEnc.PlainBS()
+ if lastBlockLen == 0 {
+ // Already block-aligned
+ return fuse.OK
+ }
+ missing := f.contentEnc.PlainBS() - lastBlockLen
+ pad := make([]byte, missing)
+ tlog.Debug.Printf("zeroPad: Writing %d bytes\n", missing)
+ _, status := f.doWrite(pad, int64(plainSize))
+ return status
+// SeekData calls the lseek syscall with SEEK_DATA. It returns the offset of the
+// next data bytes, skipping over file holes.
+func (f *File2) SeekData(oldOffset int64) (int64, error) {
+ if runtime.GOOS != "linux" {
+ // Does MacOS support something like this?
+ return 0, syscall.EOPNOTSUPP
+ }
+ const SEEK_DATA = 3
+ // Convert plaintext offset to ciphertext offset and round down to the
+ // start of the current block. File holes smaller than a full block will
+ // be ignored.
+ blockNo := f.contentEnc.PlainOffToBlockNo(uint64(oldOffset))
+ oldCipherOff := int64(f.contentEnc.BlockNoToCipherOff(blockNo))
+ // Determine the next data offset. If the old offset points to (or beyond)
+ // the end of the file, the Seek syscall fails with syscall.ENXIO.
+ newCipherOff, err := syscall.Seek(f.intFd(), oldCipherOff, SEEK_DATA)
+ if err != nil {
+ return 0, err
+ }
+ // Convert ciphertext offset back to plaintext offset. At this point,
+ // newCipherOff should always be >= contentenc.HeaderLen. Round down,
+ // but ensure that the result is never smaller than the initial offset
+ // (to avoid endless loops).
+ blockNo = f.contentEnc.CipherOffToBlockNo(uint64(newCipherOff))
+ newOffset := int64(f.contentEnc.BlockNoToPlainOff(blockNo))
+ if newOffset < oldOffset {
+ newOffset = oldOffset
+ }
+ return newOffset, nil
diff --git a/internal/fusefrontend/node.go b/internal/fusefrontend/node.go
index 9074f72..28e606a 100644
--- a/internal/fusefrontend/node.go
+++ b/internal/fusefrontend/node.go
@@ -2,6 +2,7 @@ package fusefrontend
import (
+ "os"
@@ -10,7 +11,9 @@ import (
+ ""
+ ""
// Node is a file or directory in the filesystem tree
@@ -31,6 +34,9 @@ func (n *Node) rootNode() *RootNode {
func (n *Node) Lookup(ctx context.Context, name string, out *fuse.EntryOut) (*fs.Inode, syscall.Errno) {
rn := n.rootNode()
p := filepath.Join(n.path(), name)
+ if rn.isFiltered(p) {
+ return nil, syscall.EPERM
+ }
dirfd, cName, err := rn.openBackingDir(p)
if err != nil {
return nil, fs.ToErrno(err)
@@ -71,3 +77,68 @@ func (n *Node) Getattr(ctx context.Context, f fs.FileHandle, out *fuse.AttrOut)
return 0
+// Create - FUSE call. Creates the file "name" in this directory and opens it.
+//
+// The backing file is always created with O_CREAT|O_EXCL on top of the flags
+// returned by mangleOpenFlags. For long file names the ".name" file is
+// written first and removed again if creating the content file fails.
+//
+// Returns the new child inode and an open File2 handle. Returns EPERM if the
+// path is rejected by isFiltered, otherwise the errno of the failing
+// operation.
+func (n *Node) Create(ctx context.Context, name string, flags uint32, mode uint32, out *fuse.EntryOut) (inode *fs.Inode, fh fs.FileHandle, fuseFlags uint32, errno syscall.Errno) {
+ rn := n.rootNode()
+ path := filepath.Join(n.path(), name)
+ if rn.isFiltered(path) {
+ return nil, nil, 0, syscall.EPERM
+ }
+ dirfd, cName, err := rn.openBackingDir(path)
+ if err != nil {
+ return nil, nil, 0, fs.ToErrno(err)
+ }
+ defer syscall.Close(dirfd)
+ fd := -1
+ // Make sure context is nil if we don't want to preserve the owner
+ if !rn.args.PreserveOwner {
+ ctx = nil
+ }
+ newFlags := rn.mangleOpenFlags(flags)
+ // Handle long file name
+ if !rn.args.PlaintextNames && nametransform.IsLongContent(cName) {
+ // Create ".name"
+ err = rn.nameTransform.WriteLongNameAt(dirfd, cName, path)
+ if err != nil {
+ return nil, nil, 0, fs.ToErrno(err)
+ }
+ // Create content
+ fd, err = syscallcompat.OpenatUserCtx(dirfd, cName, newFlags|syscall.O_CREAT|syscall.O_EXCL, mode, ctx)
+ if err != nil {
+ nametransform.DeleteLongNameAt(dirfd, cName)
+ }
+ } else {
+ // Create content, normal (short) file name
+ fd, err = syscallcompat.OpenatUserCtx(dirfd, cName, newFlags|syscall.O_CREAT|syscall.O_EXCL, mode, ctx)
+ }
+ if err != nil {
+ // xfstests generic/488 triggers this
+ if err == syscall.EMFILE {
+ var lim syscall.Rlimit
+ syscall.Getrlimit(syscall.RLIMIT_NOFILE, &lim)
+ tlog.Warn.Printf("Create %q: too many open files. Current \"ulimit -n\": %d", cName, lim.Cur)
+ }
+ return nil, nil, 0, fs.ToErrno(err)
+ }
+ // Get device number and inode number into `st`
+ st, err := syscallcompat.Fstatat2(dirfd, cName, unix.AT_SYMLINK_NOFOLLOW)
+ if err != nil {
+ // The file has been opened successfully above. Close it again so we
+ // do not leak the file descriptor on this error path.
+ syscall.Close(fd)
+ return nil, nil, 0, fs.ToErrno(err)
+ }
+ // Get unique inode number
+ rn.inoMap.TranslateStat(st)
+ out.Attr.FromStat(st)
+ // Create child node
+ id := fs.StableAttr{
+ Mode: uint32(st.Mode),
+ Gen: 1,
+ Ino: st.Ino,
+ }
+ node := &Node{}
+ ch := n.NewInode(ctx, node, id)
+ f := os.NewFile(uintptr(fd), cName)
+ return ch, NewFile2(f, rn, st), 0, 0
diff --git a/internal/fusefrontend/root_node.go b/internal/fusefrontend/root_node.go
index 1116a41..c84ac93 100644
--- a/internal/fusefrontend/root_node.go
+++ b/internal/fusefrontend/root_node.go
@@ -1,11 +1,16 @@
package fusefrontend
import (
+ "os"
+ "sync/atomic"
+ "syscall"
+ ""
+ ""
@@ -47,6 +52,30 @@ func NewRootNode(args Args, c *contentenc.ContentEnc, n nametransform.NameTransf
+// mangleOpenFlags is used by Create() and Open() to convert the open flags the user
+// wants to the flags we internally use to open the backing file.
+// The returned flags always contain O_NOFOLLOW.
+func (rn *RootNode) mangleOpenFlags(flags uint32) (newFlags int) {
+ newFlags = int(flags)
+ // Convert WRONLY to RDWR. We always need read access to do read-modify-write cycles.
+ if (newFlags & syscall.O_ACCMODE) == syscall.O_WRONLY {
+ newFlags = newFlags ^ os.O_WRONLY | os.O_RDWR
+ }
+ // We also cannot open the file in append mode, we need to seek back for RMW
+ newFlags = newFlags &^ os.O_APPEND
+ // O_DIRECT accesses must be aligned in both offset and length. Due to our
+ // crypto header, alignment will be off, even if userspace makes aligned
+ // accesses. Running xfstests generic/013 on ext4 used to trigger lots of
+ // EINVAL errors due to missing alignment. Just fall back to buffered IO.
+ newFlags = newFlags &^ syscallcompat.O_DIRECT
+ // Create and Open are two separate FUSE operations, so O_CREAT should not
+ // be part of the open flags.
+ newFlags = newFlags &^ syscall.O_CREAT
+ // We always want O_NOFOLLOW to be safe against symlink races
+ newFlags |= syscall.O_NOFOLLOW
+ return newFlags
// reportMitigatedCorruption is used to report a corruption that was transparently
// mitigated and did not return an error to the user. Pass the name of the corrupt
// item (filename for OpenDir(), xattr name for ListXAttr() etc).
@@ -63,3 +92,23 @@ func (rn *RootNode) reportMitigatedCorruption(item string) {
+// isFiltered - check if plaintext "path" should be forbidden
+// Prevents name clashes with internal files when file names are not encrypted
+func (rn *RootNode) isFiltered(path string) bool {
+ atomic.StoreUint32(&rn.IsIdle, 0)
+ if !rn.args.PlaintextNames {
+ return false
+ }
+ // gocryptfs.conf in the root directory is forbidden
+ if path == configfile.ConfDefaultName {
+ tlog.Info.Printf("The name /%s is reserved when -plaintextnames is used\n",
+ configfile.ConfDefaultName)
+ return true
+ }
+ // Note: gocryptfs.diriv is NOT forbidden because diriv and plaintextnames
+ // are exclusive
+ return false
diff --git a/internal/syscallcompat/sys_linux.go b/internal/syscallcompat/sys_linux.go
index e2b19bb..02064ac 100644
--- a/internal/syscallcompat/sys_linux.go
+++ b/internal/syscallcompat/sys_linux.go
@@ -2,6 +2,7 @@
package syscallcompat
import (
+ "context"
@@ -88,6 +89,20 @@ func getSupplementaryGroups(pid uint32) (gids []int) {
return nil
+// OpenatUserCtx tries to extract a fuse.Context from the generic ctx and
+// calls OpenatUser.
+func OpenatUserCtx(dirfd int, path string, flags int, mode uint32, ctx context.Context) (fd int, err error) {
+ var ctx2 *fuse.Context
+ if ctx != nil {
+ if caller, ok := fuse.FromContext(ctx); ok {
+ ctx2 = &fuse.Context{
+ Caller: *caller,
+ }
+ }
+ }
+ return OpenatUser(dirfd, path, flags, mode, ctx2)
// OpenatUser runs the Openat syscall in the context of a different user.
func OpenatUser(dirfd int, path string, flags int, mode uint32, context *fuse.Context) (fd int, err error) {
if context != nil {