proof: improve tests and readme

jsign · jsign · commit dc3c3f7ef6ec · 2021-10-30T22:02:39.000-03:00
Signed-off-by: Ignacio Hagopian <jsign.uy@gmail.com>
diff --git a/Makefile b/Makefile
@@ -8,6 +8,9 @@ test:
 	go test ./... -race
 .PHONY: test
 
+install:
+	cd ./cmd/ufsproof && go install .
+
 GOLANGCI_LINT=go run github.com/golangci/golangci-lint/cmd/golangci-lint@v1.42.1
 lint:
 	$(GOLANGCI_LINT) run
diff --git a/README.md b/README.md
@@ -31,8 +31,8 @@ Consider the following UnixFS DAG file with a fanout factor of 3:
 
 
 Considering a verifier is asking a prover to provide a proof that it contains the corresponding block at the _file level offset_ X, the prover generates the subdag inside the green zone:
-- RoundIndigo nodes are internal DAG nodes that are somewhat small-ish and don't contain file data.
-- Square blocks are leaves that contain part of the original file data.
+- Round nodes are internal DAG nodes that are somewhat small-ish and don't contain file data.
+- Square nodes contain chunks of the original file data.
 - The indigo colored nodes are necessary nodes to make the proof verify that the target block (red) is at the specified offset.
 
 
@@ -67,7 +67,7 @@ Notice that if the prover has missing internal nodes of the UnixFS, then the imp
 
 
 ## Proof sizes and benchmark
-The size of the proof should be already close to the minimal level. Notice that these proofs are pretty big for the single reason that no assumptions are made of DAG layout nor chunking. Thus internal nodes at visited levels include many children. If the fan-out factor at each level is the default-ish ones, this involves a non-negligible number of blocks, which are unavoidable to allow having these minimal assumptions.
+The size of the proof should already be close to the minimal level. Notice that these proofs are pretty big for the single reason that no assumptions are made about the DAG layout or chunking. Thus internal nodes at visited levels include many children. If we're able to make some extra assumptions, such as fixed-size chunking, then we could potentially ignore untargeted raw leaves, which are the biggest in size, and only include the targeted (red) leaf node.
 
 Generating and verifying proofs are mostly symmetrical operations. The current implementation is very naive and not optimized in any way. Being stricter with the spec CAR serialization block order can make the implementation faster. Probably, not a big deal unless you're generating proofs for thousands of _Cids_.
 
@@ -78,7 +78,7 @@ The following bullets will probably be implemented soon:
 - [ ] CLI command wirable to `go-ipfs`. The lib already supports any `DAGService` so anything can be pluggable.
 - [ ] Allow strict mode proof validation; maybe it makes sense to fail faster in some cases, nbd.
 - [ ] CLI for validation from DealID in Filecoin network; maybe fun, but `Labels` are unverified.
-- [ ] Many border-case tests.
+- [ ] godocs
 
 This is a side-project made for fun, so, a priori, this is only a hand-wavy roadmap.
 
diff --git a/proof.go b/proof.go
@@ -37,9 +37,9 @@ func ValidateProof(ctx context.Context, root cid.Cid, offset uint64, proof []byt
 		return false, fmt.Errorf("the root isn't the expected one")
 	}
 
-	// TODO(jsign): if we assume some ordering in the CAR file we could simply have a CAR-serial walker
-	//              which would make this much faster and probably simpler in a way avoiding blockstores, etc.
-	//              For now, not have those assumptions and do a naive-ish walk.
+	// TODO: if we assume some ordering in the CAR file we could simply have a CAR-serial walker
+	//       which would make this much faster and probably simpler in a way avoiding blockstores, etc.
+	//       For now, not have those assumptions and do a naive-ish walk.
 	for {
 		block, err := cr.Next()
 		if err == io.EOF {
@@ -68,6 +68,14 @@ func CreateProof(ctx context.Context, root cid.Cid, offset uint64, dserv ipld.DA
 	if err != nil {
 		return nil, fmt.Errorf("get %s from dag service: %s", root, err)
 	}
+	fsRoot, err := unixfs.ExtractFSNode(n)
+	if err != nil {
+		return nil, fmt.Errorf("extracting fsnode from merkle-dag: %s", err)
+	}
+	if fsRoot.FileSize() < offset {
+		return nil, fmt.Errorf("the offset is greater than the file size")
+	}
+
 	proofNodes := []ipld.Node{n}
 
 	var currOffset uint64
diff --git a/proof_test.go b/proof_test.go
@@ -8,7 +8,9 @@ import (
 	"math/rand"
 	"testing"
 
+	"github.com/ipfs/go-cid"
 	chunker "github.com/ipfs/go-ipfs-chunker"
+	ipld "github.com/ipfs/go-ipld-format"
 	"github.com/ipfs/go-unixfs/importer/balanced"
 	h "github.com/ipfs/go-unixfs/importer/helpers"
 	testu "github.com/ipfs/go-unixfs/test"
@@ -18,64 +20,79 @@ import (
 func TestProofVerify(t *testing.T) {
 	t.Parallel()
 
-	dserv := testu.GetDAGServ()
-	r := rand.New(rand.NewSource(22))
-	data := make([]byte, 100000)
-	_, err := io.ReadFull(r, data)
-	require.NoError(t, err)
-	in := bytes.NewReader(data)
-	opts := testu.UseCidV1
-	dbp := h.DagBuilderParams{
-		Dagserv:    dserv,
-		Maxlinks:   3,
-		CidBuilder: opts.Prefix,
-		RawLeaves:  opts.RawLeavesUsed,
-	}
+	ctx := context.Background()
+	dataSize := int64(100000)
 	chunkSize := int64(256)
-	db, err := dbp.New(chunker.NewSizeSplitter(in, chunkSize))
-	require.NoError(t, err)
-	node, err := balanced.Layout(db)
-	require.NoError(t, err)
+	rootCid, dserv := setupData(t, dataSize, chunkSize)
 
 	tests := []struct {
 		proofOffset uint64
 		verifOffset uint64
-		ok          bool
+		notOkVerif  bool
+		notOkProof  bool
 	}{
 		// Correct proofs.
-		{proofOffset: 40, verifOffset: 40, ok: true},
-		{proofOffset: 500, verifOffset: 500, ok: true},
-		{proofOffset: 6000, verifOffset: 6000, ok: true},
-		{proofOffset: 70000, verifOffset: 70000, ok: true},
+		{proofOffset: 40, verifOffset: 40},
+		{proofOffset: 500, verifOffset: 500},
+		{proofOffset: 6000, verifOffset: 6000},
+		{proofOffset: 70000, verifOffset: 70000},
+		{proofOffset: uint64(dataSize), verifOffset: uint64(dataSize)},
 
 		// Correct proof due to being in same block
-		{proofOffset: 40, verifOffset: 41, ok: true},
-		{proofOffset: 41, verifOffset: 40, ok: true},
+		{proofOffset: 40, verifOffset: 41},
+		{proofOffset: 41, verifOffset: 40},
 
 		// Indirectly correct proofs; this should work unless we change
 		// the verification to not allow unvisited blocks; not clear if that's
 		// entirely useful.
-		{proofOffset: 868, verifOffset: 1124, ok: true},
+		{proofOffset: 868, verifOffset: 1124},
 
 		// Definitely wrong proofs.
-		{proofOffset: 40, verifOffset: 50000, ok: false},
-		{proofOffset: 70000, verifOffset: 10, ok: false},
+		{proofOffset: 40, verifOffset: 50000, notOkVerif: true},
+		{proofOffset: 70000, verifOffset: 10, notOkVerif: true},
+
+		// Offset bigger than file size.
+		{proofOffset: uint64(dataSize) + 1, verifOffset: 0, notOkProof: true},
 	}
 
 	for _, test := range tests {
 		test := test
 		tname := fmt.Sprintf("%d %d", test.proofOffset, test.verifOffset)
-
 		t.Run(tname, func(t *testing.T) {
-			ctx, cls := context.WithCancel(context.Background())
-			defer cls()
-			proof, err := CreateProof(ctx, node.Cid(), test.proofOffset, dserv)
-			require.NoError(t, err)
+			t.Parallel()
 
-			ok, err := ValidateProof(ctx, node.Cid(), test.verifOffset, proof)
+			proof, err := CreateProof(ctx, rootCid, test.proofOffset, dserv)
+			if test.notOkProof {
+				require.Error(t, err)
+				return
+			}
 			require.NoError(t, err)
 
-			require.Equal(t, test.ok, ok)
+			ok, err := ValidateProof(ctx, rootCid, test.verifOffset, proof)
+			require.NoError(t, err)
+			require.Equal(t, !test.notOkVerif, ok)
 		})
 	}
 }
+// setupData generates dataSize bytes of seeded pseudo-random data, imports them with a size splitter configured with chunkSize and the balanced layout, and returns the root Cid together with the DAGService handed to the builder; any error fails the test through require.NoError.
+func setupData(t *testing.T, dataSize, chunkSize int64) (cid.Cid, ipld.DAGService) {
+	r := rand.New(rand.NewSource(22)) // fixed seed: the generated bytes are the same on every run
+	data := make([]byte, dataSize)
+	_, err := io.ReadFull(r, data) // fill the whole buffer from the seeded source
+	require.NoError(t, err)
+	in := bytes.NewReader(data)
+	opts := testu.UseCidV1
+	dserv := testu.GetDAGServ()
+	dbp := h.DagBuilderParams{
+		Dagserv:    dserv,
+		Maxlinks:   3, // NOTE(review): presumably the per-node fan-out, matching the README's fan-out-of-3 example; confirm
+		CidBuilder: opts.Prefix,
+		RawLeaves:  opts.RawLeavesUsed,
+	}
+	db, err := dbp.New(chunker.NewSizeSplitter(in, chunkSize))
+	require.NoError(t, err)
+	n, err := balanced.Layout(db)
+	require.NoError(t, err)
+
+	return n.Cid(), dserv // Cid of the node returned by balanced.Layout, plus the DAGService given to the builder
+}