Commit b2d0aeb

Author: Steve van Loben Sels
Added Position field to the Tokenizer (#128)
The Position field gives a means to index into the Tokenizer's underlying byte slice. This enables use cases where the caller plans to make edits to the JSON document and wants to leverage the copy func to optimize data movement, or wants to copy the remaining bytes when exiting the tokenizing loop early.
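
A rough sketch of the early-exit use case described above, written as if it lived alongside the package's own code; copyAfterKey, its exact key matching, and the lack of string-escape handling are illustrative assumptions, not part of this change:

    // copyAfterKey tokenizes b until it reaches the object key named by key,
    // then uses Position to copy everything the Tokenizer has not consumed yet
    // with a single copy, instead of walking the rest of the document token by
    // token.
    func copyAfterKey(b []byte, key string) []byte {
        tok := NewTokenizer(b)
        for tok.Next() {
            if tok.IsKey && string(tok.Value) == `"`+key+`"` {
                // Position is the first index of the next token, so
                // b[tok.Position:] is exactly the unprocessed tail.
                rest := make([]byte, len(b)-tok.Position)
                copy(rest, b[tok.Position:])
                return rest
            }
        }
        return nil // the whole document was tokenized, or tok.Err is set
    }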
1 parent 3391c4a commit b2d0aeb

2 files changed (+38, -13 lines)

json/token.go

Lines changed: 17 additions & 1 deletion
@@ -31,7 +31,6 @@ import (
 //	...
 //	}
 //	}
-//
 type Tokenizer struct {
 	// When the tokenizer is positioned on a json delimiter this field is not
 	// zero. In this case the possible values are '{', '}', '[', ']', ':', and
@@ -44,6 +43,17 @@ type Tokenizer struct {
 	// null, true, false, numbers, or quoted strings.
 	Value RawValue
 
+	// Position is the Tokenizer's current index into the underlying byte slice.
+	// Since the Tokenizer has already been advanced by calling Next, this
+	// position will be the first index of the next token. The position of
+	// the current Value can be calculated by subtracting len(token.value).
+	// Accordingly, slicing the underlying bytes like:
+	//
+	//	b[token.Position-len(token.Value):token.Position]
+	//
+	// will yield the current Value.
+	Position int
+
 	// When the tokenizer has encountered invalid content this field is not nil.
 	Err error
 
@@ -92,6 +102,7 @@ func (t *Tokenizer) Reset(b []byte) {
 	// However, it does not compile down to an invocation of duff-copy.
 	t.Delim = 0
 	t.Value = nil
+	t.Position = 0
 	t.Err = nil
 	t.Depth = 0
 	t.Index = 0
@@ -128,13 +139,16 @@ skipLoop:
 
 	if i > 0 {
 		t.json = t.json[i:]
+		t.Position += i
 	}
 
 	if len(t.json) == 0 {
 		t.Reset(nil)
 		return false
 	}
 
+	lenBefore := len(t.json)
+
 	var kind Kind
 	switch t.json[0] {
 	case '"':
@@ -165,6 +179,8 @@ skipLoop:
 		t.Value, t.json, t.Err = t.json[:1], t.json[1:], syntaxError(t.json, "expected token but found '%c'", t.json[0])
 	}
 
+	t.Position += lenBefore - len(t.json)
+
 	t.Depth = t.depth()
 	t.Index = t.index()
 	t.flags = t.flags.withKind(kind)
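
A small illustration of the slicing relationship documented on Position (valueBytes is a hypothetical helper written as if it lived in the same package, not part of this change; the updated test below checks the same invariant for every token):

    // valueBytes re-derives the bytes of the token most recently returned by
    // Next by slicing the original input: the Value ends at Position and
    // starts len(Value) bytes earlier.
    func valueBytes(b []byte, tok *Tokenizer) []byte {
        return b[tok.Position-len(tok.Value) : tok.Position]
    }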

json/token_test.go

Lines changed: 21 additions & 12 deletions
@@ -1,6 +1,7 @@
 package json
 
 import (
+	"bytes"
 	"reflect"
 	"testing"
 )
@@ -40,22 +41,30 @@ func value(v string, depth, index int) token {
 	}
 }
 
-func tokenize(b []byte) (tokens []token) {
-	t := NewTokenizer(b)
+func tokenize(t *testing.T, b []byte) (tokens []token) {
+	tok := NewTokenizer(b)
+
+	for tok.Next() {
+		start, end := tok.Position-len(tok.Value), tok.Position
+		if end > len(b) {
+			t.Fatalf("token position too far [%d:%d], len(b) is %d", start, end, len(b))
+		}
+		if !bytes.Equal(b[start:end], tok.Value) {
+			t.Fatalf("token position is wrong [%d:%d]", start, end)
+		}
 
-	for t.Next() {
 		tokens = append(tokens, token{
-			delim: t.Delim,
-			value: t.Value,
-			err:   t.Err,
-			depth: t.Depth,
-			index: t.Index,
-			isKey: t.IsKey,
+			delim: tok.Delim,
+			value: tok.Value,
+			err:   tok.Err,
+			depth: tok.Depth,
+			index: tok.Index,
+			isKey: tok.IsKey,
 		})
 	}
 
-	if t.Err != nil {
-		panic(t.Err)
+	if tok.Err != nil {
+		t.Fatal(tok.Err)
 	}
 
 	return
@@ -174,7 +183,7 @@ func TestTokenizer(t *testing.T) {
 
 	for _, test := range tests {
 		t.Run(string(test.input), func(t *testing.T) {
-			tokens := tokenize(test.input)
+			tokens := tokenize(t, test.input)
 
 			if !reflect.DeepEqual(tokens, test.tokens) {
 				t.Error("tokens mismatch")
