|
| 1 | +package main |
| 2 | + |
| 3 | +import ( |
| 4 | + "encoding/json" |
| 5 | + "errors" |
| 6 | + "fmt" |
| 7 | + "io" |
| 8 | + "math" |
| 9 | + "os" |
| 10 | + "sort" |
| 11 | + "strings" |
| 12 | +) |
| 13 | + |
// inputDocument is a single document in the tool's JSON input. Title and
// Text are the fields used for similarity tokenization; ID identifies the
// document in the output. URL and PublishedAt are accepted but not
// otherwise read in this file.
type inputDocument struct {
	ID          string `json:"id"`
	URL         string `json:"url,omitempty"`
	Title       string `json:"title,omitempty"`
	Text        string `json:"text,omitempty"`
	PublishedAt string `json:"published_at,omitempty"`
}
| 21 | + |
// toolInput is the top-level JSON structure read from stdin.
type toolInput struct {
	Documents []inputDocument `json:"docs"`
}
| 25 | + |
// outputGroup is one cluster of near-duplicate documents in the output:
// the sorted member IDs, the chosen representative, and the
// representative's TF-IDF score (0 for singleton groups).
type outputGroup struct {
	RepresentativeID string  `json:"representative_id"`
	Members          []string `json:"members"`
	Score            float64 `json:"score"`
}
| 31 | + |
// toolOutput is the top-level JSON structure written to stdout.
type toolOutput struct {
	Groups []outputGroup `json:"groups"`
}
| 35 | + |
// stderrError is the JSON error envelope written to stderr on failure.
type stderrError struct {
	Error string `json:"error"`
	Hint  string `json:"hint,omitempty"`
}
| 40 | + |
| 41 | +func writeErrorAndExit(err error, hint string) { |
| 42 | + encErr := json.NewEncoder(os.Stderr).Encode(stderrError{Error: err.Error(), Hint: hint}) |
| 43 | + if encErr != nil { |
| 44 | + // Best-effort fallback when JSON encode fails |
| 45 | + _, _ = fmt.Fprintf(os.Stderr, "error=%q hint=%q\n", err.Error(), hint) |
| 46 | + } |
| 47 | + os.Exit(1) |
| 48 | +} |
| 49 | + |
| 50 | +func main() { |
| 51 | + data, err := io.ReadAll(os.Stdin) |
| 52 | + if err != nil { |
| 53 | + writeErrorAndExit(err, "failed to read stdin") |
| 54 | + return |
| 55 | + } |
| 56 | + in, err := parseInput(data) |
| 57 | + if err != nil { |
| 58 | + writeErrorAndExit(err, "invalid JSON input for dedupe_rank") |
| 59 | + return |
| 60 | + } |
| 61 | + if len(in.Documents) == 0 { |
| 62 | + writeErrorAndExit(errors.New("missing docs"), "provide docs: [{id,title?,text?,url?,published_at?}]") |
| 63 | + return |
| 64 | + } |
| 65 | + |
| 66 | + documents := buildDocuments(in) |
| 67 | + groups := groupDocuments(documents, 0.25) |
| 68 | + out := toolOutput{Groups: groups} |
| 69 | + if err := json.NewEncoder(os.Stdout).Encode(out); err != nil { |
| 70 | + _, _ = fmt.Fprintf(os.Stderr, "{\"error\":%q}\n", "failed to encode output") |
| 71 | + os.Exit(1) |
| 72 | + } |
| 73 | +} |
| 74 | + |
| 75 | +// parseInput unmarshals tool input from raw JSON bytes. |
| 76 | +func parseInput(data []byte) (toolInput, error) { |
| 77 | + var in toolInput |
| 78 | + err := json.Unmarshal(data, &in) |
| 79 | + return in, err |
| 80 | +} |
| 81 | + |
// docData pairs an input document with its tokenized representation:
// tokens is the stopword-filtered token sequence (order and duplicates
// preserved, used for TF-IDF), and set holds the same tokens as a
// membership set (used for Jaccard similarity).
type docData struct {
	doc    inputDocument
	tokens []string
	set    map[string]struct{}
}
| 87 | + |
| 88 | +// buildDocuments tokenizes, filters, and constructs set representations. |
| 89 | +func buildDocuments(in toolInput) []docData { |
| 90 | + documents := make([]docData, 0, len(in.Documents)) |
| 91 | + for _, d := range in.Documents { |
| 92 | + tokens := tokenizeWords(strings.TrimSpace(d.Title + " " + d.Text)) |
| 93 | + tokens = filterStopwords(tokens) |
| 94 | + set := make(map[string]struct{}, len(tokens)) |
| 95 | + for _, s := range tokens { |
| 96 | + set[s] = struct{}{} |
| 97 | + } |
| 98 | + documents = append(documents, docData{doc: d, tokens: tokens, set: set}) |
| 99 | + } |
| 100 | + return documents |
| 101 | +} |
| 102 | + |
| 103 | +// groupDocuments performs similarity grouping and representative selection. |
| 104 | +func groupDocuments(documents []docData, jaccardThreshold float64) []outputGroup { |
| 105 | + // Union-Find structure |
| 106 | + parent := make([]int, len(documents)) |
| 107 | + for i := range parent { |
| 108 | + parent[i] = i |
| 109 | + } |
| 110 | + var find func(int) int |
| 111 | + find = func(x int) int { |
| 112 | + if parent[x] != x { |
| 113 | + parent[x] = find(parent[x]) |
| 114 | + } |
| 115 | + return parent[x] |
| 116 | + } |
| 117 | + union := func(a, b int) { |
| 118 | + ra, rb := find(a), find(b) |
| 119 | + if ra != rb { |
| 120 | + parent[rb] = ra |
| 121 | + } |
| 122 | + } |
| 123 | + |
| 124 | + // Pairwise similarities |
| 125 | + for i := 0; i < len(documents); i++ { |
| 126 | + for j := i + 1; j < len(documents); j++ { |
| 127 | + sim := jaccard(documents[i].set, documents[j].set) |
| 128 | + if sim >= jaccardThreshold { |
| 129 | + union(i, j) |
| 130 | + } |
| 131 | + } |
| 132 | + } |
| 133 | + |
| 134 | + // Build groups by root parent |
| 135 | + rootToIdx := make(map[int][]int) |
| 136 | + for i := range documents { |
| 137 | + r := find(i) |
| 138 | + rootToIdx[r] = append(rootToIdx[r], i) |
| 139 | + } |
| 140 | + |
| 141 | + // Compute token doc frequency for TF-IDF scoring |
| 142 | + tokenDocFreq := make(map[string]int) |
| 143 | + for _, dd := range documents { |
| 144 | + seen := map[string]struct{}{} |
| 145 | + for _, t := range dd.tokens { |
| 146 | + if _, ok := seen[t]; ok { |
| 147 | + continue |
| 148 | + } |
| 149 | + seen[t] = struct{}{} |
| 150 | + tokenDocFreq[t]++ |
| 151 | + } |
| 152 | + } |
| 153 | + scorer := func(idx int) float64 { return tfidfScore(documents[idx].tokens, tokenDocFreq, float64(len(documents))) } |
| 154 | + |
| 155 | + groups := make([]outputGroup, 0, len(rootToIdx)) |
| 156 | + for _, idxs := range rootToIdx { |
| 157 | + if len(idxs) == 1 { |
| 158 | + i := idxs[0] |
| 159 | + groups = append(groups, outputGroup{ |
| 160 | + RepresentativeID: documents[i].doc.ID, |
| 161 | + Members: []string{documents[i].doc.ID}, |
| 162 | + Score: 0, |
| 163 | + }) |
| 164 | + continue |
| 165 | + } |
| 166 | + // Best representative by score; tie-break by id |
| 167 | + bestIdx := idxs[0] |
| 168 | + bestScore := scorer(bestIdx) |
| 169 | + for k := 1; k < len(idxs); k++ { |
| 170 | + s := scorer(idxs[k]) |
| 171 | + if s > bestScore || (s == bestScore && documents[idxs[k]].doc.ID < documents[bestIdx].doc.ID) { |
| 172 | + bestScore = s |
| 173 | + bestIdx = idxs[k] |
| 174 | + } |
| 175 | + } |
| 176 | + members := make([]string, 0, len(idxs)) |
| 177 | + for _, i := range idxs { |
| 178 | + members = append(members, documents[i].doc.ID) |
| 179 | + } |
| 180 | + sort.Strings(members) |
| 181 | + groups = append(groups, outputGroup{ |
| 182 | + RepresentativeID: documents[bestIdx].doc.ID, |
| 183 | + Members: members, |
| 184 | + Score: bestScore, |
| 185 | + }) |
| 186 | + } |
| 187 | + sort.Slice(groups, func(i, j int) bool { return groups[i].RepresentativeID < groups[j].RepresentativeID }) |
| 188 | + return groups |
| 189 | +} |
| 190 | + |
| 191 | +// tfidfScore computes a crude TF-IDF score for a token sequence. |
| 192 | +func tfidfScore(tokens []string, tokenDocFreq map[string]int, numDocs float64) float64 { |
| 193 | + tf := map[string]int{} |
| 194 | + for _, t := range tokens { |
| 195 | + tf[t]++ |
| 196 | + } |
| 197 | + var score float64 |
| 198 | + for tok, c := range tf { |
| 199 | + df := float64(tokenDocFreq[tok]) |
| 200 | + idf := 0.0 |
| 201 | + if df > 0 { |
| 202 | + idf = math.Log(numDocs / df) |
| 203 | + } |
| 204 | + score += (1.0 + math.Log(float64(c))) * idf |
| 205 | + } |
| 206 | + return score |
| 207 | +} |
| 208 | + |
// tokenizeWords lowercases s and splits it into maximal runs of ASCII
// letters and digits; every other rune acts as a separator. Non-ASCII
// letters are treated as separators, not token characters.
func tokenizeWords(s string) []string {
	var cleaned strings.Builder
	cleaned.Grow(len(s))
	for _, r := range s {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			cleaned.WriteRune(r)
		default:
			cleaned.WriteByte(' ')
		}
	}
	return strings.Fields(strings.ToLower(cleaned.String()))
}
| 224 | + |
// stopwordSet is a small fixed set of common English stopwords removed
// before similarity comparison. Declared once at package level so
// filterStopwords does not rebuild the map on every call.
var stopwordSet = map[string]struct{}{
	"a": {}, "an": {}, "the": {}, "is": {}, "are": {}, "was": {}, "were": {},
	"by": {}, "of": {}, "and": {}, "to": {}, "in": {}, "on": {}, "for": {},
	"with": {}, "as": {}, "it": {}, "its": {}, "at": {}, "this": {}, "that": {},
}

// filterStopwords returns tokens with all stopwords removed, preserving
// the order (and duplicates) of the remaining tokens. The input slice is
// not modified; an empty or nil input is returned as-is.
func filterStopwords(tokens []string) []string {
	if len(tokens) == 0 {
		return tokens
	}
	out := make([]string, 0, len(tokens))
	for _, t := range tokens {
		if _, skip := stopwordSet[t]; skip {
			continue
		}
		out = append(out, t)
	}
	return out
}
| 244 | + |
// jaccard returns |a ∩ b| / |a ∪ b| for two string sets. Two empty sets
// are defined to be identical (similarity 1.0).
func jaccard(a, b map[string]struct{}) float64 {
	if len(a) == 0 && len(b) == 0 {
		return 1.0
	}
	// Iterate the smaller set for the intersection count.
	small, large := a, b
	if len(b) < len(a) {
		small, large = b, a
	}
	inter := 0
	for k := range small {
		if _, ok := large[k]; ok {
			inter++
		}
	}
	// At least one set is non-empty here, so the union is always positive;
	// the original `if union == 0 { return 0 }` guard was unreachable dead
	// code and has been removed.
	return float64(inter) / float64(len(a)+len(b)-inter)
}
0 commit comments