Skip to content

Commit f9f0aa2

Browse files
heusalagroupbot and aibuddy authored
Tool: dedupe_rank (#39)
* Tool: searxng_search Add searxng_search tool sources and focused docs from develop, sliced as an independent feature. Based on main for minimal diff; marked draft pending scaffold merge. * Tool: pdf_extract Add pdf_extract tool sources and docs from develop as an independent feature. * Tool: wiki_query Add wiki_query tool sources from develop as an independent feature. * Tool: openalex_search Add openalex_search tool sources and docs from develop as an independent feature. * Tool: crossref_search Add crossref_search tool sources and docs from develop as an independent feature. * Tool: github_search Add github_search tool sources and docs from develop as an independent feature. * Tool: dedupe_rank Add dedupe_rank tool sources and docs from develop as an independent feature. --------- Co-authored-by: aibuddy <aibuddy@dev.hg.fi>
1 parent cc79aec commit f9f0aa2

File tree

2 files changed

+355
-0
lines changed

2 files changed

+355
-0
lines changed
Lines changed: 267 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,267 @@
1+
package main
2+
3+
import (
4+
"encoding/json"
5+
"errors"
6+
"fmt"
7+
"io"
8+
"math"
9+
"os"
10+
"sort"
11+
"strings"
12+
)
13+
14+
// inputDocument is a single document supplied by the caller on stdin.
// Only ID is required; URL, Title, Text, and PublishedAt are optional
// metadata (Title and Text feed the similarity/scoring pipeline).
type inputDocument struct {
	ID          string `json:"id"`
	URL         string `json:"url,omitempty"`
	Title       string `json:"title,omitempty"`
	Text        string `json:"text,omitempty"`
	PublishedAt string `json:"published_at,omitempty"`
}
21+
22+
// toolInput is the JSON payload read from stdin: {"docs": [...]}.
type toolInput struct {
	Documents []inputDocument `json:"docs"`
}
25+
26+
// outputGroup is one cluster of near-duplicate documents in the result:
// the chosen representative id, all member ids (sorted), and the
// representative's TF-IDF-style score (0 for singleton groups).
type outputGroup struct {
	RepresentativeID string   `json:"representative_id"`
	Members          []string `json:"members"`
	Score            float64  `json:"score"`
}
31+
32+
// toolOutput is the JSON document written to stdout on success.
type toolOutput struct {
	Groups []outputGroup `json:"groups"`
}
35+
36+
// stderrError is the JSON error object written to stderr on failure.
type stderrError struct {
	Error string `json:"error"`
	Hint  string `json:"hint,omitempty"`
}
40+
41+
func writeErrorAndExit(err error, hint string) {
42+
encErr := json.NewEncoder(os.Stderr).Encode(stderrError{Error: err.Error(), Hint: hint})
43+
if encErr != nil {
44+
// Best-effort fallback when JSON encode fails
45+
_, _ = fmt.Fprintf(os.Stderr, "error=%q hint=%q\n", err.Error(), hint)
46+
}
47+
os.Exit(1)
48+
}
49+
50+
func main() {
51+
data, err := io.ReadAll(os.Stdin)
52+
if err != nil {
53+
writeErrorAndExit(err, "failed to read stdin")
54+
return
55+
}
56+
in, err := parseInput(data)
57+
if err != nil {
58+
writeErrorAndExit(err, "invalid JSON input for dedupe_rank")
59+
return
60+
}
61+
if len(in.Documents) == 0 {
62+
writeErrorAndExit(errors.New("missing docs"), "provide docs: [{id,title?,text?,url?,published_at?}]")
63+
return
64+
}
65+
66+
documents := buildDocuments(in)
67+
groups := groupDocuments(documents, 0.25)
68+
out := toolOutput{Groups: groups}
69+
if err := json.NewEncoder(os.Stdout).Encode(out); err != nil {
70+
_, _ = fmt.Fprintf(os.Stderr, "{\"error\":%q}\n", "failed to encode output")
71+
os.Exit(1)
72+
}
73+
}
74+
75+
// parseInput unmarshals tool input from raw JSON bytes.
76+
func parseInput(data []byte) (toolInput, error) {
77+
var in toolInput
78+
err := json.Unmarshal(data, &in)
79+
return in, err
80+
}
81+
82+
// docData pairs a document with its tokenized representation: tokens is the
// ordered, stopword-filtered token list (used for TF-IDF scoring) and set is
// the same tokens as a membership set (used for Jaccard similarity).
type docData struct {
	doc    inputDocument
	tokens []string
	set    map[string]struct{}
}
87+
88+
// buildDocuments tokenizes, filters, and constructs set representations.
89+
func buildDocuments(in toolInput) []docData {
90+
documents := make([]docData, 0, len(in.Documents))
91+
for _, d := range in.Documents {
92+
tokens := tokenizeWords(strings.TrimSpace(d.Title + " " + d.Text))
93+
tokens = filterStopwords(tokens)
94+
set := make(map[string]struct{}, len(tokens))
95+
for _, s := range tokens {
96+
set[s] = struct{}{}
97+
}
98+
documents = append(documents, docData{doc: d, tokens: tokens, set: set})
99+
}
100+
return documents
101+
}
102+
103+
// groupDocuments performs similarity grouping and representative selection.
// Documents whose pairwise Jaccard similarity is at least jaccardThreshold
// are merged into the same group (via union-find, so similarity is
// transitive across the cluster). Each multi-member group's representative
// is the member with the highest TF-IDF-style score; ties prefer the
// lexicographically smaller id. Groups are returned sorted by
// representative id so output is deterministic.
func groupDocuments(documents []docData, jaccardThreshold float64) []outputGroup {
	// Union-Find structure
	parent := make([]int, len(documents))
	for i := range parent {
		parent[i] = i
	}
	var find func(int) int
	find = func(x int) int {
		if parent[x] != x {
			// Path compression: point directly at the root.
			parent[x] = find(parent[x])
		}
		return parent[x]
	}
	union := func(a, b int) {
		ra, rb := find(a), find(b)
		if ra != rb {
			parent[rb] = ra
		}
	}

	// Pairwise similarities — O(n^2) comparisons over all document pairs.
	for i := 0; i < len(documents); i++ {
		for j := i + 1; j < len(documents); j++ {
			sim := jaccard(documents[i].set, documents[j].set)
			if sim >= jaccardThreshold {
				union(i, j)
			}
		}
	}

	// Build groups by root parent
	rootToIdx := make(map[int][]int)
	for i := range documents {
		r := find(i)
		rootToIdx[r] = append(rootToIdx[r], i)
	}

	// Compute token doc frequency for TF-IDF scoring; each token counts at
	// most once per document (hence the per-document `seen` set).
	tokenDocFreq := make(map[string]int)
	for _, dd := range documents {
		seen := map[string]struct{}{}
		for _, t := range dd.tokens {
			if _, ok := seen[t]; ok {
				continue
			}
			seen[t] = struct{}{}
			tokenDocFreq[t]++
		}
	}
	scorer := func(idx int) float64 { return tfidfScore(documents[idx].tokens, tokenDocFreq, float64(len(documents))) }

	groups := make([]outputGroup, 0, len(rootToIdx))
	for _, idxs := range rootToIdx {
		if len(idxs) == 1 {
			// Singleton group: the document represents itself with score 0.
			i := idxs[0]
			groups = append(groups, outputGroup{
				RepresentativeID: documents[i].doc.ID,
				Members:          []string{documents[i].doc.ID},
				Score:            0,
			})
			continue
		}
		// Best representative by score; tie-break by id
		bestIdx := idxs[0]
		bestScore := scorer(bestIdx)
		for k := 1; k < len(idxs); k++ {
			s := scorer(idxs[k])
			if s > bestScore || (s == bestScore && documents[idxs[k]].doc.ID < documents[bestIdx].doc.ID) {
				bestScore = s
				bestIdx = idxs[k]
			}
		}
		members := make([]string, 0, len(idxs))
		for _, i := range idxs {
			members = append(members, documents[i].doc.ID)
		}
		sort.Strings(members)
		groups = append(groups, outputGroup{
			RepresentativeID: documents[bestIdx].doc.ID,
			Members:          members,
			Score:            bestScore,
		})
	}
	// Map iteration order is random, so sort for stable output across runs.
	sort.Slice(groups, func(i, j int) bool { return groups[i].RepresentativeID < groups[j].RepresentativeID })
	return groups
}
190+
191+
// tfidfScore computes a crude TF-IDF score for a token sequence: each
// distinct token contributes (1 + ln(tf)) * ln(numDocs/df), where tf is its
// count within tokens and df its document frequency. Tokens with zero
// document frequency contribute nothing.
func tfidfScore(tokens []string, tokenDocFreq map[string]int, numDocs float64) float64 {
	counts := make(map[string]int, len(tokens))
	for _, tok := range tokens {
		counts[tok]++
	}
	total := 0.0
	for tok, n := range counts {
		df := tokenDocFreq[tok]
		if df == 0 {
			// Unknown token: idf is treated as zero, so it adds nothing.
			continue
		}
		weight := 1.0 + math.Log(float64(n))
		total += weight * math.Log(numDocs/float64(df))
	}
	return total
}
208+
209+
// tokenizeWords splits text into lowercase alphanumeric tokens. Any rune
// outside [A-Za-z0-9] acts as a separator; empty tokens are dropped.
func tokenizeWords(s string) []string {
	isWordRune := func(r rune) bool {
		switch {
		case r >= 'a' && r <= 'z':
			return true
		case r >= 'A' && r <= 'Z':
			return true
		case r >= '0' && r <= '9':
			return true
		}
		return false
	}
	var cleaned strings.Builder
	cleaned.Grow(len(s))
	for _, r := range s {
		if isWordRune(r) {
			cleaned.WriteRune(r)
		} else {
			// Separators collapse into whitespace for strings.Fields below.
			cleaned.WriteByte(' ')
		}
	}
	return strings.Fields(strings.ToLower(cleaned.String()))
}
224+
225+
// filterStopwords removes a small set of common English stopwords,
// preserving the order of the remaining tokens. The input slice is not
// modified; a new slice is returned (except for the empty fast path).
func filterStopwords(tokens []string) []string {
	if len(tokens) == 0 {
		return tokens
	}
	stop := map[string]struct{}{
		"a": {}, "an": {}, "the": {}, "is": {}, "are": {}, "was": {}, "were": {},
		"by": {}, "of": {}, "and": {}, "to": {}, "in": {}, "on": {}, "for": {},
		"with": {}, "as": {}, "it": {}, "its": {}, "at": {}, "this": {}, "that": {},
	}
	kept := make([]string, 0, len(tokens))
	for _, tok := range tokens {
		if _, isStop := stop[tok]; !isStop {
			kept = append(kept, tok)
		}
	}
	return kept
}
244+
245+
// jaccard computes the Jaccard similarity |a∩b| / |a∪b| between two sets.
// Two empty sets are defined as identical (similarity 1.0); if exactly one
// set is empty the result is 0.
func jaccard(a, b map[string]struct{}) float64 {
	if len(a) == 0 && len(b) == 0 {
		return 1.0
	}
	// Iterate the smaller set to bound the intersection work.
	small, large := a, b
	if len(b) < len(a) {
		small, large = b, a
	}
	inter := 0
	for k := range small {
		if _, ok := large[k]; ok {
			inter++
		}
	}
	// At least one set is non-empty here, so the union size is strictly
	// positive and the division is safe. (The original `union == 0` guard
	// was unreachable: union==0 implies both sets empty, which already
	// returned 1.0 above.)
	union := len(a) + len(b) - inter
	return float64(inter) / float64(union)
}
Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
package main_test
2+
3+
import (
4+
"bytes"
5+
"encoding/json"
6+
"os/exec"
7+
"sort"
8+
"strings"
9+
"testing"
10+
11+
testutil "github.com/hyperifyio/goagent/tools/testutil"
12+
)
13+
14+
// group mirrors the tool's output group shape for decoding in tests.
type group struct {
	RepresentativeID string   `json:"representative_id"`
	Members          []string `json:"members"`
	Score            float64  `json:"score"`
}
19+
20+
// output mirrors the tool's top-level stdout JSON for decoding in tests.
type output struct {
	Groups []group `json:"groups"`
}
23+
24+
func runTool(t *testing.T, bin string, input any) (output, string, error) {
25+
t.Helper()
26+
var out output
27+
data, err := json.Marshal(input)
28+
if err != nil {
29+
t.Fatalf("marshal: %v", err)
30+
}
31+
cmd := exec.Command(bin)
32+
cmd.Stdin = bytes.NewReader(data)
33+
var stdout, stderr bytes.Buffer
34+
cmd.Stdout = &stdout
35+
cmd.Stderr = &stderr
36+
err = cmd.Run()
37+
if err == nil {
38+
if decErr := json.Unmarshal([]byte(strings.TrimSpace(stdout.String())), &out); decErr != nil {
39+
t.Fatalf("parse output: %v; raw=%s", decErr, stdout.String())
40+
}
41+
}
42+
return out, strings.TrimSpace(stderr.String()), err
43+
}
44+
45+
// TestDedupeRank_GroupsNearDuplicates encodes the expected behavior:
46+
// - Near-duplicate documents should be grouped together under one representative id
47+
// - The representative is the best-ranked member; tie-breaks use TF-IDF-like signal
48+
// This test is intentionally added before the implementation and should fail until implemented.
49+
func TestDedupeRank_GroupsNearDuplicates(t *testing.T) {
50+
bin := testutil.BuildTool(t, "dedupe_rank")
51+
52+
docs := []map[string]any{
53+
{"id": "a", "title": "Go Programming Language", "text": "Golang is a programming language created at Google."},
54+
{"id": "b", "title": "The Go Language", "text": "Go is a programming language by Google."},
55+
{"id": "c", "title": "Python Info", "text": "Python is a different programming language."},
56+
}
57+
58+
in := map[string]any{"docs": docs}
59+
out, errStr, err := runTool(t, bin, in)
60+
if err != nil {
61+
t.Fatalf("dedupe_rank errored: %v, stderr=%s", err, errStr)
62+
}
63+
if len(out.Groups) == 0 {
64+
t.Fatalf("expected at least one group, got none")
65+
}
66+
// find group containing both a and b
67+
var ab []string
68+
for _, g := range out.Groups {
69+
hasA := false
70+
hasB := false
71+
for _, id := range g.Members {
72+
if id == "a" {
73+
hasA = true
74+
} else if id == "b" {
75+
hasB = true
76+
}
77+
}
78+
if hasA && hasB {
79+
ab = append([]string{}, g.Members...)
80+
// order members for deterministic comparison in golden-style tests
81+
sort.Strings(ab)
82+
break
83+
}
84+
}
85+
if len(ab) == 0 {
86+
t.Fatalf("expected docs 'a' and 'b' to be grouped together; groups=%v", out.Groups)
87+
}
88+
}

0 commit comments

Comments
 (0)