|
| 1 | +package main |
| 2 | + |
| 3 | +import ( |
| 4 | + "encoding/json" |
| 5 | + "errors" |
| 6 | + "fmt" |
| 7 | + "io" |
| 8 | + "math" |
| 9 | + "os" |
| 10 | + "sort" |
| 11 | + "strings" |
| 12 | +) |
| 13 | + |
// inputDocument is a single document in the tool's JSON input. Title and
// Text are the fields used for similarity tokenization; ID identifies the
// document in the output. URL and PublishedAt are accepted but not
// otherwise read in this file.
type inputDocument struct {
	ID          string `json:"id"`
	URL         string `json:"url,omitempty"`
	Title       string `json:"title,omitempty"`
	Text        string `json:"text,omitempty"`
	PublishedAt string `json:"published_at,omitempty"`
}
| 21 | + |
// toolInput is the top-level JSON structure read from stdin.
type toolInput struct {
	Documents []inputDocument `json:"docs"`
}
| 25 | + |
// outputGroup is one cluster of near-duplicate documents in the output:
// the sorted member IDs, the chosen representative, and the
// representative's TF-IDF score (0 for singleton groups).
type outputGroup struct {
	RepresentativeID string  `json:"representative_id"`
	Members          []string `json:"members"`
	Score            float64 `json:"score"`
}
| 31 | + |
// toolOutput is the top-level JSON structure written to stdout.
type toolOutput struct {
	Groups []outputGroup `json:"groups"`
}
| 35 | + |
// stderrError is the JSON error envelope written to stderr on failure.
type stderrError struct {
	Error string `json:"error"`
	Hint  string `json:"hint,omitempty"`
}
| 40 | + |
| 41 | +func writeErrorAndExit(err error, hint string) { |
| 42 | + encErr := json.NewEncoder(os.Stderr).Encode(stderrError{Error: err.Error(), Hint: hint}) |
| 43 | + if encErr != nil { |
| 44 | + // Best-effort fallback when JSON encode fails |
| 45 | + _, _ = fmt.Fprintf(os.Stderr, "error=%q hint=%q\n", err.Error(), hint) |
| 46 | + } |
| 47 | + os.Exit(1) |
| 48 | +} |
| 49 | + |
| 50 | +func main() { |
| 51 | + data, err := io.ReadAll(os.Stdin) |
| 52 | + if err != nil { |
| 53 | + writeErrorAndExit(err, "failed to read stdin") |
| 54 | + return |
| 55 | + } |
| 56 | + in, err := parseInput(data) |
| 57 | + if err != nil { |
| 58 | + writeErrorAndExit(err, "invalid JSON input for dedupe_rank") |
| 59 | + return |
| 60 | + } |
| 61 | + if len(in.Documents) == 0 { |
| 62 | + writeErrorAndExit(errors.New("missing docs"), "provide docs: [{id,title?,text?,url?,published_at?}]") |
| 63 | + return |
| 64 | + } |
| 65 | + |
| 66 | + documents := buildDocuments(in) |
| 67 | + groups := groupDocuments(documents, 0.25) |
| 68 | + out := toolOutput{Groups: groups} |
| 69 | + if err := json.NewEncoder(os.Stdout).Encode(out); err != nil { |
| 70 | + _, _ = fmt.Fprintf(os.Stderr, "{\"error\":%q}\n", "failed to encode output") |
| 71 | + os.Exit(1) |
| 72 | + } |
| 73 | +} |
| 74 | + |
| 75 | +// parseInput unmarshals tool input from raw JSON bytes. |
| 76 | +func parseInput(data []byte) (toolInput, error) { |
| 77 | + var in toolInput |
| 78 | + err := json.Unmarshal(data, &in) |
| 79 | + return in, err |
| 80 | +} |
| 81 | + |
// docData pairs an input document with its tokenized representation:
// tokens is the stopword-filtered token sequence (order and duplicates
// preserved, used for TF-IDF), and set holds the same tokens as a
// membership set (used for Jaccard similarity).
type docData struct {
	doc    inputDocument
	tokens []string
	set    map[string]struct{}
}
| 87 | + |
| 88 | +// buildDocuments tokenizes, filters, and constructs set representations. |
| 89 | +func buildDocuments(in toolInput) []docData { |
| 90 | + documents := make([]docData, 0, len(in.Documents)) |
| 91 | + for _, d := range in.Documents { |
| 92 | + tokens := tokenizeWords(strings.TrimSpace(d.Title + " " + d.Text)) |
| 93 | + tokens = filterStopwords(tokens) |
| 94 | + set := make(map[string]struct{}, len(tokens)) |
| 95 | + for _, s := range tokens { |
| 96 | + set[s] = struct{}{} |
| 97 | + } |
| 98 | + documents = append(documents, docData{doc: d, tokens: tokens, set: set}) |
| 99 | + } |
| 100 | + return documents |
| 101 | +} |
| 102 | + |
| 103 | +// groupDocuments performs similarity grouping and representative selection. |
| 104 | +func groupDocuments(documents []docData, jaccardThreshold float64) []outputGroup { |
| 105 | + // Union-Find structure |
| 106 | + parent := make([]int, len(documents)) |
| 107 | + for i := range parent { |
| 108 | + parent[i] = i |
| 109 | + } |
| 110 | + var find func(int) int |
| 111 | + find = func(x int) int { |
| 112 | + if parent[x] != x { |
| 113 | + parent[x] = find(parent[x]) |
| 114 | + } |
| 115 | + return parent[x] |
| 116 | + } |
| 117 | + union := func(a, b int) { |
| 118 | + ra, rb := find(a), find(b) |
| 119 | + if ra != rb { |
| 120 | + parent[rb] = ra |
| 121 | + } |
| 122 | + } |
| 123 | + |
| 124 | + // Pairwise similarities |
| 125 | + for i := 0; i < len(documents); i++ { |
| 126 | + for j := i + 1; j < len(documents); j++ { |
| 127 | + sim := jaccard(documents[i].set, documents[j].set) |
| 128 | + if sim >= jaccardThreshold { |
| 129 | + union(i, j) |
| 130 | + } |
| 131 | + } |
| 132 | + } |
| 133 | + |
| 134 | + // Build groups by root parent |
| 135 | + rootToIdx := make(map[int][]int) |
| 136 | + for i := range documents { |
| 137 | + r := find(i) |
| 138 | + rootToIdx[r] = append(rootToIdx[r], i) |
| 139 | + } |
| 140 | + |
| 141 | + // Compute token doc frequency for TF-IDF scoring |
| 142 | + tokenDocFreq := make(map[string]int) |
| 143 | + for _, dd := range documents { |
| 144 | + seen := map[string]struct{}{} |
| 145 | + for _, t := range dd.tokens { |
| 146 | + if _, ok := seen[t]; ok { |
| 147 | + continue |
| 148 | + } |
| 149 | + seen[t] = struct{}{} |
| 150 | + tokenDocFreq[t]++ |
| 151 | + } |
| 152 | + } |
| 153 | + scorer := func(idx int) float64 { return tfidfScore(documents[idx].tokens, tokenDocFreq, float64(len(documents))) } |
| 154 | + |
| 155 | + groups := make([]outputGroup, 0, len(rootToIdx)) |
| 156 | + for _, idxs := range rootToIdx { |
| 157 | + if len(idxs) == 1 { |
| 158 | + i := idxs[0] |
| 159 | + groups = append(groups, outputGroup{ |
| 160 | + RepresentativeID: documents[i].doc.ID, |
| 161 | + Members: []string{documents[i].doc.ID}, |
| 162 | + Score: 0, |
| 163 | + }) |
| 164 | + continue |
| 165 | + } |
| 166 | + // Best representative by score; tie-break by id |
| 167 | + bestIdx := idxs[0] |
| 168 | + bestScore := scorer(bestIdx) |
| 169 | + for k := 1; k < len(idxs); k++ { |
| 170 | + s := scorer(idxs[k]) |
| 171 | + if s > bestScore || (s == bestScore && documents[idxs[k]].doc.ID < documents[bestIdx].doc.ID) { |
| 172 | + bestScore = s |
| 173 | + bestIdx = idxs[k] |
| 174 | + } |
| 175 | + } |
| 176 | + members := make([]string, 0, len(idxs)) |
| 177 | + for _, i := range idxs { |
| 178 | + members = append(members, documents[i].doc.ID) |
| 179 | + } |
| 180 | + sort.Strings(members) |
| 181 | + groups = append(groups, outputGroup{ |
| 182 | + RepresentativeID: documents[bestIdx].doc.ID, |
| 183 | + Members: members, |
| 184 | + Score: bestScore, |
| 185 | + }) |
| 186 | + } |
| 187 | + sort.Slice(groups, func(i, j int) bool { return groups[i].RepresentativeID < groups[j].RepresentativeID }) |
| 188 | + return groups |
| 189 | +} |
| 190 | + |
| 191 | +// tfidfScore computes a crude TF-IDF score for a token sequence. |
| 192 | +func tfidfScore(tokens []string, tokenDocFreq map[string]int, numDocs float64) float64 { |
| 193 | + tf := map[string]int{} |
| 194 | + for _, t := range tokens { |
| 195 | + tf[t]++ |
| 196 | + } |
| 197 | + var score float64 |
| 198 | + for tok, c := range tf { |
| 199 | + df := float64(tokenDocFreq[tok]) |
| 200 | + idf := 0.0 |
| 201 | + if df > 0 { |
| 202 | + idf = math.Log(numDocs / df) |
| 203 | + } |
| 204 | + score += (1.0 + math.Log(float64(c))) * idf |
| 205 | + } |
| 206 | + return score |
| 207 | +} |
| 208 | + |
// tokenizeWords lowercases s and splits it into maximal runs of ASCII
// letters and digits; every other rune acts as a separator. Non-ASCII
// letters are treated as separators, not token characters.
func tokenizeWords(s string) []string {
	var cleaned strings.Builder
	cleaned.Grow(len(s))
	for _, r := range s {
		switch {
		case r >= 'a' && r <= 'z', r >= 'A' && r <= 'Z', r >= '0' && r <= '9':
			cleaned.WriteRune(r)
		default:
			cleaned.WriteByte(' ')
		}
	}
	return strings.Fields(strings.ToLower(cleaned.String()))
}
| 224 | + |
// stopwordSet is a small fixed set of common English stopwords removed
// before similarity comparison. Declared once at package level so
// filterStopwords does not rebuild the map on every call.
var stopwordSet = map[string]struct{}{
	"a": {}, "an": {}, "the": {}, "is": {}, "are": {}, "was": {}, "were": {},
	"by": {}, "of": {}, "and": {}, "to": {}, "in": {}, "on": {}, "for": {},
	"with": {}, "as": {}, "it": {}, "its": {}, "at": {}, "this": {}, "that": {},
}

// filterStopwords returns tokens with all stopwords removed, preserving
// the order (and duplicates) of the remaining tokens. The input slice is
// not modified; an empty or nil input is returned as-is.
func filterStopwords(tokens []string) []string {
	if len(tokens) == 0 {
		return tokens
	}
	out := make([]string, 0, len(tokens))
	for _, t := range tokens {
		if _, skip := stopwordSet[t]; skip {
			continue
		}
		out = append(out, t)
	}
	return out
}
| 244 | + |
// jaccard returns |a ∩ b| / |a ∪ b| for two string sets. Two empty sets
// are defined to be identical (similarity 1.0).
func jaccard(a, b map[string]struct{}) float64 {
	if len(a) == 0 && len(b) == 0 {
		return 1.0
	}
	// Iterate the smaller set for the intersection count.
	small, large := a, b
	if len(b) < len(a) {
		small, large = b, a
	}
	inter := 0
	for k := range small {
		if _, ok := large[k]; ok {
			inter++
		}
	}
	// At least one set is non-empty here, so the union is always positive;
	// the original `if union == 0 { return 0 }` guard was unreachable dead
	// code and has been removed.
	return float64(inter) / float64(len(a)+len(b)-inter)
}
0 commit comments