Skip to content

Commit 3da222b

Browse files
heusalagroupbotaibuddy
andauthored
Tool: citation_pack (#40)
* Tool: searxng_search Add searxng_search tool sources and focused docs from develop, sliced as an independent feature. Based on main for minimal diff; marked draft pending scaffold merge. * Tool: pdf_extract Add pdf_extract tool sources and docs from develop as an independent feature. * Tool: wiki_query Add wiki_query tool sources from develop as an independent feature. * Tool: openalex_search Add openalex_search tool sources and docs from develop as an independent feature. * Tool: crossref_search Add crossref_search tool sources and docs from develop as an independent feature. * Tool: github_search Add github_search tool sources and docs from develop as an independent feature. * Tool: dedupe_rank Add dedupe_rank tool sources and docs from develop as an independent feature. * Tool: citation_pack Add citation_pack tool sources and docs from develop as an independent feature. --------- Co-authored-by: aibuddy <aibuddy@dev.hg.fi>
1 parent f9f0aa2 commit 3da222b

File tree

3 files changed

+416
-0
lines changed

3 files changed

+416
-0
lines changed

docs/reference/citation_pack.md

Lines changed: 57 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,57 @@
1+
# citation_pack
2+
3+
Normalize citation metadata and optionally attach a Wayback archive URL.
4+
5+
## Stdin schema
6+
7+
```json
8+
{
9+
"doc": {
10+
"title": "string?",
11+
"url": "string",
12+
"published_at": "string?"
13+
},
14+
"archive": {
15+
"wayback": "boolean?"
16+
}
17+
}
18+
```
19+
20+
## Stdout schema
21+
22+
```json
23+
{
24+
"title": "string?",
25+
"url": "string",
26+
"host": "string",
27+
"accessed_at": "string",
28+
"archive_url": "string?"
29+
}
30+
```
31+
32+
- `accessed_at` is an RFC3339 UTC timestamp of when the pack was created.
33+
- When `archive.wayback` is true, the tool queries a Wayback-compatible endpoint for an existing snapshot and includes its URL if available.
34+
35+
## Environment
36+
37+
- `WAYBACK_BASE_URL` (optional): Base URL for Wayback API (defaults to `https://web.archive.org`).
38+
39+
## Exit codes
40+
41+
- 0: success
42+
- non-zero: error; stderr contains a single-line JSON `{ "error": "..." }`.
43+
44+
## Examples
45+
46+
- Minimal normalization:
47+
48+
```bash
49+
echo '{"doc":{"url":"https://example.com/post"}}' | ./tools/bin/citation_pack | jq .
50+
```
51+
52+
- Include Wayback lookup (using a local test server):
53+
54+
```bash
55+
export WAYBACK_BASE_URL="http://localhost:8080"
56+
echo '{"doc":{"url":"https://example.com/post"},"archive":{"wayback":true}}' | ./tools/bin/citation_pack | jq .
57+
```
Lines changed: 244 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,244 @@
1+
package main
2+
3+
import (
4+
"bufio"
5+
"encoding/json"
6+
"errors"
7+
"fmt"
8+
"net"
9+
"net/http"
10+
"net/url"
11+
"os"
12+
"path/filepath"
13+
"strings"
14+
"time"
15+
)
16+
17+
// input models the JSON request object read from stdin.
type input struct {
	// Doc describes the citation source being packed.
	Doc struct {
		Title       string `json:"title"`        // optional display title; trimmed before output
		URL         string `json:"url"`          // required; must parse as an http/https URL
		PublishedAt string `json:"published_at"` // optional; decoded but not referenced by the visible code
	} `json:"doc"`
	// Archive holds optional archival behavior toggles.
	Archive struct {
		Wayback bool `json:"wayback"` // when true, query the Wayback endpoint for an existing snapshot
	} `json:"archive"`
}
27+
28+
// output is the normalized citation record encoded to stdout as JSON.
type output struct {
	Title      string `json:"title,omitempty"`      // trimmed doc.title; omitted when empty
	URL        string `json:"url"`                  // doc.url echoed unchanged
	Host       string `json:"host"`                 // hostname extracted from doc.url
	AccessedAt string `json:"accessed_at"`          // RFC3339 UTC timestamp taken at pack creation
	ArchiveURL string `json:"archive_url,omitempty"` // Wayback snapshot URL; omitted when none found
}
35+
36+
func main() {
37+
if err := run(); err != nil {
38+
msg := strings.ReplaceAll(err.Error(), "\n", " ")
39+
fmt.Fprintf(os.Stderr, "{\"error\":%q}\n", msg)
40+
os.Exit(1)
41+
}
42+
}
43+
44+
func run() error {
45+
in, err := decodeInput()
46+
if err != nil {
47+
return err
48+
}
49+
if strings.TrimSpace(in.Doc.URL) == "" {
50+
return errors.New("doc.url is required")
51+
}
52+
u, err := url.Parse(in.Doc.URL)
53+
if err != nil || (u.Scheme != "http" && u.Scheme != "https") {
54+
return errors.New("doc.url must be a valid http/https URL")
55+
}
56+
out := output{
57+
Title: strings.TrimSpace(in.Doc.Title),
58+
URL: in.Doc.URL,
59+
Host: u.Hostname(),
60+
AccessedAt: time.Now().UTC().Format(time.RFC3339),
61+
}
62+
63+
archived := false
64+
start := time.Now()
65+
if in.Archive.Wayback {
66+
archiveURL, aerr := waybackLookup(in.Doc.URL)
67+
if aerr != nil {
68+
return aerr
69+
}
70+
if archiveURL != "" {
71+
out.ArchiveURL = archiveURL
72+
archived = true
73+
}
74+
}
75+
if err := json.NewEncoder(os.Stdout).Encode(out); err != nil {
76+
return fmt.Errorf("encode json: %w", err)
77+
}
78+
_ = appendAudit(map[string]any{ //nolint:errcheck
79+
"ts": time.Now().UTC().Format(time.RFC3339Nano),
80+
"tool": "citation_pack",
81+
"url_host": out.Host,
82+
"archived": archived,
83+
"ms": time.Since(start).Milliseconds(),
84+
})
85+
return nil
86+
}
87+
88+
func decodeInput() (input, error) {
89+
var in input
90+
dec := json.NewDecoder(bufio.NewReader(os.Stdin))
91+
if err := dec.Decode(&in); err != nil {
92+
return in, fmt.Errorf("parse json: %w", err)
93+
}
94+
return in, nil
95+
}
96+
97+
// waybackLookup performs a lookup against the Wayback Machine compatible endpoint.
98+
// It respects WAYBACK_BASE_URL if set, otherwise defaults to https://web.archive.org.
99+
// Enforces a 3s timeout and SSRF guard on the base URL.
100+
func waybackLookup(targetURL string) (string, error) {
101+
base := strings.TrimSpace(os.Getenv("WAYBACK_BASE_URL"))
102+
if base == "" {
103+
base = "https://web.archive.org"
104+
}
105+
baseURL, err := url.Parse(base)
106+
if err != nil || (baseURL.Scheme != "http" && baseURL.Scheme != "https") {
107+
return "", errors.New("WAYBACK_BASE_URL must be a valid http/https URL")
108+
}
109+
if err := ssrfGuard(baseURL); err != nil {
110+
return "", err
111+
}
112+
reqURL, err := url.Parse(baseURL.String())
113+
if err != nil {
114+
return "", err
115+
}
116+
reqURL.Path = strings.TrimRight(reqURL.Path, "/") + "/available"
117+
q := reqURL.Query()
118+
q.Set("url", targetURL)
119+
reqURL.RawQuery = q.Encode()
120+
client := &http.Client{Timeout: 3 * time.Second}
121+
resp, err := client.Get(reqURL.String())
122+
if err != nil {
123+
return "", fmt.Errorf("http: %w", err)
124+
}
125+
defer func() { _ = resp.Body.Close() }() //nolint:errcheck
126+
var raw struct {
127+
ArchivedSnapshots struct {
128+
Closest struct {
129+
Available bool `json:"available"`
130+
URL string `json:"url"`
131+
Timestamp string `json:"timestamp"`
132+
} `json:"closest"`
133+
} `json:"archived_snapshots"`
134+
}
135+
if err := json.NewDecoder(bufio.NewReader(resp.Body)).Decode(&raw); err != nil {
136+
return "", fmt.Errorf("decode json: %w", err)
137+
}
138+
if raw.ArchivedSnapshots.Closest.Available {
139+
return raw.ArchivedSnapshots.Closest.URL, nil
140+
}
141+
return "", nil
142+
}
143+
144+
// ssrfGuard similar to other networked tools; can be bypassed in tests via CITATION_PACK_ALLOW_LOCAL=1
145+
func ssrfGuard(u *url.URL) error {
146+
host := u.Hostname()
147+
if host == "" {
148+
return errors.New("invalid host")
149+
}
150+
if strings.HasSuffix(strings.ToLower(host), ".onion") {
151+
return errors.New("SSRF blocked: onion domains are not allowed")
152+
}
153+
if os.Getenv("CITATION_PACK_ALLOW_LOCAL") == "1" {
154+
return nil
155+
}
156+
ips, err := net.LookupIP(host)
157+
if err != nil || len(ips) == 0 {
158+
return errors.New("SSRF blocked: cannot resolve host")
159+
}
160+
for _, ip := range ips {
161+
if isPrivateIP(ip) {
162+
return errors.New("SSRF blocked: private or loopback address")
163+
}
164+
}
165+
return nil
166+
}
167+
168+
// isPrivateIP reports whether ip is a loopback, link-local, RFC 1918
// private IPv4, or IPv6 unique-local (fc00::/7) address — i.e. any address
// the SSRF guard must block.
//
// The standard-library predicates replace the previous hand-rolled octet
// checks and cover the identical set: IsLoopback handles 127.0.0.0/8 and
// ::1, IsPrivate (Go 1.17+) handles 10.0.0.0/8, 172.16.0.0/12,
// 192.168.0.0/16 and fc00::/7, and the link-local predicates handle
// 169.254.0.0/16 and fe80::/10 plus link-local multicast.
func isPrivateIP(ip net.IP) bool {
	return ip.IsLoopback() ||
		ip.IsPrivate() ||
		ip.IsLinkLocalUnicast() ||
		ip.IsLinkLocalMulticast()
}
202+
203+
// appendAudit writes an NDJSON line under .goagent/audit/YYYYMMDD.log at the repo root.
204+
func appendAudit(entry any) error {
205+
b, err := json.Marshal(entry)
206+
if err != nil {
207+
return err
208+
}
209+
root := moduleRoot()
210+
dir := filepath.Join(root, ".goagent", "audit")
211+
if err := os.MkdirAll(dir, 0o755); err != nil {
212+
return err
213+
}
214+
fname := time.Now().UTC().Format("20060102") + ".log"
215+
path := filepath.Join(dir, fname)
216+
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
217+
if err != nil {
218+
return err
219+
}
220+
defer func() { _ = f.Close() }() //nolint:errcheck
221+
if _, err := f.Write(append(b, '\n')); err != nil {
222+
return err
223+
}
224+
return nil
225+
}
226+
227+
// moduleRoot returns the nearest ancestor of the current working directory
// (including the directory itself) that contains a go.mod file. It falls
// back to the working directory when no go.mod is found anywhere above it,
// and to "." when the working directory cannot be determined.
func moduleRoot() string {
	cwd, err := os.Getwd()
	if err != nil || cwd == "" {
		return "."
	}
	for dir := cwd; ; dir = filepath.Dir(dir) {
		if _, statErr := os.Stat(filepath.Join(dir, "go.mod")); statErr == nil {
			return dir
		}
		// Reaching the filesystem root without finding go.mod: fall back.
		if filepath.Dir(dir) == dir {
			return cwd
		}
	}
}

0 commit comments

Comments
 (0)