Skip to content

Commit f44d13e

Browse files
heusalagroupbotaibuddy
andauthored
Tool: openalex_search (#36)
* Tool: searxng_search Add searxng_search tool sources and focused docs from develop, sliced as an independent feature. Based on main for minimal diff; marked draft pending scaffold merge. * Tool: pdf_extract Add pdf_extract tool sources and docs from develop as an independent feature. * Tool: wiki_query Add wiki_query tool sources from develop as an independent feature. * Tool: openalex_search Add openalex_search tool sources and docs from develop as an independent feature. --------- Co-authored-by: aibuddy <aibuddy@dev.hg.fi>
1 parent 2f3eb6b commit f44d13e

File tree

3 files changed

+443
-0
lines changed

3 files changed

+443
-0
lines changed

docs/reference/openalex_search.md

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
# OpenAlex search tool (openalex_search)
2+
3+
Search scholarly works via the OpenAlex API.
4+
5+
- Stdin JSON: {"q":string,"from?":string,"to?":string,"per_page?":int<=50}
6+
- Stdout JSON: {"results":[{"title":string,"doi?":string,"publication_year":int,"open_access_url?":string,"authorships":[...],"cited_by_count":int}],"next_cursor?":string}
7+
- Env: OPENALEX_BASE_URL (optional, default https://api.openalex.org), HTTP_TIMEOUT_MS (optional)
8+
- Retries: up to 1 on timeout or 5xx
9+
- SSRF guard: blocks loopback/RFC1918/link-local/ULA and .onion
10+
11+
Example:
12+
13+
```bash
14+
printf '{"q":"golang","per_page":5}' | ./tools/bin/openalex_search | jq
15+
```
Lines changed: 352 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,352 @@
1+
package main
2+
3+
import (
4+
"bufio"
5+
"encoding/json"
6+
"errors"
7+
"fmt"
8+
"net"
9+
"net/http"
10+
"net/url"
11+
"os"
12+
"path/filepath"
13+
"strconv"
14+
"strings"
15+
"time"
16+
)
17+
18+
// input defines the expected stdin JSON for the tool.
type input struct {
	// Q is the required full-text search query.
	Q string `json:"q"`
	// From and To optionally bound the publication-date range; they are
	// forwarded as from_publication_date / to_publication_date.
	From string `json:"from"`
	To   string `json:"to"`
	// PerPage is the requested page size; defaults to 10, capped at 50.
	PerPage int `json:"per_page"`
}
25+
26+
// outputResult is the normalized result row produced by this tool.
type outputResult struct {
	Title string `json:"title"`
	// DOI is omitted from output when OpenAlex reports none.
	DOI             string `json:"doi,omitempty"`
	PublicationYear int    `json:"publication_year"`
	// OpenAccessURL is the oa_url from the OpenAlex open_access object,
	// omitted when absent.
	OpenAccessURL string `json:"open_access_url,omitempty"`
	// Authorships carries through as an opaque list to avoid schema churn.
	Authorships  []any `json:"authorships"`
	CitedByCount int   `json:"cited_by_count"`
}
36+
37+
// output is the stdout JSON envelope produced by the tool.
type output struct {
	Results []outputResult `json:"results"`
	// NextCursor is forwarded from OpenAlex when pagination can continue.
	NextCursor string `json:"next_cursor,omitempty"`
}
42+
43+
func main() {
44+
if err := run(); err != nil {
45+
msg := strings.ReplaceAll(err.Error(), "\n", " ")
46+
fmt.Fprintf(os.Stderr, "{\"error\":%q}\n", msg)
47+
os.Exit(1)
48+
}
49+
}
50+
51+
func run() error {
52+
in, err := decodeInput()
53+
if err != nil {
54+
return err
55+
}
56+
if strings.TrimSpace(in.Q) == "" {
57+
return errors.New("q is required")
58+
}
59+
baseURL, reqURL, err := prepareURLs(in)
60+
if err != nil {
61+
return err
62+
}
63+
client := newHTTPClient(resolveTimeout())
64+
start := time.Now()
65+
raw, status, retries, err := fetchWithRetry(client, baseURL, reqURL)
66+
if err != nil {
67+
return err
68+
}
69+
out := output{Results: mapResults(raw.Results)}
70+
if v := strings.TrimSpace(raw.NextCursor); v != "" {
71+
out.NextCursor = v
72+
}
73+
if err := json.NewEncoder(os.Stdout).Encode(out); err != nil {
74+
return fmt.Errorf("encode json: %w", err)
75+
}
76+
// Best-effort audit; ignore errors.
77+
_ = appendAudit(map[string]any{ //nolint:errcheck
78+
"ts": time.Now().UTC().Format(time.RFC3339Nano),
79+
"tool": "openalex_search",
80+
"url_host": baseURL.Hostname(),
81+
"status": status,
82+
"ms": time.Since(start).Milliseconds(),
83+
"retries": retries,
84+
})
85+
return nil
86+
}
87+
88+
func decodeInput() (input, error) {
89+
var in input
90+
dec := json.NewDecoder(bufio.NewReader(os.Stdin))
91+
if err := dec.Decode(&in); err != nil {
92+
return in, fmt.Errorf("parse json: %w", err)
93+
}
94+
return in, nil
95+
}
96+
97+
func prepareURLs(in input) (*url.URL, *url.URL, error) {
98+
base := strings.TrimSpace(os.Getenv("OPENALEX_BASE_URL"))
99+
if base == "" {
100+
base = "https://api.openalex.org"
101+
}
102+
baseURL, err := url.Parse(base)
103+
if err != nil || (baseURL.Scheme != "http" && baseURL.Scheme != "https") {
104+
return nil, nil, errors.New("OPENALEX_BASE_URL must be a valid http/https URL")
105+
}
106+
if err := ssrfGuard(baseURL); err != nil {
107+
return nil, nil, err
108+
}
109+
reqURL, err := url.Parse(baseURL.String())
110+
if err != nil {
111+
return nil, nil, err
112+
}
113+
// Build: /works?search=...&per-page=...&from_publication_date=...&to_publication_date=...
114+
reqURL.Path = strings.TrimRight(reqURL.Path, "/") + "/works"
115+
q := reqURL.Query()
116+
// The OpenAlex API supports "search" as a generic text search; stick to that.
117+
q.Set("search", in.Q)
118+
if in.PerPage > 0 {
119+
if in.PerPage > 50 {
120+
// OpenAlex allows up to 200, but keep a conservative cap here
121+
in.PerPage = 50
122+
}
123+
q.Set("per-page", strconv.Itoa(in.PerPage))
124+
} else {
125+
q.Set("per-page", "10")
126+
}
127+
if strings.TrimSpace(in.From) != "" {
128+
q.Set("from_publication_date", in.From)
129+
}
130+
if strings.TrimSpace(in.To) != "" {
131+
q.Set("to_publication_date", in.To)
132+
}
133+
reqURL.RawQuery = q.Encode()
134+
return baseURL, reqURL, nil
135+
}
136+
137+
// openalexResponse mirrors the subset of the OpenAlex /works response
// this tool consumes; each result row is kept as a raw map and
// normalized later by mapResults.
type openalexResponse struct {
	Results    []map[string]any `json:"results"`
	NextCursor string           `json:"next_cursor"`
}
141+
142+
func fetchWithRetry(client *http.Client, baseURL *url.URL, reqURL *url.URL) (openalexResponse, int, int, error) {
143+
var out openalexResponse
144+
var lastStatus int
145+
var retries int
146+
for attempt := 0; attempt < 2; attempt++ {
147+
if err := ssrfGuard(baseURL); err != nil {
148+
return openalexResponse{}, 0, retries, err
149+
}
150+
req, err := http.NewRequest(http.MethodGet, reqURL.String(), nil)
151+
if err != nil {
152+
return openalexResponse{}, 0, retries, fmt.Errorf("new request: %w", err)
153+
}
154+
req.Header.Set("User-Agent", "agentcli-openalex/0.1")
155+
resp, err := client.Do(req)
156+
if err != nil {
157+
if isTimeout(err) && attempt == 0 {
158+
retries++
159+
backoffSleep(0, attempt)
160+
continue
161+
}
162+
return openalexResponse{}, 0, retries, fmt.Errorf("http: %w", err)
163+
}
164+
lastStatus = resp.StatusCode
165+
dec := json.NewDecoder(bufio.NewReader(resp.Body))
166+
if resp.StatusCode >= 500 && attempt == 0 {
167+
_ = resp.Body.Close() //nolint:errcheck
168+
retries++
169+
backoffSleep(0, attempt)
170+
continue
171+
}
172+
if err := dec.Decode(&out); err != nil {
173+
_ = resp.Body.Close() //nolint:errcheck
174+
if resp.StatusCode >= 500 && attempt == 0 {
175+
retries++
176+
backoffSleep(0, attempt)
177+
continue
178+
}
179+
return openalexResponse{}, lastStatus, retries, fmt.Errorf("decode json: %w", err)
180+
}
181+
_ = resp.Body.Close() //nolint:errcheck
182+
break
183+
}
184+
return out, lastStatus, retries, nil
185+
}
186+
187+
func mapResults(rows []map[string]any) []outputResult {
188+
out := make([]outputResult, 0, len(rows))
189+
for _, r := range rows {
190+
var res outputResult
191+
if v, ok := r["display_name"].(string); ok {
192+
res.Title = v
193+
}
194+
if v, ok := r["title"].(string); ok && res.Title == "" {
195+
res.Title = v
196+
}
197+
if v, ok := r["doi"].(string); ok {
198+
res.DOI = v
199+
}
200+
if v, ok := r["publication_year"].(float64); ok {
201+
res.PublicationYear = int(v)
202+
} else if v, ok := r["publication_year"].(int); ok {
203+
res.PublicationYear = v
204+
}
205+
if oa, ok := r["open_access"].(map[string]any); ok {
206+
if v, ok := oa["oa_url"].(string); ok {
207+
res.OpenAccessURL = v
208+
}
209+
}
210+
if v, ok := r["authorships"].([]any); ok {
211+
res.Authorships = v
212+
}
213+
if v, ok := r["cited_by_count"].(float64); ok {
214+
res.CitedByCount = int(v)
215+
} else if v, ok := r["cited_by_count"].(int); ok {
216+
res.CitedByCount = v
217+
}
218+
out = append(out, res)
219+
}
220+
return out
221+
}
222+
223+
func resolveTimeout() time.Duration {
224+
// 8s default per spec, can be overridden via HTTP_TIMEOUT_MS
225+
if v := strings.TrimSpace(os.Getenv("HTTP_TIMEOUT_MS")); v != "" {
226+
if ms, err := time.ParseDuration(v + "ms"); err == nil && ms > 0 {
227+
return ms
228+
}
229+
}
230+
return 8 * time.Second
231+
}
232+
233+
func newHTTPClient(timeout time.Duration) *http.Client {
234+
tr := &http.Transport{}
235+
return &http.Client{Timeout: timeout, Transport: tr, CheckRedirect: func(req *http.Request, via []*http.Request) error {
236+
if len(via) >= 5 {
237+
return errors.New("too many redirects")
238+
}
239+
return ssrfGuard(req.URL)
240+
}}
241+
}
242+
243+
func isTimeout(err error) bool {
244+
var ne net.Error
245+
return errors.As(err, &ne) && ne.Timeout()
246+
}
247+
248+
func backoffSleep(_ int64, attempt int) {
249+
time.Sleep(time.Duration(100*(attempt+1)) * time.Millisecond)
250+
}
251+
252+
// ssrfGuard blocks loopback, RFC1918, link-local, ULA, and .onion unless OPENALEX_ALLOW_LOCAL=1
253+
func ssrfGuard(u *url.URL) error {
254+
host := u.Hostname()
255+
if host == "" {
256+
return errors.New("invalid host")
257+
}
258+
if strings.HasSuffix(strings.ToLower(host), ".onion") {
259+
return errors.New("SSRF blocked: onion domains are not allowed")
260+
}
261+
if os.Getenv("OPENALEX_ALLOW_LOCAL") == "1" {
262+
return nil
263+
}
264+
ips, err := net.LookupIP(host)
265+
if err != nil || len(ips) == 0 {
266+
return errors.New("SSRF blocked: cannot resolve host")
267+
}
268+
for _, ip := range ips {
269+
if isPrivateIP(ip) {
270+
return errors.New("SSRF blocked: private or loopback address")
271+
}
272+
}
273+
return nil
274+
}
275+
276+
func isPrivateIP(ip net.IP) bool {
277+
if ip.IsLoopback() || ip.IsLinkLocalUnicast() || ip.IsLinkLocalMulticast() {
278+
return true
279+
}
280+
if v4 := ip.To4(); v4 != nil {
281+
ip = v4
282+
if v4[0] == 10 {
283+
return true
284+
}
285+
if v4[0] == 172 && v4[1]&0xf0 == 16 {
286+
return true
287+
}
288+
if v4[0] == 192 && v4[1] == 168 {
289+
return true
290+
}
291+
if v4[0] == 169 && v4[1] == 254 {
292+
return true
293+
}
294+
if v4[0] == 127 {
295+
return true
296+
}
297+
return false
298+
}
299+
if ip.Equal(net.ParseIP("::1")) {
300+
return true
301+
}
302+
if ip[0] == 0xfe && (ip[1]&0xc0) == 0x80 {
303+
return true
304+
}
305+
if ip[0]&0xfe == 0xfc {
306+
return true
307+
}
308+
return false
309+
}
310+
311+
// appendAudit writes an NDJSON line under .goagent/audit/YYYYMMDD.log at the repo root.
312+
func appendAudit(entry any) error {
313+
b, err := json.Marshal(entry)
314+
if err != nil {
315+
return err
316+
}
317+
root := moduleRoot()
318+
dir := filepath.Join(root, ".goagent", "audit")
319+
if err := os.MkdirAll(dir, 0o755); err != nil {
320+
return err
321+
}
322+
fname := time.Now().UTC().Format("20060102") + ".log"
323+
path := filepath.Join(dir, fname)
324+
f, err := os.OpenFile(path, os.O_CREATE|os.O_APPEND|os.O_WRONLY, 0o644)
325+
if err != nil {
326+
return err
327+
}
328+
defer func() { _ = f.Close() }() //nolint:errcheck
329+
if _, err := f.Write(append(b, '\n')); err != nil {
330+
return err
331+
}
332+
return nil
333+
}
334+
335+
// moduleRoot walks upward from the current working directory until it
// finds a directory containing go.mod and returns it. It falls back to
// the working directory when no go.mod is found, or "." when the working
// directory itself is unavailable.
func moduleRoot() string {
	cwd, err := os.Getwd()
	if err != nil || cwd == "" {
		return "."
	}
	for dir := cwd; ; {
		if _, statErr := os.Stat(filepath.Join(dir, "go.mod")); statErr == nil {
			return dir
		}
		next := filepath.Dir(dir)
		if next == dir {
			// Reached the filesystem root without finding go.mod.
			return cwd
		}
		dir = next
	}
}

0 commit comments

Comments
 (0)