Skip to content

Commit b8059a0

Browse files
committed
harden install process for BSDs
1 parent 1ab07cc commit b8059a0

File tree

2 files changed

+131
-86
lines changed

2 files changed

+131
-86
lines changed

cmd/agent/install.go

Lines changed: 62 additions & 52 deletions
Original file line number | Diff line number | Diff line change
@@ -10,7 +10,6 @@ import (
1010
"path/filepath"
1111
"runtime"
1212
"strings"
13-
"syscall"
1413
"text/template"
1514
"time"
1615
)
@@ -453,21 +452,31 @@ func uninstallLinux() error {
453452
return nil
454453
}
455454

455+
// crontabCmd is the name of the crontab executable. It is resolved via
// exec.LookPath to detect cron support and invoked with "-l", "-" and "-r"
// to list, replace and remove the current user's crontab.
const crontabCmd = "crontab"
456+
456457
// installCron installs a cron job (fallback for systems without systemd/launchd).
457458
func installCron(agentPath, _, _ string) error {
458459
// Check if crontab is available
459-
if _, err := exec.LookPath("crontab"); err != nil {
460+
if _, err := exec.LookPath(crontabCmd); err != nil {
460461
return errors.New("neither systemd user services nor cron are available - manual startup required")
461462
}
462463

463464
// Get current crontab
464-
cmd := exec.Command("crontab", "-l") //nolint:noctx // local command
465-
output, _ := cmd.Output() //nolint:errcheck // Ignore error - no crontab is fine
465+
cmd := exec.Command(crontabCmd, "-l") //nolint:noctx // local command
466+
output, _ := cmd.Output() //nolint:errcheck // Ignore error - no crontab is fine
466467
currentCron := string(output)
467468

468-
// Check if already installed
469-
if strings.Contains(currentCron, agentPath) {
470-
return nil // Already installed
469+
// Check if already installed - look for the agent name in the crontab
470+
if strings.Contains(currentCron, agentName) {
471+
log.Printf("[INFO] Cron job for %s already installed, updating entries", agentName)
472+
// Remove old entries to replace with new ones
473+
var filteredLines []string
474+
for _, line := range strings.Split(currentCron, "\n") {
475+
if !strings.Contains(line, agentName) {
476+
filteredLines = append(filteredLines, line)
477+
}
478+
}
479+
currentCron = strings.Join(filteredLines, "\n")
471480
}
472481

473482
// Add new cron jobs: run at reboot and every 15 minutes
@@ -485,59 +494,60 @@ func installCron(agentPath, _, _ string) error {
485494
}
486495

487496
// Install new crontab
488-
cmd = exec.Command("crontab", "-") //nolint:noctx // local command
497+
log.Printf("[INFO] Installing cron entries for %s", agentPath)
498+
cmd = exec.Command(crontabCmd, "-") //nolint:noctx // local command
489499
cmd.Stdin = strings.NewReader(newCron)
490500
if err := cmd.Run(); err != nil {
491501
return fmt.Errorf("failed to install crontab: %w", err)
492502
}
503+
log.Print("[INFO] Cron entries installed successfully")
504+
505+
// Verify the crontab was actually installed
506+
cmd = exec.Command(crontabCmd, "-l") //nolint:noctx // local command
507+
if output, err := cmd.Output(); err == nil {
508+
if strings.Contains(string(output), agentPath) {
509+
log.Print("[INFO] Cron entries verified in crontab")
510+
} else {
511+
log.Print("[WARN] Cron entries not found in crontab after installation")
512+
}
513+
}
493514

494515
// Try to start the agent immediately in background
495-
// Use nohup to ensure the process survives after installer exits
496516
log.Printf("[INFO] Starting agent in background: %s", agentPath)
497517

498-
// Check if nohup is available
499-
nohupPath, nohupErr := exec.LookPath("nohup")
500-
if nohupErr == nil {
501-
// Use nohup with proper detachment
502-
cmd = exec.Command(nohupPath, agentPath) //nolint:noctx // agent spawns its own context
503-
cmd.Stdin = nil
504-
// Redirect stdout/stderr to /dev/null to fully detach
505-
devNull, err := os.Open("/dev/null")
506-
if err == nil {
507-
cmd.Stdout = devNull
508-
cmd.Stderr = devNull
509-
defer func() { _ = devNull.Close() }() //nolint:errcheck // defer close
510-
}
518+
// Check if nohup is available (it should be on all Unix-like systems including FreeBSD)
519+
nohupPath, err := exec.LookPath("nohup")
520+
if err != nil {
521+
log.Printf("[WARN] nohup not found, agent will start via cron in 15 minutes: %v", err)
522+
return nil
523+
}
511524

512-
// Set process group to detach from parent
513-
cmd.SysProcAttr = &syscall.SysProcAttr{
514-
Setpgid: true,
515-
}
525+
// Get the directory where the agent is installed
526+
agentDir := filepath.Dir(agentPath)
527+
agentBinary := filepath.Base(agentPath)
516528

517-
if err := cmd.Start(); err == nil {
518-
// Process started successfully
519-
log.Printf("[INFO] Agent started successfully with PID %d using nohup", cmd.Process.Pid)
520-
// Don't wait for it to finish
521-
go func() {
522-
_ = cmd.Wait() //nolint:errcheck // Reap the child when it exits
523-
}()
524-
} else {
525-
log.Printf("[WARN] Failed to start agent with nohup: %v", err)
526-
}
529+
// Use nohup with shell to properly background the process
530+
// Change to the agent directory first so PID file and logs are created in the right place
531+
// The & is crucial for detaching from the parent process
532+
shellCmd := fmt.Sprintf("cd %s && %s ./%s > /dev/null 2>&1 &", agentDir, nohupPath, agentBinary)
533+
cmd = exec.Command("sh", "-c", shellCmd) //nolint:noctx // agent spawns its own context
534+
535+
if err := cmd.Run(); err != nil {
536+
log.Printf("[WARN] Failed to start agent with nohup: %v (will start via cron in 15 minutes)", err)
537+
return nil
538+
}
539+
540+
log.Print("[INFO] Agent started successfully in background using nohup")
541+
542+
// Give it a moment to start
543+
time.Sleep(500 * time.Millisecond)
544+
// Verify the process is running
545+
checkCmd := exec.Command("pgrep", "-f", agentName) //nolint:noctx // local command
546+
if output, err := checkCmd.Output(); err == nil && len(output) > 0 {
547+
pids := strings.TrimSpace(string(output))
548+
log.Printf("[INFO] Agent process confirmed running with PID(s): %s", pids)
527549
} else {
528-
// Fallback to direct execution without nohup
529-
cmd = exec.Command(agentPath) //nolint:noctx // agent spawns its own context
530-
cmd.Stdin = nil
531-
cmd.Stdout = nil
532-
cmd.Stderr = nil
533-
534-
if err := cmd.Start(); err == nil {
535-
// Detach from the process
536-
_ = cmd.Process.Release() //nolint:errcheck // best effort cleanup
537-
log.Printf("[INFO] Agent started successfully with PID %d (without nohup)", cmd.Process.Pid)
538-
} else {
539-
log.Printf("[WARN] Failed to start agent immediately: %v (will start via cron in 15 minutes)", err)
540-
}
550+
log.Print("[WARN] Could not confirm agent is running, but it may have started successfully")
541551
}
542552

543553
return nil
@@ -546,7 +556,7 @@ func installCron(agentPath, _, _ string) error {
546556
// uninstallCron removes cron job.
547557
func uninstallCron() error {
548558
// Get current crontab
549-
cmd := exec.Command("crontab", "-l") //nolint:noctx // local command
559+
cmd := exec.Command(crontabCmd, "-l") //nolint:noctx // local command
550560
output, err := cmd.Output()
551561
if err != nil {
552562
return nil //nolint:nilerr // No crontab, nothing to remove
@@ -567,9 +577,9 @@ func uninstallCron() error {
567577
// Install updated crontab
568578
if strings.TrimSpace(newCron) == "" {
569579
// Remove crontab entirely if empty
570-
_ = exec.Command("crontab", "-r").Run() //nolint:errcheck,noctx // Best effort
580+
_ = exec.Command(crontabCmd, "-r").Run() //nolint:errcheck,noctx // Best effort
571581
} else {
572-
cmd = exec.Command("crontab", "-") //nolint:noctx // local command
582+
cmd = exec.Command(crontabCmd, "-") //nolint:noctx // local command
573583
cmd.Stdin = strings.NewReader(newCron)
574584
if err := cmd.Run(); err != nil {
575585
return fmt.Errorf("failed to update crontab: %w", err)

cmd/agent/main.go

Lines changed: 69 additions & 34 deletions
Original file line number | Diff line number | Diff line change
@@ -27,6 +27,7 @@ import (
2727
"sort"
2828
"strconv"
2929
"strings"
30+
"sync"
3031
"syscall"
3132
"time"
3233

@@ -55,10 +56,10 @@ const (
5556
maxLogLength = 200
5657
// Minimum parts required for IOPlatformUUID parsing.
5758
minUUIDParts = 4
58-
// Retry configuration.
59-
maxRetries = 3
59+
// Retry configuration - using exponential backoff with jitter up to 2 minutes total.
60+
maxRetries = 7 // More attempts to handle transient failures
6061
initialBackoff = 1 * time.Second
61-
maxBackoff = 2 * time.Minute // Wait up to 2 minutes with exponential backoff
62+
maxBackoff = 30 * time.Second // Per-retry max delay to fit within 2 minute total
6263
// HTTP client timeout.
6364
httpTimeout = 30 * time.Second
6465
// Queue size for failed reports.
@@ -408,7 +409,10 @@ func main() {
408409

409410
// Configure server connection
410411
if err := agent.configureServerConnection(); err != nil {
411-
log.Fatal(err)
412+
log.Printf("[ERROR] Failed to configure server connection: %v", err)
413+
log.Print("[INFO] Agent will continue running in offline mode, collecting data locally")
414+
// Continue in offline mode - we can still collect data even if server is not configured
415+
agent.serverURL = ""
412416
}
413417

414418
// Check PID file to avoid duplicate processes
@@ -469,6 +473,15 @@ func (a *Agent) reportToServer(ctx context.Context) {
469473
log.Printf("[DEBUG] Generated report with %d checks in %v", len(report.Checks), time.Since(start))
470474
}
471475

476+
// If no server configured (offline mode), just log the collection
477+
if a.serverURL == "" {
478+
log.Print("[INFO] Running in offline mode - data collected but not sent to server")
479+
if *debugMode {
480+
log.Printf("[DEBUG] Collected %d checks in offline mode", len(report.Checks))
481+
}
482+
return
483+
}
484+
472485
retryCount := 0
473486
err := retry.Do(func() error {
474487
retryCount++
@@ -598,9 +611,15 @@ func (a *Agent) runAllChecks(ctx context.Context) map[string]gitmdm.Check {
598611
log.Printf("[DEBUG] Running %d checks for OS: %s", len(a.config.Checks), osName)
599612
}
600613

614+
// Use a mutex to protect the shared maps
615+
var mu sync.Mutex
616+
// Use a WaitGroup to wait for all checks to complete
617+
var wg sync.WaitGroup
618+
// Limit concurrency to avoid overwhelming the system
619+
semaphore := make(chan struct{}, runtime.NumCPU())
620+
601621
for checkName := range a.config.Checks {
602622
checkDef := a.config.Checks[checkName]
603-
checkStart := time.Now()
604623

605624
// Get the rules for this OS
606625
rules := checkDef.CommandsForOS(osName)
@@ -611,42 +630,58 @@ func (a *Agent) runAllChecks(ctx context.Context) map[string]gitmdm.Check {
611630
continue
612631
}
613632

614-
// Run all rules for this check
615-
var outputs []gitmdm.CommandOutput
616-
for _, rule := range rules {
617-
output := a.executeCheck(ctx, checkName, rule)
618-
outputs = append(outputs, output)
619-
}
633+
wg.Add(1)
634+
go func() {
635+
defer wg.Done()
636+
// Acquire semaphore
637+
semaphore <- struct{}{}
638+
defer func() { <-semaphore }()
620639

621-
// Analyze all outputs to determine status
622-
status, reason, remediation := analyzer.DetermineOverallStatus(outputs)
640+
checkStart := time.Now()
623641

624-
check := gitmdm.Check{
625-
Timestamp: time.Now(), // Set the timestamp when the check was performed
626-
Outputs: outputs,
627-
Status: status,
628-
Reason: reason,
629-
Remediation: remediation,
630-
}
631-
checks[checkName] = check
642+
// Run all rules for this check
643+
var outputs []gitmdm.CommandOutput
644+
for _, rule := range rules {
645+
output := a.executeCheck(ctx, checkName, rule)
646+
outputs = append(outputs, output)
647+
}
632648

633-
// Update counters based on status
634-
switch status {
635-
case statusPass:
636-
successCount++
637-
if *debugMode {
638-
log.Printf("[DEBUG] Check %s passed in %v: %s", checkName, time.Since(checkStart), reason)
649+
// Analyze all outputs to determine status
650+
status, reason, remediation := analyzer.DetermineOverallStatus(outputs)
651+
652+
check := gitmdm.Check{
653+
Timestamp: time.Now(), // Set the timestamp when the check was performed
654+
Outputs: outputs,
655+
Status: status,
656+
Reason: reason,
657+
Remediation: remediation,
639658
}
640-
case statusFail:
641-
failureCount++
642-
if *debugMode {
643-
log.Printf("[DEBUG] Check %s failed in %v: %s", checkName, time.Since(checkStart), reason)
659+
660+
// Update shared state with mutex
661+
mu.Lock()
662+
checks[checkName] = check
663+
// Update counters based on status
664+
switch status {
665+
case statusPass:
666+
successCount++
667+
if *debugMode {
668+
log.Printf("[DEBUG] Check %s passed in %v: %s", checkName, time.Since(checkStart), reason)
669+
}
670+
case statusFail:
671+
failureCount++
672+
if *debugMode {
673+
log.Printf("[DEBUG] Check %s failed in %v: %s", checkName, time.Since(checkStart), reason)
674+
}
675+
default:
676+
// "n/a" - no counter update
644677
}
645-
default:
646-
// "n/a" - no counter update
647-
}
678+
mu.Unlock()
679+
}()
648680
}
649681

682+
// Wait for all checks to complete
683+
wg.Wait()
684+
650685
log.Printf("[INFO] Completed %d checks (%d successful, %d failed) in %v",
651686
successCount+failureCount, successCount, failureCount, time.Since(start))
652687

0 commit comments

Comments
 (0)