refactor: streamline domain handling by consolidating normalization and registrability checks

jakejarvis · jakejarvis · commit eb7c47af725c · 2025-11-24T17:55:01.000-05:00
diff --git a/app/[domain]/page.tsx b/app/[domain]/page.tsx
@@ -3,7 +3,6 @@ import type { Metadata } from "next";
 import { notFound, redirect } from "next/navigation";
 import { DomainReportView } from "@/components/domain/domain-report-view";
 import { analytics } from "@/lib/analytics/server";
-import { normalizeDomainInput } from "@/lib/domain";
 import { toRegistrableDomain } from "@/lib/domain-server";
 import { getQueryClient, trpc } from "@/trpc/server";
 
@@ -17,10 +16,9 @@ export async function generateMetadata({
 }): Promise<Metadata> {
   const { domain: raw } = await params;
   const decoded = decodeURIComponent(raw);
-  const normalized = normalizeDomainInput(decoded);
 
-  const isRegistrable = toRegistrableDomain(normalized);
-  if (!isRegistrable) {
+  const registrable = toRegistrableDomain(decoded);
+  if (!registrable) {
     // workaround, should match metadata from not-found.tsx
     return {
       title: "Not Found",
@@ -30,11 +28,11 @@ export async function generateMetadata({
 
   return {
     title: {
-      absolute: `${normalized} — Domain Report`,
+      absolute: `${registrable} — Domain Report`,
     },
-    description: `Domainstack report for ${normalized}: WHOIS lookup, DNS & SSL scan, HTTP headers, hosting & email provider data, and SEO metadata.`,
+    description: `Domainstack report for ${registrable}: WHOIS lookup, DNS & SSL scan, HTTP headers, hosting & email provider data, and SEO metadata.`,
     alternates: {
-      canonical: `/${normalized}`,
+      canonical: `/${registrable}`,
     },
   };
 }
@@ -46,30 +44,29 @@ export default async function DomainPage({
 }) {
   const { domain: raw } = await params;
   const decoded = decodeURIComponent(raw);
-  const normalized = normalizeDomainInput(decoded);
 
-  const isRegistrable = toRegistrableDomain(normalized);
-  if (!isRegistrable) notFound();
+  const registrable = toRegistrableDomain(decoded);
+  if (!registrable) notFound();
 
-  // Canonicalize URL to the normalized domain (middleware should already handle most cases)
-  if (normalized !== decoded) {
-    redirect(`/${encodeURIComponent(normalized)}`);
+  // Canonicalize URL to the registrable domain (middleware should already handle most cases)
+  if (registrable !== decoded) {
+    redirect(`/${encodeURIComponent(registrable)}`);
   }
 
   // Track server-side page view
-  analytics.track("report_viewed", { domain: normalized });
+  analytics.track("report_viewed", { domain: registrable });
 
   // Minimal prefetch: registration only, let sections stream progressively
   // Use getQueryClient() to ensure consistent query client across the request
   const queryClient = getQueryClient();
   void queryClient.prefetchQuery(
-    trpc.domain.getRegistration.queryOptions({ domain: normalized }),
+    trpc.domain.getRegistration.queryOptions({ domain: registrable }),
   );
 
   return (
     <div className="container mx-auto max-w-4xl px-4 py-6">
       <HydrationBoundary state={dehydrate(queryClient)}>
-        <DomainReportView domain={normalized} />
+        <DomainReportView domain={registrable} />
       </HydrationBoundary>
     </div>
   );
diff --git a/lib/domain-server.ts b/lib/domain-server.ts
@@ -1,14 +1,21 @@
 import { toRegistrableDomain as toRegistrableDomainRdapper } from "rdapper";
 import { cache } from "react";
 import { BLACKLISTED_SUFFIXES } from "@/lib/constants/domain-validation";
+import { normalizeDomainInput } from "@/lib/domain";
 
-// A simple wrapper around rdapper's toRegistrableDomain that:
-// 1. is cached for per-request deduplication
-// 2. checks if the domain is blacklisted by BLACKLISTED_SUFFIXES in constants/domain-validation.ts
+// A wrapper around rdapper's toRegistrableDomain that:
+// 1. normalizes user input (strips schemes, paths, ports, auth, www., etc.)
+// 2. is cached for per-request deduplication
+// 3. checks if the domain is blacklisted by BLACKLISTED_SUFFIXES in constants/domain-validation.ts
 export const toRegistrableDomain = cache(function toRegistrableDomain(
   input: string,
 ): string | null {
-  const value = (input ?? "").trim().toLowerCase();
+  // First normalize the input to extract a clean hostname
+  // This handles user input with schemes, paths, ports, auth, trailing dots, www., etc.
+  const normalized = normalizeDomainInput(input);
+  if (!normalized) return null;
+
+  const value = normalized.trim().toLowerCase();
   if (value === "") return null;
 
   // Shortcut: exact suffixes such as ".css.map" that frequently appear
diff --git a/lib/domain.test.ts b/lib/domain.test.ts
@@ -24,6 +24,54 @@ describe("normalizeDomainInput", () => {
       "ex-ample.com",
     );
   });
+
+  it("handles malformed protocols (single slash)", () => {
+    expect(normalizeDomainInput("http:/example.com")).toBe("example.com");
+  });
+
+  it("handles malformed protocols (triple slash)", () => {
+    expect(normalizeDomainInput("http:///example.com")).toBe("example.com");
+  });
+
+  it("handles malformed protocols (multiple colons)", () => {
+    expect(normalizeDomainInput("https:::example.com/path")).toBe(
+      "example.com",
+    );
+  });
+
+  it("rejects IPv6 literals", () => {
+    expect(normalizeDomainInput("[::1]")).toBe("");
+    expect(normalizeDomainInput("[::1]:8080")).toBe("");
+    expect(normalizeDomainInput("http://[2001:db8::1]/path")).toBe("");
+  });
+
+  it("handles spaces and whitespace", () => {
+    expect(normalizeDomainInput("  example.com  ")).toBe("example.com");
+    expect(normalizeDomainInput("example.com /path")).toBe("example.com");
+  });
+
+  it("strips www from subdomains", () => {
+    expect(normalizeDomainInput("www.example.com")).toBe("example.com");
+    expect(normalizeDomainInput("WWW.EXAMPLE.COM")).toBe("example.com");
+  });
+
+  it("preserves non-www subdomains", () => {
+    expect(normalizeDomainInput("api.example.com")).toBe("api.example.com");
+    expect(normalizeDomainInput("sub.domain.example.com")).toBe(
+      "sub.domain.example.com",
+    );
+  });
+
+  it("handles query parameters and fragments", () => {
+    expect(normalizeDomainInput("example.com?query=value")).toBe("example.com");
+    expect(normalizeDomainInput("example.com#fragment")).toBe("example.com");
+    expect(normalizeDomainInput("example.com?q=1#frag")).toBe("example.com");
+  });
+
+  it("returns empty string for empty input", () => {
+    expect(normalizeDomainInput("")).toBe("");
+    expect(normalizeDomainInput("   ")).toBe("");
+  });
 });
 
 describe("isValidDomain", () => {
diff --git a/lib/domain.ts b/lib/domain.ts
@@ -1,60 +1,87 @@
 // Utilities for handling user-provided domain input
 
+// Matches a leading "http" or "https" followed by one or more colons and/or slashes
+// Captures everything up to the next slash, i.e. the authority (userinfo + host + port)
+// This handles malformed protocols like "http:/example.com" or "http:///example.com"
+const SCHEME_PREFIX_REGEX = /^https?[:/]+([^/]+)/i;
+
 /**
- * Normalize arbitrary user input into a bare registrable domain string.
+ * Normalize arbitrary user input into a bare hostname string.
  * Accepts values like:
  *  - "example.com"
  *  - "www.example.com."
  *  - "https://example.com/path?x#y"
  *  - "http://user:pass@example.com:8080/"
+ *  - "http:/example.com" (malformed protocol)
  *  - "  EXAMPLE.COM  "
- * Returns a lowercased hostname without scheme, path, auth, port, or trailing dot.
+ * Returns a lowercased hostname without scheme, path, auth, port, trailing dot, or www. prefix.
+ * Returns an empty string for empty/whitespace-only input or any input containing "[" or "]" (e.g. IPv6 literals).
  */
 export function normalizeDomainInput(input: string): string {
   let value = (input ?? "").trim();
   if (value === "") return "";
 
-  // If it looks like a URL (has a scheme), use URL parsing
-  const hasScheme = /:\/\//.test(value);
-  if (hasScheme) {
+  // Reject IPv6 literals early (e.g., "[::1]", "[::1]:8080"): unsupported, and they would cause issues in URL parsing
+  // NOTE: this tests the whole raw input, so a bracket anywhere (even in a path or query) rejects it
+  if (value.includes("[") || value.includes("]")) {
+    return "";
+  }
+
+  // Try to extract authority (host) from scheme-prefixed input
+  // This handles both valid and malformed protocols
+  const schemeMatch = value.match(SCHEME_PREFIX_REGEX);
+  if (schemeMatch) {
+    // Extract authority from the scheme match
+    value = schemeMatch[1];
+  } else if (/:\/\//.test(value)) {
+    // Has scheme-like pattern but didn't match our regex (e.g., "fake+scheme://...")
+    // Try URL parsing first
     try {
       const url = new URL(value);
-      // URL applies IDNA (punycode) and strips auth/port/path for hostname
       value = url.hostname;
     } catch {
-      // If invalid URL with scheme, strip leading scheme-like prefix manually
+      // Fallback: strip scheme-like prefix manually
       value = value.replace(/^\w+:\/\//, "");
-      // Remove credentials if present
-      value = value.replace(/^[^@]+@/, "");
-      // Remove path/query/fragment
-      value = value.split("/")[0].split("?")[0].split("#")[0];
     }
   } else {
-    // No scheme: try URL parsing with implicit http:// to get punycoded hostname
+    // No scheme detected: try URL parsing with implicit http:// to get punycoded hostname
     try {
       const url = new URL(`http://${value}`);
       value = url.hostname;
     } catch {
-      // Fallback: remove any credentials, path, query, or fragment accidentally included
-      value = value.replace(/^[^@]+@/, "");
-      value = value.split("/")[0].split("?")[0].split("#")[0];
+      // Fallback: treat as raw authority and parse manually
     }
   }
 
-  // Strip port if present
-  value = value.replace(/:\d+$/, "");
+  // Strip query and fragment (in case they weren't already removed)
+  value = value.split(/[?#]/)[0];
+
+  // Strip User Info (credentials)
+  const atIndex = value.lastIndexOf("@");
+  if (atIndex !== -1) {
+    value = value.slice(atIndex + 1);
+  }
+
+  // Strip port
+  value = value.split(":")[0];
+
+  // Remove any path components that might remain
+  value = value.split("/")[0];
 
   // Strip trailing dot
   value = value.replace(/\.$/, "");
 
+  // Trim any remaining whitespace
+  value = value.trim();
+
   // Remove common leading www.
   value = value.replace(/^www\./i, "");
 
   return value.toLowerCase();
 }
 
 /**
- * Basic domain validity check (hostname-like), not performing DNS or RDAP.
+ * An even more basic domain validity check (hostname-like), not performing DNS or RDAP.
  */
 export function isValidDomain(value: string): boolean {
   const v = (value ?? "").trim();
diff --git a/lib/middleware.ts b/lib/middleware.ts
@@ -2,10 +2,6 @@ import type { NextRequest } from "next/server";
 import { NextResponse } from "next/server";
 import { toRegistrableDomain } from "@/lib/domain-server";
 
-// Matches beginning "http:" or "https:" followed by any number of slashes/colons
-// Captures the authority (host + userinfo + port)
-export const SCHEME_PREFIX_REGEX = /^https?[:/]+([^/]+)/i;
-
 export type ProxyAction =
   | { type: "match" }
   | { type: "redirect"; destination: string }
@@ -40,57 +36,15 @@ export function getProxyAction(path: string): ProxyAction {
     // ignore decoding failures
   }
 
-  let candidate = decodedInput;
-
-  // 3. Extract authority (host) candidate
-  // If scheme present, extract authority from it.
-  // Otherwise, treat the whole string as potential authority start.
-  const schemeMatch = candidate.match(SCHEME_PREFIX_REGEX);
-  let authority = schemeMatch ? schemeMatch[1] : candidate;
-
-  // 4. Cleanup: Strip query, fragment, path (if not already stripped by regex)
-  // Note: Regex above stops at first slash, so path is already gone if scheme matched.
-  // If scheme didn't match, we manually strip path.
-  if (!schemeMatch) {
-    authority = authority.split("/")[0];
-  }
-
-  // Strip query and fragment (order doesn't matter as we take the first occurrence of either)
-  authority = authority.split(/[?#]/)[0];
-
-  authority = authority.trim();
-
-  // 5. Strip User Info
-  const atIndex = authority.lastIndexOf("@");
-  if (atIndex !== -1) {
-    authority = authority.slice(atIndex + 1);
-  }
-
-  // 6. Strip Port
-  // IPv6 literals in brackets (e.g. [::1]) are not supported.
-  if (authority.includes("[") || authority.includes("]")) {
-    return null;
-  }
-
-  // Safe to split on colon as valid domains don't contain colons
-  authority = authority.split(":")[0];
-
-  candidate = authority.trim();
-
-  if (!candidate) {
-    return null;
-  }
-
-  // 7. Validate and Normalize
-  // This will return null for invalid domains, including IPs if rdapper handles them as such.
-  const registrable = toRegistrableDomain(candidate);
+  // 3. Validate and extract the registrable domain
+  const registrable = toRegistrableDomain(decodedInput);
   if (!registrable) {
     return null;
   }
 
-  // 8. Redirect if necessary
+  // 4. Redirect if necessary
   // We compare the originally decoded input against the final canonical domain.
-  // Any difference (path, query, scheme, case, whitespace, userinfo, port) triggers a redirect.
+  // Any difference (path, query, scheme, case, whitespace, userinfo, port, subdomain) triggers a redirect.
   if (decodedInput !== registrable) {
     return {
       type: "redirect",
diff --git a/server/routers/domain.ts b/server/routers/domain.ts
@@ -1,6 +1,5 @@
 import { TRPCError } from "@trpc/server";
 import z from "zod";
-import { normalizeDomainInput } from "@/lib/domain";
 import { toRegistrableDomain } from "@/lib/domain-server";
 import {
   BlobUrlResponseSchema,
@@ -30,8 +29,7 @@ import {
 const DomainInputSchema = z
   .object({ domain: z.string().min(1) })
   .transform(({ domain }) => {
-    const normalized = normalizeDomainInput(domain);
-    const registrable = toRegistrableDomain(normalized);
+    const registrable = toRegistrableDomain(domain);
     if (!registrable) {
       throw new TRPCError({
         code: "BAD_REQUEST",