From 8e517188525c187206bea34b2674ca459b92f645 Mon Sep 17 00:00:00 2001 From: nikstur Date: Thu, 13 Nov 2025 01:00:38 +0100 Subject: [PATCH] feat(treewide): lazy load resources from internet on first use Now, nothing is dynamically downloaded from the internet at install time. Instead, on first use, a cache directory is created (`~/.cache/mdn-http-observatory`) and these files are downloaded: - Mozilla CA Bundle - HSTS preload list - TLD list When one of these files is not present, it is downloaded on the next invocation. When it is present on the filesystem it is used right away. "Refreshing" these resources can then simply be done by removing the cache directory and restarting the application. It can also be done by calling `npm run refreshCache` or via `npm run maintenance`. This enables packaging mdn-http-observatory for Linux distros. Additionally, it makes the package a little easier to understand because there is no loading of resources at various points at install time. --- Dockerfile | 2 - bin/wrapper.js | 7 -- package-lock.json | 80 ++--------------------- package.json | 12 ++-- src/analyzer/hsts.js | 14 +--- src/api/index.js | 3 + src/api/v2/utils.js | 14 +--- src/ca-bundle.js | 72 ++++++++++++++++++++ src/cache.js | 67 +++++++++++++++++++ src/{retrieve-hsts.js => hsts.js} | 17 ++--- src/maintenance/index.js | 11 ++-- src/scan.js | 3 + src/{retrieve-tld-list.js => tld-list.js} | 15 +---- 13 files changed, 172 insertions(+), 145 deletions(-) create mode 100644 src/ca-bundle.js create mode 100644 src/cache.js rename src/{retrieve-hsts.js => hsts.js} (80%) rename src/{retrieve-tld-list.js => tld-list.js} (65%) diff --git a/Dockerfile b/Dockerfile index 4f29535..d278679 100644 --- a/Dockerfile +++ b/Dockerfile @@ -8,7 +8,6 @@ RUN apt-get -y update && \ WORKDIR /home/node/app USER node COPY --chown=node:node . . 
-# This also installs hsts and tld data files in a postinstall script: RUN npm install ARG GIT_SHA=dev @@ -18,6 +17,5 @@ RUN env ENV RUN_ID=${RUN_ID} ENV GIT_SHA=${GIT_SHA} -ENV NODE_EXTRA_CA_CERTS=node_modules/extra_certs/ca_bundle/ca_intermediate_bundle.pem EXPOSE 8080 CMD [ "node", "src/api/index.js" ] diff --git a/bin/wrapper.js b/bin/wrapper.js index 37fd04c..5128fb7 100755 --- a/bin/wrapper.js +++ b/bin/wrapper.js @@ -9,13 +9,6 @@ import { fileURLToPath } from "node:url"; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); -// Set the environment variable for extra CA certificates -let caCertPath = import.meta.resolve("node_extra_ca_certs_mozilla_bundle"); -caCertPath = new URL(caCertPath).pathname; -caCertPath = path.dirname(caCertPath); -caCertPath = path.join(caCertPath, "ca_bundle", "ca_intermediate_bundle.pem"); -process.env.NODE_EXTRA_CA_CERTS = caCertPath; - // The target script you want to run (relative to this script's directory) const targetScript = path.join(__dirname, "..", "src", "scan.js"); diff --git a/package-lock.json b/package-lock.json index a8c822b..8dca9e0 100644 --- a/package-lock.json +++ b/package-lock.json @@ -7,7 +7,6 @@ "": { "name": "@mdn/mdn-http-observatory", "version": "1.5.0", - "hasInstallScript": true, "license": "MPL-2.0", "dependencies": { "@fastify/cors": "^11.0.1", @@ -26,7 +25,7 @@ "http-cookie-agent": "^7.0.1", "ip": "^2.0.1", "jsdom": "^27.0.0", - "node_extra_ca_certs_mozilla_bundle": "^1.0.7", + "papaparse": "^5.5.3", "pg": "^8.16.2", "pg-format": "^1.0.4", "pg-native": "^3.5.2", @@ -2075,12 +2074,6 @@ "file-uri-to-path": "1.0.0" } }, - "node_modules/bluebird": { - "version": "3.7.2", - "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.7.2.tgz", - "integrity": "sha512-XpNj6GDQzdfW+r2Wnn7xiSAd7TM3jzkxGXBGTtWKuSXv1xUV+azxAm8jdWZN06QTQk+2N2XB9jRDkvbmQmcRtg==", - "license": "MIT" - }, "node_modules/brace-expansion": { "version": "2.0.2", "resolved": 
"https://registry.npmjs.org/brace-expansion/-/brace-expansion-2.0.2.tgz", @@ -2460,24 +2453,6 @@ "node": ">=18" } }, - "node_modules/cross-env": { - "version": "7.0.3", - "resolved": "https://registry.npmjs.org/cross-env/-/cross-env-7.0.3.tgz", - "integrity": "sha512-+/HKd6EgcQCJGh2PSjZuUitQBQynKor4wrFbRg4DtAgS1aWO+gU52xpH7M9ScGgXSYmAVS9bIJ8EzuaGw0oNAw==", - "license": "MIT", - "dependencies": { - "cross-spawn": "^7.0.1" - }, - "bin": { - "cross-env": "src/bin/cross-env.js", - "cross-env-shell": "src/bin/cross-env-shell.js" - }, - "engines": { - "node": ">=10.14", - "npm": ">=6", - "yarn": ">=1" - } - }, "node_modules/cross-spawn": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/cross-spawn/-/cross-spawn-7.0.6.tgz", @@ -2519,23 +2494,6 @@ "node": ">=20" } }, - "node_modules/csvtojson": { - "version": "2.0.10", - "resolved": "https://registry.npmjs.org/csvtojson/-/csvtojson-2.0.10.tgz", - "integrity": "sha512-lUWFxGKyhraKCW8Qghz6Z0f2l/PqB1W3AO0HKJzGIQ5JRSlR651ekJDiGJbBT4sRNNv5ddnSGVEnsxP9XRCVpQ==", - "license": "MIT", - "dependencies": { - "bluebird": "^3.5.1", - "lodash": "^4.17.3", - "strip-bom": "^2.0.0" - }, - "bin": { - "csvtojson": "bin/csvtojson" - }, - "engines": { - "node": ">=4.0.0" - } - }, "node_modules/data-urls": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-6.0.0.tgz", @@ -3930,12 +3888,6 @@ "url": "https://github.com/sponsors/sindresorhus" } }, - "node_modules/is-utf8": { - "version": "0.2.1", - "resolved": "https://registry.npmjs.org/is-utf8/-/is-utf8-0.2.1.tgz", - "integrity": "sha512-rMYPYvCzsXywIsldgLaSoPlw5PfoB/ssr7hY4pLfcodrA5M/eArza1a9VmTiNIBNMjOGr1Ow9mTyU2o69U6U9Q==", - "license": "MIT" - }, "node_modules/isexe": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/isexe/-/isexe-2.0.0.tgz", @@ -4553,18 +4505,6 @@ "integrity": "sha512-OWND8ei3VtNC9h7V60qff3SVobHr996CTwgxubgyQYEpg290h9J0buyECNNJexkFm5sOajh5G116RYA1c8ZMSw==", "dev": true }, - 
"node_modules/node_extra_ca_certs_mozilla_bundle": { - "version": "1.0.7", - "resolved": "https://registry.npmjs.org/node_extra_ca_certs_mozilla_bundle/-/node_extra_ca_certs_mozilla_bundle-1.0.7.tgz", - "integrity": "sha512-wgnipQ71j14/5M//dp0kU8IzUYARoSaRpG0ILtLTa6QHB8EEHfN5OzSmViYxwSom8GTlC6KQC3GT2xs7DCUlRw==", - "hasInstallScript": true, - "license": "MIT", - "dependencies": { - "axios": "^1.6.5", - "cross-env": "^7.0.3", - "csvtojson": "^2.0.10" - } - }, "node_modules/nodemon": { "version": "3.1.10", "resolved": "https://registry.npmjs.org/nodemon/-/nodemon-3.1.10.tgz", @@ -4729,6 +4669,12 @@ "integrity": "sha512-UEZIS3/by4OC8vL3P2dTXRETpebLI2NiI5vIrjaD/5UtrkFX/tNbwjTSRAGC/+7CAo2pIcBaRgWmcBBHcsaCIw==", "license": "BlueOak-1.0.0" }, + "node_modules/papaparse": { + "version": "5.5.3", + "resolved": "https://registry.npmjs.org/papaparse/-/papaparse-5.5.3.tgz", + "integrity": "sha512-5QvjGxYVjxO59MGU2lHVYpRWBBtKHnlIAcSe1uNFCkkptUh63NFRj0FJQm7nR67puEruUci/ZkjmEFrjCAyP4A==", + "license": "MIT" + }, "node_modules/parent-module": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", @@ -5989,18 +5935,6 @@ "node": ">=8" } }, - "node_modules/strip-bom": { - "version": "2.0.0", - "resolved": "https://registry.npmjs.org/strip-bom/-/strip-bom-2.0.0.tgz", - "integrity": "sha512-kwrX1y7czp1E69n2ajbG65mIo9dqvJ+8aBQXOGVxqwvNbsXdFM6Lq37dLAY3mknUwru8CfcCbfOLL/gMo+fi3g==", - "license": "MIT", - "dependencies": { - "is-utf8": "^0.2.0" - }, - "engines": { - "node": ">=0.10.0" - } - }, "node_modules/strip-json-comments": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/strip-json-comments/-/strip-json-comments-3.1.1.tgz", diff --git a/package.json b/package.json index 77cf737..c4964e4 100644 --- a/package.json +++ b/package.json @@ -9,16 +9,14 @@ "npm": ">=9.0.0" }, "scripts": { - "start": "NODE_EXTRA_CA_CERTS=node_modules/node_extra_ca_certs_mozilla_bundle/ca_bundle/ca_intermediate_root_bundle.pem node src/api/index.js", 
- "dev": "NODE_EXTRA_CA_CERTS=node_modules/node_extra_ca_certs_mozilla_bundle/ca_bundle/ca_intermediate_root_bundle.pem nodemon src/api/index.js", + "start": "node src/api/index.js", + "dev": "nodemon src/api/index.js", "test": "CONFIG_FILE=conf/config-test.json mocha", "tsc": "tsc -p jsconfig.json", - "updateHsts": "node src/retrieve-hsts.js", - "updateTldList": "node src/retrieve-tld-list.js", "refreshMaterializedViews": "node src/maintenance/index.js", + "refreshCache": "node src/cache.js", "maintenance": "node src/maintenance/index.js", - "migrate": "node -e 'import(\"./src/database/migrate.js\").then( m => m.migrateDatabase() )'", - "postinstall": "npm run updateHsts && npm run updateTldList" + "migrate": "node -e 'import(\"./src/database/migrate.js\").then( m => m.migrateDatabase() )'" }, "bin": { "mdn-http-observatory-scan": "bin/wrapper.js" @@ -59,7 +57,7 @@ "http-cookie-agent": "^7.0.1", "ip": "^2.0.1", "jsdom": "^27.0.0", - "node_extra_ca_certs_mozilla_bundle": "^1.0.7", + "papaparse": "^5.5.3", "pg": "^8.16.2", "pg-format": "^1.0.4", "pg-native": "^3.5.2", diff --git a/src/analyzer/hsts.js b/src/analyzer/hsts.js index 464a317..87875a1 100644 --- a/src/analyzer/hsts.js +++ b/src/analyzer/hsts.js @@ -1,9 +1,6 @@ import fs from "fs"; -import path from "node:path"; -import { fileURLToPath } from "node:url"; import { Site } from "../site.js"; - -const dirname = path.dirname(fileURLToPath(import.meta.url)); +import { HSTS_PRELOAD_PATH } from "../cache.js"; /** * @type {import("../types.js").Hsts | null} @@ -15,15 +12,8 @@ let hstsMap = null; */ export function hsts() { if (!hstsMap) { - const filePath = path.join( - dirname, - "..", - "..", - "conf", - "hsts-preload.json" - ); hstsMap = new Map( - Object.entries(JSON.parse(fs.readFileSync(filePath, "utf8"))) + Object.entries(JSON.parse(fs.readFileSync(HSTS_PRELOAD_PATH, "utf8"))) ); } return hstsMap; diff --git a/src/api/index.js b/src/api/index.js index 79191e7..17eebda 100644 --- a/src/api/index.js +++ 
b/src/api/index.js @@ -1,7 +1,10 @@ import { CONFIG } from "../config.js"; import { createServer } from "./server.js"; +import { setupCache } from "../cache.js"; async function main() { + await setupCache(); + const server = await createServer(); try { await server.listen({ diff --git a/src/api/v2/utils.js b/src/api/v2/utils.js index 171ac0d..ef5d0cf 100644 --- a/src/api/v2/utils.js +++ b/src/api/v2/utils.js @@ -1,8 +1,6 @@ import ip from "ip"; import dns from "node:dns"; import fs from "fs"; -import { fileURLToPath } from "node:url"; -import path from "node:path"; import { InvalidHostNameError, InvalidHostNameIpError, @@ -28,6 +26,7 @@ import { PolicyResponse } from "./schemas.js"; import { Expectation } from "../../types.js"; import { TEST_TITLES } from "../../grader/charts.js"; import { scan } from "../../scanner/index.js"; +import { TLD_LIST_PATH } from "../../cache.js"; /** * @@ -50,7 +49,6 @@ export function isIp(hostname) { * @type {Set | null} */ let tldSet = null; -const dirname = path.dirname(fileURLToPath(import.meta.url)); /** * Get the cached set of top level domains. 
@@ -58,15 +56,7 @@ const dirname = path.dirname(fileURLToPath(import.meta.url)); */ function tlds() { if (!tldSet) { - const filePath = path.join( - dirname, - "..", - "..", - "..", - "conf", - "tld-list.json" - ); - tldSet = new Set(JSON.parse(fs.readFileSync(filePath, "utf8"))); + tldSet = new Set(JSON.parse(fs.readFileSync(TLD_LIST_PATH, "utf8"))); } return tldSet; } diff --git a/src/ca-bundle.js b/src/ca-bundle.js new file mode 100644 index 0000000..670429a --- /dev/null +++ b/src/ca-bundle.js @@ -0,0 +1,72 @@ +import axios from "axios"; +import { writeFile } from "fs/promises"; +import Papa from "papaparse"; + +const INTERMEDIATE_CA_URL = + "https://ccadb.my.salesforce-sites.com/mozilla/PublicAllIntermediateCertsWithPEMCSV"; + +const ROOT_CA_URL = + "https://ccadb.my.salesforce-sites.com/mozilla/IncludedCACertificateReportPEMCSV"; + +/** + * @param {string} url + * @returns {Promise} + */ +async function downloadCertificates(url) { + let r; + try { + r = await axios.get(url); + } catch (error) { + throw Error(`Failed to get data: ${error}`); + } + + const data = Papa.parse(r.data, { header: true }).data; + const output = []; + for (const entry of data) { + // Remove quotes from beginning and end of certificate + const certPem = entry["PEM Info"].slice(1, -1); + const commonName = entry["Common Name or Certificate Name"]; + output.push(`${commonName}\n${certPem}`); + } + return output; +} + +/** + * @returns {Promise} + */ +async function retrieveCABundle() { + // Download at the same time + const values = await Promise.all([ + downloadCertificates(INTERMEDIATE_CA_URL), + downloadCertificates(ROOT_CA_URL), + ]); + + const intermediateCACerts = values[0]; + const rootCACerts = values[1]; + + const combinedCACerts = intermediateCACerts.concat(rootCACerts); + return combinedCACerts.join("\n\n"); +} + +/** + * @param {string} filePath + */ +export async function retrieveAndStoreCABundle(filePath) { + const caBundle = await retrieveCABundle(); + + try { + await 
writeFile(filePath, caBundle); + console.log(`Downloaded Mozilla CA bundle and saved it to ${filePath}`); + } catch (error) { + console.error("Error writing file:", error); + return; + } +} + +/** + * + * @param {string} filePath + */ +export async function setupCABundle(filePath) { + process.env.NODE_EXTRA_CA_CERTS = filePath; +} diff --git a/src/cache.js b/src/cache.js new file mode 100644 index 0000000..04b1a31 --- /dev/null +++ b/src/cache.js @@ -0,0 +1,67 @@ +import path from "node:path"; +import os from "node:os"; +import fs from "node:fs"; + +import { retrieveAndStoreCABundle } from "./ca-bundle.js"; +import { retrieveAndStoreHsts } from "./hsts.js"; +import { retrieveAndStoreTldList } from "./tld-list.js"; + +const CACHE_DIR = path.join(os.homedir(), ".cache", "mdn-http-observatory"); +export const CA_BUNDLE_PATH = path.join(CACHE_DIR, "mozilla.ca-bundle"); +export const HSTS_PRELOAD_PATH = path.join(CACHE_DIR, "hsts-preload.json"); +export const TLD_LIST_PATH = path.join(CACHE_DIR, "tld-list.json"); + +/** + * Setup the cache. + * + * Create `~/.cache/mdn-http-observatory` if it doesn't exist. + * Only download files if they don't exist in the cache directory. + */ +export async function setupCache() { + setupCacheDirectory(); + + const promises = []; + if (!fs.existsSync(CA_BUNDLE_PATH)) { + promises.push(retrieveAndStoreCABundle(CA_BUNDLE_PATH)); + } + if (!fs.existsSync(HSTS_PRELOAD_PATH)) { + promises.push(retrieveAndStoreHsts(HSTS_PRELOAD_PATH)); + } + if (!fs.existsSync(TLD_LIST_PATH)) { + promises.push(retrieveAndStoreTldList(TLD_LIST_PATH)); + } + + // Download at the same time + await Promise.all(promises); +} + +/** + * Forcibly refresh cache. + * + * Downloading all files even if they are already present in the cache + * directory. 
+ */ +export async function refreshCache() { + setupCacheDirectory(); + + await Promise.all([ + retrieveAndStoreCABundle(CA_BUNDLE_PATH), + retrieveAndStoreHsts(HSTS_PRELOAD_PATH), + retrieveAndStoreTldList(TLD_LIST_PATH), + ]); +} + +function setupCacheDirectory() { + try { + if (!fs.existsSync(CACHE_DIR)) { + fs.mkdirSync(CACHE_DIR); + } + } catch (err) { + console.error(err); + } +} + +// Refresh cache when this file is run directly. +if (import.meta.url === `file://${process.argv[1]}`) { + refreshCache().catch(console.error); +} diff --git a/src/retrieve-hsts.js b/src/hsts.js similarity index 80% rename from src/retrieve-hsts.js rename to src/hsts.js index d181893..5cdb3d8 100644 --- a/src/retrieve-hsts.js +++ b/src/hsts.js @@ -1,7 +1,5 @@ import axios from "axios"; import { writeFile } from "fs/promises"; -import path from "node:path"; -import { fileURLToPath } from "node:url"; const HSTS_URL = new URL( "https://raw.githubusercontent.com/chromium/chromium/main/net/http/transport_security_state_static.json" @@ -16,8 +14,6 @@ const SCANNER_PINNED_DOMAINS = [ "services.mozilla.com", ]; -const dirname = path.dirname(fileURLToPath(import.meta.url)); - /** * * @typedef {Object} RawData @@ -38,9 +34,10 @@ const dirname = path.dirname(fileURLToPath(import.meta.url)); /** * Download the Google HSTS preload list + * @param {string} filePath * @returns */ -export async function retrieveAndStoreHsts() { +export async function retrieveAndStoreHsts(filePath) { let r; try { r = await axios.get(HSTS_URL.href); @@ -63,12 +60,11 @@ export async function retrieveAndStoreHsts() { pinned: SCANNER_PINNED_DOMAINS.includes(domain), }; return acc; - }, /** @type {HstsMap} */ ({})); + }, /** @type {HstsMap} */({})); - const filePath = path.join(dirname, "..", "conf", "hsts-preload.json"); try { await writeFile(filePath, JSON.stringify(hstsMap, null, 2)); - console.log(`File written to ${filePath}`); + console.log(`Downloaded HSTS data and saved it to ${filePath}`); } catch (error) { 
console.error("Error writing file:", error); return; @@ -83,8 +79,3 @@ export async function retrieveAndStoreHsts() { function removeJsonComments(jsonString) { return jsonString.replace(/\/\/.*$/gm, ""); } - -// Execute when run directly -if (import.meta.url === `file://${process.argv[1]}`) { - retrieveAndStoreHsts().catch(console.error); -} diff --git a/src/maintenance/index.js b/src/maintenance/index.js index 661ac7b..4522d05 100644 --- a/src/maintenance/index.js +++ b/src/maintenance/index.js @@ -2,16 +2,13 @@ import { createPool, refreshMaterializedViews, } from "../database/repository.js"; +import { refreshCache } from "../cache.js"; console.log("Starting MV refresh."); const pool = createPool(); await refreshMaterializedViews(pool); console.log("Successfully refreshed materialized views."); -import { retrieveAndStoreTldList } from "../retrieve-tld-list.js"; -await retrieveAndStoreTldList(); -console.log("Successfully updated TLD list."); - -import { retrieveAndStoreHsts } from "../retrieve-hsts.js"; -await retrieveAndStoreHsts(); -console.log("Successfully updated HSTS data."); +console.log("Starting cache refresh."); +await refreshCache(); +console.log("Successfully refreshed cache."); diff --git a/src/scan.js b/src/scan.js index e4b307d..7408850 100755 --- a/src/scan.js +++ b/src/scan.js @@ -3,6 +3,7 @@ import { Command } from "commander"; import { scan } from "./scanner/index.js"; import { Site } from "./site.js"; +import { setupCache } from "./cache.js"; const NAME = "mdn-http-observatory-scan"; const program = new Command(); @@ -14,6 +15,8 @@ program .argument("", "hostname to scan") .action(async (siteString, _options) => { try { + await setupCache(); + const site = Site.fromSiteString(siteString); const result = await scan(site); const tests = Object.fromEntries( diff --git a/src/retrieve-tld-list.js b/src/tld-list.js similarity index 65% rename from src/retrieve-tld-list.js rename to src/tld-list.js index ae76f96..2db56eb 100644 --- 
a/src/retrieve-tld-list.js +++ b/src/tld-list.js @@ -1,18 +1,15 @@ import axios from "axios"; import { writeFile } from "fs/promises"; -import path from "node:path"; -import { fileURLToPath } from "node:url"; const TLD_LIST_URL = new URL( "https://data.iana.org/TLD/tlds-alpha-by-domain.txt" ); -const dirname = path.dirname(fileURLToPath(import.meta.url)); - /** * Download the IANA-maintained public suffix list + * @param {string} filePath */ -export async function retrieveAndStoreTldList() { +export async function retrieveAndStoreTldList(filePath) { let r; try { r = await axios.get(TLD_LIST_URL.href); @@ -21,10 +18,9 @@ export async function retrieveAndStoreTldList() { return; } const data = cleanData(r.data); - const filePath = path.join(dirname, "..", "conf", "tld-list.json"); try { await writeFile(filePath, data); - console.log(`File written to ${filePath}`); + console.log(`Downloaded TLD list and saved it to ${filePath}`); } catch (error) { console.error("Error writing file:", error); return; @@ -45,8 +41,3 @@ function cleanData(data) { .map((line) => line.trim().toLowerCase()); return JSON.stringify(ret); } - -// Execute when run directly -if (import.meta.url === `file://${process.argv[1]}`) { - retrieveAndStoreTldList().catch(console.error); -}