diff --git a/package-lock.json b/package-lock.json
index 5a98760d..c341d949 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -61,6 +61,7 @@
         "react-dom": "^15.7.0",
         "redis": "^0.10.3",
         "request": "^2.88.2",
+        "request-filtering-agent": "^3.2.0",
         "requirejs": "2.1.14",
         "s-expression": "~2.2.0",
         "script-loader": "^0.7.2",
@@ -20023,6 +20024,27 @@
         "node": ">= 6"
       }
     },
+    "node_modules/request-filtering-agent": {
+      "version": "3.2.0",
+      "resolved": "https://registry.npmjs.org/request-filtering-agent/-/request-filtering-agent-3.2.0.tgz",
+      "integrity": "sha512-tKPrKdsmTFuGG1/pBEpzTB66mDZ2lZLW8kjW4N6jj4QjnxUTKrIfv5p2zuJRfztOos86jRPD41lRaGjh+1QqDw==",
+      "license": "MIT",
+      "dependencies": {
+        "ipaddr.js": "^2.1.0"
+      },
+      "engines": {
+        "node": ">=20.0.0"
+      }
+    },
+    "node_modules/request-filtering-agent/node_modules/ipaddr.js": {
+      "version": "2.3.0",
+      "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-2.3.0.tgz",
+      "integrity": "sha512-Zv/pA+ciVFbCSBBjGfaKUya/CcGmUHzTydLMaTwrUUEM2DIEO3iZvueGxmacvmN50fGpGVKeTXpb2LcYQxeVdg==",
+      "license": "MIT",
+      "engines": {
+        "node": ">= 10"
+      }
+    },
     "node_modules/request/node_modules/form-data": {
       "version": "2.3.3",
       "resolved": "https://registry.npmjs.org/form-data/-/form-data-2.3.3.tgz",
diff --git a/package.json b/package.json
index d975ca4b..55646236 100644
--- a/package.json
+++ b/package.json
@@ -55,6 +55,7 @@
     "react-dom": "^15.7.0",
     "redis": "^0.10.3",
    "request": "^2.88.2",
+    "request-filtering-agent": "^3.2.0",
     "requirejs": "2.1.14",
     "s-expression": "~2.2.0",
     "script-loader": "^0.7.2",
diff --git a/src/server.js b/src/server.js
index debbe7f2..9395dab6 100644
--- a/src/server.js
+++ b/src/server.js
@@ -7,6 +7,18 @@
 const { drive } = require("googleapis/build/src/apis/drive/index.js");
 
 var BACKREF_KEY = "originalProgram";
 
+// Limits for the streaming proxy. /downloadImg gets larger/looser caps because
+// images can legitimately be tens of MB; also we've seen e.g. Drive ?export=
+// take a while to get going. SHAREURL is intended to always be program
+// plaintext.
+// NOTE(joe + claude): really the timeout maybe should be on idleness at
+// startup/between bytes, not overall per completed request, but that's work to
+// plumb into `request`
+var IMAGE_PROXY_MAX_BYTES = 20 * 1024 * 1024; // 20 MB
+var IMAGE_PROXY_TIMEOUT_MS = 30 * 1000; // 30 s
+var SHAREURL_PROXY_MAX_BYTES = 1 * 1024 * 1024; // 1 MB
+var SHAREURL_PROXY_TIMEOUT_MS = 10 * 1000; // 10 s
+
 function start(config, onServerReady) {
   var defaultOpts = {
     PYRET: process.env.PYRET,
@@ -27,6 +39,7 @@
   var csrf = require('csurf');
   var googleAuth = require('./google-auth.js');
   var request = require('request');
+  var requestFilteringAgent = require('request-filtering-agent');
   var mustache = require('mustache-express');
   var url = require('url');
   var fs = require('fs');
@@ -186,24 +199,74 @@
     });
   }
 
-  app.get("/downloadImg", function(req, response) {
-    var parsed = url.parse(req.url);
-    var googleLink = decodeURIComponent(parsed.query.slice(0));
-    var googleParsed = url.parse(googleLink);
-    var gReq = request({url: googleLink, encoding: 'binary'}, function(error, imgResponse, body) {
-      if(error) {
-        response.status(400).send({type: "image-load-failure", error: "Unable to load image " + String(error)});
+  function proxyStreamFetch(opts) {
+    var res = opts.res;
+    res.set('X-Content-Type-Options', 'nosniff');
+    res.set('Content-Security-Policy', 'sandbox');
+
+    var parsed;
+    try { parsed = new URL(opts.url); }
+    catch (e) { return res.status(400).send({ error: 'invalid-url' }); }
+    if (opts.allowedHosts && !opts.allowedHosts(parsed.hostname)) {
+      return res.status(400).send({ error: 'host-not-allowed' });
+    }
+
+    var bytes = 0;
+    var upstream = request({
+      url: opts.url,
+      timeout: opts.timeoutMs,
+      agent: requestFilteringAgent.useAgent(opts.url),
+      followRedirect: function(resp) {
+        if (!opts.allowedHosts) return true;
+        try {
+          var next = new URL(resp.headers.location, opts.url);
+          return opts.allowedHosts(next.hostname);
+        } catch (_) { return false; }
+      },
+    });
+    // If the client disconnects (e.g. the browser aborts /load-shareurl after
+    // direct succeeded), tear down the upstream connection too — otherwise
+    // we'd keep streaming bytes from raw.githubusercontent.com to nowhere.
+    res.on('close', function() { upstream.destroy(); });
+    upstream.on('error', function(err) {
+      if (!res.headersSent) opts.onError(res, err);
+    });
+    upstream.on('response', function(upRes) {
+      if (opts.contentTypeOk && !opts.contentTypeOk(upRes.headers['content-type'])) {
+        upstream.destroy();
+        return res.status(400).send({ error: 'content-type-not-allowed', detail: upRes.headers['content-type'] });
       }
-      else {
-        var h = imgResponse.headers;
-        var ct = h['content-type'];
-        if((!ct) || (ct.indexOf('image/') !== 0)) {
-          response.status(400).send({type: "non-image", error: "Invalid image type " + ct});
-          return;
-        }
-        response.set('content-type', ct);
-        response.end(body, 'binary');
+      res.status(upRes.statusCode);
+      if (upRes.headers['content-type']) {
+        res.set('content-type', upRes.headers['content-type']);
       }
+      upRes.on('data', function(chunk) {
+        bytes += chunk.length;
+        if (bytes > opts.maxBytes) {
+          upstream.destroy();
+          if (!res.headersSent) res.status(502).send({ error: 'too-large' });
+          else res.destroy();
+        }
+      });
+      // Pipe upRes (IncomingMessage), not upstream (request object). The
+      // request library's .pipe copies upstream headers verbatim, which
+      // would overwrite the security headers set above.
+      upRes.pipe(res);
+    });
+  }
+
+  app.get("/downloadImg", function(req, response) {
+    var googleLink = decodeURIComponent(url.parse(req.url).query.slice(0));
+    proxyStreamFetch({
+      res: response,
+      url: googleLink,
+      allowedHosts: null,
+      maxBytes: IMAGE_PROXY_MAX_BYTES,
+      timeoutMs: IMAGE_PROXY_TIMEOUT_MS,
+      contentTypeOk: function(ct) { return ct && ct.indexOf('image/') === 0; },
+      onError: function(res, err) {
+        res.status(400).send({ type: 'image-load-failure', error: 'Unable to load image ' + String(err) });
+      },
     });
   });
 
@@ -565,6 +628,26 @@
 
   });
 
+  // Server-side proxy for #shareurl loads from hosts that some school networks
+  // block or will likely block (notably raw.githubusercontent.com).
+  // Eager-proxied client-side for any URL whose host is in
+  // SHAREURL_ALLOWED_HOSTS. We can expand this list as needed.
+  var SHAREURL_ALLOWED_HOSTS = new Set(['raw.githubusercontent.com']);
+
+  app.get("/load-shareurl", function(req, res) {
+    proxyStreamFetch({
+      res: res,
+      url: req.query.url,
+      allowedHosts: function(h) { return SHAREURL_ALLOWED_HOSTS.has(h); },
+      maxBytes: SHAREURL_PROXY_MAX_BYTES,
+      timeoutMs: SHAREURL_PROXY_TIMEOUT_MS,
+      contentTypeOk: null,
+      onError: function(res, err) {
+        res.status(502).send({ error: 'upstream-error' });
+      },
+    });
+  });
+
   app.post("/share-image", function(req, res) {
     var driveFileId = req.body.fileId;
diff --git a/src/web/js/beforePyret.js b/src/web/js/beforePyret.js
index 005c2c56..b4941603 100644
--- a/src/web/js/beforePyret.js
+++ b/src/web/js/beforePyret.js
@@ -3,6 +3,130 @@
 var originalPageLoad = Date.now();
 console.log("originalPageLoad: ", originalPageLoad);
 
+// Transparently route browser fetches to allowlisted hosts through the
+// server-side proxy at /load-shareurl, but only when the direct path doesn't
+// work.
+//
+// Strategy: the FIRST fetch to an allowlisted host fires direct + proxied in
+// parallel. We decide shouldProxy for the rest of the page-load from direct's
+// response *headers*:
+//   - direct returned 2xx with content-type text/plain -> shouldProxy=false:
+//     serve direct's response, abort the in-flight proxy fetch.
+//   - direct failed, hung past timeout, or returned anything else
+//     -> shouldProxy=true:
+//     serve proxy's response.
+// A key idea is that network-blocky things sometimes return 200 with a
+// message page about blocking (or an error, but that counts as a fail). We
+// don't want to accidentally think that's a success.
+// shouldProxy state is in-memory and per-host — never persisted, since
+// reachability changes between networks and a stale value would silently
+// break loads.
+//
+// Installed on the global fetch as early as possible so it catches every fetch
+// caller; some of them are in the pyret-lang runtime and would be otherwise
+// difficult to configure.
+const SHAREURL_PROXY_HOSTS = new Set(['raw.githubusercontent.com']);
+const SHAREURL_DIRECT_TIMEOUT_MS = 5000;
+const _origFetch = window.fetch.bind(window);
+
+const _shareurlShouldProxy = new Map(); // host -> boolean
+const _shareurlShouldProxyInflight = new Map(); // host -> Promise
+
+function _shareurlProxyUrl(fetchInput) {
+  return '/load-shareurl?url=' + encodeURIComponent(_shareurlInputToUrl(fetchInput));
+}
+
+function _shareurlInputToUrl(fetchInput) {
+  return (typeof fetchInput === 'string') ? fetchInput
+    : (typeof Request !== 'undefined' && fetchInput instanceof Request) ? fetchInput.url
+    : String(fetchInput);
+}
+
+function _shareurlVerifyDirect(r) {
+  if (!r.ok) return false;
+  const ct = (r.headers.get('content-type') || '').toLowerCase();
+  // Source files served from raw.githubusercontent.com come back as
+  // text/plain (.arr, .json, .csv, .md all do). Anything else — HTML block
+  // pages, captive portals, surprise content types — we don't trust as a
+  // real upstream response.
+  return ct.startsWith('text/plain');
+}
+
+function _shareurlFetch(shouldProxy, fetchInput, fetchInit) {
+  const maybeProxyInput = shouldProxy ? _shareurlProxyUrl(fetchInput) : fetchInput;
+  return _origFetch(maybeProxyInput, fetchInit);
+}
+
+function _shareurlRace(fetchInput, fetchInit) {
+  const proxyCtrl = new AbortController();
+  // NOTE(joe): The signal overwrite is technically not the right fetch()
+  // polyfill. If the caller elsewhere in the codebase provided a different
+  // signal (which in the fetch API is only for aborting as of April '26), that
+  // caller aborting through that signal won't cancel the proxy fetch.
+  // I'm OK letting that case slip through here in exchange for not having a
+  // bunch of extra event handler forwarding
+  const proxyP = _origFetch(_shareurlProxyUrl(fetchInput),
+    Object.assign({}, fetchInit, { signal: proxyCtrl.signal }));
+  const directP = _origFetch(fetchInput, fetchInit).then(r => {
+    if (!_shareurlVerifyDirect(r)) throw new Error('direct request failed');
+    return r;
+  });
+
+  // shouldProxy: false iff direct verified before the timeout, else true.
+  // Whether to proxy is decided solely on whether direct succeeds or not
+  const shouldProxyPromise = Promise.race([
+    directP.then(() => false, () => true),
+    new Promise(resolve => setTimeout(() => resolve(true), SHAREURL_DIRECT_TIMEOUT_MS)),
+  ]);
+
+  // Settlement-order check: if direct verifies before proxy returns, abort
+  // the in-flight proxy to stop wasting server bandwidth. We must NOT
+  // abort once proxy has already returned, since by then the caller is
+  // reading proxy's body and aborting would error its stream mid-read.
+  const directFinishedSuccessfullyAndFirstP = Promise.race([
+    directP.then(() => true, () => false),
+    proxyP.then(() => false, () => false),
+  ]);
+  directFinishedSuccessfullyAndFirstP.then(directFirst => {
+    if (directFirst) proxyCtrl.abort();
+  });
+
+  // Caller's response: whichever of direct-verified or proxy fulfills
+  // first. If both fail, surface proxy's error (the more authoritative
+  // upstream — direct's may just be 'direct-not-verified').
+  const responsePromise = Promise.any([directP, proxyP]).catch(
+    aggErr => Promise.reject(aggErr.errors[1] || aggErr.errors[0])
+  );
+
+  return { responsePromise, shouldProxyPromise };
+}
+
+window.fetch = function(fetchInput, fetchInit) {
+  let host;
+  try { host = new URL(_shareurlInputToUrl(fetchInput), window.location.href).hostname; }
+  catch (_) { return _origFetch(fetchInput, fetchInit); }
+  if (!SHAREURL_PROXY_HOSTS.has(host)) return _origFetch(fetchInput, fetchInit);
+
+  const shouldProxy = _shareurlShouldProxy.get(host);
+  const inflight = _shareurlShouldProxyInflight.get(host);
+  if (shouldProxy !== undefined) {
+    return _shareurlFetch(shouldProxy, fetchInput, fetchInit);
+  } else if (inflight) {
+    // shouldProxy pending: queue this fetch on it and issue a single fresh
+    // request once shouldProxy is decided.
+    return inflight.then(sp => _shareurlFetch(sp, fetchInput, fetchInit));
+  } else {
+    // First fetch to this host this page-load: run the race.
+    const { responsePromise, shouldProxyPromise } = _shareurlRace(fetchInput, fetchInit);
+    _shareurlShouldProxyInflight.set(host, shouldProxyPromise);
+    shouldProxyPromise.then(sp => {
+      _shareurlShouldProxy.set(host, sp);
+      _shareurlShouldProxyInflight.delete(host);
+    });
+    return responsePromise;
+  }
+};
+
 const isEmbedded = window.parent !== window;
 
 var shareAPI = makeShareAPI(process.env.CURRENT_PYRET_RELEASE);