/ - Diff - Haketilo - Hydrilla issue tracker

« Previous | Next »

Revision 44958e6a

Added by koszko about 2 years ago

ID 44958e6ab4218429475f3c79ecf2116b78a07021
Parent 6b53d6c8
Child 03d041ce

implement rethinked tags sanitizing approach

This has not been tested yet. Additionally, functionality for blocking of `data:' urls needs to be re-enabled.

         return {name: header.name, value: new_csp.join('')};
+    }
     /* Regexes and objest to use as/in schemas for parse_json_with_schema(). */
     /* Regexes and objects to use as/in schemas for parse_json_with_schema(). */
     const nonempty_string_matcher = /.+/;
     const matchers = {

      * IMPORT is_chrome
      * IMPORT is_mozilla
      * IMPORT start_activity_info_server
      * IMPORT modify_on_the_fly
      * IMPORT csp_rule
      * IMPORT is_csp_header_name
      * IMPORT sanitize_csp_header
      * IMPORTS_END
      */
-...
         parent.hachette_corresponding.appendChild(clone);
+    }
     /*
      * 1. When injecting some payload we need to sanitize <meta> CSP tags before
      *    they reach the document.
      * 2. Only <meta> tags inside <head> are considered valid by the browser and
      *    need to be considered.
      * 3. We want to detach <html> from document, wait until its <head> completes
      *    loading, sanitize it and re-attach <html>.
      * 4. Browsers are eager to add <meta>'s that appear after `</head>' but before
      *    `<body>'. Due to this behavior the `DOMContentLoaded' event is considered
      *    unreliable (although it could still work properly, it is just problematic
      *    to verify).
      * 5. We shall wait for anything to appear in or after <body> and take that as
      *    a sign <head> has _really_ finished loading.
      */
     /*
      * Create and start a MutationObserver that re-runs try_body_started() for
      * `waiting' each time the direct children of `DOM_element' change (only
      * `childList' is observed, not the subtree). The observer is returned so
      * that the caller can disconnect it later.
      */
     function make_body_start_observer(DOM_element, waiting)
+    {
         const observer = new MutationObserver(() => try_body_started(waiting));
         observer.observe(DOM_element, {childList: true});
         return observer;
+    }
     /*
      * Check whether parsing has progressed past <head>. This is assumed once
      * the <body> found inside `waiting.detached_html' has a child or a next
      * sibling, or once `waiting.doc.documentElement' has a next sibling. In that
      * case finish_waiting() is called and `true' is returned.
      *
      * Otherwise nothing is returned (i.e. `undefined'). If a <body> already
      * exists and fewer than 2 observers are registered in `waiting.observers',
      * one more observer is attached to that <body>.
      */
     function try_body_started(waiting)
+    {
         const body = waiting.detached_html.querySelector("body");
         if ((body && (body.firstChild || body.nextSibling)) ||
     	waiting.doc.documentElement.nextSibling) {
     	finish_waiting(waiting);
     	return true;
+        }
         /* The length check keeps us from observing <body> more than once. */
         if (body && waiting.observers.length < 2)
     	waiting.observers.push(make_body_start_observer(body, waiting));
+    }
     /*
      * Stop waiting: disconnect every observer stored in `waiting.observers',
      * remove the `DOMContentLoaded' listener stored as `waiting.loaded_cb' from
      * `waiting.doc' and schedule `waiting.callback' to run asynchronously
      * through setTimeout().
      */
     function finish_waiting(waiting)
+    {
         waiting.observers.forEach(observer => observer.disconnect());
         waiting.doc.removeEventListener("DOMContentLoaded", waiting.loaded_cb);
         setTimeout(waiting.callback, 0);
+    }
     /*
      * Arrange for `callback' to be called once <head> of the detached <html>
      * element `detached_html' (which used to be the root of `doc') can be
      * considered fully loaded.
      *
      * If try_body_started() reports that this is already the case, nothing
      * more is set up here. Otherwise `detached_html' gets observed for new
      * children and a `DOMContentLoaded' listener is registered on `doc' as a
      * fallback.
      */
     function _wait_for_head(doc, detached_html, callback)
     {
         const waiting = {doc, detached_html, callback, observers: []};
         if (try_body_started(waiting))
     	return;
         /*
          * try_body_started() might have already registered an observer on
          * <body>. Append to the list instead of replacing it, so that observer
          * also gets disconnected when waiting finishes and no duplicate is
          * created later.
          */
         waiting.observers.push(make_body_start_observer(detached_html, waiting));
         waiting.loaded_cb = () => finish_waiting(waiting);
         doc.addEventListener("DOMContentLoaded", waiting.loaded_cb);
     }
     /*
      * Promise-returning wrapper around _wait_for_head(). The promise's resolve
      * function is passed as the callback, so the promise gets resolved when
      * _wait_for_head() decides waiting is over. It is never rejected here.
      */
     function wait_for_head(doc, detached_html)
+    {
         return new Promise(cb => _wait_for_head(doc, detached_html, cb));
+    }
     const blocked_str = "blocked";
     /*
      * Disable attribute `attr' of `node' by renaming it to `blocked-<attr>'.
      * If `blocked-<attr>' (or `blocked-blocked-<attr>', etc.) is already
      * present, the existing values are first shifted one level deeper so that
      * nothing gets overwritten. Does nothing if `node' has no `attr'.
      */
     function block_attribute(node, attr)
     {
         /*
          * Disabling attributes this way allows them to still be relatively
          * easily accessed in case they contain some useful data.
          */
         const construct_name = [attr];
         /*
          * Find the first unused name in the chain `attr', `blocked-attr',
          * `blocked-blocked-attr', ... Name parts have to be joined with "-" to
          * match the names written below.
          */
         while (node.hasAttribute(construct_name.join("-")))
     	construct_name.unshift(blocked_str);
         /* Move every value one level deeper, starting with the deepest. */
         while (construct_name.length > 1) {
     	construct_name.shift();
     	const name = construct_name.join("-");
     	node.setAttribute(`${blocked_str}-${name}`, node.getAttribute(name));
         }
         node.removeAttribute(attr);
     }
     /*
      * Neutralize a CSP-carrying <meta> element. Nothing happens when the
      * element's `content' is empty or when is_csp_header_name(http_equiv, true)
      * rejects its `http-equiv' value. Otherwise the original `content' is moved
      * aside with block_attribute() and, if is_csp_header_name(http_equiv, false)
      * accepts the name as well, `content' is set anew to the value produced by
      * sanitize_csp_header() for `policy'.
      *
      * NOTE(review): is_csp_header_name() is imported; presumably its second
      * argument decides whether report-only header names match too - confirm.
      */
     function sanitize_meta(meta, policy)
+    {
         const http_equiv = meta.getAttribute("http-equiv");
         const value = meta.content;
         if (!value || !is_csp_header_name(http_equiv, true))
     	return;
         block_attribute(meta, "content");
         if (is_csp_header_name(http_equiv, false))
     	meta.content = sanitize_csp_header({value}, policy).value;
+    }
     /*
      * Put our own CSP into effect in `doc': a temporary
      * <meta http-equiv="Content-Security-Policy"> element, with content taken
      * from csp_rule(policy.nonce), is appended to `doc.head' and removed right
      * away. `doc.head' has to exist when this function is called.
      */
     function apply_hachette_csp_rules(doc, policy)
+    {
         const meta = doc.createElement("meta");
         meta.setAttribute("http-equiv", "Content-Security-Policy");
         meta.setAttribute("content", csp_rule(policy.nonce));
         doc.head.append(meta);
         /* CSP is already in effect, we can remove the <meta> now. */
         meta.remove();
+    }
     /*
      * Sanitize `doc' while it is still being loaded:
      * - apply our own CSP (a temporary <head> is created for that purpose if
      *   the document does not have one yet),
      * - swap the real <html> element for an empty placeholder <html>,
      * - wait until wait_for_head() resolves,
      * - run sanitize_meta() on every <meta> found under <head> of the detached
      *   <html>,
      * - put the original <html> back in place of the placeholder.
      *
      * Being an async function, it returns a promise that settles once the
      * original <html> has been re-attached.
      */
     async function sanitize_document(doc, policy)
+    {
         /*
          * Ensure our CSP rules are employed from the beginning. This CSP injection
          * method is, when possible, going to be applied together with CSP rules
          * injected using webRequest.
          */
         const has_own_head = doc.head;
         if (!has_own_head)
     	doc.documentElement.prepend(doc.createElement("head"));
         apply_hachette_csp_rules(doc, policy);
         /* Probably not needed, but...: proceed with DOM in its initial state. */
         if (!has_own_head)
     	doc.head.remove();
         /*
          * <html> node gets hijacked now, to be re-attached after <head> is loaded
          * and sanitized.
          */
         const old_html = doc.documentElement;
         const new_html = doc.createElement("html");
         old_html.replaceWith(new_html);
         await wait_for_head(doc, old_html);
         for (const meta of old_html.querySelectorAll("head meta"))
     	sanitize_meta(meta, policy);
         new_html.replaceWith(old_html);
+    }
     if (!is_privileged_url(document.URL)) {
         const reductor =
     	  (ac, [_, sig, pol]) => ac[0] && ac || [extract_signed(sig, pol), sig];
-...
         if (signature)
     	document.cookie = `hachette-${signature}=; Max-Age=-1;`;
         handle_page_actions(policy.nonce);
         if (!policy.allow)
     	sanitize_document(document, policy);
         if (!policy.allow) {
     	const old_html = document.documentElement;
     	const new_html = document.createElement("html");
     	old_html.replaceWith(new_html);
     	old_html.hachette_corresponding = new_html;
     	const modify_end =
     	      modify_on_the_fly(old_html, policy, {node_eater: accept_node});
     	document.addEventListener("DOMContentLoaded", modify_end);
+        }
         handle_page_actions(policy.nonce);
         start_activity_info_server();
+    }

     /**
      * Hachette modify HTML document as it loads and reconstruct HTML code from it
+     *
      * Copyright (C) 2021 Wojtek Kosior
      * Redistribution terms are gathered in the `copyright' file.
      */
     /*
      * IMPORTS_START
      * IMPORT gen_nonce
      * IMPORT csp_rule
      * IMPORT is_csp_header_name
      * IMPORT sanitize_csp_header
      * IMPORT sanitize_attributes
      * IMPORTS_END
      */
     /*
      * Functions that sanitize elements. The script blocking measures are, when
      * possible, going to be applied together with CSP rules injected using
      * webRequest.
      */
     const blocked = "blocked";
     /*
      * Disable attribute `attr' of `node' by renaming it to `blocked-<attr>'.
      * If `blocked-<attr>' (or `blocked-blocked-<attr>', etc.) is already
      * present, the existing values are first shifted one level deeper so that
      * nothing gets overwritten. Does nothing if `node' has no `attr'.
      */
     function block_attribute(node, attr)
     {
         /*
          * Disabling attributes this way allows them to still be relatively
          * easily accessed in case they contain some useful data.
          */
         const construct_name = [attr];
         /*
          * Find the first unused name in the chain `attr', `blocked-attr',
          * `blocked-blocked-attr', ... Name parts have to be joined with "-" to
          * match the names written below.
          */
         while (node.hasAttribute(construct_name.join("-")))
     	construct_name.unshift(blocked);
         /* Move every value one level deeper, starting with the deepest. */
         while (construct_name.length > 1) {
     	construct_name.shift();
     	const name = construct_name.join("-");
     	node.setAttribute(`${blocked}-${name}`, node.getAttribute(name));
         }
         node.removeAttribute(attr);
     }
     /*
      * Sanitize a <script> element. A script whose `data-hachette-deleteme'
      * attribute equals `data.policy.nonce' is removed from its parent and
      * flagged with `hachette_deleted' and `hachette_ignore'. Unless
      * `data.policy.allow' is set, the `type' attribute is then moved aside with
      * block_attribute() and replaced with "application/json".
      *
      * NOTE(review): a script removed in the first step still falls through to
      * the `type' rewriting below - confirm this is intended.
      */
     function sanitize_script(script, data)
+    {
         if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) {
     	script.remove();
     	script.hachette_deleted = true;
     	script.hachette_ignore = true;
+        }
         if (data.policy.allow)
     	return;
         block_attribute(script, "type");
         script.setAttribute("type", "application/json");
+    }
     /*
      * Unless `data.policy.allow' is set, prepend a CSP <meta> element (with
      * content from csp_rule(data.policy.nonce)) to `head'. The element is
      * flagged with `hachette_ignore' and put, together with its parent, at the
      * very front of the `data.new_added' queue.
      */
     function inject_csp(head, data)
+    {
         if (data.policy.allow)
     	return;
         const meta = document.createElement("meta");
         meta.setAttribute("http-equiv", "Content-Security-Policy");
         meta.setAttribute("content", csp_rule(data.policy.nonce));
         meta.hachette_ignore = true;
         head.prepend(meta);
         data.new_added.unshift([meta, head]);
+    }
     /*
      * Sanitize a <meta> element that carries CSP in its `http-equiv'/`content'
      * pair. Nothing happens if `content' is empty or if
      * is_csp_header_name(http_equiv, !data.policy.allow) rejects the name.
      * Otherwise the original `content' is moved aside with block_attribute()
      * and, when `data.policy.allow' is set or
      * is_csp_header_name(http_equiv, false) accepts the name, `content' is set
      * to the value produced by sanitize_csp_header().
      *
      * NOTE(review): is_csp_header_name() is imported; presumably its second
      * argument decides whether report-only header names match too - confirm.
      */
     function sanitize_http_equiv_csp_rule(meta, data)
+    {
         const http_equiv = meta.getAttribute("http-equiv");
         const value = meta.content;
         if (!value || !is_csp_header_name(http_equiv, !data.policy.allow))
     	return;
         block_attribute(meta, "content");
         if (data.policy.allow || is_csp_header_name(http_equiv, false))
     	meta.content = sanitize_csp_header({value}, data.policy).value;
+    }
     /*
      * Dispatch `node' to the sanitizing function matching its tag name:
      * <script> -> sanitize_script(), <head> -> inject_csp(),
      * <meta> -> sanitize_http_equiv_csp_rule(). In addition, unless
      * `data.policy.allow' is set, sanitize_attributes() is run on every node.
      * Nodes without a `tagName' (e.g. text nodes) match none of the tag checks.
      */
     function sanitize_node(node, data)
+    {
         if (node.tagName === "SCRIPT")
     	sanitize_script(node, data);
         if (node.tagName === "HEAD")
     	inject_csp(node, data);
         if (node.tagName === "META")
     	sanitize_http_equiv_csp_rule(node, data);
         if (!data.policy.allow)
     	sanitize_attributes(node, data);
+    }
     /*
      * Instead of calling writer directly with multiple small chunks of reconstructed
      * HTML code, we utilize `setTimeout()' to only have it called once,
      * asynchronously.
      */
     /*
      * Flush the accumulated chunks: pass them, joined into a single string, to
      * `data.writer' and empty the buffer. If processing has been marked as
      * finished and a `finisher' callback was supplied, call it afterwards.
      */
     function do_write_callback(data)
+    {
         data.writer(data.chunks.join(""));
         data.chunks = [];
         if (data.finished && data.finisher)
     	data.finisher();
+    }
     /*
      * Buffer `chunk' in `data.chunks' and (re)schedule do_write_callback() with
      * a zero-delay timeout. Any previously scheduled flush is cancelled first,
      * so chunks queued in the same turn end up in a single writer call.
      */
     function do_write(chunk, data)
+    {
         data.chunks.push(chunk);
         clearTimeout(data.write_timeout);
         data.write_timeout = setTimeout(() => do_write_callback(data), 0);
+    }
     /* Fallback serializer for nodes that have no `outerHTML'. */
     const serializer = new XMLSerializer();
     /*
      * Mark `node' as started (`hachette_started') and, if a writer is present,
      * emit the opening part of its markup. That part is obtained by putting
      * the unique marker `data.uniq' as the sole text content of a shallow clone
      * and taking the first capture group of `data.uniq_reg' matched against the
      * clone's `outerHTML' (i.e. everything preceding the marker).
      */
     function start_serializing_node(node, data)
+    {
         node.hachette_started = true;
         if (!data.writer)
     	return;
         const clone = node.cloneNode(false);
         clone.textContent = data.uniq;
         do_write(data.uniq_reg.exec(clone.outerHTML)[1], data);
+    }
     /*
      * Finish `node' together with the chain of nodes reachable from it through
      * `hachette_last_added' links. Nodes are handled starting from the end of
      * that chain and going back towards `node': each one is removed from its
      * parent and flagged with `hachette_ignore'. If a writer is present, the
      * closing part of markup is written for nodes that have been started
      * (second capture group of `data.uniq_reg', i.e. everything following the
      * marker) and complete markup is written for the other ones.
      *
      * A falsy `node' makes this function a no-op.
      */
     function finish_serializing_node(node, data)
+    {
         const nodes_to_process = [node];
         /* Unroll the `hachette_last_added' chain; a falsy link terminates it. */
         while (true) {
     	node = nodes_to_process.pop();
     	if (!node)
     	    break;
     	nodes_to_process.push(node, node.hachette_last_added);
+        }
         while (nodes_to_process.length > 0) {
     	const node = nodes_to_process.pop();
     	node.remove();
     	node.hachette_ignore = true;
     	if (!data.writer)
     	    continue;
     	if (node.hachette_started) {
     	    node.textContent = data.uniq;
     	    do_write(data.uniq_reg.exec(node.outerHTML)[2], data);
     	    continue;
+    	}
     	do_write(node.outerHTML || serializer.serializeToString(node), data);
+        }
+    }
     /*
      * Executed effectively once per `data' (guarded by the
      * `processed_initial_nodes' flag). Starts serialization of `data.html_root'
      * and walks the nodes already present under it, depth-first and in document
      * order, stopping right before `node' (or at the end of the tree if `node'
      * is never encountered, e.g. when it is `undefined'). Every visited node
      * gets recorded as a [node, parent] pair and all the pairs are put at the
      * front of the `data.new_added' queue.
      */
     function process_initial_nodes(node, data)
+    {
         if (data.processed_initial_nodes)
     	return;
         data.processed_initial_nodes = true;
         start_serializing_node(data.html_root, data);
         const new_added = [];
         /*
          * Used as a stack with its top at index 0: the node being visited is
          * followed by its ancestors.
          */
         const nodes_to_process = [data.html_root];
         /* NOTE(review): `i' is not used anywhere in this function. */
         let i = 0;
         while (nodes_to_process.length > 0) {
     	let current = nodes_to_process.shift();
     	if (current.firstChild) {
     	    if (current.firstChild === node)
     		break;
     	    /* Descend; keep `current' on the stack as the parent. */
     	    nodes_to_process.unshift(current.firstChild, current);
     	    new_added.push([current.firstChild, current]);
     	    continue;
+    	}
     	/* No children: climb up until some node has a next sibling. */
     	while (current && !current.nextSibling)
     	    current = nodes_to_process.shift();
     	if (!current || current.nextSibling === node)
     	    break;
     	nodes_to_process.unshift(current.nextSibling);
     	/* After the unshift above index 1 holds the sibling's parent. */
     	new_added.push([current.nextSibling, nodes_to_process[1]]);
+        }
         data.new_added.unshift(...new_added);
+    }
     /*
      * Important! Due to some weirdness node.parentElement is not always correct
      * in MutationRecords under Chromium. Track node relations manually.
      */
     /*
      * Process a newly added `node' whose actual parent is `true_parent'. The
      * node is appended to the `data.new_added' queue, which then gets drained.
      * For each queued [node, parent] pair that has not been marked as deleted:
      * the parent's serialization is started if needed, the node is sanitized
      * (unless flagged with `hachette_ignore'), handed to `data.node_eater' if
      * one was supplied, the parent's previously added child is finished and
      * the node is remembered as the parent's `hachette_last_added'.
      */
     function handle_added_node(node, true_parent, data)
+    {
         /*
          * Functions we call here might cause new nodes to be injected or found
          * that require processing before the one we got in function argument.
          * We rely on those functions putting the node(s) they create/find at the
          * very beginning of the `new_added' queue and (for created nodes) setting
          * their `hachette_ignore' property, based on which their MutationRecord
          * will not be processed. A function can also mark a node already in the
          * `new_added' queue as not eligible for processing by setting its
          * `hachette_deleted' property.
          */
         process_initial_nodes(node, data);
         data.new_added.push([node, true_parent]);
         while (data.new_added.length > 0) {
     	[node, true_parent] = data.new_added.shift();
     	/* Deletion of a parent propagates to its children. */
     	if (true_parent.hachette_deleted)
     	    node.hachette_deleted = true;
     	if (node.hachette_deleted)
     	    continue;
     	if (!true_parent.hachette_started)
     	    start_serializing_node(true_parent, data)
     	if (!node.hachette_ignore)
     	    sanitize_node(node, data);
     	/* Sanitizing might have marked the node as deleted. */
     	if (node.hachette_deleted)
     	    continue;
     	if (data.node_eater)
     	    data.node_eater(node, true_parent);
     	finish_serializing_node(true_parent.hachette_last_added, data);
     	true_parent.hachette_last_added = node;
+        }
+    }
     /*
      * MutationObserver callback body: pass every added node from `mutations'
      * to handle_added_node(), using `mutation.target' as the node's parent.
      * Nodes flagged with `hachette_ignore' are skipped; the flag is inherited
      * from a flagged mutation target.
      */
     function handle_mutation(mutations, data)
+    {
         /*
          * Chromium: for an unknown reason mutation.target is not always the same as
          * node.parentElement. The former is the correct one.
          */
         for (const mutation of mutations) {
     	for (const node of mutation.addedNodes) {
     	    /* Check for nodes added by ourselves. */
     	    if (mutation.target.hachette_ignore)
     		node.hachette_ignore = true;
     	    if (node.hachette_ignore)
     		continue;
     	    handle_added_node(node, mutation.target, data);
+    	}
+        }
+    }
     /*
      * Wrap up processing of the document: make sure the initially present
      * nodes have been processed, handle mutation records that are still
      * pending, disconnect the observer, normalize trailing whitespace at the end
      * of <body> and finish serialization of `data.html_root'.
      */
     function finish_processing(data)
+    {
         process_initial_nodes(undefined, data);
         /*
          * The `finisher' callback should be called, if provided. Normally our
          * function that performs the last write does it after seeing `finished'
          * set to `true'. If, however, there's no `writer' callback and hence no
          * writes to perform, we need to take care of calling `finisher' here.
          */
         data.finished = true;
         handle_mutation(data.observer.takeRecords(), data);
         data.observer.disconnect();
         /*
          * Additional whitespace that was after `</body>' gets appended to body.
          * Although it's a minor issue, it is not what we want. There's no way to
          * tell exactly what part of that whitespace was after `</body>' and what
          * was before, so we just replace it with a single newline which looks good
          * when printed.
          */
         const body = data.html_root.lastChild;
         const text = body && body.tagName === "BODY" && body.lastChild;
         if (text && text.nodeName === "#text") {
     	/* Group 1 is the text with trailing whitespace stripped, if any. */
     	const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || "";
     	text.textContent = new_content + "\n";
+        }
         finish_serializing_node(data.html_root, data);
         if (!data.writer && data.finisher)
     	setTimeout(data.finisher, 0);
+    }
     /*
      * This function sanitizes `html_root' according to `policy'. It is capable of
      * working on an HTML document that is being written to, sanitizing new nodes
      * as they appear.
+     *
      * `consumers' object may contain 3 optional callback functions: `writer',
      * `node_eater' and `finisher'. The first one, if present, is called with chunks
      * of reconstructed HTML code. The second one, if present, gets called for every
      * added node with 2 arguments: that node and its parent. The third one is
      * called at the end, after all processing has been done.
+     *
      * `modify_on_the_fly()' returns a callback that should be called (with no
      * arguments) once the document of html_root has finished being written to.
      * Unfortunately, due to the specific behavior of a document that has had its
      * documentElement replaced, the caller has to arrange for this call itself.
      */
     function modify_on_the_fly(html_root, policy, consumers)
+    {
         /*
          * `uniq' is a marker that gets temporarily put inside a node as its
          * text content. `uniq_reg' then splits the node's serialized form into
          * the part before the marker (group 1) and the part after it (group 2).
          */
         const uniq = gen_nonce();
         const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);
         /* State shared by all functions that take part in processing. */
         const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
         /* Copy the optional `writer', `node_eater' and `finisher' callbacks. */
         Object.assign(data, consumers);
         var observer = new MutationObserver(m => handle_mutation(m, data));
         observer.observe(data.html_root, {
          	attributes: true,
     	childList: true,
     	subtree: true
         });
         data.observer = observer;
         return () => finish_processing(data);
+    }
     /*
      * EXPORTS_START
      * EXPORT modify_on_the_fly
      * EXPORTS_END
      */

Also available in: Unified diff

Project

General

Profile

Haketilo

Revision 44958e6a

Added by koszko about 2 years ago