/**
 * This file is part of Haketilo.
 *
 * Function: Modifying a web page using the StreamFilter API.
 *
 * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORT csp_header_regex
 * IMPORTS_END
 */

function validate_encoding(charset)
{
    try {
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}
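
/*
 * For illustration: the TextDecoder constructor throws a RangeError when
 * given a label it does not recognize, which is what makes the check above
 * work, e.g.
 *     validate_encoding("utf-8")   // -> "utf-8"
 *     validate_encoding("bogus")   // -> undefined
 */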

function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;
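
/*
 * For illustration, `charset_reg' captures the charset name from a MIME
 * type parameter, e.g.
 *     charset_reg.exec("text/html; charset=ISO-8859-2")[1]  // "ISO-8859-2"
 */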

function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        const match = charset_reg.exec(header.value);
        if (match && !properties.detected_charset &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];

        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}
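
/*
 * For illustration (header objects as delivered by the webRequest API):
 *     properties_from_headers([
 *         {name: "Content-Type", value: "text/html; charset=utf-8"}
 *     ])
 *     // -> {detected_charset: "utf-8", html: true}
 */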

const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
            return charset;
    }

    return "";
}
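
/*
 * For illustration:
 *     charset_from_BOM(new Uint8Array([0xff, 0xfe, 0x68, 0x00]))  // "utf-16le"
 *     charset_from_BOM(new Uint8Array([0x3c, 0x68, 0x74, 0x6d]))  // ""
 */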

const charset_attrs =
    ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
    charset_attrs.map(a => `head>meta[${a}]`).join(", ");
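
/*
 * The resulting selector is:
 *     head>meta[charset], head>meta[http-equiv="content-type"],
 *     head>meta[content*="charset"]
 */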

function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const maybe_charset = meta.getAttribute("charset");
        if (maybe_charset && validate_encoding(maybe_charset))
            return maybe_charset;

        const match = charset_reg.exec(meta.getAttribute("content"));
        if (match && validate_encoding(match[1]))
            return match[1];
    }

    return undefined;
}
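
/*
 * For illustration, both of the following `<head>' children yield a charset:
 *     <meta charset="utf-8">
 *     <meta http-equiv="content-type" content="text/html; charset=iso-8859-2">
 */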

function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;
    if (!charset && data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    if (properties.html) {
        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
        charset = charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(charset || "latin1");
}
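
/*
 * To summarize, the charset is resolved in order of reliability: a byte
 * order mark wins, then the HTTP `Content-Type' header, then (for HTML)
 * `<meta>' tags sniffed from the first chunk, with latin1 as the fallback
 * that at least cannot throw.
 */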

function may_define_csp_rules(html)
{
    const doc = new DOMParser().parseFromString(html, "text/html");

    for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
        if (csp_header_regex.test(meta.httpEquiv) && meta.content)
            return true;
    }

    /*
     * Even if no naughty `<meta>' tags were found, a subsequent chunk of
     * HTML data could add some. Before we return `false' we need to be
     * sure we reached the start of `<body>' where `<meta>' tags are no
     * longer valid.
     */

    if (doc.documentElement.nextSibling || doc.body.nextSibling ||
        doc.body.childNodes.length > 1)
        return false;

    if (!doc.body.firstChild)
        return true;

    if (doc.body.firstChild.nodeName !== "#text")
        return false;

    return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
}
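
/*
 * The last test accepts only leftovers that look like a truncated tag or
 * character reference ("</", "&#", or a single character); anything longer
 * means real `<body>' content has started and `<head>' is behind us.
 */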

function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
    }

    let decoded = properties.decoder.decode(data, {stream: true});

    /* Force UTF-8, this is the only encoding we can produce. */
    if (first_chunk)
        properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
        /*
         * HAX! Our content scripts that execute at `document_start' will
         * always run before the first script in the document, but under
         * Mozilla some `<meta>' tags might already be loaded at that point.
         * Here we inject a dummy `<script>' at the beginning (before any
         * `<meta>' tags) that will force `document_start' to happen earlier.
         * This way our content scripts will be able to sanitize `http-equiv'
         * tags with CSP rules that would otherwise stop our injected scripts
         * from executing.
         *
         * As we want to only process HTML files that happen to have naughty
         * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
         * `may_define_csp_rules()'. We don't do any additional MIME sniffing
         * as it is too unreliable (and our heuristic will likely mark
         * non-HTML files as harmless anyway).
         */

        const dummy_script = `<script>null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
            decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}
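
/*
 * Note: decoding with `{stream: true}' keeps multi-byte sequences that
 * straddle chunk boundaries intact. Once the input is known to be UTF-8,
 * disconnect() lets the rest of the response through unmodified; the
 * output stays valid UTF-8 and the BOM has already been written.
 */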

function apply_stream_filter(details, headers, policy)
{
    if (!policy.payload)
        return headers;

    const properties = properties_from_headers(headers);

    properties.filter =
        browser.webRequest.filterResponseData(details.requestId);

    properties.filter.ondata = event => filter_data(properties, event);
    properties.filter.onstop = () => properties.filter.close();

    /*
     * In the future we might consider modifying the headers that specify
     * the encoding. For now we just prepend the data with a UTF-8 BOM,
     * which should be enough.
     */
    return headers;
}
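
/*
 * Example wiring (a sketch; `decide_policy_for()' is a hypothetical helper,
 * the actual caller supplies a `policy' object with a `payload' property):
 *
 *     browser.webRequest.onHeadersReceived.addListener(
 *         details => ({responseHeaders: apply_stream_filter(
 *             details, details.responseHeaders,
 *             decide_policy_for(details.url))}),
 *         {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]},
 *         ["blocking", "responseHeaders"]
 *     );
 */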

/*
 * EXPORTS_START
 * EXPORT apply_stream_filter
 * EXPORTS_END
 */