Project

General

Profile

« Previous | Next » 

Revision 44958e6a

Added by koszko about 2 years ago

implement rethought tags sanitizing approach

This has not been tested yet. Additionally, functionality for blocking of `data:' urls needs to be re-enabled.

View differences:

common/misc.js
178 178
    return {name: header.name, value: new_csp.join('')};
179 179
}
180 180

  
181
/* Regexes and objest to use as/in schemas for parse_json_with_schema(). */
181
/* Regexes and objects to use as/in schemas for parse_json_with_schema(). */
182 182
const nonempty_string_matcher = /.+/;
183 183

  
184 184
const matchers = {
content/main.js
16 16
 * IMPORT is_chrome
17 17
 * IMPORT is_mozilla
18 18
 * IMPORT start_activity_info_server
19
 * IMPORT modify_on_the_fly
19
 * IMPORT csp_rule
20
 * IMPORT is_csp_header_name
21
 * IMPORT sanitize_csp_header
20 22
 * IMPORTS_END
21 23
 */
22 24

  
......
31 33
    parent.hachette_corresponding.appendChild(clone);
32 34
}
33 35

  
36
/*
37
 * 1. When injecting some payload we need to sanitize <meta> CSP tags before
38
 *    they reach the document.
39
 * 2. Only <meta> tags inside <head> are considered valid by the browser and
40
 *    need to be considered.
41
 * 3. We want to detach <html> from document, wait until its <head> completes
42
 *    loading, sanitize it and re-attach <html>.
43
 * 4. Browsers are eager to add <meta>'s that appear after `</head>' but before
44
 *    `<body>'. Due to this behavior the `DOMContentLoaded' event is considered
45
 *    unreliable (although it could still work properly, it is just problematic
46
 *    to verify).
47
 * 5. We shall wait for anything to appear in or after <body> and take that as
48
 *    a sign <head> has _really_ finished loading.
49
 */
50

  
51
/*
 * Start a MutationObserver that re-runs try_body_started() for `waiting'
 * every time the direct children of `DOM_element' change. The running observer
 * is returned so that the caller can disconnect it later.
 */
function make_body_start_observer(DOM_element, waiting)
{
    const on_children_changed = () => try_body_started(waiting);
    const child_list_observer = new MutationObserver(on_children_changed);

    child_list_observer.observe(DOM_element, {childList: true});

    return child_list_observer;
}
57

  
58
/*
 * Check whether anything has appeared in or after <body> of the detached
 * <html>, or after the document's current root element. If so, finish the
 * wait and return `true'. Otherwise (returning `undefined') make sure <body>,
 * once it exists, is being observed as well; at most 2 observers are kept.
 */
function try_body_started(waiting)
{
    const body = waiting.detached_html.querySelector("body");
    const body_has_content = body && (body.firstChild || body.nextSibling);

    if (body_has_content || waiting.doc.documentElement.nextSibling) {
	finish_waiting(waiting);
	return true;
    }

    if (!body)
	return;

    if (waiting.observers.length < 2) {
	const body_observer = make_body_start_observer(body, waiting);
	waiting.observers.push(body_observer);
    }
}
71

  
72
/*
 * End the wait described by `waiting': disconnect every registered observer,
 * drop the `DOMContentLoaded' listener and schedule `waiting.callback' to run
 * asynchronously.
 */
function finish_waiting(waiting)
{
    for (const observer of waiting.observers)
	observer.disconnect();

    waiting.doc.removeEventListener("DOMContentLoaded", waiting.loaded_cb);

    setTimeout(waiting.callback, 0);
}
78

  
79
/*
 * Callback-style implementation of wait_for_head(). Sets up the `waiting'
 * state object and, unless try_body_started() reports that the wait is already
 * over, observes `detached_html' for child changes and also listens for
 * `DOMContentLoaded' on `doc'. `callback' gets scheduled by finish_waiting().
 */
function _wait_for_head(doc, detached_html, callback)
{
    const waiting = {doc, detached_html, callback, observers: []};
    if (try_body_started(waiting))
	return;

    /*
     * The try_body_started() call above may already have registered an
     * observer on <body>. Append to the list instead of replacing it, so that
     * observer is not lost and still gets disconnected by finish_waiting().
     */
    waiting.observers.push(make_body_start_observer(detached_html, waiting));
    waiting.loaded_cb = () => finish_waiting(waiting);
    doc.addEventListener("DOMContentLoaded", waiting.loaded_cb);
}
89

  
90
/*
 * Promise-returning wrapper around _wait_for_head(). The promise's resolve
 * function is handed over as the callback.
 */
function wait_for_head(doc, detached_html)
{
    const executor = resolve => _wait_for_head(doc, detached_html, resolve);

    return new Promise(executor);
}
94

  
95
const blocked_str = "blocked";

/*
 * Disable attribute `attr' of `node' by moving its value to `blocked-<attr>'.
 * If `blocked-<attr>' (or `blocked-blocked-<attr>', etc.) is already present,
 * the existing values are first shifted one level deeper so that nothing gets
 * overwritten. Does nothing if `node' has no attribute `attr'.
 */
function block_attribute(node, attr)
{
    /*
     * Disabling attributes this way allows them to still be relatively
     * easily accessed in case they contain some useful data.
     */
    const construct_name = [attr];

    /*
     * Names are joined with "-" so the names probed here are exactly the names
     * written below (`blocked-<attr>', not `blocked<attr>').
     */
    while (node.hasAttribute(construct_name.join("-")))
	construct_name.unshift(blocked_str);

    /* Shift values starting from the deepest occupied name. */
    while (construct_name.length > 1) {
	construct_name.shift();
	const name = construct_name.join("-");
	node.setAttribute(`${blocked_str}-${name}`, node.getAttribute(name));
    }

    node.removeAttribute(attr);
}
115

  
116
/*
 * Neutralize a <meta> element that carries a CSP through its `http-equiv'
 * attribute. Elements with empty `content' or with `http-equiv' rejected by
 * is_csp_header_name(..., true) are left alone. Otherwise `content' is
 * disabled using block_attribute() and, only when
 * is_csp_header_name(..., false) also accepts the name, replaced with the
 * value returned by sanitize_csp_header().
 * NOTE(review): the exact meaning of the boolean passed to
 * is_csp_header_name() is not visible here - confirm against its definition.
 */
function sanitize_meta(meta, policy)
{
    const http_equiv = meta.getAttribute("http-equiv");
    const value = meta.content;

    const carries_csp = value && is_csp_header_name(http_equiv, true);
    if (!carries_csp)
	return;

    block_attribute(meta, "content");

    if (!is_csp_header_name(http_equiv, false))
	return;

    const sanitized = sanitize_csp_header({value}, policy);
    meta.content = sanitized.value;
}
129

  
130
/*
 * Put our own CSP in effect for `doc' by briefly appending a
 * Content-Security-Policy <meta> (built with csp_rule() from `policy.nonce')
 * to `doc.head'. `doc.head' has to exist when this gets called.
 */
function apply_hachette_csp_rules(doc, policy)
{
    const csp_meta = doc.createElement("meta");

    csp_meta.setAttribute("http-equiv", "Content-Security-Policy");
    const rule = csp_rule(policy.nonce);
    csp_meta.setAttribute("content", rule);

    doc.head.append(csp_meta);

    /* The CSP is in effect by now, so the <meta> itself can go away. */
    csp_meta.remove();
}
139

  
140
/*
 * Apply our CSP to `doc' and sanitize CSP <meta> tags found in its <head>.
 * The original <html> element is swapped for an empty one for the time <head>
 * is loading and is put back once its <meta> tags have been sanitized.
 */
async function sanitize_document(doc, policy)
{
    /*
     * Our CSP rules should be employed from the very beginning. When possible,
     * this injection method is going to be used together with CSP rules
     * injected using webRequest. A temporary <head> is created if the document
     * does not have one yet.
     */
    const head_was_missing = !doc.head;
    if (head_was_missing)
	doc.documentElement.prepend(doc.createElement("head"));

    apply_hachette_csp_rules(doc, policy);

    /* Probably not needed, but...: go on with DOM in its initial state. */
    if (head_was_missing)
	doc.head.remove();

    /*
     * Hijack the <html> node now; it gets re-attached after <head> has loaded
     * and has been sanitized.
     */
    const original_html = doc.documentElement;
    const placeholder_html = doc.createElement("html");
    original_html.replaceWith(placeholder_html);

    await wait_for_head(doc, original_html);

    original_html.querySelectorAll("head meta")
	.forEach(meta => sanitize_meta(meta, policy));

    placeholder_html.replaceWith(original_html);
}
172

  
34 173
if (!is_privileged_url(document.URL)) {
35 174
    const reductor =
36 175
	  (ac, [_, sig, pol]) => ac[0] && ac || [extract_signed(sig, pol), sig];
......
45 184
    if (signature)
46 185
	document.cookie = `hachette-${signature}=; Max-Age=-1;`;
47 186

  
48
    handle_page_actions(policy.nonce);
187
    if (!policy.allow)
188
	sanitize_document(document, policy);
49 189

  
50
    if (!policy.allow) {
51
	const old_html = document.documentElement;
52
	const new_html = document.createElement("html");
53
	old_html.replaceWith(new_html);
54
	old_html.hachette_corresponding = new_html;
55

  
56
	const modify_end =
57
	      modify_on_the_fly(old_html, policy, {node_eater: accept_node});
58
	document.addEventListener("DOMContentLoaded", modify_end);
59
    }
190
    handle_page_actions(policy.nonce);
60 191

  
61 192
    start_activity_info_server();
62 193
}
content/sanitize_document.js
1
/**
2
 * Hachette modify HTML document as it loads and reconstruct HTML code from it
3
 *
4
 * Copyright (C) 2021 Wojtek Kosior
5
 * Redistribution terms are gathered in the `copyright' file.
6
 */
7

  
8
/*
9
 * IMPORTS_START
10
 * IMPORT gen_nonce
11
 * IMPORT csp_rule
12
 * IMPORT is_csp_header_name
13
 * IMPORT sanitize_csp_header
14
 * IMPORT sanitize_attributes
15
 * IMPORTS_END
16
 */
17

  
18
/*
19
 * Functions that sanitize elements. The script blocking measures are, when
20
 * possible, going to be applied together with CSP rules injected using
21
 * webRequest.
22
 */
23

  
24
const blocked = "blocked";

/*
 * Disable attribute `attr' of `node' by moving its value to `blocked-<attr>'.
 * If `blocked-<attr>' (or `blocked-blocked-<attr>', etc.) is already present,
 * the existing values are first shifted one level deeper so that nothing gets
 * overwritten. Does nothing if `node' has no attribute `attr'.
 */
function block_attribute(node, attr)
{
    /*
     * Disabling attributes this way allows them to still be relatively
     * easily accessed in case they contain some useful data.
     */
    const construct_name = [attr];

    /*
     * Names are joined with "-" so the names probed here are exactly the names
     * written below (`blocked-<attr>', not `blocked<attr>').
     */
    while (node.hasAttribute(construct_name.join("-")))
	construct_name.unshift(blocked);

    /* Shift values starting from the deepest occupied name. */
    while (construct_name.length > 1) {
	construct_name.shift();
	const name = construct_name.join("-");
	node.setAttribute(`${blocked}-${name}`, node.getAttribute(name));
    }

    node.removeAttribute(attr);
}
45

  
46
/*
 * Process a <script> element. A script whose `data-hachette-deleteme'
 * attribute equals the policy nonce is removed from the DOM and flagged as
 * deleted and ignored. Unless the policy allows scripts, the `type' attribute
 * is then disabled with block_attribute() and set to "application/json".
 */
function sanitize_script(script, data)
{
    const deleteme = script.getAttribute("data-hachette-deleteme");

    if (deleteme === data.policy.nonce) {
	script.remove();
	script.hachette_deleted = true;
	script.hachette_ignore = true;
    }

    if (!data.policy.allow) {
	block_attribute(script, "type");
	script.setAttribute("type", "application/json");
    }
}
60

  
61
/*
 * Unless the policy allows scripts, prepend a Content-Security-Policy <meta>
 * (rule built with csp_rule() from the policy nonce) to `head'. The <meta> is
 * flagged with `hachette_ignore' and queued at the very beginning of
 * `data.new_added' together with `head' as its parent.
 */
function inject_csp(head, data)
{
    if (!data.policy.allow) {
	const csp_meta = document.createElement("meta");

	csp_meta.setAttribute("http-equiv", "Content-Security-Policy");
	const rule = csp_rule(data.policy.nonce);
	csp_meta.setAttribute("content", rule);
	csp_meta.hachette_ignore = true;

	head.prepend(csp_meta);

	data.new_added.unshift([csp_meta, head]);
    }
}
74

  
75
/*
 * Handle a <meta> element that may carry a CSP through `http-equiv'. Nothing
 * happens when `content' is empty or is_csp_header_name() rejects the
 * `http-equiv' value (called with `!data.policy.allow' as its 2nd argument).
 * Otherwise `content' is disabled with block_attribute() and, if the policy
 * allows scripts or is_csp_header_name(..., false) accepts the name, replaced
 * with the value produced by sanitize_csp_header().
 * NOTE(review): the exact meaning of the boolean passed to
 * is_csp_header_name() is not visible here - confirm against its definition.
 */
function sanitize_http_equiv_csp_rule(meta, data)
{
    const header_name = meta.getAttribute("http-equiv");
    const value = meta.content;

    if (!value)
	return;

    if (!is_csp_header_name(header_name, !data.policy.allow))
	return;

    block_attribute(meta, "content");

    const write_sanitized =
	  data.policy.allow || is_csp_header_name(header_name, false);
    if (write_sanitized)
	meta.content = sanitize_csp_header({value}, data.policy).value;
}
88

  
89
/*
 * Dispatch `node' to the tag-specific sanitizer (<script>, <head>, <meta>)
 * and, unless the policy allows scripts, also run sanitize_attributes() on it.
 */
function sanitize_node(node, data)
{
    switch (node.tagName) {
    case "SCRIPT":
	sanitize_script(node, data);
	break;
    case "HEAD":
	inject_csp(node, data);
	break;
    case "META":
	sanitize_http_equiv_csp_rule(node, data);
	break;
    }

    if (!data.policy.allow)
	sanitize_attributes(node, data);
}
103

  
104
/*
105
 * Instead of calling writer directly with multiple small chunks of reconstructed
106
 * HTML code, we utilize `setTimeout()' to only have it called once,
107
 * asynchronously.
108
 */
109
/*
 * Flush all buffered chunks to `data.writer' as a single string and empty the
 * buffer. If processing has been marked as finished and a `finisher' callback
 * is present, call it afterwards.
 */
function do_write_callback(data)
{
    const joined = data.chunks.join("");
    data.writer(joined);
    data.chunks = [];

    const should_finish = data.finished && data.finisher;
    if (should_finish)
	data.finisher();
}
117

  
118
/*
 * Buffer `chunk' and (re)schedule a single asynchronous flush of the buffer.
 * A previously scheduled flush, if any, gets cancelled.
 */
function do_write(chunk, data)
{
    data.chunks.push(chunk);

    clearTimeout(data.write_timeout);

    const flush = () => do_write_callback(data);
    data.write_timeout = setTimeout(flush, 0);
}
124

  
125
/* Shared serializer; used as a fallback for nodes without `outerHTML'. */
const serializer = new XMLSerializer();
126

  
127
/*
 * Mark `node' as started and, when a `writer' is in use, emit the part of its
 * markup preceding its contents. That part is obtained by serializing a
 * shallow clone whose text is the `data.uniq' marker and taking everything
 * `data.uniq_reg' captures before the marker.
 */
function start_serializing_node(node, data)
{
    node.hachette_started = true;

    if (data.writer) {
	const shallow_clone = node.cloneNode(false);
	shallow_clone.textContent = data.uniq;

	const match = data.uniq_reg.exec(shallow_clone.outerHTML);
	do_write(match[1], data);
    }
}
138

  
139
/*
 * Finish `node' together with the chain of nodes reachable from it through
 * `hachette_last_added'. Nodes are handled from the end of that chain towards
 * `node': each one gets removed from the DOM and flagged with
 * `hachette_ignore'. When a `writer' is in use, a started node has the part of
 * its markup following its contents written (found with the `data.uniq'
 * marker), while any other node is written whole. Does nothing when `node' is
 * falsy.
 */
function finish_serializing_node(node, data)
{
    const chain = [];
    for (let link = node; link; link = link.hachette_last_added)
	chain.push(link);

    for (const current of chain.reverse()) {
	current.remove();
	current.hachette_ignore = true;

	if (!data.writer)
	    continue;

	if (current.hachette_started) {
	    current.textContent = data.uniq;
	    do_write(data.uniq_reg.exec(current.outerHTML)[2], data);
	} else {
	    const markup =
		  current.outerHTML || serializer.serializeToString(current);
	    do_write(markup, data);
	}
    }
}
168

  
169
/*
 * Runs its body only once per `data' (guarded by
 * `data.processed_initial_nodes'). Starts serializing `data.html_root' and
 * walks the nodes already present under it in document order, collecting
 * [node, parent] pairs. The walk stops right before reaching `node' (the node
 * about to be handled by the caller) or when the tree is exhausted, which is
 * what happens when `node' is `undefined'. Collected pairs are put at the
 * very beginning of `data.new_added'.
 */
function process_initial_nodes(node, data)
{
    if (data.processed_initial_nodes)
	return;

    data.processed_initial_nodes = true;

    start_serializing_node(data.html_root, data);

    const new_added = [];
    /*
     * Front of this queue is the node to visit next; right behind it are its
     * ancestors, innermost first.
     */
    const nodes_to_process = [data.html_root];

    while (nodes_to_process.length > 0) {
	let current = nodes_to_process.shift();

	/* Descend first, keeping `current' queued as the parent. */
	if (current.firstChild) {
	    if (current.firstChild === node)
		break;
	    nodes_to_process.unshift(current.firstChild, current);
	    new_added.push([current.firstChild, current]);
	    continue;
	}

	/* No children: climb up until something has a next sibling. */
	while (current && !current.nextSibling)
	    current = nodes_to_process.shift();

	if (!current || current.nextSibling === node)
	    break;

	/* After the unshift, the sibling's parent is at index 1. */
	nodes_to_process.unshift(current.nextSibling);
	new_added.push([current.nextSibling, nodes_to_process[1]]);
    }

    data.new_added.unshift(...new_added);
}
205

  
206
/*
207
 * Important! Due to some weirdness node.parentElement is not always correct
208
 * in MutationRecords under Chromium. Track node relations manually.
209
 */
210
/*
 * Process a newly added `node' whose real parent is `true_parent'. The node is
 * appended to the `data.new_added' queue, which is then drained: every queued
 * [node, parent] pair gets sanitized (unless flagged `hachette_ignore'),
 * passed to the optional `node_eater' and recorded as its parent's last added
 * node, after the previously last added one has been finished.
 */
function handle_added_node(node, true_parent, data)
{
    /*
     * Functions we call here might cause new nodes to be injected or found
     * that require processing before the one we got in function argument.
     * We rely on those functions putting the node(s) they create/find at the
     * very beginning of the `new_added' queue and (for created nodes) setting
     * their `hachette_ignore' property, based on which their MutationRecord
     * will not be processed. A function can also mark a node already in the
     * `new_added' queue as not eligible for processing by setting its
     * `hachette_deleted' property.
     */

    process_initial_nodes(node, data);

    data.new_added.push([node, true_parent]);

    while (data.new_added.length > 0) {
	const [current, parent] = data.new_added.shift();

	/* Deletion is inherited from the parent. */
	if (parent.hachette_deleted)
	    current.hachette_deleted = true;
	if (current.hachette_deleted)
	    continue;

	if (!parent.hachette_started)
	    start_serializing_node(parent, data);

	if (!current.hachette_ignore)
	    sanitize_node(current, data);

	/* Sanitizing might have marked the node as deleted. */
	if (current.hachette_deleted)
	    continue;

	if (data.node_eater)
	    data.node_eater(current, parent);

	finish_serializing_node(parent.hachette_last_added, data);

	parent.hachette_last_added = current;
    }
}
252

  
253
/*
 * MutationObserver callback body: pass every added node that was not created
 * by ourselves to handle_added_node(), together with the mutation's target as
 * the node's parent.
 */
function handle_mutation(mutations, data)
{
    /*
     * Chromium: for an unknown reason mutation.target is not always the same as
     * node.parentElement. The former is the correct one.
     */
    for (const {target, addedNodes} of mutations) {
	for (const added of addedNodes) {
	    /* Nodes under a node of ours are considered ours as well. */
	    if (target.hachette_ignore)
		added.hachette_ignore = true;

	    if (!added.hachette_ignore)
		handle_added_node(added, target, data);
	}
    }
}
271

  
272
/*
 * Wrap up processing of the document: handle nodes and mutation records not
 * seen so far, stop observing, tidy trailing whitespace in <body> and finish
 * serializing the root element.
 */
function finish_processing(data)
{
    process_initial_nodes(undefined, data);

    /* From now on the last write is expected to trigger `finisher'. */
    data.finished = true;

    const pending_records = data.observer.takeRecords();
    handle_mutation(pending_records, data);
    data.observer.disconnect();

    /*
     * Additional whitespace that was after `</body>' gets appended to body.
     * Although it's a minor issue, it is not what we want. There's no way to
     * tell exactly what part of that whitespace was after `</body>' and what
     * was before, so we just replace it with a single newline which looks good
     * when printed.
     */
    const body = data.html_root.lastChild;
    const text = body && body.tagName === "BODY" && body.lastChild;
    if (text && text.nodeName === "#text") {
	const match = /^([\S\s]*\S)?\s*$/.exec(text.textContent);
	const without_trailing_space = match[1] || "";
	text.textContent = `${without_trailing_space}\n`;
    }

    finish_serializing_node(data.html_root, data);

    /*
     * The `finisher' callback should be called, if provided. Normally the
     * function that performs the last write does it after seeing `finished'
     * set to `true'. If, however, there's no `writer' callback and hence no
     * writes to perform, we need to take care of calling `finisher' here.
     */
    if (!data.writer && data.finisher)
	setTimeout(data.finisher, 0);
}
304

  
305
/*
306
 * This function sanitizes `html_root' according to `policy'. It is capable of
307
 * working on an HTML document that is being written to, sanitizing new nodes
308
 * as they appear.
309
 *
310
 * `consumers' object may contain 3 optional callback functions: `writer',
311
 * `node_eater' and `finisher'. The first one, if present, is called with chunks
312
 * of reconstructed HTML code. The second one, if present, gets called for every
313
 * added node with 2 arguments: that node and its parent. The third one is
314
 * called at the end, after all processing has been done.
315
 *
316
 * `modify_on_the_fly()' returns a callback that should be called (with no
317
 * arguments) once the document of html_root has finished being written to.
318
 * Unfortunately, due to the specific behavior of a document that has had its
319
 * documentElement replaced
320
 */
321
/*
 * Start sanitizing `html_root' according to `policy', observing it for nodes
 * that get added later. Callbacks from `consumers' are copied onto the
 * internal state object. Returns a function that finishes the processing.
 */
function modify_on_the_fly(html_root, policy, consumers)
{
    const uniq = gen_nonce();
    /*
     * The marker is matched literally, hence regex metacharacters (if the
     * nonce ever contains any) have to be escaped before interpolation.
     */
    const escaped_uniq = String(uniq).replace(/[.*+?^${}()|[\]\\]/g, "\\$&");
    const uniq_reg = new RegExp(`^([\\s\\S]*)${escaped_uniq}([\\s\\S]*)$`);
    const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
    Object.assign(data, consumers);

    const observer = new MutationObserver(m => handle_mutation(m, data));
    observer.observe(data.html_root, {
	attributes: true,
	childList: true,
	subtree: true
    });

    /* Kept so that finish_processing() can take records and disconnect. */
    data.observer = observer;

    return () => finish_processing(data);
}
339

  
340
/*
341
 * EXPORTS_START
342
 * EXPORT modify_on_the_fly
343
 * EXPORTS_END
344
 */

Also available in: Unified diff