1
|
/**
|
2
|
* Hachette modify HTML document as it loads and reconstruct HTML code from it
|
3
|
*
|
4
|
* Copyright (C) 2021 Wojtek Kosior
|
5
|
* Redistribution terms are gathered in the `copyright' file.
|
6
|
*/
|
7
|
|
8
|
/*
|
9
|
* IMPORTS_START
|
10
|
* IMPORT gen_nonce
|
11
|
* IMPORT csp_rule
|
12
|
* IMPORT is_csp_header_name
|
13
|
* IMPORT sanitize_csp_header
|
14
|
* IMPORT sanitize_attributes
|
15
|
* IMPORTS_END
|
16
|
*/
|
17
|
|
18
|
/*
|
19
|
* Functions that sanitize elements. The script blocking measures are, when
|
20
|
* possible, going to be applied together with CSP rules injected using
|
21
|
* webRequest.
|
22
|
*/
|
23
|
|
24
|
const blocked = "blocked";

/*
 * Disable attribute `attr' of `node' by moving its value to `blocked-<attr>'.
 *
 * Disabling attributes this way allows them to still be relatively easily
 * accessed in case they contain some useful data.
 *
 * If `blocked-<attr>' is already taken, its value is first moved to
 * `blocked-blocked-<attr>' and so on, so that no previously stored value gets
 * overwritten. When `node' has no `attr' attribute, nothing is changed.
 */
function block_attribute(node, attr)
{
    /*
     * Build the chain attr, blocked-attr, blocked-blocked-attr, ... up to and
     * including the first name that is not yet present on the node.
     */
    const names = [attr];
    while (node.hasAttribute(names[names.length - 1]))
        names.push(`${blocked}-${names[names.length - 1]}`);

    /*
     * Shift every value one step up the chain. We start from the far end so
     * that each value is copied before its slot gets overwritten.
     */
    for (let i = names.length - 1; i > 0; i--)
        node.setAttribute(names[i], node.getAttribute(names[i - 1]));

    node.removeAttribute(attr);
}
|
45
|
|
46
|
/*
 * Handle a <script> element.
 *
 * A script whose `data-hachette-deleteme' attribute equals the policy nonce is
 * removed from the document and flagged as both deleted and ignored. Unless
 * the policy allows scripts, the `type' attribute is then moved aside with
 * block_attribute() and replaced with "application/json" (presumably so that
 * the browser does not execute the content).
 */
function sanitize_script(script, data)
{
    const marker = script.getAttribute("data-hachette-deleteme");

    if (marker === data.policy.nonce) {
        script.remove();
        script.hachette_deleted = true;
        script.hachette_ignore = true;
    }

    if (!data.policy.allow) {
        block_attribute(script, "type");
        script.setAttribute("type", "application/json");
    }
}
|
60
|
|
61
|
/*
 * Unless the policy allows scripts, put a <meta> element carrying our
 * Content-Security-Policy rule at the very beginning of `head'.
 *
 * The created element is flagged with `hachette_ignore' and queued at the
 * front of `data.new_added' together with its parent.
 */
function inject_csp(head, data)
{
    if (!data.policy.allow) {
        const csp_meta = document.createElement("meta");

        csp_meta.setAttribute("http-equiv", "Content-Security-Policy");
        csp_meta.setAttribute("content", csp_rule(data.policy.nonce));
        csp_meta.hachette_ignore = true;

        head.prepend(csp_meta);
        data.new_added.unshift([csp_meta, head]);
    }
}
|
74
|
|
75
|
/*
 * Neutralize a CSP rule delivered through <meta http-equiv=...>.
 *
 * Nothing happens when the element has no content or when
 * is_csp_header_name() does not recognize its `http-equiv' value. Otherwise
 * the `content' attribute is moved aside with block_attribute() and, in some
 * cases, a sanitized version of the rule is written back.
 *
 * NOTE(review): the meaning of the second argument of is_csp_header_name()
 * is not visible from this file - confirm against its definition.
 */
function sanitize_http_equiv_csp_rule(meta, data)
{
    const header_name = meta.getAttribute("http-equiv");
    const rule = meta.content;

    if (!rule)
        return;
    if (!is_csp_header_name(header_name, !data.policy.allow))
        return;

    block_attribute(meta, "content");

    const write_back =
          data.policy.allow || is_csp_header_name(header_name, false);

    if (write_back) {
        const sanitized = sanitize_csp_header({value: rule}, data.policy);
        meta.content = sanitized.value;
    }
}
|
88
|
|
89
|
/*
 * Dispatch `node' to the sanitizing routine appropriate for its tag and then,
 * unless the policy allows scripts, also sanitize its attributes.
 */
function sanitize_node(node, data)
{
    switch (node.tagName) {
    case "SCRIPT":
        sanitize_script(node, data);
        break;
    case "HEAD":
        inject_csp(node, data);
        break;
    case "META":
        sanitize_http_equiv_csp_rule(node, data);
        break;
    }

    if (!data.policy.allow)
        sanitize_attributes(node, data);
}
|
103
|
|
104
|
/*
 * Instead of calling writer directly with multiple small chunks of
 * reconstructed HTML code, we utilize `setTimeout()' to only have it called
 * once, asynchronously.
 *
 * This function performs the actual write: it hands all chunks gathered so
 * far to `data.writer' as a single string. If `data.finished' is set and a
 * `data.finisher' callback is present, the latter is called afterwards.
 */
function do_write_callback(data)
{
    /*
     * Swap the buffer out before calling the writer. This way a chunk that
     * gets queued while the writer is running lands in the fresh buffer
     * instead of being discarded by the reset.
     */
    const pending = data.chunks;
    data.chunks = [];

    data.writer(pending.join(""));

    if (data.finished && data.finisher)
        data.finisher();
}
|
117
|
|
118
|
/*
 * Queue `chunk' for writing and (re)schedule the flush. Any previously
 * scheduled flush is cancelled, so a burst of calls results in a single
 * do_write_callback() invocation.
 */
function do_write(chunk, data)
{
    data.chunks.push(chunk);

    clearTimeout(data.write_timeout);

    const flush = () => do_write_callback(data);
    data.write_timeout = setTimeout(flush, 0);
}
|
124
|
|
125
|
/*
 * Shared XMLSerializer instance. finish_serializing_node() falls back to it
 * for nodes whose `outerHTML' is missing or empty.
 */
const serializer = new XMLSerializer();
|
126
|
|
127
|
/*
 * Mark `node' as started and, if a writer is present, queue the part of its
 * HTML code that precedes its contents.
 *
 * That part is obtained from a childless clone of the node: the clone's text
 * is set to the unique token `data.uniq' and `data.uniq_reg' is used to
 * capture everything that precedes the token in the clone's `outerHTML'.
 */
function start_serializing_node(node, data)
{
    node.hachette_started = true;

    if (data.writer) {
        const shell = node.cloneNode(false);
        shell.textContent = data.uniq;

        const match = data.uniq_reg.exec(shell.outerHTML);
        do_write(match[1], data);
    }
}
|
138
|
|
139
|
/*
 * Finish serializing `node' together with the chain of nodes reachable from
 * it through `hachette_last_added'. A falsy `node' makes this a no-op.
 *
 * Nodes are handled starting from the far end of the chain. Each one gets
 * removed from the document and flagged with `hachette_ignore'. When a writer
 * is present, a node that has already been started only has the part of its
 * HTML code that follows its contents queued, while any other node is queued
 * in its entirety.
 */
function finish_serializing_node(node, data)
{
    const chain = [];
    for (let link = node; link; link = link.hachette_last_added)
        chain.push(link);

    for (let idx = chain.length - 1; idx >= 0; idx--) {
        const finished = chain[idx];

        finished.remove();
        finished.hachette_ignore = true;

        if (!data.writer)
            continue;

        if (finished.hachette_started) {
            /* The token replaces the contents, which were written already. */
            finished.textContent = data.uniq;
            const match = data.uniq_reg.exec(finished.outerHTML);
            do_write(match[2], data);
        } else {
            const code =
                  finished.outerHTML || serializer.serializeToString(finished);
            do_write(code, data);
        }
    }
}
|
168
|
|
169
|
/*
 * On the first call, start serializing `data.html_root' and walk the nodes
 * already present under it, in document order, until `node' is reached (or
 * until there are no more nodes, e.g. when `node' is undefined). Every node
 * found is put, paired with its parent, at the front of `data.new_added'.
 * Subsequent calls do nothing.
 *
 * The walk never leaves the subtree of `data.html_root': siblings of the root
 * itself are not visited, so every queued pair has a defined parent.
 */
function process_initial_nodes(node, data)
{
    if (data.processed_initial_nodes)
        return;

    data.processed_initial_nodes = true;

    start_serializing_node(data.html_root, data);

    const new_added = [];
    /* A stack: the node being visited in front, followed by its ancestors. */
    const nodes_to_process = [data.html_root];

    while (nodes_to_process.length > 0) {
        let current = nodes_to_process.shift();

        if (current.firstChild) {
            if (current.firstChild === node)
                break;
            nodes_to_process.unshift(current.firstChild, current);
            new_added.push([current.firstChild, current]);
            continue;
        }

        /* No children here; climb up until some node has a next sibling. */
        while (current && !current.nextSibling)
            current = nodes_to_process.shift();

        /*
         * An empty stack means `current' is `html_root' itself. Its sibling
         * lies outside the tree we process and has no tracked parent.
         */
        if (!current || nodes_to_process.length === 0 ||
            current.nextSibling === node)
            break;

        nodes_to_process.unshift(current.nextSibling);
        new_added.push([current.nextSibling, nodes_to_process[1]]);
    }

    data.new_added.unshift(...new_added);
}
|
205
|
|
206
|
/*
 * Important! Due to some weirdness node.parentElement is not always correct
 * in MutationRecords under Chromium. Track node relations manually.
 *
 * Process a node that appeared in the document. `true_parent' is the parent
 * as reported by the caller. The node is appended to the `data.new_added'
 * queue, which then gets processed until it is empty.
 */
function handle_added_node(node, true_parent, data)
{
    /*
     * Functions called from here might inject or discover nodes that have to
     * be processed before the one we were given. They are expected to put
     * such nodes at the very beginning of the `new_added' queue and (for
     * nodes they created) to set the `hachette_ignore' property, which keeps
     * the corresponding MutationRecord from being processed. A node already
     * waiting in the queue can also be excluded from processing by setting
     * its `hachette_deleted' property.
     */

    process_initial_nodes(node, data);
    data.new_added.push([node, true_parent]);

    while (data.new_added.length !== 0) {
        const [child, parent] = data.new_added.shift();

        /* Deletion of a parent extends to its children. */
        if (parent.hachette_deleted)
            child.hachette_deleted = true;
        if (child.hachette_deleted)
            continue;

        if (!parent.hachette_started)
            start_serializing_node(parent, data);

        if (!child.hachette_ignore)
            sanitize_node(child, data);

        /* Sanitizing might have marked the node as deleted. */
        if (!child.hachette_deleted) {
            if (data.node_eater)
                data.node_eater(child, parent);

            /* The previous child of this parent is now complete. */
            finish_serializing_node(parent.hachette_last_added, data);
            parent.hachette_last_added = child;
        }
    }
}
|
252
|
|
253
|
/*
 * Pass every added node found in `mutations' to handle_added_node(), skipping
 * nodes flagged with `hachette_ignore'.
 *
 * Chromium: for an unknown reason mutation.target is not always the same as
 * node.parentElement. The former is the correct one, hence it is what we pass
 * along as the parent.
 */
function handle_mutation(mutations, data)
{
    for (const {target, addedNodes} of mutations) {
        for (const added of addedNodes) {
            /* Nodes added under a node of ours are ours as well. */
            if (target.hachette_ignore)
                added.hachette_ignore = true;

            if (!added.hachette_ignore)
                handle_added_node(added, target, data);
        }
    }
}
|
271
|
|
272
|
/*
 * Finalize processing: handle nodes and mutation records not handled so far,
 * stop observing, tidy up trailing whitespace in <body> and finish
 * serializing `data.html_root'.
 */
function finish_processing(data)
{
    process_initial_nodes(undefined, data);

    data.finished = true;

    const {observer} = data;
    handle_mutation(observer.takeRecords(), data);
    observer.disconnect();

    /*
     * Additional whitespace that was after `</body>' gets appended to body.
     * Although it's a minor issue, it is not what we want. There's no way to
     * tell exactly what part of that whitespace was after `</body>' and what
     * was before, so we just replace it with a single newline which looks
     * good when printed.
     */
    const last = data.html_root.lastChild;
    const trailing = last && last.tagName === "BODY" && last.lastChild;

    if (trailing && trailing.nodeName === "#text") {
        const match = /^([\S\s]*\S)?\s*$/.exec(trailing.textContent);
        trailing.textContent = (match[1] || "") + "\n";
    }

    finish_serializing_node(data.html_root, data);

    /*
     * The `finisher' callback should be called, if provided. Normally the
     * function that performs the last write does it after seeing `finished'
     * set to `true'. If, however, there's no `writer' callback and hence no
     * writes to perform, we need to take care of calling `finisher' here.
     */
    if (data.finisher && !data.writer)
        setTimeout(data.finisher, 0);
}
|
304
|
|
305
|
/*
 * This function sanitizes `html_root' according to `policy'. It is capable of
 * working on an HTML document that is being written to, sanitizing new nodes
 * as they appear.
 *
 * `consumers' object may contain 3 optional callback functions: `writer',
 * `node_eater' and `finisher'. The first one, if present, is called with chunks
 * of reconstructed HTML code. The second one, if present, gets called for every
 * added node with 2 arguments: that node and its parent. The third one is
 * called at the end, after all processing has been done.
 *
 * `modify_on_the_fly()' returns a callback that should be called (with no
 * arguments) once the document of html_root has finished being written to.
 * NOTE(review): the original comment went on to mention the specific behavior
 * of a document that has had its documentElement replaced, but that sentence
 * was left unfinished. Presumably that behavior is the reason the end of
 * writing has to be signalled by the caller - TODO confirm.
 */
function modify_on_the_fly(html_root, policy, consumers)
{
    /*
     * `uniq' is a token put in place of a node's contents so that `uniq_reg'
     * can split the node's HTML code into the part before the contents
     * (group 1) and the part after them (group 2).
     */
    const uniq = gen_nonce();
    const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`);
    const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []};
    Object.assign(data, consumers);

    const observer = new MutationObserver(m => handle_mutation(m, data));
    observer.observe(data.html_root, {
        attributes: true,
        childList: true,
        subtree: true
    });

    data.observer = observer;

    return () => finish_processing(data);
}
|
339
|
|
340
|
/*
|
341
|
* EXPORTS_START
|
342
|
* EXPORT modify_on_the_fly
|
343
|
* EXPORTS_END
|
344
|
*/
|