Revision 44958e6a
Added by koszko about 2 years ago
common/misc.js | ||
---|---|---|
178 | 178 |
return {name: header.name, value: new_csp.join('')}; |
179 | 179 |
} |
180 | 180 |
|
181 |
/* Regexes and objest to use as/in schemas for parse_json_with_schema(). */
|
|
181 |
/* Regexes and objects to use as/in schemas for parse_json_with_schema(). */
|
|
182 | 182 |
const nonempty_string_matcher = /.+/; |
183 | 183 |
|
184 | 184 |
const matchers = { |
content/main.js | ||
---|---|---|
16 | 16 |
* IMPORT is_chrome |
17 | 17 |
* IMPORT is_mozilla |
18 | 18 |
* IMPORT start_activity_info_server |
19 |
* IMPORT modify_on_the_fly |
|
19 |
* IMPORT csp_rule |
|
20 |
* IMPORT is_csp_header_name |
|
21 |
* IMPORT sanitize_csp_header |
|
20 | 22 |
* IMPORTS_END |
21 | 23 |
*/ |
22 | 24 |
|
... | ... | |
31 | 33 |
parent.hachette_corresponding.appendChild(clone); |
32 | 34 |
} |
33 | 35 |
|
36 |
/* |
|
37 |
* 1. When injecting some payload we need to sanitize <meta> CSP tags before |
|
38 |
* they reach the document. |
|
39 |
* 2. Only <meta> tags inside <head> are considered valid by the browser and |
|
40 |
* need to be considered. |
|
41 |
* 3. We want to detach <html> from document, wait until its <head> completes |
|
42 |
* loading, sanitize it and re-attach <html>. |
|
43 |
* 4. Browsers are eager to add <meta>'s that appear after `</head>' but before |
|
44 |
* `<body>'. Due to this behavior the `DOMContentLoaded' event is considered |
|
45 |
* unreliable (although it could still work properly, it is just problematic |
|
46 |
* to verify). |
|
47 |
* 5. We shall wait for anything to appear in or after <body> and take that as |
|
48 |
* a sign <head> has _really_ finished loading. |
|
49 |
*/ |
|
50 |
|
|
51 |
/*
 * Create a MutationObserver that watches the child list of `DOM_element' and
 * re-runs try_body_started() for `waiting' on every reported mutation. The
 * observer is returned so the caller can keep it and disconnect it later.
 */
function make_body_start_observer(DOM_element, waiting)
{
    const on_children_changed = () => try_body_started(waiting);

    const body_start_observer = new MutationObserver(on_children_changed);
    body_start_observer.observe(DOM_element, {childList: true});

    return body_start_observer;
}
|
57 |
|
|
58 |
/*
 * Check whether anything has appeared in or after <body>, which is taken as
 * the sign that <head> has finished loading.
 *
 * Returns `true' (after calling finish_waiting()) when the <body> found in
 * `waiting.detached_html' has a child or a following sibling, or when
 * something follows the current documentElement of `waiting.doc'. Otherwise
 * returns `undefined'; in that case, if a <body> exists and fewer than 2
 * observers are registered, an observer for that <body> is added.
 */
function try_body_started(waiting)
{
    const body = waiting.detached_html.querySelector("body");

    let head_completed =
        Boolean(body && (body.firstChild || body.nextSibling));
    /* Only look at the document itself when <body> gave no answer. */
    if (!head_completed)
        head_completed = Boolean(waiting.doc.documentElement.nextSibling);

    if (head_completed) {
        finish_waiting(waiting);
        return true;
    }

    if (!body || waiting.observers.length >= 2)
        return;

    waiting.observers.push(make_body_start_observer(body, waiting));
}
|
71 |
|
|
72 |
/*
 * Stop waiting for <head>: disconnect every observer listed in
 * `waiting.observers', remove the `DOMContentLoaded' listener and schedule
 * `waiting.callback' to run asynchronously.
 *
 * Both the mutation observers and the `DOMContentLoaded' listener end up
 * here, so the function is made idempotent: only the first call for a given
 * `waiting' object has any effect and `callback' is scheduled exactly once.
 */
function finish_waiting(waiting)
{
    if (waiting.finished)
        return;
    waiting.finished = true;

    for (const observer of waiting.observers)
        observer.disconnect();

    /* `loaded_cb' may still be unset here; nothing gets removed then. */
    waiting.doc.removeEventListener("DOMContentLoaded", waiting.loaded_cb);
    setTimeout(waiting.callback, 0);
}
|
78 |
|
|
79 |
/*
 * Arrange for `callback' to be called once <head> of `detached_html' can be
 * considered fully loaded. If that is already the case, try_body_started()
 * takes care of finishing and nothing more is set up. Otherwise `detached_html'
 * gets observed for new children and `DOMContentLoaded' on `doc' serves as a
 * fallback trigger.
 */
function _wait_for_head(doc, detached_html, callback)
{
    const waiting = {doc, detached_html, callback, observers: []};
    if (try_body_started(waiting))
        return;

    /*
     * The call above may already have registered an observer for an (empty)
     * <body>. Append to the list instead of replacing it, otherwise that
     * observer would be lost and finish_waiting() could never disconnect it.
     */
    waiting.observers.push(make_body_start_observer(detached_html, waiting));
    waiting.loaded_cb = () => finish_waiting(waiting);
    doc.addEventListener("DOMContentLoaded", waiting.loaded_cb);
}
|
89 |
|
|
90 |
/*
 * Promise-returning wrapper around _wait_for_head(). The promise's resolve
 * function is passed as the callback.
 */
function wait_for_head(doc, detached_html)
{
    return new Promise(resolve => {
        _wait_for_head(doc, detached_html, resolve);
    });
}
|
94 |
|
|
95 |
const blocked_str = "blocked";

/*
 * Disable attribute `attr' of `node' by moving its value to an attribute
 * called `blocked-<attr>' and removing the original.
 *
 * Disabling attributes this way allows them to still be relatively easily
 * accessed in case they contain some useful data.
 *
 * If `blocked-<attr>' (or `blocked-blocked-<attr>', etc.) is already present,
 * the existing values are each shifted one `blocked-' prefix further so that
 * nothing gets overwritten. Does nothing if `node' has no attribute `attr'.
 */
function block_attribute(node, attr)
{
    /*
     * Find the first free name in the chain `attr', `blocked-attr',
     * `blocked-blocked-attr', ... Parts have to be joined with "-" here so
     * that the names we probe are the same ones we write below.
     */
    const construct_name = [attr];
    while (node.hasAttribute(construct_name.join("-")))
        construct_name.unshift(blocked_str);

    /* Shift values starting from the longest name to avoid clobbering. */
    while (construct_name.length > 1) {
        construct_name.shift();
        const name = construct_name.join("-");
        node.setAttribute(`${blocked_str}-${name}`, node.getAttribute(name));
    }

    node.removeAttribute(attr);
}
|
115 |
|
|
116 |
/*
 * Sanitize a single <meta> element that carries a CSP rule.
 *
 * Elements with empty `content' or with an `http-equiv' value rejected by
 * is_csp_header_name(..., true) are left alone. For the others the `content'
 * attribute is disabled with block_attribute(). If is_csp_header_name(...,
 * false) also accepts the name, `content' is set anew to the value produced
 * by sanitize_csp_header() for `policy'.
 *
 * NOTE(review): the meaning of the second argument of is_csp_header_name() is
 * not visible here; presumably `true' accepts a wider set of header names --
 * confirm against its definition.
 */
function sanitize_meta(meta, policy)
{
    const header_name = meta.getAttribute("http-equiv");
    const original_rules = meta.content;

    if (!original_rules)
        return;
    if (!is_csp_header_name(header_name, true))
        return;

    block_attribute(meta, "content");

    if (!is_csp_header_name(header_name, false))
        return;

    const sanitized = sanitize_csp_header({value: original_rules}, policy);
    meta.content = sanitized.value;
}
|
129 |
|
|
130 |
/*
 * Put our own CSP in force for `doc' by appending a <meta http-equiv> element
 * to its <head>. The rule text is built by csp_rule() from `policy.nonce'.
 * `doc.head' must exist when this is called.
 */
function apply_hachette_csp_rules(doc, policy)
{
    const csp_meta = doc.createElement("meta");
    csp_meta.setAttribute("http-equiv", "Content-Security-Policy");

    const rule = csp_rule(policy.nonce);
    csp_meta.setAttribute("content", rule);

    doc.head.append(csp_meta);
    /* CSP is already in effect, we can remove the <meta> now. */
    csp_meta.remove();
}
|
139 |
|
|
140 |
/*
 * Apply our CSP to `doc' and sanitize CSP <meta> tags in its <head>.
 *
 * The original <html> element is swapped for an empty one, kept detached
 * until wait_for_head() resolves, has every "head meta" element passed to
 * sanitize_meta() and is then put back in place. The returned promise
 * resolves once <html> has been re-attached.
 */
async function sanitize_document(doc, policy)
{
    /*
     * Ensure our CSP rules are employed from the beginning. This CSP injection
     * method is, when possible, going to be applied together with CSP rules
     * injected using webRequest.
     */
    const head_was_missing = !doc.head;
    if (head_was_missing)
        doc.documentElement.prepend(doc.createElement("head"));

    apply_hachette_csp_rules(doc, policy);

    /* Probably not needed, but...: proceed with DOM in its initial state. */
    if (head_was_missing)
        doc.head.remove();

    /*
     * <html> node gets hijacked now, to be re-attached after <head> is loaded
     * and sanitized.
     */
    const page_html = doc.documentElement;
    const placeholder_html = doc.createElement("html");
    page_html.replaceWith(placeholder_html);

    await wait_for_head(doc, page_html);

    const head_metas = page_html.querySelectorAll("head meta");
    for (const meta of head_metas)
        sanitize_meta(meta, policy);

    placeholder_html.replaceWith(page_html);
}
|
172 |
|
|
34 | 173 |
if (!is_privileged_url(document.URL)) { |
35 | 174 |
const reductor = |
36 | 175 |
(ac, [_, sig, pol]) => ac[0] && ac || [extract_signed(sig, pol), sig]; |
... | ... | |
45 | 184 |
if (signature) |
46 | 185 |
document.cookie = `hachette-${signature}=; Max-Age=-1;`; |
47 | 186 |
|
48 |
handle_page_actions(policy.nonce); |
|
187 |
if (!policy.allow) |
|
188 |
sanitize_document(document, policy); |
|
49 | 189 |
|
50 |
if (!policy.allow) { |
|
51 |
const old_html = document.documentElement; |
|
52 |
const new_html = document.createElement("html"); |
|
53 |
old_html.replaceWith(new_html); |
|
54 |
old_html.hachette_corresponding = new_html; |
|
55 |
|
|
56 |
const modify_end = |
|
57 |
modify_on_the_fly(old_html, policy, {node_eater: accept_node}); |
|
58 |
document.addEventListener("DOMContentLoaded", modify_end); |
|
59 |
} |
|
190 |
handle_page_actions(policy.nonce); |
|
60 | 191 |
|
61 | 192 |
start_activity_info_server(); |
62 | 193 |
} |
content/sanitize_document.js | ||
---|---|---|
1 |
/** |
|
2 |
* Hachette modify HTML document as it loads and reconstruct HTML code from it |
|
3 |
* |
|
4 |
* Copyright (C) 2021 Wojtek Kosior |
|
5 |
* Redistribution terms are gathered in the `copyright' file. |
|
6 |
*/ |
|
7 |
|
|
8 |
/* |
|
9 |
* IMPORTS_START |
|
10 |
* IMPORT gen_nonce |
|
11 |
* IMPORT csp_rule |
|
12 |
* IMPORT is_csp_header_name |
|
13 |
* IMPORT sanitize_csp_header |
|
14 |
* IMPORT sanitize_attributes |
|
15 |
* IMPORTS_END |
|
16 |
*/ |
|
17 |
|
|
18 |
/* |
|
19 |
* Functions that sanitize elements. The script blocking measures are, when |
|
20 |
* possible, going to be applied together with CSP rules injected using |
|
21 |
* webRequest. |
|
22 |
*/ |
|
23 |
|
|
24 |
const blocked = "blocked"; |
|
25 |
|
|
26 |
function block_attribute(node, attr) |
|
27 |
{ |
|
28 |
/* |
|
29 |
* Disabling attributed this way allows them to still be relatively |
|
30 |
* easily accessed in case they contain some useful data. |
|
31 |
*/ |
|
32 |
|
|
33 |
const construct_name = [attr]; |
|
34 |
while (node.hasAttribute(construct_name.join(""))) |
|
35 |
construct_name.unshift(blocked); |
|
36 |
|
|
37 |
while (construct_name.length > 1) { |
|
38 |
construct_name.shift(); |
|
39 |
const name = construct_name.join(""); |
|
40 |
node.setAttribute(`${blocked}-${name}`, node.getAttribute(name)); |
|
41 |
} |
|
42 |
|
|
43 |
node.removeAttribute(attr); |
|
44 |
} |
|
45 |
|
|
46 |
function sanitize_script(script, data) |
|
47 |
{ |
|
48 |
if (script.getAttribute("data-hachette-deleteme") === data.policy.nonce) { |
|
49 |
script.remove(); |
|
50 |
script.hachette_deleted = true; |
|
51 |
script.hachette_ignore = true; |
|
52 |
} |
|
53 |
|
|
54 |
if (data.policy.allow) |
|
55 |
return; |
|
56 |
|
|
57 |
block_attribute(script, "type"); |
|
58 |
script.setAttribute("type", "application/json"); |
|
59 |
} |
|
60 |
|
|
61 |
function inject_csp(head, data) |
|
62 |
{ |
|
63 |
if (data.policy.allow) |
|
64 |
return; |
|
65 |
|
|
66 |
const meta = document.createElement("meta"); |
|
67 |
meta.setAttribute("http-equiv", "Content-Security-Policy"); |
|
68 |
meta.setAttribute("content", csp_rule(data.policy.nonce)); |
|
69 |
meta.hachette_ignore = true; |
|
70 |
head.prepend(meta); |
|
71 |
|
|
72 |
data.new_added.unshift([meta, head]); |
|
73 |
} |
|
74 |
|
|
75 |
function sanitize_http_equiv_csp_rule(meta, data) |
|
76 |
{ |
|
77 |
const http_equiv = meta.getAttribute("http-equiv"); |
|
78 |
const value = meta.content; |
|
79 |
|
|
80 |
if (!value || !is_csp_header_name(http_equiv, !data.policy.allow)) |
|
81 |
return; |
|
82 |
|
|
83 |
block_attribute(meta, "content"); |
|
84 |
|
|
85 |
if (data.policy.allow || is_csp_header_name(http_equiv, false)) |
|
86 |
meta.content = sanitize_csp_header({value}, data.policy).value; |
|
87 |
} |
|
88 |
|
|
89 |
function sanitize_node(node, data) |
|
90 |
{ |
|
91 |
if (node.tagName === "SCRIPT") |
|
92 |
sanitize_script(node, data); |
|
93 |
|
|
94 |
if (node.tagName === "HEAD") |
|
95 |
inject_csp(node, data); |
|
96 |
|
|
97 |
if (node.tagName === "META") |
|
98 |
sanitize_http_equiv_csp_rule(node, data); |
|
99 |
|
|
100 |
if (!data.policy.allow) |
|
101 |
sanitize_attributes(node, data); |
|
102 |
} |
|
103 |
|
|
104 |
/* |
|
105 |
* Instead of calling writer directly with multiple small chunks of reconstruced |
|
106 |
* HTML code, we utilize `setTimeout()' to only have it called once, |
|
107 |
* asynchronously. |
|
108 |
*/ |
|
109 |
function do_write_callback(data) |
|
110 |
{ |
|
111 |
data.writer(data.chunks.join("")); |
|
112 |
data.chunks = []; |
|
113 |
|
|
114 |
if (data.finished && data.finisher) |
|
115 |
data.finisher(); |
|
116 |
} |
|
117 |
|
|
118 |
function do_write(chunk, data) |
|
119 |
{ |
|
120 |
data.chunks.push(chunk); |
|
121 |
clearTimeout(data.write_timeout); |
|
122 |
data.write_timeout = setTimeout(() => do_write_callback(data), 0); |
|
123 |
} |
|
124 |
|
|
125 |
const serializer = new XMLSerializer(); |
|
126 |
|
|
127 |
function start_serializing_node(node, data) |
|
128 |
{ |
|
129 |
node.hachette_started = true; |
|
130 |
|
|
131 |
if (!data.writer) |
|
132 |
return; |
|
133 |
|
|
134 |
const clone = node.cloneNode(false); |
|
135 |
clone.textContent = data.uniq; |
|
136 |
do_write(data.uniq_reg.exec(clone.outerHTML)[1], data); |
|
137 |
} |
|
138 |
|
|
139 |
function finish_serializing_node(node, data) |
|
140 |
{ |
|
141 |
const nodes_to_process = [node]; |
|
142 |
|
|
143 |
while (true) { |
|
144 |
node = nodes_to_process.pop(); |
|
145 |
if (!node) |
|
146 |
break; |
|
147 |
|
|
148 |
nodes_to_process.push(node, node.hachette_last_added); |
|
149 |
} |
|
150 |
|
|
151 |
while (nodes_to_process.length > 0) { |
|
152 |
const node = nodes_to_process.pop(); |
|
153 |
node.remove(); |
|
154 |
node.hachette_ignore = true; |
|
155 |
|
|
156 |
if (!data.writer) |
|
157 |
continue; |
|
158 |
|
|
159 |
if (node.hachette_started) { |
|
160 |
node.textContent = data.uniq; |
|
161 |
do_write(data.uniq_reg.exec(node.outerHTML)[2], data); |
|
162 |
continue; |
|
163 |
} |
|
164 |
|
|
165 |
do_write(node.outerHTML || serializer.serializeToString(node), data); |
|
166 |
} |
|
167 |
} |
|
168 |
|
|
169 |
function process_initial_nodes(node, data) |
|
170 |
{ |
|
171 |
if (data.processed_initial_nodes) |
|
172 |
return; |
|
173 |
|
|
174 |
data.processed_initial_nodes = true; |
|
175 |
|
|
176 |
start_serializing_node(data.html_root, data); |
|
177 |
|
|
178 |
const new_added = []; |
|
179 |
const nodes_to_process = [data.html_root]; |
|
180 |
|
|
181 |
let i = 0; |
|
182 |
while (nodes_to_process.length > 0) { |
|
183 |
let current = nodes_to_process.shift(); |
|
184 |
|
|
185 |
if (current.firstChild) { |
|
186 |
if (current.firstChild === node) |
|
187 |
break; |
|
188 |
nodes_to_process.unshift(current.firstChild, current); |
|
189 |
new_added.push([current.firstChild, current]); |
|
190 |
continue; |
|
191 |
} |
|
192 |
|
|
193 |
while (current && !current.nextSibling) |
|
194 |
current = nodes_to_process.shift(); |
|
195 |
|
|
196 |
if (!current || current.nextSibling === node) |
|
197 |
break; |
|
198 |
|
|
199 |
nodes_to_process.unshift(current.nextSibling); |
|
200 |
new_added.push([current.nextSibling, nodes_to_process[1]]); |
|
201 |
} |
|
202 |
|
|
203 |
data.new_added.unshift(...new_added); |
|
204 |
} |
|
205 |
|
|
206 |
/* |
|
207 |
* Important! Due to some weirdness node.parentElement is not alway correct |
|
208 |
* in MutationRecords under Chromium. Track node relations manually. |
|
209 |
*/ |
|
210 |
function handle_added_node(node, true_parent, data) |
|
211 |
{ |
|
212 |
/* |
|
213 |
* Functions we call here might cause new nodes to be injected or found |
|
214 |
* that require processing before the one we got in function argument. |
|
215 |
* We rely on those functions putting the node(s) they create/find at the |
|
216 |
* very beginning of the `new_added' queue and (for created nodes) setting |
|
217 |
* their `hachette_ignore' property, based on which their MutationRecord |
|
218 |
* will not be processed. A function can also mark a node already in the |
|
219 |
* `new_added' queue as not eligible for processing by setting its |
|
220 |
* `hachette_deleted' property. |
|
221 |
*/ |
|
222 |
|
|
223 |
process_initial_nodes(node, data); |
|
224 |
|
|
225 |
data.new_added.push([node, true_parent]); |
|
226 |
|
|
227 |
while (data.new_added.length > 0) { |
|
228 |
[node, true_parent] = data.new_added.shift(); |
|
229 |
|
|
230 |
if (true_parent.hachette_deleted) |
|
231 |
node.hachette_deleted = true; |
|
232 |
if (node.hachette_deleted) |
|
233 |
continue; |
|
234 |
|
|
235 |
if (!true_parent.hachette_started) |
|
236 |
start_serializing_node(true_parent, data) |
|
237 |
|
|
238 |
if (!node.hachette_ignore) |
|
239 |
sanitize_node(node, data); |
|
240 |
|
|
241 |
if (node.hachette_deleted) |
|
242 |
continue; |
|
243 |
|
|
244 |
if (data.node_eater) |
|
245 |
data.node_eater(node, true_parent); |
|
246 |
|
|
247 |
finish_serializing_node(true_parent.hachette_last_added, data); |
|
248 |
|
|
249 |
true_parent.hachette_last_added = node; |
|
250 |
} |
|
251 |
} |
|
252 |
|
|
253 |
function handle_mutation(mutations, data) |
|
254 |
{ |
|
255 |
/* |
|
256 |
* Chromium: for an unknown reason mutation.target is not always the same as |
|
257 |
* node.parentElement. The former is the correct one. |
|
258 |
*/ |
|
259 |
for (const mutation of mutations) { |
|
260 |
for (const node of mutation.addedNodes) { |
|
261 |
/* Check for nodes added by ourselves. */ |
|
262 |
if (mutation.target.hachette_ignore) |
|
263 |
node.hachette_ignore = true; |
|
264 |
if (node.hachette_ignore) |
|
265 |
continue; |
|
266 |
|
|
267 |
handle_added_node(node, mutation.target, data); |
|
268 |
} |
|
269 |
} |
|
270 |
} |
|
271 |
|
|
272 |
function finish_processing(data) |
|
273 |
{ |
|
274 |
process_initial_nodes(undefined, data); |
|
275 |
|
|
276 |
/* |
|
277 |
* The `finisher' callback should be called, if provided. Normally our |
|
278 |
* function that performs the last write does it after seeing `finished' |
|
279 |
* set to `true'. If, however, there's no `writer' callback and hence no |
|
280 |
* writes to perform, we need to take care of calling `finisher' here. |
|
281 |
*/ |
|
282 |
data.finished = true; |
|
283 |
handle_mutation(data.observer.takeRecords(), data); |
|
284 |
data.observer.disconnect(); |
|
285 |
|
|
286 |
/* |
|
287 |
* Additional whitespace that was after `</body>' gets appended to body. |
|
288 |
* Although it's a minor issue, it is not what we want. There's no way to |
|
289 |
* tell exactly what part of that whitespace was after `</body>' and what |
|
290 |
* was before, so we just replace it with a single newline which looks good |
|
291 |
* when printed. |
|
292 |
*/ |
|
293 |
const body = data.html_root.lastChild; |
|
294 |
const text = body && body.tagName === "BODY" && body.lastChild; |
|
295 |
if (text && text.nodeName === "#text") { |
|
296 |
const new_content = /^([\S\s]*\S)?\s*$/.exec(text.textContent)[1] || ""; |
|
297 |
text.textContent = new_content + "\n"; |
|
298 |
} |
|
299 |
|
|
300 |
finish_serializing_node(data.html_root, data); |
|
301 |
if (!data.writer && data.finisher) |
|
302 |
setTimeout(data.finisher, 0); |
|
303 |
} |
|
304 |
|
|
305 |
/* |
|
306 |
* This function sanitizes `html_root' according to `policy'. It is capable of |
|
307 |
* working on an HTML document that is being written to, sanitizing new nodes |
|
308 |
* as they appear. |
|
309 |
* |
|
310 |
* `consumers' object may contain 3 optional callback functions: `writer', |
|
311 |
* `node_eater' and `finisher'. The first one, if present, is called with chunks |
|
312 |
* of reconstructed HTML code. The second one, if present, gets called for every |
|
313 |
* added node with 2 arguments: that node and its parent. The third one is |
|
314 |
* called at the end, after all processing has been done. |
|
315 |
* |
|
316 |
* `modify_on_the_fly()' returns a callback that should be called (with no |
|
317 |
* arguments) once the document of html_root has finished being written to. |
|
318 |
* Unfortunately, due to specifics behavior of document that has had its |
|
319 |
* documentElement replaced |
|
320 |
*/ |
|
321 |
function modify_on_the_fly(html_root, policy, consumers) |
|
322 |
{ |
|
323 |
const uniq = gen_nonce(); |
|
324 |
const uniq_reg = new RegExp(`^([\\s\\S]*)${uniq}([\\s\\S]*)$`); |
|
325 |
const data = {policy, html_root, uniq, uniq_reg, chunks: [], new_added: []}; |
|
326 |
Object.assign(data, consumers); |
|
327 |
|
|
328 |
var observer = new MutationObserver(m => handle_mutation(m, data)); |
|
329 |
observer.observe(data.html_root, { |
|
330 |
attributes: true, |
|
331 |
childList: true, |
|
332 |
subtree: true |
|
333 |
}); |
|
334 |
|
|
335 |
data.observer = observer; |
|
336 |
|
|
337 |
return () => finish_processing(data); |
|
338 |
} |
|
339 |
|
|
340 |
/* |
|
341 |
* EXPORTS_START |
|
342 |
* EXPORT modify_on_the_fly |
|
343 |
* EXPORTS_END |
|
344 |
*/ |
Also available in: Unified diff
implement rethinked tags sanitizing approach
This has not been tested yet. Additionally, functionality for blocking of `data:' urls needs to be re-enabled.