/**
 * This file is part of Haketilo.
 *
 * Function: Modifying a web page using the StreamFilter API.
 *
 * Copyright (C) 2021, Wojtek Kosior
 * Copyright (C) 2018, Giorgio Maone <giorgio@maone.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * I, Wojtek Kosior, thereby promise not to sue for violation of this file's
 * license. Although I request that you do not make use this code in a
 * proprietary program, I am not going to enforce this in court.
 *
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORT csp_header_regex
 * IMPORTS_END
 */

/**
 * Check whether `charset' is an encoding label recognized by TextDecoder.
 *
 * Returns `charset' itself when the label is valid, `undefined' otherwise
 * (the TextDecoder constructor throws a RangeError on unknown labels).
 */
function validate_encoding(charset)
{
    try {
        /*
         * Bug fix: the label must actually be passed to the constructor;
         * `new TextDecoder()' never throws, so every charset — valid or
         * not — used to be reported as valid.
         */
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}
/**
 * Predicate: is `header' (an object with `name' and `value' properties,
 * as in webRequest's HttpHeaders) the Content-Type header?  Header names
 * are compared case-insensitively.
 */
function is_content_type_header(header)
{
    /*
     * Bug fix: the comparison's result was computed but never returned,
     * so the function always yielded `undefined' (falsy) and
     * `headers.filter(is_content_type_header)' matched nothing.
     */
    return header.name.toLowerCase().trim() === "content-type";
}
/* Matches the `charset=...' parameter of a Content-Type value
 * (e.g. `text/html; charset=utf-8'); group 1 captures the label. */
const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;
/**
 * Derive response properties from its HTTP headers.
 *
 * Returns an object that may contain:
 * - detected_charset: the first charset found in a Content-Type header
 *   that TextDecoder recognizes;
 * - html: true when some Content-Type value mentions "html".
 */
function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        const match = charset_reg.exec(header.value);
        /*
         * Bug fix: `exec()' returns null when the header carries no
         * `charset=' parameter; `match[1]' was previously dereferenced
         * unconditionally and threw a TypeError in that common case.
         */
        if (match && !properties.detected_charset &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];

        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}
/* Byte-order marks recognized at the start of a response body. */
const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

/**
 * Sniff the encoding from a byte-order mark at the beginning of `data'
 * (an indexable byte sequence).  Returns the charset label of the first
 * matching BOM, or "" when none matches.
 */
function charset_from_BOM(data)
{
    for (const [bom_bytes, charset] of BOMs) {
        if (bom_bytes.every((byte, idx) => data[idx] === byte))
            return charset;
    }

    return "";
}
/* Attribute patterns of `<meta>' tags that are able to declare a charset. */
const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
      charset_attrs.map(a => `head>meta[${a}]`).join(", ");

/**
 * Look for a charset declaration inside the `<head>' of parsed document
 * `doc'.  For each candidate `<meta>' tag, an explicit `charset' attribute
 * takes precedence; otherwise its `content' attribute is searched for a
 * `charset=' parameter.  Only labels accepted by TextDecoder are returned;
 * `undefined' means no usable declaration was found.
 */
function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const declared = meta.getAttribute("charset");
        if (declared && validate_encoding(declared))
            return declared;

        const content_match = charset_reg.exec(meta.getAttribute("content"));
        if (content_match && validate_encoding(content_match[1]))
            return content_match[1];
    }

    return undefined;
}
/**
 * Pick a TextDecoder for the response, trying in order: a byte-order mark
 * in `data', the charset detected from HTTP headers, a zero-byte heuristic
 * for mis-cached UTF-16, and finally charset `<meta>' tags sniffed from the
 * content itself.  Falls back to latin1, which can decode any byte stream.
 * Side effect: may set `properties.html' when the content looks like HTML.
 */
function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;
    if (!charset && data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    if (!charset) {
        /*
         * Bug fix: only sniff when no charset is known yet.  Previously a
         * charset already established from the BOM or the headers was
         * unconditionally overwritten — possibly with `undefined' — by the
         * `<meta>' tag lookup, contradicting the comment above.
         */
        const text = new TextDecoder("latin1").decode(data, {stream: true});
        properties.html = properties.html || /html/i.test(text);

        if (properties.html) {
            const tmp_doc = new DOMParser().parseFromString(text, "text/html");
            charset = charset_from_meta_tags(tmp_doc);
        }
    }

    return new TextDecoder(charset || "latin1");
}
/**
 * Heuristic: could this chunk of HTML (or a continuation of it) introduce
 * CSP rules through `http-equiv' `<meta>' tags?  Used to decide whether an
 * early dummy `<script>' has to be injected.
 */
function may_define_csp_rules(html)
{
    const parsed = new DOMParser().parseFromString(html, "text/html");

    for (const tag of parsed.querySelectorAll("head>meta[http-equiv]")) {
        if (csp_header_regex.test(tag.httpEquiv) && tag.content)
            return true;
    }

    /*
     * Even if no naughty `<meta>' tags were found, a subsequent chunk of
     * HTML data could still add some. Before returning `false' we need to
     * be sure we already reached the start of `<body>', where `<meta>'
     * tags are no longer valid.
     */
    const body = parsed.body;

    if (parsed.documentElement.nextSibling || body.nextSibling ||
        body.childNodes.length > 1)
        return false;

    const first = body.firstChild;
    if (!first)
        return true;

    if (first.nodeName !== "#text")
        return false;

    return /^(<\/|&#|.)$/.test(first.wholeText);
}
/**
 * StreamFilter `ondata' handler.  On the first chunk it sets up a
 * decoder/encoder pair on `properties'; every chunk is then re-encoded
 * as UTF-8 and written back to the filter.  When the document might
 * carry CSP rules in `<meta>' tags, a dummy `<script>' is injected to
 * make content scripts run early enough to sanitize them.
 */
function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
    }

    /*
     * Bug fix: decode with `stream: true' so that a multi-byte character
     * split across chunk boundaries is buffered until the next call
     * instead of being flushed as U+FFFD replacement characters.
     */
    let decoded = properties.decoder.decode(data, {stream: true});

    /* Force UTF-8, this is the only encoding we can produce. */
    if (first_chunk)
        properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
        /*
         * HAX! Our content scripts that execute at `document_start' will
         * always run before the first script in the document, but under
         * Mozilla some `<meta>' tags might already be loaded at that point.
         * Here we inject a dummy `<script>' at the beginning (before any
         * `<meta>' tags) that will force `document_start' to happen earlier.
         * This way our content scripts will be able to sanitize `http-equiv'
         * tags with CSP rules that would otherwise stop our injected scripts
         * from executing.
         *
         * As we want to only process HTML files that happen to have naughty
         * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
         * `may_define_csp_rules()'. We don't do any additional MIME sniffing
         * as it is too unreliable (and our heuristic will likely mark
         * non-HTML files as harmless anyway).
         */

        const dummy_script =
              `<script data-haketilo-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
            decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    /*
     * A UTF-8 stream needs no re-encoding, so the remaining chunks can pass
     * through unmodified (any required injection already happened above).
     */
    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}
/**
 * Install a StreamFilter on the response identified by `details.requestId'
 * so that its payload can be modified in-flight.  Does nothing unless
 * `policy.has_payload' is set.  Always returns `headers' unchanged.
 */
function apply_stream_filter(details, headers, policy)
{
    if (!policy.has_payload)
        return headers;

    const props = properties_from_headers(headers);
    props.policy = policy;

    const filter = browser.webRequest.filterResponseData(details.requestId);
    props.filter = filter;

    filter.ondata = event => filter_data(props, event);
    filter.onstop = () => filter.close();

    /*
     * In the future we might consider modifying the headers that specify
     * encoding. For now we are not yet doing it, though. However, we
     * prepend the data with UTF-8 BOM which should be enough.
     */
    return headers;
}

/*
 * EXPORTS_START
 * EXPORT apply_stream_filter
 * EXPORTS_END
 */