/**
 * This file is part of Haketilo.
 *
 * Function: Modifying a web page using the StreamFilter API.
 *
 * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORT csp_header_regex
 * IMPORTS_END
 */

function validate_encoding(charset)
{
    try {
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}
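
/*
 * For illustration: the TextDecoder constructor throws a RangeError when
 * given a label it does not recognize, which is what makes the check above
 * work, e.g.
 *     validate_encoding("utf-8")   // -> "utf-8"
 *     validate_encoding("bogus")   // -> undefined
 */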

function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;
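
/*
 * For illustration, `charset_reg' captures the charset name from a MIME
 * type parameter, e.g.
 *     charset_reg.exec("text/html; charset=ISO-8859-2")[1]  // "ISO-8859-2"
 */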

function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        const match = charset_reg.exec(header.value);
        if (match && !properties.detected_charset &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];

        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}
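
/*
 * For illustration (header objects as delivered by the webRequest API):
 *     properties_from_headers([
 *         {name: "Content-Type", value: "text/html; charset=utf-8"}
 *     ])
 *     // -> {detected_charset: "utf-8", html: true}
 */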

const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
            return charset;
    }

    return "";
}
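
/*
 * For illustration:
 *     charset_from_BOM(new Uint8Array([0xff, 0xfe, 0x68, 0x00]))  // "utf-16le"
 *     charset_from_BOM(new Uint8Array([0x3c, 0x68, 0x74, 0x6d]))  // ""
 */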

const charset_attrs =
    ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
    charset_attrs.map(a => `head>meta[${a}]`).join(", ");
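
/*
 * The resulting selector is:
 *     head>meta[charset], head>meta[http-equiv="content-type"],
 *     head>meta[content*="charset"]
 */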

function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const maybe_charset = meta.getAttribute("charset");
        if (maybe_charset && validate_encoding(maybe_charset))
            return maybe_charset;

        const match = charset_reg.exec(meta.getAttribute("content"));
        if (match && validate_encoding(match[1]))
            return match[1];
    }

    return undefined;
}
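
/*
 * For illustration, both of the following `<head>' children yield a charset:
 *     <meta charset="utf-8">
 *     <meta http-equiv="content-type" content="text/html; charset=iso-8859-2">
 */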

function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;
    if (!charset && data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    if (properties.html) {
        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
        charset = charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(charset || "latin1");
}
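
/*
 * To summarize, the charset is resolved in order of reliability: a byte
 * order mark wins, then the HTTP `Content-Type' header, then (for HTML)
 * `<meta>' tags sniffed from the first chunk, with latin1 as the fallback
 * that at least cannot throw.
 */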

function may_define_csp_rules(html)
{
    const doc = new DOMParser().parseFromString(html, "text/html");

    for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
        if (csp_header_regex.test(meta.httpEquiv) && meta.content)
            return true;
    }

    /*
     * Even if no naughty `<meta>' tags were found, a subsequent chunk of
     * HTML data could add some. Before we return `false' we need to be
     * sure we reached the start of `<body>' where `<meta>' tags are no
     * longer valid.
     */

    if (doc.documentElement.nextSibling || doc.body.nextSibling ||
        doc.body.childNodes.length > 1)
        return false;

    if (!doc.body.firstChild)
        return true;

    if (doc.body.firstChild.nodeName !== "#text")
        return false;

    return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
}
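
/*
 * The last test accepts only leftovers that look like a truncated tag or
 * character reference ("</", "&#", or a single character); anything longer
 * means real `<body>' content has started and `<head>' is behind us.
 */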

function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
    }

    let decoded = properties.decoder.decode(data, {stream: true});

    /* Force UTF-8, this is the only encoding we can produce. */
    if (first_chunk)
        properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
        /*
         * HAX! Our content scripts that execute at `document_start' will
         * always run before the first script in the document, but under
         * Mozilla some `<meta>' tags might already be loaded at that point.
         * Here we inject a dummy `<script>' at the beginning (before any
         * `<meta>' tags) that will force `document_start' to happen earlier.
         * This way our content scripts will be able to sanitize `http-equiv'
         * tags with CSP rules that would otherwise stop our injected scripts
         * from executing.
         *
         * As we want to only process HTML files that happen to have naughty
         * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
         * `may_define_csp_rules()'. We don't do any additional MIME sniffing
         * as it is too unreliable (and our heuristic will likely mark
         * non-HTML files as harmless anyway).
         */

        const dummy_script = `<script>null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
            decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}
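
/*
 * Note: decoding with `{stream: true}' keeps multi-byte sequences that
 * straddle chunk boundaries intact. Once the input is known to be UTF-8,
 * disconnect() lets the rest of the response through unmodified; the
 * output stays valid UTF-8 and the BOM has already been written.
 */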

function apply_stream_filter(details, headers, policy)
{
    if (!policy.payload)
        return headers;

    const properties = properties_from_headers(headers);

    properties.filter =
        browser.webRequest.filterResponseData(details.requestId);

    properties.filter.ondata = event => filter_data(properties, event);
    properties.filter.onstop = () => properties.filter.close();

    /*
     * In the future we might consider modifying the headers that specify
     * the encoding. For now we just prepend the data with a UTF-8 BOM,
     * which should be enough.
     */
    return headers;
}
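
/*
 * Example wiring (a sketch; `decide_policy_for()' is a hypothetical helper,
 * the actual caller supplies a `policy' object with a `payload' property):
 *
 *     browser.webRequest.onHeadersReceived.addListener(
 *         details => ({responseHeaders: apply_stream_filter(
 *             details, details.responseHeaders,
 *             decide_policy_for(details.url))}),
 *         {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]},
 *         ["blocking", "responseHeaders"]
 *     );
 */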

/*
 * EXPORTS_START
 * EXPORT apply_stream_filter
 * EXPORTS_END
 */