haketilo / background / stream_filter.js @ 263d03d5

/**
 * This file is part of Haketilo.
 *
 * Function: Modifying a web page using the StreamFilter API.
 *
 * Copyright (C) 2021, Wojtek Kosior
 * Copyright (C) 2018, Giorgio Maone <giorgio@maone.net>
 *
 * This program is free software: you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation, either version 3 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
 *
 *
 * I, Wojtek Kosior, thereby promise not to sue for violation of this file's
 * license. Although I request that you do not make use of this code in a
 * proprietary program, I am not going to enforce this in court.
 *
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORT csp_header_regex
 * IMPORTS_END
 */

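/*
 * Return `charset' unchanged if TextDecoder recognizes it as a valid encoding
 * label; return `undefined' otherwise.
 */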
function validate_encoding(charset)
{
    try {
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}

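/* Array filter callback: true for `Content-Type' headers. */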
function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

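/*
 * Extract the charset parameter from a Content-Type value, e.g. "utf-8" from
 * "text/html; charset=utf-8".
 */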
const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;

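/*
 * Scan response headers for Content-Type. Return an object that records the
 * declared charset (if any, under `detected_charset') and whether the payload
 * looks like HTML (under `html').
 */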
function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        const match = charset_reg.exec(header.value);
        if (!properties.detected_charset && match &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];

        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}

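/* Byte order marks we can recognize, mapped to the encodings they signal. */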
const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

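/*
 * Compare the first bytes of `data' against the known BOMs. Return the
 * matched encoding's label or an empty string when no BOM is present.
 */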
function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
            return charset;
    }

    return "";
}

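/* Selector for `<meta>' tags in `<head>' that may declare the encoding. */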
const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
      charset_attrs.map(a => `head>meta[${a}]`).join(", ");

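/*
 * Look through a (possibly partially parsed) document's `<meta>' tags for a
 * valid charset declaration, either in a `charset' attribute or inside a
 * `content' attribute. Return it or `undefined' when none is found.
 */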
function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const maybe_charset = meta.getAttribute("charset");
        if (maybe_charset && validate_encoding(maybe_charset))
            return maybe_charset;

        const match = charset_reg.exec(meta.getAttribute("content"));
        if (match && validate_encoding(match[1]))
            return match[1];
    }

    return undefined;
}

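/*
 * Pick a TextDecoder for the response: a BOM wins, then the charset from the
 * Content-Type header, then a UTF-16 guess for byte streams containing
 * zeroes, then `<meta>' tag sniffing, with latin1 as the fallback.
 */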
function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;
    if (!charset && data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
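    /*
     * (The HTML standard requires the encoding declaration, if present, to
     * lie within the first 1024 bytes of the document, so sniffing could in
     * principle be limited to that prefix.)
     */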
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    if (properties.html) {
        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
        charset = charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(charset || "latin1");
}

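/*
 * Heuristically decide whether this chunk of HTML might cause CSP rules to be
 * applied through `<meta http-equiv>'. Return `true' if such a tag is already
 * present or if we cannot yet rule out more of them appearing in `<head>'
 * later in the stream.
 */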
function may_define_csp_rules(html)
{
    const doc = new DOMParser().parseFromString(html, "text/html");

    for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
        if (csp_header_regex.test(meta.httpEquiv) && meta.content)
            return true;
    }

    /*
     * Even if no naughty `<meta>' tags were found, a subsequent chunk of HTML
     * data could add some. Before we return `false' we need to be sure we
     * reached the start of `<body>' where `<meta>' tags are no longer valid.
     */

    if (doc.documentElement.nextSibling || doc.body.nextSibling ||
        doc.body.childNodes.length > 1)
        return false;

    if (!doc.body.firstChild)
        return true;

    if (doc.body.firstChild.nodeName !== "#text")
        return false;

    return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
}

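/*
 * StreamFilter `ondata' handler. On the first chunk it chooses a decoder,
 * forces UTF-8 output by prepending a BOM and, when CSP-carrying `<meta>'
 * tags are possible, injects a dummy `<script>' right after the doctype.
 * Every chunk is then re-encoded as UTF-8 and written to the filter.
 */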
function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
    }

    let decoded = properties.decoder.decode(data, {stream: true});

    /* Force UTF-8, this is the only encoding we can produce. */
    if (first_chunk)
        properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
        /*
         * HAX! Our content scripts that execute at `document_start' will
         * always run before the first script in the document, but under
         * Mozilla some `<meta>' tags might already be loaded at that point.
         * Here we inject a dummy `<script>' at the beginning (before any
         * `<meta>' tags) that will force `document_start' to happen earlier.
         * This way our content scripts will be able to sanitize `http-equiv'
         * tags with CSP rules that would otherwise stop our injected scripts
         * from executing.
         *
         * As we only want to process HTML files that happen to have naughty
         * `<meta>' tags in `<head>', we use a DOMParser-based heuristic in
         * `may_define_csp_rules()'. We don't do any additional MIME sniffing
         * as it is too unreliable (and our heuristic will likely mark
         * non-HTML files as harmless anyway).
         */

        const dummy_script =
              `<script data-haketilo-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
            decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}

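/*
 * Entry point: when `policy' has a script payload for this page, attach a
 * StreamFilter that re-encodes the response as UTF-8 and injects a dummy
 * `<script>' so content scripts can sanitize CSP-carrying `<meta>' tags in
 * time. The headers themselves are returned unchanged.
 */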
function apply_stream_filter(details, headers, policy)
{
    if (!policy.has_payload)
        return headers;

    const properties = properties_from_headers(headers);
    properties.policy = policy;

    properties.filter =
        browser.webRequest.filterResponseData(details.requestId);

    properties.filter.ondata = event => filter_data(properties, event);
    properties.filter.onstop = () => properties.filter.close();

    /*
     * In the future we might consider modifying the headers that specify the
     * encoding. We are not doing that yet; for now, prepending the data with
     * a UTF-8 BOM should be enough.
     */
    return headers;
}

/*
 * EXPORTS_START
 * EXPORT apply_stream_filter
 * EXPORTS_END
 */

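/*
 * Illustrative sketch only (not part of the original file): one way a caller
 * could wire `apply_stream_filter()' into the `webRequest' API. In Haketilo
 * the real call site and the `policy' object live in other background
 * scripts; the `example_policy' below is a made-up stand-in carrying just the
 * fields this file reads (`has_payload' and `nonce').
 *
 *     const example_policy = {has_payload: true, nonce: "3q8q0q3qdu1k"};
 *
 *     browser.webRequest.onHeadersReceived.addListener(
 *         details => ({
 *             responseHeaders: apply_stream_filter(details,
 *                                                  details.responseHeaders,
 *                                                  example_policy)
 *         }),
 *         {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]},
 *         ["blocking", "responseHeaders"]
 *     );
 */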