haketilo / background / stream_filter.js @ 44e89d8e

/**
 * Hachette modifying a web page using the StreamFilter API
 *
 * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORT csp_header_regex
 * IMPORTS_END
 */

/* Return `charset' unchanged if TextDecoder recognizes it, else undefined. */
function validate_encoding(charset)
{
    try {
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}

function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

/* Extracts the charset name from a Content-Type-style value. */
const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;

/*
 * Extract charset and content type hints from response headers. The
 * returned object carries optional `detected_charset' and `html' properties.
 */
function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        const match = charset_reg.exec(header.value);
        if (match && !properties.detected_charset &&
            validate_encoding(match[1]))
            properties.detected_charset = match[1];

        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}
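
/*
 * Illustration (hypothetical values): for response headers like
 *     [{name: "Content-Type", value: "text/html; charset=UTF-8"}]
 * properties_from_headers() yields {detected_charset: "UTF-8", html: true};
 * headers with no Content-Type produce an empty object.
 */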

const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

/*
 * Return the charset indicated by a byte order mark at the start of `data',
 * or "" if none is recognized.
 */
function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
            return charset;
    }

    return "";
}
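
/*
 * Illustration: charset_from_BOM(new Uint8Array([0xff, 0xfe, 0x41, 0x00]))
 * returns "utf-16le"; data starting with no known BOM yields "".
 */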

/* Selector matching `<meta>' tags that can carry charset information. */
const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
      charset_attrs.map(a => `head>meta[${a}]`).join(", ");

/* Look for a charset specification among the `<meta>' tags of a document. */
function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const maybe_charset = meta.getAttribute("charset");
        if (maybe_charset && validate_encoding(maybe_charset))
            return maybe_charset;

        const match = charset_reg.exec(meta.getAttribute("content"));
        if (match && validate_encoding(match[1]))
            return match[1];
    }

    return undefined;
}

/*
 * Construct a TextDecoder for the response: use the BOM or the charset from
 * the Content-Type header when available, otherwise sniff `<meta>' tags in
 * the content, falling back to latin1.
 */
function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;
    if (!charset && data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached " +
                      "encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    if (charset)
        return new TextDecoder(charset);

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    if (properties.html) {
        const tmp_doc = new DOMParser().parseFromString(text, "text/html");
        charset = charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(charset || "latin1");
}
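
/*
 * Illustration of the fallback chain: data starting with the bytes
 * 0xff 0xfe gets a "utf-16le" decoder straight from its BOM; a BOM-less
 * response whose body contains `<meta charset="utf-8">' in `<head>' is
 * sniffed through the DOMParser path; when everything fails, latin1 is
 * assumed.
 */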

/*
 * Heuristically decide whether a chunk of HTML either already contains a
 * CSP-defining `<meta>' tag or could still receive one in a later chunk.
 */
function may_define_csp_rules(html)
{
    const doc = new DOMParser().parseFromString(html, "text/html");

    for (const meta of doc.querySelectorAll("head>meta[http-equiv]")) {
        if (csp_header_regex.test(meta.httpEquiv) && meta.content)
            return true;
    }

    /*
     * Even if no naughty `<meta>' tags were found, a subsequent chunk of
     * HTML data could add some. Before we return `false' we need to be sure
     * we reached the start of `<body>' where `<meta>' tags are no longer
     * valid.
     */

    if (doc.documentElement.nextSibling || doc.body.nextSibling ||
        doc.body.childNodes.length > 1)
        return false;

    if (!doc.body.firstChild)
        return true;

    if (doc.body.firstChild.nodeName !== "#text")
        return false;

    return /^(<\/|&#|.)$/.test(doc.body.firstChild.wholeText);
}
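
/*
 * Illustration: a first chunk like
 *     <html><head><meta http-equiv="Content-Security-Policy"
 *                       content="script-src 'none'">
 * makes may_define_csp_rules() return true, while
 *     <html><head></head><body>some longer text
 * returns false: the parsed `<body>' already holds content, so later
 * chunks can no longer add `<meta>' tags to `<head>'.
 */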

/*
 * Handler for StreamFilter's `ondata' events: decode the chunk, patch a
 * dummy script into the first chunk if necessary and re-emit everything
 * as UTF-8.
 */
function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
    }

    let decoded = properties.decoder.decode(data);

    /* Force UTF-8, this is the only encoding we can produce. */
    if (first_chunk)
        properties.filter.write(new Uint8Array(UTF8_BOM));

    if (first_chunk && may_define_csp_rules(decoded)) {
        /*
         * HAX! Our content scripts that execute at `document_start' will
         * always run before the first script in the document, but under
         * Mozilla some `<meta>' tags might already be loaded at that point.
         * Here we inject a dummy `<script>' at the beginning (before any
         * `<meta>' tags) that will force `document_start' to happen earlier.
         * This way our content scripts will be able to sanitize `http-equiv'
         * tags with CSP rules that would otherwise stop our injected scripts
         * from executing.
         *
         * As we only want to process HTML files that happen to have naughty
         * `<meta>' tags in `<head>', we use the DOMParser-based heuristic in
         * `may_define_csp_rules()'. We don't do any additional MIME sniffing
         * as it is too unreliable (and our heuristic will likely mark
         * non-HTML files as harmless anyway).
         */

        const dummy_script =
              `<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
            decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    /* Once the stream is known to be UTF-8, remaining data can pass as-is. */
    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}
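
/*
 * Illustration: with policy.nonce === "abc123", a first chunk that trips
 * may_define_csp_rules(), e.g.
 *     <!doctype html><html><head><meta http-equiv="Content-Security-Policy" ...
 * gets rewritten to
 *     <!doctype html><script data-hachette-deleteme="abc123"
 *         nonce="abc123">null</script><html><head><meta ...
 * and is re-emitted as UTF-8 behind the BOM written above.
 */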

/*
 * Attach a StreamFilter to the request if the page's policy requires
 * payload modification. The `headers' array is returned unchanged.
 */
function apply_stream_filter(details, headers, policy)
{
    if (!policy.has_payload)
        return headers;

    const properties = properties_from_headers(headers);
    properties.policy = policy;

    properties.filter =
        browser.webRequest.filterResponseData(details.requestId);

    properties.filter.ondata = event => filter_data(properties, event);
    properties.filter.onstop = () => properties.filter.close();

    /*
     * In the future we might consider modifying the headers that specify
     * encoding. For now we are not yet doing it, though. However, we
     * prepend the data with a UTF-8 BOM, which should be enough.
     */
    return headers;
}
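
/*
 * A minimal usage sketch, not the actual wiring (which lives elsewhere in
 * Hachette): apply_stream_filter() is meant to be called from a blocking
 * `onHeadersReceived' listener. The `policy' object below is a hypothetical
 * stand-in for the one the real caller computes.
 *
 *     browser.webRequest.onHeadersReceived.addListener(
 *         details => {
 *             const policy = {has_payload: true, nonce: "0123456789abcdef"};
 *             const headers = apply_stream_filter(
 *                 details, details.responseHeaders, policy);
 *             return {responseHeaders: headers};
 *         },
 *         {urls: ["<all_urls>"], types: ["main_frame", "sub_frame"]},
 *         ["blocking", "responseHeaders"]
 *     );
 */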

/*
 * EXPORTS_START
 * EXPORT apply_stream_filter
 * EXPORTS_END
 */