haketilo / background / stream_filter.js @ 6b53d6c8

/**
 * Hachette modifying a web page using the StreamFilter API
 *
 * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORTS_END
 */

/*
 * Return `charset' if it is an encoding label recognized by the TextDecoder
 * API, `undefined' otherwise (the TextDecoder constructor throws a RangeError
 * on unknown labels).
 */
function validate_encoding(charset)
{
    try {
        new TextDecoder(charset);
        return charset;
    } catch(e) {
        return undefined;
    }
}
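
/*
 * For instance (illustrative labels, not tied to any particular page):
 *     validate_encoding("utf-8")        // → "utf-8"
 *     validate_encoding("iso-8859-2")   // → "iso-8859-2"
 *     validate_encoding("no-such-enc")  // → undefined
 */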

/*
 * Predicate for Array.prototype.filter(): does `header' carry the
 * Content-Type of the response?
 */
function is_content_type_header(header)
{
    return header.name.toLowerCase().trim() === "content-type";
}

const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;
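
/*
 * The regex captures the charset parameter of a Content-Type value, e.g.
 * (sample header value invented for illustration):
 *     charset_reg.exec("text/html; charset=ISO-8859-1")[1]  // "ISO-8859-1"
 */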

/*
 * Extract charset and HTML-ness information from the response's
 * Content-Type headers. The null check on `match' matters: headers
 * without a charset parameter make exec() return null.
 */
function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
        const match = charset_reg.exec(header.value);
        if (match && !properties.detected_charset && validate_encoding(match[1]))
            properties.detected_charset = match[1];

        if (/html/i.test(header.value))
            properties.html = true;
    }

    return properties;
}
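
/*
 * Example with a made-up header list in the webRequest HttpHeaders format:
 *     properties_from_headers([
 *         {name: "Content-Type", value: "text/html; charset=utf-8"}
 *     ])
 *     // → {detected_charset: "utf-8", html: true}
 */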

const UTF8_BOM = [0xef, 0xbb, 0xbf];
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

/*
 * If `data' (a Uint8Array) starts with a known byte order mark, return the
 * corresponding charset name; otherwise return "".
 */
function charset_from_BOM(data)
{
    for (const [BOM, charset] of BOMs) {
        if (BOM.reduce((ac, byte, i) => ac && byte === data[i], true))
            return charset;
    }

    return "";
}
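
/*
 * E.g. for hypothetical first bytes of a response body:
 *     charset_from_BOM(new Uint8Array([0xfe, 0xff, 0x00, 0x3c]))  // "utf-16be"
 *     charset_from_BOM(new Uint8Array([0x3c, 0x68, 0x74, 0x6d]))  // ""
 */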

const charset_attrs =
      ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
      charset_attrs.map(a => `head>meta[${a}]`).join(", ");
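
/*
 * The computed selector is:
 *     'head>meta[charset], head>meta[http-equiv="content-type"], ' +
 *     'head>meta[content*="charset"]'
 */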

/*
 * Look for a charset declaration in the `<meta>' tags of the (possibly
 * incomplete) document `doc'.
 */
function charset_from_meta_tags(doc)
{
    for (const meta of doc.querySelectorAll(charset_meta_selector)) {
        const maybe_charset = meta.getAttribute("charset");
        if (maybe_charset && validate_encoding(maybe_charset))
            return maybe_charset;

        const match = charset_reg.exec(meta.getAttribute("content") || "");
        if (match && validate_encoding(match[1]))
            return match[1];
    }

    return undefined;
}
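
/*
 * Sample invocation on a hypothetical parsed document:
 *     const doc = new DOMParser().parseFromString(
 *         '<html><head><meta charset="utf-8"></head></html>', "text/html");
 *     charset_from_meta_tags(doc)  // → "utf-8"
 */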

/*
 * Choose a TextDecoder for the response. A byte order mark takes precedence,
 * then the charset from the Content-Type header; only for documents with
 * neither do we sniff HTML `<meta>' tags. latin1 is the fallback since it
 * accepts arbitrary bytes.
 */
function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;
    if (!charset && data.indexOf(0) !== -1) {
        console.debug("Warning: zeroes in bytestream, probable cached " +
                      "encoding mismatch. Trying to decode it as UTF-16.",
                      properties);
        return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    if (!charset) {
        const text = new TextDecoder("latin1").decode(data, {stream: true});
        properties.html = properties.html || /html/i.test(text);

        if (properties.html) {
            const tmp_doc = new DOMParser().parseFromString(text, "text/html");
            charset = charset_from_meta_tags(tmp_doc);
        }
    }

    return new TextDecoder(charset || "latin1");
}
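
/*
 * Illustrative outcomes (inputs invented for the example; assume
 * `first_chunk_bytes' is a BOM-less Uint8Array):
 *     create_decoder({}, new Uint8Array([0xff, 0xfe, 0x3c, 0x00]))
 *         // BOM found → TextDecoder("utf-16le")
 *     create_decoder({detected_charset: "iso-8859-2"}, first_chunk_bytes)
 *         // header charset honored → TextDecoder("iso-8859-2")
 */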

function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    if (!properties.decoder) {
        first_chunk = true;
        properties.decoder = create_decoder(properties, data);
        properties.encoder = new TextEncoder();
        /* Force UTF-8, this is the only encoding we can produce. */
        properties.filter.write(new Uint8Array(UTF8_BOM));
    }

    /*
     * Decode in streaming mode so that multi-byte characters split across
     * chunk boundaries survive re-encoding.
     */
    let decoded = properties.decoder.decode(data, {stream: true});

    if (first_chunk) {
        /*
         * HAX! Our content scripts that execute at `document_start' will always
         * run before the first script in the document, but under Mozilla some
         * `<meta>' tags might already be loaded at that point. Here we inject a
         * dummy `<script>' at the beginning (before any `<meta>' tags) that
         * will force `document_start' to happen earlier. This way our content
         * scripts will be able to sanitize `http-equiv' tags with CSP rules
         * that would otherwise stop our injected scripts from executing.
         */
        const dummy_script =
              `<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
        const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
        decoded = doctype_decl + dummy_script +
            decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    if (properties.decoder.encoding === "utf-8")
        properties.filter.disconnect();
}

    
150
function apply_stream_filter(details, headers, policy)
151
{
152
    if (policy.allow)
153
	return headers;
154

    
155
    const properties = properties_from_headers(headers);
156
    properties.policy = policy;
157

    
158
    properties.filter =
159
	browser.webRequest.filterResponseData(details.requestId);
160

    
161
    properties.filter.ondata = event => filter_data(properties, event);
162
    properties.filter.onstop = () => properties.filter.close();
163

    
164
    /*
165
     * In the future we might consider modifying the headers that specify
166
     * encoding. For now we are not yet doing it, though. However, we
167
     * prepend the data with UTF-8 BOM which should be enough.
168
     */
169
    return headers;
170
}
171

    
172
/*
173
 * EXPORTS_START
174
 * EXPORT apply_stream_filter
175
 * EXPORTS_END
176
 */
(6-6/6)