/**
 * Hachette modifying a web page using the StreamFilter API
 *
 * Copyright (C) 2018 Giorgio Maone <giorgio@maone.net>
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 *
 * Derived from `bg/ResponseProcessor.js' and `bg/ResponseMetaData.js'
 * in LibreJS.
 */

/*
 * IMPORTS_START
 * IMPORT browser
 * IMPORTS_END
 */

/*
 * Check whether `charset' is an encoding label known to the platform.
 * Returns `charset' unchanged when valid and `undefined' otherwise.
 */
function validate_encoding(charset)
{
    try {
	/*
	 * Bug fix: the label must actually be passed to the TextDecoder
	 * constructor; `new TextDecoder()' never throws, so previously
	 * every charset — valid or not — was reported as valid.
	 */
	new TextDecoder(charset);
	return charset;
    } catch(e) {
	return undefined;
    }
}
/*
 * Predicate: does `header' (an object with `name' and `value' fields, as
 * delivered by the webRequest API) carry the Content-Type? Matching is
 * case-insensitive and ignores surrounding whitespace in the name.
 */
function is_content_type_header(header)
{
    /*
     * Bug fix: the comparison result was computed but never returned, so
     * the function always yielded `undefined' and filtering on it dropped
     * every header.
     */
    return header.name.toLowerCase().trim() === "content-type";
}
/*
 * Matches the `charset=...' parameter of a Content-Type header value or
 * of a `<meta content="...">' attribute; capture group 1 is the label.
 */
const charset_reg = /;\s*charset\s*=\s*([\w-]+)/i;
/*
 * Derive response properties from `headers' (an array of {name, value}
 * objects): `detected_charset' — the first valid charset declared in a
 * Content-Type header, if any — and `html' — set when any Content-Type
 * value mentions "html".
 */
function properties_from_headers(headers)
{
    const properties = {};

    for (const header of headers.filter(is_content_type_header)) {
	const match = charset_reg.exec(header.value);
	/*
	 * Bug fix: `match' is null when the Content-Type carries no
	 * `charset' parameter (e.g. plain "text/html"); guard before
	 * dereferencing `match[1]'.
	 */
	if (match && !properties.detected_charset &&
	    validate_encoding(match[1]))
	    properties.detected_charset = match[1];

	if (/html/i.test(header.value))
	    properties.html = true;
    }

    return properties;
}
/* Byte sequence of the UTF-8 byte order mark. */
const UTF8_BOM = [0xef, 0xbb, 0xbf];

/* Recognized byte order marks paired with the encoding they announce. */
const BOMs = [
    [UTF8_BOM, "utf-8"],
    [[0xfe, 0xff], "utf-16be"],
    [[0xff, 0xfe], "utf-16le"]
];

/*
 * Sniff the encoding from a byte order mark at the beginning of `data'
 * (an indexable byte sequence). Returns the encoding label, or "" when
 * no known BOM is present.
 */
function charset_from_BOM(data)
{
    const starts_with = bom => bom.every((byte, i) => data[i] === byte);
    const entry = BOMs.find(([bom, _]) => starts_with(bom));
    return entry ? entry[1] : "";
}
/*
 * Attribute patterns of `<meta>' tags that may carry charset information;
 * combined below into a single CSS selector matching any of them inside
 * `<head>'.
 */
const charset_attrs =
    ['charset', 'http-equiv="content-type"', 'content*="charset"'];
const charset_meta_selector =
    charset_attrs.map(a => `head>meta[${a}]`).join(", ");
/*
 * Search the parsed document `doc' for a charset declaration among its
 * `<meta>' tags. Returns the first valid encoding label found either in
 * a `charset' attribute or in a `charset=' parameter of a `content'
 * attribute; `undefined' when none is found.
 */
function charset_from_meta_tags(doc)
{
    const metas = doc.querySelectorAll(charset_meta_selector);
    for (const meta of metas) {
	const direct = meta.getAttribute("charset");
	if (direct && validate_encoding(direct))
	    return direct;

	const in_content = charset_reg.exec(meta.getAttribute("content"));
	if (in_content && validate_encoding(in_content[1]))
	    return in_content[1];
    }

    return undefined;
}
/*
 * Pick a TextDecoder for the response body, consulting (in priority
 * order) a byte order mark in `data', the charset detected from HTTP
 * headers (`properties.detected_charset') and, for HTML, charset
 * declarations in `<meta>' tags; falls back to latin1. As a side
 * effect, sets `properties.html' when the content sniffs as HTML.
 */
function create_decoder(properties, data)
{
    let charset = charset_from_BOM(data) || properties.detected_charset;
    if (!charset && data.indexOf(0) !== -1) {
	console.debug("Warning: zeroes in bytestream, probable cached encoding mismatch. Trying to decode it as UTF-16.",
		      properties);
	return new TextDecoder("utf-16be");
    }

    /* Missing HTTP charset, sniffing in content... */
    /*
     * TODO: I recall there is some standard saying how early in the doc the
     * charset has to be specified. We could process just this part of data.
     */
    const text = new TextDecoder("latin1").decode(data, {stream: true});
    properties.html = properties.html || /html/i.test(text);

    if (properties.html) {
	const tmp_doc = new DOMParser().parseFromString(text, "text/html");
	/*
	 * Bug fix: do not clobber a charset already detected from the
	 * BOM or HTTP headers — previously, an HTML document without a
	 * charset `<meta>' tag reset `charset' to `undefined' here and
	 * fell back to latin1 even when the headers declared utf-8.
	 */
	charset = charset || charset_from_meta_tags(tmp_doc);
    }

    return new TextDecoder(charset || "latin1");
}
/*
 * `ondata' handler for the StreamFilter: decode each incoming chunk with
 * the charset detected from the first chunk, inject a dummy script after
 * the doctype of the first chunk, and re-emit everything as UTF-8
 * (a UTF-8 BOM is written up front to announce that).
 */
function filter_data(properties, event)
{
    const data = new Uint8Array(event.data);
    let first_chunk = false;
    /* Lazily initialize decoder/encoder on the first chunk we see. */
    if (!properties.decoder) {
	first_chunk = true;
	properties.decoder = create_decoder(properties, data);
	properties.encoder = new TextEncoder();
	/* Force UTF-8, this is the only encoding we can produce. */
	properties.filter.write(new Uint8Array(UTF8_BOM));
    }

    /*
     * NOTE(review): `decode(data)' without `{stream: true}' flushes the
     * decoder at every chunk boundary; a multi-byte character split
     * across chunks would presumably be mangled. Confirm whether that
     * interacts safely with the utf-8 disconnect below before changing.
     */
    let decoded = properties.decoder.decode(data);

    if (first_chunk) {
	/*
	 * HAX! Our content scripts that execute at `document_start' will always
	 * run before the first script in the document, but under Mozilla some
	 * `<meta>' tags might already be loaded at that point. Here we inject a
	 * dummy `<script>' at the beginning (before any `<meta>' tags) that
	 * will force `document_start' to happen earlier. This way our content
	 * scripts will be able to sanitize `http-equiv' tags with CSP rules
	 * that would otherwise stop our injected scripts from executing.
	 */
	const dummy_script =
	    `<script data-hachette-deleteme="${properties.policy.nonce}" nonce="${properties.policy.nonce}">null</script>`;
	/* Keep any doctype declaration first; inject right after it. */
	const doctype_decl = /^(\s*<!doctype[^<>"']*>)?/i.exec(decoded)[0];
	decoded = doctype_decl + dummy_script +
	    decoded.substring(doctype_decl.length);
    }

    properties.filter.write(properties.encoder.encode(decoded));

    /*
     * Once we know the stream is UTF-8, later chunks need no transcoding;
     * disconnecting lets the browser pass the rest through untouched.
     */
    if (properties.decoder.encoding === "utf-8")
	properties.filter.disconnect();
}
/*
 * Attach a StreamFilter that rewrites the body of the request described
 * by `details', unless `policy.allow' says to leave the page alone.
 * `headers' is returned unmodified in both cases.
 */
function apply_stream_filter(details, headers, policy)
{
    if (policy.allow)
	return headers;

    const properties =
	Object.assign(properties_from_headers(headers), {policy});

    const filter = browser.webRequest.filterResponseData(details.requestId);
    properties.filter = filter;

    filter.ondata = event => filter_data(properties, event);
    filter.onstop = () => filter.close();

    /*
     * In the future we might consider modifying the headers that specify
     * encoding. For now we are not yet doing it, though. However, we
     * prepend the data with UTF-8 BOM which should be enough.
     */
    return headers;
}
/*
 * EXPORTS_START
 * EXPORT apply_stream_filter
 * EXPORTS_END
 */