1
|
/**
|
2
|
* This file is part of Haketilo.
|
3
|
*
|
4
|
* Function: Operations on page URL patterns.
|
5
|
*
|
6
|
* Copyright (C) 2021 Wojtek Kosior
|
7
|
*
|
8
|
* This program is free software: you can redistribute it and/or modify
|
9
|
* it under the terms of the GNU General Public License as published by
|
10
|
* the Free Software Foundation, either version 3 of the License, or
|
11
|
* (at your option) any later version.
|
12
|
*
|
13
|
* This program is distributed in the hope that it will be useful,
|
14
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
* GNU General Public License for more details.
|
17
|
*
|
18
|
* As additional permission under GNU GPL version 3 section 7, you
|
19
|
* may distribute forms of that code without the copy of the GNU
|
20
|
* GPL normally required by section 4, provided you include this
|
21
|
* license notice and, in case of non-source distribution, a URL
|
22
|
* through which recipients can access the Corresponding Source.
|
23
|
* If you modify file(s) with this exception, you may extend this
|
24
|
* exception to your version of the file(s), but you are not
|
25
|
* obligated to do so. If you do not wish to do so, delete this
|
26
|
* exception statement from your version.
|
27
|
*
|
28
|
* As a special exception to the GPL, any HTML file which merely
|
29
|
* makes function calls to this code, and for that purpose
|
30
|
* includes it by reference shall be deemed a separate work for
|
31
|
* copyright law purposes. If you modify this code, you may extend
|
32
|
* this exception to your version of the code, but you are not
|
33
|
* obligated to do so. If you do not wish to do so, delete this
|
34
|
* exception statement from your version.
|
35
|
*
|
36
|
* You should have received a copy of the GNU General Public License
|
37
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
38
|
*
|
39
|
* I, Wojtek Kosior, thereby promise not to sue for violation of this file's
|
40
|
* license. Although I request that you do not make use of this code in a
|
41
|
* proprietary program, I am not going to enforce this in court.
|
42
|
*/
|
43
|
|
44
|
/*
 * Limits applied by deconstruct_url() when it is called with
 * use_limits=true. Components that exceed a limit get truncated.
 */
const MAX = {
    /* Maximum number of path segments kept. */
    URL_PATH_LEN: 12,
    /* Path length (in characters) above which the path gets shortened. */
    URL_PATH_CHARS: 255,
    /* Maximum number of dot-separated domain labels kept. */
    DOMAIN_LEN: 7,
    /* Domain length (in characters) above which leading labels get dropped. */
    DOMAIN_CHARS: 100
};
|
50
|
|
51
|
/* Splits a URL into its scheme (group 1) and everything after "://" (group 2). */
const proto_regex = /^(\w+):\/\/(.*)$/;

/* Regex source fragments shared by the per-protocol regexes below. */
const user_re = "[^/?#@]+@";
const domain_re = "[.*a-zA-Z0-9-]+";
const path_re = "[^?#]*";
const query_re = "\\??[^#]*";

/* Groups: 1 - domain, 2 - path, 3 - query (with its leading "?", if any). */
const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);

/* Groups: 1 - path (required to start with "/"). */
const file_regex = new RegExp(`^(/${path_re}).*`);

/* Groups: 1 - optional "user@" part, 2 - domain, 3 - path. */
const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);
|
63
|
|
64
|
/*
 * Run `regex` against `string` and return the resulting match array.
 * If there is no match, `error_msg` is thrown exactly as passed (callers in
 * this file pass a plain string, so the thrown value is a string, not an
 * Error object).
 */
function match_or_throw(regex, string, error_msg)
{
    const match = regex.exec(string);
    if (match === null)
        throw error_msg;

    return match;
}
|
72
|
|
73
|
/*
 * Split a URL (or URL pattern) into its components.
 *
 * Supported protocols are "file", "ftp", "http" and "https". When
 * `use_limits` is true, the limits from MAX are applied and over-long
 * domains/paths get truncated; when false, every limit is Infinity.
 *
 * Returns an object with:
 *   proto            - protocol name, without "://"
 *   domain           - array of domain labels (not set for "file";
 *                      lower-cased for http/https only)
 *   path             - array of non-empty path segments
 *   query            - query string incl. leading "?" (http/https only)
 *   trailing_slash   - whether the path, as written in `url`, ended with "/"
 *   domain_truncated - set to true only if the domain was shortened
 *   path_truncated   - set to true only if the path was shortened
 *
 * Throws a string when `url` cannot be parsed or uses another protocol.
 */
function deconstruct_url(url, use_limits=true)
{
    const max = Object.assign({}, MAX);
    if (!use_limits) {
        for (const key in MAX)
            max[key] = Infinity;
    }

    const matcher = (re, str) => match_or_throw(re, str, `bad url '${url}'`);

    const proto_match = matcher(proto_regex, url);
    const deco = {proto: proto_match[1]};

    if (deco.proto === "file") {
        deco.path = matcher(file_regex, proto_match[2])[1];
    } else if (deco.proto === "ftp") {
        /* Group 1 is the optional "user@" part - it gets discarded. */
        [deco.domain, deco.path] =
            matcher(ftp_regex, proto_match[2]).slice(2, 4);
    } else if (deco.proto === "http" || deco.proto === "https") {
        [deco.domain, deco.path, deco.query] =
            matcher(http_regex, proto_match[2]).slice(1, 4);
        deco.domain = deco.domain.toLowerCase();
    } else {
        throw `unsupported protocol in url '${url}'`;
    }

    /*
     * Computed from the path as written, before any truncation below.
     * NOTE(review): this is not recomputed after truncation, so it can stay
     * true for a truncated path - confirm this is intended.
     */
    deco.trailing_slash = deco.path[deco.path.length - 1] === "/";

    /* Character limits only get applied to URLs that have a domain. */
    if (deco.domain) {
        if (deco.domain.length > max.DOMAIN_CHARS) {
            /*
             * Drop leading labels: keep what follows the first dot found
             * within the last DOMAIN_CHARS characters.
             */
            const idx = deco.domain.indexOf(".", deco.domain.length -
                                            max.DOMAIN_CHARS);
            if (idx === -1)
                deco.domain = [];
            else
                deco.domain = deco.domain.substring(idx + 1);

            deco.domain_truncated = true;
        }

        if (deco.path.length > max.URL_PATH_CHARS) {
            /*
             * NOTE(review): this cuts off only what follows the last "/";
             * the result may still be longer than URL_PATH_CHARS - confirm
             * whether that is intended.
             */
            deco.path = deco.path.substring(0, deco.path.lastIndexOf("/"));
            deco.path_truncated = true;
        }
    }

    /* Domain is still a string unless it was replaced with [] above. */
    if (typeof deco.domain === "string") {
        deco.domain = deco.domain.split(".");
        /* Keep at most DOMAIN_LEN trailing labels. */
        if (deco.domain.splice(0, deco.domain.length - max.DOMAIN_LEN).length
            > 0)
            deco.domain_truncated = true;
    }

    deco.path = deco.path.split("/").filter(s => s !== "");
    /* Keep at most URL_PATH_LEN leading segments (not applied to "file"). */
    if (deco.domain && deco.path.splice(max.URL_PATH_LEN).length > 0)
        deco.path_truncated = true;

    return deco;
}
|
132
|
#EXPORT deconstruct_url
|
133
|
|
134
|
/*
 * Generate domain patterns for a deconstructed URL, starting with the full
 * domain and dropping one more leading label on each step. The last label
 * alone is never used as a pattern base (the loop stops one label earlier),
 * so a single-label domain yields nothing.
 *
 * For each step the following get yielded, in this order:
 *   - the domain itself       (no label dropped, domain not truncated)
 *   - "*." + remaining part   (exactly one label dropped, not truncated)
 *   - "**." + remaining part  (two or more labels dropped)
 *   - "***." + remaining part (always)
 *
 * `deco.domain` must be an array of labels, as made by deconstruct_url().
 */
function* each_domain_pattern(deco)
{
    for (let slice = 0; slice < deco.domain.length - 1; slice++) {
        const domain_part = deco.domain.slice(slice).join(".");
        if (slice === 0 && !deco.domain_truncated)
            yield domain_part;
        if (slice === 1 && !deco.domain_truncated)
            yield "*." + domain_part;
        if (slice > 1)
            yield "**." + domain_part;
        yield "***." + domain_part;
    }
}
|
148
|
|
149
|
/*
 * Generate path patterns for a deconstructed URL, starting with the full
 * path and dropping one more trailing segment on each step, down to the
 * empty path.
 *
 * For each step the following get yielded, in this order:
 *   - the path with a trailing "/" (full path only, if the URL had a
 *     trailing slash and the path was not truncated)
 *   - the path itself              (full path only, not truncated; skipped
 *     for an empty path of a "file" URL)
 *   - path + "/*"                  (exactly one segment dropped, not
 *     truncated, and the dropped segment is not itself "*")
 *   - path + "/**"                 (two or more segments dropped)
 *   - path + "/***"                (always, except when exactly one segment
 *     was dropped from a non-truncated path and that segment is "***")
 *
 * `deco.path` must be an array of segments, as made by deconstruct_url().
 */
function* each_path_pattern(deco)
{
    for (let slice = deco.path.length; slice >= 0; slice--) {
        const path_part = ["", ...deco.path.slice(0, slice)].join("/");
        if (slice === deco.path.length && !deco.path_truncated) {
            if (deco.trailing_slash)
                yield path_part + "/";
            if (slice > 0 || deco.proto !== "file")
                yield path_part;
        }
        /* Avoid emitting a pattern the literal "*" segment already made. */
        if (slice === deco.path.length - 1 && !deco.path_truncated &&
            deco.path[slice] !== "*")
            yield path_part + "/*";
        if (slice < deco.path.length - 1)
            yield path_part + "/**";
        /* Likewise for a literal "***" last segment. */
        if (slice !== deco.path.length - 1 || deco.path_truncated ||
            deco.path[slice] !== "***")
            yield path_part + "/***";
    }
}
|
170
|
|
171
|
/*
 * Generate every possible pattern that matches url.
 *
 * Patterns are made by combining each domain pattern with each path pattern
 * (URLs without a domain, i.e. "file" ones, use an empty domain part).
 * If `url` cannot be deconstructed, the string thrown by deconstruct_url()
 * propagates to the caller on the first iteration step.
 */
function* each_url_pattern(url)
{
    const deco = deconstruct_url(url);

    const all_domains = deco.domain ? each_domain_pattern(deco) : [""];
    for (const domain of all_domains) {
        for (const path of each_path_pattern(deco))
            yield `${deco.proto}://${domain}${path}`;
    }
}
|
187
|
#EXPORT each_url_pattern
|
188
|
|
189
|
/* Address of the wiki page documenting URL patterns; linked in error messages. */
const patterns_doc_url =
    "https://hydrillabugs.koszko.org/projects/haketilo/wiki/URL_patterns";
|
191
|
#EXPORT patterns_doc_url
|
192
|
|
193
|
/*
 * Build a URL string back from an object made by deconstruct_url().
 *
 * Only protocol, domain, path and the trailing slash are used; the query is
 * not included. Deconstructed "file" URLs carry no `domain`, in which case
 * the domain part of the result is empty (e.g. "file:///some/path").
 */
function reconstruct_url(deco)
{
    /* `domain` is absent for "file" URLs - treat it as empty. */
    const domain = (deco.domain || []).join(".");
    const path = ["", ...deco.path].join("/");
    const trail = deco.trailing_slash ? "/" : "";
    return `${deco.proto}://${domain}${path}${trail}`;
}
|
200
|
#EXPORT reconstruct_url
|
201
|
|
202
|
/*
 * Check that `url_pattern` can be deconstructed and return its normalized
 * form (the result of deconstructing and then reconstructing it).
 *
 * On failure a <span> element gets thrown (not an Error object). It holds
 * a message that names the offending pattern and links to the patterns
 * documentation. Requires a DOM (`document`) to build that message.
 */
function validate_normalize_url_pattern(url_pattern)
{
    try {
        return reconstruct_url(deconstruct_url(url_pattern));
    } catch(e) {
        const patterns_doc_link = document.createElement("a");
        patterns_doc_link.href = patterns_doc_url;
        patterns_doc_link.innerText = "here";
        const msg = document.createElement("span");
        /* Strings passed to prepend() get inserted as text nodes. */
        msg.prepend(`'${url_pattern}' is not a valid URL pattern. See `,
                    patterns_doc_link, " for more details.");
        throw msg;
    }
}
|
216
|
#EXPORT validate_normalize_url_pattern
|