1
|
/**
|
2
|
* This file is part of Haketilo.
|
3
|
*
|
4
|
* Function: Data structure to query items by URL patterns.
|
5
|
*
|
6
|
* Copyright (C) 2021 Wojtek Kosior
|
7
|
*
|
8
|
* This program is free software: you can redistribute it and/or modify
|
9
|
* it under the terms of the GNU General Public License as published by
|
10
|
* the Free Software Foundation, either version 3 of the License, or
|
11
|
* (at your option) any later version.
|
12
|
*
|
13
|
* This program is distributed in the hope that it will be useful,
|
14
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
* GNU General Public License for more details.
|
17
|
*
|
18
|
* As additional permission under GNU GPL version 3 section 7, you
|
19
|
* may distribute forms of that code without the copy of the GNU
|
20
|
* GPL normally required by section 4, provided you include this
|
21
|
* license notice and, in case of non-source distribution, a URL
|
22
|
* through which recipients can access the Corresponding Source.
|
23
|
* If you modify file(s) with this exception, you may extend this
|
24
|
* exception to your version of the file(s), but you are not
|
25
|
* obligated to do so. If you do not wish to do so, delete this
|
26
|
* exception statement from your version.
|
27
|
*
|
28
|
* As a special exception to the GPL, any HTML file which merely
|
29
|
* makes function calls to this code, and for that purpose
|
30
|
* includes it by reference shall be deemed a separate work for
|
31
|
* copyright law purposes. If you modify this code, you may extend
|
32
|
* this exception to your version of the code, but you are not
|
33
|
* obligated to do so. If you do not wish to do so, delete this
|
34
|
* exception statement from your version.
|
35
|
*
|
36
|
* You should have received a copy of the GNU General Public License
|
37
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
38
|
*
|
39
|
* I, Wojtek Kosior, thereby promise not to sue for violation of this file's
|
40
|
* license. Although I request that you do not make use this code in a
|
41
|
* proprietary program, I am not going to enforce this in court.
|
42
|
*/
|
43
|
|
44
|
/*
|
45
|
* IMPORTS_START
|
46
|
* IMPORT deconstruct_url
|
47
|
* IMPORTS_END
|
48
|
*/
|
49
|
|
50
|
/*
|
51
|
* This is a javascript rewrite of Python implementation of the algorithm here:
|
52
|
* https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205
|
53
|
*/
|
54
|
|
55
|
/* "Pattern Tree" is how we refer to the data structure used for querying
|
56
|
* Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
|
57
|
* is to make it possible for given URL to quickly retrieve all known patterns
|
58
|
* that match it.
|
59
|
*/
|
60
|
function make_tree_node() {
|
61
|
return {
|
62
|
wildcard_matches: [null, null, null],
|
63
|
literal_match: null,
|
64
|
children: null
|
65
|
};
|
66
|
}
|
67
|
|
68
|
const is_wildcard = segment => ["*", "**", "***"].lastIndexOf(segment) >= 0;
|
69
|
|
70
|
/*
|
71
|
* Yields all matches of this segments sequence against the tree that starts at
|
72
|
* this node. Results are produces in order from greatest to lowest pattern
|
73
|
* specificity.
|
74
|
*/
|
75
|
function* search_sequence(tree_node, segments)
|
76
|
{
|
77
|
const nodes = [tree_node];
|
78
|
|
79
|
for (const segment of segments) {
|
80
|
const next_node = nodes[nodes.length - 1].children[segment];
|
81
|
if (next_node === undefined)
|
82
|
break;
|
83
|
|
84
|
nodes.push(next_node);
|
85
|
}
|
86
|
|
87
|
const nsegments = segments.length;
|
88
|
|
89
|
const conds = [
|
90
|
/* literal pattern match */
|
91
|
() => nodes.length == nsegments,
|
92
|
/* wildcard pattern matches */
|
93
|
() => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
|
94
|
() => nodes.length + 1 < nsegments,
|
95
|
() => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
|
96
|
];
|
97
|
|
98
|
while (nodes.length) {
|
99
|
const node = nodes.pop();
|
100
|
const items = [node.literal_match, ...node.wildcard_matches];
|
101
|
|
102
|
for (let i = 0; i < 4; i++) {
|
103
|
if (items[i] !== null && conds[i]())
|
104
|
yield items[i];
|
105
|
}
|
106
|
}
|
107
|
}
|
108
|
|
109
|
/*
|
110
|
* Make item queryable through (this branch of) the Pattern Tree. If there was
|
111
|
* not yet any item associated with the tree path designated by segments, create
|
112
|
* a new one using item_instantiator() function. Return all items matching this
|
113
|
* path (both the ones that existed and the ones just created).
|
114
|
*/
|
115
|
function add_sequence(tree_node, segments, item_instantiator)
|
116
|
{
|
117
|
let wildcards;
|
118
|
|
119
|
for (var segment of segments) {
|
120
|
wildcards = tree_node.wildcard_matches;
|
121
|
|
122
|
const child = tree_node.children[segment] || make_tree_node();
|
123
|
tree_node.children[segment] = child;
|
124
|
tree_node = child;
|
125
|
}
|
126
|
|
127
|
if (tree_node.literal_match === null)
|
128
|
tree_node.literal_match = item_instantiator();
|
129
|
|
130
|
if (!is_wildcard(segment))
|
131
|
return [tree_node.literal_match];
|
132
|
|
133
|
if (wildcards[segment.length - 1] === null)
|
134
|
wildcards[segment.length - 1] = item_instantiator();
|
135
|
|
136
|
return [tree_node.literal_match, wildcards[segment.length - 1]];
|
137
|
}
|
138
|
|
139
|
/*
|
140
|
* Can be used to remove item from (this branch of) the Pattern Tree.
|
141
|
* item_modifier should be a function that accepts 1 argument, the item stored
|
142
|
* in the tree, and returns an item that should be used in place of the first
|
143
|
* one. It is also legal for it to return the same item modifying it first. If
|
144
|
* it returns `undefined`, it means the item should be deleted from the Tree.
|
145
|
*
|
146
|
* item_modifier() will be called with all items in (this branch of) the Pattern
|
147
|
* Tree that correspond to the path designated by segments. The Tree will be
|
148
|
* updated accordingly.
|
149
|
*/
|
150
|
function modify_sequence(tree_node, segments, item_modifier)
|
151
|
{
|
152
|
const nodes = [tree_node];
|
153
|
|
154
|
for (const segment of segments) {
|
155
|
const next_node = nodes[nodes.length - 1].children[segment];
|
156
|
nodes.push(next_node);
|
157
|
}
|
158
|
|
159
|
const nsegments = segments.length;
|
160
|
let i = segments.length - 1;
|
161
|
|
162
|
if (nsegments > 0 && is_wildcard(segments[i])) {
|
163
|
const node = nodes[nodes.length - 2];
|
164
|
const asterisks = segments[i].length;
|
165
|
}
|
166
|
|
167
|
............................
|
168
|
|
169
|
for (let i = segments.length - 1; i >= 0; i--) {
|
170
|
|
171
|
if (i
|
172
|
}
|
173
|
|
174
|
segments
|
175
|
|
176
|
const conds = [
|
177
|
/* literal pattern match */
|
178
|
() => nodes.length == nsegments,
|
179
|
/* wildcard pattern matches */
|
180
|
() => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
|
181
|
() => nodes.length + 1 < nsegments,
|
182
|
() => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
|
183
|
];
|
184
|
|
185
|
while (nodes.length) {
|
186
|
const node = nodes.pop();
|
187
|
const items = [node.literal_match, ...node.wildcard_matches];
|
188
|
|
189
|
for (let i = 0; i < 4; i++) {
|
190
|
if (items[i] !== null && conds[i]())
|
191
|
yield items[i];
|
192
|
}
|
193
|
}
|
194
|
.................
|
195
|
}
|
196
|
|
197
|
/*
|
198
|
* Make item queryable through the Pattern Tree that starts with the protocols
|
199
|
* dictionary object passed in the first argument.
|
200
|
*/
|
201
|
function pattern_tree_register(patterns_by_proto, pattern, item_name, item)
|
202
|
{
|
203
|
/*
|
204
|
* We pass 'false' to disable length limits on URL parts. Length limits are
|
205
|
* mostly useful in case of iteration over all patterns matching given URL.
|
206
|
* Here we don't do that.
|
207
|
*/
|
208
|
const deco = deconstruct_url(pattern, false);
|
209
|
|
210
|
const tree = patterns_by_proto[deco.proto] || make_tree_node();
|
211
|
patterns_by_proto[deco.proto] = tree;
|
212
|
|
213
|
let path_trees;
|
214
|
|
215
|
if (deco.proto === "file") {
|
216
|
path_trees = [tree];
|
217
|
} else {
|
218
|
/* We need an aray of domain labels ordered most-significant-first. */
|
219
|
const domain = [...deco.domain].reverse();
|
220
|
path_trees = add_sequence(tree, domain, make_tree_node);
|
221
|
}
|
222
|
|
223
|
for (const path_tree of path_trees) {
|
224
|
for (const match_obj of add_sequence(path_tree, deco.path, () => {}))
|
225
|
match_obj[item_name] = item;
|
226
|
}
|
227
|
}
|
228
|
|
229
|
/*
|
230
|
* Remove registered item from the Pattern Tree that starts with the protocols
|
231
|
* dictionary object passed in the first argument. The remaining 2 arguments
|
232
|
* should be pattern and name that have been earlier passed to
|
233
|
* pattern_tree_register().
|
234
|
*/
|
235
|
function pattern_tree_deregister(patterns_by_proto, pattern, item_name)
|
236
|
{
|
237
|
const deco = deconstruct_url(pattern, false);
|
238
|
|
239
|
const tree = patterns_by_proto[deco.proto] || make_tree_node();
|
240
|
patterns_by_proto[deco.proto] = tree;
|
241
|
|
242
|
let path_trees;
|
243
|
|
244
|
if (deco.proto === "file") {
|
245
|
path_trees = [tree];
|
246
|
} else {
|
247
|
/* We need an aray of domain labels ordered most-significant-first. */
|
248
|
const domain = [...deco.domain].reverse();
|
249
|
..........................
|
250
|
path_trees = add_sequence(tree, domain, make_tree_node);
|
251
|
}
|
252
|
}
|
253
|
|
254
|
/*
|
255
|
* Yield registered items (as [item_name, item] arrays) that match url. For
|
256
|
* cosistent behavior, please don't modify the pattern tree while iterating over
|
257
|
* the results.
|
258
|
*/
|
259
|
function* pattern_tree_search(patterns_by_proto, url)
|
260
|
{
|
261
|
const deco = deconstruct_url(url, false);
|
262
|
|
263
|
tree = patterns_by_proto[deco.proto] || make_tree_node();
|
264
|
|
265
|
let path_trees;
|
266
|
|
267
|
if (deco.proto === "file") {
|
268
|
path_trees = [tree];
|
269
|
} else {
|
270
|
/* We need an aray of domain labels ordered most-significant-first. */
|
271
|
const domain = [...deco.domain].reverse();
|
272
|
path_trees = search_sequence(tree, domain);
|
273
|
}
|
274
|
|
275
|
for (const path_tree of path_trees) {
|
276
|
for (const match_obj of search_sequence(path_tree, deco.path)) {
|
277
|
for (const entry of Object.entries(match_obj))
|
278
|
yield entry;
|
279
|
}
|
280
|
}
|
281
|
}
|
282
|
|
283
|
const patterns_tree = {
|
284
|
make: () => {},
|
285
|
register: patterns_tree_register,
|
286
|
search: patterns_tree_search
|
287
|
}
|
288
|
|
289
|
/*
|
290
|
* EXPORTS_START
|
291
|
* EXPORT patterns_tree
|
292
|
* EXPORTS_END
|
293
|
*/
|