1
|
/**
|
2
|
* This file is part of Haketilo.
|
3
|
*
|
4
|
* Function: Data structure to query items by URL patterns.
|
5
|
*
|
6
|
* Copyright (C) 2021 Wojtek Kosior
|
7
|
*
|
8
|
* This program is free software: you can redistribute it and/or modify
|
9
|
* it under the terms of the GNU General Public License as published by
|
10
|
* the Free Software Foundation, either version 3 of the License, or
|
11
|
* (at your option) any later version.
|
12
|
*
|
13
|
* This program is distributed in the hope that it will be useful,
|
14
|
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
15
|
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
16
|
* GNU General Public License for more details.
|
17
|
*
|
18
|
* As additional permission under GNU GPL version 3 section 7, you
|
19
|
* may distribute forms of that code without the copy of the GNU
|
20
|
* GPL normally required by section 4, provided you include this
|
21
|
* license notice and, in case of non-source distribution, a URL
|
22
|
* through which recipients can access the Corresponding Source.
|
23
|
* If you modify file(s) with this exception, you may extend this
|
24
|
* exception to your version of the file(s), but you are not
|
25
|
* obligated to do so. If you do not wish to do so, delete this
|
26
|
* exception statement from your version.
|
27
|
*
|
28
|
* As a special exception to the GPL, any HTML file which merely
|
29
|
* makes function calls to this code, and for that purpose
|
30
|
* includes it by reference shall be deemed a separate work for
|
31
|
* copyright law purposes. If you modify this code, you may extend
|
32
|
* this exception to your version of the code, but you are not
|
33
|
* obligated to do so. If you do not wish to do so, delete this
|
34
|
* exception statement from your version.
|
35
|
*
|
36
|
* You should have received a copy of the GNU General Public License
|
37
|
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
38
|
*
|
39
|
* I, Wojtek Kosior, thereby promise not to sue for violation of this file's
|
40
|
* license. Although I request that you do not make use this code in a
|
41
|
* proprietary program, I am not going to enforce this in court.
|
42
|
*/
|
43
|
|
44
|
/*
|
45
|
* IMPORTS_START
|
46
|
* IMPORT deconstruct_url
|
47
|
* IMPORTS_END
|
48
|
*/
|
49
|
|
50
|
/*
|
51
|
* This is a javascript rewrite of Python implementation of the algorithm here:
|
52
|
* https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205
|
53
|
*/
|
54
|
|
55
|
/* "Pattern Tree" is how we refer to the data structure used for querying
|
56
|
* Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
|
57
|
* is to make it possible for given URL to quickly retrieve all known patterns
|
58
|
* that match it.
|
59
|
*/
|
60
|
function make_tree_node() {
|
61
|
return {
|
62
|
wildcard_matches: [null, null, null],
|
63
|
literal_match: null,
|
64
|
children: {}
|
65
|
};
|
66
|
}
|
67
|
|
68
|
function is_empty_node(tree_node) {
|
69
|
const children = tree_node.children;
|
70
|
for (const key in children) {
|
71
|
if (children.hasOwnProperty(key))
|
72
|
return false;
|
73
|
}
|
74
|
|
75
|
if (Array.reduce(tree_node.wildcard_matches, (a, b) => b && a !== null, 1))
|
76
|
return false;
|
77
|
|
78
|
return tree_node.literal_match === null;
|
79
|
}
|
80
|
|
81
|
const is_wildcard = segment => ["*", "**", "***"].lastIndexOf(segment) >= 0;
|
82
|
|
83
|
/*
|
84
|
* Yields all matches of this segments sequence against the tree that starts at
|
85
|
* this node. Results are produces in order from greatest to lowest pattern
|
86
|
* specificity.
|
87
|
*/
|
88
|
function* search_sequence(tree_node, segments)
|
89
|
{
|
90
|
const nodes = [tree_node];
|
91
|
|
92
|
for (const segment of segments) {
|
93
|
const next_node = nodes[nodes.length - 1].children[segment];
|
94
|
if (next_node === undefined)
|
95
|
break;
|
96
|
|
97
|
nodes.push(next_node);
|
98
|
}
|
99
|
|
100
|
const nsegments = segments.length;
|
101
|
|
102
|
const conds = [
|
103
|
/* literal pattern match */
|
104
|
() => nodes.length == nsegments,
|
105
|
/* wildcard pattern matches */
|
106
|
() => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
|
107
|
() => nodes.length + 1 < nsegments,
|
108
|
() => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
|
109
|
];
|
110
|
|
111
|
while (nodes.length) {
|
112
|
const node = nodes.pop();
|
113
|
const items = [node.literal_match, ...node.wildcard_matches];
|
114
|
|
115
|
for (let i = 0; i < 4; i++) {
|
116
|
if (items[i] !== null && conds[i]())
|
117
|
yield items[i];
|
118
|
}
|
119
|
}
|
120
|
}
|
121
|
|
122
|
/*
|
123
|
* Make item queryable through (this branch of) the Pattern Tree or remove its
|
124
|
* path from there.
|
125
|
*
|
126
|
* item_modifier should be a function that accepts 1 argument, the item stored
|
127
|
* in the tree (or `null` if there wasn't any item there), and returns an item
|
128
|
* that should be used in place of the first one. It is also legal for it to
|
129
|
* return the same item modifying it first. If it returns `null`, it means the
|
130
|
* item should be deleted from the Tree.
|
131
|
*
|
132
|
* If there was not yet any item associated with the tree path designated by
|
133
|
* segments and value returned by item_modifier is not `null`, make the value
|
134
|
* queryable by this path.
|
135
|
*/
|
136
|
function modify_sequence(tree_node, segments, item_modifier)
|
137
|
{
|
138
|
const nodes = [tree_node];
|
139
|
let removed = true;
|
140
|
|
141
|
for (var current_segment of segments) {
|
142
|
wildcards = tree_node.wildcard_matches;
|
143
|
|
144
|
const child = tree_node.children[current_segment] || make_tree_node();
|
145
|
tree_node.children[current_segment] = child;
|
146
|
tree_node = child;
|
147
|
nodes.push(tree_node);
|
148
|
}
|
149
|
|
150
|
tree_node.literal_match = item_modifier(tree_node.literal_match);
|
151
|
if (tree_node.literal_match !== null)
|
152
|
removed = false;
|
153
|
|
154
|
let i = segments.length;
|
155
|
|
156
|
if (is_wildcard(current_segment)) {
|
157
|
const asterisks = current_segment.length - 1;
|
158
|
const wildcards = nodes[i - 1].wildcard_matches;
|
159
|
wildcards[asterisks] = item_modifier(wildcards[asterisks]);
|
160
|
if (wildcards[asterisks] !== null)
|
161
|
removed = false;
|
162
|
}
|
163
|
|
164
|
while (!removed)
|
165
|
return;
|
166
|
|
167
|
while (i > 0) {
|
168
|
tree_node = nodes[i--];
|
169
|
if (is_empty_node(tree_node))
|
170
|
delete nodes[i].children[segments[i]];
|
171
|
}
|
172
|
}
|
173
|
|
174
|
/*
|
175
|
* Make item queryable through the Pattern Tree that starts with the protocols
|
176
|
* dictionary object passed in the first argument.
|
177
|
*/
|
178
|
function pattern_tree_register(patterns_by_proto, pattern, item_name, item)
|
179
|
{
|
180
|
/*
|
181
|
* We pass 'false' to disable length limits on URL parts. Length limits are
|
182
|
* mostly useful in case of iteration over all patterns matching given URL.
|
183
|
* Here we don't do that.
|
184
|
*/
|
185
|
const deco = deconstruct_url(pattern, false);
|
186
|
|
187
|
const tree = patterns_by_proto[deco.proto] || make_tree_node();
|
188
|
patterns_by_proto[deco.proto] = tree;
|
189
|
|
190
|
let path_trees;
|
191
|
|
192
|
if (deco.proto === "file") {
|
193
|
path_trees = [tree];
|
194
|
} else {
|
195
|
/* We need an aray of domain labels ordered most-significant-first. */
|
196
|
const domain = [...deco.domain].reverse();
|
197
|
path_trees = add_sequence(tree, domain, make_tree_node);
|
198
|
}
|
199
|
|
200
|
for (const path_tree of path_trees) {
|
201
|
for (const match_obj of add_sequence(path_tree, deco.path, () => {}))
|
202
|
match_obj[item_name] = item;
|
203
|
}
|
204
|
}
|
205
|
|
206
|
// /*
|
207
|
// * Remove registered item from the Pattern Tree that starts with the protocols
|
208
|
// * dictionary object passed in the first argument. The remaining 2 arguments
|
209
|
// * should be pattern and name that have been earlier passed to
|
210
|
// * pattern_tree_register().
|
211
|
// */
|
212
|
// function pattern_tree_deregister(patterns_by_proto, pattern, item_name)
|
213
|
// {
|
214
|
// const deco = deconstruct_url(pattern, false);
|
215
|
|
216
|
// const tree = patterns_by_proto[deco.proto] || make_tree_node();
|
217
|
// patterns_by_proto[deco.proto] = tree;
|
218
|
|
219
|
// let path_trees;
|
220
|
|
221
|
// if (deco.proto === "file") {
|
222
|
// path_trees = [tree];
|
223
|
// } else {
|
224
|
// /* We need an aray of domain labels ordered most-significant-first. */
|
225
|
// const domain = [...deco.domain].reverse();
|
226
|
// ..........................
|
227
|
// path_trees = add_sequence(tree, domain, make_tree_node);
|
228
|
// }
|
229
|
// }
|
230
|
|
231
|
/*
|
232
|
* Yield registered items (as [item_name, item] arrays) that match url. For
|
233
|
* cosistent behavior, please don't modify the pattern tree while iterating over
|
234
|
* the results.
|
235
|
*/
|
236
|
function* pattern_tree_search(patterns_by_proto, url)
|
237
|
{
|
238
|
const deco = deconstruct_url(url, false);
|
239
|
|
240
|
tree = patterns_by_proto[deco.proto] || make_tree_node();
|
241
|
|
242
|
let path_trees;
|
243
|
|
244
|
if (deco.proto === "file") {
|
245
|
path_trees = [tree];
|
246
|
} else {
|
247
|
/* We need an aray of domain labels ordered most-significant-first. */
|
248
|
const domain = [...deco.domain].reverse();
|
249
|
path_trees = search_sequence(tree, domain);
|
250
|
}
|
251
|
|
252
|
for (const path_tree of path_trees) {
|
253
|
for (const match_obj of search_sequence(path_tree, deco.path)) {
|
254
|
for (const entry of Object.entries(match_obj))
|
255
|
yield entry;
|
256
|
}
|
257
|
}
|
258
|
}
|
259
|
|
260
|
const pattern_tree = {
|
261
|
make: () => {},
|
262
|
register: pattern_tree_register,
|
263
|
// deregister: pattern_tree_deregister,
|
264
|
search: pattern_tree_search
|
265
|
}
|
266
|
|
267
|
/*
|
268
|
* EXPORTS_START
|
269
|
* EXPORT pattern_tree
|
270
|
* EXPORTS_END
|
271
|
*/
|