|
1 |
/**
|
|
2 |
* This file is part of Haketilo.
|
|
3 |
*
|
|
4 |
* Function: Data structure to query items by URL patterns.
|
|
5 |
*
|
|
6 |
* Copyright (C) 2021 Wojtek Kosior
|
|
7 |
*
|
|
8 |
* This program is free software: you can redistribute it and/or modify
|
|
9 |
* it under the terms of the GNU General Public License as published by
|
|
10 |
* the Free Software Foundation, either version 3 of the License, or
|
|
11 |
* (at your option) any later version.
|
|
12 |
*
|
|
13 |
* This program is distributed in the hope that it will be useful,
|
|
14 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
15 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
16 |
* GNU General Public License for more details.
|
|
17 |
*
|
|
18 |
* As additional permission under GNU GPL version 3 section 7, you
|
|
19 |
* may distribute forms of that code without the copy of the GNU
|
|
20 |
* GPL normally required by section 4, provided you include this
|
|
21 |
* license notice and, in case of non-source distribution, a URL
|
|
22 |
* through which recipients can access the Corresponding Source.
|
|
23 |
* If you modify file(s) with this exception, you may extend this
|
|
24 |
* exception to your version of the file(s), but you are not
|
|
25 |
* obligated to do so. If you do not wish to do so, delete this
|
|
26 |
* exception statement from your version.
|
|
27 |
*
|
|
28 |
* As a special exception to the GPL, any HTML file which merely
|
|
29 |
* makes function calls to this code, and for that purpose
|
|
30 |
* includes it by reference shall be deemed a separate work for
|
|
31 |
* copyright law purposes. If you modify this code, you may extend
|
|
32 |
* this exception to your version of the code, but you are not
|
|
33 |
* obligated to do so. If you do not wish to do so, delete this
|
|
34 |
* exception statement from your version.
|
|
35 |
*
|
|
36 |
* You should have received a copy of the GNU General Public License
|
|
37 |
* along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|
38 |
*
|
|
39 |
* I, Wojtek Kosior, thereby promise not to sue for violation of this file's
|
|
40 |
* license. Although I request that you do not make use this code in a
|
|
41 |
* proprietary program, I am not going to enforce this in court.
|
|
42 |
*/
|
|
43 |
|
|
44 |
/*
|
|
45 |
* IMPORTS_START
|
|
46 |
* IMPORT deconstruct_url
|
|
47 |
* IMPORTS_END
|
|
48 |
*/
|
|
49 |
|
|
50 |
/*
|
|
51 |
* This is a javascript rewrite of Python implementation of the algorithm here:
|
|
52 |
* https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205
|
|
53 |
*/
|
|
54 |
|
|
55 |
/* "Pattern Tree" is how we refer to the data structure used for querying
|
|
56 |
* Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
|
|
57 |
* is to make it possible for given URL to quickly retrieve all known patterns
|
|
58 |
* that match it.
|
|
59 |
*/
|
|
60 |
function make_tree_node() {
|
|
61 |
return {
|
|
62 |
wildcard_matches: [null, null, null],
|
|
63 |
literal_match: null,
|
|
64 |
children: null
|
|
65 |
};
|
|
66 |
}
|
|
67 |
|
|
68 |
const is_wildcard = segment => ["*", "**", "***"].lastIndexOf(segment) >= 0;
|
|
69 |
|
|
70 |
/*
|
|
71 |
* Yields all matches of this segments sequence against the tree that starts at
|
|
72 |
* this node. Results are produces in order from greatest to lowest pattern
|
|
73 |
* specificity.
|
|
74 |
*/
|
|
75 |
function* search_sequence(tree_node, segments)
|
|
76 |
{
|
|
77 |
const nodes = [tree_node];
|
|
78 |
|
|
79 |
for (const segment of segments) {
|
|
80 |
const next_node = nodes[nodes.length - 1].children[segment];
|
|
81 |
if (next_node === undefined)
|
|
82 |
break;
|
|
83 |
|
|
84 |
nodes.push(next_node);
|
|
85 |
}
|
|
86 |
|
|
87 |
const nsegments = segments.length;
|
|
88 |
|
|
89 |
const conds = [
|
|
90 |
/* literal pattern match */
|
|
91 |
() => nodes.length == nsegments,
|
|
92 |
/* wildcard pattern matches */
|
|
93 |
() => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
|
|
94 |
() => nodes.length + 1 < nsegments,
|
|
95 |
() => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
|
|
96 |
];
|
|
97 |
|
|
98 |
while (nodes.length) {
|
|
99 |
const node = nodes.pop();
|
|
100 |
const items = [node.literal_match, ...node.wildcard_matches];
|
|
101 |
|
|
102 |
for (let i = 0; i < 4; i++) {
|
|
103 |
if (items[i] !== null && conds[i]())
|
|
104 |
yield items[i];
|
|
105 |
}
|
|
106 |
}
|
|
107 |
}
|
|
108 |
|
|
109 |
/*
|
|
110 |
* Make item queryable through (this branch of) the Pattern Tree. If there was
|
|
111 |
* not yet any item associated with the tree path designated by segments, create
|
|
112 |
* a new one using item_instantiator() function. Return all items matching this
|
|
113 |
* path (both the ones that existed and the ones just created).
|
|
114 |
*/
|
|
115 |
function add_sequence(tree_node, segments, item_instantiator)
|
|
116 |
{
|
|
117 |
let wildcards;
|
|
118 |
|
|
119 |
for (var segment of segments) {
|
|
120 |
wildcards = tree_node.wildcard_matches;
|
|
121 |
|
|
122 |
const child = tree_node.children[segment] || make_tree_node();
|
|
123 |
tree_node.children[segment] = child;
|
|
124 |
tree_node = child;
|
|
125 |
}
|
|
126 |
|
|
127 |
if (tree_node.literal_match === null)
|
|
128 |
tree_node.literal_match = item_instantiator();
|
|
129 |
|
|
130 |
if (!is_wildcard(segment))
|
|
131 |
return [tree_node.literal_match];
|
|
132 |
|
|
133 |
if (wildcards[segment.length - 1] === null)
|
|
134 |
wildcards[segment.length - 1] = item_instantiator();
|
|
135 |
|
|
136 |
return [tree_node.literal_match, wildcards[segment.length - 1]];
|
|
137 |
}
|
|
138 |
|
|
139 |
/*
|
|
140 |
* Can be used to remove item from (this branch of) the Pattern Tree.
|
|
141 |
* item_modifier should be a function that accepts 1 argument, the item stored
|
|
142 |
* in the tree, and returns an item that should be used in place of the first
|
|
143 |
* one. It is also legal for it to return the same item modifying it first. If
|
|
144 |
* it returns `undefined`, it means the item should be deleted from the Tree.
|
|
145 |
*
|
|
146 |
* item_modifier() will be called with all items in (this branch of) the Pattern
|
|
147 |
* Tree that correspond to the path designated by segments. The Tree will be
|
|
148 |
* updated accordingly.
|
|
149 |
*/
|
|
150 |
function modify_sequence(tree_node, segments, item_modifier)
|
|
151 |
{
|
|
152 |
const nodes = [tree_node];
|
|
153 |
|
|
154 |
for (const segment of segments) {
|
|
155 |
const next_node = nodes[nodes.length - 1].children[segment];
|
|
156 |
nodes.push(next_node);
|
|
157 |
}
|
|
158 |
|
|
159 |
const nsegments = segments.length;
|
|
160 |
let i = segments.length - 1;
|
|
161 |
|
|
162 |
if (nsegments > 0 && is_wildcard(segments[i])) {
|
|
163 |
const node = nodes[nodes.length - 2];
|
|
164 |
const asterisks = segments[i].length;
|
|
165 |
}
|
|
166 |
|
|
167 |
............................
|
|
168 |
|
|
169 |
for (let i = segments.length - 1; i >= 0; i--) {
|
|
170 |
|
|
171 |
if (i
|
|
172 |
}
|
|
173 |
|
|
174 |
segments
|
|
175 |
|
|
176 |
const conds = [
|
|
177 |
/* literal pattern match */
|
|
178 |
() => nodes.length == nsegments,
|
|
179 |
/* wildcard pattern matches */
|
|
180 |
() => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
|
|
181 |
() => nodes.length + 1 < nsegments,
|
|
182 |
() => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
|
|
183 |
];
|
|
184 |
|
|
185 |
while (nodes.length) {
|
|
186 |
const node = nodes.pop();
|
|
187 |
const items = [node.literal_match, ...node.wildcard_matches];
|
|
188 |
|
|
189 |
for (let i = 0; i < 4; i++) {
|
|
190 |
if (items[i] !== null && conds[i]())
|
|
191 |
yield items[i];
|
|
192 |
}
|
|
193 |
}
|
|
194 |
.................
|
|
195 |
}
|
|
196 |
|
|
197 |
/*
|
|
198 |
* Make item queryable through the Pattern Tree that starts with the protocols
|
|
199 |
* dictionary object passed in the first argument.
|
|
200 |
*/
|
|
201 |
function pattern_tree_register(patterns_by_proto, pattern, item_name, item)
|
|
202 |
{
|
|
203 |
/*
|
|
204 |
* We pass 'false' to disable length limits on URL parts. Length limits are
|
|
205 |
* mostly useful in case of iteration over all patterns matching given URL.
|
|
206 |
* Here we don't do that.
|
|
207 |
*/
|
|
208 |
const deco = deconstruct_url(pattern, false);
|
|
209 |
|
|
210 |
const tree = patterns_by_proto[deco.proto] || make_tree_node();
|
|
211 |
patterns_by_proto[deco.proto] = tree;
|
|
212 |
|
|
213 |
let path_trees;
|
|
214 |
|
|
215 |
if (deco.proto === "file") {
|
|
216 |
path_trees = [tree];
|
|
217 |
} else {
|
|
218 |
/* We need an aray of domain labels ordered most-significant-first. */
|
|
219 |
const domain = [...deco.domain].reverse();
|
|
220 |
path_trees = add_sequence(tree, domain, make_tree_node);
|
|
221 |
}
|
|
222 |
|
|
223 |
for (const path_tree of path_trees) {
|
|
224 |
for (const match_obj of add_sequence(path_tree, deco.path, () => {}))
|
|
225 |
match_obj[item_name] = item;
|
|
226 |
}
|
|
227 |
}
|
|
228 |
|
|
229 |
/*
|
|
230 |
* Remove registered item from the Pattern Tree that starts with the protocols
|
|
231 |
* dictionary object passed in the first argument. The remaining 2 arguments
|
|
232 |
* should be pattern and name that have been earlier passed to
|
|
233 |
* pattern_tree_register().
|
|
234 |
*/
|
|
235 |
function pattern_tree_deregister(patterns_by_proto, pattern, item_name)
|
|
236 |
{
|
|
237 |
const deco = deconstruct_url(pattern, false);
|
|
238 |
|
|
239 |
const tree = patterns_by_proto[deco.proto] || make_tree_node();
|
|
240 |
patterns_by_proto[deco.proto] = tree;
|
|
241 |
|
|
242 |
let path_trees;
|
|
243 |
|
|
244 |
if (deco.proto === "file") {
|
|
245 |
path_trees = [tree];
|
|
246 |
} else {
|
|
247 |
/* We need an aray of domain labels ordered most-significant-first. */
|
|
248 |
const domain = [...deco.domain].reverse();
|
|
249 |
..........................
|
|
250 |
path_trees = add_sequence(tree, domain, make_tree_node);
|
|
251 |
}
|
|
252 |
}
|
|
253 |
|
|
254 |
/*
|
|
255 |
* Yield registered items (as [item_name, item] arrays) that match url. For
|
|
256 |
* cosistent behavior, please don't modify the pattern tree while iterating over
|
|
257 |
* the results.
|
|
258 |
*/
|
|
259 |
function* pattern_tree_search(patterns_by_proto, url)
|
|
260 |
{
|
|
261 |
const deco = deconstruct_url(url, false);
|
|
262 |
|
|
263 |
tree = patterns_by_proto[deco.proto] || make_tree_node();
|
|
264 |
|
|
265 |
let path_trees;
|
|
266 |
|
|
267 |
if (deco.proto === "file") {
|
|
268 |
path_trees = [tree];
|
|
269 |
} else {
|
|
270 |
/* We need an aray of domain labels ordered most-significant-first. */
|
|
271 |
const domain = [...deco.domain].reverse();
|
|
272 |
path_trees = search_sequence(tree, domain);
|
|
273 |
}
|
|
274 |
|
|
275 |
for (const path_tree of path_trees) {
|
|
276 |
for (const match_obj of search_sequence(path_tree, deco.path)) {
|
|
277 |
for (const entry of Object.entries(match_obj))
|
|
278 |
yield entry;
|
|
279 |
}
|
|
280 |
}
|
|
281 |
}
|
|
282 |
|
|
283 |
const patterns_tree = {
|
|
284 |
make: () => {},
|
|
285 |
register: patterns_tree_register,
|
|
286 |
search: patterns_tree_search
|
|
287 |
}
|
|
288 |
|
|
289 |
/*
|
|
290 |
* EXPORTS_START
|
|
291 |
* EXPORT patterns_tree
|
|
292 |
* EXPORTS_END
|
|
293 |
*/
|
implement more efficient querying of URL patterns