Project

General

Profile

« Previous | Next » 

Revision 69e53743

Added by koszko almost 2 years ago

implement more efficient querying of URL patterns

View differences:

common/patterns.js
7 7
 * Redistribution terms are gathered in the `copyright' file.
8 8
 */
9 9

  
10
const MAX_URL_PATH_LEN = 12;
11
const MAX_URL_PATH_CHARS = 255;
12
const MAX_DOMAIN_LEN = 7;
13
const MAX_DOMAIN_CHARS = 100;
10
const MAX = {
11
    URL_PATH_LEN:   12,
12
    URL_PATH_CHARS: 255,
13
    DOMAIN_LEN:     7,
14
    DOMAIN_CHARS:   100
15
};
14 16

  
15 17
const proto_regex = /^(\w+):\/\/(.*)$/;
16 18

  
......
25 27

  
26 28
const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);
27 29

  
28
function deconstruct_url(url)
30
function deconstruct_url(url, use_limits=true)
29 31
{
32
    const max = MAX;
33
    if (!use_limits) {
34
	for (key in MAX)
35
	    max[key] = Infinity;
36
    }
37

  
30 38
    const proto_match = proto_regex.exec(url);
31 39
    if (proto_match === null)
32
	return undefined;
40
	throw `bad url '${url}'`;
33 41

  
34 42
    const deco = {proto: proto_match[1]};
35 43

  
......
37 45
	deco.path = file_regex.exec(proto_match[2])[1];
38 46
    } else if (deco.proto === "ftp") {
39 47
	[deco.domain, deco.path] = ftp_regex.exec(proto_match[2]).slice(2, 4);
40
    } else {
48
    } else if (deco.proto === "http" || deco.proto === "https") {
41 49
	const http_match = http_regex.exec(proto_match[2]);
42 50
	if (!http_match)
43 51
	    return undefined;
44 52
	[deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
53
    } else {
54
	throw `unsupported protocol in url '${url}'`;
45 55
    }
46 56

  
47
    const leading_dash = deco.path[0] === "/";
48 57
    deco.trailing_dash = deco.path[deco.path.length - 1] === "/";
49 58

  
50 59
    if (deco.domain) {
51
	if (deco.domain.length > MAX_DOMAIN_CHARS) {
60
	if (deco.domain.length > max.DOMAIN_CHARS) {
52 61
	    const idx = deco.domain.indexOf(".", deco.domain.length -
53
					    MAX_DOMAIN_CHARS);
62
					    max.DOMAIN_CHARS);
54 63
	    if (idx === -1)
55 64
		deco.domain = [];
56 65
	    else
......
59 68
	    deco.domain_truncated = true;
60 69
	}
61 70

  
62
	if (deco.path.length > MAX_URL_PATH_CHARS) {
71
	if (deco.path.length > max.URL_PATH_CHARS) {
63 72
	    deco.path = deco.path.substring(0, deco.path.lastIndexOf("/"));
64 73
	    deco.path_truncated = true;
65 74
	}
......
67 76

  
68 77
    if (typeof deco.domain === "string") {
69 78
	deco.domain = deco.domain.split(".");
70
	if (deco.domain.splice(0, deco.domain.length - MAX_DOMAIN_LEN).length
79
	if (deco.domain.splice(0, deco.domain.length - max.DOMAIN_LEN).length
71 80
	    > 0)
72 81
	    deco.domain_truncated = true;
73 82
    }
74 83

  
75 84
    deco.path = deco.path.split("/").filter(s => s !== "");
76
    if (deco.domain && deco.path.splice(MAX_URL_PATH_LEN).length > 0)
85
    if (deco.domain && deco.path.splice(max.URL_PATH_LEN).length > 0)
77 86
	deco.path_truncated = true;
78
    if (leading_dash || deco.path.length === 0)
79
	deco.path.unshift("");
80 87

  
81 88
    return deco;
82 89
}
......
98 105

  
99 106
function* each_path_pattern(deco)
100 107
{
101
    for (let slice = deco.path.length; slice > 0; slice--) {
102
	const path_part = deco.path.slice(0, slice).join("/");
108
    for (let slice = deco.path.length; slice >= 0; slice--) {
109
	const path_part = ["", ...deco.path.slice(0, slice)].join("/");
103 110
	const path_wildcards = [];
104 111
	if (slice === deco.path.length && !deco.path_truncated) {
105
	    if (deco.trailing_dash)
112
	    if (deco.trailing_dash && path_part !== )
106 113
		yield path_part + "/";
107
	    yield path_part;
114
	    if (part_part !== "" || deco.proto !== "file")
115
		yield path_part;
108 116
	}
109 117
	if (slice === deco.path.length - 1 && !deco.path_truncated &&
110 118
	    deco.path[slice] !== "*")
......
137 145
/*
138 146
 * EXPORTS_START
139 147
 * EXPORT each_url_pattern
148
 * EXPORT deconstruct_url
140 149
 * EXPORTS_END
141 150
 */
common/patterns_query_tree.js
1
/**
2
 * This file is part of Haketilo.
3
 *
4
 * Function: Data structure to query items by URL patterns.
5
 *
6
 * Copyright (C) 2021 Wojtek Kosior
7
 *
8
 * This program is free software: you can redistribute it and/or modify
9
 * it under the terms of the GNU General Public License as published by
10
 * the Free Software Foundation, either version 3 of the License, or
11
 * (at your option) any later version.
12
 *
13
 * This program is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16
 * GNU General Public License for more details.
17
 *
18
 * As additional permission under GNU GPL version 3 section 7, you
19
 * may distribute forms of that code without the copy of the GNU
20
 * GPL normally required by section 4, provided you include this
21
 * license notice and, in case of non-source distribution, a URL
22
 * through which recipients can access the Corresponding Source.
23
 * If you modify file(s) with this exception, you may extend this
24
 * exception to your version of the file(s), but you are not
25
 * obligated to do so. If you do not wish to do so, delete this
26
 * exception statement from your version.
27
 *
28
 * As a special exception to the GPL, any HTML file which merely
29
 * makes function calls to this code, and for that purpose
30
 * includes it by reference shall be deemed a separate work for
31
 * copyright law purposes. If you modify this code, you may extend
32
 * this exception to your version of the code, but you are not
33
 * obligated to do so. If you do not wish to do so, delete this
34
 * exception statement from your version.
35
 *
36
 * You should have received a copy of the GNU General Public License
37
 * along with this program.  If not, see <https://www.gnu.org/licenses/>.
38
 *
39
 * I, Wojtek Kosior, thereby promise not to sue for violation of this file's
40
 * license. Although I request that you do not make use this code in a
41
 * proprietary program, I am not going to enforce this in court.
42
 */
43

  
44
/*
45
 * IMPORTS_START
46
 * IMPORT deconstruct_url
47
 * IMPORTS_END
48
 */
49

  
50
/*
51
 * This is a javascript rewrite of Python implementation of the algorithm here:
52
 * https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205
53
 */
54

  
55
/* "Pattern Tree" is how we refer to the data structure used for querying
56
 * Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal
57
 * is to make it possible for given URL to quickly retrieve all known patterns
58
 * that match it.
59
 */
60
function make_tree_node() {
61
    return {
62
	wildcard_matches: [null, null, null],
63
	literal_match:    null,
64
	children:         null
65
    };
66
}
67

  
68
const is_wildcard = segment => ["*", "**", "***"].lastIndexOf(segment) >= 0;
69

  
70
/*
71
 * Yields all matches of this segments sequence against the tree that starts at
72
 * this node. Results are produces in order from greatest to lowest pattern
73
 * specificity.
74
 */
75
function* search_sequence(tree_node, segments)
76
{
77
    const nodes = [tree_node];
78

  
79
    for (const segment of segments) {
80
	const next_node = nodes[nodes.length - 1].children[segment];
81
	if (next_node === undefined)
82
	    break;
83

  
84
	nodes.push(next_node);
85
    }
86

  
87
    const nsegments = segments.length;
88

  
89
    const conds = [
90
	/* literal pattern match */
91
	() => nodes.length     == nsegments,
92
	/* wildcard pattern matches */
93
	() => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
94
	() => nodes.length + 1 <  nsegments,
95
	() => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
96
    ];
97

  
98
    while (nodes.length) {
99
	const node = nodes.pop();
100
	const items = [node.literal_match, ...node.wildcard_matches];
101

  
102
	for (let i = 0; i < 4; i++) {
103
	    if (items[i] !== null && conds[i]())
104
		yield items[i];
105
	}
106
    }
107
}
108

  
109
/*
110
 * Make item queryable through (this branch of) the Pattern Tree. If there was
111
 * not yet any item associated with the tree path designated by segments, create
112
 * a new one using item_instantiator() function. Return all items matching this
113
 * path (both the ones that existed and the ones just created).
114
 */
115
function add_sequence(tree_node, segments, item_instantiator)
116
{
117
    let wildcards;
118

  
119
    for (var segment of segments) {
120
	wildcards = tree_node.wildcard_matches;
121

  
122
	const child = tree_node.children[segment] || make_tree_node();
123
	tree_node.children[segment] = child;
124
	tree_node = child;
125
    }
126

  
127
    if (tree_node.literal_match === null)
128
	tree_node.literal_match = item_instantiator();
129

  
130
    if (!is_wildcard(segment))
131
	return [tree_node.literal_match];
132

  
133
    if (wildcards[segment.length - 1] === null)
134
	wildcards[segment.length - 1] = item_instantiator();
135

  
136
    return [tree_node.literal_match, wildcards[segment.length - 1]];
137
}
138

  
139
/*
140
 * Can be used to remove item from (this branch of) the Pattern Tree.
141
 * item_modifier should be a function that accepts 1 argument, the item stored
142
 * in the tree, and returns an item that should be used in place of the first
143
 * one. It is also legal for it to return the same item modifying it first. If
144
 * it returns `undefined`, it means the item should be deleted from the Tree.
145
 *
146
 * item_modifier() will be called with all items in (this branch of) the Pattern
147
 * Tree that correspond to the path designated by segments. The Tree will be
148
 * updated accordingly.
149
 */
150
function modify_sequence(tree_node, segments, item_modifier)
151
{
152
    const nodes = [tree_node];
153

  
154
    for (const segment of segments) {
155
	const next_node = nodes[nodes.length - 1].children[segment];
156
	nodes.push(next_node);
157
    }
158

  
159
    const nsegments = segments.length;
160
    let i = segments.length - 1;
161

  
162
    if (nsegments > 0 && is_wildcard(segments[i])) {
163
	const node = nodes[nodes.length - 2];
164
	const asterisks = segments[i].length;
165
    }
166

  
167
    ............................
168

  
169
    for (let i = segments.length - 1; i >= 0; i--) {
170

  
171
	if (i
172
    }
173

  
174
    segments
175

  
176
    const conds = [
177
	/* literal pattern match */
178
	() => nodes.length     == nsegments,
179
	/* wildcard pattern matches */
180
	() => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*",
181
	() => nodes.length + 1 <  nsegments,
182
	() => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***"
183
    ];
184

  
185
    while (nodes.length) {
186
	const node = nodes.pop();
187
	const items = [node.literal_match, ...node.wildcard_matches];
188

  
189
	for (let i = 0; i < 4; i++) {
190
	    if (items[i] !== null && conds[i]())
191
		yield items[i];
192
	}
193
    }
194
    .................
195
}
196

  
197
/*
198
 * Make item queryable through the Pattern Tree that starts with the protocols
199
 * dictionary object passed in the first argument.
200
 */
201
function pattern_tree_register(patterns_by_proto, pattern, item_name, item)
202
{
203
    /*
204
     * We pass 'false' to disable length limits on URL parts. Length limits are
205
     * mostly useful in case of iteration over all patterns matching given URL.
206
     * Here we don't do that.
207
     */
208
    const deco = deconstruct_url(pattern, false);
209

  
210
    const tree = patterns_by_proto[deco.proto] || make_tree_node();
211
    patterns_by_proto[deco.proto] = tree;
212

  
213
    let path_trees;
214

  
215
    if (deco.proto === "file") {
216
	path_trees = [tree];
217
    } else {
218
	/* We need an aray of domain labels ordered most-significant-first. */
219
	const domain = [...deco.domain].reverse();
220
	path_trees = add_sequence(tree, domain, make_tree_node);
221
    }
222

  
223
    for (const path_tree of path_trees) {
224
        for (const match_obj of add_sequence(path_tree, deco.path, () => {}))
225
            match_obj[item_name] = item;
226
    }
227
}
228

  
229
/*
230
 * Remove registered item from the Pattern Tree that starts with the protocols
231
 * dictionary object passed in the first argument. The remaining 2 arguments
232
 * should be pattern and name that have been earlier passed to
233
 * pattern_tree_register().
234
 */
235
function pattern_tree_deregister(patterns_by_proto, pattern, item_name)
236
{
237
    const deco = deconstruct_url(pattern, false);
238

  
239
    const tree = patterns_by_proto[deco.proto] || make_tree_node();
240
    patterns_by_proto[deco.proto] = tree;
241

  
242
    let path_trees;
243

  
244
    if (deco.proto === "file") {
245
	path_trees = [tree];
246
    } else {
247
	/* We need an aray of domain labels ordered most-significant-first. */
248
	const domain = [...deco.domain].reverse();
249
	..........................
250
	path_trees = add_sequence(tree, domain, make_tree_node);
251
    }
252
}
253

  
254
/*
255
 * Yield registered items (as [item_name, item] arrays) that match url. For
256
 * cosistent behavior, please don't modify the pattern tree while iterating over
257
 * the results.
258
 */
259
function* pattern_tree_search(patterns_by_proto, url)
260
{
261
    const deco = deconstruct_url(url, false);
262

  
263
    tree = patterns_by_proto[deco.proto] || make_tree_node();
264

  
265
    let path_trees;
266

  
267
    if (deco.proto === "file") {
268
	path_trees = [tree];
269
    } else {
270
	/* We need an aray of domain labels ordered most-significant-first. */
271
	const domain = [...deco.domain].reverse();
272
	path_trees = search_sequence(tree, domain);
273
    }
274

  
275
    for (const path_tree of path_trees) {
276
	for (const match_obj of search_sequence(path_tree, deco.path)) {
277
	    for (const entry of Object.entries(match_obj))
278
		yield entry;
279
	}
280
    }
281
}
282

  
283
const patterns_tree = {
284
    make: () => {},
285
    register: patterns_tree_register,
286
    search: patterns_tree_search
287
}
288

  
289
/*
290
 * EXPORTS_START
291
 * EXPORT patterns_tree
292
 * EXPORTS_END
293
 */

Also available in: Unified diff