Revision 5c583de8
Added by koszko almost 2 years ago
| common/patterns.js | ||
|---|---|---|
| 17 | 17 |
const proto_regex = /^(\w+):\/\/(.*)$/; |
| 18 | 18 |
|
| 19 | 19 |
const user_re = "[^/?#@]+@" |
| 20 |
const domain_re = "[.a-zA-Z0-9-]+"; |
|
| 20 |
const domain_re = "[.*a-zA-Z0-9-]+";
|
|
| 21 | 21 |
const path_re = "[^?#]*"; |
| 22 | 22 |
const query_re = "\\??[^#]*"; |
| 23 | 23 |
|
| 24 | 24 |
const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);
|
| 25 | 25 |
|
| 26 |
const file_regex = new RegExp(`^(${path_re}).*`);
|
|
| 26 |
const file_regex = new RegExp(`^(/${path_re}).*`);
|
|
| 27 | 27 |
|
| 28 | 28 |
const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);
|
| 29 | 29 |
|
| 30 |
function match_or_throw(regex, string, error_msg) |
|
| 31 |
{
|
|
| 32 |
const match = regex.exec(string); |
|
| 33 |
if (match === null) |
|
| 34 |
throw error_msg; |
|
| 35 |
|
|
| 36 |
return match; |
|
| 37 |
} |
|
| 38 |
|
|
| 30 | 39 |
function deconstruct_url(url, use_limits=true) |
| 31 | 40 |
{
|
| 32 | 41 |
const max = MAX; |
| ... | ... | |
| 35 | 44 |
max[key] = Infinity; |
| 36 | 45 |
} |
| 37 | 46 |
|
| 38 |
const proto_match = proto_regex.exec(url); |
|
| 39 |
if (proto_match === null) |
|
| 40 |
throw `bad url '${url}'`;
|
|
| 47 |
const matcher = (re, str) => match_or_throw(re, str, `bad url '${url}'`)
|
|
| 41 | 48 |
|
| 49 |
const proto_match = matcher(proto_regex, url); |
|
| 42 | 50 |
const deco = {proto: proto_match[1]};
|
| 43 | 51 |
|
| 44 | 52 |
if (deco.proto === "file") {
|
| 45 |
deco.path = file_regex.exec(proto_match[2])[1];
|
|
| 53 |
deco.path = matcher(file_regex, proto_match[2])[1];
|
|
| 46 | 54 |
} else if (deco.proto === "ftp") {
|
| 47 |
[deco.domain, deco.path] = ftp_regex.exec(proto_match[2]).slice(2, 4); |
|
| 55 |
[deco.domain, deco.path] = |
|
| 56 |
matcher(ftp_regex, proto_match[2]).slice(2, 4); |
|
| 48 | 57 |
} else if (deco.proto === "http" || deco.proto === "https") {
|
| 49 |
const http_match = http_regex.exec(proto_match[2]); |
|
| 50 |
if (!http_match) |
|
| 51 |
return undefined; |
|
| 52 |
[deco.domain, deco.path, deco.query] = http_match.slice(1, 4); |
|
| 58 |
[deco.domain, deco.path, deco.query] = |
|
| 59 |
matcher(http_regex, proto_match[2]).slice(1, 4); |
|
| 53 | 60 |
deco.domain = deco.domain.toLowerCase(); |
| 54 | 61 |
} else {
|
| 55 | 62 |
throw `unsupported protocol in url '${url}'`;
|
| common/patterns_query_tree.js | ||
|---|---|---|
| 1 |
/** |
|
| 2 |
* This file is part of Haketilo. |
|
| 3 |
* |
|
| 4 |
* Function: Data structure to query items by URL patterns. |
|
| 5 |
* |
|
| 6 |
* Copyright (C) 2021 Wojtek Kosior |
|
| 7 |
* |
|
| 8 |
* This program is free software: you can redistribute it and/or modify |
|
| 9 |
* it under the terms of the GNU General Public License as published by |
|
| 10 |
* the Free Software Foundation, either version 3 of the License, or |
|
| 11 |
* (at your option) any later version. |
|
| 12 |
* |
|
| 13 |
* This program is distributed in the hope that it will be useful, |
|
| 14 |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 15 |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 16 |
* GNU General Public License for more details. |
|
| 17 |
* |
|
| 18 |
* As additional permission under GNU GPL version 3 section 7, you |
|
| 19 |
* may distribute forms of that code without the copy of the GNU |
|
| 20 |
* GPL normally required by section 4, provided you include this |
|
| 21 |
* license notice and, in case of non-source distribution, a URL |
|
| 22 |
* through which recipients can access the Corresponding Source. |
|
| 23 |
* If you modify file(s) with this exception, you may extend this |
|
| 24 |
* exception to your version of the file(s), but you are not |
|
| 25 |
* obligated to do so. If you do not wish to do so, delete this |
|
| 26 |
* exception statement from your version. |
|
| 27 |
* |
|
| 28 |
* As a special exception to the GPL, any HTML file which merely |
|
| 29 |
* makes function calls to this code, and for that purpose |
|
| 30 |
* includes it by reference shall be deemed a separate work for |
|
| 31 |
* copyright law purposes. If you modify this code, you may extend |
|
| 32 |
* this exception to your version of the code, but you are not |
|
| 33 |
* obligated to do so. If you do not wish to do so, delete this |
|
| 34 |
* exception statement from your version. |
|
| 35 |
* |
|
| 36 |
* You should have received a copy of the GNU General Public License |
|
| 37 |
* along with this program. If not, see <https://www.gnu.org/licenses/>. |
|
| 38 |
* |
|
| 39 |
* I, Wojtek Kosior, thereby promise not to sue for violation of this file's |
|
| 40 |
* license. Although I request that you do not make use this code in a |
|
| 41 |
* proprietary program, I am not going to enforce this in court. |
|
| 42 |
*/ |
|
| 43 |
|
|
| 44 |
/* |
|
| 45 |
* IMPORTS_START |
|
| 46 |
* IMPORT deconstruct_url |
|
| 47 |
* IMPORTS_END |
|
| 48 |
*/ |
|
| 49 |
|
|
| 50 |
/* |
|
| 51 |
* This is a javascript rewrite of Python implementation of the algorithm here: |
|
| 52 |
* https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205 |
|
| 53 |
*/ |
|
| 54 |
|
|
| 55 |
/* "Pattern Tree" is how we refer to the data structure used for querying |
|
| 56 |
* Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal |
|
| 57 |
* is to make it possible for given URL to quickly retrieve all known patterns |
|
| 58 |
* that match it. |
|
| 59 |
*/ |
|
| 60 |
function make_tree_node() {
|
|
| 61 |
return {
|
|
| 62 |
wildcard_matches: [null, null, null], |
|
| 63 |
literal_match: null, |
|
| 64 |
children: {}
|
|
| 65 |
}; |
|
| 66 |
} |
|
| 67 |
|
|
| 68 |
function is_empty_node(tree_node) {
|
|
| 69 |
const children = tree_node.children; |
|
| 70 |
for (const key in children) {
|
|
| 71 |
if (children.hasOwnProperty(key)) |
|
| 72 |
return false; |
|
| 73 |
} |
|
| 74 |
|
|
| 75 |
if (Array.reduce(tree_node.wildcard_matches, (a, b) => b && a !== null, 1)) |
|
| 76 |
return false; |
|
| 77 |
|
|
| 78 |
return tree_node.literal_match === null; |
|
| 79 |
} |
|
| 80 |
|
|
| 81 |
const is_wildcard = segment => ["*", "**", "***"].lastIndexOf(segment) >= 0; |
|
| 82 |
|
|
| 83 |
/* |
|
| 84 |
* Yields all matches of this segments sequence against the tree that starts at |
|
| 85 |
* this node. Results are produces in order from greatest to lowest pattern |
|
| 86 |
* specificity. |
|
| 87 |
*/ |
|
| 88 |
function* search_sequence(tree_node, segments) |
|
| 89 |
{
|
|
| 90 |
const nodes = [tree_node]; |
|
| 91 |
|
|
| 92 |
for (const segment of segments) {
|
|
| 93 |
const next_node = nodes[nodes.length - 1].children[segment]; |
|
| 94 |
if (next_node === undefined) |
|
| 95 |
break; |
|
| 96 |
|
|
| 97 |
nodes.push(next_node); |
|
| 98 |
} |
|
| 99 |
|
|
| 100 |
const nsegments = segments.length; |
|
| 101 |
|
|
| 102 |
const conds = [ |
|
| 103 |
/* literal pattern match */ |
|
| 104 |
() => nodes.length == nsegments, |
|
| 105 |
/* wildcard pattern matches */ |
|
| 106 |
() => nodes.length + 1 == nsegments && segments[nsegments - 1] != "*", |
|
| 107 |
() => nodes.length + 1 < nsegments, |
|
| 108 |
() => nodes.length + 1 != nsegments || segments[nsegments - 1] != "***" |
|
| 109 |
]; |
|
| 110 |
|
|
| 111 |
while (nodes.length) {
|
|
| 112 |
const node = nodes.pop(); |
|
| 113 |
const items = [node.literal_match, ...node.wildcard_matches]; |
|
| 114 |
|
|
| 115 |
for (let i = 0; i < 4; i++) {
|
|
| 116 |
if (items[i] !== null && conds[i]()) |
|
| 117 |
yield items[i]; |
|
| 118 |
} |
|
| 119 |
} |
|
| 120 |
} |
|
| 121 |
|
|
| 122 |
/* |
|
| 123 |
* Make item queryable through (this branch of) the Pattern Tree or remove its |
|
| 124 |
* path from there. |
|
| 125 |
* |
|
| 126 |
* item_modifier should be a function that accepts 1 argument, the item stored |
|
| 127 |
* in the tree (or `null` if there wasn't any item there), and returns an item |
|
| 128 |
* that should be used in place of the first one. It is also legal for it to |
|
| 129 |
* return the same item modifying it first. If it returns `null`, it means the |
|
| 130 |
* item should be deleted from the Tree. |
|
| 131 |
* |
|
| 132 |
* If there was not yet any item associated with the tree path designated by |
|
| 133 |
* segments and value returned by item_modifier is not `null`, make the value |
|
| 134 |
* queryable by this path. |
|
| 135 |
*/ |
|
| 136 |
function modify_sequence(tree_node, segments, item_modifier) |
|
| 137 |
{
|
|
| 138 |
const nodes = [tree_node]; |
|
| 139 |
let removed = true; |
|
| 140 |
|
|
| 141 |
for (var current_segment of segments) {
|
|
| 142 |
wildcards = tree_node.wildcard_matches; |
|
| 143 |
|
|
| 144 |
const child = tree_node.children[current_segment] || make_tree_node(); |
|
| 145 |
tree_node.children[current_segment] = child; |
|
| 146 |
tree_node = child; |
|
| 147 |
nodes.push(tree_node); |
|
| 148 |
} |
|
| 149 |
|
|
| 150 |
tree_node.literal_match = item_modifier(tree_node.literal_match); |
|
| 151 |
if (tree_node.literal_match !== null) |
|
| 152 |
removed = false; |
|
| 153 |
|
|
| 154 |
let i = segments.length; |
|
| 155 |
|
|
| 156 |
if (is_wildcard(current_segment)) {
|
|
| 157 |
const asterisks = current_segment.length - 1; |
|
| 158 |
const wildcards = nodes[i - 1].wildcard_matches; |
|
| 159 |
wildcards[asterisks] = item_modifier(wildcards[asterisks]); |
|
| 160 |
if (wildcards[asterisks] !== null) |
|
| 161 |
removed = false; |
|
| 162 |
} |
|
| 163 |
|
|
| 164 |
while (!removed) |
|
| 165 |
return; |
|
| 166 |
|
|
| 167 |
while (i > 0) {
|
|
| 168 |
tree_node = nodes[i--]; |
|
| 169 |
if (is_empty_node(tree_node)) |
|
| 170 |
delete nodes[i].children[segments[i]]; |
|
| 171 |
} |
|
| 172 |
} |
|
| 173 |
|
|
| 174 |
/* |
|
| 175 |
* Make item queryable through the Pattern Tree that starts with the protocols |
|
| 176 |
* dictionary object passed in the first argument. |
|
| 177 |
*/ |
|
| 178 |
function pattern_tree_register(patterns_by_proto, pattern, item_name, item) |
|
| 179 |
{
|
|
| 180 |
/* |
|
| 181 |
* We pass 'false' to disable length limits on URL parts. Length limits are |
|
| 182 |
* mostly useful in case of iteration over all patterns matching given URL. |
|
| 183 |
* Here we don't do that. |
|
| 184 |
*/ |
|
| 185 |
const deco = deconstruct_url(pattern, false); |
|
| 186 |
|
|
| 187 |
const tree = patterns_by_proto[deco.proto] || make_tree_node(); |
|
| 188 |
patterns_by_proto[deco.proto] = tree; |
|
| 189 |
|
|
| 190 |
let path_trees; |
|
| 191 |
|
|
| 192 |
if (deco.proto === "file") {
|
|
| 193 |
path_trees = [tree]; |
|
| 194 |
} else {
|
|
| 195 |
/* We need an aray of domain labels ordered most-significant-first. */ |
|
| 196 |
const domain = [...deco.domain].reverse(); |
|
| 197 |
path_trees = add_sequence(tree, domain, make_tree_node); |
|
| 198 |
} |
|
| 199 |
|
|
| 200 |
for (const path_tree of path_trees) {
|
|
| 201 |
for (const match_obj of add_sequence(path_tree, deco.path, () => {}))
|
|
| 202 |
match_obj[item_name] = item; |
|
| 203 |
} |
|
| 204 |
} |
|
| 205 |
|
|
| 206 |
// /* |
|
| 207 |
// * Remove registered item from the Pattern Tree that starts with the protocols |
|
| 208 |
// * dictionary object passed in the first argument. The remaining 2 arguments |
|
| 209 |
// * should be pattern and name that have been earlier passed to |
|
| 210 |
// * pattern_tree_register(). |
|
| 211 |
// */ |
|
| 212 |
// function pattern_tree_deregister(patterns_by_proto, pattern, item_name) |
|
| 213 |
// {
|
|
| 214 |
// const deco = deconstruct_url(pattern, false); |
|
| 215 |
|
|
| 216 |
// const tree = patterns_by_proto[deco.proto] || make_tree_node(); |
|
| 217 |
// patterns_by_proto[deco.proto] = tree; |
|
| 218 |
|
|
| 219 |
// let path_trees; |
|
| 220 |
|
|
| 221 |
// if (deco.proto === "file") {
|
|
| 222 |
// path_trees = [tree]; |
|
| 223 |
// } else {
|
|
| 224 |
// /* We need an aray of domain labels ordered most-significant-first. */ |
|
| 225 |
// const domain = [...deco.domain].reverse(); |
|
| 226 |
// .......................... |
|
| 227 |
// path_trees = add_sequence(tree, domain, make_tree_node); |
|
| 228 |
// } |
|
| 229 |
// } |
|
| 230 |
|
|
| 231 |
/* |
|
| 232 |
* Yield registered items (as [item_name, item] arrays) that match url. For |
|
| 233 |
* cosistent behavior, please don't modify the pattern tree while iterating over |
|
| 234 |
* the results. |
|
| 235 |
*/ |
|
| 236 |
function* pattern_tree_search(patterns_by_proto, url) |
|
| 237 |
{
|
|
| 238 |
const deco = deconstruct_url(url, false); |
|
| 239 |
|
|
| 240 |
tree = patterns_by_proto[deco.proto] || make_tree_node(); |
|
| 241 |
|
|
| 242 |
let path_trees; |
|
| 243 |
|
|
| 244 |
if (deco.proto === "file") {
|
|
| 245 |
path_trees = [tree]; |
|
| 246 |
} else {
|
|
| 247 |
/* We need an aray of domain labels ordered most-significant-first. */ |
|
| 248 |
const domain = [...deco.domain].reverse(); |
|
| 249 |
path_trees = search_sequence(tree, domain); |
|
| 250 |
} |
|
| 251 |
|
|
| 252 |
for (const path_tree of path_trees) {
|
|
| 253 |
for (const match_obj of search_sequence(path_tree, deco.path)) {
|
|
| 254 |
for (const entry of Object.entries(match_obj)) |
|
| 255 |
yield entry; |
|
| 256 |
} |
|
| 257 |
} |
|
| 258 |
} |
|
| 259 |
|
|
| 260 |
const pattern_tree = {
|
|
| 261 |
make: () => {},
|
|
| 262 |
register: pattern_tree_register, |
|
| 263 |
// deregister: pattern_tree_deregister, |
|
| 264 |
search: pattern_tree_search |
|
| 265 |
} |
|
| 266 |
|
|
| 267 |
/* |
|
| 268 |
* EXPORTS_START |
|
| 269 |
* EXPORT pattern_tree |
|
| 270 |
* EXPORTS_END |
|
| 271 |
*/ |
|
| test/unit/conftest.py | ||
|---|---|---|
| 76 | 76 |
return window.haketilo_selenium_return_value; |
| 77 | 77 |
''' |
| 78 | 78 |
|
| 79 |
def _execute_in_page_context(driver, script, *args):
|
|
| 79 |
def _execute_in_page_context(driver, script, args): |
|
| 80 | 80 |
script = script + '\n;\nwindow.haketilo_selenium_exception = false;' |
| 81 | 81 |
try: |
| 82 | 82 |
return driver.execute_script(script_injecting_script, script, args) |
| test/unit/test_patterns.py | ||
|---|---|---|
| 89 | 89 |
match = execute_in_page('returnval(ftp_regex.exec(arguments[0]));',
|
| 90 | 90 |
'@bad.url/') |
| 91 | 91 |
assert match is None |
| 92 |
|
|
| 93 |
def test_deconstruct_url(execute_in_page, patterns_code): |
|
| 94 |
""" |
|
| 95 |
patterns.js contains deconstruct_url() function that handles URL parsing. |
|
| 96 |
Verify it works properly. |
|
| 97 |
""" |
|
| 98 |
execute_in_page(patterns_code, page='https://gotmyowndoma.in') |
|
| 99 |
|
|
| 100 |
deco = execute_in_page('returnval(deconstruct_url(arguments[0]));',
|
|
| 101 |
'https://eXaMpLe.com/a/b?ver=1.2.3#heading2') |
|
| 102 |
assert deco |
|
| 103 |
assert deco['trailing_dash'] == False |
|
| 104 |
assert deco['proto'] == 'https' |
|
| 105 |
assert deco['domain'] == ['example', 'com'] |
|
| 106 |
assert deco['path'] == ['a', 'b'] |
|
| 107 |
|
|
| 108 |
deco = execute_in_page('returnval(deconstruct_url(arguments[0]));',
|
|
| 109 |
'http://**.example.com/') |
|
| 110 |
assert deco |
|
| 111 |
assert deco['trailing_dash'] == True |
|
| 112 |
assert deco['proto'] == 'http' |
|
| 113 |
assert deco['domain'] == ['**', 'example', 'com'] |
|
| 114 |
assert deco['path'] == [] |
|
| 115 |
|
|
| 116 |
deco = execute_in_page('returnval(deconstruct_url(arguments[0]));',
|
|
| 117 |
'ftp://user@ftp.example.com/all///passwords.txt/') |
|
| 118 |
assert deco |
|
| 119 |
assert deco['trailing_dash'] == True |
|
| 120 |
assert deco['proto'] == 'ftp' |
|
| 121 |
assert deco['domain'] == ['ftp', 'example', 'com'] |
|
| 122 |
assert deco['path'] == ['all', 'passwords.txt'] |
|
| 123 |
|
|
| 124 |
deco = execute_in_page('returnval(deconstruct_url(arguments[0]));',
|
|
| 125 |
'ftp://mirror.edu.pl.eu.org') |
|
| 126 |
assert deco |
|
| 127 |
assert deco['trailing_dash'] == False |
|
| 128 |
assert deco['proto'] == 'ftp' |
|
| 129 |
assert deco['domain'] == ['mirror', 'edu', 'pl', 'eu', 'org'] |
|
| 130 |
assert deco['path'] == [] |
|
| 131 |
|
|
| 132 |
deco = execute_in_page('returnval(deconstruct_url(arguments[0]));',
|
|
| 133 |
'file:///mnt/parabola_chroot///etc/passwd') |
|
| 134 |
assert deco |
|
| 135 |
assert deco['trailing_dash'] == False |
|
| 136 |
assert deco['proto'] == 'file' |
|
| 137 |
assert deco['path'] == ['mnt', 'parabola_chroot', 'etc', 'passwd'] |
|
| 138 |
|
|
| 139 |
for bad_url in [ |
|
| 140 |
'://bad-url.missing/protocol', |
|
| 141 |
'http:/example.com/a/b', |
|
| 142 |
'unknown://example.com/a/b', |
|
| 143 |
'idontfancypineapple', |
|
| 144 |
'ftp://@example.org/', |
|
| 145 |
'https:///some/path/', |
|
| 146 |
'file://non-absolute/path' |
|
| 147 |
]: |
|
| 148 |
with pytest.raises(Exception, match=r'Error in injected script'): |
|
| 149 |
deco = execute_in_page('returnval(deconstruct_url(arguments[0]));',
|
|
| 150 |
bad_url) |
|
| 151 |
|
|
| 152 |
# at some point we might also consider testing url deconstruction with |
|
| 153 |
# length limits... |
|
| test/unit/test_patterns_query_tree.py | ||
|---|---|---|
| 1 |
# SPDX-License-Identifier: CC0-1.0 |
|
| 2 |
|
|
| 3 |
""" |
|
| 4 |
Haketilo unit tests - URL patterns |
|
| 5 |
""" |
|
| 6 |
|
|
| 7 |
# This file is part of Haketilo |
|
| 8 |
# |
|
| 9 |
# Copyright (C) 2021, Wojtek Kosior |
|
| 10 |
# |
|
| 11 |
# This program is free software: you can redistribute it and/or modify |
|
| 12 |
# it under the terms of the CC0 1.0 Universal License as published by |
|
| 13 |
# the Creative Commons Corporation. |
|
| 14 |
# |
|
| 15 |
# This program is distributed in the hope that it will be useful, |
|
| 16 |
# but WITHOUT ANY WARRANTY; without even the implied warranty of |
|
| 17 |
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
|
| 18 |
# CC0 1.0 Universal License for more details. |
|
| 19 |
|
|
| 20 |
import pytest |
|
| 21 |
|
|
| 22 |
from ..script_loader import load_script |
|
| 23 |
|
|
| 24 |
@pytest.fixture(scope="session") |
|
| 25 |
def patterns_tree_code(): |
|
| 26 |
yield load_script('common/patterns_query_tree.js', ['common'])
|
|
| 27 |
|
|
| 28 |
def test_modify_branch(execute_in_page, patterns_tree_code): |
|
| 29 |
""" |
|
| 30 |
patterns_query_tree.js contains Patterns Tree data structure that allows |
|
| 31 |
arrays of string labels to be mapped to items. |
|
| 32 |
Verify operations modifying a single branch of such tree work properly. |
|
| 33 |
""" |
|
| 34 |
execute_in_page(patterns_tree_code, page='https://gotmyowndoma.in') |
|
| 35 |
execute_in_page( |
|
| 36 |
''' |
|
| 37 |
let items_added; |
|
| 38 |
let items_removed; |
|
| 39 |
|
|
| 40 |
function _item_adder(item, array) |
|
| 41 |
{
|
|
| 42 |
items_added++; |
|
| 43 |
return [...(array || []), item]; |
|
| 44 |
} |
|
| 45 |
|
|
| 46 |
function item_adder(item) |
|
| 47 |
{
|
|
| 48 |
items_added = 0; |
|
| 49 |
return array => _item_adder(item, array); |
|
| 50 |
} |
|
| 51 |
|
|
| 52 |
function _item_remover(array) |
|
| 53 |
{
|
|
| 54 |
if (array !== null) {
|
|
| 55 |
items_removed++; |
|
| 56 |
array.pop(); |
|
| 57 |
} |
|
| 58 |
return (array && array.length > 0) ? array : null; |
|
| 59 |
} |
|
| 60 |
|
|
| 61 |
function item_remover() |
|
| 62 |
{
|
|
| 63 |
items_removed = 0; |
|
| 64 |
return _item_remover; |
|
| 65 |
}''') |
|
| 66 |
|
|
| 67 |
# Let's construct some tree branch while checking that each addition gives |
|
| 68 |
# the right result. |
|
| 69 |
branch = execute_in_page( |
|
| 70 |
'''{
|
|
| 71 |
const branch = make_tree_node(); |
|
| 72 |
modify_sequence(branch, ['com', 'example'], item_adder('some_item'));
|
|
| 73 |
returnval(branch); |
|
| 74 |
}''') |
|
| 75 |
assert branch == {
|
|
| 76 |
'literal_match': None, |
|
| 77 |
'wildcard_matches': [None, None, None], |
|
| 78 |
'children': {
|
|
| 79 |
'com': {
|
|
| 80 |
'literal_match': None, |
|
| 81 |
'wildcard_matches': [None, None, None], |
|
| 82 |
'children': {
|
|
| 83 |
'example': {
|
|
| 84 |
'literal_match': ['some_item'], |
|
| 85 |
'wildcard_matches': [None, None, None], |
|
| 86 |
'children': {
|
|
| 87 |
} |
|
| 88 |
} |
|
| 89 |
} |
|
| 90 |
} |
|
| 91 |
} |
|
| 92 |
} |
|
| 93 |
|
|
| 94 |
branch, items_added = execute_in_page( |
|
| 95 |
'''{
|
|
| 96 |
const branch = arguments[0]; |
|
| 97 |
modify_sequence(branch, ['com', 'example'], item_adder('other_item'));
|
|
| 98 |
returnval([branch, items_added]); |
|
| 99 |
}''', branch) |
|
| 100 |
assert items_added == 1 |
|
| 101 |
assert branch['children']['com']['children']['example']['literal_match'] \ |
|
| 102 |
== ['some_item', 'other_item'] |
|
| 103 |
|
|
| 104 |
for i in range(3): |
|
| 105 |
for expected_array in [['third_item'], ['third_item', '4th_item']]: |
|
| 106 |
wildcard = '*' * (i + 1) |
|
| 107 |
branch, items_added = execute_in_page( |
|
| 108 |
'''{
|
|
| 109 |
const branch = arguments[0]; |
|
| 110 |
modify_sequence(branch, ['com', 'sample', arguments[1]], |
|
| 111 |
item_adder(arguments[2])); |
|
| 112 |
returnval([branch, items_added]); |
|
| 113 |
}''', |
|
| 114 |
branch, wildcard, expected_array[-1]) |
|
| 115 |
assert items_added == 2 |
|
| 116 |
sample = branch['children']['com']['children']['sample'] |
|
| 117 |
assert sample['wildcard_matches'][i] == expected_array |
|
| 118 |
assert sample['children'][wildcard]['literal_match'] \ |
|
| 119 |
== expected_array |
|
| 120 |
|
|
| 121 |
branch, items_added = execute_in_page( |
|
| 122 |
'''{
|
|
| 123 |
const branch = arguments[0]; |
|
| 124 |
modify_sequence(branch, ['org', 'koszko', '***', '123'], |
|
| 125 |
item_adder('5th_item'));
|
|
| 126 |
returnval([branch, items_added]); |
|
| 127 |
}''', |
|
| 128 |
branch) |
|
| 129 |
assert items_added == 1 |
|
| 130 |
assert branch['children']['org']['children']['koszko']['children']['***']\ |
|
| 131 |
['children']['123']['literal_match'] == ['5th_item'] |
|
| 132 |
|
|
| 133 |
# Let's verify that removing a nonexistent element doesn't modify the tree. |
|
| 134 |
branch2, items_removed = execute_in_page( |
|
| 135 |
'''{
|
|
| 136 |
const branch = arguments[0]; |
|
| 137 |
modify_sequence(branch, ['com', 'not', 'registered', '*'], |
|
| 138 |
item_remover()); |
|
| 139 |
returnval([branch, items_removed]); |
|
| 140 |
}''', |
|
| 141 |
branch) |
|
| 142 |
assert branch == branch2 |
|
| 143 |
assert items_removed == 0 |
|
| 144 |
|
|
| 145 |
# Let's remove all elements in the tree branch while checking that each |
|
| 146 |
# removal gives the right result. |
|
| 147 |
branch, items_removed = execute_in_page( |
|
| 148 |
'''{
|
|
| 149 |
const branch = arguments[0]; |
|
| 150 |
modify_sequence(branch, ['org', 'koszko', '***', '123'], |
|
| 151 |
item_remover()); |
|
| 152 |
returnval([branch, items_removed]); |
|
| 153 |
}''', |
|
| 154 |
branch) |
|
| 155 |
assert items_removed == 1 |
|
| 156 |
assert 'org' not in branch['children'] |
|
| 157 |
|
|
| 158 |
for i in range(3): |
|
| 159 |
for expected_array in [['third_item'], None]: |
|
| 160 |
wildcard = '*' * (i + 1) |
|
| 161 |
branch, items_removed = execute_in_page( |
|
| 162 |
'''{
|
|
| 163 |
const branch = arguments[0]; |
|
| 164 |
modify_sequence(branch, ['com', 'sample', arguments[1]], |
|
| 165 |
item_remover()); |
|
| 166 |
returnval([branch, items_removed]); |
|
| 167 |
}''', |
|
| 168 |
branch, wildcard) |
|
| 169 |
assert items_removed == 2 |
|
| 170 |
if i == 2 and expected_array == []: |
|
| 171 |
break |
|
| 172 |
sample = branch['children']['com']['children'].get('sample', {})
|
|
| 173 |
assert sample.get('wildcard_matches', [None, None, None])[i] \
|
|
| 174 |
== expected_array |
|
| 175 |
assert sample.get('children', {}).get(wildcard, {})\
|
|
| 176 |
.get('literal_match') == expected_array
|
|
| 177 |
|
|
| 178 |
for i in range(2): |
|
| 179 |
branch, items_removed = execute_in_page( |
|
| 180 |
'''{
|
|
| 181 |
const branch = arguments[0]; |
|
| 182 |
modify_sequence(branch, ['com', 'example'], item_remover()); |
|
| 183 |
returnval([branch, items_removed]); |
|
| 184 |
}''', |
|
| 185 |
branch) |
|
| 186 |
assert items_removed == 1 |
|
| 187 |
if i == 0: |
|
| 188 |
assert branch['children']['com']['children']['example']\ |
|
| 189 |
['literal_match'] == ['some_item'] |
|
| 190 |
else: |
|
| 191 |
assert branch == {
|
|
| 192 |
'literal_match': None, |
|
| 193 |
'wildcard_matches': [None, None, None], |
|
| 194 |
'children': {
|
|
| 195 |
} |
|
| 196 |
} |
|
| 197 |
|
|
| 198 |
def test_search_branch(execute_in_page, patterns_tree_code): |
|
| 199 |
""" |
|
| 200 |
patterns_query_tree.js contains Patterns Tree data structure that allows |
|
| 201 |
arrays of string labels to be mapped to items. |
|
| 202 |
Verify searching a single branch of such tree work properly. |
|
| 203 |
""" |
|
| 204 |
execute_in_page(patterns_tree_code, page='https://gotmyowndoma.in') |
|
| 205 |
execute_in_page( |
|
| 206 |
''' |
|
| 207 |
const item_adder = item => (array => [...(array || []), item]); |
|
| 208 |
''') |
|
| 209 |
|
|
| 210 |
# Let's construct some tree branch to test on. |
|
| 211 |
execute_in_page( |
|
| 212 |
''' |
|
| 213 |
var branch = make_tree_node(); |
|
| 214 |
|
|
| 215 |
for (const [item, sequence] of [ |
|
| 216 |
['(root)', []], |
|
| 217 |
['***', ['***']], |
|
| 218 |
['**', ['**']], |
|
| 219 |
['*', ['*']], |
|
| 220 |
|
|
| 221 |
['a', ['a']], |
|
| 222 |
['A', ['a']], |
|
| 223 |
['b', ['b']], |
|
| 224 |
|
|
| 225 |
['a/***', ['a', '***']], |
|
| 226 |
['A/***', ['a', '***']], |
|
| 227 |
['a/**', ['a', '**']], |
|
| 228 |
['A/**', ['a', '**']], |
|
| 229 |
['a/*', ['a', '*']], |
|
| 230 |
['A/*', ['a', '*']], |
|
| 231 |
['a/sth', ['a', 'sth']], |
|
| 232 |
['A/sth', ['a', 'sth']], |
|
| 233 |
|
|
| 234 |
['b/***', ['b', '***']], |
|
| 235 |
['b/**', ['b', '**']], |
|
| 236 |
['b/*', ['b', '*']], |
|
| 237 |
['b/sth', ['b', 'sth']], |
|
| 238 |
]) |
|
| 239 |
modify_sequence(branch, sequence, item_adder(item)); |
|
| 240 |
''') |
|
| 241 |
|
|
| 242 |
# Let's make the actual searches on our testing branch. |
|
| 243 |
for sequence, expected in [ |
|
| 244 |
([], [{'(root)'}, {'***'}]),
|
|
| 245 |
(['a'], [{'a', 'A'}, {'a/***', 'A/***'}, {'*'}, {'***'}]),
|
|
| 246 |
(['b'], [{'b'}, {'b/***'}, {'*'}, {'***'}]),
|
|
| 247 |
(['c'], [ {'*'}, {'***'}]),
|
|
| 248 |
(['***'], [{'***'}, {'*'} ]),
|
|
| 249 |
(['**'], [{'**'}, {'*'}, {'***'}]),
|
|
| 250 |
(['**'], [{'**'}, {'*'}, {'***'}]),
|
|
| 251 |
(['*'], [{'*'}, {'***'}]),
|
|
| 252 |
|
|
| 253 |
(['a', 'sth'], [{'a/sth', 'A/sth'}, {'a/*', 'A/*'}, {'a/***', 'A/***'}, {'**'}, {'***'}]),
|
|
| 254 |
(['b', 'sth'], [{'b/sth'}, {'b/*'}, {'b/***'}, {'**'}, {'***'}]),
|
|
| 255 |
(['a', 'hts'], [ {'a/*', 'A/*'}, {'a/***', 'A/***'}, {'**'}, {'***'}]),
|
|
| 256 |
(['b', 'hts'], [ {'b/*'}, {'b/***'}, {'**'}, {'***'}]),
|
|
| 257 |
(['a', '***'], [{'a/***', 'A/***'}, {'a/*', 'A/*'}, {'**'}, {'***'}]),
|
|
| 258 |
(['b', '***'], [{'b/***'}, {'b/*'}, {'**'}, {'***'}]),
|
|
| 259 |
(['a', '**'], [{'a/**', 'A/**'}, {'a/*', 'A/*'}, {'a/***', 'A/***'}, {'**'}, {'***'}]),
|
|
| 260 |
(['b', '**'], [{'b/**'}, {'b/*'}, {'b/***'}, {'**'}, {'***'}]),
|
|
| 261 |
(['a', '*'], [{'a/*', 'A/*'}, {'a/***', 'A/***'}, {'**'}, {'***'}]),
|
|
| 262 |
(['b', '*'], [{'b/*'}, {'b/***'}, {'**'}, {'***'}]),
|
|
| 263 |
|
|
| 264 |
(['a', 'c', 'd'], [{'a/**', 'A/**'}, {'a/***', 'A/***'}, {'**'}, {'***'}]),
|
|
| 265 |
(['b', 'c', 'd'], [{'b/**'}, {'b/***'}, {'**'}, {'***'}])
|
|
| 266 |
]: |
|
| 267 |
result = execute_in_page( |
|
| 268 |
''' |
|
| 269 |
returnval([...search_sequence(branch, arguments[0])]); |
|
| 270 |
''', |
|
| 271 |
sequence) |
|
| 272 |
|
|
| 273 |
try: |
|
| 274 |
assert len(result) == len(expected) |
|
| 275 |
|
|
| 276 |
for expected_set, result_array in zip(expected, result): |
|
| 277 |
assert len(expected_set) == len(result_array) |
|
| 278 |
assert expected_set == set(result_array) |
|
| 279 |
except Exception as e: |
|
| 280 |
import sys |
|
| 281 |
print('sequence:', sequence, '\nexpected:', expected,
|
|
| 282 |
'\nresult:', result, file=sys.stderr) |
|
| 283 |
raise e from None |
|
Also available in: Unified diff
start implementing more efficient querying of URL patterns