Revision e1282a63
Added by koszko almost 2 years ago
| common/patterns_query_tree.js | ||
|---|---|---|
| 47 | 47 |
* IMPORTS_END |
| 48 | 48 |
*/ |
| 49 | 49 |
|
| 50 |
/* |
|
| 51 |
* This is a javascript rewrite of Python implementation of the algorithm here: |
|
| 52 |
* https://git.koszko.org/pydrilla/tree/src/pydrilla/pydrilla.py?id=f4edcbe7f4739d6f82a2e1bb180960b003b30862#n205 |
|
| 53 |
*/ |
|
| 54 |
|
|
| 55 | 50 |
/* "Pattern Tree" is how we refer to the data structure used for querying |
| 56 | 51 |
* Haketilo patterns. Those look like 'https://*.example.com/ab/***'. The goal |
| 57 | 52 |
* is to make it possible for given URL to quickly retrieve all known patterns |
| 58 | 53 |
* that match it. |
| 59 | 54 |
*/ |
| 60 |
function make_tree_node() {
|
|
| 55 |
function empty_node() {
|
|
| 61 | 56 |
return {
|
| 62 | 57 |
wildcard_matches: [null, null, null], |
| 63 | 58 |
literal_match: null, |
| ... | ... | |
| 141 | 136 |
for (var current_segment of segments) {
|
| 142 | 137 |
wildcards = tree_node.wildcard_matches; |
| 143 | 138 |
|
| 144 |
const child = tree_node.children[current_segment] || make_tree_node();
|
|
| 139 |
const child = tree_node.children[current_segment] || empty_node();
|
|
| 145 | 140 |
tree_node.children[current_segment] = child; |
| 146 | 141 |
tree_node = child; |
| 147 | 142 |
nodes.push(tree_node); |
| ... | ... | |
| 171 | 166 |
} |
| 172 | 167 |
} |
| 173 | 168 |
|
| 174 |
/* |
|
| 175 |
* Make item queryable through the Pattern Tree that starts with the protocols |
|
| 176 |
* dictionary object passed in the first argument. |
|
| 177 |
*/ |
|
| 178 |
function pattern_tree_register(patterns_by_proto, pattern, item_name, item) |
|
| 169 |
/* Helper function for modify_tree(). */ |
|
| 170 |
function modify_path(tree_node, deco, item_modifier) |
|
| 171 |
{
|
|
| 172 |
tree_node = tree_node || empty_node(); |
|
| 173 |
modify_sequence(tree_node, deco.path, item_modifier); |
|
| 174 |
return is_empty_node(tree_node) ? null : tree_node; |
|
| 175 |
} |
|
| 176 |
|
|
| 177 |
/* Helper function for modify_tree(). */ |
|
| 178 |
function modify_domain(tree_node, deco, item_modifier) |
|
| 179 |
{
|
|
| 180 |
const path_modifier = branch => modify_path(branch, deco, item_modifier); |
|
| 181 |
tree_node = tree_node || empty_node(); |
|
| 182 |
/* We need an array of domain labels ordered most-significant-first. */ |
|
| 183 |
modify_sequence(tree_node, [...deco.domain].reverse(), path_modifier); |
|
| 184 |
return is_empty_node(tree_node) ? null : tree_node; |
|
| 185 |
} |
|
| 186 |
|
|
| 187 |
/* Helper function for pattern_tree_register() and pattern_tree_deregister(). */ |
|
| 188 |
function modify_tree(patterns_by_proto, pattern, item_modifier) |
|
| 179 | 189 |
{
|
| 180 | 190 |
/* |
| 181 | 191 |
* We pass 'false' to disable length limits on URL parts. Length limits are |
| ... | ... | |
| 184 | 194 |
*/ |
| 185 | 195 |
const deco = deconstruct_url(pattern, false); |
| 186 | 196 |
|
| 187 |
const tree = patterns_by_proto[deco.proto] || make_tree_node(); |
|
| 188 |
patterns_by_proto[deco.proto] = tree; |
|
| 197 |
let tree_for_proto = patterns_by_proto[deco.proto]; |
|
| 189 | 198 |
|
| 190 |
let path_trees; |
|
| 199 |
tree_for_proto = deco.domain === undefined ? |
|
| 200 |
modify_path(tree_for_proto, deco, item_modifier) : |
|
| 201 |
modify_domain(tree_for_proto, deco, item_modifier); |
|
| 191 | 202 |
|
| 192 |
if (deco.proto === "file") {
|
|
| 193 |
path_trees = [tree]; |
|
| 194 |
} else {
|
|
| 195 |
/* We need an aray of domain labels ordered most-significant-first. */ |
|
| 196 |
const domain = [...deco.domain].reverse(); |
|
| 197 |
path_trees = add_sequence(tree, domain, make_tree_node); |
|
| 198 |
} |
|
| 203 |
patterns_by_proto[deco.proto] = tree_for_proto; |
|
| 204 |
if (tree_for_proto === null) |
|
| 205 |
delete patterns_by_proto[deco.proto]; |
|
| 206 |
} |
|
| 199 | 207 |
|
| 200 |
for (const path_tree of path_trees) {
|
|
| 201 |
for (const match_obj of add_sequence(path_tree, deco.path, () => {}))
|
|
| 202 |
match_obj[item_name] = item; |
|
| 203 |
} |
|
| 208 |
/* |
|
| 209 |
* Make item queryable through the Pattern Tree that starts with the protocols |
|
| 210 |
* dictionary object passed in the first argument. |
|
| 211 |
*/ |
|
| 212 |
function pattern_tree_register(patterns_by_proto, pattern, item_name, item) |
|
| 213 |
{
|
|
| 214 |
const key_prefix = pattern[pattern.length - 1] === '/' ? '/' : '_'; |
|
| 215 |
item_name = key_prefix + item_name; |
|
| 216 |
const add_item = obj => Object.assign(obj || {}, {[item_name]: item});
|
|
| 217 |
modify_tree(patterns_by_proto, pattern, add_item); |
|
| 218 |
} |
|
| 219 |
|
|
| 220 |
/* Helper function for pattern_tree_deregister(). */ |
|
| 221 |
function _remove_item(obj, item_name) |
|
| 222 |
{
|
|
| 223 |
obj = obj || {};
|
|
| 224 |
delete obj[item_name]; |
|
| 225 |
for (const key in obj) |
|
| 226 |
return obj; |
|
| 227 |
return null; |
|
| 204 | 228 |
} |
| 205 | 229 |
|
| 206 |
// /* |
|
| 207 |
// * Remove registered item from the Pattern Tree that starts with the protocols |
|
| 208 |
// * dictionary object passed in the first argument. The remaining 2 arguments |
|
| 209 |
// * should be pattern and name that have been earlier passed to |
|
| 210 |
// * pattern_tree_register(). |
|
| 211 |
// */ |
|
| 212 |
// function pattern_tree_deregister(patterns_by_proto, pattern, item_name) |
|
| 213 |
// {
|
|
| 214 |
// const deco = deconstruct_url(pattern, false); |
|
| 215 |
|
|
| 216 |
// const tree = patterns_by_proto[deco.proto] || make_tree_node(); |
|
| 217 |
// patterns_by_proto[deco.proto] = tree; |
|
| 218 |
|
|
| 219 |
// let path_trees; |
|
| 220 |
|
|
| 221 |
// if (deco.proto === "file") {
|
|
| 222 |
// path_trees = [tree]; |
|
| 223 |
// } else {
|
|
| 224 |
// /* We need an aray of domain labels ordered most-significant-first. */ |
|
| 225 |
// const domain = [...deco.domain].reverse(); |
|
| 226 |
// .......................... |
|
| 227 |
// path_trees = add_sequence(tree, domain, make_tree_node); |
|
| 228 |
// } |
|
| 229 |
// } |
|
| 230 |
/* |
|
| 231 |
* Remove registered item from the Pattern Tree that starts with the protocols |
|
| 232 |
* dictionary object passed in the first argument. The remaining 2 arguments |
|
| 233 |
* should be pattern and name that have been earlier passed to |
|
| 234 |
* pattern_tree_register(). |
|
| 235 |
*/ |
|
| 236 |
function pattern_tree_deregister(patterns_by_proto, pattern, item_name) |
|
| 237 |
{
|
|
| 238 |
const key_prefix = pattern[pattern.length - 1] === '/' ? '/' : '_'; |
|
| 239 |
item_name = key_prefix + item_name; |
|
| 240 |
const remove_item = obj => _remove_item(obj, item_name); |
|
| 241 |
modify_tree(patterns_by_proto, pattern, remove_item); |
|
| 242 |
} |
|
| 230 | 243 |
|
| 231 | 244 |
/* |
| 232 |
* Yield registered items (as [item_name, item] arrays) that match url. For |
|
| 233 |
* cosistent behavior, please don't modify the pattern tree while iterating over |
|
| 234 |
* the results. |
|
| 245 |
* Yield registered items that match url. Each yielded value is an object with |
|
| 246 |
* key being matched item names and values being the items. One such object |
|
| 247 |
* shall contain all items matched with given pattern specificity. Objects are |
|
| 248 |
* yielded in order from greatest to lowest pattern specificity. |
|
| 235 | 249 |
*/ |
| 236 | 250 |
function* pattern_tree_search(patterns_by_proto, url) |
| 237 | 251 |
{
|
| 238 | 252 |
const deco = deconstruct_url(url, false); |
| 239 | 253 |
|
| 240 |
tree = patterns_by_proto[deco.proto] || make_tree_node(); |
|
| 254 |
const tree_for_proto = patterns_by_proto[deco.proto] || empty_node(); |
|
| 255 |
let by_path = [tree_for_proto]; |
|
| 241 | 256 |
|
| 242 |
let path_trees; |
|
| 243 |
|
|
| 244 |
if (deco.proto === "file") {
|
|
| 245 |
path_trees = [tree]; |
|
| 246 |
} else {
|
|
| 247 |
/* We need an aray of domain labels ordered most-significant-first. */ |
|
| 248 |
const domain = [...deco.domain].reverse(); |
|
| 249 |
path_trees = search_sequence(tree, domain); |
|
| 250 |
} |
|
| 257 |
/* We need an array of domain labels ordered most-significant-first. */ |
|
| 258 |
if (deco.domain !== undefined) |
|
| 259 |
by_path = search_sequence(tree_for_proto, [...deco.domain].reverse()); |
|
| 251 | 260 |
|
| 252 |
for (const path_tree of path_trees) {
|
|
| 261 |
for (const path_tree of by_path) {
|
|
| 253 | 262 |
for (const match_obj of search_sequence(path_tree, deco.path)) {
|
| 254 |
for (const entry of Object.entries(match_obj)) |
|
| 255 |
yield entry; |
|
| 263 |
let result_obj_slash = null; |
|
| 264 |
let result_obj_no_slash = null; |
|
| 265 |
|
|
| 266 |
for (const [key, item] of Object.entries(match_obj)) {
|
|
| 267 |
if (deco.trailing_slash && key[0] === '/') {
|
|
| 268 |
result_obj_slash = result_obj_slash || {};
|
|
| 269 |
result_obj_slash[key.substring(1)] = item; |
|
| 270 |
} else if (key[0] !== '/') {
|
|
| 271 |
result_obj_no_slash = result_obj_no_slash || {};
|
|
| 272 |
result_obj_no_slash[key.substring(1)] = item; |
|
| 273 |
} |
|
| 274 |
} |
|
| 275 |
|
|
| 276 |
if (deco.trailing_slash && result_obj_slash) |
|
| 277 |
yield result_obj_slash; |
|
| 278 |
|
|
| 279 |
if (result_obj_no_slash) |
|
| 280 |
yield result_obj_no_slash; |
|
| 256 | 281 |
} |
| 257 | 282 |
} |
| 258 | 283 |
} |
| 259 | 284 |
|
| 260 | 285 |
const pattern_tree = {
|
| 261 |
make: () => {},
|
|
| 262 |
register: pattern_tree_register, |
|
| 263 |
// deregister: pattern_tree_deregister,
|
|
| 264 |
search: pattern_tree_search |
|
| 286 |
make: () => ({}),
|
|
| 287 |
register: pattern_tree_register,
|
|
| 288 |
deregister: pattern_tree_deregister, |
|
| 289 |
search: pattern_tree_search
|
|
| 265 | 290 |
} |
| 266 | 291 |
|
| 267 | 292 |
/* |
Also available in: Unified diff
finish implementing more efficient querying of URL patterns
The algorithm is implemented and tested. However, it is yet to be hooked into the actual extension.