Project

General

Profile

Download (3.68 KB) Statistics
| Branch: | Tag: | Revision:

haketilo / common / patterns.js @ 72cbfa74

1
/**
 * Hachette operations on page url patterns
 *
 * Copyright (C) 2021 Wojtek Kosior
 * Redistribution terms are gathered in the `copyright' file.
 */
7

    
8
/*
 * Limits applied by deconstruct_url() to URLs that have a domain part.
 * "LEN" limits count segments/labels, "CHARS" limits count characters.
 */

/* Maximum number of path segments kept; further segments are dropped. */
const MAX_URL_PATH_LEN = 12;
/* Path strings longer than this many characters get shortened. */
const MAX_URL_PATH_CHARS = 255;
/* Maximum number of domain labels kept; leading labels are dropped. */
const MAX_DOMAIN_LEN = 7;
/* Domain strings longer than this many characters get shortened. */
const MAX_DOMAIN_CHARS = 100;
12

    
13
/* Splits "proto://rest" into the protocol name (1) and everything else (2). */
const proto_regex = /^(\w+):\/\/(.*)$/;

/*
 * Regex source fragments for the parts that can follow "proto://". They are
 * strings (not RegExp objects) so that they can be combined below.
 */
const user_re = "[^/?#@]+@";   /* "user@" prefix, including the "@" */
const domain_re = "[^/?#]+";   /* at least one character, up to "/", "?", "#" */
const path_re = "[^?#]*";      /* possibly empty, up to "?" or "#" */
const query_re = "\\??[^#]*";  /* optional "?" and anything up to "#" */

/* Groups: domain (1), path (2), query (3). The rest of the input is ignored. */
const http_regex = new RegExp(`^(${domain_re})(${path_re})(${query_re}).*`);

/* Group: path (1). Every part of this regex may match the empty string. */
const file_regex = new RegExp(`^(${path_re}).*`);

/* Groups: optional "user@" (1), domain (2), path (3). */
const ftp_regex = new RegExp(`^(${user_re})?(${domain_re})(${path_re}).*`);
25

    
26
/*
 * Split url into the pieces used for pattern generation.
 *
 * Returns undefined when url lacks a "proto://" prefix or when the part after
 * the prefix does not match the regex used for its protocol. Otherwise returns
 * an object with:
 *   proto            - protocol name, without "://"
 *   domain           - array of domain labels (not set for "file" URLs)
 *   path             - array of non-empty path segments, preceded by "" when
 *                      the path started with "/" or has no segments at all
 *   query            - query part (only set for non-file, non-ftp URLs)
 *   trailing_dash    - true when the path string ended with "/"
 *   domain_truncated - set to true when part of the domain was dropped
 *   path_truncated   - set to true when part of the path was dropped
 */
function deconstruct_url(url)
{
    const proto_match = proto_regex.exec(url);
    if (proto_match === null)
        return undefined;

    const deco = {proto: proto_match[1]};

    if (deco.proto === "file") {
        /* Every part of file_regex may match "", so exec() does not fail. */
        deco.path = file_regex.exec(proto_match[2])[1];
    } else if (deco.proto === "ftp") {
        /*
         * ftp_regex requires a non-empty domain, so input like "ftp:///x"
         * yields null. Report it as a bad URL, as the branch below does,
         * instead of throwing on null.
         */
        const ftp_match = ftp_regex.exec(proto_match[2]);
        if (!ftp_match)
            return undefined;
        /* Group 1 is the optional "user@" part, which is not kept. */
        [deco.domain, deco.path] = ftp_match.slice(2, 4);
    } else {
        const http_match = http_regex.exec(proto_match[2]);
        if (!http_match)
            return undefined;
        [deco.domain, deco.path, deco.query] = http_match.slice(1, 4);
    }

    /* Both flags describe the path string as matched, before any cutting. */
    const leading_dash = deco.path[0] === "/";
    deco.trailing_dash = deco.path[deco.path.length - 1] === "/";

    if (deco.domain) {
        if (deco.domain.length > MAX_DOMAIN_CHARS) {
            /*
             * Keep what follows the first "." found at or after the point
             * where at most MAX_DOMAIN_CHARS characters remain. Without
             * such a dot no part of the domain is kept.
             */
            const idx = deco.domain.indexOf(".", deco.domain.length -
                                            MAX_DOMAIN_CHARS);
            if (idx === -1)
                deco.domain = [];
            else
                deco.domain = deco.domain.substring(idx + 1);

            deco.domain_truncated = true;
        }

        if (deco.path.length > MAX_URL_PATH_CHARS) {
            /*
             * NOTE(review): this drops only what follows the last "/", so
             * the result may still exceed MAX_URL_PATH_CHARS - confirm
             * whether that is intended.
             */
            deco.path = deco.path.substring(0, deco.path.lastIndexOf("/"));
            deco.path_truncated = true;
        }
    }

    /* The domain is already an (empty) array if it got discarded above. */
    if (typeof deco.domain === "string") {
        deco.domain = deco.domain.split(".");
        /* Drop leading labels so that at most MAX_DOMAIN_LEN remain. */
        if (deco.domain.splice(0, deco.domain.length - MAX_DOMAIN_LEN).length
            > 0)
            deco.domain_truncated = true;
    }

    deco.path = deco.path.split("/").filter(s => s !== "");
    /* The segment limit is only applied to URLs that have a domain. */
    if (deco.domain && deco.path.splice(MAX_URL_PATH_LEN).length > 0)
        deco.path_truncated = true;
    /* The leading "" makes join("/") produce a path starting with "/". */
    if (leading_dash || deco.path.length === 0)
        deco.path.unshift("");

    return deco;
}
81

    
82
/*
 * Generate domain patterns for deco.domain (an array of labels), starting with
 * the full domain and dropping one leading label per step. The last label
 * alone is never used as a pattern base, so a single-label domain yields
 * nothing.
 *
 * For each suffix this yields, in order:
 *   - the suffix itself, if no label was dropped and the domain was not
 *     truncated;
 *   - "*." + suffix, if exactly one label was dropped and the domain was not
 *     truncated;
 *   - "**." + suffix, if two or more labels were dropped;
 *   - "***." + suffix, always.
 */
function* each_domain_pattern(deco)
{
    for (let slice = 0; slice < deco.domain.length - 1; slice++) {
        const domain_part = deco.domain.slice(slice).join(".");
        /*
         * The exact and "*." forms depend on how many labels precede the
         * suffix, which is unknown once the domain has been truncated.
         */
        if (slice === 0 && !deco.domain_truncated)
            yield domain_part;
        if (slice === 1 && !deco.domain_truncated)
            yield "*." + domain_part;
        if (slice > 1)
            yield "**." + domain_part;
        yield "***." + domain_part;
    }
}
96

    
97
/*
 * Generate path patterns for deco.path (an array of segments), starting with
 * the full path and dropping one trailing segment per step.
 *
 * For each prefix this yields, in order:
 *   - the prefix with "/" appended (only if deco.trailing_dash) and then the
 *     prefix itself, if no segment was dropped and the path was not truncated;
 *   - prefix + "/*", if exactly one segment was dropped, the path was not
 *     truncated and the dropped segment is not literally "*";
 *   - prefix + "/**", if two or more segments were dropped;
 *   - prefix + "/***", unless exactly one segment was dropped, the path was
 *     not truncated and the dropped segment is literally "***".
 */
function* each_path_pattern(deco)
{
    for (let slice = deco.path.length; slice > 0; slice--) {
        const path_part = deco.path.slice(0, slice).join("/");
        if (slice === deco.path.length && !deco.path_truncated) {
            if (deco.trailing_dash)
                yield path_part + "/";
            yield path_part;
        }
        /* deco.path[slice] is the single segment dropped at this step. */
        if (slice === deco.path.length - 1 && !deco.path_truncated &&
            deco.path[slice] !== "*")
            yield path_part + "/*";
        if (slice < deco.path.length - 1)
            yield path_part + "/**";
        if (slice !== deco.path.length - 1 || deco.path_truncated ||
            deco.path[slice] !== "***")
            yield path_part + "/***";
    }
}
117

    
118
/*
 * Generate every possible pattern that matches url.
 *
 * Patterns have the form "proto://" + domain pattern + path pattern. All path
 * patterns are generated for the first domain pattern before moving on to the
 * next one. URLs without a domain part get an empty domain pattern.
 *
 * If url cannot be deconstructed, an error is logged and nothing is yielded.
 */
function* each_url_pattern(url)
{
    const deco = deconstruct_url(url);

    if (deco === undefined) {
        console.error("bad url format", url);
        return false;
    }

    const all_domains = deco.domain ? each_domain_pattern(deco) : [""];
    for (const domain of all_domains) {
        for (const path of each_path_pattern(deco))
            yield `${deco.proto}://${domain}${path}`;
    }
}
134

    
135
/*
 * EXPORTS_START
 * EXPORT each_url_pattern
 * EXPORTS_END
 */
(8-8/16)