Remove underlying Char from Hyphen type.
[dead/harbl.git] / harbl / src / Network / DNS / RBL / Domain.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 -- | The 'Domain' data type and its parser. A 'Domain' represents a
4 -- name in the domain name system (DNS) as described by
5 -- RFC1035. In particular, we enforce the restrictions from Section
6 -- 2.3.1 \"Preferred name syntax\". See for example,
7 --
8 -- <https://tools.ietf.org/html/rfc1035#section-2.3.1>
9 --
10 -- We basically work with strings and characters everywhere, even
11 -- though this isn't really correct. The length specifications in
12 -- the RFCs are all in terms of octets, so really a ByteString.Char8
13 -- would be more appropriate. With strings, for example, we could
14 -- have a unicode mumbo jumbo character that takes up two bytes
15 -- (octets).
16 --
17 module Network.DNS.RBL.Domain (
18 Domain(..),
19 domain )
20 where
21
22 import Text.Parsec (
23 (<|>),
24 char,
25 optionMaybe,
26 string,
27 try )
28 import Text.Parsec.String ( Parser )
29
30 import Network.DNS.RBL.Domain.LdhStr (
31 LdhStr(..),
32 ldh_str,
33 ldh_str_length )
34 import Network.DNS.RBL.Domain.LetDig ( LetDig, let_dig )
35 import Network.DNS.RBL.Domain.LetDigHyp ( LetDigHyp(..) )
36 import Network.DNS.RBL.Pretty ( Pretty(..) )
37 import Network.DNS.RBL.Reversible ( Reversible(..) )
38
39
40 -- * Letter/Digit/Hyphen string followed by a trailing Letter/Digit
41
42 -- | This type isn't explicitly part of the grammar, but it's what
43 -- shows up in the square brackets of,
44 --
45 -- <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
46 --
47 -- The ldh-str is optional, but if one is present, we must also have
48 -- a trailing let-dig to prevent the name from ending with a
49 -- hyphen. This can be represented with a @Maybe LdhStrLetDig@,
50 -- which is why we're about to define it.
51 --
52 data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
53 deriving (Eq, Show)
54
55 instance Pretty LdhStrLetDig where
56 pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
57 pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
58
59
60
61 -- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
62 -- as well define the parser for it independently since we gave it
63 -- its own data type.
64 --
65 -- ==== _Examples_
66 --
67 -- >>> import Text.Parsec ( parse, parseTest )
68 --
69 -- Make sure we can parse a single character:
70 --
71 -- >>> parseTest ldh_str_let_dig "a"
72 -- LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
73 --
74 -- And longer strings:
75 --
76 -- >>> pretty_print $ parse ldh_str_let_dig "" "ab"
77 -- ab
78 --
79 -- >>> pretty_print $ parse ldh_str_let_dig "" "-b"
80 -- -b
81 --
82 -- >>> parseTest ldh_str_let_dig "b-"
83 -- parse error at (line 1, column 3):
84 -- label cannot end with a hyphen
85 --
86 ldh_str_let_dig :: Parser LdhStrLetDig
87 ldh_str_let_dig = do
88 -- This will happily eat up the trailing let-dig...
89 full_ldh <- ldh_str
90
91 -- So we have to go back and see what happened.
92 case (backwards full_ldh) of
93
94 -- Fail on a single hyphen.
95 (LdhStrSingleLdh (LetDigHypHyphen _)) ->
96 fail "label cannot end with a hyphen"
97
98 -- Fail for a hyphen followed by other stuff.
99 (LdhStrMultipleLdh (LetDigHypHyphen _) _) ->
100 fail "label cannot end with a hyphen"
101
102 -- Simply return the thing if it's a single non-hyphen.
103 (LdhStrSingleLdh (LetDigHypLetDig ld)) -> return $ LdhStrLetDig Nothing ld
104
105 -- And peel off the last character for a non-hyphen followed by
106 -- other stuff. We wind up reversing things twice, but whatever.
107 (LdhStrMultipleLdh (LetDigHypLetDig ld) init_ldh_rev) ->
108 let init_ldh = backwards init_ldh_rev
109 in return $ LdhStrLetDig (Just init_ldh) ld
110
111
112
113 -- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
114 -- the let-dig at the end is always there. And when there's an
115 -- ldh-str too, we add its length to one.
116 --
117 -- ==== _Examples_
118 --
119 -- >>> import Text.Parsec ( parse )
120 --
121 -- >>> let (Right r) = parse ldh_str_let_dig "" "a"
122 -- >>> length_ldh_str_let_dig r
123 -- 1
124 --
125 -- >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
126 -- >>> length_ldh_str_let_dig r
127 -- 7
128 --
129 length_ldh_str_let_dig :: LdhStrLetDig -> Int
130 length_ldh_str_let_dig (LdhStrLetDig Nothing _) = 1
131 length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
132 1 + (ldh_str_length ldhstring)
133
134
135 -- * Labels
136
137 -- | The label type from the RFC1035 grammar:
138 --
139 -- <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
140 --
141 -- We allow the slightly more general syntax from RFC1123, Section 2.1:
142 --
143 -- The syntax of a legal Internet host name was specified in RFC-952
144 -- [DNS:4]. One aspect of host name syntax is hereby changed: the
145 -- restriction on the first character is relaxed to allow either a
146 -- letter or a digit. Host software MUST support this more liberal
147 -- syntax.
148 --
149 data Label = Label LetDig (Maybe LdhStrLetDig)
150 deriving (Eq, Show)
151
152 instance Pretty Label where
153 pretty_show (Label l Nothing) = pretty_show l
154 pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
155
156 -- | Parse a 'Label'.
157 --
158 -- In addition to the grammar, there's another restriction on
159 -- labels: their length must be 63 characters or less. Quoting
160 -- Section 2.3.1, \"Preferred name syntax\", of RFC1035:
161 --
162 -- The labels must follow the rules for ARPANET host names. They
163 -- must start with a letter, end with a letter or digit, and have
164 -- as interior characters only letters, digits, and hyphen. There
165 -- are also some restrictions on the length. Labels must be 63
166 -- characters or less.
167 --
168 -- We check this only after we have successfully parsed a label.
169 --
170 -- ==== _Examples_
171 --
172 -- >>> import Text.Parsec ( parse, parseTest )
173 --
174 -- Make sure we can parse a single character:
175 --
176 -- >>> parseTest label "a"
177 -- Label (LetDigLetter (Letter 'a')) Nothing
178 --
179 -- And longer strings:
180 --
181 -- >>> pretty_print $ parse label "" "abc-def"
182 -- abc-def
183 --
184 -- But not anything ending in a hyphen:
185 --
186 -- >>> parseTest label "abc-"
187 -- parse error at (line 1, column 5):
188 -- label cannot end with a hyphen
189 --
190 -- Or anything over 63 characters:
191 --
192 -- >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
193 -- parse error at (line 1, column 79):
194 -- labels must be 63 or fewer characters
195 --
196 -- However, /exactly/ 63 characters is acceptable:
197 --
198 -- >>> pretty_print $ parse label "" (replicate 63 'x')
199 -- xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
200 --
201 -- Ensure that a label can begin with a digit:
202 --
203 -- >>> pretty_print $ parse label "" "3com"
204 -- 3com
205 --
206 label :: Parser Label
207 label = do
208 l <- let_dig -- Guaranteed to be there
209 maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
210 case maybe_s of
211 -- It can only be one character long, from the letter...
212 Nothing -> return $ Label l maybe_s
213
214 -- The letter gives us one character, so we check that the rest is
215 -- less than 62 characters long. But in the error message we need
216 -- to report 63.
217 Just s -> if (length_ldh_str_let_dig s) <= 62
218 then return $ Label l maybe_s
219 else fail "labels must be 63 or fewer characters"
220
221
222
223 -- * Subdomains
224
225
226 -- | The data type representing a \"subdomain\" from RFC1035:
227 --
228 -- <subdomain> ::= <label> | <subdomain> "." <label>
229 --
230 -- We have reversed the order of the subdomain and label in the
231 -- second option, however. This is explained in 'subdomain'.
232 --
233 data Subdomain =
234 SubdomainSingleLabel Label |
235 SubdomainMultipleLabel Label Subdomain
236 deriving (Eq, Show)
237
238
239
240 instance Pretty Subdomain where
241 pretty_show (SubdomainSingleLabel l) = pretty_show l
242 pretty_show (SubdomainMultipleLabel l s) =
243 (pretty_show l) ++ "." ++ (pretty_show s)
244
245
246 instance Reversible Subdomain where
247 -- | Reverse the labels of the given subdomain.
248 --
249 -- ==== _Examples_
250 --
251 -- >>> import Text.Parsec ( parse )
252 --
253 -- Standard usage:
254 --
255 -- >>> let (Right r) = parse subdomain "" "com"
256 -- >>> pretty_print $ backwards r
257 -- com
258 --
259 -- >>> let (Right r) = parse subdomain "" "example.com"
260 -- >>> pretty_print $ backwards r
261 -- com.example
262 --
263 -- >>> let (Right r) = parse subdomain "" "www.example.com"
264 -- >>> pretty_print $ backwards r
265 -- com.example.www
266 --
267 -- >>> let (Right r) = parse subdomain "" "new.www.example.com"
268 -- >>> pretty_print $ backwards r
269 -- com.example.www.new
270 --
271
272 -- It's easy to reverse a single label...
273 backwards s@(SubdomainSingleLabel _) = s
274
275 -- For multiple labels we have two cases. The first is where we have
276 -- exactly two labels, and we just need to swap them.
277 backwards (SubdomainMultipleLabel l (SubdomainSingleLabel m)) =
278 SubdomainMultipleLabel m (SubdomainSingleLabel l)
279
280 -- And now the hard case. See the 'LdhStr' implementation for an
281 -- explanation.
282 --
283 backwards (SubdomainMultipleLabel l s) = build (SubdomainSingleLabel l) s
284 where
285 -- Build up the first Subdomain on the left by peeling off the
286 -- leading elements of the second Subdomain.
287 build :: Subdomain -> Subdomain -> Subdomain
288 build dst (SubdomainSingleLabel final) = SubdomainMultipleLabel final dst
289 build dst (SubdomainMultipleLabel leading rest) =
290 build (SubdomainMultipleLabel leading dst) rest
291
292
293
294 -- | Parse an RFC1035 \"subdomain\". The given grammar is,
295 --
296 -- <subdomain> ::= <label> | <subdomain> "." <label>
297 --
298 -- However, we have reversed the order of the subdomain and label to
299 -- prevent infinite recursion. The second option (subdomain + label)
300 -- is obviously more specific, we we need to try it first. This
301 -- presents a problem: we're trying to parse a subdomain in terms of
302 -- a subdomain! The given grammar represents subdomains how we like
303 -- to think of them; from right to left. But it's better to parse
304 -- from left to right, so we pick off the leading label and then
305 -- recurse into the definition of subdomain.
306 --
307 -- According to RFC1034, Section 3.1, two neighboring labels in a
308 -- DNS name cannot be equal:
309 --
310 -- Each node has a label, which is zero to 63 octets in length. Brother
311 -- nodes may not have the same label, although the same label can be used
312 -- for nodes which are not brothers. One label is reserved, and that is
313 -- the null (i.e., zero length) label used for the root.
314 --
315 -- We enforce this restriction, but the result is usually that we
316 -- only parse the part of the subdomain leading up to the repeated
317 -- label.
318 --
319 -- ==== _Examples_
320 --
321 -- >>> import Text.Parsec ( parse, parseTest )
322 --
323 -- Make sure we can parse a single character:
324 --
325 -- >>> parseTest subdomain "a"
326 -- SubdomainSingleLabel (Label (LetDigLetter (Letter 'a')) Nothing)
327 --
328 -- >>> pretty_print $ parse subdomain "" "example.com"
329 -- example.com
330 --
331 -- >>> pretty_print $ parse subdomain "" "www.example.com"
332 -- www.example.com
333 --
334 -- We reject a subdomain with equal neighbors, but this leads to
335 -- only the single first label being parsed instead:
336 --
337 -- >>> pretty_print $ parse subdomain "" "www.www.example.com"
338 -- www
339 --
340 -- But not one with a repeated but non-neighboring label:
341 --
342 -- >>> pretty_print $ parse subdomain "" "www.example.www.com"
343 -- www.example.www.com
344 --
345 subdomain :: Parser Subdomain
346 subdomain = try both <|> just_one
347 where
348 both :: Parser Subdomain
349 both = do
350 l <- label
351 _ <- char '.'
352 s <- subdomain
353 let result = SubdomainMultipleLabel l s
354 if (subdomain_has_equal_neighbors result)
355 then fail "subdomain cannot have equal neighboring labels"
356 else return result
357
358 just_one :: Parser Subdomain
359 just_one = fmap SubdomainSingleLabel label
360
361
362
363 -- | Retrieve a list of labels contained in a 'Subdomain'.
364 --
365 -- ==== _Examples_
366 --
367 -- >>> import Text.Parsec ( parse )
368 --
369 -- >>> let (Right r) = parse subdomain "" "a"
370 -- >>> pretty_print $ subdomain_labels r
371 -- ["a"]
372 --
373 -- >>> let (Right r) = parse subdomain "" "example.com"
374 -- >>> pretty_print $ subdomain_labels r
375 -- ["example","com"]
376 --
377 -- >>> let (Right r) = parse subdomain "" "www.example.com"
378 -- >>> pretty_print $ subdomain_labels r
379 -- ["www","example","com"]
380 --
381 subdomain_labels :: Subdomain -> [Label]
382 subdomain_labels (SubdomainSingleLabel l) = [l]
383 subdomain_labels (SubdomainMultipleLabel l s) = l : (subdomain_labels s)
384
385
386 -- | Return a list of pairs of neighboring labels in a subdomain.
387 --
388 -- ==== _Examples_
389 --
390 -- >>> import Text.Parsec ( parse )
391 -- >>> let (Right r) = parse subdomain "" "www.example.com"
392 -- >>> pretty_print $ subdomain_label_neighbors r
393 -- ["(\"www\",\"example\")","(\"example\",\"com\")"]
394 --
395 subdomain_label_neighbors :: Subdomain -> [(Label,Label)]
396 subdomain_label_neighbors s =
397 zip ls (tail ls)
398 where
399 ls = subdomain_labels s
400
401
402 -- | Return @True@ if the subdomain has any two equal neighboring
403 -- labels, and @False@ otherwise.
404 --
405 -- ==== _Examples_
406 --
407 -- >>> import Text.Parsec ( parse )
408 --
409 -- >>> let (Right r) = parse subdomain "" "www.example.com"
410 -- >>> subdomain_has_equal_neighbors r
411 -- False
412 --
413 -- >>> let (Right l) = parse label "" "www"
414 -- >>> let (Right s) = parse subdomain "" "www.example.com"
415 -- >>> let bad_subdomain = SubdomainMultipleLabel l s
416 -- >>> subdomain_has_equal_neighbors bad_subdomain
417 -- True
418 --
419 subdomain_has_equal_neighbors :: Subdomain -> Bool
420 subdomain_has_equal_neighbors s =
421 or [ x == y | (x,y) <- subdomain_label_neighbors s ]
422
423
424
425
426 -- * Domains
427
428 -- | An RFC1035 domain. According to RFC1035 a domain can be either a
429 -- subdomain or \" \", which according to RFC2181
430 -- <https://tools.ietf.org/html/rfc2181#section-11> means the root:
431 --
432 -- The zero length full name is defined as representing the root
433 -- of the DNS tree, and is typically written and displayed as
434 -- \".\".
435 --
436 -- We let the 'Domain' type remain true to those RFCs, even though
437 -- they don't support an absolute domain name of e.g. a single dot.
438 --
439 data Domain =
440 DomainName Subdomain |
441 DomainRoot
442 deriving (Eq, Show)
443
444 instance Pretty Domain where
445 pretty_show DomainRoot = ""
446 pretty_show (DomainName s) = pretty_show s
447
448 -- | Parse an RFC1035 \"domain\"
449 --
450 -- ==== _Examples_
451 --
452 -- >>> import Text.Parsec ( parse, parseTest )
453 --
454 -- Make sure we can parse a single character:
455 --
456 -- >>> pretty_print $ parse domain "" "a"
457 -- a
458 --
459 -- And the empty domain:
460 --
461 -- >>> parseTest domain ""
462 -- DomainRoot
463 --
464 -- We will in fact parse the \"empty\" domain off the front of
465 -- pretty much anything:
466 --
467 -- >>> parseTest domain "!8===D"
468 -- DomainRoot
469 --
470 -- Equality of domains is case-insensitive:
471 --
472 -- >>> let (Right r1) = parse domain "" "example.com"
473 -- >>> let (Right r2) = parse domain "" "ExaMPle.coM"
474 -- >>> r1 == r2
475 -- True
476 --
477 -- A single dot IS parsed as the root, but the dot isn't consumed:
478 --
479 -- >>> parseTest domain "."
480 -- DomainRoot
481 --
482 -- Anything over 255 characters is an error, so the root will be
483 -- parsed:
484 --
485 -- >>> let big_l1 = replicate 63 'x'
486 -- >>> let big_l2 = replicate 63 'y' -- Avoid equal neighboring labels!
487 -- >>> let big_labels = big_l1 ++ "." ++ big_l2 ++ "."
488 -- >>> let big_subdomain = concat $ replicate 3 big_labels
489 -- >>> parseTest domain big_subdomain
490 -- DomainRoot
491 --
492 -- But exactly 255 is allowed:
493 --
494 -- >>> import Data.List ( intercalate )
495 -- >>> let l1 = replicate 63 'w'
496 -- >>> let l2 = replicate 63 'x'
497 -- >>> let l3 = replicate 63 'y'
498 -- >>> let l4 = replicate 63 'z'
499 -- >>> let big_subdomain = intercalate "." [l1,l2,l3,l4]
500 -- >>> let (Right r) = parse domain "" big_subdomain
501 -- >>> length (pretty_show r)
502 -- 255
503 --
504 domain :: Parser Domain
505 domain = try parse_subdomain <|> parse_empty
506 where
507 parse_subdomain :: Parser Domain
508 parse_subdomain = do
509 s <- subdomain
510 if length (pretty_show s) <= 255
511 then return $ DomainName s
512 else fail "subdomains can be at most 255 characters"
513
514 parse_empty :: Parser Domain
515 parse_empty = string "" >> return DomainRoot
516
517
518 instance Reversible Domain where
519 -- | Reverse the labels of a 'Domain'.
520 --
521 -- -- ==== _Examples_
522 --
523 -- >>> import Text.Parsec ( parse )
524 --
525 -- The root reverses to itself:
526 --
527 -- >>> let (Right r) = parse domain "" ""
528 -- >>> backwards r
529 -- DomainRoot
530 --
531 -- >>> let (Right r) = parse domain "" "new.www.example.com"
532 -- >>> pretty_print $ backwards r
533 -- com.example.www.new
534 --
535 backwards DomainRoot = DomainRoot
536 backwards (DomainName s) = DomainName $ backwards s