]> gitweb.michael.orlitzky.com - dead/harbl.git/blob - src/Domain.hs
3ef6f26597a89c730dbb5a36fe14e8e1c5147af4
[dead/harbl.git] / src / Domain.hs
1 -- | The 'Domain' data type and its parser. A 'Domain' represents a
2 -- name in the domain name system (DNS) as described by
3 -- RFC1035. In particular, we enforce the restrictions from Section
4 -- 2.3.1 \"Preferred name syntax\". See for example,
5 --
6 -- <https://tools.ietf.org/html/rfc1035#section-2.3.1>
7 --
8 module Domain
9 where
10
11 import Data.Char ( toLower )
12 import Text.Parsec (
13 ParseError,
14 (<|>),
15 alphaNum,
16 char,
17 eof,
18 many1,
19 option,
20 optionMaybe,
21 parse,
22 string,
23 try,
24 unexpected )
25 import qualified Text.Parsec as Parsec ( digit, letter)
26 import Text.Parsec.String ( Parser )
27
28 import Pretty ( Pretty(..) )
29
30 -- * Digits
31
32 -- | A wrapper around a digit character.
33 --
34 newtype Digit = Digit Char deriving (Eq, Show)
35 instance Pretty Digit where pretty_show (Digit d) = [d]
36
37 -- | Parse a single digit, but wrap it in our 'Digit' type.
38 --
39 digit :: Parser Digit
40 digit = fmap Digit Parsec.digit
41
42
43 -- * Letters
44
45 -- | A wrapper around a letter character.
46 --
47 newtype Letter = Letter Char deriving (Show)
48 instance Pretty Letter where pretty_show (Letter l) = [l]
49
50
51 -- | Parse a single letter, but wrap it in our 'Letter' type.
52 --
53 letter :: Parser Letter
54 letter = fmap Letter Parsec.letter
55
56 -- | The derived instance of 'Eq' for letters is incorrect. All
57 -- comparisons should be made case-insensitively. The following
58 -- is an excerpt from RFC1035:
59 --
60 -- 2.3.3. Character Case
61 --
62 -- For all parts of the DNS that are part of the official
63 -- protocol, all comparisons between character strings (e.g.,
64 -- labels, domain names, etc.) are done in a case-insensitive
65 -- manner...
66 --
67 -- Since each part of DNS name is composed of our custom types, it
68 -- suffices to munge the equality for 'Letter'.
69 --
70 instance Eq Letter where
71 (Letter l1) == (Letter l2) = (toLower l1) == (toLower l2)
72
73 -- * Letters/Digits
74
75 -- | A sum type representing either a letter or a digit.
76 --
77 data LetDig =
78 LetDigLetter Letter |
79 LetDigDigit Digit
80 deriving (Eq, Show)
81
82 instance Pretty LetDig where
83 pretty_show (LetDigLetter l) = pretty_show l
84 pretty_show (LetDigDigit d) = pretty_show d
85
86 -- | Parse a letter or a digit and wrap it in our 'LetDig' type.
87 --
88 let_dig :: Parser LetDig
89 let_dig = (fmap LetDigLetter letter) <|> (fmap LetDigDigit digit)
90
91
92 -- * Hyphens
93
94 -- | A wrapper around a single hyphen character.
95 --
96 newtype Hyphen = Hyphen Char deriving (Eq, Show)
97 instance Pretty Hyphen where pretty_show (Hyphen h) = [h]
98
99 -- | Parse a single hyphen and wrap it in our 'Hyphen' type.
100 --
101 hyphen :: Parser Hyphen
102 hyphen = fmap Hyphen (char '-')
103
104
105 -- * Letter, Digit, or Hyphen.
106
107 -- | A sum type representing a letter, digit, or hyphen.
108 --
109 data LetDigHyp =
110 LetDigHypLetDig LetDig |
111 LetDigHypHyphen Hyphen
112 deriving (Eq, Show)
113
114 instance Pretty LetDigHyp where
115 pretty_show (LetDigHypLetDig ld) = pretty_show ld
116 pretty_show (LetDigHypHyphen h) = pretty_show h
117
118
119 -- | The following is the simplest type in the domain grammar that
120 -- isn't already implemented for us.
121 --
122 -- <let-dig> ::= <letter> | <digit>
123 --
124 -- ==== _Examples_
125 --
126 -- >>> import Text.Parsec ( parseTest )
127 --
128 -- Letters, digits, and hyphens are all parsed:
129 --
130 -- >>> parseTest let_dig_hyp "a"
131 -- LetDigHypLetDig (LetDigLetter (Letter 'a'))
132 --
133 -- >>> parseTest let_dig_hyp "7"
134 -- LetDigHypLetDig (LetDigDigit (Digit '7'))
135 --
136 -- >>> parseTest let_dig_hyp "-"
137 -- LetDigHypHyphen (Hyphen '-')
138 --
139 -- However, an underscore (for example) is not:
140 --
141 -- >>> parseTest let_dig_hyp "_"
142 -- parse error at (line 1, column 1):
143 -- unexpected "_"
144 -- expecting letter, digit or "-"
145 --
146 let_dig_hyp :: Parser LetDigHyp
147 let_dig_hyp =
148 parse_letdig <|> parse_hyphen
149 where
150 parse_letdig :: Parser LetDigHyp
151 parse_letdig = fmap LetDigHypLetDig let_dig
152
153 parse_hyphen :: Parser LetDigHyp
154 parse_hyphen = fmap LetDigHypHyphen hyphen
155
156
157 -- * Letter/Digit/Hyphen strings
158
159 -- | A string of letters, digits, and hyphens from the RFC1035 grammar:
160 --
161 -- <ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str>
162 --
163 -- These are represented as either a single instance of a
164 -- 'LetDigHyp', or a string of them (recursive).
165 --
166 data LdhStr =
167 LdhStrSingleLdh LetDigHyp |
168 LdhStrMultipleLdh LetDigHyp LdhStr
169 deriving (Eq, Show)
170
171 instance Pretty LdhStr where
172 pretty_show (LdhStrSingleLdh ldh) = pretty_show ldh
173 pretty_show (LdhStrMultipleLdh ldh s) = (pretty_show ldh) ++ (pretty_show s)
174
175 -- | Parse a string of letters, digits, and hyphens (an 'LdhStr').
176 --
177 -- ==== _Examples_
178 --
179 -- >>> import Text.Parsec ( parseTest )
180 --
181 -- Single letters, digits, and hyphens are parsed:
182 --
183 -- >>> parseTest ldh_str "a"
184 -- LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a')))
185 --
186 -- >>> parseTest ldh_str "0"
187 -- LdhStrSingleLdh (LetDigHypLetDig (LetDigDigit (Digit '0')))
188 --
189 -- >>> parseTest ldh_str "-"
190 -- LdhStrSingleLdh (LetDigHypHyphen (Hyphen '-'))
191 --
192 -- As well as strings of them:
193 --
194 -- >>> pretty_print $ parse ldh_str "" "a0-b"
195 -- a0-b
196 --
197 ldh_str :: Parser LdhStr
198 ldh_str = try both <|> just_one
199 where
200 both :: Parser LdhStr
201 both = do
202 ldh1 <- let_dig_hyp
203 ldh_tail <- ldh_str
204 return $ LdhStrMultipleLdh ldh1 ldh_tail
205
206 just_one :: Parser LdhStr
207 just_one = fmap LdhStrSingleLdh let_dig_hyp
208
209
210
211 -- | A version of 'last' that works on a 'LdhStr' rather than a
212 -- list. That is, it returns the last 'LetDigHyp' in the
213 -- string. Since 'LdhStr' contains at least one character, there's
214 -- no \"nil\" case here.
215 --
216 -- ==== _Examples_
217 --
218 -- >>> let (Right r) = parse ldh_str "" "a"
219 -- >>> last_ldh_str r
220 -- LetDigHypLetDig (LetDigLetter (Letter 'a'))
221 --
222 -- >>> let (Right r) = parse ldh_str "" "abc-def"
223 -- >>> last_ldh_str r
224 -- LetDigHypLetDig (LetDigLetter (Letter 'f'))
225 --
226 last_ldh_str :: LdhStr -> LetDigHyp
227 last_ldh_str (LdhStrSingleLdh x) = x
228 last_ldh_str (LdhStrMultipleLdh _ x) = last_ldh_str x
229
230
231 -- | A version of 'init' that works on a 'LdhStr' rather than a
232 -- list. That is, it returns everything /except/ the last character in
233 -- the string.
234 --
235 -- Since an 'LdhStr' must contain at least one character, this might
236 -- not be opssible (when the input is of length one). So, we return
237 -- a 'Maybe' value.
238 --
239 -- ==== _Examples_
240 --
241 -- >>> let (Right r) = parse ldh_str "" "a"
242 -- >>> init_ldh_str r
243 -- Nothing
244 --
245 -- >>> let (Right r) = parse ldh_str "" "ab"
246 -- >>> init_ldh_str r
247 -- Just (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))))
248 --
249 -- >>> let (Right r) = parse ldh_str "" "abc-def"
250 -- >>> init_ldh_str r
251 -- Just (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'b'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'c'))) (LdhStrMultipleLdh (LetDigHypHyphen (Hyphen '-')) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'd'))) (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'e')))))))))
252 --
253 init_ldh_str :: LdhStr -> Maybe LdhStr
254 init_ldh_str (LdhStrSingleLdh _) = Nothing
255 init_ldh_str (LdhStrMultipleLdh h t) =
256 Just $ case (init_ldh_str t) of
257 -- We just got the second-to-last character, we're done.
258 Nothing -> LdhStrSingleLdh h
259
260 -- There's still more stuff. Recurse.
261 Just rest -> LdhStrMultipleLdh h rest
262
263
264 -- | Compute the length of an 'LdhStr'. It will be at least one, since
265 -- 'LdhStr's are non-empty. And if there's something other than the
266 -- first character present, we simply recurse.
267 --
268 -- ==== _Examples_
269 --
270 -- >>> let (Right r) = parse ldh_str "" "a"
271 -- >>> length_ldh_str r
272 -- 1
273 --
274 -- >>> let (Right r) = parse ldh_str "" "abc-def"
275 -- >>> length_ldh_str r
276 -- 7
277 --
278 length_ldh_str :: LdhStr -> Int
279 length_ldh_str (LdhStrSingleLdh _) = 1
280 length_ldh_str (LdhStrMultipleLdh _ t) = 1 + (length_ldh_str t)
281
282 -- * Letter/Digit/Hyphen string followed by a trailing Letter/Digit
283
284 -- | This type isn't explicitly part of the grammar, but it's what
285 -- shows up in the square brackets of,
286 --
287 -- <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
288 --
289 -- The ldh-str is optional, but if one is present, we must also have
290 -- a trailing let-dig to prevent the name from ending with a
291 -- hyphen. This can be represented with a @Maybe LdhStrLetDig@,
292 -- which is why we're about to define it.
293 --
294 data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
295 deriving (Eq, Show)
296
297 instance Pretty LdhStrLetDig where
298 pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
299 pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
300
301 -- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
302 -- as well define the parser for it independently since we gave it
303 -- its own data type.
304 --
305 -- ==== _Examples_
306 --
307 -- >>> import Text.Parsec ( parseTest )
308 --
309 -- Make sure we can parse a single character:
310 --
311 -- >>> parseTest ldh_str_let_dig "a"
312 -- LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
313 --
314 -- And longer strings:
315 --
316 -- >>> pretty_print $ parse ldh_str_let_dig "" "ab"
317 -- ab
318 --
319 -- >>> pretty_print $ parse ldh_str_let_dig "" "-b"
320 -- -b
321 --
322 -- >>> parseTest ldh_str_let_dig "b-"
323 -- parse error at (line 1, column 3):
324 -- label cannot end with a hyphen
325 --
326 ldh_str_let_dig :: Parser LdhStrLetDig
327 ldh_str_let_dig = do
328 -- This will happily eat up the trailing let-dig...
329 full_ldh <- ldh_str
330
331 -- So we have to go back and see what happened.
332 case (last_ldh_str full_ldh) of
333 (LetDigHypHyphen _) -> fail "label cannot end with a hyphen"
334 (LetDigHypLetDig ld) ->
335 -- Ok, the label didn't end with a hyphen; now we need to split
336 -- off the last letter/digit so we can pack it into our return
337 -- type separately.
338 return $ case (init_ldh_str full_ldh) of
339 -- We only parsed one letter/digit. This can happen
340 -- if the label contains two characters. For example,
341 -- if we try to parse the label "ab", then the "a"
342 -- will be eaten by the label parser, and this
343 -- function will be left with only "b".
344 Nothing -> LdhStrLetDig Nothing ld
345
346 -- Usual case: there's was some leading let-dig-hyp junk,
347 -- return it too.
348 leading_ldhs -> LdhStrLetDig leading_ldhs ld
349
350
351
352 -- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
353 -- the let-dig at the end is always there. And when there's an
354 -- ldh-str too, we add its length to one.
355 --
356 -- ==== _Examples_
357 --
358 -- >>> let (Right r) = parse ldh_str_let_dig "" "a"
359 -- >>> length_ldh_str_let_dig r
360 -- 1
361 --
362 -- >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
363 -- >>> length_ldh_str_let_dig r
364 -- 7
365 --
366 length_ldh_str_let_dig :: LdhStrLetDig -> Int
367 length_ldh_str_let_dig (LdhStrLetDig Nothing _) = 1
368 length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
369 1 + (length_ldh_str ldhstring)
370
371
372 -- * Labels
373
374 -- | The label type from the RFC1035 grammar:
375 --
376 -- <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
377 --
378 data Label = Label Letter (Maybe LdhStrLetDig)
379 deriving (Eq, Show)
380
381 instance Pretty Label where
382 pretty_show (Label l Nothing) = pretty_show l
383 pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
384
385 -- | Parse a 'Label'.
386 --
387 -- In addition to the grammar, there's another restriction on
388 -- labels: their length must be 63 characters or less. Quoting
389 -- Section 2.3.1, \"Preferred name syntax\", of RFC1035:
390 --
391 -- The labels must follow the rules for ARPANET host names. They
392 -- must start with a letter, end with a letter or digit, and have
393 -- as interior characters only letters, digits, and hyphen. There
394 -- are also some restrictions on the length. Labels must be 63
395 -- characters or less.
396 --
397 -- We check this only after we have successfully parsed a label.
398 --
399 -- ==== _Examples_
400 --
401 -- >>> import Text.Parsec ( parseTest )
402 --
403 -- Make sure we can parse a single character:
404 --
405 -- >>> parseTest label "a"
406 -- Label (Letter 'a') Nothing
407 --
408 -- And longer strings:
409 --
410 -- >>> pretty_print $ parse label "" "abc-def"
411 -- abc-def
412 --
413 -- But not anything ending in a hyphen:
414 --
415 -- >>> parseTest label "abc-"
416 -- parse error at (line 1, column 5):
417 -- label cannot end with a hyphen
418 --
419 -- Or anything over 63 characters:
420 --
421 -- >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
422 -- parse error at (line 1, column 79):
423 -- labels must be 63 or fewer characters
424 --
425 -- However, /exactly/ 63 characters is acceptable:
426 --
427 -- >>> pretty_print $ parse label "" (replicate 63 'x')
428 -- xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
429 --
430 label :: Parser Label
431 label = do
432 l <- letter -- Guaranteed to be there
433 maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
434 case maybe_s of
435 -- It can only be one character long, from the letter...
436 Nothing -> return $ Label l maybe_s
437
438 -- The letter gives us one character, so we check that the rest is
439 -- less than 62 characters long. But in the error message we need
440 -- to report 63.
441 Just s -> if (length_ldh_str_let_dig s) <= 62
442 then return $ Label l maybe_s
443 else fail "labels must be 63 or fewer characters"
444
445
446
447 -- * Subdomains
448
449
450 -- | The data type representing a \"subdomain\" from RFC1035:
451 --
452 -- <subdomain> ::= <label> | <subdomain> "." <label>
453 --
454 -- We have reversed the order of the subdomain and label in the
455 -- second option, however. This is explained in 'subdomain'.
456 --
457 data Subdomain =
458 SubdomainSingleLabel Label |
459 SubdomainMultipleLabel Label Subdomain
460 deriving (Eq, Show)
461
462 instance Pretty Subdomain where
463 pretty_show (SubdomainSingleLabel l) = pretty_show l
464 pretty_show (SubdomainMultipleLabel l s) =
465 (pretty_show l) ++ "." ++ (pretty_show s)
466
467 -- | Parse an RFC1035 \"subdomain\". The given grammar is,
468 --
469 -- <subdomain> ::= <label> | <subdomain> "." <label>
470 --
471 -- However, we have reversed the order of the subdomain and label to
472 -- prevent infinite recursion. The second option (subdomain + label)
473 -- is obviously more specific, we we need to try it first. This
474 -- presents a problem: we're trying to parse a subdomain in terms of
475 -- a subdomain! The given grammar represents subdomains how we like
476 -- to think of them; from right to left. But it's better to parse
477 -- from left to right, so we pick off the leading label and then
478 -- recurse into the definition of subdomain.
479 --
480 -- ==== _Examples_
481 --
482 -- >>> import Text.Parsec ( parseTest )
483 --
484 -- Make sure we can parse a single character:
485 --
486 -- >>> parseTest subdomain "a"
487 -- SubdomainSingleLabel (Label (Letter 'a') Nothing)
488 --
489 -- >>> pretty_print $ parse subdomain "" "example.com"
490 -- example.com
491 --
492 -- >>> pretty_print $ parse subdomain "" "www.example.com"
493 -- www.example.com
494 --
495 subdomain :: Parser Subdomain
496 subdomain = try both <|> just_one
497 where
498 both :: Parser Subdomain
499 both = do
500 l <- label
501 char '.'
502 s <- subdomain
503 return (SubdomainMultipleLabel l s)
504
505 just_one :: Parser Subdomain
506 just_one = fmap SubdomainSingleLabel label
507
508
509
510 -- * Domains
511
512 -- | An RFC1035 domain. According to RFC1035 a domain can be either a
513 -- subdomain or \" \", which according to RFC2181
514 -- <https://tools.ietf.org/html/rfc2181#section-11> means the root:
515 --
516 -- The zero length full name is defined as representing the root
517 -- of the DNS tree, and is typically written and displayed as
518 -- \".\".
519 --
520 -- We let the 'Domain' type remain true to those RFCs, even though
521 -- they don't support an absolute domain name of e.g. a single dot.
522 -- We have one more data type 'UserDomain' which handles the possibility
523 -- of an absolute path.
524 --
525 data Domain =
526 DomainName Subdomain |
527 DomainRoot
528 deriving (Eq, Show)
529
530 instance Pretty Domain where
531 pretty_show DomainRoot = ""
532 pretty_show (DomainName s) = pretty_show s
533
534 -- | Parse an RFC1035 \"domain\"
535 --
536 -- ==== _Examples_
537 --
538 -- >>> import Text.Parsec ( parseTest )
539 --
540 -- Make sure we can parse a single character:
541 --
542 -- >>> parseTest domain "a"
543 -- DomainName (SubdomainSingleLabel (Label (Letter 'a') Nothing))
544 --
545 -- And the empty domain:
546 --
547 -- >>> parseTest domain ""
548 -- DomainRoot
549 --
550 -- We will in fact parse the \"empty\" domain off the front of
551 -- pretty much anything:
552 --
553 -- >>> parseTest domain "8===D"
554 -- DomainRoot
555 --
556 -- Equality of domains is case-insensitive:
557 --
558 -- >>> let (Right r1) = parse domain "" "example.com"
559 -- >>> let (Right r2) = parse domain "" "ExaMPle.coM"
560 -- >>> r1 == r2
561 -- True
562 --
563 -- A single dot IS parsed as the root, but the dot isn't consumed:
564 --
565 -- >>> parseTest domain "."
566 -- DomainRoot
567 --
568 domain :: Parser Domain
569 domain = try parse_subdomain <|> parse_empty
570 where
571 parse_subdomain = fmap DomainName subdomain
572 parse_empty = string "" >> return DomainRoot
573
574
575
576 -- * User domains
577
578 -- | This type helps clarify some murkiness in the DNS \"domain\" name
579 -- specification. In RFC1034, it is acknowledged that a domain name
580 -- input with a trailing \".\" will represent an absolute domain
581 -- name (i.e. with respect to the DNS root). However, the grammar in
582 -- RFC1035 disallows a trailing dot.
583 --
584 -- This makes some sense: within the DNS, everything knows its
585 -- position in the tree. The relative/absolute distinction only
586 -- makes sense on the client side, where a user's resolver might
587 -- decide to append some suffix to a relative
588 -- request. Unfortunately, that's where we live. So we have to deal
589 -- with the possibility of having a trailing dot at the end of any
590 -- domain name.
591 --
592 data UserDomain =
593 UserDomainRelative Domain |
594 UserDomainAbsolute Domain
595 deriving (Eq, Show)
596
597 instance Pretty UserDomain where
598 pretty_show (UserDomainRelative d) = pretty_show d
599 pretty_show (UserDomainAbsolute d) = (pretty_show d) ++ "."
600
601
602 -- | Parse a 'UserDomain'. This is what we'll be using to read user
603 -- input, since it supports both relative and absolute domain names
604 -- (unlike the implicitly-absolute 'Domain').
605 --
606 -- ==== _Examples_
607 --
608 -- >>> import Text.Parsec ( parseTest )
609 --
610 -- We can really parse the root now!
611 --
612 -- >>> parseTest user_domain "."
613 -- UserDomainAbsolute DomainRoot
614 --
615 -- But multiple dots aren't (only the first):
616 --
617 -- >>> pretty_print $ parse user_domain "" ".."
618 -- .
619 --
620 -- We can also optionally have a trailing dot at the end of a
621 -- non-empty name:
622 --
623 -- >>> pretty_print $ parse user_domain "" "www.example.com"
624 -- www.example.com
625 --
626 -- >>> pretty_print $ parse user_domain "" "www.example.com."
627 -- www.example.com.
628 --
629 -- A \"relative root\" can also be parsed, letting the user's
630 -- resolver deal with it:
631 --
632 -- >>> parseTest user_domain ""
633 -- UserDomainRelative DomainRoot
634 --
635 user_domain :: Parser UserDomain
636 user_domain = try absolute <|> relative
637 where
638 absolute :: Parser UserDomain
639 absolute = do
640 d <- domain
641 r <- char '.'
642 return $ UserDomainAbsolute d
643
644 relative :: Parser UserDomain
645 relative = fmap UserDomainRelative domain