]> gitweb.michael.orlitzky.com - dead/harbl.git/blob - src/Domain.hs
More import/export cleanup.
[dead/harbl.git] / src / Domain.hs
1 -- | The 'Domain' data type and its parser. A 'Domain' represents a
2 -- name in the domain name system (DNS) as described by
3 -- RFC1035. In particular, we enforce the restrictions from Section
4 -- 2.3.1 \"Preferred name syntax\". See for example,
5 --
6 -- <https://tools.ietf.org/html/rfc1035#section-2.3.1>
7 --
8 module Domain (
9 UserDomain,
10 user_domain )
11 where
12
13 import Data.Char ( toLower )
14 import Text.Parsec (
15 ParseError,
16 (<|>),
17 alphaNum,
18 char,
19 eof,
20 many1,
21 option,
22 optionMaybe,
23 parse,
24 string,
25 try,
26 unexpected )
27 import qualified Text.Parsec as Parsec ( digit, letter)
28 import Text.Parsec.String ( Parser )
29
30 import Pretty ( Pretty(..) )
31
32 -- * Digits
33
34 -- | A wrapper around a digit character.
35 --
36 newtype Digit = Digit Char deriving (Eq, Show)
37 instance Pretty Digit where pretty_show (Digit d) = [d]
38
39 -- | Parse a single digit, but wrap it in our 'Digit' type.
40 --
41 digit :: Parser Digit
42 digit = fmap Digit Parsec.digit
43
44
45 -- * Letters
46
47 -- | A wrapper around a letter character.
48 --
49 newtype Letter = Letter Char deriving (Show)
50 instance Pretty Letter where pretty_show (Letter l) = [l]
51
52
53 -- | Parse a single letter, but wrap it in our 'Letter' type.
54 --
55 letter :: Parser Letter
56 letter = fmap Letter Parsec.letter
57
58 -- | The derived instance of 'Eq' for letters is incorrect. All
59 -- comparisons should be made case-insensitively. The following
60 -- is an excerpt from RFC1035:
61 --
62 -- 2.3.3. Character Case
63 --
64 -- For all parts of the DNS that are part of the official
65 -- protocol, all comparisons between character strings (e.g.,
66 -- labels, domain names, etc.) are done in a case-insensitive
67 -- manner...
68 --
69 -- Since each part of DNS name is composed of our custom types, it
70 -- suffices to munge the equality for 'Letter'.
71 --
72 instance Eq Letter where
73 (Letter l1) == (Letter l2) = (toLower l1) == (toLower l2)
74
75 -- * Letters/Digits
76
77 -- | A sum type representing either a letter or a digit.
78 --
79 data LetDig =
80 LetDigLetter Letter |
81 LetDigDigit Digit
82 deriving (Eq, Show)
83
84 instance Pretty LetDig where
85 pretty_show (LetDigLetter l) = pretty_show l
86 pretty_show (LetDigDigit d) = pretty_show d
87
88 -- | Parse a letter or a digit and wrap it in our 'LetDig' type.
89 --
90 let_dig :: Parser LetDig
91 let_dig = (fmap LetDigLetter letter) <|> (fmap LetDigDigit digit)
92
93
94 -- * Hyphens
95
96 -- | A wrapper around a single hyphen character.
97 --
98 newtype Hyphen = Hyphen Char deriving (Eq, Show)
99 instance Pretty Hyphen where pretty_show (Hyphen h) = [h]
100
101 -- | Parse a single hyphen and wrap it in our 'Hyphen' type.
102 --
103 hyphen :: Parser Hyphen
104 hyphen = fmap Hyphen (char '-')
105
106
107 -- * Letter, Digit, or Hyphen.
108
109 -- | A sum type representing a letter, digit, or hyphen.
110 --
111 data LetDigHyp =
112 LetDigHypLetDig LetDig |
113 LetDigHypHyphen Hyphen
114 deriving (Eq, Show)
115
116 instance Pretty LetDigHyp where
117 pretty_show (LetDigHypLetDig ld) = pretty_show ld
118 pretty_show (LetDigHypHyphen h) = pretty_show h
119
120
121 -- | The following is the simplest type in the domain grammar that
122 -- isn't already implemented for us.
123 --
124 -- <let-dig> ::= <letter> | <digit>
125 --
126 -- ==== _Examples_
127 --
128 -- >>> import Text.Parsec ( parseTest )
129 --
130 -- Letters, digits, and hyphens are all parsed:
131 --
132 -- >>> parseTest let_dig_hyp "a"
133 -- LetDigHypLetDig (LetDigLetter (Letter 'a'))
134 --
135 -- >>> parseTest let_dig_hyp "7"
136 -- LetDigHypLetDig (LetDigDigit (Digit '7'))
137 --
138 -- >>> parseTest let_dig_hyp "-"
139 -- LetDigHypHyphen (Hyphen '-')
140 --
141 -- However, an underscore (for example) is not:
142 --
143 -- >>> parseTest let_dig_hyp "_"
144 -- parse error at (line 1, column 1):
145 -- unexpected "_"
146 -- expecting letter, digit or "-"
147 --
148 let_dig_hyp :: Parser LetDigHyp
149 let_dig_hyp =
150 parse_letdig <|> parse_hyphen
151 where
152 parse_letdig :: Parser LetDigHyp
153 parse_letdig = fmap LetDigHypLetDig let_dig
154
155 parse_hyphen :: Parser LetDigHyp
156 parse_hyphen = fmap LetDigHypHyphen hyphen
157
158
159 -- * Letter/Digit/Hyphen strings
160
161 -- | A string of letters, digits, and hyphens from the RFC1035 grammar:
162 --
163 -- <ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str>
164 --
165 -- These are represented as either a single instance of a
166 -- 'LetDigHyp', or a string of them (recursive).
167 --
168 data LdhStr =
169 LdhStrSingleLdh LetDigHyp |
170 LdhStrMultipleLdh LetDigHyp LdhStr
171 deriving (Eq, Show)
172
173 instance Pretty LdhStr where
174 pretty_show (LdhStrSingleLdh ldh) = pretty_show ldh
175 pretty_show (LdhStrMultipleLdh ldh s) = (pretty_show ldh) ++ (pretty_show s)
176
177 -- | Parse a string of letters, digits, and hyphens (an 'LdhStr').
178 --
179 -- ==== _Examples_
180 --
181 -- >>> import Text.Parsec ( parseTest )
182 --
183 -- Single letters, digits, and hyphens are parsed:
184 --
185 -- >>> parseTest ldh_str "a"
186 -- LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a')))
187 --
188 -- >>> parseTest ldh_str "0"
189 -- LdhStrSingleLdh (LetDigHypLetDig (LetDigDigit (Digit '0')))
190 --
191 -- >>> parseTest ldh_str "-"
192 -- LdhStrSingleLdh (LetDigHypHyphen (Hyphen '-'))
193 --
194 -- As well as strings of them:
195 --
196 -- >>> pretty_print $ parse ldh_str "" "a0-b"
197 -- a0-b
198 --
199 ldh_str :: Parser LdhStr
200 ldh_str = try both <|> just_one
201 where
202 both :: Parser LdhStr
203 both = do
204 ldh1 <- let_dig_hyp
205 ldh_tail <- ldh_str
206 return $ LdhStrMultipleLdh ldh1 ldh_tail
207
208 just_one :: Parser LdhStr
209 just_one = fmap LdhStrSingleLdh let_dig_hyp
210
211
212
213 -- | A version of 'last' that works on a 'LdhStr' rather than a
214 -- list. That is, it returns the last 'LetDigHyp' in the
215 -- string. Since 'LdhStr' contains at least one character, there's
216 -- no \"nil\" case here.
217 --
218 -- ==== _Examples_
219 --
220 -- >>> let (Right r) = parse ldh_str "" "a"
221 -- >>> last_ldh_str r
222 -- LetDigHypLetDig (LetDigLetter (Letter 'a'))
223 --
224 -- >>> let (Right r) = parse ldh_str "" "abc-def"
225 -- >>> last_ldh_str r
226 -- LetDigHypLetDig (LetDigLetter (Letter 'f'))
227 --
228 last_ldh_str :: LdhStr -> LetDigHyp
229 last_ldh_str (LdhStrSingleLdh x) = x
230 last_ldh_str (LdhStrMultipleLdh _ x) = last_ldh_str x
231
232
233 -- | A version of 'init' that works on a 'LdhStr' rather than a
234 -- list. That is, it returns everything /except/ the last character in
235 -- the string.
236 --
237 -- Since an 'LdhStr' must contain at least one character, this might
238 -- not be opssible (when the input is of length one). So, we return
239 -- a 'Maybe' value.
240 --
241 -- ==== _Examples_
242 --
243 -- >>> let (Right r) = parse ldh_str "" "a"
244 -- >>> init_ldh_str r
245 -- Nothing
246 --
247 -- >>> let (Right r) = parse ldh_str "" "ab"
248 -- >>> init_ldh_str r
249 -- Just (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))))
250 --
251 -- >>> let (Right r) = parse ldh_str "" "abc-def"
252 -- >>> init_ldh_str r
253 -- Just (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'b'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'c'))) (LdhStrMultipleLdh (LetDigHypHyphen (Hyphen '-')) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'd'))) (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'e')))))))))
254 --
255 init_ldh_str :: LdhStr -> Maybe LdhStr
256 init_ldh_str (LdhStrSingleLdh _) = Nothing
257 init_ldh_str (LdhStrMultipleLdh h t) =
258 Just $ case (init_ldh_str t) of
259 -- We just got the second-to-last character, we're done.
260 Nothing -> LdhStrSingleLdh h
261
262 -- There's still more stuff. Recurse.
263 Just rest -> LdhStrMultipleLdh h rest
264
265
266 -- | Compute the length of an 'LdhStr'. It will be at least one, since
267 -- 'LdhStr's are non-empty. And if there's something other than the
268 -- first character present, we simply recurse.
269 --
270 -- ==== _Examples_
271 --
272 -- >>> let (Right r) = parse ldh_str "" "a"
273 -- >>> length_ldh_str r
274 -- 1
275 --
276 -- >>> let (Right r) = parse ldh_str "" "abc-def"
277 -- >>> length_ldh_str r
278 -- 7
279 --
280 length_ldh_str :: LdhStr -> Int
281 length_ldh_str (LdhStrSingleLdh _) = 1
282 length_ldh_str (LdhStrMultipleLdh _ t) = 1 + (length_ldh_str t)
283
284 -- * Letter/Digit/Hyphen string followed by a trailing Letter/Digit
285
286 -- | This type isn't explicitly part of the grammar, but it's what
287 -- shows up in the square brackets of,
288 --
289 -- <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
290 --
291 -- The ldh-str is optional, but if one is present, we must also have
292 -- a trailing let-dig to prevent the name from ending with a
293 -- hyphen. This can be represented with a @Maybe LdhStrLetDig@,
294 -- which is why we're about to define it.
295 --
296 data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
297 deriving (Eq, Show)
298
299 instance Pretty LdhStrLetDig where
300 pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
301 pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
302
303 -- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
304 -- as well define the parser for it independently since we gave it
305 -- its own data type.
306 --
307 -- ==== _Examples_
308 --
309 -- >>> import Text.Parsec ( parseTest )
310 --
311 -- Make sure we can parse a single character:
312 --
313 -- >>> parseTest ldh_str_let_dig "a"
314 -- LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
315 --
316 -- And longer strings:
317 --
318 -- >>> pretty_print $ parse ldh_str_let_dig "" "ab"
319 -- ab
320 --
321 -- >>> pretty_print $ parse ldh_str_let_dig "" "-b"
322 -- -b
323 --
324 -- >>> parseTest ldh_str_let_dig "b-"
325 -- parse error at (line 1, column 3):
326 -- label cannot end with a hyphen
327 --
328 ldh_str_let_dig :: Parser LdhStrLetDig
329 ldh_str_let_dig = do
330 -- This will happily eat up the trailing let-dig...
331 full_ldh <- ldh_str
332
333 -- So we have to go back and see what happened.
334 case (last_ldh_str full_ldh) of
335 (LetDigHypHyphen _) -> fail "label cannot end with a hyphen"
336 (LetDigHypLetDig ld) ->
337 -- Ok, the label didn't end with a hyphen; now we need to split
338 -- off the last letter/digit so we can pack it into our return
339 -- type separately.
340 return $ case (init_ldh_str full_ldh) of
341 -- We only parsed one letter/digit. This can happen
342 -- if the label contains two characters. For example,
343 -- if we try to parse the label "ab", then the "a"
344 -- will be eaten by the label parser, and this
345 -- function will be left with only "b".
346 Nothing -> LdhStrLetDig Nothing ld
347
348 -- Usual case: there's was some leading let-dig-hyp junk,
349 -- return it too.
350 leading_ldhs -> LdhStrLetDig leading_ldhs ld
351
352
353
354 -- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
355 -- the let-dig at the end is always there. And when there's an
356 -- ldh-str too, we add its length to one.
357 --
358 -- ==== _Examples_
359 --
360 -- >>> let (Right r) = parse ldh_str_let_dig "" "a"
361 -- >>> length_ldh_str_let_dig r
362 -- 1
363 --
364 -- >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
365 -- >>> length_ldh_str_let_dig r
366 -- 7
367 --
368 length_ldh_str_let_dig :: LdhStrLetDig -> Int
369 length_ldh_str_let_dig (LdhStrLetDig Nothing _) = 1
370 length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
371 1 + (length_ldh_str ldhstring)
372
373
374 -- * Labels
375
376 -- | The label type from the RFC1035 grammar:
377 --
378 -- <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
379 --
380 data Label = Label Letter (Maybe LdhStrLetDig)
381 deriving (Eq, Show)
382
383 instance Pretty Label where
384 pretty_show (Label l Nothing) = pretty_show l
385 pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
386
387 -- | Parse a 'Label'.
388 --
389 -- In addition to the grammar, there's another restriction on
390 -- labels: their length must be 63 characters or less. Quoting
391 -- Section 2.3.1, \"Preferred name syntax\", of RFC1035:
392 --
393 -- The labels must follow the rules for ARPANET host names. They
394 -- must start with a letter, end with a letter or digit, and have
395 -- as interior characters only letters, digits, and hyphen. There
396 -- are also some restrictions on the length. Labels must be 63
397 -- characters or less.
398 --
399 -- We check this only after we have successfully parsed a label.
400 --
401 -- ==== _Examples_
402 --
403 -- >>> import Text.Parsec ( parseTest )
404 --
405 -- Make sure we can parse a single character:
406 --
407 -- >>> parseTest label "a"
408 -- Label (Letter 'a') Nothing
409 --
410 -- And longer strings:
411 --
412 -- >>> pretty_print $ parse label "" "abc-def"
413 -- abc-def
414 --
415 -- But not anything ending in a hyphen:
416 --
417 -- >>> parseTest label "abc-"
418 -- parse error at (line 1, column 5):
419 -- label cannot end with a hyphen
420 --
421 -- Or anything over 63 characters:
422 --
423 -- >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
424 -- parse error at (line 1, column 79):
425 -- labels must be 63 or fewer characters
426 --
427 -- However, /exactly/ 63 characters is acceptable:
428 --
429 -- >>> pretty_print $ parse label "" (replicate 63 'x')
430 -- xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
431 --
432 label :: Parser Label
433 label = do
434 l <- letter -- Guaranteed to be there
435 maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
436 case maybe_s of
437 -- It can only be one character long, from the letter...
438 Nothing -> return $ Label l maybe_s
439
440 -- The letter gives us one character, so we check that the rest is
441 -- less than 62 characters long. But in the error message we need
442 -- to report 63.
443 Just s -> if (length_ldh_str_let_dig s) <= 62
444 then return $ Label l maybe_s
445 else fail "labels must be 63 or fewer characters"
446
447
448
449 -- * Subdomains
450
451
452 -- | The data type representing a \"subdomain\" from RFC1035:
453 --
454 -- <subdomain> ::= <label> | <subdomain> "." <label>
455 --
456 -- We have reversed the order of the subdomain and label in the
457 -- second option, however. This is explained in 'subdomain'.
458 --
459 data Subdomain =
460 SubdomainSingleLabel Label |
461 SubdomainMultipleLabel Label Subdomain
462 deriving (Eq, Show)
463
464 instance Pretty Subdomain where
465 pretty_show (SubdomainSingleLabel l) = pretty_show l
466 pretty_show (SubdomainMultipleLabel l s) =
467 (pretty_show l) ++ "." ++ (pretty_show s)
468
469 -- | Parse an RFC1035 \"subdomain\". The given grammar is,
470 --
471 -- <subdomain> ::= <label> | <subdomain> "." <label>
472 --
473 -- However, we have reversed the order of the subdomain and label to
474 -- prevent infinite recursion. The second option (subdomain + label)
475 -- is obviously more specific, we we need to try it first. This
476 -- presents a problem: we're trying to parse a subdomain in terms of
477 -- a subdomain! The given grammar represents subdomains how we like
478 -- to think of them; from right to left. But it's better to parse
479 -- from left to right, so we pick off the leading label and then
480 -- recurse into the definition of subdomain.
481 --
482 -- ==== _Examples_
483 --
484 -- >>> import Text.Parsec ( parseTest )
485 --
486 -- Make sure we can parse a single character:
487 --
488 -- >>> parseTest subdomain "a"
489 -- SubdomainSingleLabel (Label (Letter 'a') Nothing)
490 --
491 -- >>> pretty_print $ parse subdomain "" "example.com"
492 -- example.com
493 --
494 -- >>> pretty_print $ parse subdomain "" "www.example.com"
495 -- www.example.com
496 --
497 subdomain :: Parser Subdomain
498 subdomain = try both <|> just_one
499 where
500 both :: Parser Subdomain
501 both = do
502 l <- label
503 char '.'
504 s <- subdomain
505 return (SubdomainMultipleLabel l s)
506
507 just_one :: Parser Subdomain
508 just_one = fmap SubdomainSingleLabel label
509
510
511
512 -- * Domains
513
514 -- | An RFC1035 domain. According to RFC1035 a domain can be either a
515 -- subdomain or \" \", which according to RFC2181
516 -- <https://tools.ietf.org/html/rfc2181#section-11> means the root:
517 --
518 -- The zero length full name is defined as representing the root
519 -- of the DNS tree, and is typically written and displayed as
520 -- \".\".
521 --
522 -- We let the 'Domain' type remain true to those RFCs, even though
523 -- they don't support an absolute domain name of e.g. a single dot.
524 -- We have one more data type 'UserDomain' which handles the possibility
525 -- of an absolute path.
526 --
527 data Domain =
528 DomainName Subdomain |
529 DomainRoot
530 deriving (Eq, Show)
531
532 instance Pretty Domain where
533 pretty_show DomainRoot = ""
534 pretty_show (DomainName s) = pretty_show s
535
536 -- | Parse an RFC1035 \"domain\"
537 --
538 -- ==== _Examples_
539 --
540 -- >>> import Text.Parsec ( parseTest )
541 --
542 -- Make sure we can parse a single character:
543 --
544 -- >>> parseTest domain "a"
545 -- DomainName (SubdomainSingleLabel (Label (Letter 'a') Nothing))
546 --
547 -- And the empty domain:
548 --
549 -- >>> parseTest domain ""
550 -- DomainRoot
551 --
552 -- We will in fact parse the \"empty\" domain off the front of
553 -- pretty much anything:
554 --
555 -- >>> parseTest domain "8===D"
556 -- DomainRoot
557 --
558 -- Equality of domains is case-insensitive:
559 --
560 -- >>> let (Right r1) = parse domain "" "example.com"
561 -- >>> let (Right r2) = parse domain "" "ExaMPle.coM"
562 -- >>> r1 == r2
563 -- True
564 --
565 -- A single dot IS parsed as the root, but the dot isn't consumed:
566 --
567 -- >>> parseTest domain "."
568 -- DomainRoot
569 --
570 domain :: Parser Domain
571 domain = try parse_subdomain <|> parse_empty
572 where
573 parse_subdomain = fmap DomainName subdomain
574 parse_empty = string "" >> return DomainRoot
575
576
577
578 -- * User domains
579
580 -- | This type helps clarify some murkiness in the DNS \"domain\" name
581 -- specification. In RFC1034, it is acknowledged that a domain name
582 -- input with a trailing \".\" will represent an absolute domain
583 -- name (i.e. with respect to the DNS root). However, the grammar in
584 -- RFC1035 disallows a trailing dot.
585 --
586 -- This makes some sense: within the DNS, everything knows its
587 -- position in the tree. The relative/absolute distinction only
588 -- makes sense on the client side, where a user's resolver might
589 -- decide to append some suffix to a relative
590 -- request. Unfortunately, that's where we live. So we have to deal
591 -- with the possibility of having a trailing dot at the end of any
592 -- domain name.
593 --
594 data UserDomain =
595 UserDomainRelative Domain |
596 UserDomainAbsolute Domain
597 deriving (Eq, Show)
598
599 instance Pretty UserDomain where
600 pretty_show (UserDomainRelative d) = pretty_show d
601 pretty_show (UserDomainAbsolute d) = (pretty_show d) ++ "."
602
603
604 -- | Parse a 'UserDomain'. This is what we'll be using to read user
605 -- input, since it supports both relative and absolute domain names
606 -- (unlike the implicitly-absolute 'Domain').
607 --
608 -- ==== _Examples_
609 --
610 -- >>> import Text.Parsec ( parseTest )
611 --
612 -- We can really parse the root now!
613 --
614 -- >>> parseTest user_domain "."
615 -- UserDomainAbsolute DomainRoot
616 --
617 -- But multiple dots aren't (only the first):
618 --
619 -- >>> pretty_print $ parse user_domain "" ".."
620 -- .
621 --
622 -- We can also optionally have a trailing dot at the end of a
623 -- non-empty name:
624 --
625 -- >>> pretty_print $ parse user_domain "" "www.example.com"
626 -- www.example.com
627 --
628 -- >>> pretty_print $ parse user_domain "" "www.example.com."
629 -- www.example.com.
630 --
631 -- A \"relative root\" can also be parsed, letting the user's
632 -- resolver deal with it:
633 --
634 -- >>> parseTest user_domain ""
635 -- UserDomainRelative DomainRoot
636 --
637 user_domain :: Parser UserDomain
638 user_domain = try absolute <|> relative
639 where
640 absolute :: Parser UserDomain
641 absolute = do
642 d <- domain
643 r <- char '.'
644 return $ UserDomainAbsolute d
645
646 relative :: Parser UserDomain
647 relative = fmap UserDomainRelative domain