1 -- | The 'Domain' data type and its parser. A 'Domain' represents a
2 -- name in the domain name system (DNS) as described by
3 -- RFC1035. In particular, we enforce the restrictions from Section
4 -- 2.3.1 \"Preferred name syntax\". See for example,
6 -- <https://tools.ietf.org/html/rfc1035#section-2.3.1>
8 -- We basically work with strings and characters everywhere, even
9 -- though this isn't really correct. The length specifications in
10 -- the RFCs are all in terms of octets, so really a ByteString.Char8
11 -- would be more appropriate. With strings, for example, we could
12 -- have a unicode mumbo jumbo character that takes up two bytes
20 import Data.Char ( toLower )
34 import qualified Text.Parsec as Parsec ( digit, letter)
35 import Text.Parsec.String ( Parser )
37 import Pretty ( Pretty(..) )
41 -- | A wrapper around a digit character.
43 newtype Digit = Digit Char deriving (Eq, Show)
44 instance Pretty Digit where pretty_show (Digit d) = [d]
46 -- | Parse a single digit, but wrap it in our 'Digit' type.
49 digit = fmap Digit Parsec.digit
54 -- | A wrapper around a letter character.
56 newtype Letter = Letter Char deriving (Show)
57 instance Pretty Letter where pretty_show (Letter l) = [l]
60 -- | Parse a single letter, but wrap it in our 'Letter' type.
62 letter :: Parser Letter
63 letter = fmap Letter Parsec.letter
65 -- | The derived instance of 'Eq' for letters is incorrect. All
66 -- comparisons should be made case-insensitively. The following
67 -- is an excerpt from RFC1035:
69 -- 2.3.3. Character Case
71 -- For all parts of the DNS that are part of the official
72 -- protocol, all comparisons between character strings (e.g.,
73 -- labels, domain names, etc.) are done in a case-insensitive
76 -- Since each part of DNS name is composed of our custom types, it
77 -- suffices to munge the equality for 'Letter'.
79 instance Eq Letter where
80 (Letter l1) == (Letter l2) = (toLower l1) == (toLower l2)
84 -- | A sum type representing either a letter or a digit.
91 instance Pretty LetDig where
92 pretty_show (LetDigLetter l) = pretty_show l
93 pretty_show (LetDigDigit d) = pretty_show d
95 -- | Parse a letter or a digit and wrap it in our 'LetDig' type.
97 let_dig :: Parser LetDig
98 let_dig = (fmap LetDigLetter letter) <|> (fmap LetDigDigit digit)
103 -- | A wrapper around a single hyphen character.
105 newtype Hyphen = Hyphen Char deriving (Eq, Show)
106 instance Pretty Hyphen where pretty_show (Hyphen h) = [h]
108 -- | Parse a single hyphen and wrap it in our 'Hyphen' type.
110 hyphen :: Parser Hyphen
111 hyphen = fmap Hyphen (char '-')
114 -- * Letter, Digit, or Hyphen.
116 -- | A sum type representing a letter, digit, or hyphen.
119 LetDigHypLetDig LetDig |
120 LetDigHypHyphen Hyphen
123 instance Pretty LetDigHyp where
124 pretty_show (LetDigHypLetDig ld) = pretty_show ld
125 pretty_show (LetDigHypHyphen h) = pretty_show h
128 -- | The following is the simplest type in the domain grammar that
129 -- isn't already implemented for us.
131 -- <let-dig> ::= <letter> | <digit>
135 -- >>> import Text.Parsec ( parseTest )
137 -- Letters, digits, and hyphens are all parsed:
139 -- >>> parseTest let_dig_hyp "a"
140 -- LetDigHypLetDig (LetDigLetter (Letter 'a'))
142 -- >>> parseTest let_dig_hyp "7"
143 -- LetDigHypLetDig (LetDigDigit (Digit '7'))
145 -- >>> parseTest let_dig_hyp "-"
146 -- LetDigHypHyphen (Hyphen '-')
148 -- However, an underscore (for example) is not:
150 -- >>> parseTest let_dig_hyp "_"
151 -- parse error at (line 1, column 1):
153 -- expecting letter, digit or "-"
155 let_dig_hyp :: Parser LetDigHyp
157 parse_letdig <|> parse_hyphen
159 parse_letdig :: Parser LetDigHyp
160 parse_letdig = fmap LetDigHypLetDig let_dig
162 parse_hyphen :: Parser LetDigHyp
163 parse_hyphen = fmap LetDigHypHyphen hyphen
166 -- * Letter/Digit/Hyphen strings
168 -- | A string of letters, digits, and hyphens from the RFC1035 grammar:
170 -- <ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str>
172 -- These are represented as either a single instance of a
173 -- 'LetDigHyp', or a string of them (recursive).
176 LdhStrSingleLdh LetDigHyp |
177 LdhStrMultipleLdh LetDigHyp LdhStr
180 instance Pretty LdhStr where
181 pretty_show (LdhStrSingleLdh ldh) = pretty_show ldh
182 pretty_show (LdhStrMultipleLdh ldh s) = (pretty_show ldh) ++ (pretty_show s)
184 -- | Parse a string of letters, digits, and hyphens (an 'LdhStr').
188 -- >>> import Text.Parsec ( parseTest )
190 -- Single letters, digits, and hyphens are parsed:
192 -- >>> parseTest ldh_str "a"
193 -- LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a')))
195 -- >>> parseTest ldh_str "0"
196 -- LdhStrSingleLdh (LetDigHypLetDig (LetDigDigit (Digit '0')))
198 -- >>> parseTest ldh_str "-"
199 -- LdhStrSingleLdh (LetDigHypHyphen (Hyphen '-'))
201 -- As well as strings of them:
203 -- >>> pretty_print $ parse ldh_str "" "a0-b"
206 ldh_str :: Parser LdhStr
207 ldh_str = try both <|> just_one
209 both :: Parser LdhStr
213 return $ LdhStrMultipleLdh ldh1 ldh_tail
215 just_one :: Parser LdhStr
216 just_one = fmap LdhStrSingleLdh let_dig_hyp
220 -- | A version of 'last' that works on a 'LdhStr' rather than a
221 -- list. That is, it returns the last 'LetDigHyp' in the
222 -- string. Since 'LdhStr' contains at least one character, there's
223 -- no \"nil\" case here.
227 -- >>> let (Right r) = parse ldh_str "" "a"
228 -- >>> last_ldh_str r
229 -- LetDigHypLetDig (LetDigLetter (Letter 'a'))
231 -- >>> let (Right r) = parse ldh_str "" "abc-def"
232 -- >>> last_ldh_str r
233 -- LetDigHypLetDig (LetDigLetter (Letter 'f'))
235 last_ldh_str :: LdhStr -> LetDigHyp
236 last_ldh_str (LdhStrSingleLdh x) = x
237 last_ldh_str (LdhStrMultipleLdh _ x) = last_ldh_str x
240 -- | A version of 'init' that works on a 'LdhStr' rather than a
241 -- list. That is, it returns everything /except/ the last character in
244 -- Since an 'LdhStr' must contain at least one character, this might
245 -- not be opssible (when the input is of length one). So, we return
250 -- >>> let (Right r) = parse ldh_str "" "a"
251 -- >>> init_ldh_str r
254 -- >>> let (Right r) = parse ldh_str "" "ab"
255 -- >>> init_ldh_str r
256 -- Just (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))))
258 -- >>> let (Right r) = parse ldh_str "" "abc-def"
259 -- >>> init_ldh_str r
260 -- Just (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'b'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'c'))) (LdhStrMultipleLdh (LetDigHypHyphen (Hyphen '-')) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'd'))) (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'e')))))))))
262 init_ldh_str :: LdhStr -> Maybe LdhStr
263 init_ldh_str (LdhStrSingleLdh _) = Nothing
264 init_ldh_str (LdhStrMultipleLdh h t) =
265 Just $ case (init_ldh_str t) of
266 -- We just got the second-to-last character, we're done.
267 Nothing -> LdhStrSingleLdh h
269 -- There's still more stuff. Recurse.
270 Just rest -> LdhStrMultipleLdh h rest
273 -- | Compute the length of an 'LdhStr'. It will be at least one, since
274 -- 'LdhStr's are non-empty. And if there's something other than the
275 -- first character present, we simply recurse.
279 -- >>> let (Right r) = parse ldh_str "" "a"
280 -- >>> length_ldh_str r
283 -- >>> let (Right r) = parse ldh_str "" "abc-def"
284 -- >>> length_ldh_str r
287 length_ldh_str :: LdhStr -> Int
288 length_ldh_str (LdhStrSingleLdh _) = 1
289 length_ldh_str (LdhStrMultipleLdh _ t) = 1 + (length_ldh_str t)
291 -- * Letter/Digit/Hyphen string followed by a trailing Letter/Digit
293 -- | This type isn't explicitly part of the grammar, but it's what
294 -- shows up in the square brackets of,
296 -- <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
298 -- The ldh-str is optional, but if one is present, we must also have
299 -- a trailing let-dig to prevent the name from ending with a
300 -- hyphen. This can be represented with a @Maybe LdhStrLetDig@,
301 -- which is why we're about to define it.
303 data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
306 instance Pretty LdhStrLetDig where
307 pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
308 pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
310 -- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
311 -- as well define the parser for it independently since we gave it
312 -- its own data type.
316 -- >>> import Text.Parsec ( parseTest )
318 -- Make sure we can parse a single character:
320 -- >>> parseTest ldh_str_let_dig "a"
321 -- LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
323 -- And longer strings:
325 -- >>> pretty_print $ parse ldh_str_let_dig "" "ab"
328 -- >>> pretty_print $ parse ldh_str_let_dig "" "-b"
331 -- >>> parseTest ldh_str_let_dig "b-"
332 -- parse error at (line 1, column 3):
333 -- label cannot end with a hyphen
335 ldh_str_let_dig :: Parser LdhStrLetDig
337 -- This will happily eat up the trailing let-dig...
340 -- So we have to go back and see what happened.
341 case (last_ldh_str full_ldh) of
342 (LetDigHypHyphen _) -> fail "label cannot end with a hyphen"
343 (LetDigHypLetDig ld) ->
344 -- Ok, the label didn't end with a hyphen; now we need to split
345 -- off the last letter/digit so we can pack it into our return
347 return $ case (init_ldh_str full_ldh) of
348 -- We only parsed one letter/digit. This can happen
349 -- if the label contains two characters. For example,
350 -- if we try to parse the label "ab", then the "a"
351 -- will be eaten by the label parser, and this
352 -- function will be left with only "b".
353 Nothing -> LdhStrLetDig Nothing ld
355 -- Usual case: there's was some leading let-dig-hyp junk,
357 leading_ldhs -> LdhStrLetDig leading_ldhs ld
361 -- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
362 -- the let-dig at the end is always there. And when there's an
363 -- ldh-str too, we add its length to one.
367 -- >>> let (Right r) = parse ldh_str_let_dig "" "a"
368 -- >>> length_ldh_str_let_dig r
371 -- >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
372 -- >>> length_ldh_str_let_dig r
375 length_ldh_str_let_dig :: LdhStrLetDig -> Int
376 length_ldh_str_let_dig (LdhStrLetDig Nothing _) = 1
377 length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
378 1 + (length_ldh_str ldhstring)
383 -- | The label type from the RFC1035 grammar:
385 -- <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
387 data Label = Label Letter (Maybe LdhStrLetDig)
390 instance Pretty Label where
391 pretty_show (Label l Nothing) = pretty_show l
392 pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
394 -- | Parse a 'Label'.
396 -- In addition to the grammar, there's another restriction on
397 -- labels: their length must be 63 characters or less. Quoting
398 -- Section 2.3.1, \"Preferred name syntax\", of RFC1035:
400 -- The labels must follow the rules for ARPANET host names. They
401 -- must start with a letter, end with a letter or digit, and have
402 -- as interior characters only letters, digits, and hyphen. There
403 -- are also some restrictions on the length. Labels must be 63
404 -- characters or less.
406 -- We check this only after we have successfully parsed a label.
410 -- >>> import Text.Parsec ( parseTest )
412 -- Make sure we can parse a single character:
414 -- >>> parseTest label "a"
415 -- Label (Letter 'a') Nothing
417 -- And longer strings:
419 -- >>> pretty_print $ parse label "" "abc-def"
422 -- But not anything ending in a hyphen:
424 -- >>> parseTest label "abc-"
425 -- parse error at (line 1, column 5):
426 -- label cannot end with a hyphen
428 -- Or anything over 63 characters:
430 -- >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
431 -- parse error at (line 1, column 79):
432 -- labels must be 63 or fewer characters
434 -- However, /exactly/ 63 characters is acceptable:
436 -- >>> pretty_print $ parse label "" (replicate 63 'x')
437 -- xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
439 label :: Parser Label
441 l <- letter -- Guaranteed to be there
442 maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
444 -- It can only be one character long, from the letter...
445 Nothing -> return $ Label l maybe_s
447 -- The letter gives us one character, so we check that the rest is
448 -- less than 62 characters long. But in the error message we need
450 Just s -> if (length_ldh_str_let_dig s) <= 62
451 then return $ Label l maybe_s
452 else fail "labels must be 63 or fewer characters"
459 -- | The data type representing a \"subdomain\" from RFC1035:
461 -- <subdomain> ::= <label> | <subdomain> "." <label>
463 -- We have reversed the order of the subdomain and label in the
464 -- second option, however. This is explained in 'subdomain'.
467 SubdomainSingleLabel Label |
468 SubdomainMultipleLabel Label Subdomain
471 instance Pretty Subdomain where
472 pretty_show (SubdomainSingleLabel l) = pretty_show l
473 pretty_show (SubdomainMultipleLabel l s) =
474 (pretty_show l) ++ "." ++ (pretty_show s)
476 -- | Parse an RFC1035 \"subdomain\". The given grammar is,
478 -- <subdomain> ::= <label> | <subdomain> "." <label>
480 -- However, we have reversed the order of the subdomain and label to
481 -- prevent infinite recursion. The second option (subdomain + label)
482 -- is obviously more specific, we we need to try it first. This
483 -- presents a problem: we're trying to parse a subdomain in terms of
484 -- a subdomain! The given grammar represents subdomains how we like
485 -- to think of them; from right to left. But it's better to parse
486 -- from left to right, so we pick off the leading label and then
487 -- recurse into the definition of subdomain.
491 -- >>> import Text.Parsec ( parseTest )
493 -- Make sure we can parse a single character:
495 -- >>> parseTest subdomain "a"
496 -- SubdomainSingleLabel (Label (Letter 'a') Nothing)
498 -- >>> pretty_print $ parse subdomain "" "example.com"
501 -- >>> pretty_print $ parse subdomain "" "www.example.com"
504 subdomain :: Parser Subdomain
505 subdomain = try both <|> just_one
507 both :: Parser Subdomain
512 return (SubdomainMultipleLabel l s)
514 just_one :: Parser Subdomain
515 just_one = fmap SubdomainSingleLabel label
521 -- | An RFC1035 domain. According to RFC1035 a domain can be either a
522 -- subdomain or \" \", which according to RFC2181
523 -- <https://tools.ietf.org/html/rfc2181#section-11> means the root:
525 -- The zero length full name is defined as representing the root
526 -- of the DNS tree, and is typically written and displayed as
529 -- We let the 'Domain' type remain true to those RFCs, even though
530 -- they don't support an absolute domain name of e.g. a single dot.
531 -- We have one more data type 'UserDomain' which handles the possibility
532 -- of an absolute path.
535 DomainName Subdomain |
539 instance Pretty Domain where
540 pretty_show DomainRoot = ""
541 pretty_show (DomainName s) = pretty_show s
543 -- | Parse an RFC1035 \"domain\"
547 -- >>> import Text.Parsec ( parseTest )
549 -- Make sure we can parse a single character:
551 -- >>> parseTest domain "a"
552 -- DomainName (SubdomainSingleLabel (Label (Letter 'a') Nothing))
554 -- And the empty domain:
556 -- >>> parseTest domain ""
559 -- We will in fact parse the \"empty\" domain off the front of
560 -- pretty much anything:
562 -- >>> parseTest domain "8===D"
565 -- Equality of domains is case-insensitive:
567 -- >>> let (Right r1) = parse domain "" "example.com"
568 -- >>> let (Right r2) = parse domain "" "ExaMPle.coM"
572 -- A single dot IS parsed as the root, but the dot isn't consumed:
574 -- >>> parseTest domain "."
577 -- Anything over 255 characters is an error, so the root will be
580 -- >>> let big_label = replicate 63 'x'
581 -- >>> let big_subdomain = concat $ replicate 5 (big_label ++ ".")
582 -- >>> parseTest domain big_subdomain
585 -- But exactly 255 is allowed:
587 -- >>> import Data.List ( intercalate )
588 -- >>> let big_label = replicate 63 'x'
589 -- >>> let big_subdomain = intercalate "." (replicate 4 big_label)
590 -- >>> let (Right r) = parse domain "" big_subdomain
591 -- >>> length (pretty_show r)
594 domain :: Parser Domain
595 domain = try parse_subdomain <|> parse_empty
597 parse_subdomain :: Parser Domain
600 if (length $ pretty_show s) <= 255
601 then return $ DomainName s
602 else fail "subdomains can be at most 255 characters"
604 parse_empty :: Parser Domain
605 parse_empty = string "" >> return DomainRoot
611 -- | This type helps clarify some murkiness in the DNS \"domain\" name
612 -- specification. In RFC1034, it is acknowledged that a domain name
613 -- input with a trailing \".\" will represent an absolute domain
614 -- name (i.e. with respect to the DNS root). However, the grammar in
615 -- RFC1035 disallows a trailing dot.
617 -- This makes some sense: within the DNS, everything knows its
618 -- position in the tree. The relative/absolute distinction only
619 -- makes sense on the client side, where a user's resolver might
620 -- decide to append some suffix to a relative
621 -- request. Unfortunately, that's where we live. So we have to deal
622 -- with the possibility of having a trailing dot at the end of any
626 UserDomainRelative Domain |
627 UserDomainAbsolute Domain
630 instance Pretty UserDomain where
631 pretty_show (UserDomainRelative d) = pretty_show d
632 pretty_show (UserDomainAbsolute d) = (pretty_show d) ++ "."
635 -- | Parse a 'UserDomain'. This is what we'll be using to read user
636 -- input, since it supports both relative and absolute domain names
637 -- (unlike the implicitly-absolute 'Domain').
641 -- >>> import Text.Parsec ( parseTest )
643 -- We can really parse the root now!
645 -- >>> parseTest user_domain "."
646 -- UserDomainAbsolute DomainRoot
648 -- But multiple dots aren't (only the first):
650 -- >>> pretty_print $ parse user_domain "" ".."
653 -- We can also optionally have a trailing dot at the end of a
656 -- >>> pretty_print $ parse user_domain "" "www.example.com"
659 -- >>> pretty_print $ parse user_domain "" "www.example.com."
662 -- A \"relative root\" can also be parsed, letting the user's
663 -- resolver deal with it:
665 -- >>> parseTest user_domain ""
666 -- UserDomainRelative DomainRoot
668 user_domain :: Parser UserDomain
669 user_domain = try absolute <|> relative
671 absolute :: Parser UserDomain
675 return $ UserDomainAbsolute d
677 relative :: Parser UserDomain
678 relative = fmap UserDomainRelative domain