src/Network/DNS/RBL/Domain.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 -- | The 'Domain' data type and its parser. A 'Domain' represents a
   4 --   name in the domain name system (DNS) as described by
   5 --   RFC1035. In particular, we enforce the restrictions from Section
   6 --   2.3.1 \"Preferred name syntax\". See for example,
   7 --
   8 --     <https://tools.ietf.org/html/rfc1035#section-2.3.1>
   9 --
  10 --   We basically work with strings and characters everywhere, even
  11 --   though this isn't really correct. The length specifications in
  12 --   the RFCs are all in terms of octets, so really a ByteString.Char8
  13 --   would be more appropriate. With strings, for example, we could
  14 --   have a unicode mumbo jumbo character that takes up two bytes
  15 --   (octets).
  16 --
  17 module Network.DNS.RBL.Domain (
  18   UserDomain,
  19   user_domain )
  20 where
  21
  22 import Data.Char ( toLower )
  23 import Text.Parsec (
  24   (<|>),
  25   char,
  26   optionMaybe,
  27   string,
  28   try )
  29 import qualified Text.Parsec as Parsec ( digit, letter)
  30 import Text.Parsec.String ( Parser )
  31
  32 import Network.DNS.RBL.Pretty ( Pretty(..) )
  33
  34 -- * Digits
  35
  36 -- | A wrapper around a digit character.
  37 --
  38 newtype Digit = Digit Char deriving (Eq, Show)
  39 instance Pretty Digit where pretty_show (Digit d) = [d]
  40
  41 -- | Parse a single digit, but wrap it in our 'Digit' type.
  42 --
  43 digit :: Parser Digit
  44 digit = fmap Digit Parsec.digit
  45
  46
  47 -- * Letters
  48
  49 -- | A wrapper around a letter character.
  50 --
  51 newtype Letter = Letter Char deriving (Show)
  52 instance Pretty Letter where pretty_show (Letter l) = [l]
  53
  54
  55 -- | Parse a single letter, but wrap it in our 'Letter' type.
  56 --
  57 letter :: Parser Letter
  58 letter = fmap Letter Parsec.letter
  59
  60 -- | The derived instance of 'Eq' for letters is incorrect. All
  61 --   comparisons should be made case-insensitively. The following
  62 --   is an excerpt from RFC1035:
  63 --
  64 --     2.3.3. Character Case
  65 --
  66 --     For all parts of the DNS that are part of the official
  67 --     protocol, all comparisons between character strings (e.g.,
  68 --     labels, domain names, etc.)  are done in a case-insensitive
  69 --     manner...
  70 --
  71 --   Since each part of DNS name is composed of our custom types, it
  72 --   suffices to munge the equality for 'Letter'. RFC4343
  73 --   <https://tools.ietf.org/html/rfc4343> clarifies the
  74 --   case-insensitivity rules, but the fact that we're treating DNS
  75 --   names as strings makes most of those problems go away (in
  76 --   exchange for new ones).
  77 --
  78 instance Eq Letter where
  79   (Letter l1) == (Letter l2) = (toLower l1) == (toLower l2)
  80
  81 -- * Letters/Digits
  82
  83 -- | A sum type representing either a letter or a digit.
  84 --
  85 data LetDig =
  86   LetDigLetter Letter |
  87   LetDigDigit  Digit
  88   deriving (Eq, Show)
  89
  90 instance Pretty LetDig where
  91   pretty_show (LetDigLetter l) = pretty_show l
  92   pretty_show (LetDigDigit d) = pretty_show d
  93
  94 -- | Parse a letter or a digit and wrap it in our 'LetDig' type.
  95 --
  96 let_dig :: Parser LetDig
  97 let_dig = (fmap LetDigLetter letter) <|> (fmap LetDigDigit digit)
  98
  99
 100 -- * Hyphens
 101
 102 -- | A wrapper around a single hyphen character.
 103 --
 104 newtype Hyphen = Hyphen Char deriving (Eq, Show)
 105 instance Pretty Hyphen where pretty_show (Hyphen h) = [h]
 106
 107 -- | Parse a single hyphen and wrap it in our 'Hyphen' type.
 108 --
 109 hyphen :: Parser Hyphen
 110 hyphen = fmap Hyphen (char '-')
 111
 112
 113 -- * Letter, Digit, or Hyphen.
 114
 115 -- | A sum type representing a letter, digit, or hyphen.
 116 --
 117 data LetDigHyp =
 118   LetDigHypLetDig LetDig |
 119   LetDigHypHyphen Hyphen
 120   deriving (Eq, Show)
 121
 122 instance Pretty LetDigHyp where
 123   pretty_show (LetDigHypLetDig ld) = pretty_show ld
 124   pretty_show (LetDigHypHyphen h) = pretty_show h
 125
 126
 127 -- | The following is the simplest type in the domain grammar that
 128 --   isn't already implemented for us.
 129 --
 130 --     <let-dig> ::= <letter> | <digit>
 131 --
 132 --   ==== _Examples_
 133 --
 134 --   >>> import Text.Parsec ( parseTest )
 135 --
 136 --   Letters, digits, and hyphens are all parsed:
 137 --
 138 --   >>> parseTest let_dig_hyp "a"
 139 --   LetDigHypLetDig (LetDigLetter (Letter 'a'))
 140 --
 141 --   >>> parseTest let_dig_hyp "7"
 142 --   LetDigHypLetDig (LetDigDigit (Digit '7'))
 143 --
 144 --   >>> parseTest let_dig_hyp "-"
 145 --   LetDigHypHyphen (Hyphen '-')
 146 --
 147 --   However, an underscore (for example) is not:
 148 --
 149 --   >>> parseTest let_dig_hyp "_"
 150 --   parse error at (line 1, column 1):
 151 --   unexpected "_"
 152 --   expecting letter, digit or "-"
 153 --
 154 let_dig_hyp :: Parser LetDigHyp
 155 let_dig_hyp =
 156   parse_letdig <|> parse_hyphen
 157   where
 158     parse_letdig :: Parser LetDigHyp
 159     parse_letdig = fmap LetDigHypLetDig let_dig
 160
 161     parse_hyphen :: Parser LetDigHyp
 162     parse_hyphen = fmap LetDigHypHyphen hyphen
 163
 164
 165 -- * Letter/Digit/Hyphen strings
 166
 167 -- | A string of letters, digits, and hyphens from the RFC1035 grammar:
 168 --
 169 --     <ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str>
 170 --
 171 --   These are represented as either a single instance of a
 172 --   'LetDigHyp', or a string of them (recursive).
 173 --
 174 data LdhStr =
 175   LdhStrSingleLdh LetDigHyp |
 176   LdhStrMultipleLdh LetDigHyp LdhStr
 177   deriving (Eq, Show)
 178
 179 instance Pretty LdhStr where
 180   pretty_show (LdhStrSingleLdh ldh) = pretty_show ldh
 181   pretty_show (LdhStrMultipleLdh ldh s) = (pretty_show ldh) ++ (pretty_show s)
 182
 183 -- | Parse a string of letters, digits, and hyphens (an 'LdhStr').
 184 --
 185 --   ==== _Examples_
 186 --
 187 --   >>> import Text.Parsec ( parseTest )
 188 --
 189 --   Single letters, digits, and hyphens are parsed:
 190 --
 191 --   >>> parseTest ldh_str "a"
 192 --   LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a')))
 193 --
 194 --   >>> parseTest ldh_str "0"
 195 --   LdhStrSingleLdh (LetDigHypLetDig (LetDigDigit (Digit '0')))
 196 --
 197 --   >>> parseTest ldh_str "-"
 198 --   LdhStrSingleLdh (LetDigHypHyphen (Hyphen '-'))
 199 --
 200 --   As well as strings of them:
 201 --
 202 --   >>> import Text.Parsec ( parse )
 203 --   >>> pretty_print $ parse ldh_str "" "a0-b"
 204 --   a0-b
 205 --
 206 ldh_str :: Parser LdhStr
 207 ldh_str = try both <|> just_one
 208   where
 209     both :: Parser LdhStr
 210     both = do
 211       ldh1 <- let_dig_hyp
 212       ldh_tail <- ldh_str
 213       return $ LdhStrMultipleLdh ldh1 ldh_tail
 214
 215     just_one :: Parser LdhStr
 216     just_one = fmap LdhStrSingleLdh let_dig_hyp
 217
 218
 219
 220 -- | A version of 'last' that works on a 'LdhStr' rather than a
 221 --   list. That is, it returns the last 'LetDigHyp' in the
 222 --   string. Since 'LdhStr' contains at least one character, there's
 223 --   no \"nil\" case here.
 224 --
 225 --   ==== _Examples_
 226 --
 227 --   >>> import Text.Parsec ( parse )
 228 --
 229 --   >>> let (Right r) = parse ldh_str "" "a"
 230 --   >>> last_ldh_str r
 231 --   LetDigHypLetDig (LetDigLetter (Letter 'a'))
 232 --
 233 --   >>> let (Right r) = parse ldh_str "" "abc-def"
 234 --   >>> last_ldh_str r
 235 --   LetDigHypLetDig (LetDigLetter (Letter 'f'))
 236 --
 237 last_ldh_str :: LdhStr -> LetDigHyp
 238 last_ldh_str (LdhStrSingleLdh x) = x
 239 last_ldh_str (LdhStrMultipleLdh _ x) = last_ldh_str x
 240
 241
 242 -- | A version of 'init' that works on a 'LdhStr' rather than a
 243 --   list. That is, it returns everything /except/ the last character in
 244 --   the string.
 245 --
 246 --   Since an 'LdhStr' must contain at least one character, this might
 247 --   not be opssible (when the input is of length one). So, we return
 248 --   a 'Maybe' value.
 249 --
 250 --   ==== _Examples_
 251 --
 252 --   >>> import Text.Parsec ( parse )
 253 --
 254 --   >>> let (Right r) = parse ldh_str "" "a"
 255 --   >>> init_ldh_str r
 256 --   Nothing
 257 --
 258 --   >>> let (Right r) = parse ldh_str "" "ab"
 259 --   >>> init_ldh_str r
 260 --   Just (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))))
 261 --
 262 --   >>> let (Right r) = parse ldh_str "" "abc-def"
 263 --   >>> init_ldh_str r
 264 --   Just (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'b'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'c'))) (LdhStrMultipleLdh (LetDigHypHyphen (Hyphen '-')) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'd'))) (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'e')))))))))
 265 --
 266 init_ldh_str :: LdhStr -> Maybe LdhStr
 267 init_ldh_str (LdhStrSingleLdh _) = Nothing
 268 init_ldh_str (LdhStrMultipleLdh h t) =
 269   Just $ case (init_ldh_str t) of
 270            -- We just got the second-to-last character, we're done.
 271            Nothing   -> LdhStrSingleLdh h
 272
 273            -- There's still more stuff. Recurse.
 274            Just rest -> LdhStrMultipleLdh h rest
 275
 276
 277 -- | Compute the length of an 'LdhStr'. It will be at least one, since
 278 --   'LdhStr's are non-empty. And if there's something other than the
 279 --   first character present, we simply recurse.
 280 --
 281 --   ==== _Examples_
 282 --
 283 --   >>> import Text.Parsec ( parse )
 284 --
 285 --   >>> let (Right r) = parse ldh_str "" "a"
 286 --   >>> length_ldh_str r
 287 --   1
 288 --
 289 --   >>> let (Right r) = parse ldh_str "" "abc-def"
 290 --   >>> length_ldh_str r
 291 --   7
 292 --
 293 length_ldh_str :: LdhStr -> Int
 294 length_ldh_str (LdhStrSingleLdh _) = 1
 295 length_ldh_str (LdhStrMultipleLdh _ t) = 1 + (length_ldh_str t)
 296
 297 -- * Letter/Digit/Hyphen string followed by a trailing Letter/Digit
 298
 299 -- | This type isn't explicitly part of the grammar, but it's what
 300 --   shows up in the square brackets of,
 301 --
 302 --     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
 303 --
 304 --   The ldh-str is optional, but if one is present, we must also have
 305 --   a trailing let-dig to prevent the name from ending with a
 306 --   hyphen. This can be represented with a @Maybe LdhStrLetDig@,
 307 --   which is why we're about to define it.
 308 --
 309 data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
 310   deriving (Eq, Show)
 311
 312 instance Pretty LdhStrLetDig where
 313   pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
 314   pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
 315
 316 -- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
 317 --   as well define the parser for it independently since we gave it
 318 --   its own data type.
 319 --
 320 --   ==== _Examples_
 321 --
 322 --   >>> import Text.Parsec ( parse, parseTest )
 323 --
 324 --   Make sure we can parse a single character:
 325 --
 326 --   >>> parseTest ldh_str_let_dig "a"
 327 --   LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
 328 --
 329 --   And longer strings:
 330 --
 331 --   >>> pretty_print $ parse ldh_str_let_dig "" "ab"
 332 --   ab
 333 --
 334 --   >>> pretty_print $ parse ldh_str_let_dig "" "-b"
 335 --   -b
 336 --
 337 --   >>> parseTest ldh_str_let_dig "b-"
 338 --   parse error at (line 1, column 3):
 339 --   label cannot end with a hyphen
 340 --
 341 ldh_str_let_dig :: Parser LdhStrLetDig
 342 ldh_str_let_dig = do
 343   -- This will happily eat up the trailing let-dig...
 344   full_ldh <- ldh_str
 345
 346   -- So we have to go back and see what happened.
 347   case (last_ldh_str full_ldh) of
 348     (LetDigHypHyphen _) -> fail "label cannot end with a hyphen"
 349     (LetDigHypLetDig ld) ->
 350       -- Ok, the label didn't end with a hyphen; now we need to split
 351       -- off the last letter/digit so we can pack it into our return
 352       -- type separately.
 353       return $ case (init_ldh_str full_ldh) of
 354                  -- We only parsed one letter/digit. This can happen
 355                  -- if the label contains two characters. For example,
 356                  -- if we try to parse the label "ab", then the "a"
 357                  -- will be eaten by the label parser, and this
 358                  -- function will be left with only "b".
 359                  Nothing -> LdhStrLetDig Nothing ld
 360
 361                  -- Usual case: there's was some leading let-dig-hyp junk,
 362                  -- return it too.
 363                  leading_ldhs -> LdhStrLetDig leading_ldhs ld
 364
 365
 366
 367 -- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
 368 --   the let-dig at the end is always there. And when there's an
 369 --   ldh-str too, we add its length to one.
 370 --
 371 --   ==== _Examples_
 372 --
 373 --   >>> import Text.Parsec ( parse )
 374 --
 375 --   >>> let (Right r) = parse ldh_str_let_dig "" "a"
 376 --   >>> length_ldh_str_let_dig r
 377 --   1
 378 --
 379 --   >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
 380 --   >>> length_ldh_str_let_dig r
 381 --   7
 382 --
 383 length_ldh_str_let_dig :: LdhStrLetDig -> Int
 384 length_ldh_str_let_dig (LdhStrLetDig Nothing _) = 1
 385 length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
 386   1 + (length_ldh_str ldhstring)
 387
 388
 389 -- * Labels
 390
 391 -- | The label type from the RFC1035 grammar:
 392 --
 393 --     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
 394 --
 395 --   We allow the slightly more general syntax from RFC1123, Section 2.1:
 396 --
 397 --     The syntax of a legal Internet host name was specified in RFC-952
 398 --     [DNS:4].  One aspect of host name syntax is hereby changed: the
 399 --     restriction on the first character is relaxed to allow either a
 400 --     letter or a digit.  Host software MUST support this more liberal
 401 --     syntax.
 402 --
 403 data Label = Label Letter (Maybe LdhStrLetDig)
 404   deriving (Eq, Show)
 405
 406 instance Pretty Label where
 407   pretty_show (Label l Nothing) = pretty_show l
 408   pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
 409
 410 -- | Parse a 'Label'.
 411 --
 412 --   In addition to the grammar, there's another restriction on
 413 --   labels: their length must be 63 characters or less. Quoting
 414 --   Section 2.3.1, \"Preferred name syntax\", of RFC1035:
 415 --
 416 --     The labels must follow the rules for ARPANET host names.  They
 417 --     must start with a letter, end with a letter or digit, and have
 418 --     as interior characters only letters, digits, and hyphen.  There
 419 --     are also some restrictions on the length.  Labels must be 63
 420 --     characters or less.
 421 --
 422 --   We check this only after we have successfully parsed a label.
 423 --
 424 --   ==== _Examples_
 425 --
 426 --   >>> import Text.Parsec ( parse, parseTest )
 427 --
 428 --   Make sure we can parse a single character:
 429 --
 430 --   >>> parseTest label "a"
 431 --   Label (Letter 'a') Nothing
 432 --
 433 --   And longer strings:
 434 --
 435 --   >>> pretty_print $ parse label "" "abc-def"
 436 --   abc-def
 437 --
 438 --   But not anything ending in a hyphen:
 439 --
 440 --   >>> parseTest label "abc-"
 441 --   parse error at (line 1, column 5):
 442 --   label cannot end with a hyphen
 443 --
 444 --   Or anything over 63 characters:
 445 --
 446 --   >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
 447 --   parse error at (line 1, column 79):
 448 --   labels must be 63 or fewer characters
 449 --
 450 --   However, /exactly/ 63 characters is acceptable:
 451 --
 452 --   >>> pretty_print $ parse label "" (replicate 63 'x')
 453 --   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 454 --
 455 label :: Parser Label
 456 label = do
 457   l <- letter -- Guaranteed to be there
 458   maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
 459   case maybe_s of
 460     -- It can only be one character long, from the letter...
 461     Nothing -> return $ Label l maybe_s
 462
 463     -- The letter gives us one character, so we check that the rest is
 464     -- less than 62 characters long. But in the error message we need
 465     -- to report 63.
 466     Just s  -> if (length_ldh_str_let_dig s) <= 62
 467               then return $ Label l maybe_s
 468               else fail "labels must be 63 or fewer characters"
 469
 470
 471
 472 -- * Subdomains
 473
 474
 475 -- | The data type representing a \"subdomain\" from RFC1035:
 476 --
 477 --     <subdomain> ::= <label> | <subdomain> "." <label>
 478 --
 479 --   We have reversed the order of the subdomain and label in the
 480 --   second option, however. This is explained in 'subdomain'.
 481 --
 482 data Subdomain =
 483   SubdomainSingleLabel Label |
 484   SubdomainMultipleLabel Label Subdomain
 485   deriving (Eq, Show)
 486
 487
 488
 489 instance Pretty Subdomain where
 490   pretty_show (SubdomainSingleLabel l) = pretty_show l
 491   pretty_show (SubdomainMultipleLabel l s) =
 492     (pretty_show l) ++ "." ++ (pretty_show s)
 493
 494 -- | Parse an RFC1035 \"subdomain\". The given grammar is,
 495 --
 496 --     <subdomain> ::= <label> | <subdomain> "." <label>
 497 --
 498 --   However, we have reversed the order of the subdomain and label to
 499 --   prevent infinite recursion. The second option (subdomain + label)
 500 --   is obviously more specific, we we need to try it first. This
 501 --   presents a problem: we're trying to parse a subdomain in terms of
 502 --   a subdomain! The given grammar represents subdomains how we like
 503 --   to think of them; from right to left. But it's better to parse
 504 --   from left to right, so we pick off the leading label and then
 505 --   recurse into the definition of subdomain.
 506 --
 507 --   According to RFC1034, Section 3.1, two neighboring labels in a
 508 --   DNS name cannot be equal:
 509 --
 510 --     Each node has a label, which is zero to 63 octets in length.  Brother
 511 --     nodes may not have the same label, although the same label can be used
 512 --     for nodes which are not brothers.  One label is reserved, and that is
 513 --     the null (i.e., zero length) label used for the root.
 514 --
 515 --   We enforce this restriction, but the result is usually that we
 516 --   only parse the part of the subdomain leading up to the repeated
 517 --   label.
 518 --
 519 --   ==== _Examples_
 520 --
 521 --   >>> import Text.Parsec ( parse, parseTest )
 522 --
 523 --   Make sure we can parse a single character:
 524 --
 525 --   >>> parseTest subdomain "a"
 526 --   SubdomainSingleLabel (Label (Letter 'a') Nothing)
 527 --
 528 --   >>> pretty_print $ parse subdomain "" "example.com"
 529 --   example.com
 530 --
 531 --   >>> pretty_print $ parse subdomain "" "www.example.com"
 532 --   www.example.com
 533 --
 534 --   We reject a subdomain with equal neighbors, but this leads to
 535 --   only the single first label being parsed instead:
 536 --
 537 --   >>> pretty_print $ parse subdomain "" "www.www.example.com"
 538 --   www
 539 --
 540 --   But not one with a repeated but non-neighboring label:
 541 --
 542 --   >>> pretty_print $ parse subdomain "" "www.example.www.com"
 543 --   www.example.www.com
 544 --
 545 subdomain :: Parser Subdomain
 546 subdomain = try both <|> just_one
 547   where
 548     both :: Parser Subdomain
 549     both = do
 550       l <- label
 551       _ <- char '.'
 552       s <- subdomain
 553       let result = SubdomainMultipleLabel l s
 554       if (subdomain_has_equal_neighbors result)
 555       then fail "subdomain cannot have equal neighboring labels"
 556       else return result
 557
 558     just_one :: Parser Subdomain
 559     just_one = fmap SubdomainSingleLabel label
 560
 561
 562
 563 -- | Retrieve a list of labels contained in a 'Subdomain'.
 564 --
 565 --   ==== _Examples_
 566 --
 567 --   >>> import Text.Parsec ( parse )
 568 --
 569 --   >>> let (Right r) = parse subdomain "" "a"
 570 --   >>> pretty_print $ subdomain_labels r
 571 --   ["a"]
 572 --
 573 --   >>> let (Right r) = parse subdomain "" "example.com"
 574 --   >>> pretty_print $ subdomain_labels r
 575 --   ["example","com"]
 576 --
 577 --   >>> let (Right r) = parse subdomain "" "www.example.com"
 578 --   >>> pretty_print $ subdomain_labels r
 579 --   ["www","example","com"]
 580 --
 581 subdomain_labels :: Subdomain -> [Label]
 582 subdomain_labels (SubdomainSingleLabel l) = [l]
 583 subdomain_labels (SubdomainMultipleLabel l s) = l : (subdomain_labels s)
 584
 585
 586 -- | Return a list of pairs of neighboring labels in a subdomain.
 587 --
 588 --   ==== _Examples_
 589 --
 590 --   >>> import Text.Parsec ( parse )
 591 --   >>> let (Right r) = parse subdomain "" "www.example.com"
 592 --   >>> pretty_print $ subdomain_label_neighbors r
 593 --   ["(\"www\",\"example\")","(\"example\",\"com\")"]
 594 --
 595 subdomain_label_neighbors :: Subdomain -> [(Label,Label)]
 596 subdomain_label_neighbors s =
 597   zip ls (tail ls)
 598   where
 599     ls = subdomain_labels s
 600
 601
 602 -- | Return @True@ if the subdomain has any two equal neighboring
 603 --   labels, and @False@ otherwise.
 604 --
 605 --   ==== _Examples_
 606 --
 607 --   >>> import Text.Parsec ( parse )
 608 --
 609 --   >>> let (Right r) = parse subdomain "" "www.example.com"
 610 --   >>> subdomain_has_equal_neighbors r
 611 --   False
 612 --
 613 --   >>> let (Right l) = parse label "" "www"
 614 --   >>> let (Right s) = parse subdomain "" "www.example.com"
 615 --   >>> let bad_subdomain = SubdomainMultipleLabel l s
 616 --   >>> subdomain_has_equal_neighbors bad_subdomain
 617 --   True
 618 --
 619 subdomain_has_equal_neighbors :: Subdomain -> Bool
 620 subdomain_has_equal_neighbors s =
 621   or [ x == y | (x,y) <- subdomain_label_neighbors s ]
 622
 623
 624
 625 -- * Domains
 626
 627 -- | An RFC1035 domain. According to RFC1035 a domain can be either a
 628 -- subdomain or \" \", which according to RFC2181
 629 -- <https://tools.ietf.org/html/rfc2181#section-11> means the root:
 630 --
 631 --      The zero length full name is defined as representing the root
 632 --      of the DNS tree, and is typically written and displayed as
 633 --      \".\".
 634 --
 635 --   We let the 'Domain' type remain true to those RFCs, even though
 636 --   they don't support an absolute domain name of e.g. a single dot.
 637 --   We have one more data type 'UserDomain' which handles the possibility
 638 --   of an absolute path.
 639 --
 640 data Domain =
 641   DomainName Subdomain |
 642   DomainRoot
 643   deriving (Eq, Show)
 644
 645 instance Pretty Domain where
 646   pretty_show DomainRoot = ""
 647   pretty_show (DomainName s) = pretty_show s
 648
 649 -- | Parse an RFC1035 \"domain\"
 650 --
 651 --   ==== _Examples_
 652 --
 653 --   >>> import Text.Parsec ( parse, parseTest )
 654 --
 655 --   Make sure we can parse a single character:
 656 --
 657 --   >>> parseTest domain "a"
 658 --   DomainName (SubdomainSingleLabel (Label (Letter 'a') Nothing))
 659 --
 660 --   And the empty domain:
 661 --
 662 --   >>> parseTest domain ""
 663 --   DomainRoot
 664 --
 665 --   We will in fact parse the \"empty\" domain off the front of
 666 --   pretty much anything:
 667 --
 668 --   >>> parseTest domain "8===D"
 669 --   DomainRoot
 670 --
 671 --   Equality of domains is case-insensitive:
 672 --
 673 --   >>> let (Right r1) = parse domain "" "example.com"
 674 --   >>> let (Right r2) = parse domain "" "ExaMPle.coM"
 675 --   >>> r1 == r2
 676 --   True
 677 --
 678 --   A single dot IS parsed as the root, but the dot isn't consumed:
 679 --
 680 --   >>> parseTest domain "."
 681 --   DomainRoot
 682 --
 683 --   Anything over 255 characters is an error, so the root will be
 684 --   parsed:
 685 --
 686 --   >>> let big_l1 = replicate 63 'x'
 687 --   >>> let big_l2 = replicate 63 'y' -- Avoid equal neighboring labels!
 688 --   >>> let big_labels = big_l1 ++ "." ++ big_l2 ++ "."
 689 --   >>> let big_subdomain = concat $ replicate 3 big_labels
 690 --   >>> parseTest domain big_subdomain
 691 --   DomainRoot
 692 --
 693 --   But exactly 255 is allowed:
 694 --
 695 --   >>> import Data.List ( intercalate )
 696 --   >>> let l1 = replicate 63 'w'
 697 --   >>> let l2 = replicate 63 'x'
 698 --   >>> let l3 = replicate 63 'y'
 699 --   >>> let l4 = replicate 63 'z'
 700 --   >>> let big_subdomain = intercalate "." [l1,l2,l3,l4]
 701 --   >>> let (Right r) = parse domain "" big_subdomain
 702 --   >>> length (pretty_show r)
 703 --   255
 704 --
 705 domain :: Parser Domain
 706 domain = try parse_subdomain <|> parse_empty
 707   where
 708     parse_subdomain :: Parser Domain
 709     parse_subdomain = do
 710       s <- subdomain
 711       if length (pretty_show s) <= 255
 712       then return $ DomainName s
 713       else fail "subdomains can be at most 255 characters"
 714
 715     parse_empty :: Parser Domain
 716     parse_empty = string "" >> return DomainRoot
 717
 718
 719
 720 -- * User domains
 721
 722 -- | This type helps clarify some murkiness in the DNS \"domain\" name
 723 --   specification. In RFC1034, it is acknowledged that a domain name
 724 --   input with a trailing \".\" will represent an absolute domain
 725 --   name (i.e. with respect to the DNS root). However, the grammar in
 726 --   RFC1035 disallows a trailing dot.
 727 --
 728 --   This makes some sense: within the DNS, everything knows its
 729 --   position in the tree. The relative/absolute distinction only
 730 --   makes sense on the client side, where a user's resolver might
 731 --   decide to append some suffix to a relative
 732 --   request. Unfortunately, that's where we live. So we have to deal
 733 --   with the possibility of having a trailing dot at the end of any
 734 --   domain name.
 735 --
 736 data UserDomain =
 737   UserDomainRelative Domain |
 738   UserDomainAbsolute Domain
 739   deriving (Eq, Show)
 740
 741 instance Pretty UserDomain where
 742   pretty_show (UserDomainRelative d) = pretty_show d
 743   pretty_show (UserDomainAbsolute d) = (pretty_show d) ++ "."
 744
 745
 746 -- | Parse a 'UserDomain'. This is what we'll be using to read user
 747 --   input, since it supports both relative and absolute domain names
 748 --   (unlike the implicitly-absolute 'Domain').
 749 --
 750 --   ==== _Examples_
 751 --
 752 --   >>> import Text.Parsec ( parse, parseTest )
 753 --
 754 --   We can really parse the root now!
 755 --
 756 --   >>> parseTest user_domain "."
 757 --   UserDomainAbsolute DomainRoot
 758 --
 759 --   But multiple dots aren't (only the first):
 760 --
 761 --   >>> pretty_print $ parse user_domain "" ".."
 762 --   .
 763 --
 764 --   We can also optionally have a trailing dot at the end of a
 765 --   non-empty name:
 766 --
 767 --   >>>  pretty_print $ parse user_domain "" "www.example.com"
 768 --   www.example.com
 769 --
 770 --   >>>  pretty_print $ parse user_domain "" "www.example.com."
 771 --   www.example.com.
 772 --
 773 --   A \"relative root\" can also be parsed, letting the user's
 774 --   resolver deal with it:
 775 --
 776 --   >>> parseTest user_domain ""
 777 --   UserDomainRelative DomainRoot
 778 --
 779 user_domain :: Parser UserDomain
 780 user_domain = try absolute <|> relative
 781   where
 782     absolute :: Parser UserDomain
 783     absolute = do
 784       d <- domain
 785       _ <- char '.'
 786       return $ UserDomainAbsolute d
 787
 788     relative :: Parser UserDomain
 789     relative = fmap UserDomainRelative domain