harbl/src/Network/DNS/RBL/Domain.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 -- | The 'Domain' data type and its parser. A 'Domain' represents a
   4 --   name in the domain name system (DNS) as described by
   5 --   RFC1035. In particular, we enforce the restrictions from Section
   6 --   2.3.1 \"Preferred name syntax\". See for example,
   7 --
   8 --     <https://tools.ietf.org/html/rfc1035#section-2.3.1>
   9 --
  10 --   We basically work with strings and characters everywhere, even
  11 --   though this isn't really correct. The length specifications in
  12 --   the RFCs are all in terms of octets, so really a ByteString.Char8
  13 --   would be more appropriate. With strings, for example, we could
  14 --   have a unicode mumbo jumbo character that takes up two bytes
  15 --   (octets).
  16 --
  17 module Network.DNS.RBL.Domain (
  18   Domain(..),
  19   domain )
  20 where
  21
  22 import Data.Char ( toLower )
  23 import Text.Parsec (
  24   (<|>),
  25   char,
  26   optionMaybe,
  27   string,
  28   try )
  29 import qualified Text.Parsec as Parsec ( digit, letter)
  30 import Text.Parsec.String ( Parser )
  31
  32 import Network.DNS.RBL.Pretty ( Pretty(..) )
  33 import Network.DNS.RBL.Reversible ( Reversible(..) )
  34
  35 -- * Digits
  36
  37 -- | A wrapper around a digit character.
  38 --
  39 newtype Digit = Digit Char deriving (Eq, Show)
  40 instance Pretty Digit where pretty_show (Digit d) = [d]
  41
  42 -- | Parse a single digit, but wrap it in our 'Digit' type.
  43 --
  44 digit :: Parser Digit
  45 digit = fmap Digit Parsec.digit
  46
  47
  48 -- * Letters
  49
  50 -- | A wrapper around a letter character.
  51 --
  52 newtype Letter = Letter Char deriving (Show)
  53 instance Pretty Letter where pretty_show (Letter l) = [l]
  54
  55
  56 -- | Parse a single letter, but wrap it in our 'Letter' type.
  57 --
  58 letter :: Parser Letter
  59 letter = fmap Letter Parsec.letter
  60
  61 -- | The derived instance of 'Eq' for letters is incorrect. All
  62 --   comparisons should be made case-insensitively. The following
  63 --   is an excerpt from RFC1035:
  64 --
  65 --     2.3.3. Character Case
  66 --
  67 --     For all parts of the DNS that are part of the official
  68 --     protocol, all comparisons between character strings (e.g.,
  69 --     labels, domain names, etc.)  are done in a case-insensitive
  70 --     manner...
  71 --
  72 --   Since each part of DNS name is composed of our custom types, it
  73 --   suffices to munge the equality for 'Letter'. RFC4343
  74 --   <https://tools.ietf.org/html/rfc4343> clarifies the
  75 --   case-insensitivity rules, but the fact that we're treating DNS
  76 --   names as strings makes most of those problems go away (in
  77 --   exchange for new ones).
  78 --
  79 instance Eq Letter where
  80   (Letter l1) == (Letter l2) = (toLower l1) == (toLower l2)
  81
  82 -- * Letters/Digits
  83
  84 -- | A sum type representing either a letter or a digit.
  85 --
  86 data LetDig =
  87   LetDigLetter Letter |
  88   LetDigDigit  Digit
  89   deriving (Eq, Show)
  90
  91 instance Pretty LetDig where
  92   pretty_show (LetDigLetter l) = pretty_show l
  93   pretty_show (LetDigDigit d) = pretty_show d
  94
  95 -- | Parse a letter or a digit and wrap it in our 'LetDig' type.
  96 --
  97 let_dig :: Parser LetDig
  98 let_dig = (fmap LetDigLetter letter) <|> (fmap LetDigDigit digit)
  99
 100
 101 -- * Hyphens
 102
 103 -- | A wrapper around a single hyphen character.
 104 --
 105 newtype Hyphen = Hyphen Char deriving (Eq, Show)
 106 instance Pretty Hyphen where pretty_show (Hyphen h) = [h]
 107
 108 -- | Parse a single hyphen and wrap it in our 'Hyphen' type.
 109 --
 110 hyphen :: Parser Hyphen
 111 hyphen = fmap Hyphen (char '-')
 112
 113
 114 -- * Letter, Digit, or Hyphen.
 115
 116 -- | A sum type representing a letter, digit, or hyphen.
 117 --
 118 data LetDigHyp =
 119   LetDigHypLetDig LetDig |
 120   LetDigHypHyphen Hyphen
 121   deriving (Eq, Show)
 122
 123 instance Pretty LetDigHyp where
 124   pretty_show (LetDigHypLetDig ld) = pretty_show ld
 125   pretty_show (LetDigHypHyphen h) = pretty_show h
 126
 127
 128 -- | The following is the simplest type in the domain grammar that
 129 --   isn't already implemented for us.
 130 --
 131 --     <let-dig> ::= <letter> | <digit>
 132 --
 133 --   ==== _Examples_
 134 --
 135 --   >>> import Text.Parsec ( parseTest )
 136 --
 137 --   Letters, digits, and hyphens are all parsed:
 138 --
 139 --   >>> parseTest let_dig_hyp "a"
 140 --   LetDigHypLetDig (LetDigLetter (Letter 'a'))
 141 --
 142 --   >>> parseTest let_dig_hyp "7"
 143 --   LetDigHypLetDig (LetDigDigit (Digit '7'))
 144 --
 145 --   >>> parseTest let_dig_hyp "-"
 146 --   LetDigHypHyphen (Hyphen '-')
 147 --
 148 --   However, an underscore (for example) is not:
 149 --
 150 --   >>> parseTest let_dig_hyp "_"
 151 --   parse error at (line 1, column 1):
 152 --   unexpected "_"
 153 --   expecting letter, digit or "-"
 154 --
 155 let_dig_hyp :: Parser LetDigHyp
 156 let_dig_hyp =
 157   parse_letdig <|> parse_hyphen
 158   where
 159     parse_letdig :: Parser LetDigHyp
 160     parse_letdig = fmap LetDigHypLetDig let_dig
 161
 162     parse_hyphen :: Parser LetDigHyp
 163     parse_hyphen = fmap LetDigHypHyphen hyphen
 164
 165
 166 -- * Letter/Digit/Hyphen strings
 167
 168 -- | A string of letters, digits, and hyphens from the RFC1035 grammar:
 169 --
 170 --     <ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str>
 171 --
 172 --   These are represented as either a single instance of a
 173 --   'LetDigHyp', or a string of them (recursive).
 174 --
 175 data LdhStr =
 176   LdhStrSingleLdh LetDigHyp |
 177   LdhStrMultipleLdh LetDigHyp LdhStr
 178   deriving (Eq, Show)
 179
 180 instance Pretty LdhStr where
 181   pretty_show (LdhStrSingleLdh ldh) = pretty_show ldh
 182   pretty_show (LdhStrMultipleLdh ldh s) = (pretty_show ldh) ++ (pretty_show s)
 183
 184 -- | Parse a string of letters, digits, and hyphens (an 'LdhStr').
 185 --
 186 --   ==== _Examples_
 187 --
 188 --   >>> import Text.Parsec ( parseTest )
 189 --
 190 --   Single letters, digits, and hyphens are parsed:
 191 --
 192 --   >>> parseTest ldh_str "a"
 193 --   LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a')))
 194 --
 195 --   >>> parseTest ldh_str "0"
 196 --   LdhStrSingleLdh (LetDigHypLetDig (LetDigDigit (Digit '0')))
 197 --
 198 --   >>> parseTest ldh_str "-"
 199 --   LdhStrSingleLdh (LetDigHypHyphen (Hyphen '-'))
 200 --
 201 --   As well as strings of them:
 202 --
 203 --   >>> import Text.Parsec ( parse )
 204 --   >>> pretty_print $ parse ldh_str "" "a0-b"
 205 --   a0-b
 206 --
 207 ldh_str :: Parser LdhStr
 208 ldh_str = try both <|> just_one
 209   where
 210     both :: Parser LdhStr
 211     both = do
 212       ldh1 <- let_dig_hyp
 213       ldh_tail <- ldh_str
 214       return $ LdhStrMultipleLdh ldh1 ldh_tail
 215
 216     just_one :: Parser LdhStr
 217     just_one = fmap LdhStrSingleLdh let_dig_hyp
 218
 219
 220
 221 -- | A version of 'last' that works on a 'LdhStr' rather than a
 222 --   list. That is, it returns the last 'LetDigHyp' in the
 223 --   string. Since 'LdhStr' contains at least one character, there's
 224 --   no \"nil\" case here.
 225 --
 226 --   ==== _Examples_
 227 --
 228 --   >>> import Text.Parsec ( parse )
 229 --
 230 --   >>> let (Right r) = parse ldh_str "" "a"
 231 --   >>> last_ldh_str r
 232 --   LetDigHypLetDig (LetDigLetter (Letter 'a'))
 233 --
 234 --   >>> let (Right r) = parse ldh_str "" "abc-def"
 235 --   >>> last_ldh_str r
 236 --   LetDigHypLetDig (LetDigLetter (Letter 'f'))
 237 --
 238 last_ldh_str :: LdhStr -> LetDigHyp
 239 last_ldh_str (LdhStrSingleLdh x) = x
 240 last_ldh_str (LdhStrMultipleLdh _ x) = last_ldh_str x
 241
 242
 243 -- | A version of 'init' that works on a 'LdhStr' rather than a
 244 --   list. That is, it returns everything /except/ the last character in
 245 --   the string.
 246 --
 247 --   Since an 'LdhStr' must contain at least one character, this might
 248 --   not be opssible (when the input is of length one). So, we return
 249 --   a 'Maybe' value.
 250 --
 251 --   ==== _Examples_
 252 --
 253 --   >>> import Text.Parsec ( parse )
 254 --
 255 --   >>> let (Right r) = parse ldh_str "" "a"
 256 --   >>> init_ldh_str r
 257 --   Nothing
 258 --
 259 --   >>> let (Right r) = parse ldh_str "" "ab"
 260 --   >>> init_ldh_str r
 261 --   Just (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))))
 262 --
 263 --   >>> let (Right r) = parse ldh_str "" "abc-def"
 264 --   >>> init_ldh_str r
 265 --   Just (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'b'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'c'))) (LdhStrMultipleLdh (LetDigHypHyphen (Hyphen '-')) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'd'))) (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'e')))))))))
 266 --
 267 init_ldh_str :: LdhStr -> Maybe LdhStr
 268 init_ldh_str (LdhStrSingleLdh _) = Nothing
 269 init_ldh_str (LdhStrMultipleLdh h t) =
 270   Just $ case (init_ldh_str t) of
 271            -- We just got the second-to-last character, we're done.
 272            Nothing   -> LdhStrSingleLdh h
 273
 274            -- There's still more stuff. Recurse.
 275            Just rest -> LdhStrMultipleLdh h rest
 276
 277
 278 -- | Compute the length of an 'LdhStr'. It will be at least one, since
 279 --   'LdhStr's are non-empty. And if there's something other than the
 280 --   first character present, we simply recurse.
 281 --
 282 --   ==== _Examples_
 283 --
 284 --   >>> import Text.Parsec ( parse )
 285 --
 286 --   >>> let (Right r) = parse ldh_str "" "a"
 287 --   >>> length_ldh_str r
 288 --   1
 289 --
 290 --   >>> let (Right r) = parse ldh_str "" "abc-def"
 291 --   >>> length_ldh_str r
 292 --   7
 293 --
 294 length_ldh_str :: LdhStr -> Int
 295 length_ldh_str (LdhStrSingleLdh _) = 1
 296 length_ldh_str (LdhStrMultipleLdh _ t) = 1 + (length_ldh_str t)
 297
 298 -- * Letter/Digit/Hyphen string followed by a trailing Letter/Digit
 299
 300 -- | This type isn't explicitly part of the grammar, but it's what
 301 --   shows up in the square brackets of,
 302 --
 303 --     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
 304 --
 305 --   The ldh-str is optional, but if one is present, we must also have
 306 --   a trailing let-dig to prevent the name from ending with a
 307 --   hyphen. This can be represented with a @Maybe LdhStrLetDig@,
 308 --   which is why we're about to define it.
 309 --
 310 data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
 311   deriving (Eq, Show)
 312
 313 instance Pretty LdhStrLetDig where
 314   pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
 315   pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
 316
 317 -- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
 318 --   as well define the parser for it independently since we gave it
 319 --   its own data type.
 320 --
 321 --   ==== _Examples_
 322 --
 323 --   >>> import Text.Parsec ( parse, parseTest )
 324 --
 325 --   Make sure we can parse a single character:
 326 --
 327 --   >>> parseTest ldh_str_let_dig "a"
 328 --   LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
 329 --
 330 --   And longer strings:
 331 --
 332 --   >>> pretty_print $ parse ldh_str_let_dig "" "ab"
 333 --   ab
 334 --
 335 --   >>> pretty_print $ parse ldh_str_let_dig "" "-b"
 336 --   -b
 337 --
 338 --   >>> parseTest ldh_str_let_dig "b-"
 339 --   parse error at (line 1, column 3):
 340 --   label cannot end with a hyphen
 341 --
 342 ldh_str_let_dig :: Parser LdhStrLetDig
 343 ldh_str_let_dig = do
 344   -- This will happily eat up the trailing let-dig...
 345   full_ldh <- ldh_str
 346
 347   -- So we have to go back and see what happened.
 348   case (last_ldh_str full_ldh) of
 349     (LetDigHypHyphen _) -> fail "label cannot end with a hyphen"
 350     (LetDigHypLetDig ld) ->
 351       -- Ok, the label didn't end with a hyphen; now we need to split
 352       -- off the last letter/digit so we can pack it into our return
 353       -- type separately.
 354       return $ case (init_ldh_str full_ldh) of
 355                  -- We only parsed one letter/digit. This can happen
 356                  -- if the label contains two characters. For example,
 357                  -- if we try to parse the label "ab", then the "a"
 358                  -- will be eaten by the label parser, and this
 359                  -- function will be left with only "b".
 360                  Nothing -> LdhStrLetDig Nothing ld
 361
 362                  -- Usual case: there's was some leading let-dig-hyp junk,
 363                  -- return it too.
 364                  leading_ldhs -> LdhStrLetDig leading_ldhs ld
 365
 366
 367
 368 -- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
 369 --   the let-dig at the end is always there. And when there's an
 370 --   ldh-str too, we add its length to one.
 371 --
 372 --   ==== _Examples_
 373 --
 374 --   >>> import Text.Parsec ( parse )
 375 --
 376 --   >>> let (Right r) = parse ldh_str_let_dig "" "a"
 377 --   >>> length_ldh_str_let_dig r
 378 --   1
 379 --
 380 --   >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
 381 --   >>> length_ldh_str_let_dig r
 382 --   7
 383 --
 384 length_ldh_str_let_dig :: LdhStrLetDig -> Int
 385 length_ldh_str_let_dig (LdhStrLetDig Nothing _) = 1
 386 length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
 387   1 + (length_ldh_str ldhstring)
 388
 389
 390 -- * Labels
 391
 392 -- | The label type from the RFC1035 grammar:
 393 --
 394 --     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
 395 --
 396 --   We allow the slightly more general syntax from RFC1123, Section 2.1:
 397 --
 398 --     The syntax of a legal Internet host name was specified in RFC-952
 399 --     [DNS:4].  One aspect of host name syntax is hereby changed: the
 400 --     restriction on the first character is relaxed to allow either a
 401 --     letter or a digit.  Host software MUST support this more liberal
 402 --     syntax.
 403 --
 404 data Label = Label LetDig (Maybe LdhStrLetDig)
 405   deriving (Eq, Show)
 406
 407 instance Pretty Label where
 408   pretty_show (Label l Nothing) = pretty_show l
 409   pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
 410
 411 -- | Parse a 'Label'.
 412 --
 413 --   In addition to the grammar, there's another restriction on
 414 --   labels: their length must be 63 characters or less. Quoting
 415 --   Section 2.3.1, \"Preferred name syntax\", of RFC1035:
 416 --
 417 --     The labels must follow the rules for ARPANET host names.  They
 418 --     must start with a letter, end with a letter or digit, and have
 419 --     as interior characters only letters, digits, and hyphen.  There
 420 --     are also some restrictions on the length.  Labels must be 63
 421 --     characters or less.
 422 --
 423 --   We check this only after we have successfully parsed a label.
 424 --
 425 --   ==== _Examples_
 426 --
 427 --   >>> import Text.Parsec ( parse, parseTest )
 428 --
 429 --   Make sure we can parse a single character:
 430 --
 431 --   >>> parseTest label "a"
 432 --   Label (LetDigLetter (Letter 'a')) Nothing
 433 --
 434 --   And longer strings:
 435 --
 436 --   >>> pretty_print $ parse label "" "abc-def"
 437 --   abc-def
 438 --
 439 --   But not anything ending in a hyphen:
 440 --
 441 --   >>> parseTest label "abc-"
 442 --   parse error at (line 1, column 5):
 443 --   label cannot end with a hyphen
 444 --
 445 --   Or anything over 63 characters:
 446 --
 447 --   >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
 448 --   parse error at (line 1, column 79):
 449 --   labels must be 63 or fewer characters
 450 --
 451 --   However, /exactly/ 63 characters is acceptable:
 452 --
 453 --   >>> pretty_print $ parse label "" (replicate 63 'x')
 454 --   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 455 --
 456 --   Ensure that a label can begin with a digit:
 457 --
 458 --   >>> pretty_print $ parse label "" "3com"
 459 --   3com
 460 --
 461 label :: Parser Label
 462 label = do
 463   l <- let_dig -- Guaranteed to be there
 464   maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
 465   case maybe_s of
 466     -- It can only be one character long, from the letter...
 467     Nothing -> return $ Label l maybe_s
 468
 469     -- The letter gives us one character, so we check that the rest is
 470     -- less than 62 characters long. But in the error message we need
 471     -- to report 63.
 472     Just s  -> if (length_ldh_str_let_dig s) <= 62
 473               then return $ Label l maybe_s
 474               else fail "labels must be 63 or fewer characters"
 475
 476
 477
 478 -- * Subdomains
 479
 480
 481 -- | The data type representing a \"subdomain\" from RFC1035:
 482 --
 483 --     <subdomain> ::= <label> | <subdomain> "." <label>
 484 --
 485 --   We have reversed the order of the subdomain and label in the
 486 --   second option, however. This is explained in 'subdomain'.
 487 --
 488 data Subdomain =
 489   SubdomainSingleLabel Label |
 490   SubdomainMultipleLabel Label Subdomain
 491   deriving (Eq, Show)
 492
 493
 494
 495 instance Pretty Subdomain where
 496   pretty_show (SubdomainSingleLabel l) = pretty_show l
 497   pretty_show (SubdomainMultipleLabel l s) =
 498     (pretty_show l) ++ "." ++ (pretty_show s)
 499
 500
 501 instance Reversible Subdomain where
 502   -- | Reverse the labels of the given subdomain.
 503   --
 504   --   ==== _Examples_
 505   --
 506   --   >>> import Text.Parsec ( parse )
 507   --
 508   --   Standard usage:
 509   --
 510   --   >>> let (Right r) = parse subdomain "" "com"
 511   --   >>> pretty_print $ backwards r
 512   --   com
 513   --
 514   --   >>> let (Right r) = parse subdomain "" "example.com"
 515   --   >>> pretty_print $ backwards r
 516   --   com.example
 517   --
 518   --   >>> let (Right r) = parse subdomain "" "www.example.com"
 519   --   >>> pretty_print $ backwards r
 520   --   com.example.www
 521   --
 522   --   >>> let (Right r) = parse subdomain "" "new.www.example.com"
 523   --   >>> pretty_print $ backwards r
 524   --   com.example.www.new
 525   --
 526
 527   -- It's easy to reverse a single label...
 528   backwards s@(SubdomainSingleLabel _) = s
 529
 530   -- For multiple labels we have two cases. The first is where we have
 531   -- exactly two labels, and we just need to swap them.
 532   backwards (SubdomainMultipleLabel l (SubdomainSingleLabel m)) =
 533     SubdomainMultipleLabel m (SubdomainSingleLabel l)
 534
 535   -- And now the hard case. If we reversed @s@, then the "head" of
 536   -- the result (@last_s@) should be the last label in the entire
 537   -- subdomain. Stick @last_s@ on the front of the result. That makes
 538   -- enough sense.
 539   --
 540   -- But what to do about the rest? We need to get \"init s\"
 541   -- somehow. Well, we have the reverse of it... why not waste a bunch
 542   -- of time and reverse that, too? With @init s@ in hand, we can
 543   -- prepend @l@ to that, and THEN reverse the entire thing. What we'll
 544   -- wind up with looks like @[last_s, init_s_rev, l]@ which you can
 545   -- pretend you recognize as the subdomain in reverse.
 546   --
 547   backwards (SubdomainMultipleLabel l s) =
 548     case (backwards s) of
 549       SubdomainMultipleLabel last_s init_s_rev ->
 550         let init_s = backwards init_s_rev
 551         in
 552           SubdomainMultipleLabel
 553             last_s
 554             (backwards (SubdomainMultipleLabel l init_s))
 555
 556       -- Reversing a multiple label thing gives you back a multiple
 557       -- label thing but there's no way to promise that.
 558       impossible -> impossible
 559
 560
 561
 562 -- | Parse an RFC1035 \"subdomain\". The given grammar is,
 563 --
 564 --     <subdomain> ::= <label> | <subdomain> "." <label>
 565 --
 566 --   However, we have reversed the order of the subdomain and label to
 567 --   prevent infinite recursion. The second option (subdomain + label)
 568 --   is obviously more specific, we we need to try it first. This
 569 --   presents a problem: we're trying to parse a subdomain in terms of
 570 --   a subdomain! The given grammar represents subdomains how we like
 571 --   to think of them; from right to left. But it's better to parse
 572 --   from left to right, so we pick off the leading label and then
 573 --   recurse into the definition of subdomain.
 574 --
 575 --   According to RFC1034, Section 3.1, two neighboring labels in a
 576 --   DNS name cannot be equal:
 577 --
 578 --     Each node has a label, which is zero to 63 octets in length.  Brother
 579 --     nodes may not have the same label, although the same label can be used
 580 --     for nodes which are not brothers.  One label is reserved, and that is
 581 --     the null (i.e., zero length) label used for the root.
 582 --
 583 --   We enforce this restriction, but the result is usually that we
 584 --   only parse the part of the subdomain leading up to the repeated
 585 --   label.
 586 --
 587 --   ==== _Examples_
 588 --
 589 --   >>> import Text.Parsec ( parse, parseTest )
 590 --
 591 --   Make sure we can parse a single character:
 592 --
 593 --   >>> parseTest subdomain "a"
 594 --   SubdomainSingleLabel (Label (LetDigLetter (Letter 'a')) Nothing)
 595 --
 596 --   >>> pretty_print $ parse subdomain "" "example.com"
 597 --   example.com
 598 --
 599 --   >>> pretty_print $ parse subdomain "" "www.example.com"
 600 --   www.example.com
 601 --
 602 --   We reject a subdomain with equal neighbors, but this leads to
 603 --   only the single first label being parsed instead:
 604 --
 605 --   >>> pretty_print $ parse subdomain "" "www.www.example.com"
 606 --   www
 607 --
 608 --   But not one with a repeated but non-neighboring label:
 609 --
 610 --   >>> pretty_print $ parse subdomain "" "www.example.www.com"
 611 --   www.example.www.com
 612 --
 613 subdomain :: Parser Subdomain
 614 subdomain = try both <|> just_one
 615   where
 616     both :: Parser Subdomain
 617     both = do
 618       l <- label
 619       _ <- char '.'
 620       s <- subdomain
 621       let result = SubdomainMultipleLabel l s
 622       if (subdomain_has_equal_neighbors result)
 623       then fail "subdomain cannot have equal neighboring labels"
 624       else return result
 625
 626     just_one :: Parser Subdomain
 627     just_one = fmap SubdomainSingleLabel label
 628
 629
 630
 631 -- | Retrieve a list of labels contained in a 'Subdomain'.
 632 --
 633 --   ==== _Examples_
 634 --
 635 --   >>> import Text.Parsec ( parse )
 636 --
 637 --   >>> let (Right r) = parse subdomain "" "a"
 638 --   >>> pretty_print $ subdomain_labels r
 639 --   ["a"]
 640 --
 641 --   >>> let (Right r) = parse subdomain "" "example.com"
 642 --   >>> pretty_print $ subdomain_labels r
 643 --   ["example","com"]
 644 --
 645 --   >>> let (Right r) = parse subdomain "" "www.example.com"
 646 --   >>> pretty_print $ subdomain_labels r
 647 --   ["www","example","com"]
 648 --
 649 subdomain_labels :: Subdomain -> [Label]
 650 subdomain_labels (SubdomainSingleLabel l) = [l]
 651 subdomain_labels (SubdomainMultipleLabel l s) = l : (subdomain_labels s)
 652
 653
 654 -- | Return a list of pairs of neighboring labels in a subdomain.
 655 --
 656 --   ==== _Examples_
 657 --
 658 --   >>> import Text.Parsec ( parse )
 659 --   >>> let (Right r) = parse subdomain "" "www.example.com"
 660 --   >>> pretty_print $ subdomain_label_neighbors r
 661 --   ["(\"www\",\"example\")","(\"example\",\"com\")"]
 662 --
 663 subdomain_label_neighbors :: Subdomain -> [(Label,Label)]
 664 subdomain_label_neighbors s =
 665   zip ls (tail ls)
 666   where
 667     ls = subdomain_labels s
 668
 669
 670 -- | Return @True@ if the subdomain has any two equal neighboring
 671 --   labels, and @False@ otherwise.
 672 --
 673 --   ==== _Examples_
 674 --
 675 --   >>> import Text.Parsec ( parse )
 676 --
 677 --   >>> let (Right r) = parse subdomain "" "www.example.com"
 678 --   >>> subdomain_has_equal_neighbors r
 679 --   False
 680 --
 681 --   >>> let (Right l) = parse label "" "www"
 682 --   >>> let (Right s) = parse subdomain "" "www.example.com"
 683 --   >>> let bad_subdomain = SubdomainMultipleLabel l s
 684 --   >>> subdomain_has_equal_neighbors bad_subdomain
 685 --   True
 686 --
 687 subdomain_has_equal_neighbors :: Subdomain -> Bool
 688 subdomain_has_equal_neighbors s =
 689   or [ x == y | (x,y) <- subdomain_label_neighbors s ]
 690
 691
 692
 693
 694 -- * Domains
 695
 696 -- | An RFC1035 domain. According to RFC1035 a domain can be either a
 697 -- subdomain or \" \", which according to RFC2181
 698 -- <https://tools.ietf.org/html/rfc2181#section-11> means the root:
 699 --
 700 --      The zero length full name is defined as representing the root
 701 --      of the DNS tree, and is typically written and displayed as
 702 --      \".\".
 703 --
 704 --   We let the 'Domain' type remain true to those RFCs, even though
 705 --   they don't support an absolute domain name of e.g. a single dot.
 706 --
 707 data Domain =
 708   DomainName Subdomain |
 709   DomainRoot
 710   deriving (Eq, Show)
 711
 712 instance Pretty Domain where
 713   pretty_show DomainRoot = ""
 714   pretty_show (DomainName s) = pretty_show s
 715
 716 -- | Parse an RFC1035 \"domain\"
 717 --
 718 --   ==== _Examples_
 719 --
 720 --   >>> import Text.Parsec ( parse, parseTest )
 721 --
 722 --   Make sure we can parse a single character:
 723 --
 724 --   >>> pretty_print $ parse domain "" "a"
 725 --   a
 726 --
 727 --   And the empty domain:
 728 --
 729 --   >>> parseTest domain ""
 730 --   DomainRoot
 731 --
 732 --   We will in fact parse the \"empty\" domain off the front of
 733 --   pretty much anything:
 734 --
 735 --   >>> parseTest domain "!8===D"
 736 --   DomainRoot
 737 --
 738 --   Equality of domains is case-insensitive:
 739 --
 740 --   >>> let (Right r1) = parse domain "" "example.com"
 741 --   >>> let (Right r2) = parse domain "" "ExaMPle.coM"
 742 --   >>> r1 == r2
 743 --   True
 744 --
 745 --   A single dot IS parsed as the root, but the dot isn't consumed:
 746 --
 747 --   >>> parseTest domain "."
 748 --   DomainRoot
 749 --
 750 --   Anything over 255 characters is an error, so the root will be
 751 --   parsed:
 752 --
 753 --   >>> let big_l1 = replicate 63 'x'
 754 --   >>> let big_l2 = replicate 63 'y' -- Avoid equal neighboring labels!
 755 --   >>> let big_labels = big_l1 ++ "." ++ big_l2 ++ "."
 756 --   >>> let big_subdomain = concat $ replicate 3 big_labels
 757 --   >>> parseTest domain big_subdomain
 758 --   DomainRoot
 759 --
 760 --   But exactly 255 is allowed:
 761 --
 762 --   >>> import Data.List ( intercalate )
 763 --   >>> let l1 = replicate 63 'w'
 764 --   >>> let l2 = replicate 63 'x'
 765 --   >>> let l3 = replicate 63 'y'
 766 --   >>> let l4 = replicate 63 'z'
 767 --   >>> let big_subdomain = intercalate "." [l1,l2,l3,l4]
 768 --   >>> let (Right r) = parse domain "" big_subdomain
 769 --   >>> length (pretty_show r)
 770 --   255
 771 --
 772 domain :: Parser Domain
 773 domain = try parse_subdomain <|> parse_empty
 774   where
 775     parse_subdomain :: Parser Domain
 776     parse_subdomain = do
 777       s <- subdomain
 778       if length (pretty_show s) <= 255
 779       then return $ DomainName s
 780       else fail "subdomains can be at most 255 characters"
 781
 782     parse_empty :: Parser Domain
 783     parse_empty = string "" >> return DomainRoot
 784
 785
 786 instance Reversible Domain where
 787   -- | Reverse the labels of a 'Domain'.
 788   --
 789   --   --   ==== _Examples_
 790   --
 791   --   >>> import Text.Parsec ( parse )
 792   --
 793   --   The root reverses to itself:
 794   --
 795   --   >>> let (Right r) = parse domain "" ""
 796   --   >>> backwards r
 797   --   DomainRoot
 798   --
 799   --   >>> let (Right r) = parse domain "" "new.www.example.com"
 800   --   >>> pretty_print $ backwards r
 801   --   com.example.www.new
 802   --
 803   backwards DomainRoot = DomainRoot
 804   backwards (DomainName s) = DomainName $ backwards s