harbl/src/Network/DNS/RBL/Domain.hs

   1 {-# LANGUAGE DoAndIfThenElse #-}
   2
   3 -- | The 'Domain' data type and its parser. A 'Domain' represents a
   4 --   name in the domain name system (DNS) as described by
   5 --   RFC1035. In particular, we enforce the restrictions from Section
   6 --   2.3.1 \"Preferred name syntax\". See for example,
   7 --
   8 --     <https://tools.ietf.org/html/rfc1035#section-2.3.1>
   9 --
  10 --   We basically work with strings and characters everywhere, even
  11 --   though this isn't really correct. The length specifications in
  12 --   the RFCs are all in terms of octets, so really a ByteString.Char8
  13 --   would be more appropriate. With strings, for example, we could
  14 --   have a unicode mumbo jumbo character that takes up two bytes
  15 --   (octets).
  16 --
  17 module Network.DNS.RBL.Domain (
  18   Domain(..),
  19   domain )
  20 where
  21
  22 import Text.Parsec (
  23   (<|>),
  24   char,
  25   optionMaybe,
  26   string,
  27   try )
  28 import Text.Parsec.String ( Parser )
  29
  30 import Network.DNS.RBL.Domain.LdhStr (
  31   LdhStr(..),
  32   ldh_str,
  33   ldh_str_length )
  34 import Network.DNS.RBL.Domain.LetDig ( LetDig, let_dig )
  35 import Network.DNS.RBL.Domain.LetDigHyp ( LetDigHyp(..) )
  36 import Network.DNS.RBL.Pretty ( Pretty(..) )
  37 import Network.DNS.RBL.Reversible ( Reversible(..) )
  38
  39
  40 -- * Letter/Digit/Hyphen string followed by a trailing Letter/Digit
  41
  42 -- | This type isn't explicitly part of the grammar, but it's what
  43 --   shows up in the square brackets of,
  44 --
  45 --     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
  46 --
  47 --   The ldh-str is optional, but if one is present, we must also have
  48 --   a trailing let-dig to prevent the name from ending with a
  49 --   hyphen. This can be represented with a @Maybe LdhStrLetDig@,
  50 --   which is why we're about to define it.
  51 --
  52 data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
  53   deriving (Eq, Show)
  54
  55 instance Pretty LdhStrLetDig where
  56   pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
  57   pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
  58
  59
  60
  61 -- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
  62 --   as well define the parser for it independently since we gave it
  63 --   its own data type.
  64 --
  65 --   ==== _Examples_
  66 --
  67 --   >>> import Text.Parsec ( parse, parseTest )
  68 --
  69 --   Make sure we can parse a single character:
  70 --
  71 --   >>> parseTest ldh_str_let_dig "a"
  72 --   LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
  73 --
  74 --   And longer strings:
  75 --
  76 --   >>> pretty_print $ parse ldh_str_let_dig "" "ab"
  77 --   ab
  78 --
  79 --   >>> pretty_print $ parse ldh_str_let_dig "" "-b"
  80 --   -b
  81 --
  82 --   >>> parseTest ldh_str_let_dig "b-"
  83 --   parse error at (line 1, column 3):
  84 --   label cannot end with a hyphen
  85 --
  86 ldh_str_let_dig :: Parser LdhStrLetDig
  87 ldh_str_let_dig = do
  88   -- This will happily eat up the trailing let-dig...
  89   full_ldh <- ldh_str
  90
  91   -- So we have to go back and see what happened.
  92   case (backwards full_ldh) of
  93
  94     -- Fail on a single hyphen.
  95     (LdhStrSingleLdh (LetDigHypHyphen _)) ->
  96       fail "label cannot end with a hyphen"
  97
  98     -- Fail for a hyphen followed by other stuff.
  99     (LdhStrMultipleLdh (LetDigHypHyphen _) _) ->
 100       fail "label cannot end with a hyphen"
 101
 102     -- Simply return the thing if it's a single non-hyphen.
 103     (LdhStrSingleLdh (LetDigHypLetDig ld)) -> return $ LdhStrLetDig Nothing ld
 104
 105     -- And peel off the last character for a non-hyphen followed by
 106     -- other stuff. We wind up reversing things twice, but whatever.
 107     (LdhStrMultipleLdh (LetDigHypLetDig ld) init_ldh_rev) ->
 108       let init_ldh = backwards init_ldh_rev
 109       in return $ LdhStrLetDig (Just init_ldh) ld
 110
 111
 112
 113 -- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
 114 --   the let-dig at the end is always there. And when there's an
 115 --   ldh-str too, we add its length to one.
 116 --
 117 --   ==== _Examples_
 118 --
 119 --   >>> import Text.Parsec ( parse )
 120 --
 121 --   >>> let (Right r) = parse ldh_str_let_dig "" "a"
 122 --   >>> length_ldh_str_let_dig r
 123 --   1
 124 --
 125 --   >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
 126 --   >>> length_ldh_str_let_dig r
 127 --   7
 128 --
 129 length_ldh_str_let_dig :: LdhStrLetDig -> Int
 130 length_ldh_str_let_dig (LdhStrLetDig Nothing _) = 1
 131 length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
 132   1 + (ldh_str_length ldhstring)
 133
 134
 135 -- * Labels
 136
 137 -- | The label type from the RFC1035 grammar:
 138 --
 139 --     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
 140 --
 141 --   We allow the slightly more general syntax from RFC1123, Section 2.1:
 142 --
 143 --     The syntax of a legal Internet host name was specified in RFC-952
 144 --     [DNS:4].  One aspect of host name syntax is hereby changed: the
 145 --     restriction on the first character is relaxed to allow either a
 146 --     letter or a digit.  Host software MUST support this more liberal
 147 --     syntax.
 148 --
 149 data Label = Label LetDig (Maybe LdhStrLetDig)
 150   deriving (Eq, Show)
 151
 152 instance Pretty Label where
 153   pretty_show (Label l Nothing) = pretty_show l
 154   pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
 155
 156 -- | Parse a 'Label'.
 157 --
 158 --   In addition to the grammar, there's another restriction on
 159 --   labels: their length must be 63 characters or less. Quoting
 160 --   Section 2.3.1, \"Preferred name syntax\", of RFC1035:
 161 --
 162 --     The labels must follow the rules for ARPANET host names.  They
 163 --     must start with a letter, end with a letter or digit, and have
 164 --     as interior characters only letters, digits, and hyphen.  There
 165 --     are also some restrictions on the length.  Labels must be 63
 166 --     characters or less.
 167 --
 168 --   We check this only after we have successfully parsed a label.
 169 --
 170 --   ==== _Examples_
 171 --
 172 --   >>> import Text.Parsec ( parse, parseTest )
 173 --
 174 --   Make sure we can parse a single character:
 175 --
 176 --   >>> parseTest label "a"
 177 --   Label (LetDigLetter (Letter 'a')) Nothing
 178 --
 179 --   And longer strings:
 180 --
 181 --   >>> pretty_print $ parse label "" "abc-def"
 182 --   abc-def
 183 --
 184 --   But not anything ending in a hyphen:
 185 --
 186 --   >>> parseTest label "abc-"
 187 --   parse error at (line 1, column 5):
 188 --   label cannot end with a hyphen
 189 --
 190 --   Or anything over 63 characters:
 191 --
 192 --   >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
 193 --   parse error at (line 1, column 79):
 194 --   labels must be 63 or fewer characters
 195 --
 196 --   However, /exactly/ 63 characters is acceptable:
 197 --
 198 --   >>> pretty_print $ parse label "" (replicate 63 'x')
 199 --   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 200 --
 201 --   Ensure that a label can begin with a digit:
 202 --
 203 --   >>> pretty_print $ parse label "" "3com"
 204 --   3com
 205 --
 206 label :: Parser Label
 207 label = do
 208   l <- let_dig -- Guaranteed to be there
 209   maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
 210   case maybe_s of
 211     -- It can only be one character long, from the letter...
 212     Nothing -> return $ Label l maybe_s
 213
 214     -- The letter gives us one character, so we check that the rest is
 215     -- less than 62 characters long. But in the error message we need
 216     -- to report 63.
 217     Just s  -> if (length_ldh_str_let_dig s) <= 62
 218               then return $ Label l maybe_s
 219               else fail "labels must be 63 or fewer characters"
 220
 221
 222
 223 -- * Subdomains
 224
 225
 226 -- | The data type representing a \"subdomain\" from RFC1035:
 227 --
 228 --     <subdomain> ::= <label> | <subdomain> "." <label>
 229 --
 230 --   We have reversed the order of the subdomain and label in the
 231 --   second option, however. This is explained in 'subdomain'.
 232 --
 233 data Subdomain =
 234   SubdomainSingleLabel Label |
 235   SubdomainMultipleLabel Label Subdomain
 236   deriving (Eq, Show)
 237
 238
 239
 240 instance Pretty Subdomain where
 241   pretty_show (SubdomainSingleLabel l) = pretty_show l
 242   pretty_show (SubdomainMultipleLabel l s) =
 243     (pretty_show l) ++ "." ++ (pretty_show s)
 244
 245
 246 instance Reversible Subdomain where
 247   -- | Reverse the labels of the given subdomain.
 248   --
 249   --   ==== _Examples_
 250   --
 251   --   >>> import Text.Parsec ( parse )
 252   --
 253   --   Standard usage:
 254   --
 255   --   >>> let (Right r) = parse subdomain "" "com"
 256   --   >>> pretty_print $ backwards r
 257   --   com
 258   --
 259   --   >>> let (Right r) = parse subdomain "" "example.com"
 260   --   >>> pretty_print $ backwards r
 261   --   com.example
 262   --
 263   --   >>> let (Right r) = parse subdomain "" "www.example.com"
 264   --   >>> pretty_print $ backwards r
 265   --   com.example.www
 266   --
 267   --   >>> let (Right r) = parse subdomain "" "new.www.example.com"
 268   --   >>> pretty_print $ backwards r
 269   --   com.example.www.new
 270   --
 271
 272   -- It's easy to reverse a single label...
 273   backwards s@(SubdomainSingleLabel _) = s
 274
 275   -- For multiple labels we have two cases. The first is where we have
 276   -- exactly two labels, and we just need to swap them.
 277   backwards (SubdomainMultipleLabel l (SubdomainSingleLabel m)) =
 278     SubdomainMultipleLabel m (SubdomainSingleLabel l)
 279
 280   -- And now the hard case. See the 'LdhStr' implementation for an
 281   -- explanation.
 282   --
 283   backwards (SubdomainMultipleLabel l s) = build (SubdomainSingleLabel l) s
 284     where
 285       -- Build up the first Subdomain on the left by peeling off the
 286       -- leading elements of the second Subdomain.
 287       build :: Subdomain -> Subdomain -> Subdomain
 288       build dst (SubdomainSingleLabel final) = SubdomainMultipleLabel final dst
 289       build dst (SubdomainMultipleLabel leading rest) =
 290         build (SubdomainMultipleLabel leading dst) rest
 291
 292
 293
 294 -- | Parse an RFC1035 \"subdomain\". The given grammar is,
 295 --
 296 --     <subdomain> ::= <label> | <subdomain> "." <label>
 297 --
 298 --   However, we have reversed the order of the subdomain and label to
 299 --   prevent infinite recursion. The second option (subdomain + label)
 300 --   is obviously more specific, we we need to try it first. This
 301 --   presents a problem: we're trying to parse a subdomain in terms of
 302 --   a subdomain! The given grammar represents subdomains how we like
 303 --   to think of them; from right to left. But it's better to parse
 304 --   from left to right, so we pick off the leading label and then
 305 --   recurse into the definition of subdomain.
 306 --
 307 --   According to RFC1034, Section 3.1, two neighboring labels in a
 308 --   DNS name cannot be equal:
 309 --
 310 --     Each node has a label, which is zero to 63 octets in length.  Brother
 311 --     nodes may not have the same label, although the same label can be used
 312 --     for nodes which are not brothers.  One label is reserved, and that is
 313 --     the null (i.e., zero length) label used for the root.
 314 --
 315 --   We enforce this restriction, but the result is usually that we
 316 --   only parse the part of the subdomain leading up to the repeated
 317 --   label.
 318 --
 319 --   ==== _Examples_
 320 --
 321 --   >>> import Text.Parsec ( parse, parseTest )
 322 --
 323 --   Make sure we can parse a single character:
 324 --
 325 --   >>> parseTest subdomain "a"
 326 --   SubdomainSingleLabel (Label (LetDigLetter (Letter 'a')) Nothing)
 327 --
 328 --   >>> pretty_print $ parse subdomain "" "example.com"
 329 --   example.com
 330 --
 331 --   >>> pretty_print $ parse subdomain "" "www.example.com"
 332 --   www.example.com
 333 --
 334 --   We reject a subdomain with equal neighbors, but this leads to
 335 --   only the single first label being parsed instead:
 336 --
 337 --   >>> pretty_print $ parse subdomain "" "www.www.example.com"
 338 --   www
 339 --
 340 --   But not one with a repeated but non-neighboring label:
 341 --
 342 --   >>> pretty_print $ parse subdomain "" "www.example.www.com"
 343 --   www.example.www.com
 344 --
 345 subdomain :: Parser Subdomain
 346 subdomain = try both <|> just_one
 347   where
 348     both :: Parser Subdomain
 349     both = do
 350       l <- label
 351       _ <- char '.'
 352       s <- subdomain
 353       let result = SubdomainMultipleLabel l s
 354       if (subdomain_has_equal_neighbors result)
 355       then fail "subdomain cannot have equal neighboring labels"
 356       else return result
 357
 358     just_one :: Parser Subdomain
 359     just_one = fmap SubdomainSingleLabel label
 360
 361
 362
 363 -- | Retrieve a list of labels contained in a 'Subdomain'.
 364 --
 365 --   ==== _Examples_
 366 --
 367 --   >>> import Text.Parsec ( parse )
 368 --
 369 --   >>> let (Right r) = parse subdomain "" "a"
 370 --   >>> pretty_print $ subdomain_labels r
 371 --   ["a"]
 372 --
 373 --   >>> let (Right r) = parse subdomain "" "example.com"
 374 --   >>> pretty_print $ subdomain_labels r
 375 --   ["example","com"]
 376 --
 377 --   >>> let (Right r) = parse subdomain "" "www.example.com"
 378 --   >>> pretty_print $ subdomain_labels r
 379 --   ["www","example","com"]
 380 --
 381 subdomain_labels :: Subdomain -> [Label]
 382 subdomain_labels (SubdomainSingleLabel l) = [l]
 383 subdomain_labels (SubdomainMultipleLabel l s) = l : (subdomain_labels s)
 384
 385
 386 -- | Return a list of pairs of neighboring labels in a subdomain.
 387 --
 388 --   ==== _Examples_
 389 --
 390 --   >>> import Text.Parsec ( parse )
 391 --   >>> let (Right r) = parse subdomain "" "www.example.com"
 392 --   >>> pretty_print $ subdomain_label_neighbors r
 393 --   ["(\"www\",\"example\")","(\"example\",\"com\")"]
 394 --
 395 subdomain_label_neighbors :: Subdomain -> [(Label,Label)]
 396 subdomain_label_neighbors s =
 397   zip ls (tail ls)
 398   where
 399     ls = subdomain_labels s
 400
 401
 402 -- | Return @True@ if the subdomain has any two equal neighboring
 403 --   labels, and @False@ otherwise.
 404 --
 405 --   ==== _Examples_
 406 --
 407 --   >>> import Text.Parsec ( parse )
 408 --
 409 --   >>> let (Right r) = parse subdomain "" "www.example.com"
 410 --   >>> subdomain_has_equal_neighbors r
 411 --   False
 412 --
 413 --   >>> let (Right l) = parse label "" "www"
 414 --   >>> let (Right s) = parse subdomain "" "www.example.com"
 415 --   >>> let bad_subdomain = SubdomainMultipleLabel l s
 416 --   >>> subdomain_has_equal_neighbors bad_subdomain
 417 --   True
 418 --
 419 subdomain_has_equal_neighbors :: Subdomain -> Bool
 420 subdomain_has_equal_neighbors s =
 421   or [ x == y | (x,y) <- subdomain_label_neighbors s ]
 422
 423
 424
 425
 426 -- * Domains
 427
 428 -- | An RFC1035 domain. According to RFC1035 a domain can be either a
 429 -- subdomain or \" \", which according to RFC2181
 430 -- <https://tools.ietf.org/html/rfc2181#section-11> means the root:
 431 --
 432 --      The zero length full name is defined as representing the root
 433 --      of the DNS tree, and is typically written and displayed as
 434 --      \".\".
 435 --
 436 --   We let the 'Domain' type remain true to those RFCs, even though
 437 --   they don't support an absolute domain name of e.g. a single dot.
 438 --
 439 data Domain =
 440   DomainName Subdomain |
 441   DomainRoot
 442   deriving (Eq, Show)
 443
 444 instance Pretty Domain where
 445   pretty_show DomainRoot = ""
 446   pretty_show (DomainName s) = pretty_show s
 447
 448 -- | Parse an RFC1035 \"domain\"
 449 --
 450 --   ==== _Examples_
 451 --
 452 --   >>> import Text.Parsec ( parse, parseTest )
 453 --
 454 --   Make sure we can parse a single character:
 455 --
 456 --   >>> pretty_print $ parse domain "" "a"
 457 --   a
 458 --
 459 --   And the empty domain:
 460 --
 461 --   >>> parseTest domain ""
 462 --   DomainRoot
 463 --
 464 --   We will in fact parse the \"empty\" domain off the front of
 465 --   pretty much anything:
 466 --
 467 --   >>> parseTest domain "!8===D"
 468 --   DomainRoot
 469 --
 470 --   Equality of domains is case-insensitive:
 471 --
 472 --   >>> let (Right r1) = parse domain "" "example.com"
 473 --   >>> let (Right r2) = parse domain "" "ExaMPle.coM"
 474 --   >>> r1 == r2
 475 --   True
 476 --
 477 --   A single dot IS parsed as the root, but the dot isn't consumed:
 478 --
 479 --   >>> parseTest domain "."
 480 --   DomainRoot
 481 --
 482 --   Anything over 255 characters is an error, so the root will be
 483 --   parsed:
 484 --
 485 --   >>> let big_l1 = replicate 63 'x'
 486 --   >>> let big_l2 = replicate 63 'y' -- Avoid equal neighboring labels!
 487 --   >>> let big_labels = big_l1 ++ "." ++ big_l2 ++ "."
 488 --   >>> let big_subdomain = concat $ replicate 3 big_labels
 489 --   >>> parseTest domain big_subdomain
 490 --   DomainRoot
 491 --
 492 --   But exactly 255 is allowed:
 493 --
 494 --   >>> import Data.List ( intercalate )
 495 --   >>> let l1 = replicate 63 'w'
 496 --   >>> let l2 = replicate 63 'x'
 497 --   >>> let l3 = replicate 63 'y'
 498 --   >>> let l4 = replicate 63 'z'
 499 --   >>> let big_subdomain = intercalate "." [l1,l2,l3,l4]
 500 --   >>> let (Right r) = parse domain "" big_subdomain
 501 --   >>> length (pretty_show r)
 502 --   255
 503 --
 504 domain :: Parser Domain
 505 domain = try parse_subdomain <|> parse_empty
 506   where
 507     parse_subdomain :: Parser Domain
 508     parse_subdomain = do
 509       s <- subdomain
 510       if length (pretty_show s) <= 255
 511       then return $ DomainName s
 512       else fail "subdomains can be at most 255 characters"
 513
 514     parse_empty :: Parser Domain
 515     parse_empty = string "" >> return DomainRoot
 516
 517
 518 instance Reversible Domain where
 519   -- | Reverse the labels of a 'Domain'.
 520   --
 521   --   --   ==== _Examples_
 522   --
 523   --   >>> import Text.Parsec ( parse )
 524   --
 525   --   The root reverses to itself:
 526   --
 527   --   >>> let (Right r) = parse domain "" ""
 528   --   >>> backwards r
 529   --   DomainRoot
 530   --
 531   --   >>> let (Right r) = parse domain "" "new.www.example.com"
 532   --   >>> pretty_print $ backwards r
 533   --   com.example.www.new
 534   --
 535   backwards DomainRoot = DomainRoot
 536   backwards (DomainName s) = DomainName $ backwards s