Add back the TODO.

[dead/harbl.git] / src / Domain.hs
diff --git a/src/Domain.hs b/src/Domain.hs

index 3ef6f26597a89c730dbb5a36fe14e8e1c5147af4..87a63d9757f8bb2c6cfda6bac49651101a3153ab 100644 (file)
--- a/src/Domain.hs
+++ b/src/Domain.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE DoAndIfThenElse #-}
+
  -- | The 'Domain' data type and its parser. A 'Domain' represents a
  --   name in the domain name system (DNS) as described by
  --   RFC1035. In particular, we enforce the restrictions from Section
@@ -5,23 +7,25 @@
  --
  --     <https://tools.ietf.org/html/rfc1035#section-2.3.1>
  --
-module Domain
+--   We basically work with strings and characters everywhere, even
+--   though this isn't really correct. The length specifications in
+--   the RFCs are all in terms of octets, so really a ByteString.Char8
+--   would be more appropriate. With strings, for example, we could
+--   have a unicode mumbo jumbo character that takes up two bytes
+--   (octets).
+--
+module Domain (
+  UserDomain,
+  user_domain )
  where
  
  import Data.Char ( toLower )
  import Text.Parsec (
-  ParseError,
    (<|>),
-  alphaNum,
    char,
-  eof,
-  many1,
-  option,
    optionMaybe,
-  parse,
    string,
-  try,
-  unexpected )
+  try )
  import qualified Text.Parsec as Parsec ( digit, letter)
  import Text.Parsec.String ( Parser )
  
@@ -65,7 +69,11 @@ letter = fmap Letter Parsec.letter
  --     manner...
  --
  --   Since each part of DNS name is composed of our custom types, it
---   suffices to munge the equality for 'Letter'.
+--   suffices to munge the equality for 'Letter'. RFC4343
+--   <https://tools.ietf.org/html/rfc4343> clarifies the
+--   case-insensitivity rules, but the fact that we're treating DNS
+--   names as strings makes most of those problems go away (in
+--   exchange for new ones).
  --
  instance Eq Letter where
    (Letter l1) == (Letter l2) = (toLower l1) == (toLower l2)
@@ -191,6 +199,7 @@ instance Pretty LdhStr where
  --
  --   As well as strings of them:
  --
+--   >>> import Text.Parsec ( parse )
  --   >>> pretty_print $ parse ldh_str "" "a0-b"
  --   a0-b
  --
@@ -215,6 +224,8 @@ ldh_str = try both <|> just_one
  --
  --   ==== _Examples_
  --
+--   >>> import Text.Parsec ( parse )
+--
  --   >>> let (Right r) = parse ldh_str "" "a"
  --   >>> last_ldh_str r
  --   LetDigHypLetDig (LetDigLetter (Letter 'a'))
@@ -238,6 +249,8 @@ last_ldh_str (LdhStrMultipleLdh _ x) = last_ldh_str x
  --
  --   ==== _Examples_
  --
+--   >>> import Text.Parsec ( parse )
+--
  --   >>> let (Right r) = parse ldh_str "" "a"
  --   >>> init_ldh_str r
  --   Nothing
@@ -267,6 +280,8 @@ init_ldh_str (LdhStrMultipleLdh h t) =
  --
  --   ==== _Examples_
  --
+--   >>> import Text.Parsec ( parse )
+--
  --   >>> let (Right r) = parse ldh_str "" "a"
  --   >>> length_ldh_str r
  --   1
@@ -304,7 +319,7 @@ instance Pretty LdhStrLetDig where
  --
  --   ==== _Examples_
  --
---   >>> import Text.Parsec ( parseTest )
+--   >>> import Text.Parsec ( parse, parseTest )
  --
  --   Make sure we can parse a single character:
  --
@@ -355,6 +370,8 @@ ldh_str_let_dig = do
  --
  --   ==== _Examples_
  --
+--   >>> import Text.Parsec ( parse )
+--
  --   >>> let (Right r) = parse ldh_str_let_dig "" "a"
  --   >>> length_ldh_str_let_dig r
  --   1
@@ -375,6 +392,14 @@ length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
  --
  --     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
  --
+--   We allow the slightly more general syntax from RFC1123, Section 2.1:
+--
+--     The syntax of a legal Internet host name was specified in RFC-952
+--     [DNS:4].  One aspect of host name syntax is hereby changed: the
+--     restriction on the first character is relaxed to allow either a
+--     letter or a digit.  Host software MUST support this more liberal
+--     syntax.
+--
  data Label = Label Letter (Maybe LdhStrLetDig)
    deriving (Eq, Show)
  
@@ -398,7 +423,7 @@ instance Pretty Label where
  --
  --   ==== _Examples_
  --
---   >>> import Text.Parsec ( parseTest )
+--   >>> import Text.Parsec ( parse, parseTest )
  --
  --   Make sure we can parse a single character:
  --
@@ -459,6 +484,8 @@ data Subdomain =
    SubdomainMultipleLabel Label Subdomain
    deriving (Eq, Show)
  
+
+
  instance Pretty Subdomain where
    pretty_show (SubdomainSingleLabel l) = pretty_show l
    pretty_show (SubdomainMultipleLabel l s) =
@@ -477,9 +504,21 @@ instance Pretty Subdomain where
  --   from left to right, so we pick off the leading label and then
  --   recurse into the definition of subdomain.
  --
+--   According to RFC1034, Section 3.1, two neighboring labels in a
+--   DNS name cannot be equal:
+--
+--     Each node has a label, which is zero to 63 octets in length.  Brother
+--     nodes may not have the same label, although the same label can be used
+--     for nodes which are not brothers.  One label is reserved, and that is
+--     the null (i.e., zero length) label used for the root.
+--
+--   We enforce this restriction, but the result is usually that we
+--   only parse the part of the subdomain leading up to the repeated
+--   label.
+--
  --   ==== _Examples_
  --
---   >>> import Text.Parsec ( parseTest )
+--   >>> import Text.Parsec ( parse, parseTest )
  --
  --   Make sure we can parse a single character:
  --
@@ -492,21 +531,97 @@ instance Pretty Subdomain where
  --   >>> pretty_print $ parse subdomain "" "www.example.com"
  --   www.example.com
  --
+--   We reject a subdomain with equal neighbors, but this leads to
+--   only the single first label being parsed instead:
+--
+--   >>> pretty_print $ parse subdomain "" "www.www.example.com"
+--   www
+--
+--   But not one with a repeated but non-neighboring label:
+--
+--   >>> pretty_print $ parse subdomain "" "www.example.www.com"
+--   www.example.www.com
+--
  subdomain :: Parser Subdomain
  subdomain = try both <|> just_one
    where
      both :: Parser Subdomain
      both = do
        l <- label
-      char '.'
+      _ <- char '.'
        s <- subdomain
-      return (SubdomainMultipleLabel l s)
+      let result = SubdomainMultipleLabel l s
+      if (subdomain_has_equal_neighbors result)
+      then fail "subdomain cannot have equal neighboring labels"
+      else return result
  
      just_one :: Parser Subdomain
      just_one = fmap SubdomainSingleLabel label
  
  
  
+-- | Retrieve a list of labels contained in a 'Subdomain'.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r) = parse subdomain "" "a"
+--   >>> pretty_print $ subdomain_labels r
+--   ["a"]
+--
+--   >>> let (Right r) = parse subdomain "" "example.com"
+--   >>> pretty_print $ subdomain_labels r
+--   ["example","com"]
+--
+--   >>> let (Right r) = parse subdomain "" "www.example.com"
+--   >>> pretty_print $ subdomain_labels r
+--   ["www","example","com"]
+--
+subdomain_labels :: Subdomain -> [Label]
+subdomain_labels (SubdomainSingleLabel l) = [l]
+subdomain_labels (SubdomainMultipleLabel l s) = l : (subdomain_labels s)
+
+
+-- | Return a list of pairs of neighboring labels in a subdomain.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--   >>> let (Right r) = parse subdomain "" "www.example.com"
+--   >>> pretty_print $ subdomain_label_neighbors r
+--   ["(\"www\",\"example\")","(\"example\",\"com\")"]
+--
+subdomain_label_neighbors :: Subdomain -> [(Label,Label)]
+subdomain_label_neighbors s =
+  zip ls (tail ls)
+  where
+    ls = subdomain_labels s
+
+
+-- | Return @True@ if the subdomain has any two equal neighboring
+--   labels, and @False@ otherwise.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r) = parse subdomain "" "www.example.com"
+--   >>> subdomain_has_equal_neighbors r
+--   False
+--
+--   >>> let (Right l) = parse label "" "www"
+--   >>> let (Right s) = parse subdomain "" "www.example.com"
+--   >>> let bad_subdomain = SubdomainMultipleLabel l s
+--   >>> subdomain_has_equal_neighbors bad_subdomain
+--   True
+--
+subdomain_has_equal_neighbors :: Subdomain -> Bool
+subdomain_has_equal_neighbors s =
+  or [ x == y | (x,y) <- subdomain_label_neighbors s ]
+
+
+
  -- * Domains
  
  -- | An RFC1035 domain. According to RFC1035 a domain can be either a
@@ -535,7 +650,7 @@ instance Pretty Domain where
  --
  --   ==== _Examples_
  --
---   >>> import Text.Parsec ( parseTest )
+--   >>> import Text.Parsec ( parse, parseTest )
  --
  --   Make sure we can parse a single character:
  --
@@ -565,10 +680,39 @@ instance Pretty Domain where
  --   >>> parseTest domain "."
  --   DomainRoot
  --
+--   Anything over 255 characters is an error, so the root will be
+--   parsed:
+--
+--   >>> let big_l1 = replicate 63 'x'
+--   >>> let big_l2 = replicate 63 'y' -- Avoid equal neighboring labels!
+--   >>> let big_labels = big_l1 ++ "." ++ big_l2 ++ "."
+--   >>> let big_subdomain = concat $ replicate 3 big_labels
+--   >>> parseTest domain big_subdomain
+--   DomainRoot
+--
+--   But exactly 255 is allowed:
+--
+--   >>> import Data.List ( intercalate )
+--   >>> let l1 = replicate 63 'w'
+--   >>> let l2 = replicate 63 'x'
+--   >>> let l3 = replicate 63 'y'
+--   >>> let l4 = replicate 63 'z'
+--   >>> let big_subdomain = intercalate "." [l1,l2,l3,l4]
+--   >>> let (Right r) = parse domain "" big_subdomain
+--   >>> length (pretty_show r)
+--   255
+--
  domain :: Parser Domain
  domain = try parse_subdomain <|> parse_empty
    where
-    parse_subdomain = fmap DomainName subdomain
+    parse_subdomain :: Parser Domain
+    parse_subdomain = do
+      s <- subdomain
+      if length (pretty_show s) <= 255
+      then return $ DomainName s
+      else fail "subdomains can be at most 255 characters"
+
+    parse_empty :: Parser Domain
      parse_empty = string "" >> return DomainRoot
  
  
@@ -605,7 +749,7 @@ instance Pretty UserDomain where
  --
  --   ==== _Examples_
  --
---   >>> import Text.Parsec ( parseTest )
+--   >>> import Text.Parsec ( parse, parseTest )
  --
  --   We can really parse the root now!
  --
@@ -638,7 +782,7 @@ user_domain = try absolute <|> relative
      absolute :: Parser UserDomain
      absolute = do
        d <- domain
-      r <- char '.'
+      _ <- char '.'
        return $ UserDomainAbsolute d
  
      relative :: Parser UserDomain