X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Fharbl.git;a=blobdiff_plain;f=src%2FDomain.hs;h=87a63d9757f8bb2c6cfda6bac49651101a3153ab;hp=ce20aae25c078f8ceec150a319cd39a4eb38b52d;hb=80b389fd4d76bc8b2cb5dfad0f066fd7a838bdfb;hpb=e1060ef815f35309c7ca0800a345d7c54ce346bd

diff --git a/src/Domain.hs b/src/Domain.hs
index ce20aae..87a63d9 100644
--- a/src/Domain.hs
+++ b/src/Domain.hs
@@ -1,3 +1,5 @@
+{-# LANGUAGE DoAndIfThenElse #-}
+
 -- | The 'Domain' data type and its parser. A 'Domain' represents a
 --   name in the domain name system (DNS) as described by
 --   RFC1035. In particular, we enforce the restrictions from Section
@@ -5,82 +7,122 @@
 --
 --     <https://tools.ietf.org/html/rfc1035#section-2.3.1>
 --
-module Domain
+--   We basically work with strings and characters everywhere, even
+--   though this isn't really correct. The length specifications in
+--   the RFCs are all in terms of octets, so really a ByteString.Char8
+--   would be more appropriate. With strings, for example, we could
+--   have a unicode mumbo jumbo character that takes up two bytes
+--   (octets).
+--
+module Domain (
+  UserDomain,
+  user_domain )
 where
 
 import Data.Char ( toLower )
 import Text.Parsec (
-  ParseError,
   (<|>),
-  alphaNum,
   char,
-  eof,
-  many1,
-  option,
   optionMaybe,
-  parse,
   string,
-  try,
-  unexpected )
+  try )
 import qualified Text.Parsec as Parsec ( digit, letter)
 import Text.Parsec.String ( Parser )
 
-newtype Domain = Domain String deriving Show
+import Pretty ( Pretty(..) )
 
+-- * Digits
 
--- | The derived instance of 'Eq' for domain names is incorrect. All
---   comparisons are currently made case-insensitively. The following is
---   an excerpt from RFC1035:
---
---     2.3.3. Character Case
---
---     For all parts of the DNS that are part of the official
---     protocol, all comparisons between character strings (e.g.,
---     labels, domain names, etc.)  are done in a case-insensitive
---     manner...
---
---   So to compare two DNS names, we compare their lower-case
---   counterparts.
+-- | A wrapper around a digit character.
 --
-instance Eq Domain where
-  (Domain d1) == (Domain d2) =
-    (map toLower d1) == (map toLower d2)
-
-
--- * Digits
 newtype Digit = Digit Char deriving (Eq, Show)
+instance Pretty Digit where pretty_show (Digit d) = [d]
 
+-- | Parse a single digit, but wrap it in our 'Digit' type.
+--
 digit :: Parser Digit
 digit = fmap Digit Parsec.digit
 
 
 -- * Letters
-newtype Letter = Letter Char deriving (Eq, Show)
 
+-- | A wrapper around a letter character.
+--
+newtype Letter = Letter Char deriving (Show)
+instance Pretty Letter where pretty_show (Letter l) = [l]
+
+
+-- | Parse a single letter, but wrap it in our 'Letter' type.
+--
 letter :: Parser Letter
 letter = fmap Letter Parsec.letter
 
+-- | The derived instance of 'Eq' for letters is incorrect. All
+--   comparisons should be made case-insensitively. The following
+--   is an excerpt from RFC1035:
+--
+--     2.3.3. Character Case
+--
+--     For all parts of the DNS that are part of the official
+--     protocol, all comparisons between character strings (e.g.,
+--     labels, domain names, etc.)  are done in a case-insensitive
+--     manner...
+--
+--   Since each part of DNS name is composed of our custom types, it
+--   suffices to munge the equality for 'Letter'. RFC4343
+--   <https://tools.ietf.org/html/rfc4343> clarifies the
+--   case-insensitivity rules, but the fact that we're treating DNS
+--   names as strings makes most of those problems go away (in
+--   exchange for new ones).
+--
+instance Eq Letter where
+  (Letter l1) == (Letter l2) = (toLower l1) == (toLower l2)
 
 -- * Letters/Digits
-data LetDig = LetDigLetter Letter | LetDigDigit Digit deriving (Eq, Show)
 
+-- | A sum type representing either a letter or a digit.
+--
+data LetDig =
+  LetDigLetter Letter |
+  LetDigDigit  Digit
+  deriving (Eq, Show)
+
+instance Pretty LetDig where
+  pretty_show (LetDigLetter l) = pretty_show l
+  pretty_show (LetDigDigit d) = pretty_show d
+
+-- | Parse a letter or a digit and wrap it in our 'LetDig' type.
+--
 let_dig :: Parser LetDig
 let_dig = (fmap LetDigLetter letter) <|> (fmap LetDigDigit digit)
 
 
 -- * Hyphens
+
+-- | A wrapper around a single hyphen character.
+--
 newtype Hyphen = Hyphen Char deriving (Eq, Show)
+instance Pretty Hyphen where pretty_show (Hyphen h) = [h]
 
+-- | Parse a single hyphen and wrap it in our 'Hyphen' type.
+--
 hyphen :: Parser Hyphen
 hyphen = fmap Hyphen (char '-')
 
 
 -- * Letter, Digit, or Hyphen.
 
-data LetDigHyp = LetDigHypLetDig LetDig
-               | LetDigHypHyphen Hyphen
+-- | A sum type representing a letter, digit, or hyphen.
+--
+data LetDigHyp =
+  LetDigHypLetDig LetDig |
+  LetDigHypHyphen Hyphen
   deriving (Eq, Show)
 
+instance Pretty LetDigHyp where
+  pretty_show (LetDigHypLetDig ld) = pretty_show ld
+  pretty_show (LetDigHypHyphen h) = pretty_show h
+
 
 -- | The following is the simplest type in the domain grammar that
 --   isn't already implemented for us.
@@ -129,6 +171,17 @@ let_dig_hyp =
 --   These are represented as either a single instance of a
 --   'LetDigHyp', or a string of them (recursive).
 --
+data LdhStr =
+  LdhStrSingleLdh LetDigHyp |
+  LdhStrMultipleLdh LetDigHyp LdhStr
+  deriving (Eq, Show)
+
+instance Pretty LdhStr where
+  pretty_show (LdhStrSingleLdh ldh) = pretty_show ldh
+  pretty_show (LdhStrMultipleLdh ldh s) = (pretty_show ldh) ++ (pretty_show s)
+
+-- | Parse a string of letters, digits, and hyphens (an 'LdhStr').
+--
 --   ==== _Examples_
 --
 --   >>> import Text.Parsec ( parseTest )
@@ -146,12 +199,10 @@ let_dig_hyp =
 --
 --   As well as strings of them:
 --
---   LdhStr (LetDigHypLetter (Letter 'a')) (LdhStr (LetDigHypDigit (Digit '0')) (LdhStr (LetDigHypHyphen (Hyphen '-')) (LdhStrSingleLdh (LetDigHypLetter (Letter 'b')))))
+--   >>> import Text.Parsec ( parse )
+--   >>> pretty_print $ parse ldh_str "" "a0-b"
+--   a0-b
 --
-data LdhStr =
-  LdhStrSingleLdh LetDigHyp | LdhStrMultipleLdh LetDigHyp LdhStr
-  deriving (Eq, Show)
-
 ldh_str :: Parser LdhStr
 ldh_str = try both <|> just_one
   where
@@ -173,6 +224,8 @@ ldh_str = try both <|> just_one
 --
 --   ==== _Examples_
 --
+--   >>> import Text.Parsec ( parse )
+--
 --   >>> let (Right r) = parse ldh_str "" "a"
 --   >>> last_ldh_str r
 --   LetDigHypLetDig (LetDigLetter (Letter 'a'))
@@ -196,6 +249,8 @@ last_ldh_str (LdhStrMultipleLdh _ x) = last_ldh_str x
 --
 --   ==== _Examples_
 --
+--   >>> import Text.Parsec ( parse )
+--
 --   >>> let (Right r) = parse ldh_str "" "a"
 --   >>> init_ldh_str r
 --   Nothing
@@ -225,6 +280,8 @@ init_ldh_str (LdhStrMultipleLdh h t) =
 --
 --   ==== _Examples_
 --
+--   >>> import Text.Parsec ( parse )
+--
 --   >>> let (Right r) = parse ldh_str "" "a"
 --   >>> length_ldh_str r
 --   1
@@ -252,13 +309,17 @@ length_ldh_str (LdhStrMultipleLdh _ t) = 1 + (length_ldh_str t)
 data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
   deriving (Eq, Show)
 
+instance Pretty LdhStrLetDig where
+  pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
+  pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
+
 -- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
 --   as well define the parser for it independently since we gave it
 --   its own data type.
 --
 --   ==== _Examples_
 --
---   >>> import Text.Parsec ( parseTest )
+--   >>> import Text.Parsec ( parse, parseTest )
 --
 --   Make sure we can parse a single character:
 --
@@ -267,11 +328,11 @@ data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
 --
 --   And longer strings:
 --
---   >>> parseTest ldh_str_let_dig "ab"
---   LdhStrLetDig (Just (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))))) (LetDigLetter (Letter 'b'))
+--   >>> pretty_print $ parse ldh_str_let_dig "" "ab"
+--   ab
 --
---   >>> parseTest ldh_str_let_dig "-b"
---   LdhStrLetDig (Just (LdhStrSingleLdh (LetDigHypHyphen (Hyphen '-')))) (LetDigLetter (Letter 'b'))
+--   >>> pretty_print $ parse ldh_str_let_dig "" "-b"
+--   -b
 --
 --   >>> parseTest ldh_str_let_dig "b-"
 --   parse error at (line 1, column 3):
@@ -309,6 +370,8 @@ ldh_str_let_dig = do
 --
 --   ==== _Examples_
 --
+--   >>> import Text.Parsec ( parse )
+--
 --   >>> let (Right r) = parse ldh_str_let_dig "" "a"
 --   >>> length_ldh_str_let_dig r
 --   1
@@ -329,9 +392,20 @@ length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
 --
 --     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
 --
+--   We allow the slightly more general syntax from RFC1123, Section 2.1:
+--
+--     The syntax of a legal Internet host name was specified in RFC-952
+--     [DNS:4].  One aspect of host name syntax is hereby changed: the
+--     restriction on the first character is relaxed to allow either a
+--     letter or a digit.  Host software MUST support this more liberal
+--     syntax.
+--
 data Label = Label Letter (Maybe LdhStrLetDig)
   deriving (Eq, Show)
 
+instance Pretty Label where
+  pretty_show (Label l Nothing) = pretty_show l
+  pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
 
 -- | Parse a 'Label'.
 --
@@ -349,7 +423,7 @@ data Label = Label Letter (Maybe LdhStrLetDig)
 --
 --   ==== _Examples_
 --
---   >>> import Text.Parsec ( parseTest )
+--   >>> import Text.Parsec ( parse, parseTest )
 --
 --   Make sure we can parse a single character:
 --
@@ -358,8 +432,8 @@ data Label = Label Letter (Maybe LdhStrLetDig)
 --
 --   And longer strings:
 --
---   >>> parseTest label "abc-def"
---   Label (Letter 'a') (Just (LdhStrLetDig (Just (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'b'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'c'))) (LdhStrMultipleLdh (LetDigHypHyphen (Hyphen '-')) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'd'))) (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'e'))))))))) (LetDigLetter (Letter 'f'))))
+--   >>> pretty_print $ parse label "" "abc-def"
+--   abc-def
 --
 --   But not anything ending in a hyphen:
 --
@@ -375,7 +449,8 @@ data Label = Label Letter (Maybe LdhStrLetDig)
 --
 --   However, /exactly/ 63 characters is acceptable:
 --
---   TODO
+--   >>> pretty_print $ parse label "" (replicate 63 'x')
+--   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
 --
 label :: Parser Label
 label = do
@@ -393,8 +468,322 @@ label = do
               else fail "labels must be 63 or fewer characters"
 
 
+
+-- * Subdomains
+
+
+-- | The data type representing a \"subdomain\" from RFC1035:
+--
+--     <subdomain> ::= <label> | <subdomain> "." <label>
+--
+--   We have reversed the order of the subdomain and label in the
+--   second option, however. This is explained in 'subdomain'.
+--
+data Subdomain =
+  SubdomainSingleLabel Label |
+  SubdomainMultipleLabel Label Subdomain
+  deriving (Eq, Show)
+
+
+
+instance Pretty Subdomain where
+  pretty_show (SubdomainSingleLabel l) = pretty_show l
+  pretty_show (SubdomainMultipleLabel l s) =
+    (pretty_show l) ++ "." ++ (pretty_show s)
+
+-- | Parse an RFC1035 \"subdomain\". The given grammar is,
+--
+--     <subdomain> ::= <label> | <subdomain> "." <label>
+--
+--   However, we have reversed the order of the subdomain and label to
+--   prevent infinite recursion. The second option (subdomain + label)
+--   is obviously more specific, we we need to try it first. This
+--   presents a problem: we're trying to parse a subdomain in terms of
+--   a subdomain! The given grammar represents subdomains how we like
+--   to think of them; from right to left. But it's better to parse
+--   from left to right, so we pick off the leading label and then
+--   recurse into the definition of subdomain.
+--
+--   According to RFC1034, Section 3.1, two neighboring labels in a
+--   DNS name cannot be equal:
+--
+--     Each node has a label, which is zero to 63 octets in length.  Brother
+--     nodes may not have the same label, although the same label can be used
+--     for nodes which are not brothers.  One label is reserved, and that is
+--     the null (i.e., zero length) label used for the root.
+--
+--   We enforce this restriction, but the result is usually that we
+--   only parse the part of the subdomain leading up to the repeated
+--   label.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse, parseTest )
+--
+--   Make sure we can parse a single character:
+--
+--   >>> parseTest subdomain "a"
+--   SubdomainSingleLabel (Label (Letter 'a') Nothing)
+--
+--   >>> pretty_print $ parse subdomain "" "example.com"
+--   example.com
+--
+--   >>> pretty_print $ parse subdomain "" "www.example.com"
+--   www.example.com
+--
+--   We reject a subdomain with equal neighbors, but this leads to
+--   only the single first label being parsed instead:
+--
+--   >>> pretty_print $ parse subdomain "" "www.www.example.com"
+--   www
+--
+--   But not one with a repeated but non-neighboring label:
+--
+--   >>> pretty_print $ parse subdomain "" "www.example.www.com"
+--   www.example.www.com
+--
+subdomain :: Parser Subdomain
+subdomain = try both <|> just_one
+  where
+    both :: Parser Subdomain
+    both = do
+      l <- label
+      _ <- char '.'
+      s <- subdomain
+      let result = SubdomainMultipleLabel l s
+      if (subdomain_has_equal_neighbors result)
+      then fail "subdomain cannot have equal neighboring labels"
+      else return result
+
+    just_one :: Parser Subdomain
+    just_one = fmap SubdomainSingleLabel label
+
+
+
+-- | Retrieve a list of labels contained in a 'Subdomain'.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r) = parse subdomain "" "a"
+--   >>> pretty_print $ subdomain_labels r
+--   ["a"]
+--
+--   >>> let (Right r) = parse subdomain "" "example.com"
+--   >>> pretty_print $ subdomain_labels r
+--   ["example","com"]
+--
+--   >>> let (Right r) = parse subdomain "" "www.example.com"
+--   >>> pretty_print $ subdomain_labels r
+--   ["www","example","com"]
+--
+subdomain_labels :: Subdomain -> [Label]
+subdomain_labels (SubdomainSingleLabel l) = [l]
+subdomain_labels (SubdomainMultipleLabel l s) = l : (subdomain_labels s)
+
+
+-- | Return a list of pairs of neighboring labels in a subdomain.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--   >>> let (Right r) = parse subdomain "" "www.example.com"
+--   >>> pretty_print $ subdomain_label_neighbors r
+--   ["(\"www\",\"example\")","(\"example\",\"com\")"]
+--
+subdomain_label_neighbors :: Subdomain -> [(Label,Label)]
+subdomain_label_neighbors s =
+  zip ls (tail ls)
+  where
+    ls = subdomain_labels s
+
+
+-- | Return @True@ if the subdomain has any two equal neighboring
+--   labels, and @False@ otherwise.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r) = parse subdomain "" "www.example.com"
+--   >>> subdomain_has_equal_neighbors r
+--   False
+--
+--   >>> let (Right l) = parse label "" "www"
+--   >>> let (Right s) = parse subdomain "" "www.example.com"
+--   >>> let bad_subdomain = SubdomainMultipleLabel l s
+--   >>> subdomain_has_equal_neighbors bad_subdomain
+--   True
+--
+subdomain_has_equal_neighbors :: Subdomain -> Bool
+subdomain_has_equal_neighbors s =
+  or [ x == y | (x,y) <- subdomain_label_neighbors s ]
+
+
+
+-- * Domains
+
+-- | An RFC1035 domain. According to RFC1035 a domain can be either a
+-- subdomain or \" \", which according to RFC2181
+-- <https://tools.ietf.org/html/rfc2181#section-11> means the root:
+--
+--      The zero length full name is defined as representing the root
+--      of the DNS tree, and is typically written and displayed as
+--      \".\".
+--
+--   We let the 'Domain' type remain true to those RFCs, even though
+--   they don't support an absolute domain name of e.g. a single dot.
+--   We have one more data type 'UserDomain' which handles the possibility
+--   of an absolute path.
+--
+data Domain =
+  DomainName Subdomain |
+  DomainRoot
+  deriving (Eq, Show)
+
+instance Pretty Domain where
+  pretty_show DomainRoot = ""
+  pretty_show (DomainName s) = pretty_show s
+
+-- | Parse an RFC1035 \"domain\"
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse, parseTest )
+--
+--   Make sure we can parse a single character:
+--
+--   >>> parseTest domain "a"
+--   DomainName (SubdomainSingleLabel (Label (Letter 'a') Nothing))
+--
+--   And the empty domain:
+--
+--   >>> parseTest domain ""
+--   DomainRoot
+--
+--   We will in fact parse the \"empty\" domain off the front of
+--   pretty much anything:
+--
+--   >>> parseTest domain "8===D"
+--   DomainRoot
+--
+--   Equality of domains is case-insensitive:
+--
+--   >>> let (Right r1) = parse domain "" "example.com"
+--   >>> let (Right r2) = parse domain "" "ExaMPle.coM"
+--   >>> r1 == r2
+--   True
+--
+--   A single dot IS parsed as the root, but the dot isn't consumed:
+--
+--   >>> parseTest domain "."
+--   DomainRoot
+--
+--   Anything over 255 characters is an error, so the root will be
+--   parsed:
+--
+--   >>> let big_l1 = replicate 63 'x'
+--   >>> let big_l2 = replicate 63 'y' -- Avoid equal neighboring labels!
+--   >>> let big_labels = big_l1 ++ "." ++ big_l2 ++ "."
+--   >>> let big_subdomain = concat $ replicate 3 big_labels
+--   >>> parseTest domain big_subdomain
+--   DomainRoot
+--
+--   But exactly 255 is allowed:
+--
+--   >>> import Data.List ( intercalate )
+--   >>> let l1 = replicate 63 'w'
+--   >>> let l2 = replicate 63 'x'
+--   >>> let l3 = replicate 63 'y'
+--   >>> let l4 = replicate 63 'z'
+--   >>> let big_subdomain = intercalate "." [l1,l2,l3,l4]
+--   >>> let (Right r) = parse domain "" big_subdomain
+--   >>> length (pretty_show r)
+--   255
+--
 domain :: Parser Domain
-domain = undefined
+domain = try parse_subdomain <|> parse_empty
+  where
+    parse_subdomain :: Parser Domain
+    parse_subdomain = do
+      s <- subdomain
+      if length (pretty_show s) <= 255
+      then return $ DomainName s
+      else fail "subdomains can be at most 255 characters"
+
+    parse_empty :: Parser Domain
+    parse_empty = string "" >> return DomainRoot
+
+
+
+-- * User domains
 
---subdomain :: Parser Subdomain
---subdomain = undefined
+-- | This type helps clarify some murkiness in the DNS \"domain\" name
+--   specification. In RFC1034, it is acknowledged that a domain name
+--   input with a trailing \".\" will represent an absolute domain
+--   name (i.e. with respect to the DNS root). However, the grammar in
+--   RFC1035 disallows a trailing dot.
+--
+--   This makes some sense: within the DNS, everything knows its
+--   position in the tree. The relative/absolute distinction only
+--   makes sense on the client side, where a user's resolver might
+--   decide to append some suffix to a relative
+--   request. Unfortunately, that's where we live. So we have to deal
+--   with the possibility of having a trailing dot at the end of any
+--   domain name.
+--
+data UserDomain =
+  UserDomainRelative Domain |
+  UserDomainAbsolute Domain
+  deriving (Eq, Show)
+
+instance Pretty UserDomain where
+  pretty_show (UserDomainRelative d) = pretty_show d
+  pretty_show (UserDomainAbsolute d) = (pretty_show d) ++ "."
+
+
+-- | Parse a 'UserDomain'. This is what we'll be using to read user
+--   input, since it supports both relative and absolute domain names
+--   (unlike the implicitly-absolute 'Domain').
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse, parseTest )
+--
+--   We can really parse the root now!
+--
+--   >>> parseTest user_domain "."
+--   UserDomainAbsolute DomainRoot
+--
+--   But multiple dots aren't (only the first):
+--
+--   >>> pretty_print $ parse user_domain "" ".."
+--   .
+--
+--   We can also optionally have a trailing dot at the end of a
+--   non-empty name:
+--
+--   >>>  pretty_print $ parse user_domain "" "www.example.com"
+--   www.example.com
+--
+--   >>>  pretty_print $ parse user_domain "" "www.example.com."
+--   www.example.com.
+--
+--   A \"relative root\" can also be parsed, letting the user's
+--   resolver deal with it:
+--
+--   >>> parseTest user_domain ""
+--   UserDomainRelative DomainRoot
+--
+user_domain :: Parser UserDomain
+user_domain = try absolute <|> relative
+  where
+    absolute :: Parser UserDomain
+    absolute = do
+      d <- domain
+      _ <- char '.'
+      return $ UserDomainAbsolute d
+
+    relative :: Parser UserDomain
+    relative = fmap UserDomainRelative domain