From: Michael Orlitzky <michael@orlitzky.com>
Date: Mon, 6 Jul 2015 14:39:47 +0000 (-0400)
Subject: Add the Domain module, work in progress.
X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Fharbl.git;a=commitdiff_plain;h=e1060ef815f35309c7ca0800a345d7c54ce346bd

Add the Domain module, work in progress.
Add a TODO.
---

diff --git a/doc/TODO b/doc/TODO
new file mode 100644
index 0000000..20b8bea
--- /dev/null
+++ b/doc/TODO
@@ -0,0 +1,3 @@
+1. Move the Pretty class into its own module.
+2. Implement pretty printing of Domains to clean up the doctests.
+3. Add exactly-63 test to labels.
diff --git a/src/Domain.hs b/src/Domain.hs
new file mode 100644
index 0000000..ce20aae
--- /dev/null
+++ b/src/Domain.hs
@@ -0,0 +1,400 @@
+-- | The 'Domain' data type and its parser. A 'Domain' represents a
+--   name in the domain name system (DNS) as described by
+--   RFC1035. In particular, we enforce the restrictions from Section
+--   2.3.1 \"Preferred name syntax\". See for example,
+--
+--     <https://tools.ietf.org/html/rfc1035#section-2.3.1>
+--
+module Domain
+where
+
+import Data.Char ( toLower )
+import Text.Parsec (
+  ParseError,
+  (<|>),
+  alphaNum,
+  char,
+  eof,
+  many1,
+  option,
+  optionMaybe,
+  parse,
+  string,
+  try,
+  unexpected )
+import qualified Text.Parsec as Parsec ( digit, letter)
+import Text.Parsec.String ( Parser )
+
+newtype Domain = Domain String deriving Show
+
+
+-- | The derived instance of 'Eq' for domain names is incorrect. All
+--   comparisons are currently made case-insensitively. The following is
+--   an excerpt from RFC1035:
+--
+--     2.3.3. Character Case
+--
+--     For all parts of the DNS that are part of the official
+--     protocol, all comparisons between character strings (e.g.,
+--     labels, domain names, etc.)  are done in a case-insensitive
+--     manner...
+--
+--   So to compare two DNS names, we compare their lower-case
+--   counterparts.
+--
+instance Eq Domain where
+  (Domain d1) == (Domain d2) =
+    (map toLower d1) == (map toLower d2)
+
+
+-- * Digits
+newtype Digit = Digit Char deriving (Eq, Show)
+
+digit :: Parser Digit
+digit = fmap Digit Parsec.digit
+
+
+-- * Letters
+newtype Letter = Letter Char deriving (Eq, Show)
+
+letter :: Parser Letter
+letter = fmap Letter Parsec.letter
+
+
+-- * Letters/Digits
+data LetDig = LetDigLetter Letter | LetDigDigit Digit deriving (Eq, Show)
+
+let_dig :: Parser LetDig
+let_dig = (fmap LetDigLetter letter) <|> (fmap LetDigDigit digit)
+
+
+-- * Hyphens
+newtype Hyphen = Hyphen Char deriving (Eq, Show)
+
+hyphen :: Parser Hyphen
+hyphen = fmap Hyphen (char '-')
+
+
+-- * Letter, Digit, or Hyphen.
+
+data LetDigHyp = LetDigHypLetDig LetDig
+               | LetDigHypHyphen Hyphen
+  deriving (Eq, Show)
+
+
+-- | The following is the simplest type in the domain grammar that
+--   isn't already implemented for us.
+--
+--     <let-dig> ::= <letter> | <digit>
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parseTest )
+--
+--   Letters, digits, and hyphens are all parsed:
+--
+--   >>> parseTest let_dig_hyp "a"
+--   LetDigHypLetDig (LetDigLetter (Letter 'a'))
+--
+--   >>> parseTest let_dig_hyp "7"
+--   LetDigHypLetDig (LetDigDigit (Digit '7'))
+--
+--   >>> parseTest let_dig_hyp "-"
+--   LetDigHypHyphen (Hyphen '-')
+--
+--   However, an underscore (for example) is not:
+--
+--   >>> parseTest let_dig_hyp "_"
+--   parse error at (line 1, column 1):
+--   unexpected "_"
+--   expecting letter, digit or "-"
+--
+let_dig_hyp :: Parser LetDigHyp
+let_dig_hyp =
+  parse_letdig <|> parse_hyphen
+  where
+    parse_letdig :: Parser LetDigHyp
+    parse_letdig = fmap LetDigHypLetDig let_dig
+
+    parse_hyphen :: Parser LetDigHyp
+    parse_hyphen = fmap LetDigHypHyphen hyphen
+
+
+-- * Letter/Digit/Hyphen strings
+
+-- | A string of letters, digits, and hyphens from the RFC1035 grammar:
+--
+--     <ldh-str> ::= <let-dig-hyp> | <let-dig-hyp> <ldh-str>
+--
+--   These are represented as either a single instance of a
+--   'LetDigHyp', or a string of them (recursive).
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parseTest )
+--
+--   Single letters, digits, and hyphens are parsed:
+--
+--   >>> parseTest ldh_str "a"
+--   LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a')))
+--
+--   >>> parseTest ldh_str "0"
+--   LdhStrSingleLdh (LetDigHypLetDig (LetDigDigit (Digit '0')))
+--
+--   >>> parseTest ldh_str "-"
+--   LdhStrSingleLdh (LetDigHypHyphen (Hyphen '-'))
+--
+--   As well as strings of them:
+--
+--   LdhStr (LetDigHypLetter (Letter 'a')) (LdhStr (LetDigHypDigit (Digit '0')) (LdhStr (LetDigHypHyphen (Hyphen '-')) (LdhStrSingleLdh (LetDigHypLetter (Letter 'b')))))
+--
+data LdhStr =
+  LdhStrSingleLdh LetDigHyp | LdhStrMultipleLdh LetDigHyp LdhStr
+  deriving (Eq, Show)
+
+ldh_str :: Parser LdhStr
+ldh_str = try both <|> just_one
+  where
+    both :: Parser LdhStr
+    both = do
+      ldh1 <- let_dig_hyp
+      ldh_tail <- ldh_str
+      return $ LdhStrMultipleLdh ldh1 ldh_tail
+
+    just_one :: Parser LdhStr
+    just_one = fmap LdhStrSingleLdh let_dig_hyp
+
+
+
+-- | A version of 'last' that works on a 'LdhStr' rather than a
+--   list. That is, it returns the last 'LetDigHyp' in the
+--   string. Since 'LdhStr' contains at least one character, there's
+--   no \"nil\" case here.
+--
+--   ==== _Examples_
+--
+--   >>> let (Right r) = parse ldh_str "" "a"
+--   >>> last_ldh_str r
+--   LetDigHypLetDig (LetDigLetter (Letter 'a'))
+--
+--   >>> let (Right r) = parse ldh_str "" "abc-def"
+--   >>> last_ldh_str r
+--   LetDigHypLetDig (LetDigLetter (Letter 'f'))
+--
+last_ldh_str :: LdhStr -> LetDigHyp
+last_ldh_str (LdhStrSingleLdh x) = x
+last_ldh_str (LdhStrMultipleLdh _ x) = last_ldh_str x
+
+
+-- | A version of 'init' that works on a 'LdhStr' rather than a
+--   list. That is, it returns everything /except/ the last character in
+--   the string.
+--
+--   Since an 'LdhStr' must contain at least one character, this might
+--   not be opssible (when the input is of length one). So, we return
+--   a 'Maybe' value.
+--
+--   ==== _Examples_
+--
+--   >>> let (Right r) = parse ldh_str "" "a"
+--   >>> init_ldh_str r
+--   Nothing
+--
+--   >>> let (Right r) = parse ldh_str "" "ab"
+--   >>> init_ldh_str r
+--   Just (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))))
+--
+--   >>> let (Right r) = parse ldh_str "" "abc-def"
+--   >>> init_ldh_str r
+--   Just (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'b'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'c'))) (LdhStrMultipleLdh (LetDigHypHyphen (Hyphen '-')) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'd'))) (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'e')))))))))
+--
+init_ldh_str :: LdhStr -> Maybe LdhStr
+init_ldh_str (LdhStrSingleLdh _) = Nothing
+init_ldh_str (LdhStrMultipleLdh h t) =
+  Just $ case (init_ldh_str t) of
+           -- We just got the second-to-last character, we're done.
+           Nothing   -> LdhStrSingleLdh h
+
+           -- There's still more stuff. Recurse.
+           Just rest -> LdhStrMultipleLdh h rest
+
+
+-- | Compute the length of an 'LdhStr'. It will be at least one, since
+--   'LdhStr's are non-empty. And if there's something other than the
+--   first character present, we simply recurse.
+--
+--   ==== _Examples_
+--
+--   >>> let (Right r) = parse ldh_str "" "a"
+--   >>> length_ldh_str r
+--   1
+--
+--   >>> let (Right r) = parse ldh_str "" "abc-def"
+--   >>> length_ldh_str r
+--   7
+--
+length_ldh_str :: LdhStr -> Int
+length_ldh_str (LdhStrSingleLdh _) = 1
+length_ldh_str (LdhStrMultipleLdh _ t) = 1 + (length_ldh_str t)
+
+-- * Letter/Digit/Hyphen string followed by a trailing Letter/Digit
+
+-- | This type isn't explicitly part of the grammar, but it's what
+--   shows up in the square brackets of,
+--
+--     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
+--
+--   The ldh-str is optional, but if one is present, we must also have
+--   a trailing let-dig to prevent the name from ending with a
+--   hyphen. This can be represented with a @Maybe LdhStrLetDig@,
+--   which is why we're about to define it.
+--
+data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
+  deriving (Eq, Show)
+
+-- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
+--   as well define the parser for it independently since we gave it
+--   its own data type.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parseTest )
+--
+--   Make sure we can parse a single character:
+--
+--   >>> parseTest ldh_str_let_dig "a"
+--   LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
+--
+--   And longer strings:
+--
+--   >>> parseTest ldh_str_let_dig "ab"
+--   LdhStrLetDig (Just (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'a'))))) (LetDigLetter (Letter 'b'))
+--
+--   >>> parseTest ldh_str_let_dig "-b"
+--   LdhStrLetDig (Just (LdhStrSingleLdh (LetDigHypHyphen (Hyphen '-')))) (LetDigLetter (Letter 'b'))
+--
+--   >>> parseTest ldh_str_let_dig "b-"
+--   parse error at (line 1, column 3):
+--   label cannot end with a hyphen
+--
+ldh_str_let_dig :: Parser LdhStrLetDig
+ldh_str_let_dig = do
+  -- This will happily eat up the trailing let-dig...
+  full_ldh <- ldh_str
+
+  -- So we have to go back and see what happened.
+  case (last_ldh_str full_ldh) of
+    (LetDigHypHyphen _) -> fail "label cannot end with a hyphen"
+    (LetDigHypLetDig ld) ->
+      -- Ok, the label didn't end with a hyphen; now we need to split
+      -- off the last letter/digit so we can pack it into our return
+      -- type separately.
+      return $ case (init_ldh_str full_ldh) of
+                 -- We only parsed one letter/digit. This can happen
+                 -- if the label contains two characters. For example,
+                 -- if we try to parse the label "ab", then the "a"
+                 -- will be eaten by the label parser, and this
+                 -- function will be left with only "b".
+                 Nothing -> LdhStrLetDig Nothing ld
+
+                 -- Usual case: there's was some leading let-dig-hyp junk,
+                 -- return it too.
+                 leading_ldhs -> LdhStrLetDig leading_ldhs ld
+
+
+
+-- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
+--   the let-dig at the end is always there. And when there's an
+--   ldh-str too, we add its length to one.
+--
+--   ==== _Examples_
+--
+--   >>> let (Right r) = parse ldh_str_let_dig "" "a"
+--   >>> length_ldh_str_let_dig r
+--   1
+--
+--   >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
+--   >>> length_ldh_str_let_dig r
+--   7
+--
+length_ldh_str_let_dig :: LdhStrLetDig -> Int
+length_ldh_str_let_dig (LdhStrLetDig Nothing _) = 1
+length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
+  1 + (length_ldh_str ldhstring)
+
+
+-- * Labels
+
+-- | The label type from the RFC1035 grammar:
+--
+--     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
+--
+data Label = Label Letter (Maybe LdhStrLetDig)
+  deriving (Eq, Show)
+
+
+-- | Parse a 'Label'.
+--
+--   In addition to the grammar, there's another restriction on
+--   labels: their length must be 63 characters or less. Quoting
+--   Section 2.3.1, \"Preferred name syntax\", of RFC1035:
+--
+--     The labels must follow the rules for ARPANET host names.  They
+--     must start with a letter, end with a letter or digit, and have
+--     as interior characters only letters, digits, and hyphen.  There
+--     are also some restrictions on the length.  Labels must be 63
+--     characters or less.
+--
+--   We check this only after we have successfully parsed a label.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parseTest )
+--
+--   Make sure we can parse a single character:
+--
+--   >>> parseTest label "a"
+--   Label (Letter 'a') Nothing
+--
+--   And longer strings:
+--
+--   >>> parseTest label "abc-def"
+--   Label (Letter 'a') (Just (LdhStrLetDig (Just (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'b'))) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'c'))) (LdhStrMultipleLdh (LetDigHypHyphen (Hyphen '-')) (LdhStrMultipleLdh (LetDigHypLetDig (LetDigLetter (Letter 'd'))) (LdhStrSingleLdh (LetDigHypLetDig (LetDigLetter (Letter 'e'))))))))) (LetDigLetter (Letter 'f'))))
+--
+--   But not anything ending in a hyphen:
+--
+--   >>> parseTest label "abc-"
+--   parse error at (line 1, column 5):
+--   label cannot end with a hyphen
+--
+--   Or anything over 63 characters:
+--
+--   >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
+--   parse error at (line 1, column 79):
+--   labels must be 63 or fewer characters
+--
+--   However, /exactly/ 63 characters is acceptable:
+--
+--   TODO
+--
+label :: Parser Label
+label = do
+  l <- letter -- Guaranteed to be there
+  maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
+  case maybe_s of
+    -- It can only be one character long, from the letter...
+    Nothing -> return $ Label l maybe_s
+
+    -- The letter gives us one character, so we check that the rest is
+    -- less than 62 characters long. But in the error message we need
+    -- to report 63.
+    Just s  -> if (length_ldh_str_let_dig s) <= 62
+              then return $ Label l maybe_s
+              else fail "labels must be 63 or fewer characters"
+
+
+domain :: Parser Domain
+domain = undefined
+
+--subdomain :: Parser Subdomain
+--subdomain = undefined