Finish moving all of the DNS name components under Network.DNS.RBL.Domain.

author Michael Orlitzky <michael@orlitzky.com>

Mon, 13 Jul 2015 19:42:50 +0000 (15:42 -0400)

committer Michael Orlitzky <michael@orlitzky.com>

Mon, 13 Jul 2015 19:42:50 +0000 (15:42 -0400)
author Michael Orlitzky <michael@orlitzky.com>
Mon, 13 Jul 2015 19:42:50 +0000 (15:42 -0400)
committer Michael Orlitzky <michael@orlitzky.com>
Mon, 13 Jul 2015 19:42:50 +0000 (15:42 -0400)
diff --git a/harbl.cabal b/harbl.cabal

index 1753ed322675013f97d33ee3c6fa47b240ef7fc1..91ae36095160a4a53d0d4dc64ce1d0cc00a2dbfa 100644 (file)
--- a/harbl.cabal
+++ b/harbl.cabal
@@ -28,13 +28,16 @@ library
      Network.DNS.RBL.Tests
  
    other-modules:
-    Network.DNS.RBL.Domain
      Network.DNS.RBL.Domain.Digit
+    Network.DNS.RBL.Domain.Domain
      Network.DNS.RBL.Domain.Hyphen
+    Network.DNS.RBL.Domain.Label
      Network.DNS.RBL.Domain.LdhStr
+    Network.DNS.RBL.Domain.LdhStrLetDig
      Network.DNS.RBL.Domain.LetDig
      Network.DNS.RBL.Domain.LetDigHyp
      Network.DNS.RBL.Domain.Letter
+    Network.DNS.RBL.Domain.Subdomain
      Network.DNS.RBL.Host
      Network.DNS.RBL.IPv4Pattern
      Network.DNS.RBL.Pretty
diff --git a/harbl/src/Network/DNS/RBL/Domain.hs b/harbl/src/Network/DNS/RBL/Domain.hs

deleted file mode 100644 (file)

index 7adc1a9..0000000
--- a/harbl/src/Network/DNS/RBL/Domain.hs
+++ /dev/null
@@ -1,536 +0,0 @@
-{-# LANGUAGE DoAndIfThenElse #-}
-
--- | The 'Domain' data type and its parser. A 'Domain' represents a
---   name in the domain name system (DNS) as described by
---   RFC1035. In particular, we enforce the restrictions from Section
---   2.3.1 \"Preferred name syntax\". See for example,
---
---     <https://tools.ietf.org/html/rfc1035#section-2.3.1>
---
---   We basically work with strings and characters everywhere, even
---   though this isn't really correct. The length specifications in
---   the RFCs are all in terms of octets, so really a ByteString.Char8
---   would be more appropriate. With strings, for example, we could
---   have a unicode mumbo jumbo character that takes up two bytes
---   (octets).
---
-module Network.DNS.RBL.Domain (
-  Domain(..),
-  domain )
-where
-
-import Text.Parsec (
-  (<|>),
-  char,
-  optionMaybe,
-  string,
-  try )
-import Text.Parsec.String ( Parser )
-
-import Network.DNS.RBL.Domain.LdhStr (
-  LdhStr(..),
-  ldh_str,
-  ldh_str_length )
-import Network.DNS.RBL.Domain.LetDig ( LetDig, let_dig )
-import Network.DNS.RBL.Domain.LetDigHyp ( LetDigHyp(..) )
-import Network.DNS.RBL.Pretty ( Pretty(..) )
-import Network.DNS.RBL.Reversible ( Reversible(..) )
-
-
--- * Letter/Digit/Hyphen string followed by a trailing Letter/Digit
-
--- | This type isn't explicitly part of the grammar, but it's what
---   shows up in the square brackets of,
---
---     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
---
---   The ldh-str is optional, but if one is present, we must also have
---   a trailing let-dig to prevent the name from ending with a
---   hyphen. This can be represented with a @Maybe LdhStrLetDig@,
---   which is why we're about to define it.
---
-data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
-  deriving (Eq, Show)
-
-instance Pretty LdhStrLetDig where
-  pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
-  pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
-
-
-
--- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
---   as well define the parser for it independently since we gave it
---   its own data type.
---
---   ==== _Examples_
---
---   >>> import Text.Parsec ( parse, parseTest )
---
---   Make sure we can parse a single character:
---
---   >>> parseTest ldh_str_let_dig "a"
---   LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
---
---   And longer strings:
---
---   >>> pretty_print $ parse ldh_str_let_dig "" "ab"
---   ab
---
---   >>> pretty_print $ parse ldh_str_let_dig "" "-b"
---   -b
---
---   >>> parseTest ldh_str_let_dig "b-"
---   parse error at (line 1, column 3):
---   label cannot end with a hyphen
---
-ldh_str_let_dig :: Parser LdhStrLetDig
-ldh_str_let_dig = do
-  -- This will happily eat up the trailing let-dig...
-  full_ldh <- ldh_str
-
-  -- So we have to go back and see what happened.
-  case (backwards full_ldh) of
-
-    -- Fail on a single hyphen.
-    (LdhStrSingleLdh (LetDigHypHyphen _)) ->
-      fail "label cannot end with a hyphen"
-
-    -- Fail for a hyphen followed by other stuff.
-    (LdhStrMultipleLdh (LetDigHypHyphen _) _) ->
-      fail "label cannot end with a hyphen"
-
-    -- Simply return the thing if it's a single non-hyphen.
-    (LdhStrSingleLdh (LetDigHypLetDig ld)) -> return $ LdhStrLetDig Nothing ld
-
-    -- And peel off the last character for a non-hyphen followed by
-    -- other stuff. We wind up reversing things twice, but whatever.
-    (LdhStrMultipleLdh (LetDigHypLetDig ld) init_ldh_rev) ->
-      let init_ldh = backwards init_ldh_rev
-      in return $ LdhStrLetDig (Just init_ldh) ld
-
-
-
--- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
---   the let-dig at the end is always there. And when there's an
---   ldh-str too, we add its length to one.
---
---   ==== _Examples_
---
---   >>> import Text.Parsec ( parse )
---
---   >>> let (Right r) = parse ldh_str_let_dig "" "a"
---   >>> length_ldh_str_let_dig r
---   1
---
---   >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
---   >>> length_ldh_str_let_dig r
---   7
---
-length_ldh_str_let_dig :: LdhStrLetDig -> Int
-length_ldh_str_let_dig (LdhStrLetDig Nothing _) = 1
-length_ldh_str_let_dig (LdhStrLetDig (Just ldhstring) _) =
-  1 + (ldh_str_length ldhstring)
-
-
--- * Labels
-
--- | The label type from the RFC1035 grammar:
---
---     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
---
---   We allow the slightly more general syntax from RFC1123, Section 2.1:
---
---     The syntax of a legal Internet host name was specified in RFC-952
---     [DNS:4].  One aspect of host name syntax is hereby changed: the
---     restriction on the first character is relaxed to allow either a
---     letter or a digit.  Host software MUST support this more liberal
---     syntax.
---
-data Label = Label LetDig (Maybe LdhStrLetDig)
-  deriving (Eq, Show)
-
-instance Pretty Label where
-  pretty_show (Label l Nothing) = pretty_show l
-  pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
-
--- | Parse a 'Label'.
---
---   In addition to the grammar, there's another restriction on
---   labels: their length must be 63 characters or less. Quoting
---   Section 2.3.1, \"Preferred name syntax\", of RFC1035:
---
---     The labels must follow the rules for ARPANET host names.  They
---     must start with a letter, end with a letter or digit, and have
---     as interior characters only letters, digits, and hyphen.  There
---     are also some restrictions on the length.  Labels must be 63
---     characters or less.
---
---   We check this only after we have successfully parsed a label.
---
---   ==== _Examples_
---
---   >>> import Text.Parsec ( parse, parseTest )
---
---   Make sure we can parse a single character:
---
---   >>> parseTest label "a"
---   Label (LetDigLetter (Letter 'a')) Nothing
---
---   And longer strings:
---
---   >>> pretty_print $ parse label "" "abc-def"
---   abc-def
---
---   But not anything ending in a hyphen:
---
---   >>> parseTest label "abc-"
---   parse error at (line 1, column 5):
---   label cannot end with a hyphen
---
---   Or anything over 63 characters:
---
---   >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
---   parse error at (line 1, column 79):
---   labels must be 63 or fewer characters
---
---   However, /exactly/ 63 characters is acceptable:
---
---   >>> pretty_print $ parse label "" (replicate 63 'x')
---   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
---
---   Ensure that a label can begin with a digit:
---
---   >>> pretty_print $ parse label "" "3com"
---   3com
---
-label :: Parser Label
-label = do
-  l <- let_dig -- Guaranteed to be there
-  maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
-  case maybe_s of
-    -- It can only be one character long, from the letter...
-    Nothing -> return $ Label l maybe_s
-
-    -- The letter gives us one character, so we check that the rest is
-    -- less than 62 characters long. But in the error message we need
-    -- to report 63.
-    Just s  -> if (length_ldh_str_let_dig s) <= 62
-              then return $ Label l maybe_s
-              else fail "labels must be 63 or fewer characters"
-
-
-
--- * Subdomains
-
-
--- | The data type representing a \"subdomain\" from RFC1035:
---
---     <subdomain> ::= <label> | <subdomain> "." <label>
---
---   We have reversed the order of the subdomain and label in the
---   second option, however. This is explained in 'subdomain'.
---
-data Subdomain =
-  SubdomainSingleLabel Label |
-  SubdomainMultipleLabel Label Subdomain
-  deriving (Eq, Show)
-
-
-
-instance Pretty Subdomain where
-  pretty_show (SubdomainSingleLabel l) = pretty_show l
-  pretty_show (SubdomainMultipleLabel l s) =
-    (pretty_show l) ++ "." ++ (pretty_show s)
-
-
-instance Reversible Subdomain where
-  -- | Reverse the labels of the given subdomain.
-  --
-  --   ==== _Examples_
-  --
-  --   >>> import Text.Parsec ( parse )
-  --
-  --   Standard usage:
-  --
-  --   >>> let (Right r) = parse subdomain "" "com"
-  --   >>> pretty_print $ backwards r
-  --   com
-  --
-  --   >>> let (Right r) = parse subdomain "" "example.com"
-  --   >>> pretty_print $ backwards r
-  --   com.example
-  --
-  --   >>> let (Right r) = parse subdomain "" "www.example.com"
-  --   >>> pretty_print $ backwards r
-  --   com.example.www
-  --
-  --   >>> let (Right r) = parse subdomain "" "new.www.example.com"
-  --   >>> pretty_print $ backwards r
-  --   com.example.www.new
-  --
-
-  -- It's easy to reverse a single label...
-  backwards s@(SubdomainSingleLabel _) = s
-
-  -- For multiple labels we have two cases. The first is where we have
-  -- exactly two labels, and we just need to swap them.
-  backwards (SubdomainMultipleLabel l (SubdomainSingleLabel m)) =
-    SubdomainMultipleLabel m (SubdomainSingleLabel l)
-
-  -- And now the hard case. See the 'LdhStr' implementation for an
-  -- explanation.
-  --
-  backwards (SubdomainMultipleLabel l s) = build (SubdomainSingleLabel l) s
-    where
-      -- Build up the first Subdomain on the left by peeling off the
-      -- leading elements of the second Subdomain.
-      build :: Subdomain -> Subdomain -> Subdomain
-      build dst (SubdomainSingleLabel final) = SubdomainMultipleLabel final dst
-      build dst (SubdomainMultipleLabel leading rest) =
-        build (SubdomainMultipleLabel leading dst) rest
-
-
-
--- | Parse an RFC1035 \"subdomain\". The given grammar is,
---
---     <subdomain> ::= <label> | <subdomain> "." <label>
---
---   However, we have reversed the order of the subdomain and label to
---   prevent infinite recursion. The second option (subdomain + label)
---   is obviously more specific, we we need to try it first. This
---   presents a problem: we're trying to parse a subdomain in terms of
---   a subdomain! The given grammar represents subdomains how we like
---   to think of them; from right to left. But it's better to parse
---   from left to right, so we pick off the leading label and then
---   recurse into the definition of subdomain.
---
---   According to RFC1034, Section 3.1, two neighboring labels in a
---   DNS name cannot be equal:
---
---     Each node has a label, which is zero to 63 octets in length.  Brother
---     nodes may not have the same label, although the same label can be used
---     for nodes which are not brothers.  One label is reserved, and that is
---     the null (i.e., zero length) label used for the root.
---
---   We enforce this restriction, but the result is usually that we
---   only parse the part of the subdomain leading up to the repeated
---   label.
---
---   ==== _Examples_
---
---   >>> import Text.Parsec ( parse, parseTest )
---
---   Make sure we can parse a single character:
---
---   >>> parseTest subdomain "a"
---   SubdomainSingleLabel (Label (LetDigLetter (Letter 'a')) Nothing)
---
---   >>> pretty_print $ parse subdomain "" "example.com"
---   example.com
---
---   >>> pretty_print $ parse subdomain "" "www.example.com"
---   www.example.com
---
---   We reject a subdomain with equal neighbors, but this leads to
---   only the single first label being parsed instead:
---
---   >>> pretty_print $ parse subdomain "" "www.www.example.com"
---   www
---
---   But not one with a repeated but non-neighboring label:
---
---   >>> pretty_print $ parse subdomain "" "www.example.www.com"
---   www.example.www.com
---
-subdomain :: Parser Subdomain
-subdomain = try both <|> just_one
-  where
-    both :: Parser Subdomain
-    both = do
-      l <- label
-      _ <- char '.'
-      s <- subdomain
-      let result = SubdomainMultipleLabel l s
-      if (subdomain_has_equal_neighbors result)
-      then fail "subdomain cannot have equal neighboring labels"
-      else return result
-
-    just_one :: Parser Subdomain
-    just_one = fmap SubdomainSingleLabel label
-
-
-
--- | Retrieve a list of labels contained in a 'Subdomain'.
---
---   ==== _Examples_
---
---   >>> import Text.Parsec ( parse )
---
---   >>> let (Right r) = parse subdomain "" "a"
---   >>> pretty_print $ subdomain_labels r
---   ["a"]
---
---   >>> let (Right r) = parse subdomain "" "example.com"
---   >>> pretty_print $ subdomain_labels r
---   ["example","com"]
---
---   >>> let (Right r) = parse subdomain "" "www.example.com"
---   >>> pretty_print $ subdomain_labels r
---   ["www","example","com"]
---
-subdomain_labels :: Subdomain -> [Label]
-subdomain_labels (SubdomainSingleLabel l) = [l]
-subdomain_labels (SubdomainMultipleLabel l s) = l : (subdomain_labels s)
-
-
--- | Return a list of pairs of neighboring labels in a subdomain.
---
---   ==== _Examples_
---
---   >>> import Text.Parsec ( parse )
---   >>> let (Right r) = parse subdomain "" "www.example.com"
---   >>> pretty_print $ subdomain_label_neighbors r
---   ["(\"www\",\"example\")","(\"example\",\"com\")"]
---
-subdomain_label_neighbors :: Subdomain -> [(Label,Label)]
-subdomain_label_neighbors s =
-  zip ls (tail ls)
-  where
-    ls = subdomain_labels s
-
-
--- | Return @True@ if the subdomain has any two equal neighboring
---   labels, and @False@ otherwise.
---
---   ==== _Examples_
---
---   >>> import Text.Parsec ( parse )
---
---   >>> let (Right r) = parse subdomain "" "www.example.com"
---   >>> subdomain_has_equal_neighbors r
---   False
---
---   >>> let (Right l) = parse label "" "www"
---   >>> let (Right s) = parse subdomain "" "www.example.com"
---   >>> let bad_subdomain = SubdomainMultipleLabel l s
---   >>> subdomain_has_equal_neighbors bad_subdomain
---   True
---
-subdomain_has_equal_neighbors :: Subdomain -> Bool
-subdomain_has_equal_neighbors s =
-  or [ x == y | (x,y) <- subdomain_label_neighbors s ]
-
-
-
-
--- * Domains
-
--- | An RFC1035 domain. According to RFC1035 a domain can be either a
--- subdomain or \" \", which according to RFC2181
--- <https://tools.ietf.org/html/rfc2181#section-11> means the root:
---
---      The zero length full name is defined as representing the root
---      of the DNS tree, and is typically written and displayed as
---      \".\".
---
---   We let the 'Domain' type remain true to those RFCs, even though
---   they don't support an absolute domain name of e.g. a single dot.
---
-data Domain =
-  DomainName Subdomain |
-  DomainRoot
-  deriving (Eq, Show)
-
-instance Pretty Domain where
-  pretty_show DomainRoot = ""
-  pretty_show (DomainName s) = pretty_show s
-
--- | Parse an RFC1035 \"domain\"
---
---   ==== _Examples_
---
---   >>> import Text.Parsec ( parse, parseTest )
---
---   Make sure we can parse a single character:
---
---   >>> pretty_print $ parse domain "" "a"
---   a
---
---   And the empty domain:
---
---   >>> parseTest domain ""
---   DomainRoot
---
---   We will in fact parse the \"empty\" domain off the front of
---   pretty much anything:
---
---   >>> parseTest domain "!8===D"
---   DomainRoot
---
---   Equality of domains is case-insensitive:
---
---   >>> let (Right r1) = parse domain "" "example.com"
---   >>> let (Right r2) = parse domain "" "ExaMPle.coM"
---   >>> r1 == r2
---   True
---
---   A single dot IS parsed as the root, but the dot isn't consumed:
---
---   >>> parseTest domain "."
---   DomainRoot
---
---   Anything over 255 characters is an error, so the root will be
---   parsed:
---
---   >>> let big_l1 = replicate 63 'x'
---   >>> let big_l2 = replicate 63 'y' -- Avoid equal neighboring labels!
---   >>> let big_labels = big_l1 ++ "." ++ big_l2 ++ "."
---   >>> let big_subdomain = concat $ replicate 3 big_labels
---   >>> parseTest domain big_subdomain
---   DomainRoot
---
---   But exactly 255 is allowed:
---
---   >>> import Data.List ( intercalate )
---   >>> let l1 = replicate 63 'w'
---   >>> let l2 = replicate 63 'x'
---   >>> let l3 = replicate 63 'y'
---   >>> let l4 = replicate 63 'z'
---   >>> let big_subdomain = intercalate "." [l1,l2,l3,l4]
---   >>> let (Right r) = parse domain "" big_subdomain
---   >>> length (pretty_show r)
---   255
---
-domain :: Parser Domain
-domain = try parse_subdomain <|> parse_empty
-  where
-    parse_subdomain :: Parser Domain
-    parse_subdomain = do
-      s <- subdomain
-      if length (pretty_show s) <= 255
-      then return $ DomainName s
-      else fail "subdomains can be at most 255 characters"
-
-    parse_empty :: Parser Domain
-    parse_empty = string "" >> return DomainRoot
-
-
-instance Reversible Domain where
-  -- | Reverse the labels of a 'Domain'.
-  --
-  --   --   ==== _Examples_
-  --
-  --   >>> import Text.Parsec ( parse )
-  --
-  --   The root reverses to itself:
-  --
-  --   >>> let (Right r) = parse domain "" ""
-  --   >>> backwards r
-  --   DomainRoot
-  --
-  --   >>> let (Right r) = parse domain "" "new.www.example.com"
-  --   >>> pretty_print $ backwards r
-  --   com.example.www.new
-  --
-  backwards DomainRoot = DomainRoot
-  backwards (DomainName s) = DomainName $ backwards s
diff --git a/harbl/src/Network/DNS/RBL/Domain/Domain.hs b/harbl/src/Network/DNS/RBL/Domain/Domain.hs

new file mode 100644 (file)

index 0000000..1049e88
--- /dev/null
+++ b/harbl/src/Network/DNS/RBL/Domain/Domain.hs
@@ -0,0 +1,180 @@
+{-# LANGUAGE DoAndIfThenElse #-}
+
+-- | The 'Domain' data type and its parser. A 'Domain' represents a
+--   name in the domain name system (DNS) as described by
+--   RFC1035. In particular, we enforce the restrictions from Section
+--   2.3.1 \"Preferred name syntax\". See for example,
+--
+--     <https://tools.ietf.org/html/rfc1035#section-2.3.1>
+--
+--   We basically work with strings and characters everywhere, even
+--   though this isn't really correct. The length specifications in
+--   the RFCs are all in terms of octets, so really a ByteString.Char8
+--   would be more appropriate. With strings, for example, we could
+--   have a unicode mumbo jumbo character that takes up two bytes
+--   (octets).
+--
+module Network.DNS.RBL.Domain.Domain (
+  Domain(..),
+  domain )
+where
+
+import Text.Parsec (
+  (<|>),
+  string,
+  try )
+import Text.Parsec.String ( Parser )
+
+import Network.DNS.RBL.Domain.Subdomain ( Subdomain, subdomain )
+import Network.DNS.RBL.Pretty ( Pretty(..) )
+import Network.DNS.RBL.Reversible ( Reversible(..) )
+
+
+
+-- | An RFC1035 domain. According to RFC1035 a domain can be either a
+--   subdomain or \" \", which according to RFC2181
+--   <https://tools.ietf.org/html/rfc2181#section-11> means the root:
+--
+--      The zero length full name is defined as representing the root
+--      of the DNS tree, and is typically written and displayed as
+--      \".\".
+--
+--   We let the 'Domain' type remain true to those RFCs, even though
+--   they don't support an absolute domain name of e.g. a single dot.
+--
+--   ==== _Examples_
+--
+--   >>> DomainRoot
+--   DomainRoot
+--
+--   >>> import Text.Parsec ( parse )
+--   >>> let s = parse subdomain "" "x"
+--   >>> DomainName s
+--   DomainName (SubdomainSingleLabel (Label (LetDigLetter (Letter 'x')) Nothing))
+--
+data Domain =
+  DomainName Subdomain |
+  DomainRoot
+  deriving (Eq, Show)
+
+
+-- | Pretty-print a 'Domain'.
+--
+--   ==== _Examples_
+--
+--   >>> pretty_show $ DomainRoot
+--   ""
+--
+--   >>> import Text.Parsec ( parse )
+--   >>> let s = parse subdomain "" "x"
+--   >>> pretty_print $ DomainName s
+--   x
+--
+instance Pretty Domain where
+  pretty_show DomainRoot = ""
+  pretty_show (DomainName s) = pretty_show s
+
+
+-- | The maximum number of characters (octets, really) allowed in a
+--   label. Quoting Section 3.1, \"Name space definitions\", of
+--   RFC1035:
+--
+--   To simplify implementations, the total length of a domain name
+--   (i.e., label octets and label length octets) is restricted to 255
+--   octets or less.
+--
+domain_max_length :: Int
+domain_max_length = 255
+
+
+-- | Parse an RFC1035 \"domain\"
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse, parseTest )
+--
+--   Make sure we can parse a single character:
+--
+--   >>> pretty_print $ parse domain "" "a"
+--   a
+--
+--   And the empty domain:
+--
+--   >>> parseTest domain ""
+--   DomainRoot
+--
+--   We will in fact parse the \"empty\" domain off the front of
+--   pretty much anything:
+--
+--   >>> parseTest domain "!8===D"
+--   DomainRoot
+--
+--   Equality of domains is case-insensitive:
+--
+--   >>> let (Right r1) = parse domain "" "example.com"
+--   >>> let (Right r2) = parse domain "" "ExaMPle.coM"
+--   >>> r1 == r2
+--   True
+--
+--   A single dot IS parsed as the root, but the dot isn't consumed:
+--
+--   >>> parseTest domain "."
+--   DomainRoot
+--
+--   Anything over domain_max_length characters is an error, so the
+--   root will be parsed:
+--
+--   >>> let big_l1 = replicate 63 'x'
+--   >>> let big_l2 = replicate 63 'y' -- Avoid equal neighboring labels!
+--   >>> let big_labels = big_l1 ++ "." ++ big_l2 ++ "."
+--   >>> let big_subdomain = concat $ replicate 3 big_labels
+--   >>> parseTest domain big_subdomain
+--   DomainRoot
+--
+--   But exactly domain_max_length is allowed:
+--
+--   >>> import Data.List ( intercalate )
+--   >>> let l1 = replicate 63 'w'
+--   >>> let l2 = replicate 63 'x'
+--   >>> let l3 = replicate 63 'y'
+--   >>> let l4 = replicate 63 'z'
+--   >>> let big_subdomain = intercalate "." [l1,l2,l3,l4]
+--   >>> let (Right r) = parse domain "" big_subdomain
+--   >>> length (pretty_show r)
+--   255
+--
+domain :: Parser Domain
+domain = try parse_subdomain <|> parse_empty
+  where
+    parse_subdomain :: Parser Domain
+    parse_subdomain = do
+      s <- subdomain
+      if length (pretty_show s) <= domain_max_length
+      then return $ DomainName s
+      else fail $ "subdomains can be at most " ++
+                  (show domain_max_length) ++
+                  " characters"
+
+    parse_empty :: Parser Domain
+    parse_empty = string "" >> return DomainRoot
+
+
+instance Reversible Domain where
+  -- | Reverse the labels of a 'Domain'.
+  --
+  --   --   ==== _Examples_
+  --
+  --   >>> import Text.Parsec ( parse )
+  --
+  --   The root reverses to itself:
+  --
+  --   >>> let (Right r) = parse domain "" ""
+  --   >>> backwards r
+  --   DomainRoot
+  --
+  --   >>> let (Right r) = parse domain "" "new.www.example.com"
+  --   >>> pretty_print $ backwards r
+  --   com.example.www.new
+  --
+  backwards DomainRoot = DomainRoot
+  backwards (DomainName s) = DomainName $ backwards s
diff --git a/harbl/src/Network/DNS/RBL/Domain/Label.hs b/harbl/src/Network/DNS/RBL/Domain/Label.hs

new file mode 100644 (file)

index 0000000..fb98cd8
--- /dev/null
+++ b/harbl/src/Network/DNS/RBL/Domain/Label.hs
@@ -0,0 +1,145 @@
+-- | This module contains the 'Letter' type and a Parsec parser to
+--   parse one. We don't export its constructor because then you could
+--   do something dumb like stick a period inside one.
+--
+--   These are defined in RFC1035, Section 2.3.1, \"Preferred name
+--   syntax\" <https://tools.ietf.org/html/rfc1035#section-2.3.1>:
+--
+--     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
+--
+--   However, we allow the slightly more general syntax from RFC1123,
+--   Section 2.1 <https://tools.ietf.org/html/rfc1123#section-2.1>:
+--
+--     The syntax of a legal Internet host name was specified in RFC-952
+--     [DNS:4].  One aspect of host name syntax is hereby changed: the
+--     restriction on the first character is relaxed to allow either a
+--     letter or a digit.  Host software MUST support this more liberal
+--     syntax.
+--
+module Network.DNS.RBL.Domain.Label (
+  Label,
+  label )
+where
+
+import Text.Parsec ( optionMaybe )
+import Text.Parsec.String ( Parser )
+
+import Network.DNS.RBL.Domain.LdhStrLetDig (
+  LdhStrLetDig,
+  ldh_str_let_dig,
+  ldh_str_let_dig_length )
+import Network.DNS.RBL.Domain.LetDig ( LetDig, let_dig )
+import Network.DNS.RBL.Pretty ( Pretty(..) )
+
+-- | The label type from the RFC1035 and RFC1123 grammars.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r)  = parse let_dig "" "x"
+--   >>> r
+--   LetDigLetter (Letter 'x')
+--   >>> Label r Nothing
+--   Label (LetDigLetter (Letter 'x')) Nothing
+--
+data Label = Label LetDig (Maybe LdhStrLetDig)
+  deriving (Eq, Show)
+
+-- | Pretty-print a 'Label'. Should give you back a string that can be
+--   parsed as a 'Label'.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r) = parse label "" "www"
+--   >>> pretty_print r
+--   www
+--
+--   >>> let (Right r) = parse label "" "example"
+--   >>> pretty_print r
+--   example
+--
+--   >>> let (Right r) = parse label "" "com"
+--   >>> pretty_print r
+--   com
+--
+instance Pretty Label where
+  pretty_show (Label l Nothing) = pretty_show l
+  pretty_show (Label l (Just s)) = (pretty_show l) ++ (pretty_show s)
+
+
+-- | The maximum number of characters (octets, really) allowed in a
+--   label. Quoting Section 2.3.1, \"Preferred name syntax\", of
+--   RFC1035:
+--
+--     The labels must follow the rules for ARPANET host names.  They
+--     must start with a letter, end with a letter or digit, and have
+--     as interior characters only letters, digits, and hyphen.  There
+--     are also some restrictions on the length.  Labels must be 63
+--     characters or less.
+--
+label_max_length :: Int
+label_max_length = 63
+
+
+-- | Parse a 'Label'.
+--
+--   In addition to the grammar, there's another restriction on
+--   labels: their length must be 'label_max_length' characters or
+--   less. We check this only after we have successfully parsed a
+--   label.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse, parseTest )
+--
+--   Make sure we can parse a single character:
+--
+--   >>> parseTest label "a"
+--   Label (LetDigLetter (Letter 'a')) Nothing
+--
+--   And longer strings:
+--
+--   >>> pretty_print $ parse label "" "abc-def"
+--   abc-def
+--
+--   But not anything ending in a hyphen:
+--
+--   >>> parseTest label "abc-"
+--   parse error at (line 1, column 5):
+--   label cannot end with a hyphen
+--
+--   Or anything over 'label_max_length' characters:
+--
+--   >>> parseTest label (['a'..'z'] ++ ['a'..'z'] ++ ['a'..'z'])
+--   parse error at (line 1, column 79):
+--   labels must be 63 or fewer characters
+--
+--   However, /exactly/ 'label_max_length' characters is acceptable:
+--
+--   >>> pretty_print $ parse label "" (replicate label_max_length 'x')
+--   xxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxxx
+--
+--   Ensure that a label can begin with a digit:
+--
+--   >>> pretty_print $ parse label "" "3com"
+--   3com
+--
+label :: Parser Label
+label = do
+  l <- let_dig -- Guaranteed to be there
+  maybe_s <- optionMaybe ldh_str_let_dig -- Might not be there
+  case maybe_s of
+    -- It can only be one character long, from the letter...
+    Nothing -> return $ Label l maybe_s
+
+    -- The letter gives us one character, so we check that the rest is
+    -- less than (label_max_length - 1) characters long. But in the
+    -- error message we need to report label_max_length.
+    Just s  -> if (ldh_str_let_dig_length s) <= (label_max_length - 1)
+              then return $ Label l maybe_s
+              else fail $ "labels must be " ++
+                          (show label_max_length) ++
+                          " or fewer characters"
diff --git a/harbl/src/Network/DNS/RBL/Domain/LdhStrLetDig.hs b/harbl/src/Network/DNS/RBL/Domain/LdhStrLetDig.hs

new file mode 100644 (file)

index 0000000..50c8b97
--- /dev/null
+++ b/harbl/src/Network/DNS/RBL/Domain/LdhStrLetDig.hs
@@ -0,0 +1,127 @@
+-- | This module contains the 'LdhStrLetDig' type a Parsec parser to
+--   parse one.  This type isn't part of the RFC grammar, but it's
+--   used implicitly. In RFC1035, Section 2.3.1, \"Preferred name
+--   syntax\" <https://tools.ietf.org/html/rfc1035#section-2.3.1>:
+--
+--     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
+--
+--   The stuff in square brackets can be thought of as an 'LdhStrLetDig'.
+--
+module Network.DNS.RBL.Domain.LdhStrLetDig (
+  LdhStrLetDig,
+  ldh_str_let_dig,
+  ldh_str_let_dig_length )
+where
+
+import Text.Parsec.String ( Parser )
+
+import Network.DNS.RBL.Domain.LdhStr ( LdhStr(..), ldh_str, ldh_str_length )
+import Network.DNS.RBL.Domain.LetDig ( LetDig )
+import Network.DNS.RBL.Domain.LetDigHyp ( LetDigHyp(..) )
+import Network.DNS.RBL.Pretty ( Pretty(..) )
+import Network.DNS.RBL.Reversible ( Reversible(..) )
+
+
+-- | A type representing a Letter/Digit/Hyphen string followed by a
+--   trailing Letter/Digit. This type isn't explicitly part of the
+--   grammar, but it's what shows up in the square brackets of,
+--
+--     <label> ::= <letter> [ [ <ldh-str> ] <let-dig> ]
+--
+--   The ldh-str is optional, but if one is present, we must also have
+--   a trailing let-dig to prevent the name from ending with a
+--   hyphen. This can be represented with a 'LdhStrLetDig',
+--   which is why we're about to define it.
+--
+data LdhStrLetDig = LdhStrLetDig (Maybe LdhStr) LetDig
+  deriving (Eq, Show)
+
+
+-- | Pretty-printing for strings of letters/digits/hyphens (ending
+--   with a letter or a digit) that we've already parsed.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--   >>> let (Right r) = parse ldh_str_let_dig "" "xy-z"
+--   >>> pretty_print $ r
+--   xy-z
+--
+instance Pretty LdhStrLetDig where
+  pretty_show (LdhStrLetDig Nothing ld) = pretty_show ld
+  pretty_show (LdhStrLetDig (Just s) ld) = (pretty_show s) ++ (pretty_show ld)
+
+
+
+-- | Parse an 'LdhStrLetDig'. This isn't in the grammar, but we might
+--   as well define the parser for it independently since we gave it
+--   its own data type.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse, parseTest )
+--
+--   Make sure we can parse a single character:
+--
+--   >>> parseTest ldh_str_let_dig "a"
+--   LdhStrLetDig Nothing (LetDigLetter (Letter 'a'))
+--
+--   And longer strings:
+--
+--   >>> pretty_print $ parse ldh_str_let_dig "" "ab"
+--   ab
+--
+--   >>> pretty_print $ parse ldh_str_let_dig "" "-b"
+--   -b
+--
+--   >>> parseTest ldh_str_let_dig "b-"
+--   parse error at (line 1, column 3):
+--   label cannot end with a hyphen
+--
+ldh_str_let_dig :: Parser LdhStrLetDig
+ldh_str_let_dig = do
+  -- This will happily eat up the trailing let-dig...
+  full_ldh <- ldh_str
+
+  -- So we have to go back and see what happened.
+  case (backwards full_ldh) of
+
+    -- Fail on a single hyphen.
+    (LdhStrSingleLdh (LetDigHypHyphen _)) ->
+      fail "label cannot end with a hyphen"
+
+    -- Fail for a hyphen followed by other stuff.
+    (LdhStrMultipleLdh (LetDigHypHyphen _) _) ->
+      fail "label cannot end with a hyphen"
+
+    -- Simply return the thing if it's a single non-hyphen.
+    (LdhStrSingleLdh (LetDigHypLetDig ld)) -> return $ LdhStrLetDig Nothing ld
+
+    -- And peel off the last character for a non-hyphen followed by
+    -- other stuff. We wind up reversing things twice, but whatever.
+    (LdhStrMultipleLdh (LetDigHypLetDig ld) init_ldh_rev) ->
+      let init_ldh = backwards init_ldh_rev
+      in return $ LdhStrLetDig (Just init_ldh) ld
+
+
+
+-- | Compute the length of a 'LdhStrLetDig'. It's at least one, since
+--   the let-dig at the end is always there. And when there's an
+--   ldh-str too, we add its length to one.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r) = parse ldh_str_let_dig "" "a"
+--   >>> ldh_str_let_dig_length r
+--   1
+--
+--   >>> let (Right r) = parse ldh_str_let_dig "" "abc-def"
+--   >>> ldh_str_let_dig_length r
+--   7
+--
+ldh_str_let_dig_length :: LdhStrLetDig -> Int
+ldh_str_let_dig_length (LdhStrLetDig Nothing _) = 1
+ldh_str_let_dig_length (LdhStrLetDig (Just ldhstring) _) =
+  1 + (ldh_str_length ldhstring)
diff --git a/harbl/src/Network/DNS/RBL/Domain/Letter.hs b/harbl/src/Network/DNS/RBL/Domain/Letter.hs

index bd26786066599a7c291bd5675492a95147873339..d19d717c0999468bdf23aa103ab12372abdda66e 100644 (file)
--- a/harbl/src/Network/DNS/RBL/Domain/Letter.hs
+++ b/harbl/src/Network/DNS/RBL/Domain/Letter.hs
@@ -20,7 +20,6 @@ import Text.Parsec.String ( Parser )
  
  import Network.DNS.RBL.Pretty ( Pretty(..) )
  
--- * Letters
  
  -- | A wrapper around a letter character.
  --
diff --git a/harbl/src/Network/DNS/RBL/Domain/Subdomain.hs b/harbl/src/Network/DNS/RBL/Domain/Subdomain.hs

new file mode 100644 (file)

index 0000000..af2102c
--- /dev/null
+++ b/harbl/src/Network/DNS/RBL/Domain/Subdomain.hs
@@ -0,0 +1,230 @@
+{-# LANGUAGE DoAndIfThenElse #-}
+
+-- | This module contains the 'Subdomain' type and a Parsec parser to
+--   parse one. We don't export its constructor because then you could
+--   do something dumb like stick a period inside one.
+--
+--   These are defined in RFC1035, Section 2.3.1, \"Preferred name
+--   syntax\" <https://tools.ietf.org/html/rfc1035#section-2.3.1>:
+--
+--     <subdomain> ::= <label> | <subdomain> "." <label>
+--
+module Network.DNS.RBL.Domain.Subdomain (
+  Subdomain,
+  subdomain )
+where
+
+import Text.Parsec ( (<|>), char, try )
+import Text.Parsec.String ( Parser )
+
+import Network.DNS.RBL.Domain.Label ( Label, label )
+import Network.DNS.RBL.Pretty ( Pretty(..) )
+import Network.DNS.RBL.Reversible ( Reversible(..) )
+
+
+-- | The data type representing a \"subdomain\" from RFC1035. We have
+--   reversed the order of the subdomain and label in the second
+--   option (from the grammar), however. This is explained in the
+--   'subdomain' parser.
+--
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r) = parse label "" "x"
+--   >>> SubdomainSingleLabel r
+--   SubdomainSingleLabel (Label (LetDigLetter (Letter 'x')) Nothing)
+--
+data Subdomain =
+  SubdomainSingleLabel Label |
+  SubdomainMultipleLabel Label Subdomain
+  deriving (Eq, Show)
+
+
+-- | Pretty-print a 'Subdomain'. We print additional \".\" characters
+--   between our labels when there are more than one of them.
+--
+instance Pretty Subdomain where
+  pretty_show (SubdomainSingleLabel l) = pretty_show l
+  pretty_show (SubdomainMultipleLabel l s) =
+    (pretty_show l) ++ "." ++ (pretty_show s)
+
+
+instance Reversible Subdomain where
+  -- | Reverse the labels of the given subdomain.
+  --
+  --   ==== _Examples_
+  --
+  --   >>> import Text.Parsec ( parse )
+  --
+  --   Standard usage:
+  --
+  --   >>> let (Right r) = parse subdomain "" "com"
+  --   >>> pretty_print $ backwards r
+  --   com
+  --
+  --   >>> let (Right r) = parse subdomain "" "example.com"
+  --   >>> pretty_print $ backwards r
+  --   com.example
+  --
+  --   >>> let (Right r) = parse subdomain "" "www.example.com"
+  --   >>> pretty_print $ backwards r
+  --   com.example.www
+  --
+  --   >>> let (Right r) = parse subdomain "" "new.www.example.com"
+  --   >>> pretty_print $ backwards r
+  --   com.example.www.new
+  --
+
+  -- It's easy to reverse a single label...
+  backwards s@(SubdomainSingleLabel _) = s
+
+  -- For multiple labels we have two cases. The first is where we have
+  -- exactly two labels, and we just need to swap them.
+  backwards (SubdomainMultipleLabel l (SubdomainSingleLabel m)) =
+    SubdomainMultipleLabel m (SubdomainSingleLabel l)
+
+  -- And now the hard case. See the 'LdhStr' implementation for an
+  -- explanation.
+  --
+  backwards (SubdomainMultipleLabel l s) = build (SubdomainSingleLabel l) s
+    where
+      -- Build up the first Subdomain on the left by peeling off the
+      -- leading elements of the second Subdomain.
+      build :: Subdomain -> Subdomain -> Subdomain
+      build dst (SubdomainSingleLabel final) = SubdomainMultipleLabel final dst
+      build dst (SubdomainMultipleLabel leading rest) =
+        build (SubdomainMultipleLabel leading dst) rest
+
+
+
+-- | Parse a 'Subdomain'. Here is the RFC1035 grammar for reference:
+--
+--     <subdomain> ::= <label> | <subdomain> "." <label>
+--
+--   In contrast with the grammar, we have reversed the order of the
+--   subdomain and label to prevent infinite recursion. The second
+--   option (subdomain + label) is obviously more specific, we we need
+--   to try it first. This presents a problem: we're trying to parse a
+--   subdomain in terms of a subdomain! The given grammar represents
+--   subdomains how we like to think of them; from right to left. But
+--   it's better to parse from left to right, so we pick off the
+--   leading label and then recurse into the definition of subdomain.
+--
+--   According to RFC1034 <https://tools.ietf.org/html/rfc1034>
+--   Section 3.1, \"3.1. Name space specifications and terminology\"
+--   two neighboring labels in a DNS name cannot be equal:
+--
+--     Each node has a label, which is zero to 63 octets in length.  Brother
+--     nodes may not have the same label, although the same label can be used
+--     for nodes which are not brothers.  One label is reserved, and that is
+--     the null (i.e., zero length) label used for the root.
+--
+--   We enforce this restriction, but the result is usually that we
+--   only parse the part of the subdomain leading up to the repeated
+--   label.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse, parseTest )
+--
+--   Make sure we can parse a single character:
+--
+--   >>> parseTest subdomain "a"
+--   SubdomainSingleLabel (Label (LetDigLetter (Letter 'a')) Nothing)
+--
+--   >>> pretty_print $ parse subdomain "" "example.com"
+--   example.com
+--
+--   >>> pretty_print $ parse subdomain "" "www.example.com"
+--   www.example.com
+--
+--   We reject a subdomain with equal neighbors, but this leads to
+--   only the single first label being parsed instead:
+--
+--   >>> pretty_print $ parse subdomain "" "www.www.example.com"
+--   www
+--
+--   But not one with a repeated but non-neighboring label:
+--
+--   >>> pretty_print $ parse subdomain "" "www.example.www.com"
+--   www.example.www.com
+--
+subdomain :: Parser Subdomain
+subdomain = try both <|> just_one
+  where
+    both :: Parser Subdomain
+    both = do
+      l <- label
+      _ <- char '.'
+      s <- subdomain
+      let result = SubdomainMultipleLabel l s
+      if (subdomain_has_equal_neighbors result)
+      then fail "subdomain cannot have equal neighboring labels"
+      else return result
+
+    just_one :: Parser Subdomain
+    just_one = fmap SubdomainSingleLabel label
+
+
+
+-- | Retrieve a list of labels contained in a 'Subdomain'.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r) = parse subdomain "" "a"
+--   >>> pretty_print $ subdomain_labels r
+--   ["a"]
+--
+--   >>> let (Right r) = parse subdomain "" "example.com"
+--   >>> pretty_print $ subdomain_labels r
+--   ["example","com"]
+--
+--   >>> let (Right r) = parse subdomain "" "www.example.com"
+--   >>> pretty_print $ subdomain_labels r
+--   ["www","example","com"]
+--
+subdomain_labels :: Subdomain -> [Label]
+subdomain_labels (SubdomainSingleLabel l) = [l]
+subdomain_labels (SubdomainMultipleLabel l s) = l : (subdomain_labels s)
+
+
+-- | Return a list of pairs of neighboring labels in a subdomain.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--   >>> let (Right r) = parse subdomain "" "www.example.com"
+--   >>> pretty_print $ subdomain_label_neighbors r
+--   ["(\"www\",\"example\")","(\"example\",\"com\")"]
+--
+subdomain_label_neighbors :: Subdomain -> [(Label,Label)]
+subdomain_label_neighbors s =
+  zip ls (tail ls)
+  where
+    ls = subdomain_labels s
+
+
+-- | Return @True@ if the subdomain has any two equal neighboring
+--   labels, and @False@ otherwise.
+--
+--   ==== _Examples_
+--
+--   >>> import Text.Parsec ( parse )
+--
+--   >>> let (Right r) = parse subdomain "" "www.example.com"
+--   >>> subdomain_has_equal_neighbors r
+--   False
+--
+--   >>> let (Right l) = parse label "" "www"
+--   >>> let (Right s) = parse subdomain "" "www.example.com"
+--   >>> let bad_subdomain = SubdomainMultipleLabel l s
+--   >>> subdomain_has_equal_neighbors bad_subdomain
+--   True
+--
+subdomain_has_equal_neighbors :: Subdomain -> Bool
+subdomain_has_equal_neighbors s =
+  or [ x == y | (x,y) <- subdomain_label_neighbors s ]
diff --git a/harbl/src/Network/DNS/RBL/Host.hs b/harbl/src/Network/DNS/RBL/Host.hs

index 8d9fab99562ca9d43d40ee0391ba62dfa33ef7b7..0b93e04cef58a07c98af109faee7b194a46641cb 100644 (file)
--- a/harbl/src/Network/DNS/RBL/Host.hs
+++ b/harbl/src/Network/DNS/RBL/Host.hs
@@ -9,7 +9,7 @@ import Text.Parsec (
    try )
  import Text.Parsec.String ( Parser )
  
-import Network.DNS.RBL.Domain (
+import Network.DNS.RBL.Domain.Domain (
    Domain(..),
    domain )
  import Network.DNS.RBL.Pretty ( Pretty(..) )
author	Michael Orlitzky <michael@orlitzky.com>
	Mon, 13 Jul 2015 19:42:50 +0000 (15:42 -0400)
committer	Michael Orlitzky <michael@orlitzky.com>
	Mon, 13 Jul 2015 19:42:50 +0000 (15:42 -0400)
harbl.cabal		patch \| blob \| history
harbl/src/Network/DNS/RBL/Domain.hs	[deleted file]	patch \| blob \| history
harbl/src/Network/DNS/RBL/Domain/Domain.hs	[new file with mode: 0644]	patch \| blob
harbl/src/Network/DNS/RBL/Domain/Label.hs	[new file with mode: 0644]	patch \| blob
harbl/src/Network/DNS/RBL/Domain/LdhStrLetDig.hs	[new file with mode: 0644]	patch \| blob
harbl/src/Network/DNS/RBL/Domain/Letter.hs		patch \| blob \| history
harbl/src/Network/DNS/RBL/Domain/Subdomain.hs	[new file with mode: 0644]	patch \| blob
harbl/src/Network/DNS/RBL/Host.hs		patch \| blob \| history