]> gitweb.michael.orlitzky.com - email-validator.git/commitdiff
Get things into shape, it actually validates addresses now.
authorMichael Orlitzky <michael@orlitzky.com>
Wed, 29 May 2013 03:57:26 +0000 (23:57 -0400)
committerMichael Orlitzky <michael@orlitzky.com>
Wed, 29 May 2013 03:57:26 +0000 (23:57 -0400)
email-validator.cabal
src/Main.hs

index 6bd5caf977c1cdbf6630c772acddae44613f6084..2c504f66125e68c208737b41de6f0888c4de7a85 100644 (file)
@@ -11,10 +11,12 @@ build-type:     Simple
 executable email_validator
   build-depends:
     base                        == 4.*,
+    bytestring                  == 0.10.*,
     dns                         == 0.3.*,
     HUnit                       == 1.2.*,
+    parallel-io                 == 0.3.*,
     QuickCheck                  == 2.6.*,
-    regex-pcre                  == 0.94.*,
+    pcre-light                  >= 0.4,
     test-framework              == 0.8.*,
     test-framework-hunit        == 0.3.*,
     test-framework-quickcheck2  == 0.3.*,
@@ -37,9 +39,6 @@ executable email_validator
     -fwarn-incomplete-record-updates
     -fwarn-monomorphism-restriction
     -fwarn-unused-do-bind
-    -funbox-strict-fields
-    -fexcess-precision
-    -fno-spec-constr-count
     -rtsopts
     -threaded
     -optc-O3
@@ -58,10 +57,12 @@ test-suite testsuite
   main-is: TestSuite.hs
   build-depends:
     base                        == 4.*,
+    bytestring                  == 0.10.*,
     dns                         == 0.3.*,
     HUnit                       == 1.2.*,
+    parallel-io                 == 0.3.*,
     QuickCheck                  == 2.6.*,
-    regex-pcre                  == 0.94.*,
+    pcre-light                  >= 0.4,
     test-framework              == 0.8.*,
     test-framework-hunit        == 0.3.*,
     test-framework-quickcheck2  == 0.3.*,
@@ -79,9 +80,6 @@ test-suite testsuite
     -fwarn-incomplete-record-updates
     -fwarn-monomorphism-restriction
     -fwarn-unused-do-bind
-    -funbox-strict-fields
-    -fexcess-precision
-    -fno-spec-constr-count
     -rtsopts
     -threaded
     -optc-O3
index 3e99d0d006ccdcf8ee8e7e105ae4a74636187a95..bd3cc45f5ff741e936243c9de0cf5baa84f0e1ce 100644 (file)
@@ -3,8 +3,9 @@
 module Main
 where
 
-import Control.Monad (filterM)
-import qualified Data.ByteString.UTF8 as BS
+import Control.Concurrent.ParallelIO.Global (parallel, stopGlobalPool)
+import qualified Data.ByteString as BS
+import qualified Data.ByteString.UTF8 as BSU
 import Network.DNS (
   Domain,
   Resolver,
@@ -14,49 +15,100 @@ import Network.DNS (
   withResolver)
 import Network.DNS.Lookup (lookupMX)
 import System.IO (hGetContents, hFlush, hPutStrLn, stdin, stdout)
+import Text.Regex.PCRE.Light (compile, match, utf8)
 
+type Address = BSU.ByteString
 
-type Address = BS.ByteString
-
+-- | Resolver parameters. We increase the default timeout from 3 to 5
+--   seconds.
 resolv_conf ::  ResolvConf
 resolv_conf = defaultResolvConf { resolvTimeout = 5 * 1000 * 1000 }
 
+-- | A list of common domains, there's no need to waste MX lookups
+--   on these.
+common_domains :: [Domain]
+common_domains = map BSU.fromString [ "aol.com",
+                                      "comcast.net",
+                                      "gmail.com",
+                                      "msn.com",
+                                      "yahoo.com",
+                                      "verizon.net" ]
+
 validate_mx :: Resolver -> Domain -> IO Bool
-validate_mx resolver domain = do
-  result <- lookupMX resolver domain
-  case result of
-      Nothing -> return False
-      _       -> return True
+validate_mx resolver domain
+  | domain `elem` common_domains = return True
+  | otherwise = do
+      result <- lookupMX resolver domain
+      case result of
+        Nothing -> return False
+        _       -> return True
+
+-- | Split an address into local/domain parts.
+parts :: Address -> (BSU.ByteString, BSU.ByteString)
+parts address = bytestring_split address '@'
+
+-- | Check that the lengths of the local/domain parts are within spec.
+validate_length :: Address -> Bool
+validate_length address =
+  (BSU.length localpart <= 64) && (BSU.length domain <= 255)
+  where
+    (localpart, domain) = parts address
 
-validate_syntax :: Address -> IO Bool
-validate_syntax address = do
-  return True
+-- | Validate an email address against a simple regex. This should
+--   catch common addresses but disallows a lot of (legal) weird stuff.
+validate_regex :: Address -> Bool
+validate_regex address =
+  case matches of
+    Nothing -> False
+    _       -> True
+  where
+    regex_str = "(\\w+)([\\w\\-\\.]*)@(([0-9a-zA-Z\\-]+\\.)+)[a-zA-Z]{2,4}"
+    regex_bs  = BSU.fromString regex_str
+    regex = compile regex_bs [utf8]
+    matches = match regex address []
 
-utf8_split = undefined
+-- | Validate the syntax of an email address by checking its length
+--   and validating it against a simple regex.
+validate_syntax :: Address -> Bool
+validate_syntax address =
+  (validate_length address) && (validate_regex address)
 
-validate :: Resolver -> Address -> IO Bool
+-- | Split a 'ByteString' @s@ at the first occurrence of @character@.
+bytestring_split :: BSU.ByteString -> Char -> (BSU.ByteString, BSU.ByteString)
+bytestring_split s character =
+  (before, after)
+  where
+    break_func = (== character)
+    (before, rest) = BSU.break break_func s
+    after = BS.tail rest
+
+
+-- | Validate an email address by doing some simple syntax checks and
+--   (if those fail) an MX lookup. We don't count an A record as a mail
+--   exchanger.
+validate :: Resolver -> Address -> IO (Address, Bool)
 validate resolver address = do
-  valid_syntax <- validate_syntax address
+  let valid_syntax = validate_syntax address
   if valid_syntax then do
-    let domain = utf8_split address (BS.fromString "@")
-    validate_mx resolver domain
+    let (_,domain) = parts address
+    mx_result <- validate_mx resolver domain
+    return (address, mx_result)
   else do
-    return False
-
-empty_string :: BS.ByteString
-empty_string = BS.fromString ""
+    return (address, False)
 
 main :: IO ()
 main = do
   input <- hGetContents stdin
 
-  let addresses = BS.lines $ BS.fromString input
-  let nonempty_addresses = filter (/= empty_string) addresses
+  let addresses = BSU.lines $ BSU.fromString input
+  let nonempty_addresses = filter (not . BS.null) addresses
 
   rs <- makeResolvSeed resolv_conf
   withResolver rs $ \resolver -> do
-    good_addresses <- filterM (validate resolver) nonempty_addresses
-    let good_address_strings = map BS.toString good_addresses
-    mapM_ (hPutStrLn stdout) good_address_strings
+    let actions = map (validate resolver) nonempty_addresses
+    results <- parallel actions
+    let good_pairs = filter snd results
+    mapM_ ((hPutStrLn stdout) . BSU.toString . fst) good_pairs
 
+  stopGlobalPool
   hFlush stdout