From c9c873eead8a45dd7df8e8ef1f5861a5aa741d9e Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Tue, 28 May 2013 23:57:26 -0400 Subject: [PATCH] Get things into shape, it actually validates addresses now. --- email-validator.cabal | 14 +++--- src/Main.hs | 104 +++++++++++++++++++++++++++++++----------- 2 files changed, 84 insertions(+), 34 deletions(-) diff --git a/email-validator.cabal b/email-validator.cabal index 6bd5caf..2c504f6 100644 --- a/email-validator.cabal +++ b/email-validator.cabal @@ -11,10 +11,12 @@ build-type: Simple executable email_validator build-depends: base == 4.*, + bytestring == 0.10.*, dns == 0.3.*, HUnit == 1.2.*, + parallel-io == 0.3.*, QuickCheck == 2.6.*, - regex-pcre == 0.94.*, + pcre-light >= 0.4, test-framework == 0.8.*, test-framework-hunit == 0.3.*, test-framework-quickcheck2 == 0.3.*, @@ -37,9 +39,6 @@ executable email_validator -fwarn-incomplete-record-updates -fwarn-monomorphism-restriction -fwarn-unused-do-bind - -funbox-strict-fields - -fexcess-precision - -fno-spec-constr-count -rtsopts -threaded -optc-O3 @@ -58,10 +57,12 @@ test-suite testsuite main-is: TestSuite.hs build-depends: base == 4.*, + bytestring == 0.10.*, dns == 0.3.*, HUnit == 1.2.*, + parallel-io == 0.3.*, QuickCheck == 2.6.*, - regex-pcre == 0.94.*, + pcre-light >= 0.4, test-framework == 0.8.*, test-framework-hunit == 0.3.*, test-framework-quickcheck2 == 0.3.*, @@ -79,9 +80,6 @@ test-suite testsuite -fwarn-incomplete-record-updates -fwarn-monomorphism-restriction -fwarn-unused-do-bind - -funbox-strict-fields - -fexcess-precision - -fno-spec-constr-count -rtsopts -threaded -optc-O3 diff --git a/src/Main.hs b/src/Main.hs index 3e99d0d..bd3cc45 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -3,8 +3,9 @@ module Main where -import Control.Monad (filterM) -import qualified Data.ByteString.UTF8 as BS +import Control.Concurrent.ParallelIO.Global (parallel, stopGlobalPool) +import qualified Data.ByteString as BS +import qualified Data.ByteString.UTF8 as BSU import Network.DNS ( Domain, Resolver, @@ -14,49 +15,100 @@ import Network.DNS ( withResolver) import Network.DNS.Lookup (lookupMX) import System.IO (hGetContents, hFlush, hPutStrLn, stdin, stdout) +import Text.Regex.PCRE.Light (compile, match, utf8) +type Address = BSU.ByteString -type Address = BS.ByteString - +-- | Resolver parameters. We increase the default timeout from 3 to 5 +-- seconds. resolv_conf :: ResolvConf resolv_conf = defaultResolvConf { resolvTimeout = 5 * 1000 * 1000 } +-- | A list of common domains, there's no need to waste MX lookups +-- on these. +common_domains :: [Domain] +common_domains = map BSU.fromString [ "aol.com", + "comcast.net", + "gmail.com", + "msn.com", + "yahoo.com", + "verizon.net" ] + validate_mx :: Resolver -> Domain -> IO Bool -validate_mx resolver domain = do - result <- lookupMX resolver domain - case result of - Nothing -> return False - _ -> return True +validate_mx resolver domain + | domain `elem` common_domains = return True + | otherwise = do + result <- lookupMX resolver domain + case result of + Nothing -> return False + _ -> return True + +-- | Split an address into local/domain parts. +parts :: Address -> (BSU.ByteString, BSU.ByteString) +parts address = bytestring_split address '@' + +-- | Check that the lengths of the local/domain parts are within spec. +validate_length :: Address -> Bool +validate_length address = + (BSU.length localpart <= 64) && (BSU.length domain <= 255) + where + (localpart, domain) = parts address -validate_syntax :: Address -> IO Bool -validate_syntax address = do - return True +-- | Validate an email address against a simple regex. This should +-- catch common addresses but disallows a lot of (legal) weird stuff. +validate_regex :: Address -> Bool +validate_regex address = + case matches of + Nothing -> False + _ -> True + where + regex_str = "(\\w+)([\\w\\-\\.]*)@(([0-9a-zA-Z\\-]+\\.)+)[a-zA-Z]{2,4}" + regex_bs = BSU.fromString regex_str + regex = compile regex_bs [utf8] + matches = match regex address [] -utf8_split = undefined +-- | Validate the syntax of an email address by checking its length +-- and validating it against a simple regex. +validate_syntax :: Address -> Bool +validate_syntax address = + (validate_length address) && (validate_regex address) -validate :: Resolver -> Address -> IO Bool +-- | Split a 'ByteString' @s@ at the first occurrence of @character@. +bytestring_split :: BSU.ByteString -> Char -> (BSU.ByteString, BSU.ByteString) +bytestring_split s character = + (before, after) + where + break_func = (== character) + (before, rest) = BSU.break break_func s + after = BS.tail rest + + +-- | Validate an email address by doing some simple syntax checks and +-- (if those fail) an MX lookup. We don't count an A record as a mail +-- exchanger. +validate :: Resolver -> Address -> IO (Address, Bool) validate resolver address = do - valid_syntax <- validate_syntax address + let valid_syntax = validate_syntax address if valid_syntax then do - let domain = utf8_split address (BS.fromString "@") - validate_mx resolver domain + let (_,domain) = parts address + mx_result <- validate_mx resolver domain + return (address, mx_result) else do - return False - -empty_string :: BS.ByteString -empty_string = BS.fromString "" + return (address, False) main :: IO () main = do input <- hGetContents stdin - let addresses = BS.lines $ BS.fromString input - let nonempty_addresses = filter (/= empty_string) addresses + let addresses = BSU.lines $ BSU.fromString input + let nonempty_addresses = filter (not . BS.null) addresses rs <- makeResolvSeed resolv_conf withResolver rs $ \resolver -> do - good_addresses <- filterM (validate resolver) nonempty_addresses - let good_address_strings = map BS.toString good_addresses - mapM_ (hPutStrLn stdout) good_address_strings + let actions = map (validate resolver) nonempty_addresses + results <- parallel actions + let good_pairs = filter snd results + mapM_ ((hPutStrLn stdout) . BSU.toString . fst) good_pairs + stopGlobalPool hFlush stdout -- 2.44.2