X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2FMain.hs;h=f38a7db4b3f5c1d60a2067c34d18c2af512e29ec;hb=a6d2e7470f17b44c0e8fe31c1268488a6788631f;hp=3e99d0d006ccdcf8ee8e7e105ae4a74636187a95;hpb=9c30c25bb499e843a40062e2bd1e118c515aa159;p=email-validator.git diff --git a/src/Main.hs b/src/Main.hs index 3e99d0d..f38a7db 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -1,10 +1,13 @@ {-# LANGUAGE DoAndIfThenElse #-} +{-# LANGUAGE RecordWildCards #-} module Main where -import Control.Monad (filterM) -import qualified Data.ByteString.UTF8 as BS +import Control.Concurrent.ParallelIO.Global (parallel, stopGlobalPool) +import Control.Monad (when) +import qualified Data.ByteString as BS +import qualified Data.ByteString.UTF8 as BSU import Network.DNS ( Domain, Resolver, @@ -13,50 +16,134 @@ import Network.DNS ( makeResolvSeed, withResolver) import Network.DNS.Lookup (lookupMX) -import System.IO (hGetContents, hFlush, hPutStrLn, stdin, stdout) +import System.Directory (doesFileExist) +import System.Exit (exitWith, ExitCode(..)) +import System.IO ( + Handle, + IOMode( WriteMode ), + hClose, + hFlush, + openFile, + stdin, + stdout) +import Text.Regex.PCRE.Light (compile, match, utf8) +import CommandLine (Args(..), apply_args) +import ExitCodes (exit_input_file_doesnt_exist) -type Address = BS.ByteString +type Address = BSU.ByteString +-- | Resolver parameters. We increase the default timeout from 3 to 5 +-- seconds. resolv_conf :: ResolvConf resolv_conf = defaultResolvConf { resolvTimeout = 5 * 1000 * 1000 } +-- | A list of common domains, there's no need to waste MX lookups +-- on these. +common_domains :: [Domain] +common_domains = map BSU.fromString [ "aol.com", + "comcast.net", + "gmail.com", + "msn.com", + "yahoo.com", + "verizon.net" ] + validate_mx :: Resolver -> Domain -> IO Bool -validate_mx resolver domain = do - result <- lookupMX resolver domain - case result of - Nothing -> return False - _ -> return True +validate_mx resolver domain + | domain `elem` common_domains = return True + | otherwise = do + result <- lookupMX resolver domain + case result of + Nothing -> return False + _ -> return True + +-- | Split an address into local/domain parts. +parts :: Address -> (BSU.ByteString, BSU.ByteString) +parts address = bytestring_split address '@' + +-- | Check that the lengths of the local/domain parts are within spec. +validate_length :: Address -> Bool +validate_length address = + (BSU.length localpart <= 64) && (BSU.length domain <= 255) + where + (localpart, domain) = parts address + +-- | Validate an email address against a simple regex. This should +-- catch common addresses but disallows a lot of (legal) weird stuff. +validate_regex :: Address -> Bool +validate_regex address = + case matches of + Nothing -> False + _ -> True + where + regex_str = "(\\w+)([\\w\\-\\.]*)@(([0-9a-zA-Z\\-]+\\.)+)[a-zA-Z]{2,4}" + regex_bs = BSU.fromString regex_str + regex = compile regex_bs [utf8] + matches = match regex address [] + +-- | Validate the syntax of an email address by checking its length +-- and validating it against a simple regex. +validate_syntax :: Address -> Bool +validate_syntax address = + (validate_length address) && (validate_regex address) -validate_syntax :: Address -> IO Bool -validate_syntax address = do - return True +-- | Split a 'ByteString' @s@ at the first occurrence of @character@. +bytestring_split :: BSU.ByteString -> Char -> (BSU.ByteString, BSU.ByteString) +bytestring_split s character = + (before, after) + where + break_func = (== character) + (before, rest) = BSU.break break_func s + after = BS.tail rest -utf8_split = undefined -validate :: Resolver -> Address -> IO Bool +-- | Validate an email address by doing some simple syntax checks and +-- (if those fail) an MX lookup. We don't count an A record as a mail +-- exchanger. +validate :: Resolver -> Address -> IO (Address, Bool) validate resolver address = do - valid_syntax <- validate_syntax address + let valid_syntax = validate_syntax address if valid_syntax then do - let domain = utf8_split address (BS.fromString "@") - validate_mx resolver domain + let (_,domain) = parts address + mx_result <- validate_mx resolver domain + return (address, mx_result) else do - return False + return (address, False) + + +append_handle_with_newline :: Handle -> BS.ByteString -> IO () +append_handle_with_newline h bs = do + BS.hPutStr h bs + BS.hPutStr h newline + where + newline = BSU.fromString "\n" -empty_string :: BS.ByteString -empty_string = BS.fromString "" main :: IO () main = do - input <- hGetContents stdin + Args{..} <- apply_args + + input <- case input_file of + Nothing -> BS.hGetContents stdin + Just path -> do + is_file <- doesFileExist path + when (not is_file) $ do + exitWith (ExitFailure exit_input_file_doesnt_exist) + BS.readFile path - let addresses = BS.lines $ BS.fromString input - let nonempty_addresses = filter (/= empty_string) addresses + output_handle <- case output_file of + Nothing -> return stdout + Just path -> openFile path WriteMode + + let addresses = BSU.lines input + let nonempty_addresses = filter (not . BS.null) addresses rs <- makeResolvSeed resolv_conf withResolver rs $ \resolver -> do - good_addresses <- filterM (validate resolver) nonempty_addresses - let good_address_strings = map BS.toString good_addresses - mapM_ (hPutStrLn stdout) good_address_strings - - hFlush stdout + let actions = map (validate resolver) nonempty_addresses + results <- parallel actions + stopGlobalPool + let good_pairs = filter snd results + mapM_ ((append_handle_with_newline output_handle) . fst) good_pairs + hFlush output_handle + hClose output_handle