+{-# LANGUAGE DoAndIfThenElse #-}
+
module LWN.HTTP
where
+import Control.Concurrent.ParallelIO (parallel)
import qualified Data.ByteString as B (hPut)
+import qualified Data.Map as Map (Map, empty, insert)
+import Data.Maybe (fromJust, isNothing)
import Network.Curl (
CurlCode(..),
CurlOption(..),
CurlResponse,
- URLString,
do_curl_,
initialize,
respBody,
respCurlCode,
- withCurlDo
+ withCurlDo
)
import Network.Curl.Download (openURI)
-import System.Directory (getTemporaryDirectory)
-import System.IO (hPutStrLn, stderr)
-import System.IO.Temp (openBinaryTempFile)
+import System.Directory (doesFileExist, getTemporaryDirectory)
+import System.IO (hClose, hPutStrLn, stderr)
+import qualified System.IO.UTF8 as Utf8 (readFile)
+import System.IO.Temp (openBinaryTempFile, openTempFile)
-import LWN.URI (filename)
+import qualified Configuration as C (Cfg(..))
+import LWN.Article (real_article_path)
+import LWN.URI (URL, filename)
-login_url :: URLString
+login_url :: URL
login_url = "https://lwn.net/login"
username_field :: String
default_curl_opts =
[ -- The Global cache is not thread-friendly.
CurlDNSUseGlobalCache False,
-
+
-- And we don't want to use a DNS cache anyway.
CurlDNSCacheTimeout 0,
- -- Give it a little time...
- CurlTimeout 45,
+ -- Follow redirects.
+ CurlFollowLocation True,
- -- And let us know when things go wrong.
- CurlVerbose True ]
+ -- Give it a little time...
+ CurlTimeout 45 ]
+-- | Create an empty temporary file (named from a fixed template in
+-- the system temp directory) to serve as a curl cookie jar, and
+-- return its path.
+make_cookie_jar :: IO FilePath
+make_cookie_jar = do
+ temp_dir <- getTemporaryDirectory
+ let file_name_template = "lwn-epub-cookies.txt"
+ (out_path, out_handle) <- openTempFile temp_dir file_name_template
+ hClose out_handle -- We just want to create it for now.
+ return out_path
-get_page :: Maybe FilePath -> URLString -> IO (Maybe String)
-get_page cookie_jar url =
+get_page :: Maybe FilePath -> URL -> IO (Either String String)
+get_page cookie_file url =
withCurlDo $ do
-- Create a curl instance.
curl <- initialize
-- Perform the request, and get back a CurlResponse object.
-- The cast is needed to specify how we would like our headers
-- and body returned (Strings).
- resp <- do_curl_ curl login_url curl_opts :: IO CurlResponse
+ resp <- do_curl_ curl url curl_opts :: IO CurlResponse
-- Pull out the response code as a CurlCode.
let code = respCurlCode resp
- case code of
- CurlOK -> return $ Just (respBody resp)
- error_code -> do
- hPutStrLn stderr ("HTTP Error: " ++ (show error_code))
+ return $
+ case code of
+ CurlOK -> Right (respBody resp)
+ error_code -> Left ("HTTP Error: " ++ (show error_code))
-- Errors are reported to the caller via the Left branch above. If
-- more detail is ever needed on failure, respGetInfo can be used to
-- query the response object for additional information.
- return Nothing
where
get_opts =
- case cookie_jar of
+ case cookie_file of
Nothing -> []
- Just cookies -> [ CurlCookieJar cookies ]
+ Just cookies -> [ CurlCookieFile cookies ]
curl_opts = default_curl_opts ++ get_opts
-log_in :: FilePath -> String -> String -> IO Bool
+-- | Log in using curl. Store the resulting session cookies in the
+-- supplied file.
+log_in :: FilePath -> String -> String -> IO (Either String String)
log_in cookie_jar username password =
withCurlDo $ do
-- Create a curl instance.
-- Pull out the response code as a CurlCode.
let code = respCurlCode resp
- case code of
- CurlOK -> return True
- error_code -> do
- hPutStrLn stderr ("HTTP Error: " ++ (show error_code))
+ return $
+ case code of
+ CurlOK -> Right (respBody resp)
+ error_code -> Left $ "HTTP Error: " ++ (show error_code)
-- Errors are reported to the caller via the Left branch above. If
-- more detail is ever needed on failure, respGetInfo can be used to
-- query the response object for additional information.
- return False
where
post_submit :: String
post_submit = submit_field ++ "=Log+In"
post_password = password_field ++ "=" ++ password
post_data :: [String]
- post_data = [post_username, post_password]
+ post_data = [post_username, post_password, post_submit]
post_opts :: [CurlOption]
post_opts =
CurlCookieJar cookie_jar,
CurlPost True,
CurlPostFields post_data ]
-
+
curl_opts :: [CurlOption]
curl_opts = default_curl_opts ++ post_opts
-- We need to be able to parse the filename out of the URL
-- so that when we stick our image in the document, the reader
-- knows what type (jpg, png, etc.) it is.
-save_image :: URLString -> IO (Maybe FilePath)
+save_image :: URL -> IO (Maybe FilePath)
save_image url = do
- let fn = filename url
- case fn of
- Nothing -> return Nothing
- Just file -> do
- temp_dir <- getTemporaryDirectory
- (out_path, out_handle) <- openBinaryTempFile temp_dir file
- result <- openURI url
- case result of
+ it_exists <- doesFileExist url
+ if it_exists then do
+ -- It's local, just use it.
+ return $ Just url
+ else do
+ let fn = filename url
+ case fn of
+ Nothing -> return Nothing
+ Just file -> do
+ temp_dir <- getTemporaryDirectory
+ (out_path, out_handle) <- openBinaryTempFile temp_dir file
+ result <- openURI url
+ case result of
+ Left err -> do
+ hPutStrLn stderr ("HTTP Error: " ++ err)
+ return Nothing
+ Right bs -> do
+ B.hPut out_handle bs
+ return $ Just out_path
+
+
+
+
+-- | Map absolute image URLs to local system file paths where the
+-- image referenced by the URL is stored.
+type ImageMap = Map.Map URL FilePath
+
+-- | Download every image URL (concurrently, via 'parallel' over
+-- 'save_image') and build an 'ImageMap' from each URL to the local
+-- path where its image was stored. URLs whose download failed
+-- ('save_image' returned Nothing) are simply omitted from the map.
+download_image_urls :: [URL] -> IO ImageMap
+download_image_urls image_urls = do
+ files <- parallel $ map save_image image_urls
+ let pairs = zip image_urls files
+ return $ foldl my_insert empty_map pairs
+ where
+ empty_map = Map.empty :: ImageMap
+
+ -- Fold helper: insert only the successful (Just) downloads.
+ my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
+ my_insert dict (_, Nothing) = dict
+ my_insert dict (k, Just v) = Map.insert k v dict
+
+
+
+
+
+-- | If the config contains both a username and a password, log in
+-- with them, storing the session cookie in a freshly-created cookie
+-- jar, and record that jar's path in the returned config. If either
+-- credential is missing, the config is returned unchanged.
+get_login_cookie :: C.Cfg -> IO C.Cfg
+get_login_cookie cfg
+ | isNothing (C.username cfg) = return cfg
+ | isNothing (C.password cfg) = return cfg
+ | otherwise = do
+ -- fromJust is safe here: the guards above rule out Nothing.
+ let uname = fromJust $ C.username cfg
+ let pword = fromJust $ C.password cfg
+ cj <- make_cookie_jar
+ li_result <- log_in cj uname pword
+
+ case li_result of
+ Left err -> do
+ let msg = "Failed to log in. " ++ err
+ hPutStrLn stderr msg
+ Right response_body -> do
+ -- NOTE(review): this dumps the full login response body to
+ -- stderr on success -- looks like leftover debug output;
+ -- confirm it is intentional.
+ hPutStrLn stderr response_body
+
+ -- NOTE(review): the cookie jar path is recorded even when the
+ -- login failed above, so later requests may silently use an
+ -- empty jar -- verify this is the desired behavior.
+ return $ cfg { C.cookie_jar = Just cj }
+
+
+-- | Retrieve the contents of the given article, either from a local
+-- file or by downloading the page (using the session cookie, if
+-- any). Returns 'Nothing' if retrieval fails.
+get_article_contents :: C.Cfg -> URL -> IO (Maybe String)
+get_article_contents cfg article_name = do
+ my_article <- real_article_path article_name
+ is_file <- doesFileExist my_article
+ case is_file of
+ True -> do
+ contents <- Utf8.readFile my_article
+ return $ Just $ contents
+ False -> do
+ -- Download the URL and try to parse it.
+ html <- get_page (C.cookie_jar cfg) my_article
+
+ case html of
Left err -> do
- hPutStrLn stderr ("HTTP Error: " ++ err)
+ let msg = "Failed to retrieve page. " ++ err
+ hPutStrLn stderr msg
return Nothing
- Right bs -> do
- B.hPut out_handle bs
- return $ Just out_path
+ Right h -> return $ Just h