]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/HTTP.hs
Use parallel-io instead of mapM to download images and full stories.
[dead/lwn-epub.git] / src / LWN / HTTP.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.HTTP
4 where
5
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.ByteString as B (hPut)
8 import qualified Data.Map as Map (Map, empty, insert)
9 import Data.Maybe (fromJust, isNothing)
10 import Network.Curl (
11 CurlCode(..),
12 CurlOption(..),
13 CurlResponse,
14 do_curl_,
15 initialize,
16 respBody,
17 respCurlCode,
18 withCurlDo
19 )
20 import Network.Curl.Download (openURI)
21 import System.Directory (doesFileExist, getTemporaryDirectory)
22 import System.IO (hClose, hPutStrLn, stderr)
23 import qualified System.IO.UTF8 as Utf8 (readFile)
24 import System.IO.Temp (openBinaryTempFile, openTempFile)
25
26 import qualified Configuration as C (Cfg(..))
27 import LWN.Article (real_article_path)
28 import LWN.URI (URL, filename)
29
30 login_url :: URL
31 login_url = "https://lwn.net/login"
32
33 username_field :: String
34 username_field = "Username"
35
36 password_field :: String
37 password_field = "Password"
38
39 submit_field :: String
40 submit_field = "submit"
41
42
43 default_curl_opts :: [CurlOption]
44 default_curl_opts =
45 [ -- The Global cache is not thread-friendly.
46 CurlDNSUseGlobalCache False,
47
48 -- And we don't want to use a DNS cache anyway.
49 CurlDNSCacheTimeout 0,
50
51 -- Follow redirects.
52 CurlFollowLocation True,
53
54 -- Give it a little time...
55 CurlTimeout 45 ]
56
57
58 make_cookie_jar :: IO FilePath
59 make_cookie_jar = do
60 temp_dir <- getTemporaryDirectory
61 let file_name_template = "lwn-epub-cookies.txt"
62 (out_path, out_handle) <- openTempFile temp_dir file_name_template
63 hClose out_handle -- We just want to create it for now.
64 return out_path
65
66 get_page :: Maybe FilePath -> URL -> IO (Either String String)
67 get_page cookie_file url =
68 withCurlDo $ do
69 -- Create a curl instance.
70 curl <- initialize
71
72 -- Perform the request, and get back a CurlResponse object.
73 -- The cast is needed to specify how we would like our headers
74 -- and body returned (Strings).
75 resp <- do_curl_ curl url curl_opts :: IO CurlResponse
76
77 -- Pull out the response code as a CurlCode.
78 let code = respCurlCode resp
79
80 return $
81 case code of
82 CurlOK -> Right (respBody resp)
83 error_code -> Left ("HTTP Error: " ++ (show error_code))
84 -- If an error occurred, we want to dump as much information as
85 -- possible. If this becomes a problem, we can use respGetInfo to
86 -- query the response object for more information
87 where
88 get_opts =
89 case cookie_file of
90 Nothing -> []
91 Just cookies -> [ CurlCookieFile cookies ]
92
93 curl_opts = default_curl_opts ++ get_opts
94
95
96 -- | Log in using curl. Store the resulting session cookies in the
97 -- supplied file.
98 log_in :: FilePath -> String -> String -> IO (Either String String)
99 log_in cookie_jar username password =
100 withCurlDo $ do
101 -- Create a curl instance.
102 curl <- initialize
103
104 -- Perform the request, and get back a CurlResponse object.
105 -- The cast is needed to specify how we would like our headers
106 -- and body returned (Strings).
107 resp <- do_curl_ curl login_url curl_opts :: IO CurlResponse
108
109 -- Pull out the response code as a CurlCode.
110 let code = respCurlCode resp
111
112 return $
113 case code of
114 CurlOK -> Right (respBody resp)
115 error_code -> Left $ "HTTP Error: " ++ (show error_code)
116 -- If an error occurred, we want to dump as much information as
117 -- possible. If this becomes a problem, we can use respGetInfo to
118 -- query the response object for more information
119 where
120 post_submit :: String
121 post_submit = submit_field ++ "=Log+In"
122
123 post_username :: String
124 post_username = username_field ++ "=" ++ username
125
126 post_password :: String
127 post_password = password_field ++ "=" ++ password
128
129 post_data :: [String]
130 post_data = [post_username, post_password, post_submit]
131
132 post_opts :: [CurlOption]
133 post_opts =
134 [ CurlCookieSession True,
135 CurlCookieJar cookie_jar,
136 CurlPost True,
137 CurlPostFields post_data ]
138
139 curl_opts :: [CurlOption]
140 curl_opts = default_curl_opts ++ post_opts
141
142
143 -- | Save the image at 'url'. Saves to a temporary file, and
144 -- returns the path to that file if successful. Otherwise,
145 -- returns 'Nothing'.
146 --
147 -- We need to be able to parse the filename out of the URL
148 -- so that when we stick our image in the document, the reader
149 -- knows that type (jpg, png, etc.) it is.
150 save_image :: URL -> IO (Maybe FilePath)
151 save_image url = do
152 it_exists <- doesFileExist url
153 if it_exists then do
154 -- It's local, just use it.
155 return $ Just url
156 else do
157 let fn = filename url
158 case fn of
159 Nothing -> return Nothing
160 Just file -> do
161 temp_dir <- getTemporaryDirectory
162 (out_path, out_handle) <- openBinaryTempFile temp_dir file
163 result <- openURI url
164 case result of
165 Left err -> do
166 hPutStrLn stderr ("HTTP Error: " ++ err)
167 return Nothing
168 Right bs -> do
169 B.hPut out_handle bs
170 return $ Just out_path
171
172
173
174
175 -- | Map absolute image URLs to local system file paths where the
176 -- image referenced by the URL is stored.
177 type ImageMap = Map.Map URL FilePath
178
179 download_image_urls :: [URL] -> IO ImageMap
180 download_image_urls image_urls = do
181 files <- parallel $ map save_image image_urls
182 let pairs = zip image_urls files
183 return $ foldl my_insert empty_map pairs
184 where
185 empty_map = Map.empty :: ImageMap
186
187 my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
188 my_insert dict (_, Nothing) = dict
189 my_insert dict (k, Just v) = Map.insert k v dict
190
191
192
193
194
195 get_login_cookie :: C.Cfg -> IO C.Cfg
196 get_login_cookie cfg
197 | isNothing (C.username cfg) = return cfg
198 | isNothing (C.password cfg) = return cfg
199 | otherwise = do
200 let uname = fromJust $ C.username cfg
201 let pword = fromJust $ C.password cfg
202 cj <- make_cookie_jar
203 li_result <- log_in cj uname pword
204
205 case li_result of
206 Left err -> do
207 let msg = "Failed to log in. " ++ err
208 hPutStrLn stderr msg
209 Right response_body -> do
210 hPutStrLn stderr response_body
211
212 return $ cfg { C.cookie_jar = Just cj }
213
214
215 -- | Try to parse the given article using HXT. We try a few different
216 -- methods; if none of them work, we return 'Nothing'.
217 get_article_contents :: C.Cfg -> URL -> IO (Maybe String)
218 get_article_contents cfg article_name = do
219 my_article <- real_article_path article_name
220 is_file <- doesFileExist my_article
221 case is_file of
222 True -> do
223 contents <- Utf8.readFile my_article
224 return $ Just $ contents
225 False -> do
226 -- Download the URL and try to parse it.
227 html <- get_page (C.cookie_jar cfg) my_article
228
229 case html of
230 Left err -> do
231 let msg = "Failed to retrieve page. " ++ err
232 hPutStrLn stderr msg
233 return Nothing
234 Right h -> return $ Just h