]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/Main.hs
2700038032651efec82cf886161abdac603a6633
[dead/lwn-epub.git] / src / Main.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2 module Main
3 where
4
5 import Control.Concurrent (threadDelay)
6 import Control.Monad (when)
7 import Data.Maybe (fromJust, isJust)
8 import Prelude hiding (readFile)
9 import System.Directory (doesFileExist)
10 import System.IO (
11 Handle,
12 IOMode (WriteMode),
13 hPutStrLn,
14 openBinaryFile,
15 stderr,
16 stdout
17 )
18 import System.IO.UTF8 (readFile)
19 import Test.HUnit (Assertion, assertEqual)
20 import Test.Framework (Test, testGroup)
21 import Test.Framework.Providers.HUnit (testCase)
22 import Text.Regex.Posix ((=~))
23 import Text.XML.HXT.Core (
24 IOStateArrow,
25 SysConfigList,
26 XmlTree,
27 no,
28 readString,
29 withParseHTML,
30 withValidate,
31 withWarnings,
32 yes
33 )
34 import CommandLine (show_help)
35 import Configuration (Cfg(..), get_cfg, use_account)
36 import LWN.HTTP (get_page, log_in, make_cookie_jar)
37 import LWN.Page (epublish, parse)
38 import LWN.URI (add_trailing_slash, is_lwn_url, make_absolute_url, make_https)
39 import Misc (contains)
40
41
42 my_read_opts :: SysConfigList
43 my_read_opts = [ withValidate no,
44 withParseHTML yes,
45 withWarnings no ]
46
47 -- | My version of HandsomeSoup's parseHTML.
48 my_read :: String -> IOStateArrow s b XmlTree
49 my_read = readString my_read_opts
50
51
52 -- |A wrapper around threadDelay which takes seconds instead of
53 -- microseconds as its argument.
54 thread_sleep :: Int -> IO ()
55 thread_sleep seconds = do
56 let microseconds = seconds * (10 ^ (6 :: Int))
57 threadDelay microseconds
58
59
60 -- | Try to parse the given article using HXT. We try a few different
61 -- methods; if none of them work, we return 'Nothing'.
62 get_xml_from_article :: Cfg -> IO (Maybe (IOStateArrow s b XmlTree))
63 get_xml_from_article cfg = do
64 my_article <- real_article_path (article cfg)
65 is_file <- doesFileExist my_article
66 case is_file of
67 True -> do
68 contents <- readFile my_article
69 return $ Just $ my_read contents
70 False -> do
71 -- Download the URL and try to parse it.
72 if use_account cfg then do
73 -- use_account would be false if these fromJusts would fail.
74 cj <- make_cookie_jar
75 li_result <- log_in cj
76 (fromJust $ username cfg)
77 (fromJust $ password cfg)
78
79 -- Without this, the cookie file is empty during
80 -- get_page. Whaaat?
81 thread_sleep 1
82
83 when (isJust li_result) $ do
84 let msg = "Failed to log in. " ++ (fromJust li_result)
85 hPutStrLn stderr msg
86
87 html <- get_page (Just cj) my_article
88
89 case html of
90 Left err -> do
91 let msg = "Failed to retrieve page. " ++ err
92 hPutStrLn stderr msg
93 return Nothing
94 Right h -> return $ Just $ my_read h
95 else do
96 html <- get_page Nothing my_article
97 case html of
98 Left err -> do
99 let msg = "Failed to retrieve page. " ++ err
100 hPutStrLn stderr msg
101 return Nothing
102 Right h -> return $ Just $ my_read h
103
104 -- | If we're given an empty path, return a handle to
105 -- 'stdout'. Otherwise, open the given file and return a read/write
106 -- handle to that.
107 get_output_handle :: FilePath -> IO Handle
108 get_output_handle path =
109 if (null path) then
110 return stdout
111 else
112 openBinaryFile path WriteMode
113
114
115
116 -- | Convert the given article to either a URL or a filesystem
117 -- path. If the given article exists on the filesystem, we assume
118 -- it's a file. Otherwise, we check to see if it's a URL. Failing
119 -- that, we try to construct a URL from what we're given and do our
120 -- best.
121 real_article_path :: String -> IO String
122 real_article_path s = do
123 is_file <- doesFileExist s
124 return $ if is_file then s else add_trailing_slash check_cases
125 where
126 abs_current =
127 case make_absolute_url "current" of
128 Nothing -> s
129 Just ac -> ac
130 abs_article =
131 case make_absolute_url ("Articles/" ++ s) of
132 Nothing -> s
133 Just as -> as
134
135 check_cases :: String
136 check_cases
137 | is_lwn_url s = make_https s
138 | s `contains` "current" = abs_current
139 | s =~ "^[0-9]+$" = abs_article
140 | otherwise = s -- Give up
141
142 main :: IO ()
143 main = do
144 cfg <- get_cfg
145 output_handle <- get_output_handle (output cfg)
146
147 when (use_account cfg) $ do
148 putStrLn "Using account."
149
150 maybe_html <- get_xml_from_article cfg
151
152 case maybe_html of
153 Just html -> do
154 result <- parse html
155 case result of
156 Just stuff -> epublish stuff output_handle
157 Nothing -> do
158 _ <- show_help
159 return ()
160
161 Nothing -> do
162 _ <- show_help
163 return ()
164
165
166 test_current_article_path :: Assertion
167 test_current_article_path = do
168 let expected = "https://lwn.net/current/"
169 actual <- real_article_path "current"
170 assertEqual "Current article path constructed" expected actual
171
172 test_numbered_article_path :: Assertion
173 test_numbered_article_path = do
174 let expected = "https://lwn.net/Articles/69/"
175 actual <- real_article_path "69" -- I'm twelve
176 assertEqual "Numbered article path constructed" expected actual
177
178
179 test_full_article_path :: Assertion
180 test_full_article_path = do
181 let expected = "https://lwn.net/Articles/502979/"
182 actual <- real_article_path "https://lwn.net/Articles/502979/"
183 assertEqual "Full article path left alone" expected actual
184
185 test_non_https_article_path :: Assertion
186 test_non_https_article_path = do
187 let expected = "https://lwn.net/Articles/502979/"
188 actual <- real_article_path "http://lwn.net/Articles/502979/"
189 assertEqual "Non-https URL made https" expected actual
190
191 main_tests :: Test
192 main_tests =
193 testGroup "Main Tests" [
194 testCase "Current article path constructed" test_current_article_path,
195 testCase "Numbered article path constructed" test_numbered_article_path,
196 testCase "Full article path left alone" test_full_article_path,
197 testCase "Non-https URL made https" test_non_https_article_path ]