Add trailing slashes to URLs.
[dead/lwn-epub.git] / src / Main.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2 module Main
3 where
4
5 import Control.Monad (when)
6 import Data.Maybe (fromJust)
7 import Prelude hiding (readFile)
8 import System.Directory (doesFileExist)
9 import System.IO (
10 Handle,
11 IOMode (WriteMode),
12 hPutStrLn,
13 openBinaryFile,
14 stderr,
15 stdout
16 )
17 import System.IO.UTF8 (readFile)
18 import Test.HUnit (Assertion, assertEqual)
19 import Test.Framework (Test, testGroup)
20 import Test.Framework.Providers.HUnit (testCase)
21 import Text.Regex.Posix ((=~))
22 import Text.XML.HXT.Core hiding (when)
23
24 import CommandLine (show_help)
25 import Configuration (Cfg(..), get_cfg, use_account)
26 import LWN.HTTP (get_page, log_in, make_cookie_jar)
27 import LWN.Page (epublish, parse)
28 import LWN.URI (add_trailing_slash, is_lwn_url, make_absolute_url, make_https)
29 import Misc (contains)
30
31
32 my_read_opts :: SysConfigList
33 my_read_opts = [ withValidate no,
34 withParseHTML yes,
35 withWarnings no ]
36
37 -- | My version of HandsomeSoup's parseHTML.
38 my_read :: String -> IOStateArrow s b XmlTree
39 my_read = readString my_read_opts
40
41 -- | Try to parse the given article using HXT. We try a few different
42 -- methods; if none of them work, we return 'Nothing'.
43 get_xml_from_article :: Cfg -> IO (Maybe (IOStateArrow s b XmlTree))
44 get_xml_from_article cfg = do
45 my_article <- real_article_path (article cfg)
46 is_file <- doesFileExist my_article
47 case is_file of
48 True -> do
49 contents <- readFile my_article
50 return $ Just $ my_read contents
51 False -> do
52 -- Download the URL and try to parse it.
53 if use_account cfg then do
54 -- use_account would be false if these fromJusts would fail.
55 cj <- make_cookie_jar
56 li_result <- log_in cj
57 (fromJust $ username cfg)
58 (fromJust $ password cfg)
59 when (not li_result) $ do
60 hPutStrLn stderr "Failed to log in."
61
62 html <- get_page (Just cj) my_article
63 print $ fromJust $ html
64 return $
65 case html of
66 Nothing -> Nothing
67 Just h -> Just $ my_read h
68 else do
69 html <- get_page Nothing my_article
70 putStrLn "Not logged in."
71 print $ fromJust $ html
72 return $
73 case html of
74 Nothing -> Nothing
75 Just h -> Just $ my_read h
76
77 -- | If we're given an empty path, return a handle to
78 -- 'stdout'. Otherwise, open the given file and return a read/write
79 -- handle to that.
80 get_output_handle :: FilePath -> IO Handle
81 get_output_handle path =
82 if (null path) then
83 return stdout
84 else
85 openBinaryFile path WriteMode
86
87
88
89 -- | Convert the given article to either a URL or a filesystem
90 -- path. If the given article exists on the filesystem, we assume
91 -- it's a file. Otherwise, we check to see if it's a URL. Failing
92 -- that, we try to construct a URL from what we're given and do our
93 -- best.
94 real_article_path :: String -> IO String
95 real_article_path s = do
96 is_file <- doesFileExist s
97 return $ if is_file then s else add_trailing_slash check_cases
98 where
99 abs_current =
100 case make_absolute_url "current" of
101 Nothing -> s
102 Just ac -> ac
103 abs_article =
104 case make_absolute_url ("Articles/" ++ s) of
105 Nothing -> s
106 Just as -> as
107
108 check_cases :: String
109 check_cases
110 | is_lwn_url s = make_https s
111 | s `contains` "current" = abs_current
112 | s =~ "^[0-9]+$" = abs_article
113 | otherwise = s -- Give up
114
115 main :: IO ()
116 main = do
117 cfg <- get_cfg
118 output_handle <- get_output_handle (output cfg)
119
120 when (use_account cfg) $ do
121 putStrLn "Using account."
122
123 maybe_html <- get_xml_from_article cfg
124
125 case maybe_html of
126 Just html -> do
127 result <- parse html
128 case result of
129 Just stuff -> epublish stuff output_handle
130 Nothing -> do
131 _ <- show_help
132 return ()
133
134 Nothing -> do
135 _ <- show_help
136 return ()
137
138
139 test_current_article_path :: Assertion
140 test_current_article_path = do
141 let expected = "https://lwn.net/current/"
142 actual <- real_article_path "current"
143 assertEqual "Current article path constructed" expected actual
144
145 test_numbered_article_path :: Assertion
146 test_numbered_article_path = do
147 let expected = "https://lwn.net/Articles/69/"
148 actual <- real_article_path "69" -- I'm twelve
149 assertEqual "Numbered article path constructed" expected actual
150
151
152 test_full_article_path :: Assertion
153 test_full_article_path = do
154 let expected = "https://lwn.net/Articles/502979/"
155 actual <- real_article_path "https://lwn.net/Articles/502979/"
156 assertEqual "Full article path left alone" expected actual
157
158 test_non_https_article_path :: Assertion
159 test_non_https_article_path = do
160 let expected = "https://lwn.net/Articles/502979/"
161 actual <- real_article_path "http://lwn.net/Articles/502979/"
162 assertEqual "Non-https URL made https" expected actual
163
164 main_tests :: Test
165 main_tests =
166 testGroup "Main Tests" [
167 testCase "Current article path constructed" test_current_article_path,
168 testCase "Numbered article path constructed" test_numbered_article_path,
169 testCase "Full article path left alone" test_full_article_path,
170 testCase "Non-https URL made https" test_non_https_article_path ]