Attempt to implement article downloading.
[dead/lwn-epub.git] / src / Main.hs
1 {-# LANGUAGE ScopedTypeVariables, RecordWildCards, DoAndIfThenElse #-}
2 module Main
3 where
4
5 import Control.Monad (when)
6 import Data.Maybe (fromJust)
7 import Prelude hiding (readFile)
8 import System.Directory (doesFileExist)
9 import System.IO (
10 Handle,
11 IOMode (WriteMode),
12 hPutStrLn,
13 openBinaryFile,
14 stderr,
15 stdout
16 )
17 import System.IO.UTF8 (readFile)
18 import Test.HUnit (Assertion, assertEqual)
19 import Test.Framework (Test, testGroup)
20 import Test.Framework.Providers.HUnit (testCase)
21 import Text.Regex.Posix ((=~))
22 import Text.XML.HXT.Core hiding (when)
23
24 import CommandLine (show_help)
25 import Configuration (Cfg(..), get_cfg, use_account)
26 import LWN.HTTP (get_page, log_in, make_cookie_jar)
27 import LWN.Page (epublish, parse)
28 import LWN.URI (is_lwn_url, make_absolute_url, make_https)
29 import Misc (contains)
30
31
-- | The HXT system options used whenever we read a document:
--   validation is switched off, HTML parsing is switched on, and
--   warnings are switched off.
my_read_opts :: SysConfigList
my_read_opts =
  [ withValidate no
  , withParseHTML yes
  , withWarnings no
  ]
36
-- | Our own take on HandsomeSoup's parseHTML: build an arrow that
--   reads a document out of the given string, configured with
--   'my_read_opts'.
my_read :: String -> IOStateArrow s b XmlTree
my_read html = readString my_read_opts html
40
-- | Try to obtain an HXT reader arrow for the article named in the
--   configuration.
--
--   The article argument is first normalised by 'real_article_path'.
--   If the result names an existing file, its contents are read and
--   wrapped with 'my_read'. Otherwise it is treated as a URL and
--   fetched with 'get_page'; when 'use_account' is set we first try
--   to log in so that the request carries our cookie jar.
--
--   Returns 'Nothing' only when the page could not be downloaded
--   (that is, when 'get_page' returns 'Nothing').
get_xml_from_article :: Cfg -> IO (Maybe (IOStateArrow s b XmlTree))
get_xml_from_article cfg = do
  my_article <- real_article_path (article cfg)
  is_file <- doesFileExist my_article
  if is_file
    then do
      contents <- readFile my_article
      return $ Just $ my_read contents
    else do
      -- Download the URL and try to parse it.
      cj <- account_cookie_jar
      html <- get_page cj my_article
      return $ fmap my_read html
  where
    -- Produce the cookie jar to hand to 'get_page': 'Nothing' when no
    -- account is in use, otherwise a jar we attempted to log in with.
    account_cookie_jar
      | use_account cfg = do
          cj <- make_cookie_jar
          -- use_account should imply that both credentials are
          -- present, but we pattern match rather than call fromJust
          -- so that a missing credential is reported as a failed
          -- log-in instead of crashing the program.
          li_result <-
            case (username cfg, password cfg) of
              (Just user, Just pass) -> log_in cj user pass
              _ -> return False
          when (not li_result) $
            hPutStrLn stderr "Failed to log in."
          -- As before, a failed log-in is not fatal: we still fetch
          -- the page using the (unauthenticated) cookie jar.
          return (Just cj)
      | otherwise = return Nothing
73
-- | Pick the handle that output should be written to. An empty path
--   means 'stdout'; any other path is opened as a binary file in
--   'WriteMode' and the resulting handle is returned.
get_output_handle :: FilePath -> IO Handle
get_output_handle path
  | null path = return stdout
  | otherwise = openBinaryFile path WriteMode
83
84
85
-- | Turn the user-supplied article argument into either a filesystem
--   path or a URL.
--
--   An argument naming an existing file is returned untouched.
--   Otherwise we try, in order: an LWN URL (forced to https), anything
--   mentioning \"current\" (the absolute URL of the current issue), and
--   a bare number (the absolute URL of that article). If nothing
--   matches, the argument is handed back unchanged.
real_article_path :: String -> IO String
real_article_path s = do
  exists_on_disk <- doesFileExist s
  if exists_on_disk
    then return s
    else return best_guess
  where
    -- Make a site-relative path absolute, falling back to the raw
    -- argument when no absolute URL can be built.
    absolutize :: String -> String
    absolutize relative = maybe s id (make_absolute_url relative)

    best_guess :: String
    best_guess
      | is_lwn_url s = make_https s
      | s `contains` "current" = absolutize "current"
      | s =~ "^[0-9]+$" = absolutize ("Articles/" ++ s)
      | otherwise = s -- Nothing matched; give up.
111
-- | Program entry point: load the configuration, fetch and parse the
--   requested article, and publish it to the configured output. If
--   the article cannot be fetched or parsed, show the help text
--   instead.
main :: IO ()
main = do
  cfg <- get_cfg
  maybe_html <- get_xml_from_article cfg

  -- Collapse "could not fetch" and "could not parse" into a single
  -- Maybe so that both failures share one code path below.
  maybe_stuff <-
    case maybe_html of
      Just html -> parse html
      Nothing -> return Nothing

  case maybe_stuff of
    Just stuff -> do
      -- Only open the output once we know there is something to
      -- write: opening a file in WriteMode truncates it, and we do
      -- not want to clobber an existing file when the run fails.
      output_handle <- get_output_handle (output cfg)
      epublish stuff output_handle
    Nothing -> do
      _ <- show_help
      return ()
131
132
-- | The bare word \"current\" should expand to the URL of the current
--   issue.
test_current_article_path :: Assertion
test_current_article_path =
  real_article_path "current" >>=
    assertEqual "Current article path constructed" "https://lwn.net/current"
138
-- | A bare article number should expand to that article's URL.
test_numbered_article_path :: Assertion
test_numbered_article_path =
  real_article_path "69" >>=
    assertEqual "Numbered article path constructed" "https://lwn.net/Articles/69"
144
145
-- | A complete https article URL should pass through unchanged.
test_full_article_path :: Assertion
test_full_article_path =
  real_article_path url >>= assertEqual "Full article path left alone" url
  where
    url = "https://lwn.net/Articles/502979/"
151
-- | A plain-http article URL should be rewritten to use https.
test_non_https_article_path :: Assertion
test_non_https_article_path =
  real_article_path "http://lwn.net/Articles/502979/" >>=
    assertEqual "Non-https URL made https" "https://lwn.net/Articles/502979/"
157
-- | All of the tests defined in this module, gathered into a single
--   group named \"Main Tests\".
main_tests :: Test
main_tests = testGroup "Main Tests" (map (uncurry testCase) cases)
  where
    -- Each test's display name paired with the assertion it runs.
    cases :: [(String, Assertion)]
    cases =
      [ ("Current article path constructed", test_current_article_path)
      , ("Numbered article path constructed", test_numbered_article_path)
      , ("Full article path left alone", test_full_article_path)
      , ("Non-https URL made https", test_non_https_article_path)
      ]