From: Michael Orlitzky
Date: Sat, 7 Jul 2012 16:32:00 +0000 (-0400)
Subject: Move the main processing functions into the LWN.Page module.
X-Git-Tag: v0.0.1~22
X-Git-Url: https://gitweb.michael.orlitzky.com/?a=commitdiff_plain;h=10f322ce20600de109c4643967b6ce3f61f69bf6;p=dead%2Flwn-epub.git

Move the main processing functions into the LWN.Page module.
---

diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs
index 620e3ec..3705f3b 100644
--- a/src/LWN/Page.hs
+++ b/src/LWN/Page.hs
@@ -5,10 +5,13 @@ where
 import qualified Data.Map as Map (lookup)
 import Data.Time (getCurrentTime)
-import System.IO (Handle, hClose, hFlush)
 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
 import Data.String.Utils (split, strip)
 import Data.Maybe (catMaybes, fromJust, isNothing)
+import Prelude hiding (readFile)
+import System.Directory (doesFileExist)
+import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr)
+import System.IO.UTF8 (readFile)
 import Test.HUnit (Assertion, assertEqual)
 import Test.Framework (Test, testGroup)
 import Test.Framework.Providers.HUnit (testCase)
@@ -21,6 +24,7 @@ import Text.Pandoc (
 import Text.XML.HXT.Core (
   ArrowXml,
   IOSArrow,
+  IOStateArrow,
   XmlTree,
   (>>>),
   (/>),
@@ -42,12 +46,64 @@ import Text.XML.HXT.Core (
   when)
 import Text.HandsomeSoup (css, parseHtml)
 
+import Configuration (Cfg, password, use_account, username)
 import LWN.Article
-import LWN.HTTP (ImageMap, download_image_urls)
+import LWN.HTTP (
+  ImageMap,
+  download_image_urls,
+  get_page,
+  log_in,
+  make_cookie_jar)
 import LWN.URI (URL, try_make_absolute_url)
-import LWN.XHTML (XHTML, to_xhtml)
+import LWN.XHTML (XHTML, parse_lwn, to_xhtml)
 import Misc (contains)
+
+
+-- | Try to parse the given article using HXT. We try a few different
+-- methods; if none of them work, we return 'Nothing'.
+get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree))
+get_xml_from_article cfg article_name = do
+  my_article <- real_article_path article_name
+  is_file <- doesFileExist my_article
+  case is_file of
+    True -> do
+      contents <- readFile my_article
+      return $ Just $ parse_lwn contents
+    False -> do
+      -- Download the URL and try to parse it.
+      if use_account cfg then do
+        -- use_account would be false if these fromJusts would fail.
+        cj <- make_cookie_jar
+        li_result <- log_in cj
+                       (fromJust $ username cfg)
+                       (fromJust $ password cfg)
+
+        case li_result of
+          Left err -> do
+            let msg = "Failed to log in. " ++ err
+            hPutStrLn stderr msg
+          Right response_body -> do
+            hPutStrLn stderr response_body
+
+        html <- get_page (Just cj) my_article
+
+        case html of
+          Left err -> do
+            let msg = "Failed to retrieve page. " ++ err
+            hPutStrLn stderr msg
+            return Nothing
+          Right h -> return $ Just $ parse_lwn h
+      else do
+        html <- get_page Nothing my_article
+        case html of
+          Left err -> do
+            let msg = "Failed to retrieve page. " ++ err
+            hPutStrLn stderr msg
+            return Nothing
+          Right h -> return $ Just $ parse_lwn h
+
+
+
 -- Should be called *after* preprocessing.
 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
 download_images xml = do
@@ -102,6 +158,14 @@ instance XHTML Page where
 
 
 
+page_from_url :: Cfg -> URL -> IO (Maybe Page)
+page_from_url cfg url = do
+  maybe_html <- get_xml_from_article cfg url
+  case maybe_html of
+    Just html -> parse html
+    Nothing -> return Nothing
+
+
 is_link :: (ArrowXml a) => a XmlTree XmlTree
 is_link = isElem >>> hasName "a"
 
diff --git a/src/Main.hs b/src/Main.hs
index 981a705..88ba39c 100644
--- a/src/Main.hs
+++ b/src/Main.hs
@@ -2,72 +2,15 @@ module Main
 where
 
-import Data.Maybe (fromJust)
-import Prelude hiding (readFile)
-import System.Directory (doesFileExist)
 import System.IO (
   Handle,
   IOMode (WriteMode),
-  hPutStrLn,
   openBinaryFile,
-  stderr,
   stdout)
-import System.IO.UTF8 (readFile)
-import Text.XML.HXT.Core (
-  IOStateArrow,
-  XmlTree)
 
 import CommandLine (show_help)
-import Configuration (Cfg(..), get_cfg, use_account)
-import LWN.Article (real_article_path)
-import LWN.HTTP (get_page, log_in, make_cookie_jar)
-import LWN.Page (epublish, parse)
-import LWN.XHTML (parse_lwn)
-
-
-
--- | Try to parse the given article using HXT. We try a few different
--- methods; if none of them work, we return 'Nothing'.
-get_xml_from_article :: Cfg -> IO (Maybe (IOStateArrow s b XmlTree))
-get_xml_from_article cfg = do
-  my_article <- real_article_path (article cfg)
-  is_file <- doesFileExist my_article
-  case is_file of
-    True -> do
-      contents <- readFile my_article
-      return $ Just $ parse_lwn contents
-    False -> do
-      -- Download the URL and try to parse it.
-      if use_account cfg then do
-        -- use_account would be false if these fromJusts would fail.
-        cj <- make_cookie_jar
-        li_result <- log_in cj
-                       (fromJust $ username cfg)
-                       (fromJust $ password cfg)
-
-        case li_result of
-          Left err -> do
-            let msg = "Failed to log in. " ++ err
-            hPutStrLn stderr msg
-          Right response_body -> do
-            hPutStrLn stderr response_body
-
-        html <- get_page (Just cj) my_article
-
-        case html of
-          Left err -> do
-            let msg = "Failed to retrieve page. " ++ err
-            hPutStrLn stderr msg
-            return Nothing
-          Right h -> return $ Just $ parse_lwn h
-      else do
-        html <- get_page Nothing my_article
-        case html of
-          Left err -> do
-            let msg = "Failed to retrieve page. " ++ err
-            hPutStrLn stderr msg
-            return Nothing
-          Right h -> return $ Just $ parse_lwn h
+import Configuration (Cfg(..), get_cfg)
+import LWN.Page (epublish, page_from_url)
 
 
 -- | If we're given an empty path, return a handle to
@@ -84,18 +27,11 @@ get_output_handle path =
 
 main :: IO ()
 main = do
   cfg <- get_cfg
-  output_handle <- get_output_handle (output cfg)
-  maybe_html <- get_xml_from_article cfg
-
-  case maybe_html of
-    Just html -> do
-      result <- parse html
-      case result of
-        Just stuff -> epublish stuff output_handle
-        Nothing -> do
-          _ <- show_help
-          return ()
-
+  page <- page_from_url cfg (article cfg)
+  case page of
+    Just p -> do
+      output_handle <- get_output_handle (output cfg)
+      epublish p output_handle
     Nothing -> do
       _ <- show_help
       return ()