{-# LANGUAGE DoAndIfThenElse #-}

-- | Retrieval and parsing of LWN pages.
module LWN.Page
where

import qualified Data.Map as Map (lookup)
import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
import Data.String.Utils (split, strip)
import Data.Maybe (catMaybes, fromJust, isNothing)
import Prelude hiding (readFile)
import System.Directory (doesFileExist)
import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr)
import System.IO.UTF8 (readFile)
import Test.HUnit (Assertion, assertEqual)
import Test.Framework (Test, testGroup)
import Test.Framework.Providers.HUnit (testCase)
import Text.Pandoc (
  defaultParserState,
  defaultWriterOptions,
  readHtml,
  writeEPUB,
  writerEPUBMetadata)
import Text.XML.HXT.Core (
  ArrowXml,
  IOSArrow,
  IOStateArrow,
  XmlTree,
  (>>>),
  (/>),
  (//>),
  changeAttrValue,
  getAttrValue,
  getChildren,
  getText,
  hasAttrValue,
  hasName,
  isElem,
  mkName,
  none,
  processAttrl,
  processTopDown,
  runX,
  setElemName,
  xshow,
  when)
import Text.HandsomeSoup (css, parseHtml)

import Configuration (Cfg, password, use_account, username)
import LWN.Article
import LWN.HTTP (
  ImageMap,
  download_image_urls,
  get_page,
  log_in,
  make_cookie_jar)
import LWN.URI (URL, try_make_absolute_url)
import LWN.XHTML (XHTML, parse_lwn, to_xhtml)
import Misc (contains)


-- | Try to parse the given article using HXT. We try a few different
--   methods; if none of them work, we return 'Nothing'.
--
--   If the resolved path names a local file, its contents are parsed
--   directly. Otherwise the page is downloaded -- after logging in
--   first, when the configuration supplies account credentials.
get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree))
get_xml_from_article cfg article_name = do
  my_article <- real_article_path article_name
  is_file <- doesFileExist my_article
  if is_file
  then do
    -- NOTE(review): lazy 'readFile' defers I/O errors until the
    -- contents are demanded; acceptable here since the parser
    -- consumes them immediately, but confirm no handle is leaked.
    contents <- readFile my_article
    return $ Just $ parse_lwn contents
  else
    -- Download the URL and try to parse it.
    if use_account cfg
    then do
      -- use_account would be false if these fromJusts would fail.
      cj <- make_cookie_jar
      li_result <- log_in cj
                     (fromJust $ username cfg)
                     (fromJust $ password cfg)

      -- A login failure is reported but deliberately not fatal: we
      -- fall through and attempt the download with whatever cookies
      -- (possibly none) the jar holds.
      case li_result of
        Left err -> do
          let msg = "Failed to log in. \n" ++ err
          hPutStrLn stderr msg
        Right response_body ->
          hPutStrLn stderr response_body

      fetch_and_parse (Just cj) my_article
    else
      -- No account configured: fetch anonymously.
      fetch_and_parse Nothing my_article
  where
    -- Download one page and parse it, logging any download error to
    -- stderr. Factored out of the original's two identical case
    -- expressions (authenticated and anonymous fetch).
    fetch_and_parse cookie_jar path = do
      html <- get_page cookie_jar path
      case html of
        Left err -> do
          let msg = "Failed to retrieve page. " ++ err
          hPutStrLn stderr msg
          return Nothing
        Right h ->
          return $ Just $ parse_lwn h


-- | Download every image referenced by the page's XML and return the
--   map of original URL -> local file. Should be called *after*
--   preprocessing.
download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
download_images xml = do
  image_urls <- runX $ xml >>> image_srcs
  download_image_urls image_urls


data Page =
  -- | An LWN page with one article on it.
  ArticlePage { article :: Article } |

  -- | An LWN page with more than one article on it. These require
  --   different parsing and display functions than the single-article
  --   pages.
  FullPage { headline :: String,
             articles :: [Article] }


-- NOTE(review): the XHTML string literals below look gutted by text
-- extraction (everything between angle brackets was stripped) and the
-- definition is cut off here; restore this instance from version
-- control rather than trusting the empty strings.
instance XHTML Page where
  to_xhtml (ArticlePage a) = "" ++ "" ++ "
" ++ " " ++ "