X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=3027eaef6c83e76067a744acf98be3a8b6b8579f;hp=24997154eeb84951188d804b6336647c8477a61f;hb=2953924e2016393a1ffb9e2e82b4c90c8c57dfd3;hpb=d79424c546d96dcd3955fdc6cb43eb529566be1e diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 2499715..3027eae 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -1,28 +1,41 @@ +{-# LANGUAGE DoAndIfThenElse #-} + module LWN.Page where -import Text.Pandoc +import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) -import System.IO (Handle) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust, isNothing) -import Data.Tree.NTree.TypeDefs (NTree) +import Prelude hiding (readFile) +import System.Directory (doesFileExist) +import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr) +import System.IO.UTF8 (readFile) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) +import Text.Pandoc ( + defaultParserState, + defaultWriterOptions, + readHtml, + writeEPUB, + writerEPUBMetadata) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, + IOStateArrow, XmlTree, - XNode, (>>>), (/>), (//>), + changeAttrValue, + getAttrValue, getChildren, getText, hasAttrValue, hasName, + isElem, mkName, none, processAttrl, @@ -30,12 +43,73 @@ import Text.XML.HXT.Core ( runX, setElemName, xshow, - when - ) + when) import Text.HandsomeSoup (css, parseHtml) +import Configuration (Cfg, password, use_account, username) import LWN.Article -import XHTML +import LWN.HTTP ( + ImageMap, + download_image_urls, + get_page, + log_in, + make_cookie_jar) +import LWN.URI (URL, try_make_absolute_url) +import LWN.XHTML (XHTML, parse_lwn, to_xhtml) +import Misc (contains) + + +-- | Try to parse the given article using HXT. We try a few different +-- methods; if none of them work, we return 'Nothing'. +get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree)) +get_xml_from_article cfg article_name = do + my_article <- real_article_path article_name + is_file <- doesFileExist my_article + case is_file of + True -> do + contents <- readFile my_article + return $ Just $ parse_lwn contents + False -> do + -- Download the URL and try to parse it. + if use_account cfg then do + -- use_account would be false if these fromJusts would fail. + cj <- make_cookie_jar + li_result <- log_in cj + (fromJust $ username cfg) + (fromJust $ password cfg) + + case li_result of + Left err -> do + let msg = "Failed to log in. " ++ err + hPutStrLn stderr msg + Right response_body -> do + hPutStrLn stderr response_body + + html <- get_page (Just cj) my_article + + case html of + Left err -> do + let msg = "Failed to retrieve page. " ++ err + hPutStrLn stderr msg + return Nothing + Right h -> return $ Just $ parse_lwn h + else do + html <- get_page Nothing my_article + case html of + Left err -> do + let msg = "Failed to retrieve page. " ++ err + hPutStrLn stderr msg + return Nothing + Right h -> return $ Just $ parse_lwn h + + + +-- Should be called *after* preprocessing. +download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap +download_images xml = do + image_urls <- runX $ xml >>> image_srcs + download_image_urls image_urls + data Page = -- | An LWN page with one article on it. @@ -59,9 +133,7 @@ instance XHTML Page where "