X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;ds=sidebyside;f=src%2FLWN%2FPage.hs;h=cdcd1a602e1add49f729f5a5f7263eebae2e69a5;hb=9522dae5ae266206b10e5517215e9d7143bf3df8;hp=3705f3bcf62bd89759c9793410ee6a3633cc877e;hpb=10f322ce20600de109c4643967b6ce3f61f69bf6;p=dead%2Flwn-epub.git diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 3705f3b..cdcd1a6 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -3,15 +3,15 @@ module LWN.Page where +import Control.Concurrent.ParallelIO (parallel) import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) +import qualified Data.Map as Map (Map, empty, insert) import Data.Maybe (catMaybes, fromJust, isNothing) import Prelude hiding (readFile) -import System.Directory (doesFileExist) -import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr) -import System.IO.UTF8 (readFile) +import System.IO (Handle, hClose, hFlush) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) @@ -20,12 +20,14 @@ import Text.Pandoc ( defaultWriterOptions, readHtml, writeEPUB, - writerEPUBMetadata) + writerEPUBMetadata, + writerUserDataDir) +import Text.Pandoc.Shared ( readDataFile ) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, - IOStateArrow, XmlTree, + ($<), (>>>), (/>), (//>), @@ -33,83 +35,40 @@ import Text.XML.HXT.Core ( getAttrValue, getChildren, getText, - hasAttrValue, hasName, - isElem, - mkName, none, processAttrl, processTopDown, + this, runX, - setElemName, xshow, when) import Text.HandsomeSoup (css, parseHtml) -import Configuration (Cfg, password, use_account, username) +import Configuration (Cfg, full_stories) import LWN.Article import LWN.HTTP ( ImageMap, download_image_urls, - get_page, - log_in, - make_cookie_jar) -import LWN.URI (URL, try_make_absolute_url) -import LWN.XHTML (XHTML, parse_lwn, to_xhtml) -import Misc (contains) - - --- | Try to parse the given article using HXT. We try a few different --- methods; if none of them work, we return 'Nothing'. -get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree)) -get_xml_from_article cfg article_name = do - my_article <- real_article_path article_name - is_file <- doesFileExist my_article - case is_file of - True -> do - contents <- readFile my_article - return $ Just $ parse_lwn contents - False -> do - -- Download the URL and try to parse it. - if use_account cfg then do - -- use_account would be false if these fromJusts would fail. - cj <- make_cookie_jar - li_result <- log_in cj - (fromJust $ username cfg) - (fromJust $ password cfg) - - case li_result of - Left err -> do - let msg = "Failed to log in. " ++ err - hPutStrLn stderr msg - Right response_body -> do - hPutStrLn stderr response_body - - html <- get_page (Just cj) my_article - - case html of - Left err -> do - let msg = "Failed to retrieve page. " ++ err - hPutStrLn stderr msg - return Nothing - Right h -> return $ Just $ parse_lwn h - else do - html <- get_page Nothing my_article - case html of - Left err -> do - let msg = "Failed to retrieve page. " ++ err - hPutStrLn stderr msg - return Nothing - Right h -> return $ Just $ parse_lwn h + get_article_contents) +import LWN.URI (URL) +import LWN.XHTML ( + XHTML, + full_story_urls, + image_srcs, + full_story_link, + full_story_paragraph, + is_image, + preprocess, + remove_byline, + remove_full_story_paragraphs, + remove_title, + to_xhtml, + to_xml, + xml_from_contents) --- Should be called *after* preprocessing. -download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap -download_images xml = do - image_urls <- runX $ xml >>> image_srcs - download_image_urls image_urls - data Page = -- | An LWN page with one article on it. @@ -133,9 +92,7 @@ instance XHTML Page where "