X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=620e3ec3d8bf66a9a67bff98cc3ac215a2a64b70;hp=2bbe21ae8e918ed49d658cac3fe05b0aa4905843;hb=4220827f62d772d7edcbdcc1c2f13d6c2eb5f534;hpb=2bf48a15ded4501a127bdfe5b29eaf01acf6576e

diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs
index 2bbe21a..620e3ec 100644
--- a/src/LWN/Page.hs
+++ b/src/LWN/Page.hs
@@ -3,7 +3,7 @@
 module LWN.Page
 where
 
-import qualified Data.Map as Map
+import qualified Data.Map as Map (lookup)
 import Data.Time (getCurrentTime)
 import System.IO (Handle, hClose, hFlush)
 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
@@ -12,7 +12,12 @@ import Data.Maybe (catMaybes, fromJust, isNothing)
 import Test.HUnit (Assertion, assertEqual)
 import Test.Framework (Test, testGroup)
 import Test.Framework.Providers.HUnit (testCase)
-import Text.Pandoc
+import Text.Pandoc (
+  defaultParserState,
+  defaultWriterOptions,
+  readHtml,
+  writeEPUB,
+  writerEPUBMetadata)
 import Text.XML.HXT.Core (
   ArrowXml,
   IOSArrow,
@@ -34,31 +39,14 @@ import Text.XML.HXT.Core (
   runX,
   setElemName,
   xshow,
-  when
-  )
+  when)
 import Text.HandsomeSoup (css, parseHtml)
 import LWN.Article
-import LWN.HTTP (save_image)
+import LWN.HTTP (ImageMap, download_image_urls)
 import LWN.URI (URL, try_make_absolute_url)
+import LWN.XHTML (XHTML, to_xhtml)
 import Misc (contains)
-import XHTML
-
--- Map absolute image URLs to local system file paths where the image
--- referenced by the URL is stored.
-type ImageMap = Map.Map URL FilePath
-
-download_image_urls :: [URL] -> IO ImageMap
-download_image_urls image_urls = do
-  files <- mapM save_image image_urls
-  let pairs = zip image_urls files
-  return $ foldl my_insert empty_map pairs
-  where
-    empty_map = Map.empty :: ImageMap
-
-    my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
-    my_insert dict (_, Nothing) = dict
-    my_insert dict (k, Just v) = Map.insert k v dict
 
 
 -- Should be called *after* preprocessing.
 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
@@ -122,7 +110,7 @@ is_link =
 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
 remove_comment_links =
   processTopDown $ kill_comments `when` is_link
-  where 
+  where
     is_comment_link =
      hasAttrValue "href" (contains "#Comments")
 
@@ -178,7 +166,7 @@ parse xml = do
   return $
     if (isNothing appr)
     then fppr
-    else 
+    else
      appr
 
 
@@ -212,12 +200,12 @@ parse_byline xml = do
 --
 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 ap_parse xml = do
-  arts <- ap_parse_articles xml 
+  arts <- ap_parse_articles xml
   case arts of
     [x] -> return $ Just $ ArticlePage x
     _ -> return Nothing
-  
+
 
 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 ap_parse_body xml = do
   let element_filter = xml >>> css "div.ArticleText"
@@ -239,11 +227,11 @@ ap_parse_articles xml = do
 
   if (isNothing parsed_headline) || (isNothing parsed_body)
   then return []
-  else do 
+  else do
    let title' = Title $ fromJust parsed_headline
    let byline' = Byline parsed_byline
    let body' = BodyHtml $ fromJust parsed_body
-   
+
    return $ [Article title' byline' body']
 
 
@@ -335,7 +323,7 @@ parse_html_article html = do
   let xml = parseHtml $ wrap_in_body_div html
   fp_parse_article xml
-  
+

 -- | In the full page, all of the article titles and bodies are
 -- wrapped in one big div.ArticleText.
 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
@@ -427,7 +415,7 @@ make_image_srcs_absolute =
      changeAttrValue try_make_absolute_url

    make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
-    make_srcs_absolute = 
+    make_srcs_absolute =
      processAttrl $ change_src `when` hasName "src"
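
Side note on the import hunk above: it deletes the ImageMap type and the download_image_urls helper from Page.hs while the rewritten import line now pulls both names from LWN.HTTP, so the helper has presumably moved modules rather than been dropped. Below is a minimal, self-contained sketch of that relocated code, reconstructed from the removed lines; the module name, the URL alias, and the save_image stub are stand-ins for illustration, not the repository's actual definitions in LWN.URI and LWN.HTTP.

-- Sketch only: roughly what download_image_urls looks like after the
-- move into LWN.HTTP, per the lines removed from Page.hs above.
module ImageMapSketch where

import qualified Data.Map as Map

type URL = String                     -- assumed alias; the real type comes from LWN.URI
type ImageMap = Map.Map URL FilePath  -- absolute image URL -> local file path

-- Stand-in for the real save_image, which downloads one image and
-- returns the local path on success, Nothing on failure.
save_image :: URL -> IO (Maybe FilePath)
save_image _ = return Nothing

-- Fetch every image and keep only the successful downloads, exactly as
-- in the definition removed from Page.hs.
download_image_urls :: [URL] -> IO ImageMap
download_image_urls image_urls = do
  files <- mapM save_image image_urls
  let pairs = zip image_urls files
  return $ foldl my_insert Map.empty pairs
  where
    my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
    my_insert dict (_, Nothing) = dict
    my_insert dict (k, Just v)  = Map.insert k v dict

After the split, Page.hs appears to only read from the resulting map, which is consistent with the first hunk narrowing the Data.Map import to just Map.lookup.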