X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;ds=inline;f=src%2FLWN%2FPage.hs;h=5f3b9ee608f42dc9895c458e3968e0363f4702bb;hb=HEAD;hp=49faa0bb4d6d1b0c77c0b4c6eb8d00e1a9261c50;hpb=5ac7dd7f301ba633b38d7bf2361044d25204bb6e;p=dead%2Flwn-epub.git diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 49faa0b..5f3b9ee 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -4,11 +4,10 @@ module LWN.Page where import Control.Concurrent.ParallelIO (parallel) -import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) -import qualified Data.Map as Map (Map, empty, insert) +import qualified Data.Map as Map (Map, empty, insert, lookup) import Data.Maybe (catMaybes, fromJust, isNothing) import Prelude hiding (readFile) import System.IO (Handle, hClose, hFlush) @@ -20,7 +19,9 @@ import Text.Pandoc ( defaultWriterOptions, readHtml, writeEPUB, - writerEPUBMetadata) + writerEPUBMetadata, + writerUserDataDir) +import Text.Pandoc.Shared ( readDataFile ) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, @@ -34,6 +35,7 @@ import Text.XML.HXT.Core ( getChildren, getText, hasName, + none, processAttrl, processTopDown, this, @@ -42,7 +44,7 @@ import Text.XML.HXT.Core ( when) import Text.HandsomeSoup (css, parseHtml) -import Configuration (Cfg) +import Configuration (Cfg, full_stories) import LWN.Article import LWN.HTTP ( ImageMap, @@ -58,6 +60,7 @@ import LWN.XHTML ( is_image, preprocess, remove_byline, + remove_full_story_paragraphs, remove_title, to_xhtml, to_xml, @@ -111,6 +114,20 @@ instance XHTML Page where +-- | Stolen from writeEPUB. +default_stylesheet :: IO String +default_stylesheet = + -- This comes with Pandoc, I guess. + readDataFile (writerUserDataDir defaultWriterOptions) "epub.css" + + +construct_stylesheet :: IO String +construct_stylesheet = do + defaults <- default_stylesheet + -- Allow word-wrapping in
elements. + let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n" + return $ defaults ++ my_additions + page_from_url :: Cfg -> URL -> IO (Maybe Page) page_from_url cfg url = do contents <- get_article_contents cfg url @@ -127,14 +144,14 @@ insert_full_stories story_map = lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree lookup_func href = case Map.lookup href story_map of - -- Leave it alone if we don't have the full story. - Nothing -> this + -- Drop the paragraph if we don't have the contents. + Nothing -> none Just v -> to_xml v article_xml :: (ArrowXml a) => a XmlTree XmlTree article_xml = lookup_func - $< + $< -- From HXT's Control.Arrow.ArrowList (this /> full_story_link >>> getAttrValue "href") replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree @@ -145,8 +162,9 @@ replace_remote_img_srcs image_map = change_src_func :: String -> String change_src_func old_src = case Map.lookup old_src image_map of - -- Leave it alone if we don't have the file locally - Nothing -> old_src + -- If we don't have the file, empty the src. Pandoc will crash + -- otherwise. + Nothing -> "" Just v -> v change_src :: (ArrowXml a) => a XmlTree XmlTree @@ -192,8 +210,12 @@ download_full_stories cfg xml = do parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page) parse cfg xml = do - story_map <- download_full_stories cfg xml - let fs_xml = xml >>> insert_full_stories story_map + fs_xml <- if (full_stories cfg) then do + story_map <- download_full_stories cfg xml + return $ xml >>> insert_full_stories story_map + else do + -- Get rid of them if we don't want them. + return $ xml >>> remove_full_story_paragraphs let clean_xml = fs_xml >>> preprocess image_map <- download_images clean_xml @@ -280,9 +302,9 @@ fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) fp_parse xml = do hl <- parse_headline xml parsed_articles <- fp_parse_articles xml - case parsed_articles of - [] -> return Nothing - x -> return $ Just $ FullPage (fromJust hl) x + return $ case parsed_articles of + [] -> Nothing + x -> Just $ FullPage (fromJust hl) x @@ -395,12 +417,16 @@ epublish obj handle = do hClose handle xhtml_to_epub :: String -> String -> IO B.ByteString -xhtml_to_epub epmd = - write_epub . read_html - where - my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd } - write_epub = writeEPUB Nothing [] my_writer_options - read_html = readHtml defaultParserState +xhtml_to_epub epmd xhtml = do + stylesheet <- construct_stylesheet + writeEPUB + (Just stylesheet) + [] + my_writer_options + (read_html xhtml) + where + my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd } + read_html = readHtml defaultParserState --