where
import Control.Concurrent.ParallelIO (parallel)
-import qualified Data.Map as Map (lookup)
import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
import Data.String.Utils (split, strip)
-import qualified Data.Map as Map (Map, empty, insert)
+import qualified Data.Map as Map (Map, empty, insert, lookup)
import Data.Maybe (catMaybes, fromJust, isNothing)
import Prelude hiding (readFile)
import System.IO (Handle, hClose, hFlush)
defaultWriterOptions,
readHtml,
writeEPUB,
- writerEPUBMetadata)
+ writerEPUBMetadata,
+ writerUserDataDir)
+import Text.Pandoc.Shared ( readDataFile )
import Text.XML.HXT.Core (
ArrowXml,
IOSArrow,
getChildren,
getText,
hasName,
+ none,
processAttrl,
processTopDown,
this,
when)
import Text.HandsomeSoup (css, parseHtml)
-import Configuration (Cfg)
+import Configuration (Cfg, full_stories)
import LWN.Article
import LWN.HTTP (
ImageMap,
is_image,
preprocess,
remove_byline,
+ remove_full_story_paragraphs,
remove_title,
to_xhtml,
to_xml,
+-- | Stolen from writeEPUB.
+default_stylesheet :: IO String
+default_stylesheet =
+ -- This comes with Pandoc, I guess.
+ readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
+
+
+construct_stylesheet :: IO String
+construct_stylesheet = do
+ defaults <- default_stylesheet
+ -- Allow word-wrapping in <pre> elements.
+ let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
+ return $ defaults ++ my_additions
+
page_from_url :: Cfg -> URL -> IO (Maybe Page)
page_from_url cfg url = do
contents <- get_article_contents cfg url
lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
lookup_func href =
case Map.lookup href story_map of
- -- Leave it alone if we don't have the full story.
- Nothing -> this
+ -- Drop the paragraph if we don't have the contents.
+ Nothing -> none
Just v -> to_xml v
article_xml :: (ArrowXml a) => a XmlTree XmlTree
article_xml =
lookup_func
- $<
+ $< -- From HXT's Control.Arrow.ArrowList
(this /> full_story_link >>> getAttrValue "href")
replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
change_src_func :: String -> String
change_src_func old_src =
case Map.lookup old_src image_map of
- -- Leave it alone if we don't have the file locally
- Nothing -> old_src
+ -- If we don't have the file, empty the src. Pandoc will crash
+ -- otherwise.
+ Nothing -> ""
Just v -> v
change_src :: (ArrowXml a) => a XmlTree XmlTree
parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
parse cfg xml = do
- story_map <- download_full_stories cfg xml
- let fs_xml = xml >>> insert_full_stories story_map
+ fs_xml <- if (full_stories cfg) then do
+ story_map <- download_full_stories cfg xml
+ return $ xml >>> insert_full_stories story_map
+ else do
+ -- Get rid of them if we don't want them.
+ return $ xml >>> remove_full_story_paragraphs
let clean_xml = fs_xml >>> preprocess
image_map <- download_images clean_xml
parsed_byline <- parse_byline xml
parsed_body <- ap_parse_body xml
- putStrLn $ fromJust parsed_headline
-
if (isNothing parsed_headline) || (isNothing parsed_body)
then return []
else do
fp_parse xml = do
hl <- parse_headline xml
parsed_articles <- fp_parse_articles xml
- case parsed_articles of
- [] -> return Nothing
- x -> return $ Just $ FullPage (fromJust hl) x
+ return $ case parsed_articles of
+ [] -> Nothing
+ x -> Just $ FullPage (fromJust hl) x
hClose handle
xhtml_to_epub :: String -> String -> IO B.ByteString
-xhtml_to_epub epmd =
- write_epub . read_html
- where
- my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
- write_epub = writeEPUB Nothing [] my_writer_options
- read_html = readHtml defaultParserState
+xhtml_to_epub epmd xhtml = do
+ stylesheet <- construct_stylesheet
+ writeEPUB
+ (Just stylesheet)
+ []
+ my_writer_options
+ (read_html xhtml)
+ where
+ my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
+ read_html = readHtml defaultParserState
--