Bump a few dependencies, fix compilation failures.

[dead/lwn-epub.git] / src / LWN / Page.hs
diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs

index f0ada7cd13cfff0707904272f93fa2b4b354f9e3..5f3b9ee608f42dc9895c458e3968e0363f4702bb 100644 (file)
--- a/src/LWN/Page.hs
+++ b/src/LWN/Page.hs
@@ -3,11 +3,11 @@
  module LWN.Page
  where
  
  module LWN.Page
  where
  
-import qualified Data.Map as Map (lookup)
+import Control.Concurrent.ParallelIO (parallel)
  import Data.Time (getCurrentTime)
  import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  import Data.String.Utils (split, strip)
  import Data.Time (getCurrentTime)
  import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  import Data.String.Utils (split, strip)
-import qualified Data.Map as Map (Map, empty, insert)
+import qualified Data.Map as Map (Map, empty, insert, lookup)
  import Data.Maybe (catMaybes, fromJust, isNothing)
  import Prelude hiding (readFile)
  import System.IO (Handle, hClose, hFlush)
  import Data.Maybe (catMaybes, fromJust, isNothing)
  import Prelude hiding (readFile)
  import System.IO (Handle, hClose, hFlush)
@@ -19,7 +19,9 @@ import Text.Pandoc (
    defaultWriterOptions,
    readHtml,
    writeEPUB,
    defaultWriterOptions,
    readHtml,
    writeEPUB,
-  writerEPUBMetadata)
+  writerEPUBMetadata,
+  writerUserDataDir)
+import Text.Pandoc.Shared ( readDataFile )
  import Text.XML.HXT.Core (
    ArrowXml,
    IOSArrow,
  import Text.XML.HXT.Core (
    ArrowXml,
    IOSArrow,
@@ -33,6 +35,7 @@ import Text.XML.HXT.Core (
    getChildren,
    getText,
    hasName,
    getChildren,
    getText,
    hasName,
+  none,
    processAttrl,
    processTopDown,
    this,
    processAttrl,
    processTopDown,
    this,
@@ -41,7 +44,7 @@ import Text.XML.HXT.Core (
    when)
  import Text.HandsomeSoup (css, parseHtml)
  
    when)
  import Text.HandsomeSoup (css, parseHtml)
  
-import Configuration (Cfg)
+import Configuration (Cfg, full_stories)
  import LWN.Article
  import LWN.HTTP (
    ImageMap,
  import LWN.Article
  import LWN.HTTP (
    ImageMap,
@@ -57,6 +60,7 @@ import LWN.XHTML (
    is_image,
    preprocess,
    remove_byline,
    is_image,
    preprocess,
    remove_byline,
+  remove_full_story_paragraphs,
    remove_title,
    to_xhtml,
    to_xml,
    remove_title,
    to_xhtml,
    to_xml,
@@ -110,6 +114,20 @@ instance XHTML Page where
  
  
  
  
  
  
+-- | Stolen from writeEPUB.
+default_stylesheet :: IO String
+default_stylesheet =
+  -- This comes with Pandoc, I guess.
+  readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
+
+
+construct_stylesheet :: IO String
+construct_stylesheet = do
+  defaults <- default_stylesheet
+  -- Allow word-wrapping in <pre> elements.
+  let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
+  return $ defaults ++ my_additions
+
  page_from_url :: Cfg -> URL -> IO (Maybe Page)
  page_from_url cfg url = do
    contents <- get_article_contents cfg url  
  page_from_url :: Cfg -> URL -> IO (Maybe Page)
  page_from_url cfg url = do
    contents <- get_article_contents cfg url  
@@ -126,14 +144,14 @@ insert_full_stories story_map =
      lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
      lookup_func href =
        case Map.lookup href story_map of
      lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
      lookup_func href =
        case Map.lookup href story_map of
-        -- Leave it alone if we don't have the full story.
-        Nothing -> this
+        -- Drop the paragraph if we don't have the contents.
+        Nothing -> none
          Just v -> to_xml v
  
      article_xml :: (ArrowXml a) => a XmlTree XmlTree
      article_xml =
        lookup_func
          Just v -> to_xml v
  
      article_xml :: (ArrowXml a) => a XmlTree XmlTree
      article_xml =
        lookup_func
-      $<
+      $< -- From HXT's Control.Arrow.ArrowList
        (this /> full_story_link >>> getAttrValue "href")
                     
  replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
        (this /> full_story_link >>> getAttrValue "href")
                     
  replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
@@ -144,8 +162,9 @@ replace_remote_img_srcs image_map =
      change_src_func :: String -> String
      change_src_func old_src =
        case Map.lookup old_src image_map of
      change_src_func :: String -> String
      change_src_func old_src =
        case Map.lookup old_src image_map of
-        -- Leave it alone if we don't have the file locally
-        Nothing -> old_src
+        -- If we don't have the file, empty the src. Pandoc will crash
+        -- otherwise.
+        Nothing -> ""
          Just v -> v
  
      change_src :: (ArrowXml a) => a XmlTree XmlTree
          Just v -> v
  
      change_src :: (ArrowXml a) => a XmlTree XmlTree
@@ -172,7 +191,7 @@ type StoryMap = Map.Map URL Article
  -- These come *before* preprocessing.
  download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
  download_full_story_urls cfg story_urls = do
  -- These come *before* preprocessing.
  download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
  download_full_story_urls cfg story_urls = do
-  pages <- mapM (page_from_url cfg) story_urls
+  pages <- parallel $ map (page_from_url cfg) story_urls
    let pairs = zip story_urls pages
    return $ foldl my_insert empty_map pairs
    where
    let pairs = zip story_urls pages
    return $ foldl my_insert empty_map pairs
    where
@@ -191,8 +210,12 @@ download_full_stories cfg xml = do
  
  parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
  parse cfg xml = do
  
  parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
  parse cfg xml = do
-  story_map <- download_full_stories cfg xml
-  let fs_xml = xml >>> insert_full_stories story_map
+  fs_xml <- if (full_stories cfg) then do
+             story_map <- download_full_stories cfg xml
+             return $ xml >>> insert_full_stories story_map
+           else do
+             -- Get rid of them if we don't want them.
+             return $ xml >>> remove_full_story_paragraphs
  
    let clean_xml = fs_xml >>> preprocess
    image_map <- download_images clean_xml
  
    let clean_xml = fs_xml >>> preprocess
    image_map <- download_images clean_xml
@@ -260,8 +283,6 @@ ap_parse_articles xml = do
    parsed_byline   <- parse_byline xml
    parsed_body     <- ap_parse_body xml
  
    parsed_byline   <- parse_byline xml
    parsed_body     <- ap_parse_body xml
  
-  putStrLn $ fromJust parsed_headline
-
    if (isNothing parsed_headline) || (isNothing parsed_body)
    then return []
    else do
    if (isNothing parsed_headline) || (isNothing parsed_body)
    then return []
    else do
@@ -281,9 +302,9 @@ fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
  fp_parse xml = do
      hl <- parse_headline xml
      parsed_articles <- fp_parse_articles xml
  fp_parse xml = do
      hl <- parse_headline xml
      parsed_articles <- fp_parse_articles xml
-    case parsed_articles of
-      []          -> return Nothing
-      x -> return $ Just $ FullPage (fromJust hl) x
+    return $ case parsed_articles of
+      [] -> Nothing
+      x  -> Just $ FullPage (fromJust hl) x
  
  
  
  
  
  
@@ -352,7 +373,6 @@ fp_parse_articles xml = do
                            []    -> []
    -- Put the separator back, it was lost during the split.
    let split_articles = map (article_separator ++) split_articles'
                            []    -> []
    -- Put the separator back, it was lost during the split.
    let split_articles = map (article_separator ++) split_articles'
-  --_ <- mapM print_article split_articles
    real_articles <- mapM parse_html_article split_articles
    let just_articles = catMaybes real_articles
    return just_articles
    real_articles <- mapM parse_html_article split_articles
    let just_articles = catMaybes real_articles
    return just_articles
@@ -397,12 +417,16 @@ epublish obj handle = do
    hClose handle
  
  xhtml_to_epub :: String -> String -> IO B.ByteString
    hClose handle
  
  xhtml_to_epub :: String -> String -> IO B.ByteString
-xhtml_to_epub epmd =
-   write_epub . read_html
-   where
-     my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
-     write_epub = writeEPUB Nothing [] my_writer_options
-     read_html  = readHtml defaultParserState
+xhtml_to_epub epmd xhtml = do
+  stylesheet <- construct_stylesheet
+  writeEPUB
+    (Just stylesheet)
+    []
+    my_writer_options
+    (read_html xhtml)
+  where
+    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
+    read_html  = readHtml defaultParserState
  
  
  --
  
  
  --