Return-style fix.

[dead/lwn-epub.git] / src / LWN / Page.hs
diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs

index 97171c641be08a6761a1a1800002190c7f10b9ed..3d53284c4b66e9c67ff222d19982ce4e2ced45d2 100644 (file)
--- a/src/LWN/Page.hs
+++ b/src/LWN/Page.hs
@@ -3,10 +3,12 @@
  module LWN.Page
  where
  
  module LWN.Page
  where
  
+import Control.Concurrent.ParallelIO (parallel)
  import qualified Data.Map as Map (lookup)
  import Data.Time (getCurrentTime)
  import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  import Data.String.Utils (split, strip)
  import qualified Data.Map as Map (lookup)
  import Data.Time (getCurrentTime)
  import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  import Data.String.Utils (split, strip)
+import qualified Data.Map as Map (Map, empty, insert)
  import Data.Maybe (catMaybes, fromJust, isNothing)
  import Prelude hiding (readFile)
  import System.IO (Handle, hClose, hFlush)
  import Data.Maybe (catMaybes, fromJust, isNothing)
  import Prelude hiding (readFile)
  import System.IO (Handle, hClose, hFlush)
@@ -18,26 +20,32 @@ import Text.Pandoc (
    defaultWriterOptions,
    readHtml,
    writeEPUB,
    defaultWriterOptions,
    readHtml,
    writeEPUB,
-  writerEPUBMetadata)
+  writerEPUBMetadata,
+  writerUserDataDir)
+import Text.Pandoc.Shared ( readDataFile )
  import Text.XML.HXT.Core (
    ArrowXml,
    IOSArrow,
    XmlTree,
  import Text.XML.HXT.Core (
    ArrowXml,
    IOSArrow,
    XmlTree,
+  ($<),
    (>>>),
    (/>),
    (//>),
    changeAttrValue,
    (>>>),
    (/>),
    (//>),
    changeAttrValue,
+  getAttrValue,
    getChildren,
    getText,
    hasName,
    getChildren,
    getText,
    hasName,
+  none,
    processAttrl,
    processTopDown,
    processAttrl,
    processTopDown,
+  this,
    runX,
    xshow,
    when)
  import Text.HandsomeSoup (css, parseHtml)
  
    runX,
    xshow,
    when)
  import Text.HandsomeSoup (css, parseHtml)
  
-import Configuration (Cfg)
+import Configuration (Cfg, full_stories)
  import LWN.Article
  import LWN.HTTP (
    ImageMap,
  import LWN.Article
  import LWN.HTTP (
    ImageMap,
@@ -46,12 +54,17 @@ import LWN.HTTP (
  import LWN.URI (URL)
  import LWN.XHTML (
    XHTML,
  import LWN.URI (URL)
  import LWN.XHTML (
    XHTML,
+  full_story_urls,
    image_srcs,
    image_srcs,
+  full_story_link,
+  full_story_paragraph,
    is_image,
    preprocess,
    remove_byline,
    is_image,
    preprocess,
    remove_byline,
+  remove_full_story_paragraphs,
    remove_title,
    to_xhtml,
    remove_title,
    to_xhtml,
+  to_xml,
    xml_from_contents)
  
  
    xml_from_contents)
  
  
@@ -102,23 +115,46 @@ instance XHTML Page where
  
  
  
  
  
  
+-- | Stolen from writeEPUB.
+default_stylesheet :: IO String
+default_stylesheet =
+  -- This comes with Pandoc, I guess.
+  readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
+
+
+construct_stylesheet :: IO String
+construct_stylesheet = do
+  defaults <- default_stylesheet
+  -- Allow word-wrapping in <pre> elements.
+  let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
+  return $ defaults ++ my_additions
+
  page_from_url :: Cfg -> URL -> IO (Maybe Page)
  page_from_url cfg url = do
    contents <- get_article_contents cfg url  
    case (xml_from_contents contents) of
  page_from_url :: Cfg -> URL -> IO (Maybe Page)
  page_from_url cfg url = do
    contents <- get_article_contents cfg url  
    case (xml_from_contents contents) of
-    Just html -> parse html
+    Just html -> parse cfg html
      Nothing -> return Nothing
  
  
  
      Nothing -> return Nothing
  
  
  
--- Should be called *after* preprocessing.
-download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
-download_images xml = do
-  image_urls <- runX $ xml >>> image_srcs
-  download_image_urls image_urls
-
-
-
+insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
+insert_full_stories story_map =
+  processTopDown (article_xml `when` full_story_paragraph)
+  where
+    lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
+    lookup_func href =
+      case Map.lookup href story_map of
+        -- Drop the paragraph if we don't have the contents.
+        Nothing -> none
+        Just v -> to_xml v
+
+    article_xml :: (ArrowXml a) => a XmlTree XmlTree
+    article_xml =
+      lookup_func
+      $< -- From HXT's Control.Arrow.ArrowList
+      (this /> full_story_link >>> getAttrValue "href")
+                   
  replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
  replace_remote_img_srcs image_map =
    processTopDown (make_srcs_local `when` is_image)
  replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
  replace_remote_img_srcs image_map =
    processTopDown (make_srcs_local `when` is_image)
@@ -140,11 +176,51 @@ replace_remote_img_srcs image_map =
        processAttrl $ (change_src `when` (hasName "src"))
  
  
        processAttrl $ (change_src `when` (hasName "src"))
  
  
-parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
-parse xml = do
-  let clean_xml = xml >>> preprocess
+
+
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
+download_images xml = do
+  image_urls <- runX $ xml >>> image_srcs
+  download_image_urls image_urls
+
+
+
+type StoryMap = Map.Map URL Article
+
+-- These come *before* preprocessing.
+download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
+download_full_story_urls cfg story_urls = do
+  pages <- parallel $ map (page_from_url cfg) story_urls
+  let pairs = zip story_urls pages
+  return $ foldl my_insert empty_map pairs
+  where
+    empty_map = Map.empty :: StoryMap
+
+    my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
+    my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
+    my_insert dict (_, _)  = dict
+
+
+download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
+download_full_stories cfg xml = do
+  story_urls <- runX $ xml >>> full_story_urls
+  download_full_story_urls cfg story_urls
+  
+
+parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
+parse cfg xml = do
+  fs_xml <- if (full_stories cfg) then do
+             story_map <- download_full_stories cfg xml
+             return $ xml >>> insert_full_stories story_map
+           else do
+             -- Get rid of them if we don't want them.
+             return $ xml >>> remove_full_story_paragraphs
+
+  let clean_xml = fs_xml >>> preprocess
    image_map <- download_images clean_xml
    let local_xml = clean_xml >>> replace_remote_img_srcs image_map
    image_map <- download_images clean_xml
    let local_xml = clean_xml >>> replace_remote_img_srcs image_map
+
    appr <- ap_parse local_xml
    fppr <- fp_parse local_xml
    return $
    appr <- ap_parse local_xml
    fppr <- fp_parse local_xml
    return $
@@ -207,8 +283,6 @@ ap_parse_articles xml = do
    parsed_byline   <- parse_byline xml
    parsed_body     <- ap_parse_body xml
  
    parsed_byline   <- parse_byline xml
    parsed_body     <- ap_parse_body xml
  
-  putStrLn $ fromJust parsed_headline
-
    if (isNothing parsed_headline) || (isNothing parsed_body)
    then return []
    else do
    if (isNothing parsed_headline) || (isNothing parsed_body)
    then return []
    else do
@@ -228,9 +302,9 @@ fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
  fp_parse xml = do
      hl <- parse_headline xml
      parsed_articles <- fp_parse_articles xml
  fp_parse xml = do
      hl <- parse_headline xml
      parsed_articles <- fp_parse_articles xml
-    case parsed_articles of
-      []          -> return Nothing
-      x -> return $ Just $ FullPage (fromJust hl) x
+    return $ case parsed_articles of
+      [] -> Nothing
+      x  -> Just $ FullPage (fromJust hl) x
  
  
  
  
  
  
@@ -299,7 +373,6 @@ fp_parse_articles xml = do
                            []    -> []
    -- Put the separator back, it was lost during the split.
    let split_articles = map (article_separator ++) split_articles'
                            []    -> []
    -- Put the separator back, it was lost during the split.
    let split_articles = map (article_separator ++) split_articles'
-  --_ <- mapM print_article split_articles
    real_articles <- mapM parse_html_article split_articles
    let just_articles = catMaybes real_articles
    return just_articles
    real_articles <- mapM parse_html_article split_articles
    let just_articles = catMaybes real_articles
    return just_articles
@@ -344,12 +417,16 @@ epublish obj handle = do
    hClose handle
  
  xhtml_to_epub :: String -> String -> IO B.ByteString
    hClose handle
  
  xhtml_to_epub :: String -> String -> IO B.ByteString
-xhtml_to_epub epmd =
-   write_epub . read_html
-   where
-     my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
-     write_epub = writeEPUB Nothing [] my_writer_options
-     read_html  = readHtml defaultParserState
+xhtml_to_epub epmd xhtml = do
+  stylesheet <- construct_stylesheet
+  writeEPUB
+    (Just stylesheet)
+    []
+    my_writer_options
+    (read_html xhtml)
+  where
+    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
+    read_html  = readHtml defaultParserState
  
  
  --
  
  
  --
@@ -423,10 +500,33 @@ test_comments_removed = do
      expected_xml'' = parseHtml expected_html
  
  
      expected_xml'' = parseHtml expected_html
  
  
+test_full_story_urls_parsed :: Assertion
+test_full_story_urls_parsed = do
+  actual <- runX $ actual'
+
+  assertEqual
+    "Full Story URLs are parsed"
+    expected
+    actual
+  where
+    expected = ["/Articles/500738/", "/Articles/501837/"]
+
+    full_story_html =
+      concat ["<p>",
+              "<a href=\"/Articles/500738/\">Full Story</a> ",
+              "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
+              "<p>",
+              "<a href=\"/Articles/501837/\">Full Story</a> ",
+              "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
+              "<p>"]
+
+    xml = parseHtml full_story_html
+    actual' = xml >>> full_story_urls
  
  page_tests :: Test
  page_tests =
    testGroup "Page Tests" [
      testCase "Links are replaced with spans" test_preprocess_links,
      testCase "Image srcs are made absolute" test_absolve_images,
  
  page_tests :: Test
  page_tests =
    testGroup "Page Tests" [
      testCase "Links are replaced with spans" test_preprocess_links,
      testCase "Image srcs are made absolute" test_absolve_images,
-    testCase "Comment links are removed" test_comments_removed ]
+    testCase "Comment links are removed" test_comments_removed,
+    testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]