Don't download full stories if we aren't going to use them.

[dead/lwn-epub.git] / src / LWN / Page.hs
diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs

index 97171c641be08a6761a1a1800002190c7f10b9ed..3dccf466fea98f49a8ee42dcf44a2cdebe66a42c 100644 (file)
--- a/src/LWN/Page.hs
+++ b/src/LWN/Page.hs
@@ -3,10 +3,12 @@
  module LWN.Page
  where
  
  module LWN.Page
  where
  
+import Control.Concurrent.ParallelIO (parallel)
  import qualified Data.Map as Map (lookup)
  import Data.Time (getCurrentTime)
  import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  import Data.String.Utils (split, strip)
  import qualified Data.Map as Map (lookup)
  import Data.Time (getCurrentTime)
  import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  import Data.String.Utils (split, strip)
+import qualified Data.Map as Map (Map, empty, insert)
  import Data.Maybe (catMaybes, fromJust, isNothing)
  import Prelude hiding (readFile)
  import System.IO (Handle, hClose, hFlush)
  import Data.Maybe (catMaybes, fromJust, isNothing)
  import Prelude hiding (readFile)
  import System.IO (Handle, hClose, hFlush)
@@ -23,21 +25,24 @@ import Text.XML.HXT.Core (
    ArrowXml,
    IOSArrow,
    XmlTree,
    ArrowXml,
    IOSArrow,
    XmlTree,
+  ($<),
    (>>>),
    (/>),
    (//>),
    changeAttrValue,
    (>>>),
    (/>),
    (//>),
    changeAttrValue,
+  getAttrValue,
    getChildren,
    getText,
    hasName,
    processAttrl,
    processTopDown,
    getChildren,
    getText,
    hasName,
    processAttrl,
    processTopDown,
+  this,
    runX,
    xshow,
    when)
  import Text.HandsomeSoup (css, parseHtml)
  
    runX,
    xshow,
    when)
  import Text.HandsomeSoup (css, parseHtml)
  
-import Configuration (Cfg)
+import Configuration (Cfg, full_stories)
  import LWN.Article
  import LWN.HTTP (
    ImageMap,
  import LWN.Article
  import LWN.HTTP (
    ImageMap,
@@ -46,12 +51,16 @@ import LWN.HTTP (
  import LWN.URI (URL)
  import LWN.XHTML (
    XHTML,
  import LWN.URI (URL)
  import LWN.XHTML (
    XHTML,
+  full_story_urls,
    image_srcs,
    image_srcs,
+  full_story_link,
+  full_story_paragraph,
    is_image,
    preprocess,
    remove_byline,
    remove_title,
    to_xhtml,
    is_image,
    preprocess,
    remove_byline,
    remove_title,
    to_xhtml,
+  to_xml,
    xml_from_contents)
  
  
    xml_from_contents)
  
  
@@ -106,19 +115,28 @@ page_from_url :: Cfg -> URL -> IO (Maybe Page)
  page_from_url cfg url = do
    contents <- get_article_contents cfg url  
    case (xml_from_contents contents) of
  page_from_url cfg url = do
    contents <- get_article_contents cfg url  
    case (xml_from_contents contents) of
-    Just html -> parse html
+    Just html -> parse cfg html
      Nothing -> return Nothing
  
  
  
      Nothing -> return Nothing
  
  
  
--- Should be called *after* preprocessing.
-download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
-download_images xml = do
-  image_urls <- runX $ xml >>> image_srcs
-  download_image_urls image_urls
-
-
-
+insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
+insert_full_stories story_map =
+  processTopDown (article_xml `when` full_story_paragraph)
+  where
+    lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
+    lookup_func href =
+      case Map.lookup href story_map of
+        -- Leave it alone if we don't have the full story.
+        Nothing -> this
+        Just v -> to_xml v
+
+    article_xml :: (ArrowXml a) => a XmlTree XmlTree
+    article_xml =
+      lookup_func
+      $< -- From HXT's Control.Arrow.ArrowList
+      (this /> full_story_link >>> getAttrValue "href")
+                   
  replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
  replace_remote_img_srcs image_map =
    processTopDown (make_srcs_local `when` is_image)
  replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
  replace_remote_img_srcs image_map =
    processTopDown (make_srcs_local `when` is_image)
@@ -140,11 +158,50 @@ replace_remote_img_srcs image_map =
        processAttrl $ (change_src `when` (hasName "src"))
  
  
        processAttrl $ (change_src `when` (hasName "src"))
  
  
-parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
-parse xml = do
-  let clean_xml = xml >>> preprocess
+
+
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
+download_images xml = do
+  image_urls <- runX $ xml >>> image_srcs
+  download_image_urls image_urls
+
+
+
+type StoryMap = Map.Map URL Article
+
+-- These come *before* preprocessing.
+download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
+download_full_story_urls cfg story_urls = do
+  pages <- parallel $ map (page_from_url cfg) story_urls
+  let pairs = zip story_urls pages
+  return $ foldl my_insert empty_map pairs
+  where
+    empty_map = Map.empty :: StoryMap
+
+    my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
+    my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
+    my_insert dict (_, _)  = dict
+
+
+download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
+download_full_stories cfg xml = do
+  story_urls <- runX $ xml >>> full_story_urls
+  download_full_story_urls cfg story_urls
+  
+
+parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
+parse cfg xml = do
+  fs_xml <- if (full_stories cfg) then do
+             story_map <- download_full_stories cfg xml
+             return $ xml >>> insert_full_stories story_map
+           else do
+             return xml
+
+  let clean_xml = fs_xml >>> preprocess
    image_map <- download_images clean_xml
    let local_xml = clean_xml >>> replace_remote_img_srcs image_map
    image_map <- download_images clean_xml
    let local_xml = clean_xml >>> replace_remote_img_srcs image_map
+
    appr <- ap_parse local_xml
    fppr <- fp_parse local_xml
    return $
    appr <- ap_parse local_xml
    fppr <- fp_parse local_xml
    return $
@@ -207,8 +264,6 @@ ap_parse_articles xml = do
    parsed_byline   <- parse_byline xml
    parsed_body     <- ap_parse_body xml
  
    parsed_byline   <- parse_byline xml
    parsed_body     <- ap_parse_body xml
  
-  putStrLn $ fromJust parsed_headline
-
    if (isNothing parsed_headline) || (isNothing parsed_body)
    then return []
    else do
    if (isNothing parsed_headline) || (isNothing parsed_body)
    then return []
    else do
@@ -299,7 +354,6 @@ fp_parse_articles xml = do
                            []    -> []
    -- Put the separator back, it was lost during the split.
    let split_articles = map (article_separator ++) split_articles'
                            []    -> []
    -- Put the separator back, it was lost during the split.
    let split_articles = map (article_separator ++) split_articles'
-  --_ <- mapM print_article split_articles
    real_articles <- mapM parse_html_article split_articles
    let just_articles = catMaybes real_articles
    return just_articles
    real_articles <- mapM parse_html_article split_articles
    let just_articles = catMaybes real_articles
    return just_articles
@@ -423,10 +477,33 @@ test_comments_removed = do
      expected_xml'' = parseHtml expected_html
  
  
      expected_xml'' = parseHtml expected_html
  
  
+test_full_story_urls_parsed :: Assertion
+test_full_story_urls_parsed = do
+  actual <- runX $ actual'
+
+  assertEqual
+    "Full Story URLs are parsed"
+    expected
+    actual
+  where
+    expected = ["/Articles/500738/", "/Articles/501837/"]
+
+    full_story_html =
+      concat ["<p>",
+              "<a href=\"/Articles/500738/\">Full Story</a> ",
+              "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
+              "<p>",
+              "<a href=\"/Articles/501837/\">Full Story</a> ",
+              "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
+              "<p>"]
+
+    xml = parseHtml full_story_html
+    actual' = xml >>> full_story_urls
  
  page_tests :: Test
  page_tests =
    testGroup "Page Tests" [
      testCase "Links are replaced with spans" test_preprocess_links,
      testCase "Image srcs are made absolute" test_absolve_images,
  
  page_tests :: Test
  page_tests =
    testGroup "Page Tests" [
      testCase "Links are replaced with spans" test_preprocess_links,
      testCase "Image srcs are made absolute" test_absolve_images,
-    testCase "Comment links are removed" test_comments_removed ]
+    testCase "Comment links are removed" test_comments_removed,
+    testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]