Implement image download and replacement.

[dead/lwn-epub.git] / src / LWN / Page.hs
diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs

index 24997154eeb84951188d804b6336647c8477a61f..0307214176a82c469f1f555a056b553c05bd3b06 100644 (file)
--- a/src/LWN/Page.hs
+++ b/src/LWN/Page.hs
@@ -1,16 +1,20 @@
+{-# LANGUAGE DoAndIfThenElse #-}
+
  module LWN.Page
  where
  
  module LWN.Page
  where
  
-import Text.Pandoc
+import qualified Data.Map as Map
  import Data.Time (getCurrentTime)
  import System.IO (Handle)
  import qualified Data.ByteString.Lazy as B (ByteString, hPut)
  import Data.Time (getCurrentTime)
  import System.IO (Handle)
  import qualified Data.ByteString.Lazy as B (ByteString, hPut)
+import Data.List (isInfixOf)
  import Data.String.Utils (split, strip)
  import Data.Maybe (catMaybes, fromJust, isNothing)
  import Data.Tree.NTree.TypeDefs (NTree)
  import Test.HUnit (Assertion, assertEqual)
  import Test.Framework (Test, testGroup)
  import Test.Framework.Providers.HUnit (testCase)
  import Data.String.Utils (split, strip)
  import Data.Maybe (catMaybes, fromJust, isNothing)
  import Data.Tree.NTree.TypeDefs (NTree)
  import Test.HUnit (Assertion, assertEqual)
  import Test.Framework (Test, testGroup)
  import Test.Framework.Providers.HUnit (testCase)
+import Text.Pandoc
  import Text.XML.HXT.Core (
    ArrowXml,
    IOSArrow,
  import Text.XML.HXT.Core (
    ArrowXml,
    IOSArrow,
@@ -19,10 +23,13 @@ import Text.XML.HXT.Core (
    (>>>),
    (/>),
    (//>),
    (>>>),
    (/>),
    (//>),
+  changeAttrValue,
+  getAttrValue,
    getChildren,
    getText,
    hasAttrValue,
    hasName,
    getChildren,
    getText,
    hasAttrValue,
    hasName,
+  isElem,
    mkName,
    none,
    processAttrl,
    mkName,
    none,
    processAttrl,
@@ -35,8 +42,29 @@ import Text.XML.HXT.Core (
  import Text.HandsomeSoup (css, parseHtml)
  
  import LWN.Article
  import Text.HandsomeSoup (css, parseHtml)
  
  import LWN.Article
+import LWN.HTTP (save_image)
+import LWN.URI (URL, try_make_absolute_url)
  import XHTML
  
  import XHTML
  
+-- Map absolute image URLs to local system file paths where the image
+-- referenced by the URL is stored.
+type ImageMap = Map.Map URL FilePath
+
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree (NTree XNode) -> IO ImageMap
+download_images xml = do
+  image_urls <- runX $ xml >>> image_srcs
+  files <- mapM save_image image_urls
+  let pairs = zip image_urls files
+  return $ foldl my_insert empty_map pairs
+  where
+    empty_map = Map.empty :: ImageMap
+
+    my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
+    my_insert dict (_, Nothing)  = dict
+    my_insert dict (k, Just v) = Map.insert k v dict
+
+
  data Page =
    -- | An LWN page with one article on it.
    ArticlePage { article :: Article } |
  data Page =
    -- | An LWN page with one article on it.
    ArticlePage { article :: Article } |
@@ -84,14 +112,22 @@ instance XHTML Page where
  
  
  
  
  
  
-remove_images :: (ArrowXml a) => a XmlTree XmlTree
-remove_images =
-  processTopDown ((none) `when` is_image)
-
-
  is_link :: (ArrowXml a) => a XmlTree XmlTree
  is_link =
  is_link :: (ArrowXml a) => a XmlTree XmlTree
  is_link =
-  hasName "a"
+  isElem >>> hasName "a"
+
+
+remove_comment_links  :: (ArrowXml a) => a XmlTree XmlTree
+remove_comment_links =
+  processTopDown $ kill_comments `when` is_link
+  where    
+    contains = isInfixOf
+
+    is_comment_link =
+      hasAttrValue "href" (contains "#Comments")
+
+    kill_comments =
+      none `when` is_comment_link
  
  replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
  replace_links_with_spans =
  
  replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
  replace_links_with_spans =
@@ -100,55 +136,88 @@ replace_links_with_spans =
      make_span    = setElemName $ mkName "span"
      remove_attrs = processAttrl none
  
      make_span    = setElemName $ mkName "span"
      remove_attrs = processAttrl none
  
+
  -- | Preprocessing common to both page types.
  -- | Preprocessing common to both page types.
-preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
-preprocess xml =
-  xml >>> remove_images >>> replace_links_with_spans
+preprocess :: (ArrowXml a) => a XmlTree XmlTree
+preprocess =
+  make_image_srcs_absolute
+  >>>
+  remove_comment_links
+  >>>
+  replace_links_with_spans
+
+
+replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
+replace_remote_img_srcs image_map =
+  processTopDown (make_srcs_local `when` is_image)
+  where
+    -- old_src -> new_src
+    change_src_func :: String -> String
+    change_src_func old_src =
+      case Map.lookup old_src image_map of
+        -- Leave it alone if we don't have the file locally
+        Nothing -> old_src
+        Just v -> v
+
+    change_src :: (ArrowXml a) => a XmlTree XmlTree
+    change_src =
+      changeAttrValue change_src_func
+
+    make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
+    make_srcs_local =
+      processAttrl $ (change_src `when` (hasName "src"))
  
  
  parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
  parse xml = do
  
  
  parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
  parse xml = do
-  let clean_xml = preprocess xml
-  appr <- ap_parse clean_xml
-  fppr <- fp_parse clean_xml
+  let clean_xml = xml >>> preprocess
+  image_map <- download_images clean_xml
+  let local_xml = clean_xml >>> replace_remote_img_srcs image_map
+  appr <- ap_parse local_xml
+  fppr <- fp_parse local_xml
    return $
      if (isNothing appr) then
        fppr
    return $
      if (isNothing appr) then
        fppr
-    else
+    else 
        appr
  
        appr
  
---
--- ArticlePage Stuff
---
-ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
-ap_parse xml = do
-    arts <- ap_parse_articles xml
-    case arts of
-      Just [x] -> return $ Just $ ArticlePage x
-      _   -> return Nothing
  
  
  
  
-ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-ap_parse_headline xml = do
+parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_headline xml = do
    let element_filter = xml >>> css "div.PageHeadline h1"
    let element_text_filter = element_filter /> getText
    element_text <- runX element_text_filter
    let element_filter = xml >>> css "div.PageHeadline h1"
    let element_text_filter = element_filter /> getText
    element_text <- runX element_text_filter
-  return $ case element_text of
-            [x] -> Just $ strip x
-            []  -> Nothing
-            _   -> error "Found more than one headline."
+  return $
+    case element_text of
+      [x] -> Just $ strip x
+      []  -> Nothing
+      _   -> error "Found more than one headline."
+
  
  
-ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-ap_parse_byline xml = do
-  let element_filter = xml >>> css "div.Byline"
+parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_byline xml = do
+  let element_filter = xml >>> css "div.FeatureByLine"
    let element_text_filter = element_filter /> getText
    element_text <- runX element_text_filter
    let element_text_filter = element_filter /> getText
    element_text <- runX element_text_filter
-  return $ case element_text of
-            [x] -> Just $ strip x
-            []  -> Nothing
-            _   -> error "Found more than one byline."
+  return $
+    case element_text of
+      [x] -> Just $ strip x
+      []  -> Nothing
+      _   -> error "Found more than one article byline."
+
  
  
+--
+-- ArticlePage Stuff
+--
+ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+ap_parse xml = do
+    arts <- ap_parse_articles xml          
+    case arts of
+      [x] -> return $ Just $ ArticlePage x
+      _   -> return Nothing
  
  
+  
  ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  ap_parse_body xml = do
    let element_filter = xml >>> css "div.ArticleText"
  ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  ap_parse_body xml = do
    let element_filter = xml >>> css "div.ArticleText"
@@ -160,16 +229,22 @@ ap_parse_body xml = do
              _   -> error "Found more than one article."
  
  
              _   -> error "Found more than one article."
  
  
-ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
+ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
  ap_parse_articles xml = do
  ap_parse_articles xml = do
-  parsed_headline <- ap_parse_headline xml
-  parsed_byline   <- ap_parse_byline xml
+  parsed_headline <- parse_headline xml
+  parsed_byline   <- parse_byline xml
    parsed_body     <- ap_parse_body xml
    parsed_body     <- ap_parse_body xml
-  let title'   = Title (fromJust parsed_headline)
-  let byline' = Byline  parsed_byline
-  let body'   = BodyHtml (fromJust parsed_body)
-  return $ Just $ [Article title' byline' body']
  
  
+  putStrLn $ fromJust parsed_headline
+
+  if (isNothing parsed_headline) || (isNothing parsed_body)
+  then return []
+  else do    
+    let title'  = Title    $ fromJust parsed_headline
+    let byline' = Byline     parsed_byline
+    let body'   = BodyHtml $ fromJust parsed_body
+    
+    return $ [Article title' byline' body']
  
  
  
  
  
  
@@ -179,36 +254,14 @@ ap_parse_articles xml = do
  
  fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
  fp_parse xml = do
  
  fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
  fp_parse xml = do
-    hl <- fp_parse_headline xml
+    hl <- parse_headline xml
      parsed_articles <- fp_parse_articles xml
      case parsed_articles of
      parsed_articles <- fp_parse_articles xml
      case parsed_articles of
-      them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
-      _          -> return Nothing
-
+      []          -> return Nothing
+      x -> return $ Just $ FullPage (fromJust hl) x
  
  
  
  
  
  
-fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-fp_parse_headline xml = do
-  let element_filter = xml >>> css "div.PageHeadline h1"
-  let element_text_filter = element_filter /> getText
-  element_text <- runX element_text_filter
-  return $ case element_text of
-            [x] -> Just $ strip x
-            []  -> Nothing
-            _   -> error "Found more than one headline."
-
-fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-fp_parse_article_byline xml = do
-  let element_filter = xml >>> css "div.FeatureByLine"
-  let element_text_filter = element_filter /> getText
-  element_text <- runX element_text_filter
-  return $ case element_text of
-            [x] -> Just $ strip x
-            []  -> Nothing
-            _   -> error "Found more than one article byline."
-
-
  fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  fp_parse_article_title xml = do
    let element_filter = xml >>> css "h2.SummaryHL"
  fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
  fp_parse_article_title xml = do
    let element_filter = xml >>> css "h2.SummaryHL"
@@ -236,8 +289,7 @@ is_byline =
  
  
  is_image :: (ArrowXml a) => a XmlTree XmlTree
  
  
  is_image :: (ArrowXml a) => a XmlTree XmlTree
-is_image =
-  hasName "img"
+is_image = isElem >>> hasName "img"
  
  remove_title :: (ArrowXml a) => a XmlTree XmlTree
  remove_title =
  
  remove_title :: (ArrowXml a) => a XmlTree XmlTree
  remove_title =
@@ -266,12 +318,17 @@ fp_parse_article_body xml = do
  fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
  fp_parse_article xml = do
    parsed_article_title    <- fp_parse_article_title xml
  fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
  fp_parse_article xml = do
    parsed_article_title    <- fp_parse_article_title xml
-  parsed_article_byline   <- fp_parse_article_byline xml
+  parsed_article_byline   <- parse_byline xml
    parsed_article_body     <- fp_parse_article_body xml
    parsed_article_body     <- fp_parse_article_body xml
-  let title'   = Title    $ fromJust parsed_article_title
-  let byline'  = Byline     parsed_article_byline
-  let body'    = BodyHtml $ fromJust parsed_article_body
-  return $ Just $ Article title' byline' body'
+
+  if (isNothing parsed_article_title) || (isNothing parsed_article_body)
+  then
+    return Nothing
+  else do
+    let title'   = Title    $ fromJust parsed_article_title
+    let byline'  = Byline     parsed_article_byline
+    let body'    = BodyHtml $ fromJust parsed_article_body
+    return $ Just $ Article title' byline' body'
  
  parse_html_article :: String -> IO (Maybe Article)
  parse_html_article html = do
  
  parse_html_article :: String -> IO (Maybe Article)
  parse_html_article html = do
@@ -280,7 +337,7 @@ parse_html_article html = do
  
    
  -- | In the full page, all of the article titles and bodies are
  
    
  -- | In the full page, all of the article titles and bodies are
---   wrapped in a div.ArticleText.
+--   wrapped in one big div.ArticleText.
  parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
  parse_bodies xml =
    xml >>> css "div.ArticleText"
  parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
  parse_bodies xml =
    xml >>> css "div.ArticleText"
@@ -292,7 +349,9 @@ fp_parse_articles xml = do
    let article_separator = "<h2 class=\"SummaryHL\">"
    let split_articles'' = split article_separator (concat bodies)
    -- The first element will contain the crap before the first <h2...>.
    let article_separator = "<h2 class=\"SummaryHL\">"
    let split_articles'' = split article_separator (concat bodies)
    -- The first element will contain the crap before the first <h2...>.
-  let split_articles' = tail split_articles''
+  let split_articles' = case split_articles'' of
+                          (_:_) -> tail split_articles''
+                          []    -> []
    -- Put the separator back, it was lost during the split.
    let split_articles = map (article_separator ++) split_articles'
    --_ <- mapM print_article split_articles
    -- Put the separator back, it was lost during the split.
    let split_articles = map (article_separator ++) split_articles'
    --_ <- mapM print_article split_articles
@@ -301,6 +360,8 @@ fp_parse_articles xml = do
    return just_articles
  
  
    return just_articles
  
  
+-- | This makes it easy to select otherwise-random chunks of html
+--   using 'css'.
  wrap_in_body_div :: String -> String
  wrap_in_body_div s =
    "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
  wrap_in_body_div :: String -> String
  wrap_in_body_div s =
    "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
@@ -346,10 +407,36 @@ xhtml_to_epub epmd =
  
  
  
  
  
  
+--
+-- Misc
+--
+
+image_srcs :: (ArrowXml a) => a XmlTree URL
+image_srcs =
+  css "img"
+  >>>
+  getAttrValue "src"
+
+make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
+make_image_srcs_absolute =
+  processTopDown (make_srcs_absolute `when` is_image)
+  where
+    change_src :: (ArrowXml a) => a XmlTree XmlTree
+    change_src =
+      changeAttrValue try_make_absolute_url
+
+    make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
+    make_srcs_absolute = 
+      processAttrl $ change_src `when` hasName "src"
+
+
+--
+-- Tests
+--
  
  test_preprocess_links :: Assertion
  test_preprocess_links = do
  
  test_preprocess_links :: Assertion
  test_preprocess_links = do
-  actual_xml' <- runX $ (preprocess input_xml) >>> css "body"
+  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
    let actual_xml = actual_xml' !! 0
  
    expected_xml' <- runX $ expected_xml'' >>> css "body"
    let actual_xml = actual_xml' !! 0
  
    expected_xml' <- runX $ expected_xml'' >>> css "body"
@@ -365,7 +452,59 @@ test_preprocess_links = do
      expected_html = "<body><span>Hello, world!</span></body>"
      expected_xml'' = parseHtml expected_html
  
      expected_html = "<body><span>Hello, world!</span></body>"
      expected_xml'' = parseHtml expected_html
  
+
+test_absolve_images :: Assertion
+test_absolve_images = do
+  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
+  let actual_xml = actual_xml' !! 0
+
+  expected_xml' <- runX $ expected_xml'' >>> css "body"
+  let expected_xml = expected_xml' !! 0
+
+  assertEqual
+    "Image srcs are made absolute"
+    expected_xml
+    actual_xml
+  where
+    input_html =
+      "<body>" ++
+      "<img src=\"/images/2012/example.jpg\" />" ++
+      "</body>"
+    input_xml  = parseHtml input_html
+    expected_html =
+      "<body>" ++
+      "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
+      "</body>"
+    expected_xml'' = parseHtml expected_html
+
+
+test_comments_removed :: Assertion
+test_comments_removed = do
+  actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
+  let actual_xml = actual_xml' !! 0
+
+  expected_xml' <- runX $ expected_xml'' >>> css "body"
+  let expected_xml = expected_xml' !! 0
+
+  assertEqual
+    "Comment links are removed"
+    expected_xml
+    actual_xml
+  where
+    input_html =
+      "<body><p>" ++
+      "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
+      "</p></body>"
+    input_xml  = parseHtml input_html
+
+    expected_html  = "<body><p></p></body>"
+    expected_xml'' = parseHtml expected_html
+
+
+
  page_tests :: Test
  page_tests =
    testGroup "Page Tests" [
  page_tests :: Test
  page_tests =
    testGroup "Page Tests" [
-    testCase "Links are replaced with spans" test_preprocess_links ]
+    testCase "Links are replaced with spans" test_preprocess_links,
+    testCase "Image srcs are made absolute" test_absolve_images,
+    testCase "Comment links are removed" test_comments_removed ]