X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=5f3b9ee608f42dc9895c458e3968e0363f4702bb;hp=4d61cfbc5a4f2ad089e852bd13a459f2280f1cbf;hb=HEAD;hpb=6f0e6cbece7e1b1a3c6b43d19eb2f29088af981c diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 4d61cfb..5f3b9ee 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -1,35 +1,73 @@ +{-# LANGUAGE DoAndIfThenElse #-} + module LWN.Page where -import Text.Pandoc +import Control.Concurrent.ParallelIO (parallel) import Data.Time (getCurrentTime) -import System.IO (Handle) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) +import qualified Data.Map as Map (Map, empty, insert, lookup) import Data.Maybe (catMaybes, fromJust, isNothing) -import Data.Tree.NTree.TypeDefs (NTree) +import Prelude hiding (readFile) +import System.IO (Handle, hClose, hFlush) +import Test.HUnit (Assertion, assertEqual) +import Test.Framework (Test, testGroup) +import Test.Framework.Providers.HUnit (testCase) +import Text.Pandoc ( + defaultParserState, + defaultWriterOptions, + readHtml, + writeEPUB, + writerEPUBMetadata, + writerUserDataDir) +import Text.Pandoc.Shared ( readDataFile ) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, XmlTree, - XNode, + ($<), (>>>), (/>), (//>), + changeAttrValue, + getAttrValue, getChildren, getText, - hasAttrValue, hasName, none, + processAttrl, processTopDown, + this, runX, xshow, - when - ) + when) import Text.HandsomeSoup (css, parseHtml) +import Configuration (Cfg, full_stories) import LWN.Article -import XHTML +import LWN.HTTP ( + ImageMap, + download_image_urls, + get_article_contents) +import LWN.URI (URL) +import LWN.XHTML ( + XHTML, + full_story_urls, + image_srcs, + full_story_link, + full_story_paragraph, + is_image, + preprocess, + remove_byline, + remove_full_story_paragraphs, + remove_title, + to_xhtml, + to_xml, + xml_from_contents) + + + data Page = -- | An LWN page with one article on it. @@ -53,9 +91,7 @@ instance XHTML Page where " " ++ (show $ LWN.Article.title a) ++ "" ++ "" ++ "" ++ - "
" ++ (to_xhtml a) ++ - "
" ++ "" ++ "" @@ -78,61 +114,159 @@ instance XHTML Page where -remove_images :: (ArrowXml a) => a XmlTree XmlTree -remove_images = - processTopDown ((none) `when` is_image) +-- | Stolen from writeEPUB. +default_stylesheet :: IO String +default_stylesheet = + -- This comes with Pandoc, I guess. + readDataFile (writerUserDataDir defaultWriterOptions) "epub.css" + + +construct_stylesheet :: IO String +construct_stylesheet = do + defaults <- default_stylesheet + -- Allow word-wrapping in
 elements.
+  let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
+  return $ defaults ++ my_additions
+
+page_from_url :: Cfg -> URL -> IO (Maybe Page)
+page_from_url cfg url = do
+  contents <- get_article_contents cfg url  
+  case (xml_from_contents contents) of
+    Just html -> parse cfg html
+    Nothing -> return Nothing
+
+
+
+insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
+insert_full_stories story_map =
+  processTopDown (article_xml `when` full_story_paragraph)
+  where
+    lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
+    lookup_func href =
+      case Map.lookup href story_map of
+        -- Drop the paragraph if we don't have the contents.
+        Nothing -> none
+        Just v -> to_xml v
+
+    article_xml :: (ArrowXml a) => a XmlTree XmlTree
+    article_xml =
+      lookup_func
+      $< -- From HXT's Control.Arrow.ArrowList
+      (this /> full_story_link >>> getAttrValue "href")
+                   
+replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
+replace_remote_img_srcs image_map =
+  processTopDown (make_srcs_local `when` is_image)
+  where
+    -- old_src -> new_src
+    change_src_func :: String -> String
+    change_src_func old_src =
+      case Map.lookup old_src image_map of
+        -- If we don't have the file, empty the src. Pandoc will crash
+        -- otherwise.
+        Nothing -> ""
+        Just v -> v
+
+    change_src :: (ArrowXml a) => a XmlTree XmlTree
+    change_src =
+      changeAttrValue change_src_func
+
+    make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
+    make_srcs_local =
+      processAttrl $ (change_src `when` (hasName "src"))
+
+
+
+
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
+download_images xml = do
+  image_urls <- runX $ xml >>> image_srcs
+  download_image_urls image_urls
+
+
+
+type StoryMap = Map.Map URL Article
 
+-- These come *before* preprocessing.
+download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
+download_full_story_urls cfg story_urls = do
+  pages <- parallel $ map (page_from_url cfg) story_urls
+  let pairs = zip story_urls pages
+  return $ foldl my_insert empty_map pairs
+  where
+    empty_map = Map.empty :: StoryMap
 
--- | Preprocessing common to both page types.
-preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
-preprocess xml =
-  xml >>>remove_images
+    my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
+    my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
+    my_insert dict (_, _)  = dict
 
 
-parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
-parse xml = do
-  let clean_xml = preprocess xml
-  appr <- ap_parse clean_xml
-  fppr <- fp_parse clean_xml
+download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
+download_full_stories cfg xml = do
+  story_urls <- runX $ xml >>> full_story_urls
+  download_full_story_urls cfg story_urls
+  
+
+parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
+parse cfg xml = do
+  fs_xml <- if (full_stories cfg) then do
+             story_map <- download_full_stories cfg xml
+             return $ xml >>> insert_full_stories story_map
+           else do
+             -- Get rid of them if we don't want them.
+             return $ xml >>> remove_full_story_paragraphs
+
+  let clean_xml = fs_xml >>> preprocess
+  image_map <- download_images clean_xml
+  let local_xml = clean_xml >>> replace_remote_img_srcs image_map
+
+  appr <- ap_parse local_xml
+  fppr <- fp_parse local_xml
   return $
     if (isNothing appr) then
       fppr
     else
       appr
 
---
--- ArticlePage Stuff
---
-ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
-ap_parse xml = do
-    arts <- ap_parse_articles xml
-    case arts of
-      Just [x] -> return $ Just $ ArticlePage x
-      _   -> return Nothing
 
 
-ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-ap_parse_headline xml = do
+parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
+parse_headline xml = do
   let element_filter = xml >>> css "div.PageHeadline h1"
   let element_text_filter = element_filter /> getText
   element_text <- runX element_text_filter
-  return $ case element_text of
-            [x] -> Just $ strip x
-            []  -> Nothing
-            _   -> error "Found more than one headline."
+  return $
+    case element_text of
+      [x] -> Just $ strip x
+      []  -> Nothing
+      _   -> error "Found more than one headline."
+
 
-ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-ap_parse_byline xml = do
-  let element_filter = xml >>> css "div.Byline"
+parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
+parse_byline xml = do
+  let element_filter = xml >>> css "div.FeatureByLine"
   let element_text_filter = element_filter /> getText
   element_text <- runX element_text_filter
-  return $ case element_text of
-            [x] -> Just $ strip x
-            []  -> Nothing
-            _   -> error "Found more than one byline."
+  return $
+    case element_text of
+      [x] -> Just $ strip x
+      []  -> Nothing
+      _   -> error "Found more than one article byline."
+
+
+--
+-- ArticlePage Stuff
+--
+ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
+ap_parse xml = do
+    arts <- ap_parse_articles xml
+    case arts of
+      [x] -> return $ Just $ ArticlePage x
+      _   -> return Nothing
 
 
-ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 ap_parse_body xml = do
   let element_filter = xml >>> css "div.ArticleText"
   let element_html_filter = xshow element_filter
@@ -143,16 +277,20 @@ ap_parse_body xml = do
             _   -> error "Found more than one article."
 
 
-ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
+ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 ap_parse_articles xml = do
-  parsed_headline <- ap_parse_headline xml
-  parsed_byline   <- ap_parse_byline xml
+  parsed_headline <- parse_headline xml
+  parsed_byline   <- parse_byline xml
   parsed_body     <- ap_parse_body xml
-  let title'   = Title (fromJust parsed_headline)
-  let byline' = Byline  parsed_byline
-  let body'   = BodyHtml (fromJust parsed_body)
-  return $ Just $ [Article title' byline' body']
 
+  if (isNothing parsed_headline) || (isNothing parsed_body)
+  then return []
+  else do
+    let title'  = Title    $ fromJust parsed_headline
+    let byline' = Byline     parsed_byline
+    let body'   = BodyHtml $ fromJust parsed_body
+
+    return $ [Article title' byline' body']
 
 
 
@@ -160,39 +298,17 @@ ap_parse_articles xml = do
 -- FullPage Stuff
 --
 
-fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 fp_parse xml = do
-    hl <- fp_parse_headline xml
+    hl <- parse_headline xml
     parsed_articles <- fp_parse_articles xml
-    case parsed_articles of
-      them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
-      _          -> return Nothing
-
+    return $ case parsed_articles of
+      [] -> Nothing
+      x  -> Just $ FullPage (fromJust hl) x
 
 
 
-fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-fp_parse_headline xml = do
-  let element_filter = xml >>> css "div.PageHeadline h1"
-  let element_text_filter = element_filter /> getText
-  element_text <- runX element_text_filter
-  return $ case element_text of
-            [x] -> Just $ strip x
-            []  -> Nothing
-            _   -> error "Found more than one headline."
-
-fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-fp_parse_article_byline xml = do
-  let element_filter = xml >>> css "div.FeatureByLine"
-  let element_text_filter = element_filter /> getText
-  element_text <- runX element_text_filter
-  return $ case element_text of
-            [x] -> Just $ strip x
-            []  -> Nothing
-            _   -> error "Found more than one article byline."
-
-
-fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 fp_parse_article_title xml = do
   let element_filter = xml >>> css "h2.SummaryHL"
   let element_text_filter = element_filter //> getText
@@ -204,37 +320,8 @@ fp_parse_article_title xml = do
 
 
 
-is_title :: (ArrowXml a) => a XmlTree XmlTree
-is_title =
-  (hasName "h2")
-  >>>
-  (hasAttrValue "class" (== "SummaryHL"))
-
-
-is_byline :: (ArrowXml a) => a XmlTree XmlTree
-is_byline =
-  (hasName "div")
-  >>>
-  (hasAttrValue "class" (== "FeatureByLine"))
-
-
-is_image :: (ArrowXml a) => a XmlTree XmlTree
-is_image =
-  hasName "img"
-
-
-remove_title :: (ArrowXml a) => a XmlTree XmlTree
-remove_title =
-  processTopDown ((none) `when` is_title)
-
-
-remove_byline :: (ArrowXml a) => a XmlTree XmlTree
-remove_byline =
-  processTopDown ((none) `when` is_byline)
-
 
-
-fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 fp_parse_article_body xml = do
   -- First, delete the article title and byline.
   let clean_xml' = xml >>> remove_title >>> remove_byline
@@ -247,44 +334,52 @@ fp_parse_article_body xml = do
             []  -> Nothing
             _   -> error "Found more than one article body."
 
-fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
+fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 fp_parse_article xml = do
   parsed_article_title    <- fp_parse_article_title xml
-  parsed_article_byline   <- fp_parse_article_byline xml
+  parsed_article_byline   <- parse_byline xml
   parsed_article_body     <- fp_parse_article_body xml
-  let title'   = Title    $ fromJust parsed_article_title
-  let byline'  = Byline     parsed_article_byline
-  let body'    = BodyHtml $ fromJust parsed_article_body
-  return $ Just $ Article title' byline' body'
+
+  if (isNothing parsed_article_title) || (isNothing parsed_article_body)
+  then
+    return Nothing
+  else do
+    let title'   = Title    $ fromJust parsed_article_title
+    let byline'  = Byline     parsed_article_byline
+    let body'    = BodyHtml $ fromJust parsed_article_body
+    return $ Just $ Article title' byline' body'
 
 parse_html_article :: String -> IO (Maybe Article)
 parse_html_article html = do
   let xml = parseHtml $ wrap_in_body_div html
   fp_parse_article xml
 
-  
+
 -- | In the full page, all of the article titles and bodies are
---   wrapped in a div.ArticleText.
-parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
+--   wrapped in one big div.ArticleText.
+parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 parse_bodies xml =
   xml >>> css "div.ArticleText"
 
 
-fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
+fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 fp_parse_articles xml = do
   bodies <- runX . xshow $ parse_bodies xml
   let article_separator = "

" let split_articles'' = split article_separator (concat bodies) -- The first element will contain the crap before the first . - let split_articles' = tail split_articles'' + let split_articles' = case split_articles'' of + (_:_) -> tail split_articles'' + [] -> [] -- Put the separator back, it was lost during the split. let split_articles = map (article_separator ++) split_articles' - --_ <- mapM print_article split_articles real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles +-- | This makes it easy to select otherwise-random chunks of html +-- using 'css'. wrap_in_body_div :: String -> String wrap_in_body_div s = "
" ++ s ++ "
" @@ -318,12 +413,120 @@ epublish obj handle = do epmd <- metadata obj epub <- xhtml_to_epub epmd xhtml B.hPut handle epub - + hFlush handle + hClose handle xhtml_to_epub :: String -> String -> IO B.ByteString -xhtml_to_epub epmd = - write_epub . read_html - where - my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd } - write_epub = writeEPUB Nothing [] my_writer_options - read_html = readHtml defaultParserState +xhtml_to_epub epmd xhtml = do + stylesheet <- construct_stylesheet + writeEPUB + (Just stylesheet) + [] + my_writer_options + (read_html xhtml) + where + my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd } + read_html = readHtml defaultParserState + + +-- +-- Tests +-- + +test_preprocess_links :: Assertion +test_preprocess_links = do + actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" + let actual_xml = actual_xml' !! 0 + + expected_xml' <- runX $ expected_xml'' >>> css "body" + let expected_xml = expected_xml' !! 0 + + assertEqual + "Links are replaced with spans" + expected_xml + actual_xml + where + input_html = "Hello, world!" + input_xml = parseHtml input_html + expected_html = "Hello, world!" + expected_xml'' = parseHtml expected_html + + +test_absolve_images :: Assertion +test_absolve_images = do + actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" + let actual_xml = actual_xml' !! 0 + + expected_xml' <- runX $ expected_xml'' >>> css "body" + let expected_xml = expected_xml' !! 0 + + assertEqual + "Image srcs are made absolute" + expected_xml + actual_xml + where + input_html = + "" ++ + "" ++ + "" + input_xml = parseHtml input_html + expected_html = + "" ++ + "" ++ + "" + expected_xml'' = parseHtml expected_html + + +test_comments_removed :: Assertion +test_comments_removed = do + actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" + let actual_xml = actual_xml' !! 0 + + expected_xml' <- runX $ expected_xml'' >>> css "body" + let expected_xml = expected_xml' !! 0 + + assertEqual + "Comment links are removed" + expected_xml + actual_xml + where + input_html = + "

" ++ + "Comments (6 posted)" ++ + "

" + input_xml = parseHtml input_html + + expected_html = "

" + expected_xml'' = parseHtml expected_html + + +test_full_story_urls_parsed :: Assertion +test_full_story_urls_parsed = do + actual <- runX $ actual' + + assertEqual + "Full Story URLs are parsed" + expected + actual + where + expected = ["/Articles/500738/", "/Articles/501837/"] + + full_story_html = + concat ["

", + "Full Story ", + "(comments: 49)", + "

", + "Full Story ", + "(comments: none)", + "

"] + + xml = parseHtml full_story_html + actual' = xml >>> full_story_urls + +page_tests :: Test +page_tests = + testGroup "Page Tests" [ + testCase "Links are replaced with spans" test_preprocess_links, + testCase "Image srcs are made absolute" test_absolve_images, + testCase "Comment links are removed" test_comments_removed, + testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]