" ++ (show $ LWN.Article.title a) ++ "

X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=3d53284c4b66e9c67ff222d19982ce4e2ced45d2;hp=7419402a3af304974dac87236826b4fcd8f545ae;hb=6a7cfdf0880ee5c5367e794babb30fa7eac22f39;hpb=ebedcdb6b1b8925dcfb5700d076f25743fac8645 diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 7419402..3d53284 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -3,22 +3,31 @@ module LWN.Page where -import qualified Data.Map as Map +import Control.Concurrent.ParallelIO (parallel) +import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) -import System.IO (Handle) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) +import qualified Data.Map as Map (Map, empty, insert) import Data.Maybe (catMaybes, fromJust, isNothing) -import Data.Tree.NTree.TypeDefs (NTree) +import Prelude hiding (readFile) +import System.IO (Handle, hClose, hFlush) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) -import Text.Pandoc +import Text.Pandoc ( + defaultParserState, + defaultWriterOptions, + readHtml, + writeEPUB, + writerEPUBMetadata, + writerUserDataDir) +import Text.Pandoc.Shared ( readDataFile ) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, XmlTree, - XNode, + ($<), (>>>), (/>), (//>), @@ -26,43 +35,39 @@ import Text.XML.HXT.Core ( getAttrValue, getChildren, getText, - hasAttrValue, hasName, - isElem, - mkName, none, processAttrl, processTopDown, + this, runX, - setElemName, xshow, - when - ) + when) import Text.HandsomeSoup (css, parseHtml) +import Configuration (Cfg, full_stories) import LWN.Article -import LWN.HTTP (save_image) -import LWN.URI (URL, try_make_absolute_url) -import Misc (contains) -import XHTML +import LWN.HTTP ( + ImageMap, + download_image_urls, + get_article_contents) +import LWN.URI (URL) +import LWN.XHTML ( + XHTML, + full_story_urls, + image_srcs, + full_story_link, + full_story_paragraph, + is_image, + preprocess, + remove_byline, + remove_full_story_paragraphs, + remove_title, + to_xhtml, + to_xml, + xml_from_contents) --- Map absolute image URLs to local system file paths where the image --- referenced by the URL is stored. -type ImageMap = Map.Map URL FilePath --- Should be called *after* preprocessing. -download_images :: IOSArrow XmlTree (NTree XNode) -> IO ImageMap -download_images xml = do - image_urls <- runX $ xml >>> image_srcs - files <- mapM save_image image_urls - let pairs = zip image_urls files - return $ foldl my_insert empty_map pairs - where - empty_map = Map.empty :: ImageMap - - my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap - my_insert dict (_, Nothing) = dict - my_insert dict (k, Just v) = Map.insert k v dict data Page = @@ -87,9 +92,7 @@ instance XHTML Page where " " ++ (show $ LWN.Article.title a) ++ "" ++ "" ++ "" ++ - "

" ++ (to_xhtml a) ++ - "

" ++ "" ++ "" @@ -112,39 +115,46 @@ instance XHTML Page where -is_link :: (ArrowXml a) => a XmlTree XmlTree -is_link = - isElem >>> hasName "a" +-- | Stolen from writeEPUB. +default_stylesheet :: IO String +default_stylesheet = + -- This comes with Pandoc, I guess. + readDataFile (writerUserDataDir defaultWriterOptions) "epub.css" -remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree -remove_comment_links = - processTopDown $ kill_comments `when` is_link - where - is_comment_link = - hasAttrValue "href" (contains "#Comments") +construct_stylesheet :: IO String +construct_stylesheet = do + defaults <- default_stylesheet + -- Allow word-wrapping in

 elements.
+  let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
+  return $ defaults ++ my_additions
 
-    kill_comments =
-      none `when` is_comment_link
-
-replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
-replace_links_with_spans =
-  processTopDown $ (make_span >>> remove_attrs) `when` is_link
-  where
-    make_span    = setElemName $ mkName "span"
-    remove_attrs = processAttrl none
+page_from_url :: Cfg -> URL -> IO (Maybe Page)
+page_from_url cfg url = do
+  contents <- get_article_contents cfg url  
+  case (xml_from_contents contents) of
+    Just html -> parse cfg html
+    Nothing -> return Nothing
 
 
--- | Preprocessing common to both page types.
-preprocess :: (ArrowXml a) => a XmlTree XmlTree
-preprocess =
-  make_image_srcs_absolute
-  >>>
-  remove_comment_links
-  >>>
-  replace_links_with_spans
-
 
+insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
+insert_full_stories story_map =
+  processTopDown (article_xml `when` full_story_paragraph)
+  where
+    lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
+    lookup_func href =
+      case Map.lookup href story_map of
+        -- Drop the paragraph if we don't have the contents.
+        Nothing -> none
+        Just v -> to_xml v
+
+    article_xml :: (ArrowXml a) => a XmlTree XmlTree
+    article_xml =
+      lookup_func
+      $< -- From HXT's Control.Arrow.ArrowList
+      (this /> full_story_link >>> getAttrValue "href")
+                   
 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 replace_remote_img_srcs image_map =
   processTopDown (make_srcs_local `when` is_image)
@@ -166,22 +176,62 @@ replace_remote_img_srcs image_map =
       processAttrl $ (change_src `when` (hasName "src"))
 
 
-parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
-parse xml = do
-  let clean_xml = xml >>> preprocess
+
+
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
+download_images xml = do
+  image_urls <- runX $ xml >>> image_srcs
+  download_image_urls image_urls
+
+
+
+type StoryMap = Map.Map URL Article
+
+-- These come *before* preprocessing.
+download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
+download_full_story_urls cfg story_urls = do
+  pages <- parallel $ map (page_from_url cfg) story_urls
+  let pairs = zip story_urls pages
+  return $ foldl my_insert empty_map pairs
+  where
+    empty_map = Map.empty :: StoryMap
+
+    my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
+    my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
+    my_insert dict (_, _)  = dict
+
+
+download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
+download_full_stories cfg xml = do
+  story_urls <- runX $ xml >>> full_story_urls
+  download_full_story_urls cfg story_urls
+  
+
+parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
+parse cfg xml = do
+  fs_xml <- if (full_stories cfg) then do
+             story_map <- download_full_stories cfg xml
+             return $ xml >>> insert_full_stories story_map
+           else do
+             -- Get rid of them if we don't want them.
+             return $ xml >>> remove_full_story_paragraphs
+
+  let clean_xml = fs_xml >>> preprocess
   image_map <- download_images clean_xml
   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
+
   appr <- ap_parse local_xml
   fppr <- fp_parse local_xml
   return $
     if (isNothing appr) then
       fppr
-    else 
+    else
       appr
 
 
 
-parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 parse_headline xml = do
   let element_filter = xml >>> css "div.PageHeadline h1"
   let element_text_filter = element_filter /> getText
@@ -193,7 +243,7 @@ parse_headline xml = do
       _   -> error "Found more than one headline."
 
 
-parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 parse_byline xml = do
   let element_filter = xml >>> css "div.FeatureByLine"
   let element_text_filter = element_filter /> getText
@@ -208,15 +258,15 @@ parse_byline xml = do
 --
 -- ArticlePage Stuff
 --
-ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 ap_parse xml = do
-    arts <- ap_parse_articles xml          
+    arts <- ap_parse_articles xml
     case arts of
       [x] -> return $ Just $ ArticlePage x
       _   -> return Nothing
 
-  
-ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+
+ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 ap_parse_body xml = do
   let element_filter = xml >>> css "div.ArticleText"
   let element_html_filter = xshow element_filter
@@ -227,21 +277,19 @@ ap_parse_body xml = do
             _   -> error "Found more than one article."
 
 
-ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
+ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 ap_parse_articles xml = do
   parsed_headline <- parse_headline xml
   parsed_byline   <- parse_byline xml
   parsed_body     <- ap_parse_body xml
 
-  putStrLn $ fromJust parsed_headline
-
   if (isNothing parsed_headline) || (isNothing parsed_body)
   then return []
-  else do    
+  else do
     let title'  = Title    $ fromJust parsed_headline
     let byline' = Byline     parsed_byline
     let body'   = BodyHtml $ fromJust parsed_body
-    
+
     return $ [Article title' byline' body']
 
 
@@ -250,17 +298,17 @@ ap_parse_articles xml = do
 -- FullPage Stuff
 --
 
-fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 fp_parse xml = do
     hl <- parse_headline xml
     parsed_articles <- fp_parse_articles xml
-    case parsed_articles of
-      []          -> return Nothing
-      x -> return $ Just $ FullPage (fromJust hl) x
+    return $ case parsed_articles of
+      [] -> Nothing
+      x  -> Just $ FullPage (fromJust hl) x
 
 
 
-fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 fp_parse_article_title xml = do
   let element_filter = xml >>> css "h2.SummaryHL"
   let element_text_filter = element_filter //> getText
@@ -272,35 +320,8 @@ fp_parse_article_title xml = do
 
 
 
-is_title :: (ArrowXml a) => a XmlTree XmlTree
-is_title =
-  (hasName "h2")
-  >>>
-  (hasAttrValue "class" (== "SummaryHL"))
-
-
-is_byline :: (ArrowXml a) => a XmlTree XmlTree
-is_byline =
-  (hasName "div")
-  >>>
-  (hasAttrValue "class" (== "FeatureByLine"))
-
-
-is_image :: (ArrowXml a) => a XmlTree XmlTree
-is_image = isElem >>> hasName "img"
 
-remove_title :: (ArrowXml a) => a XmlTree XmlTree
-remove_title =
-  processTopDown ((none) `when` is_title)
-
-
-remove_byline :: (ArrowXml a) => a XmlTree XmlTree
-remove_byline =
-  processTopDown ((none) `when` is_byline)
-
-
-
-fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 fp_parse_article_body xml = do
   -- First, delete the article title and byline.
   let clean_xml' = xml >>> remove_title >>> remove_byline
@@ -313,7 +334,7 @@ fp_parse_article_body xml = do
             []  -> Nothing
             _   -> error "Found more than one article body."
 
-fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
+fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 fp_parse_article xml = do
   parsed_article_title    <- fp_parse_article_title xml
   parsed_article_byline   <- parse_byline xml
@@ -333,15 +354,15 @@ parse_html_article html = do
   let xml = parseHtml $ wrap_in_body_div html
   fp_parse_article xml
 
-  
+
 -- | In the full page, all of the article titles and bodies are
 --   wrapped in one big div.ArticleText.
-parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
+parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 parse_bodies xml =
   xml >>> css "div.ArticleText"
 
 
-fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
+fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 fp_parse_articles xml = do
   bodies <- runX . xshow $ parse_bodies xml
   let article_separator = ""
@@ -352,7 +373,6 @@ fp_parse_articles xml = do
                           []    -> []
   -- Put the separator back, it was lost during the split.
   let split_articles = map (article_separator ++) split_articles'
-  --_ <- mapM print_article split_articles
   real_articles <- mapM parse_html_article split_articles
   let just_articles = catMaybes real_articles
   return just_articles
@@ -393,39 +413,20 @@ epublish obj handle = do
   epmd <- metadata obj
   epub <- xhtml_to_epub epmd xhtml
   B.hPut handle epub
-
+  hFlush handle
+  hClose handle
 
 xhtml_to_epub :: String -> String -> IO B.ByteString
-xhtml_to_epub epmd =
-   write_epub . read_html
-   where
-     my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
-     write_epub = writeEPUB Nothing [] my_writer_options
-     read_html  = readHtml defaultParserState
-
-
-
---
--- Misc
---
-
-image_srcs :: (ArrowXml a) => a XmlTree URL
-image_srcs =
-  css "img"
-  >>>
-  getAttrValue "src"
-
-make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
-make_image_srcs_absolute =
-  processTopDown (make_srcs_absolute `when` is_image)
+xhtml_to_epub epmd xhtml = do
+  stylesheet <- construct_stylesheet
+  writeEPUB
+    (Just stylesheet)
+    []
+    my_writer_options
+    (read_html xhtml)
   where
-    change_src :: (ArrowXml a) => a XmlTree XmlTree
-    change_src =
-      changeAttrValue try_make_absolute_url
-
-    make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
-    make_srcs_absolute = 
-      processAttrl $ change_src `when` hasName "src"
+    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
+    read_html  = readHtml defaultParserState
 
 
 --
@@ -499,10 +500,33 @@ test_comments_removed = do
     expected_xml'' = parseHtml expected_html
 
 
+test_full_story_urls_parsed :: Assertion
+test_full_story_urls_parsed = do
+  actual <- runX $ actual'
+
+  assertEqual
+    "Full Story URLs are parsed"
+    expected
+    actual
+  where
+    expected = ["/Articles/500738/", "/Articles/501837/"]
+
+    full_story_html =
+      concat ["",
+              "Full Story ",
+              "(comments: 49)",
+              "
",
+              "Full Story ",
+              "(comments: none)",
+              "
"]
+
+    xml = parseHtml full_story_html
+    actual' = xml >>> full_story_urls
 
 page_tests :: Test
 page_tests =
   testGroup "Page Tests" [
     testCase "Links are replaced with spans" test_preprocess_links,
     testCase "Image srcs are made absolute" test_absolve_images,
-    testCase "Comment links are removed" test_comments_removed ]
+    testCase "Comment links are removed" test_comments_removed,
+    testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]