" ++ (show $ LWN.Article.title a) ++ "

X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=3d53284c4b66e9c67ff222d19982ce4e2ced45d2;hp=6a097cbf7714d8929a37708ddd92a29c8982bf21;hb=6a7cfdf0880ee5c5367e794babb30fa7eac22f39;hpb=f3321e2ce7d7645ad562dc8f6620bfd561edc75d diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 6a097cb..3d53284 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -3,12 +3,15 @@ module LWN.Page where +import Control.Concurrent.ParallelIO (parallel) import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) -import System.IO (Handle, hClose, hFlush) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) +import qualified Data.Map as Map (Map, empty, insert) import Data.Maybe (catMaybes, fromJust, isNothing) +import Prelude hiding (readFile) +import System.IO (Handle, hClose, hFlush) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) @@ -17,11 +20,14 @@ import Text.Pandoc ( defaultWriterOptions, readHtml, writeEPUB, - writerEPUBMetadata) + writerEPUBMetadata, + writerUserDataDir) +import Text.Pandoc.Shared ( readDataFile ) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, XmlTree, + ($<), (>>>), (/>), (//>), @@ -29,30 +35,39 @@ import Text.XML.HXT.Core ( getAttrValue, getChildren, getText, - hasAttrValue, hasName, - isElem, - mkName, none, processAttrl, processTopDown, + this, runX, - setElemName, xshow, when) import Text.HandsomeSoup (css, parseHtml) +import Configuration (Cfg, full_stories) import LWN.Article -import LWN.HTTP (ImageMap, download_image_urls) -import LWN.URI (URL, try_make_absolute_url) -import LWN.XHTML (XHTML, to_xhtml) -import Misc (contains) +import LWN.HTTP ( + ImageMap, + download_image_urls, + get_article_contents) +import LWN.URI (URL) +import LWN.XHTML ( + XHTML, + full_story_urls, + image_srcs, + full_story_link, + full_story_paragraph, + is_image, + preprocess, + remove_byline, + remove_full_story_paragraphs, + remove_title, + to_xhtml, + to_xml, + xml_from_contents) + --- Should be called *after* preprocessing. -download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap -download_images xml = do - image_urls <- runX $ xml >>> image_srcs - download_image_urls image_urls data Page = @@ -77,9 +92,7 @@ instance XHTML Page where " " ++ (show $ LWN.Article.title a) ++ "" ++ "" ++ "" ++ - "

" ++ (to_xhtml a) ++ - "

" ++ "" ++ "" @@ -102,39 +115,46 @@ instance XHTML Page where -is_link :: (ArrowXml a) => a XmlTree XmlTree -is_link = - isElem >>> hasName "a" - - -remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree -remove_comment_links = - processTopDown $ kill_comments `when` is_link - where - is_comment_link = - hasAttrValue "href" (contains "#Comments") +-- | Stolen from writeEPUB. +default_stylesheet :: IO String +default_stylesheet = + -- This comes with Pandoc, I guess. + readDataFile (writerUserDataDir defaultWriterOptions) "epub.css" - kill_comments = - none `when` is_comment_link -replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree -replace_links_with_spans = - processTopDown $ (make_span >>> remove_attrs) `when` is_link - where - make_span = setElemName $ mkName "span" - remove_attrs = processAttrl none +construct_stylesheet :: IO String +construct_stylesheet = do + defaults <- default_stylesheet + -- Allow word-wrapping in

 elements.
+  let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
+  return $ defaults ++ my_additions
 
+page_from_url :: Cfg -> URL -> IO (Maybe Page)
+page_from_url cfg url = do
+  contents <- get_article_contents cfg url  
+  case (xml_from_contents contents) of
+    Just html -> parse cfg html
+    Nothing -> return Nothing
 
--- | Preprocessing common to both page types.
-preprocess :: (ArrowXml a) => a XmlTree XmlTree
-preprocess =
-  make_image_srcs_absolute
-  >>>
-  remove_comment_links
-  >>>
-  replace_links_with_spans
 
 
+insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
+insert_full_stories story_map =
+  processTopDown (article_xml `when` full_story_paragraph)
+  where
+    lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
+    lookup_func href =
+      case Map.lookup href story_map of
+        -- Drop the paragraph if we don't have the contents.
+        Nothing -> none
+        Just v -> to_xml v
+
+    article_xml :: (ArrowXml a) => a XmlTree XmlTree
+    article_xml =
+      lookup_func
+      $< -- From HXT's Control.Arrow.ArrowList
+      (this /> full_story_link >>> getAttrValue "href")
+                   
 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 replace_remote_img_srcs image_map =
   processTopDown (make_srcs_local `when` is_image)
@@ -156,17 +176,57 @@ replace_remote_img_srcs image_map =
       processAttrl $ (change_src `when` (hasName "src"))
 
 
-parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
-parse xml = do
-  let clean_xml = xml >>> preprocess
+
+
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
+download_images xml = do
+  image_urls <- runX $ xml >>> image_srcs
+  download_image_urls image_urls
+
+
+
+type StoryMap = Map.Map URL Article
+
+-- These come *before* preprocessing.
+download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
+download_full_story_urls cfg story_urls = do
+  pages <- parallel $ map (page_from_url cfg) story_urls
+  let pairs = zip story_urls pages
+  return $ foldl my_insert empty_map pairs
+  where
+    empty_map = Map.empty :: StoryMap
+
+    my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
+    my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
+    my_insert dict (_, _)  = dict
+
+
+download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
+download_full_stories cfg xml = do
+  story_urls <- runX $ xml >>> full_story_urls
+  download_full_story_urls cfg story_urls
+  
+
+parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
+parse cfg xml = do
+  fs_xml <- if (full_stories cfg) then do
+             story_map <- download_full_stories cfg xml
+             return $ xml >>> insert_full_stories story_map
+           else do
+             -- Get rid of them if we don't want them.
+             return $ xml >>> remove_full_story_paragraphs
+
+  let clean_xml = fs_xml >>> preprocess
   image_map <- download_images clean_xml
   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
+
   appr <- ap_parse local_xml
   fppr <- fp_parse local_xml
   return $
     if (isNothing appr) then
       fppr
-    else 
+    else
       appr
 
 
@@ -200,12 +260,12 @@ parse_byline xml = do
 --
 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 ap_parse xml = do
-    arts <- ap_parse_articles xml          
+    arts <- ap_parse_articles xml
     case arts of
       [x] -> return $ Just $ ArticlePage x
       _   -> return Nothing
 
-  
+
 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 ap_parse_body xml = do
   let element_filter = xml >>> css "div.ArticleText"
@@ -223,15 +283,13 @@ ap_parse_articles xml = do
   parsed_byline   <- parse_byline xml
   parsed_body     <- ap_parse_body xml
 
-  putStrLn $ fromJust parsed_headline
-
   if (isNothing parsed_headline) || (isNothing parsed_body)
   then return []
-  else do    
+  else do
     let title'  = Title    $ fromJust parsed_headline
     let byline' = Byline     parsed_byline
     let body'   = BodyHtml $ fromJust parsed_body
-    
+
     return $ [Article title' byline' body']
 
 
@@ -244,9 +302,9 @@ fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 fp_parse xml = do
     hl <- parse_headline xml
     parsed_articles <- fp_parse_articles xml
-    case parsed_articles of
-      []          -> return Nothing
-      x -> return $ Just $ FullPage (fromJust hl) x
+    return $ case parsed_articles of
+      [] -> Nothing
+      x  -> Just $ FullPage (fromJust hl) x
 
 
 
@@ -262,33 +320,6 @@ fp_parse_article_title xml = do
 
 
 
-is_title :: (ArrowXml a) => a XmlTree XmlTree
-is_title =
-  (hasName "h2")
-  >>>
-  (hasAttrValue "class" (== "SummaryHL"))
-
-
-is_byline :: (ArrowXml a) => a XmlTree XmlTree
-is_byline =
-  (hasName "div")
-  >>>
-  (hasAttrValue "class" (== "FeatureByLine"))
-
-
-is_image :: (ArrowXml a) => a XmlTree XmlTree
-is_image = isElem >>> hasName "img"
-
-remove_title :: (ArrowXml a) => a XmlTree XmlTree
-remove_title =
-  processTopDown ((none) `when` is_title)
-
-
-remove_byline :: (ArrowXml a) => a XmlTree XmlTree
-remove_byline =
-  processTopDown ((none) `when` is_byline)
-
-
 
 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 fp_parse_article_body xml = do
@@ -323,7 +354,7 @@ parse_html_article html = do
   let xml = parseHtml $ wrap_in_body_div html
   fp_parse_article xml
 
-  
+
 -- | In the full page, all of the article titles and bodies are
 --   wrapped in one big div.ArticleText.
 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
@@ -342,7 +373,6 @@ fp_parse_articles xml = do
                           []    -> []
   -- Put the separator back, it was lost during the split.
   let split_articles = map (article_separator ++) split_articles'
-  --_ <- mapM print_article split_articles
   real_articles <- mapM parse_html_article split_articles
   let just_articles = catMaybes real_articles
   return just_articles
@@ -387,36 +417,16 @@ epublish obj handle = do
   hClose handle
 
 xhtml_to_epub :: String -> String -> IO B.ByteString
-xhtml_to_epub epmd =
-   write_epub . read_html
-   where
-     my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
-     write_epub = writeEPUB Nothing [] my_writer_options
-     read_html  = readHtml defaultParserState
-
-
-
---
--- Misc
---
-
-image_srcs :: (ArrowXml a) => a XmlTree URL
-image_srcs =
-  css "img"
-  >>>
-  getAttrValue "src"
-
-make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
-make_image_srcs_absolute =
-  processTopDown (make_srcs_absolute `when` is_image)
+xhtml_to_epub epmd xhtml = do
+  stylesheet <- construct_stylesheet
+  writeEPUB
+    (Just stylesheet)
+    []
+    my_writer_options
+    (read_html xhtml)
   where
-    change_src :: (ArrowXml a) => a XmlTree XmlTree
-    change_src =
-      changeAttrValue try_make_absolute_url
-
-    make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
-    make_srcs_absolute = 
-      processAttrl $ change_src `when` hasName "src"
+    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
+    read_html  = readHtml defaultParserState
 
 
 --
@@ -490,10 +500,33 @@ test_comments_removed = do
     expected_xml'' = parseHtml expected_html
 
 
+test_full_story_urls_parsed :: Assertion
+test_full_story_urls_parsed = do
+  actual <- runX $ actual'
+
+  assertEqual
+    "Full Story URLs are parsed"
+    expected
+    actual
+  where
+    expected = ["/Articles/500738/", "/Articles/501837/"]
+
+    full_story_html =
+      concat ["",
+              "Full Story ",
+              "(comments: 49)",
+              "
",
+              "Full Story ",
+              "(comments: none)",
+              "
"]
+
+    xml = parseHtml full_story_html
+    actual' = xml >>> full_story_urls
 
 page_tests :: Test
 page_tests =
   testGroup "Page Tests" [
     testCase "Links are replaced with spans" test_preprocess_links,
     testCase "Image srcs are made absolute" test_absolve_images,
-    testCase "Comment links are removed" test_comments_removed ]
+    testCase "Comment links are removed" test_comments_removed,
+    testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]