X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=3d53284c4b66e9c67ff222d19982ce4e2ced45d2;hp=3705f3bcf62bd89759c9793410ee6a3633cc877e;hb=6a7cfdf0880ee5c5367e794babb30fa7eac22f39;hpb=10f322ce20600de109c4643967b6ce3f61f69bf6

diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs
index 3705f3b..3d53284 100644
--- a/src/LWN/Page.hs
+++ b/src/LWN/Page.hs
@@ -3,15 +3,15 @@
 module LWN.Page
 where
 
+import Control.Concurrent.ParallelIO (parallel)
 import qualified Data.Map as Map (lookup)
 import Data.Time (getCurrentTime)
 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
 import Data.String.Utils (split, strip)
+import qualified Data.Map as Map (Map, empty, insert)
 import Data.Maybe (catMaybes, fromJust, isNothing)
 import Prelude hiding (readFile)
-import System.Directory (doesFileExist)
-import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr)
-import System.IO.UTF8 (readFile)
+import System.IO (Handle, hClose, hFlush)
 import Test.HUnit (Assertion, assertEqual)
 import Test.Framework (Test, testGroup)
 import Test.Framework.Providers.HUnit (testCase)
@@ -20,12 +20,14 @@ import Text.Pandoc (
   defaultWriterOptions,
   readHtml,
   writeEPUB,
-  writerEPUBMetadata)
+  writerEPUBMetadata,
+  writerUserDataDir)
+import Text.Pandoc.Shared ( readDataFile )
 import Text.XML.HXT.Core (
   ArrowXml,
   IOSArrow,
-  IOStateArrow,
   XmlTree,
+  ($<),
   (>>>),
   (/>),
   (//>),
@@ -33,83 +35,40 @@ import Text.XML.HXT.Core (
   getAttrValue,
   getChildren,
   getText,
-  hasAttrValue,
   hasName,
-  isElem,
-  mkName,
   none,
   processAttrl,
   processTopDown,
+  this,
   runX,
-  setElemName,
   xshow,
   when)
 import Text.HandsomeSoup (css, parseHtml)
-import Configuration (Cfg, password, use_account, username)
+import Configuration (Cfg, full_stories)
 import LWN.Article
 import LWN.HTTP (
   ImageMap,
   download_image_urls,
-  get_page,
-  log_in,
-  make_cookie_jar)
-import LWN.URI (URL, try_make_absolute_url)
-import LWN.XHTML (XHTML, parse_lwn, to_xhtml)
-import Misc (contains)
-
-
--- | Try to parse the given article using HXT. We try a few different
--- methods; if none of them work, we return 'Nothing'.
-get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree))
-get_xml_from_article cfg article_name = do
-  my_article <- real_article_path article_name
-  is_file <- doesFileExist my_article
-  case is_file of
-    True -> do
-      contents <- readFile my_article
-      return $ Just $ parse_lwn contents
-    False -> do
-      -- Download the URL and try to parse it.
-      if use_account cfg then do
-        -- use_account would be false if these fromJusts would fail.
-        cj <- make_cookie_jar
-        li_result <- log_in cj
-                       (fromJust $ username cfg)
-                       (fromJust $ password cfg)
-
-        case li_result of
-          Left err -> do
-            let msg = "Failed to log in. " ++ err
-            hPutStrLn stderr msg
-          Right response_body -> do
-            hPutStrLn stderr response_body
-
-        html <- get_page (Just cj) my_article
-
-        case html of
-          Left err -> do
-            let msg = "Failed to retrieve page. " ++ err
-            hPutStrLn stderr msg
-            return Nothing
-          Right h -> return $ Just $ parse_lwn h
-      else do
-        html <- get_page Nothing my_article
-        case html of
-          Left err -> do
-            let msg = "Failed to retrieve page. " ++ err
-            hPutStrLn stderr msg
-            return Nothing
-          Right h -> return $ Just $ parse_lwn h
+  get_article_contents)
+import LWN.URI (URL)
+import LWN.XHTML (
+  XHTML,
+  full_story_urls,
+  image_srcs,
+  full_story_link,
+  full_story_paragraph,
+  is_image,
+  preprocess,
+  remove_byline,
+  remove_full_story_paragraphs,
+  remove_title,
+  to_xhtml,
+  to_xml,
+  xml_from_contents)
 
 
--- Should be called *after* preprocessing.
-download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
-download_images xml = do
-  image_urls <- runX $ xml >>> image_srcs
-  download_image_urls image_urls
-
 
 data Page =
   -- | An LWN page with one article on it.
@@ -133,9 +92,7 @@ instance XHTML Page where
     "  <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
     "</head>" ++
     "<body>" ++
-    "<div>" ++
     (to_xhtml a) ++
-    "</div>" ++
     "</body>" ++
     "</html>"
 
@@ -158,47 +115,46 @@ instance XHTML Page where
 
-page_from_url :: Cfg -> URL -> IO (Maybe Page)
-page_from_url cfg url = do
-  maybe_html <- get_xml_from_article cfg url
-  case maybe_html of
-    Just html -> parse html
-    Nothing -> return Nothing
+-- | Stolen from writeEPUB.
+default_stylesheet :: IO String
+default_stylesheet =
+  -- This comes with Pandoc, I guess.
+  readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
 
-is_link :: (ArrowXml a) => a XmlTree XmlTree
-is_link =
-  isElem >>> hasName "a"
+construct_stylesheet :: IO String
+construct_stylesheet = do
+  defaults <- default_stylesheet
+  -- Allow word-wrapping in <pre> elements.
+  let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
+  return $ defaults ++ my_additions
 
+page_from_url :: Cfg -> URL -> IO (Maybe Page)
+page_from_url cfg url = do
+  contents <- get_article_contents cfg url  
+  case (xml_from_contents contents) of
+    Just html -> parse cfg html
+    Nothing -> return Nothing
 
-remove_comment_links  :: (ArrowXml a) => a XmlTree XmlTree
-remove_comment_links =
-  processTopDown $ kill_comments `when` is_link
-  where
-    is_comment_link =
-      hasAttrValue "href" (contains "#Comments")
 
-    kill_comments =
-      none `when` is_comment_link
 
-replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
-replace_links_with_spans =
-  processTopDown $ (make_span >>> remove_attrs) `when` is_link
+insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
+insert_full_stories story_map =
+  processTopDown (article_xml `when` full_story_paragraph)
   where
-    make_span    = setElemName $ mkName "span"
-    remove_attrs = processAttrl none
-
-
--- | Preprocessing common to both page types.
-preprocess :: (ArrowXml a) => a XmlTree XmlTree
-preprocess =
-  make_image_srcs_absolute
-  >>>
-  remove_comment_links
-  >>>
-  replace_links_with_spans
-
-
+    lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
+    lookup_func href =
+      case Map.lookup href story_map of
+        -- Drop the paragraph if we don't have the contents.
+        Nothing -> none
+        Just v -> to_xml v
+
+    article_xml :: (ArrowXml a) => a XmlTree XmlTree
+    article_xml =
+      lookup_func
+      $< -- From HXT's Control.Arrow.ArrowList
+      (this /> full_story_link >>> getAttrValue "href")
+                   
 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
 replace_remote_img_srcs image_map =
   processTopDown (make_srcs_local `when` is_image)
@@ -220,11 +176,51 @@ replace_remote_img_srcs image_map =
       processAttrl $ (change_src `when` (hasName "src"))
 
 
-parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
-parse xml = do
-  let clean_xml = xml >>> preprocess
+
+
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
+download_images xml = do
+  image_urls <- runX $ xml >>> image_srcs
+  download_image_urls image_urls
+
+
+
+type StoryMap = Map.Map URL Article
+
+-- These come *before* preprocessing.
+download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
+download_full_story_urls cfg story_urls = do
+  pages <- parallel $ map (page_from_url cfg) story_urls
+  let pairs = zip story_urls pages
+  return $ foldl my_insert empty_map pairs
+  where
+    empty_map = Map.empty :: StoryMap
+
+    my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
+    my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
+    my_insert dict (_, _)  = dict
+
+
+download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
+download_full_stories cfg xml = do
+  story_urls <- runX $ xml >>> full_story_urls
+  download_full_story_urls cfg story_urls
+  
+
+parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
+parse cfg xml = do
+  fs_xml <- if (full_stories cfg) then do
+             story_map <- download_full_stories cfg xml
+             return $ xml >>> insert_full_stories story_map
+           else do
+             -- Get rid of them if we don't want them.
+             return $ xml >>> remove_full_story_paragraphs
+
+  let clean_xml = fs_xml >>> preprocess
   image_map <- download_images clean_xml
   let local_xml = clean_xml >>> replace_remote_img_srcs image_map
+
   appr <- ap_parse local_xml
   fppr <- fp_parse local_xml
   return $
@@ -287,8 +283,6 @@ ap_parse_articles xml = do
   parsed_byline   <- parse_byline xml
   parsed_body     <- ap_parse_body xml
 
-  putStrLn $ fromJust parsed_headline
-
   if (isNothing parsed_headline) || (isNothing parsed_body)
   then return []
   else do
@@ -308,9 +302,9 @@ fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 fp_parse xml = do
     hl <- parse_headline xml
     parsed_articles <- fp_parse_articles xml
-    case parsed_articles of
-      []          -> return Nothing
-      x -> return $ Just $ FullPage (fromJust hl) x
+    return $ case parsed_articles of
+      [] -> Nothing
+      x  -> Just $ FullPage (fromJust hl) x
 
 
 
@@ -326,33 +320,6 @@ fp_parse_article_title xml = do
 
 
 
-is_title :: (ArrowXml a) => a XmlTree XmlTree
-is_title =
-  (hasName "h2")
-  >>>
-  (hasAttrValue "class" (== "SummaryHL"))
-
-
-is_byline :: (ArrowXml a) => a XmlTree XmlTree
-is_byline =
-  (hasName "div")
-  >>>
-  (hasAttrValue "class" (== "FeatureByLine"))
-
-
-is_image :: (ArrowXml a) => a XmlTree XmlTree
-is_image = isElem >>> hasName "img"
-
-remove_title :: (ArrowXml a) => a XmlTree XmlTree
-remove_title =
-  processTopDown ((none) `when` is_title)
-
-
-remove_byline :: (ArrowXml a) => a XmlTree XmlTree
-remove_byline =
-  processTopDown ((none) `when` is_byline)
-
-
 
 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 fp_parse_article_body xml = do
@@ -406,7 +373,6 @@ fp_parse_articles xml = do
                           []    -> []
   -- Put the separator back, it was lost during the split.
   let split_articles = map (article_separator ++) split_articles'
-  --_ <- mapM print_article split_articles
   real_articles <- mapM parse_html_article split_articles
   let just_articles = catMaybes real_articles
   return just_articles
@@ -451,36 +417,16 @@ epublish obj handle = do
   hClose handle
 
 xhtml_to_epub :: String -> String -> IO B.ByteString
-xhtml_to_epub epmd =
-   write_epub . read_html
-   where
-     my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
-     write_epub = writeEPUB Nothing [] my_writer_options
-     read_html  = readHtml defaultParserState
-
-
-
---
--- Misc
---
-
-image_srcs :: (ArrowXml a) => a XmlTree URL
-image_srcs =
-  css "img"
-  >>>
-  getAttrValue "src"
-
-make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
-make_image_srcs_absolute =
-  processTopDown (make_srcs_absolute `when` is_image)
+xhtml_to_epub epmd xhtml = do
+  stylesheet <- construct_stylesheet
+  writeEPUB
+    (Just stylesheet)
+    []
+    my_writer_options
+    (read_html xhtml)
   where
-    change_src :: (ArrowXml a) => a XmlTree XmlTree
-    change_src =
-      changeAttrValue try_make_absolute_url
-
-    make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
-    make_srcs_absolute =
-      processAttrl $ change_src `when` hasName "src"
+    my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
+    read_html  = readHtml defaultParserState
 
 
 --
@@ -554,10 +500,33 @@ test_comments_removed = do
     expected_xml'' = parseHtml expected_html
 
 
+test_full_story_urls_parsed :: Assertion
+test_full_story_urls_parsed = do
+  actual <- runX $ actual'
+
+  assertEqual
+    "Full Story URLs are parsed"
+    expected
+    actual
+  where
+    expected = ["/Articles/500738/", "/Articles/501837/"]
+
+    full_story_html =
+      concat ["

", + "Full Story ", + "(comments: 49)", + "

", + "Full Story ", + "(comments: none)", + "

"] + + xml = parseHtml full_story_html + actual' = xml >>> full_story_urls page_tests :: Test page_tests = testGroup "Page Tests" [ testCase "Links are replaced with spans" test_preprocess_links, testCase "Image srcs are made absolute" test_absolve_images, - testCase "Comment links are removed" test_comments_removed ] + testCase "Comment links are removed" test_comments_removed, + testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]