X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=d4aeb3006cab8c331c76219a723d1a20baf6f623;hp=24997154eeb84951188d804b6336647c8477a61f;hb=1d9b1f888c9dcb7c82ce35682b4e9f0210f93f0a;hpb=d79424c546d96dcd3955fdc6cb43eb529566be1e diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 2499715..d4aeb30 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -1,41 +1,70 @@ +{-# LANGUAGE DoAndIfThenElse #-} + module LWN.Page where -import Text.Pandoc +import Control.Concurrent.ParallelIO (parallel) +import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) -import System.IO (Handle) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) +import qualified Data.Map as Map (Map, empty, insert) import Data.Maybe (catMaybes, fromJust, isNothing) -import Data.Tree.NTree.TypeDefs (NTree) +import Prelude hiding (readFile) +import System.IO (Handle, hClose, hFlush) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) +import Text.Pandoc ( + defaultParserState, + defaultWriterOptions, + readHtml, + writeEPUB, + writerEPUBMetadata) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, XmlTree, - XNode, + ($<), (>>>), (/>), (//>), + changeAttrValue, + getAttrValue, getChildren, getText, - hasAttrValue, hasName, - mkName, - none, processAttrl, processTopDown, + this, runX, - setElemName, xshow, - when - ) + when) import Text.HandsomeSoup (css, parseHtml) +import Configuration (Cfg, full_stories) import LWN.Article -import XHTML +import LWN.HTTP ( + ImageMap, + download_image_urls, + get_article_contents) +import LWN.URI (URL) +import LWN.XHTML ( + XHTML, + full_story_urls, + image_srcs, + full_story_link, + full_story_paragraph, + is_image, + preprocess, + remove_byline, + remove_title, + to_xhtml, + to_xml, + xml_from_contents) + + + data Page = -- | An LWN page with one article on it. @@ -59,9 +88,7 @@ instance XHTML Page where " " ++ (show $ LWN.Article.title a) ++ "" ++ "" ++ "" ++ - "
" ++ (to_xhtml a) ++ - "
" ++ "" ++ "" @@ -84,72 +111,143 @@ instance XHTML Page where -remove_images :: (ArrowXml a) => a XmlTree XmlTree -remove_images = - processTopDown ((none) `when` is_image) +page_from_url :: Cfg -> URL -> IO (Maybe Page) +page_from_url cfg url = do + contents <- get_article_contents cfg url + case (xml_from_contents contents) of + Just html -> parse cfg html + Nothing -> return Nothing -is_link :: (ArrowXml a) => a XmlTree XmlTree -is_link = - hasName "a" -replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree -replace_links_with_spans = - processTopDown $ (make_span >>> remove_attrs) `when` is_link +insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree +insert_full_stories story_map = + processTopDown (article_xml `when` full_story_paragraph) where - make_span = setElemName $ mkName "span" - remove_attrs = processAttrl none + lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree + lookup_func href = + case Map.lookup href story_map of + -- Leave it alone if we don't have the full story. + Nothing -> this + Just v -> to_xml v + + article_xml :: (ArrowXml a) => a XmlTree XmlTree + article_xml = + lookup_func + $< + (this /> full_story_link >>> getAttrValue "href") + +replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree +replace_remote_img_srcs image_map = + processTopDown (make_srcs_local `when` is_image) + where + -- old_src -> new_src + change_src_func :: String -> String + change_src_func old_src = + case Map.lookup old_src image_map of + -- Leave it alone if we don't have the file locally + Nothing -> old_src + Just v -> v + + change_src :: (ArrowXml a) => a XmlTree XmlTree + change_src = + changeAttrValue change_src_func --- | Preprocessing common to both page types. -preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) -preprocess xml = - xml >>> remove_images >>> replace_links_with_spans + make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree + make_srcs_local = + processAttrl $ (change_src `when` (hasName "src")) -parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page) -parse xml = do - let clean_xml = preprocess xml - appr <- ap_parse clean_xml - fppr <- fp_parse clean_xml + + +-- Should be called *after* preprocessing. +download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap +download_images xml = do + image_urls <- runX $ xml >>> image_srcs + download_image_urls image_urls + + + +type StoryMap = Map.Map URL Article + +-- These come *before* preprocessing. +download_full_story_urls :: Cfg -> [URL] -> IO StoryMap +download_full_story_urls cfg story_urls = do + pages <- parallel $ map (page_from_url cfg) story_urls + let pairs = zip story_urls pages + return $ foldl my_insert empty_map pairs + where + empty_map = Map.empty :: StoryMap + + my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap + my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict + my_insert dict (_, _) = dict + + +download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap +download_full_stories cfg xml = do + story_urls <- runX $ xml >>> full_story_urls + download_full_story_urls cfg story_urls + + +parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page) +parse cfg xml = do + story_map <- download_full_stories cfg xml + let fs_xml = if (full_stories cfg) then + xml >>> insert_full_stories story_map + else + xml + + let clean_xml = fs_xml >>> preprocess + image_map <- download_images clean_xml + let local_xml = clean_xml >>> replace_remote_img_srcs image_map + + appr <- ap_parse local_xml + fppr <- fp_parse local_xml return $ if (isNothing appr) then fppr else appr --- --- ArticlePage Stuff --- -ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page) -ap_parse xml = do - arts <- ap_parse_articles xml - case arts of - Just [x] -> return $ Just $ ArticlePage x - _ -> return Nothing -ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -ap_parse_headline xml = do +parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String) +parse_headline xml = do let element_filter = xml >>> css "div.PageHeadline h1" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one headline." + return $ + case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one headline." -ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -ap_parse_byline xml = do - let element_filter = xml >>> css "div.Byline" + +parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String) +parse_byline xml = do + let element_filter = xml >>> css "div.FeatureByLine" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one byline." + return $ + case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one article byline." -ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +-- +-- ArticlePage Stuff +-- +ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) +ap_parse xml = do + arts <- ap_parse_articles xml + case arts of + [x] -> return $ Just $ ArticlePage x + _ -> return Nothing + + +ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String) ap_parse_body xml = do let element_filter = xml >>> css "div.ArticleText" let element_html_filter = xshow element_filter @@ -160,16 +258,20 @@ ap_parse_body xml = do _ -> error "Found more than one article." -ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article]) +ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article] ap_parse_articles xml = do - parsed_headline <- ap_parse_headline xml - parsed_byline <- ap_parse_byline xml + parsed_headline <- parse_headline xml + parsed_byline <- parse_byline xml parsed_body <- ap_parse_body xml - let title' = Title (fromJust parsed_headline) - let byline' = Byline parsed_byline - let body' = BodyHtml (fromJust parsed_body) - return $ Just $ [Article title' byline' body'] + if (isNothing parsed_headline) || (isNothing parsed_body) + then return [] + else do + let title' = Title $ fromJust parsed_headline + let byline' = Byline parsed_byline + let body' = BodyHtml $ fromJust parsed_body + + return $ [Article title' byline' body'] @@ -177,39 +279,17 @@ ap_parse_articles xml = do -- FullPage Stuff -- -fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page) +fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) fp_parse xml = do - hl <- fp_parse_headline xml + hl <- parse_headline xml parsed_articles <- fp_parse_articles xml case parsed_articles of - them@(_:_) -> return $ Just $ FullPage (fromJust hl) them - _ -> return Nothing - - + [] -> return Nothing + x -> return $ Just $ FullPage (fromJust hl) x -fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -fp_parse_headline xml = do - let element_filter = xml >>> css "div.PageHeadline h1" - let element_text_filter = element_filter /> getText - element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one headline." -fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -fp_parse_article_byline xml = do - let element_filter = xml >>> css "div.FeatureByLine" - let element_text_filter = element_filter /> getText - element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one article byline." - - -fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String) fp_parse_article_title xml = do let element_filter = xml >>> css "h2.SummaryHL" let element_text_filter = element_filter //> getText @@ -221,36 +301,8 @@ fp_parse_article_title xml = do -is_title :: (ArrowXml a) => a XmlTree XmlTree -is_title = - (hasName "h2") - >>> - (hasAttrValue "class" (== "SummaryHL")) - - -is_byline :: (ArrowXml a) => a XmlTree XmlTree -is_byline = - (hasName "div") - >>> - (hasAttrValue "class" (== "FeatureByLine")) - - -is_image :: (ArrowXml a) => a XmlTree XmlTree -is_image = - hasName "img" - -remove_title :: (ArrowXml a) => a XmlTree XmlTree -remove_title = - processTopDown ((none) `when` is_title) - - -remove_byline :: (ArrowXml a) => a XmlTree XmlTree -remove_byline = - processTopDown ((none) `when` is_byline) - - -fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String) fp_parse_article_body xml = do -- First, delete the article title and byline. let clean_xml' = xml >>> remove_title >>> remove_byline @@ -263,44 +315,52 @@ fp_parse_article_body xml = do [] -> Nothing _ -> error "Found more than one article body." -fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article) +fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article) fp_parse_article xml = do parsed_article_title <- fp_parse_article_title xml - parsed_article_byline <- fp_parse_article_byline xml + parsed_article_byline <- parse_byline xml parsed_article_body <- fp_parse_article_body xml - let title' = Title $ fromJust parsed_article_title - let byline' = Byline parsed_article_byline - let body' = BodyHtml $ fromJust parsed_article_body - return $ Just $ Article title' byline' body' + + if (isNothing parsed_article_title) || (isNothing parsed_article_body) + then + return Nothing + else do + let title' = Title $ fromJust parsed_article_title + let byline' = Byline parsed_article_byline + let body' = BodyHtml $ fromJust parsed_article_body + return $ Just $ Article title' byline' body' parse_html_article :: String -> IO (Maybe Article) parse_html_article html = do let xml = parseHtml $ wrap_in_body_div html fp_parse_article xml - + -- | In the full page, all of the article titles and bodies are --- wrapped in a div.ArticleText. -parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) +-- wrapped in one big div.ArticleText. +parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree parse_bodies xml = xml >>> css "div.ArticleText" -fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article] +fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article] fp_parse_articles xml = do bodies <- runX . xshow $ parse_bodies xml let article_separator = "

" let split_articles'' = split article_separator (concat bodies) -- The first element will contain the crap before the first . - let split_articles' = tail split_articles'' + let split_articles' = case split_articles'' of + (_:_) -> tail split_articles'' + [] -> [] -- Put the separator back, it was lost during the split. let split_articles = map (article_separator ++) split_articles' - --_ <- mapM print_article split_articles real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles +-- | This makes it easy to select otherwise-random chunks of html +-- using 'css'. wrap_in_body_div :: String -> String wrap_in_body_div s = "
" ++ s ++ "
" @@ -334,7 +394,8 @@ epublish obj handle = do epmd <- metadata obj epub <- xhtml_to_epub epmd xhtml B.hPut handle epub - + hFlush handle + hClose handle xhtml_to_epub :: String -> String -> IO B.ByteString xhtml_to_epub epmd = @@ -345,11 +406,13 @@ xhtml_to_epub epmd = read_html = readHtml defaultParserState - +-- +-- Tests +-- test_preprocess_links :: Assertion test_preprocess_links = do - actual_xml' <- runX $ (preprocess input_xml) >>> css "body" + actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" @@ -365,7 +428,82 @@ test_preprocess_links = do expected_html = "Hello, world!" expected_xml'' = parseHtml expected_html + +test_absolve_images :: Assertion +test_absolve_images = do + actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" + let actual_xml = actual_xml' !! 0 + + expected_xml' <- runX $ expected_xml'' >>> css "body" + let expected_xml = expected_xml' !! 0 + + assertEqual + "Image srcs are made absolute" + expected_xml + actual_xml + where + input_html = + "" ++ + "" ++ + "" + input_xml = parseHtml input_html + expected_html = + "" ++ + "" ++ + "" + expected_xml'' = parseHtml expected_html + + +test_comments_removed :: Assertion +test_comments_removed = do + actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" + let actual_xml = actual_xml' !! 0 + + expected_xml' <- runX $ expected_xml'' >>> css "body" + let expected_xml = expected_xml' !! 0 + + assertEqual + "Comment links are removed" + expected_xml + actual_xml + where + input_html = + "

" ++ + "Comments (6 posted)" ++ + "

" + input_xml = parseHtml input_html + + expected_html = "

" + expected_xml'' = parseHtml expected_html + + +test_full_story_urls_parsed :: Assertion +test_full_story_urls_parsed = do + actual <- runX $ actual' + + assertEqual + "Full Story URLs are parsed" + expected + actual + where + expected = ["/Articles/500738/", "/Articles/501837/"] + + full_story_html = + concat ["

", + "Full Story ", + "(comments: 49)", + "

", + "Full Story ", + "(comments: none)", + "

"] + + xml = parseHtml full_story_html + actual' = xml >>> full_story_urls + page_tests :: Test page_tests = testGroup "Page Tests" [ - testCase "Links are replaced with spans" test_preprocess_links ] + testCase "Links are replaced with spans" test_preprocess_links, + testCase "Image srcs are made absolute" test_absolve_images, + testCase "Comment links are removed" test_comments_removed, + testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]