{-# LANGUAGE DoAndIfThenElse #-} module LWN.Page where import Control.Concurrent.ParallelIO (parallel) import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) import qualified Data.Map as Map (Map, empty, insert) import Data.Maybe (catMaybes, fromJust, isNothing) import Prelude hiding (readFile) import System.IO (Handle, hClose, hFlush) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) import Text.Pandoc ( defaultParserState, defaultWriterOptions, readHtml, writeEPUB, writerEPUBMetadata) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, XmlTree, ($<), (>>>), (/>), (//>), changeAttrValue, getAttrValue, getChildren, getText, hasName, none, processAttrl, processTopDown, this, runX, xshow, when) import Text.HandsomeSoup (css, parseHtml) import Configuration (Cfg, full_stories) import LWN.Article import LWN.HTTP ( ImageMap, download_image_urls, get_article_contents) import LWN.URI (URL) import LWN.XHTML ( XHTML, full_story_urls, image_srcs, full_story_link, full_story_paragraph, is_image, preprocess, remove_byline, remove_full_story_paragraphs, remove_title, to_xhtml, to_xml, xml_from_contents) data Page = -- | An LWN page with one article on it. ArticlePage { article :: Article } | -- | An LWN page with more than one article on it. These require -- different parsing and display functions than the single-article -- pages. FullPage { headline :: String, articles :: [Article] } instance XHTML Page where to_xhtml (ArticlePage a) = "" ++ "" ++ "" ++ " " ++ " " ++ (show $ LWN.Article.title a) ++ "" ++ "" ++ "" ++ (to_xhtml a) ++ "" ++ "" to_xhtml (FullPage hl as) = "" ++ "" ++ "" ++ " " ++ " " ++ hl ++ "" ++ "" ++ "" ++ "

" ++ "

" ++ hl ++ "

" ++ (concatMap to_xhtml as) ++ "

" ++ "" ++ "" page_from_url :: Cfg -> URL -> IO (Maybe Page) page_from_url cfg url = do contents <- get_article_contents cfg url case (xml_from_contents contents) of Just html -> parse cfg html Nothing -> return Nothing insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree insert_full_stories story_map = processTopDown (article_xml `when` full_story_paragraph) where lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree lookup_func href = case Map.lookup href story_map of -- Drop the paragraph if we don't have the contents. Nothing -> none Just v -> to_xml v article_xml :: (ArrowXml a) => a XmlTree XmlTree article_xml = lookup_func $< -- From HXT's Control.Arrow.ArrowList (this /> full_story_link >>> getAttrValue "href") replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree replace_remote_img_srcs image_map = processTopDown (make_srcs_local `when` is_image) where -- old_src -> new_src change_src_func :: String -> String change_src_func old_src = case Map.lookup old_src image_map of -- Leave it alone if we don't have the file locally Nothing -> old_src Just v -> v change_src :: (ArrowXml a) => a XmlTree XmlTree change_src = changeAttrValue change_src_func make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree make_srcs_local = processAttrl $ (change_src `when` (hasName "src")) -- Should be called *after* preprocessing. download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap download_images xml = do image_urls <- runX $ xml >>> image_srcs download_image_urls image_urls type StoryMap = Map.Map URL Article -- These come *before* preprocessing. download_full_story_urls :: Cfg -> [URL] -> IO StoryMap download_full_story_urls cfg story_urls = do pages <- parallel $ map (page_from_url cfg) story_urls let pairs = zip story_urls pages return $ foldl my_insert empty_map pairs where empty_map = Map.empty :: StoryMap my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict my_insert dict (_, _) = dict download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap download_full_stories cfg xml = do story_urls <- runX $ xml >>> full_story_urls download_full_story_urls cfg story_urls parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page) parse cfg xml = do fs_xml <- if (full_stories cfg) then do story_map <- download_full_stories cfg xml return $ xml >>> insert_full_stories story_map else do -- Get rid of them if we don't want them. return $ xml >>> remove_full_story_paragraphs let clean_xml = fs_xml >>> preprocess image_map <- download_images clean_xml let local_xml = clean_xml >>> replace_remote_img_srcs image_map appr <- ap_parse local_xml fppr <- fp_parse local_xml return $ if (isNothing appr) then fppr else appr parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String) parse_headline xml = do let element_filter = xml >>> css "div.PageHeadline h1" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one headline." parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String) parse_byline xml = do let element_filter = xml >>> css "div.FeatureByLine" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article byline." -- -- ArticlePage Stuff -- ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) ap_parse xml = do arts <- ap_parse_articles xml case arts of [x] -> return $ Just $ ArticlePage x _ -> return Nothing ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String) ap_parse_body xml = do let element_filter = xml >>> css "div.ArticleText" let element_html_filter = xshow element_filter element_html <- runX element_html_filter return $ case element_html of [x] -> Just x [] -> Nothing _ -> error "Found more than one article." ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article] ap_parse_articles xml = do parsed_headline <- parse_headline xml parsed_byline <- parse_byline xml parsed_body <- ap_parse_body xml if (isNothing parsed_headline) || (isNothing parsed_body) then return [] else do let title' = Title $ fromJust parsed_headline let byline' = Byline parsed_byline let body' = BodyHtml $ fromJust parsed_body return $ [Article title' byline' body'] -- -- FullPage Stuff -- fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) fp_parse xml = do hl <- parse_headline xml parsed_articles <- fp_parse_articles xml case parsed_articles of [] -> return Nothing x -> return $ Just $ FullPage (fromJust hl) x fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String) fp_parse_article_title xml = do let element_filter = xml >>> css "h2.SummaryHL" let element_text_filter = element_filter //> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article title." fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String) fp_parse_article_body xml = do -- First, delete the article title and byline. let clean_xml' = xml >>> remove_title >>> remove_byline -- The only child of the body element should be a div.lwn-article -- since we wrapped the article's HTML in that. let clean_xml = clean_xml' >>> css "body" >>> getChildren clean_html <- runX . xshow $ clean_xml return $ case clean_html of [x] -> Just x [] -> Nothing _ -> error "Found more than one article body." fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article) fp_parse_article xml = do parsed_article_title <- fp_parse_article_title xml parsed_article_byline <- parse_byline xml parsed_article_body <- fp_parse_article_body xml if (isNothing parsed_article_title) || (isNothing parsed_article_body) then return Nothing else do let title' = Title $ fromJust parsed_article_title let byline' = Byline parsed_article_byline let body' = BodyHtml $ fromJust parsed_article_body return $ Just $ Article title' byline' body' parse_html_article :: String -> IO (Maybe Article) parse_html_article html = do let xml = parseHtml $ wrap_in_body_div html fp_parse_article xml -- | In the full page, all of the article titles and bodies are -- wrapped in one big div.ArticleText. parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree parse_bodies xml = xml >>> css "div.ArticleText" fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article] fp_parse_articles xml = do bodies <- runX . xshow $ parse_bodies xml let article_separator = "

" let split_articles'' = split article_separator (concat bodies) -- The first element will contain the crap before the first . let split_articles' = case split_articles'' of (_:_) -> tail split_articles'' [] -> [] -- Put the separator back, it was lost during the split. let split_articles = map (article_separator ++) split_articles' real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles -- | This makes it easy to select otherwise-random chunks of html -- using 'css'. wrap_in_body_div :: String -> String wrap_in_body_div s = "
" ++ s ++ "
" -- -- Epublishable stuff -- title :: Page -> String title (ArticlePage a) = getTitle $ LWN.Article.title a title (FullPage hl _) = hl metadata :: Page -> IO String metadata obj = do date <- getCurrentTime return $ "http://lwn.net/\n" ++ "" ++ (show date) ++ "\n" ++ "en-US\n" ++ "Copyright Eklektix, Inc.\n" ++ "" ++ (LWN.Page.title obj) ++ "\n" epublish :: Page -> Handle -> IO () epublish obj handle = do let xhtml = to_xhtml obj epmd <- metadata obj epub <- xhtml_to_epub epmd xhtml B.hPut handle epub hFlush handle hClose handle xhtml_to_epub :: String -> String -> IO B.ByteString xhtml_to_epub epmd = write_epub . read_html where my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd } write_epub = writeEPUB Nothing [] my_writer_options read_html = readHtml defaultParserState -- -- Tests -- test_preprocess_links :: Assertion test_preprocess_links = do actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" let expected_xml = expected_xml' !! 0 assertEqual "Links are replaced with spans" expected_xml actual_xml where input_html = "Hello, world!" input_xml = parseHtml input_html expected_html = "Hello, world!" expected_xml'' = parseHtml expected_html test_absolve_images :: Assertion test_absolve_images = do actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" let expected_xml = expected_xml' !! 0 assertEqual "Image srcs are made absolute" expected_xml actual_xml where input_html = "" ++ "" ++ "" input_xml = parseHtml input_html expected_html = "" ++ "" ++ "" expected_xml'' = parseHtml expected_html test_comments_removed :: Assertion test_comments_removed = do actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" let expected_xml = expected_xml' !! 0 assertEqual "Comment links are removed" expected_xml actual_xml where input_html = "
" ++ "Comments (6 posted)" ++ "
" input_xml = parseHtml input_html expected_html = "
" expected_xml'' = parseHtml expected_html test_full_story_urls_parsed :: Assertion test_full_story_urls_parsed = do actual <- runX $ actual' assertEqual "Full Story URLs are parsed" expected actual where expected = ["/Articles/500738/", "/Articles/501837/"] full_story_html = concat ["
", "Full Story ", "(comments: 49)", "
", "Full Story ", "(comments: none)", "
"] xml = parseHtml full_story_html actual' = xml >>> full_story_urls page_tests :: Test page_tests = testGroup "Page Tests" [ testCase "Links are replaced with spans" test_preprocess_links, testCase "Image srcs are made absolute" test_absolve_images, testCase "Comment links are removed" test_comments_removed, testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]