{-# LANGUAGE DoAndIfThenElse #-} module LWN.Page where import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) import System.IO (Handle, hClose, hFlush) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust, isNothing) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) import Text.Pandoc ( defaultParserState, defaultWriterOptions, readHtml, writeEPUB, writerEPUBMetadata) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, XmlTree, (>>>), (/>), (//>), changeAttrValue, getAttrValue, getChildren, getText, hasAttrValue, hasName, isElem, mkName, none, processAttrl, processTopDown, runX, setElemName, xshow, when) import Text.HandsomeSoup (css, parseHtml) import LWN.Article import LWN.HTTP (ImageMap, download_image_urls) import LWN.URI (URL, try_make_absolute_url) import LWN.XHTML (XHTML, to_xhtml) import Misc (contains) -- Should be called *after* preprocessing. download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap download_images xml = do image_urls <- runX $ xml >>> image_srcs download_image_urls image_urls data Page = -- | An LWN page with one article on it. ArticlePage { article :: Article } | -- | An LWN page with more than one article on it. These require -- different parsing and display functions than the single-article -- pages. FullPage { headline :: String, articles :: [Article] } instance XHTML Page where to_xhtml (ArticlePage a) = "" ++ "" ++ "" ++ " " ++ " " ++ (show $ LWN.Article.title a) ++ "" ++ "" ++ "" ++ "
" ++ (to_xhtml a) ++ "
" ++ "" ++ "" to_xhtml (FullPage hl as) = "" ++ "" ++ "" ++ " " ++ " " ++ hl ++ "" ++ "" ++ "" ++ "
" ++ "

" ++ hl ++ "

" ++ (concatMap to_xhtml as) ++ "
" ++ "" ++ "" is_link :: (ArrowXml a) => a XmlTree XmlTree is_link = isElem >>> hasName "a" remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree remove_comment_links = processTopDown $ kill_comments `when` is_link where is_comment_link = hasAttrValue "href" (contains "#Comments") kill_comments = none `when` is_comment_link replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree replace_links_with_spans = processTopDown $ (make_span >>> remove_attrs) `when` is_link where make_span = setElemName $ mkName "span" remove_attrs = processAttrl none -- | Preprocessing common to both page types. preprocess :: (ArrowXml a) => a XmlTree XmlTree preprocess = make_image_srcs_absolute >>> remove_comment_links >>> replace_links_with_spans replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree replace_remote_img_srcs image_map = processTopDown (make_srcs_local `when` is_image) where -- old_src -> new_src change_src_func :: String -> String change_src_func old_src = case Map.lookup old_src image_map of -- Leave it alone if we don't have the file locally Nothing -> old_src Just v -> v change_src :: (ArrowXml a) => a XmlTree XmlTree change_src = changeAttrValue change_src_func make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree make_srcs_local = processAttrl $ (change_src `when` (hasName "src")) parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) parse xml = do let clean_xml = xml >>> preprocess image_map <- download_images clean_xml let local_xml = clean_xml >>> replace_remote_img_srcs image_map appr <- ap_parse local_xml fppr <- fp_parse local_xml return $ if (isNothing appr) then fppr else appr parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String) parse_headline xml = do let element_filter = xml >>> css "div.PageHeadline h1" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one headline." parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String) parse_byline xml = do let element_filter = xml >>> css "div.FeatureByLine" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article byline." -- -- ArticlePage Stuff -- ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) ap_parse xml = do arts <- ap_parse_articles xml case arts of [x] -> return $ Just $ ArticlePage x _ -> return Nothing ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String) ap_parse_body xml = do let element_filter = xml >>> css "div.ArticleText" let element_html_filter = xshow element_filter element_html <- runX element_html_filter return $ case element_html of [x] -> Just x [] -> Nothing _ -> error "Found more than one article." ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article] ap_parse_articles xml = do parsed_headline <- parse_headline xml parsed_byline <- parse_byline xml parsed_body <- ap_parse_body xml putStrLn $ fromJust parsed_headline if (isNothing parsed_headline) || (isNothing parsed_body) then return [] else do let title' = Title $ fromJust parsed_headline let byline' = Byline parsed_byline let body' = BodyHtml $ fromJust parsed_body return $ [Article title' byline' body'] -- -- FullPage Stuff -- fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) fp_parse xml = do hl <- parse_headline xml parsed_articles <- fp_parse_articles xml case parsed_articles of [] -> return Nothing x -> return $ Just $ FullPage (fromJust hl) x fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String) fp_parse_article_title xml = do let element_filter = xml >>> css "h2.SummaryHL" let element_text_filter = element_filter //> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article title." is_title :: (ArrowXml a) => a XmlTree XmlTree is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL")) is_byline :: (ArrowXml a) => a XmlTree XmlTree is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine")) is_image :: (ArrowXml a) => a XmlTree XmlTree is_image = isElem >>> hasName "img" remove_title :: (ArrowXml a) => a XmlTree XmlTree remove_title = processTopDown ((none) `when` is_title) remove_byline :: (ArrowXml a) => a XmlTree XmlTree remove_byline = processTopDown ((none) `when` is_byline) fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String) fp_parse_article_body xml = do -- First, delete the article title and byline. let clean_xml' = xml >>> remove_title >>> remove_byline -- The only child of the body element should be a div.lwn-article -- since we wrapped the article's HTML in that. let clean_xml = clean_xml' >>> css "body" >>> getChildren clean_html <- runX . xshow $ clean_xml return $ case clean_html of [x] -> Just x [] -> Nothing _ -> error "Found more than one article body." fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article) fp_parse_article xml = do parsed_article_title <- fp_parse_article_title xml parsed_article_byline <- parse_byline xml parsed_article_body <- fp_parse_article_body xml if (isNothing parsed_article_title) || (isNothing parsed_article_body) then return Nothing else do let title' = Title $ fromJust parsed_article_title let byline' = Byline parsed_article_byline let body' = BodyHtml $ fromJust parsed_article_body return $ Just $ Article title' byline' body' parse_html_article :: String -> IO (Maybe Article) parse_html_article html = do let xml = parseHtml $ wrap_in_body_div html fp_parse_article xml -- | In the full page, all of the article titles and bodies are -- wrapped in one big div.ArticleText. parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree parse_bodies xml = xml >>> css "div.ArticleText" fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article] fp_parse_articles xml = do bodies <- runX . xshow $ parse_bodies xml let article_separator = "

" let split_articles'' = split article_separator (concat bodies) -- The first element will contain the crap before the first . let split_articles' = case split_articles'' of (_:_) -> tail split_articles'' [] -> [] -- Put the separator back, it was lost during the split. let split_articles = map (article_separator ++) split_articles' --_ <- mapM print_article split_articles real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles -- | This makes it easy to select otherwise-random chunks of html -- using 'css'. wrap_in_body_div :: String -> String wrap_in_body_div s = "
" ++ s ++ "
" -- -- Epublishable stuff -- title :: Page -> String title (ArticlePage a) = getTitle $ LWN.Article.title a title (FullPage hl _) = hl metadata :: Page -> IO String metadata obj = do date <- getCurrentTime return $ "http://lwn.net/\n" ++ "" ++ (show date) ++ "\n" ++ "en-US\n" ++ "Copyright Eklektix, Inc.\n" ++ "" ++ (LWN.Page.title obj) ++ "\n" epublish :: Page -> Handle -> IO () epublish obj handle = do let xhtml = to_xhtml obj epmd <- metadata obj epub <- xhtml_to_epub epmd xhtml B.hPut handle epub hFlush handle hClose handle xhtml_to_epub :: String -> String -> IO B.ByteString xhtml_to_epub epmd = write_epub . read_html where my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd } write_epub = writeEPUB Nothing [] my_writer_options read_html = readHtml defaultParserState -- -- Misc -- image_srcs :: (ArrowXml a) => a XmlTree URL image_srcs = css "img" >>> getAttrValue "src" make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_image_srcs_absolute = processTopDown (make_srcs_absolute `when` is_image) where change_src :: (ArrowXml a) => a XmlTree XmlTree change_src = changeAttrValue try_make_absolute_url make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_srcs_absolute = processAttrl $ change_src `when` hasName "src" -- -- Tests -- test_preprocess_links :: Assertion test_preprocess_links = do actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" let expected_xml = expected_xml' !! 0 assertEqual "Links are replaced with spans" expected_xml actual_xml where input_html = "Hello, world!" input_xml = parseHtml input_html expected_html = "Hello, world!" expected_xml'' = parseHtml expected_html test_absolve_images :: Assertion test_absolve_images = do actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" let expected_xml = expected_xml' !! 0 assertEqual "Image srcs are made absolute" expected_xml actual_xml where input_html = "" ++ "" ++ "" input_xml = parseHtml input_html expected_html = "" ++ "" ++ "" expected_xml'' = parseHtml expected_html test_comments_removed :: Assertion test_comments_removed = do actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" let expected_xml = expected_xml' !! 0 assertEqual "Comment links are removed" expected_xml actual_xml where input_html = "

" ++ "Comments (6 posted)" ++ "

" input_xml = parseHtml input_html expected_html = "

" expected_xml'' = parseHtml expected_html page_tests :: Test page_tests = testGroup "Page Tests" [ testCase "Links are replaced with spans" test_preprocess_links, testCase "Image srcs are made absolute" test_absolve_images, testCase "Comment links are removed" test_comments_removed ]