{-# LANGUAGE DoAndIfThenElse #-} module LWN.Page where import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust, isNothing) import Prelude hiding (readFile) import System.Directory (doesFileExist) import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr) import System.IO.UTF8 (readFile) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) import Text.Pandoc ( defaultParserState, defaultWriterOptions, readHtml, writeEPUB, writerEPUBMetadata) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, IOStateArrow, XmlTree, (>>>), (/>), (//>), changeAttrValue, getAttrValue, getChildren, getText, hasAttrValue, hasName, isElem, mkName, none, processAttrl, processTopDown, runX, setElemName, xshow, when) import Text.HandsomeSoup (css, parseHtml) import Configuration (Cfg, password, use_account, username) import LWN.Article import LWN.HTTP ( ImageMap, download_image_urls, get_page, log_in, make_cookie_jar) import LWN.URI (URL, try_make_absolute_url) import LWN.XHTML (XHTML, parse_lwn, to_xhtml) import Misc (contains) -- | Try to parse the given article using HXT. We try a few different -- methods; if none of them work, we return 'Nothing'. get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree)) get_xml_from_article cfg article_name = do my_article <- real_article_path article_name is_file <- doesFileExist my_article case is_file of True -> do contents <- readFile my_article return $ Just $ parse_lwn contents False -> do -- Download the URL and try to parse it. if use_account cfg then do -- use_account would be false if these fromJusts would fail. cj <- make_cookie_jar li_result <- log_in cj (fromJust $ username cfg) (fromJust $ password cfg) case li_result of Left err -> do let msg = "Failed to log in. " ++ err hPutStrLn stderr msg Right response_body -> do hPutStrLn stderr response_body html <- get_page (Just cj) my_article case html of Left err -> do let msg = "Failed to retrieve page. " ++ err hPutStrLn stderr msg return Nothing Right h -> return $ Just $ parse_lwn h else do html <- get_page Nothing my_article case html of Left err -> do let msg = "Failed to retrieve page. " ++ err hPutStrLn stderr msg return Nothing Right h -> return $ Just $ parse_lwn h -- Should be called *after* preprocessing. download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap download_images xml = do image_urls <- runX $ xml >>> image_srcs download_image_urls image_urls data Page = -- | An LWN page with one article on it. ArticlePage { article :: Article } | -- | An LWN page with more than one article on it. These require -- different parsing and display functions than the single-article -- pages. FullPage { headline :: String, articles :: [Article] } instance XHTML Page where to_xhtml (ArticlePage a) = "" ++ "" ++ "" ++ " " ++ " " ++ (show $ LWN.Article.title a) ++ "" ++ "" ++ "" ++ (to_xhtml a) ++ "" ++ "" to_xhtml (FullPage hl as) = "" ++ "" ++ "" ++ " " ++ " " ++ hl ++ "" ++ "" ++ "" ++ "
" ++ "

" ++ hl ++ "

" ++ (concatMap to_xhtml as) ++ "
" ++ "" ++ "" page_from_url :: Cfg -> URL -> IO (Maybe Page) page_from_url cfg url = do maybe_html <- get_xml_from_article cfg url case maybe_html of Just html -> parse html Nothing -> return Nothing is_link :: (ArrowXml a) => a XmlTree XmlTree is_link = isElem >>> hasName "a" remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree remove_comment_links = processTopDown $ kill_comments `when` is_link where is_comment_link = hasAttrValue "href" (contains "#Comments") kill_comments = none `when` is_comment_link replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree replace_links_with_spans = processTopDown $ (make_span >>> remove_attrs) `when` is_link where make_span = setElemName $ mkName "span" remove_attrs = processAttrl none -- | Preprocessing common to both page types. preprocess :: (ArrowXml a) => a XmlTree XmlTree preprocess = make_image_srcs_absolute >>> remove_comment_links >>> replace_links_with_spans replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree replace_remote_img_srcs image_map = processTopDown (make_srcs_local `when` is_image) where -- old_src -> new_src change_src_func :: String -> String change_src_func old_src = case Map.lookup old_src image_map of -- Leave it alone if we don't have the file locally Nothing -> old_src Just v -> v change_src :: (ArrowXml a) => a XmlTree XmlTree change_src = changeAttrValue change_src_func make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree make_srcs_local = processAttrl $ (change_src `when` (hasName "src")) parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) parse xml = do let clean_xml = xml >>> preprocess image_map <- download_images clean_xml let local_xml = clean_xml >>> replace_remote_img_srcs image_map appr <- ap_parse local_xml fppr <- fp_parse local_xml return $ if (isNothing appr) then fppr else appr parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String) parse_headline xml = do let element_filter = xml >>> css "div.PageHeadline h1" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one headline." parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String) parse_byline xml = do let element_filter = xml >>> css "div.FeatureByLine" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article byline." -- -- ArticlePage Stuff -- ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) ap_parse xml = do arts <- ap_parse_articles xml case arts of [x] -> return $ Just $ ArticlePage x _ -> return Nothing ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String) ap_parse_body xml = do let element_filter = xml >>> css "div.ArticleText" let element_html_filter = xshow element_filter element_html <- runX element_html_filter return $ case element_html of [x] -> Just x [] -> Nothing _ -> error "Found more than one article." ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article] ap_parse_articles xml = do parsed_headline <- parse_headline xml parsed_byline <- parse_byline xml parsed_body <- ap_parse_body xml putStrLn $ fromJust parsed_headline if (isNothing parsed_headline) || (isNothing parsed_body) then return [] else do let title' = Title $ fromJust parsed_headline let byline' = Byline parsed_byline let body' = BodyHtml $ fromJust parsed_body return $ [Article title' byline' body'] -- -- FullPage Stuff -- fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) fp_parse xml = do hl <- parse_headline xml parsed_articles <- fp_parse_articles xml case parsed_articles of [] -> return Nothing x -> return $ Just $ FullPage (fromJust hl) x fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String) fp_parse_article_title xml = do let element_filter = xml >>> css "h2.SummaryHL" let element_text_filter = element_filter //> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article title." is_title :: (ArrowXml a) => a XmlTree XmlTree is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL")) is_byline :: (ArrowXml a) => a XmlTree XmlTree is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine")) is_image :: (ArrowXml a) => a XmlTree XmlTree is_image = isElem >>> hasName "img" remove_title :: (ArrowXml a) => a XmlTree XmlTree remove_title = processTopDown ((none) `when` is_title) remove_byline :: (ArrowXml a) => a XmlTree XmlTree remove_byline = processTopDown ((none) `when` is_byline) fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String) fp_parse_article_body xml = do -- First, delete the article title and byline. let clean_xml' = xml >>> remove_title >>> remove_byline -- The only child of the body element should be a div.lwn-article -- since we wrapped the article's HTML in that. let clean_xml = clean_xml' >>> css "body" >>> getChildren clean_html <- runX . xshow $ clean_xml return $ case clean_html of [x] -> Just x [] -> Nothing _ -> error "Found more than one article body." fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article) fp_parse_article xml = do parsed_article_title <- fp_parse_article_title xml parsed_article_byline <- parse_byline xml parsed_article_body <- fp_parse_article_body xml if (isNothing parsed_article_title) || (isNothing parsed_article_body) then return Nothing else do let title' = Title $ fromJust parsed_article_title let byline' = Byline parsed_article_byline let body' = BodyHtml $ fromJust parsed_article_body return $ Just $ Article title' byline' body' parse_html_article :: String -> IO (Maybe Article) parse_html_article html = do let xml = parseHtml $ wrap_in_body_div html fp_parse_article xml -- | In the full page, all of the article titles and bodies are -- wrapped in one big div.ArticleText. parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree parse_bodies xml = xml >>> css "div.ArticleText" fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article] fp_parse_articles xml = do bodies <- runX . xshow $ parse_bodies xml let article_separator = "

" let split_articles'' = split article_separator (concat bodies) -- The first element will contain the crap before the first . let split_articles' = case split_articles'' of (_:_) -> tail split_articles'' [] -> [] -- Put the separator back, it was lost during the split. let split_articles = map (article_separator ++) split_articles' --_ <- mapM print_article split_articles real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles -- | This makes it easy to select otherwise-random chunks of html -- using 'css'. wrap_in_body_div :: String -> String wrap_in_body_div s = "
" ++ s ++ "
" -- -- Epublishable stuff -- title :: Page -> String title (ArticlePage a) = getTitle $ LWN.Article.title a title (FullPage hl _) = hl metadata :: Page -> IO String metadata obj = do date <- getCurrentTime return $ "http://lwn.net/\n" ++ "" ++ (show date) ++ "\n" ++ "en-US\n" ++ "Copyright Eklektix, Inc.\n" ++ "" ++ (LWN.Page.title obj) ++ "\n" epublish :: Page -> Handle -> IO () epublish obj handle = do let xhtml = to_xhtml obj epmd <- metadata obj epub <- xhtml_to_epub epmd xhtml B.hPut handle epub hFlush handle hClose handle xhtml_to_epub :: String -> String -> IO B.ByteString xhtml_to_epub epmd = write_epub . read_html where my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd } write_epub = writeEPUB Nothing [] my_writer_options read_html = readHtml defaultParserState -- -- Misc -- image_srcs :: (ArrowXml a) => a XmlTree URL image_srcs = css "img" >>> getAttrValue "src" make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_image_srcs_absolute = processTopDown (make_srcs_absolute `when` is_image) where change_src :: (ArrowXml a) => a XmlTree XmlTree change_src = changeAttrValue try_make_absolute_url make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_srcs_absolute = processAttrl $ change_src `when` hasName "src" -- -- Tests -- test_preprocess_links :: Assertion test_preprocess_links = do actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" let expected_xml = expected_xml' !! 0 assertEqual "Links are replaced with spans" expected_xml actual_xml where input_html = "Hello, world!" input_xml = parseHtml input_html expected_html = "Hello, world!" expected_xml'' = parseHtml expected_html test_absolve_images :: Assertion test_absolve_images = do actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" let expected_xml = expected_xml' !! 0 assertEqual "Image srcs are made absolute" expected_xml actual_xml where input_html = "" ++ "" ++ "" input_xml = parseHtml input_html expected_html = "" ++ "" ++ "" expected_xml'' = parseHtml expected_html test_comments_removed :: Assertion test_comments_removed = do actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" let expected_xml = expected_xml' !! 0 assertEqual "Comment links are removed" expected_xml actual_xml where input_html = "

" ++ "Comments (6 posted)" ++ "

" input_xml = parseHtml input_html expected_html = "

" expected_xml'' = parseHtml expected_html page_tests :: Test page_tests = testGroup "Page Tests" [ testCase "Links are replaced with spans" test_preprocess_links, testCase "Image srcs are made absolute" test_absolve_images, testCase "Comment links are removed" test_comments_removed ]