module LWN.FullPage where import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust) import Data.Tree.NTree.TypeDefs (NTree) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, XmlTree, XNode, (>>>), (/>), (//>), getChildren, getText, hasAttrValue, hasName, none, processTopDown, runX, xshow, when ) import Text.HandsomeSoup (css, parseHtml) import Epublishable import LWN.Article import XHTML -- | An LWN page with more than one article on it. These require -- different parsing and display functions than the single-article -- pages. data FullPage = FullPage { headline :: String, articles :: [Article] } articles_xhtml :: FullPage -> String articles_xhtml fp = concatMap to_xhtml (articles fp) instance XHTML FullPage where to_xhtml fp = "" ++ "" ++ "" ++ " " ++ " " ++ (headline fp) ++ "" ++ "" ++ "" ++ "
" ++ "

" ++ (headline fp) ++ "

" ++ (articles_xhtml fp) ++ "
" ++ "" ++ "" instance Epublishable FullPage where parse xml = do hl <- parse_headline xml parsed_articles <- parse_articles xml case parsed_articles of them@(_:_) -> return $ Just $ FullPage (fromJust hl) them _ -> return Nothing title = headline parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_headline xml = do let element_filter = xml >>> css "div.PageHeadline h1" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one headline." parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_byline xml = do let element_filter = xml >>> css "div.FeatureByLine" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article byline." parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_title xml = do let element_filter = xml >>> css "h2.SummaryHL" let element_text_filter = element_filter //> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article title." is_title :: (ArrowXml a) => a XmlTree XmlTree is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL")) is_byline :: (ArrowXml a) => a XmlTree XmlTree is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine")) is_image :: (ArrowXml a) => a XmlTree XmlTree is_image = hasName "img" remove_title :: (ArrowXml a) => a XmlTree XmlTree remove_title = processTopDown ((none) `when` is_title) remove_byline :: (ArrowXml a) => a XmlTree XmlTree remove_byline = processTopDown ((none) `when` is_byline) remove_images :: (ArrowXml a) => a XmlTree XmlTree remove_images = processTopDown ((none) `when` is_image) parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_body xml = do -- First, delete the article title and byline. let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images -- The only child of the body element should be a div.lwn-article -- since we wrapped the article's HTML in that. let clean_xml = clean_xml' >>> css "body" >>> getChildren clean_html <- runX . xshow $ clean_xml return $ case clean_html of [x] -> Just x [] -> Nothing _ -> error "Found more than one article body." parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article) parse_article xml = do parsed_article_title <- parse_article_title xml parsed_article_byline <- parse_article_byline xml parsed_article_body <- parse_article_body xml let title' = Title $ fromJust parsed_article_title let byline' = Byline parsed_article_byline let body' = BodyHtml $ fromJust parsed_article_body return $ Just $ Article title' byline' body' parse_html_article :: String -> IO (Maybe Article) parse_html_article html = do let xml = parseHtml $ wrap_in_body_div html parse_article xml -- | In the full page, all of the article titles and bodies are -- wrapped in a div.ArticleText. parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) parse_bodies xml = xml >>> css "div.ArticleText" -- Debug, print a string. print_article :: String -> IO () print_article s = do putStrLn "-----------" putStrLn "- Article -" putStrLn "-----------" putStrLn "" putStrLn s putStrLn "" -- Debug, print an article's body html. print_body :: Article -> IO () print_body x = print_article bh where bh' = body_html x bh = getBodyHtml bh' parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article] parse_articles xml = do bodies <- runX . xshow $ parse_bodies xml let article_separator = "

" let split_articles'' = split article_separator (concat bodies) -- The first element will contain the crap before the first . let split_articles' = tail split_articles'' -- Put the separator back, it was lost during the split. let split_articles = map (article_separator ++) split_articles' --_ <- mapM print_article split_articles real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles wrap_in_body_div :: String -> String wrap_in_body_div s = "
" ++ s ++ "
"