module LWN.FullPage where import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust) import Data.Tree.NTree.TypeDefs (NTree) import Text.XML.HXT.Core ( IOSArrow, XmlTree, XNode, (>>>), (/>), getText, hasAttrValue, hasName, none, processTopDown, runX, xshow, when ) import Text.XML.HXT.Arrow.Edit (indentDoc) import Text.HandsomeSoup (css, parseHtml) import Epublishable import LWN.Article import XHTML -- | An LWN page with more than one article on it. These require -- different parsing and display functions than the single-article -- pages. data FullPage = FullPage { headline :: String, articles :: [Article] } articles_xhtml :: FullPage -> String articles_xhtml fp = concatMap to_xhtml (articles fp) instance XHTML FullPage where to_xhtml fp = "" ++ "" ++ "\n\n" ++ " " ++ " " ++ (headline fp) ++ "\n" ++ "\n" ++ "\n" ++ "
\n\n" ++ "

" ++ (headline fp) ++ "

" ++ (articles_xhtml fp) ++ "\n\n
\n" ++ "\n\n" ++ "" instance Epublishable FullPage where parse xml = do hl <- parse_headline xml parsed_articles <- parse_articles xml case parsed_articles of them@(_:_) -> return $ Just $ FullPage (fromJust hl) them _ -> return Nothing title _ = "LWN.net" parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_headline xml = do let element_filter = xml >>> css "div.PageHeadline h1" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one headline." parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_byline xml = do let element_filter = xml >>> css "div.FeatureByLine" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article byline." parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_title xml = do let element_filter = xml >>> css "h2.SummaryHL a" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x [] -> Nothing _ -> error "Found more than one article title." --is_title :: Integer --is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL")) --is_byline :: Integer --is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine")) parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_body xml = do -- First, delete the article title and byline. --let clean_xml = xml >>> processTopDown ((none) `when` is_title) >>> processTopDown ((none) `when` is_byline) let clean_xml = xml clean_html <- runX $ xshow clean_xml return $ case clean_html of [x] -> Just x [] -> Nothing _ -> error "Found more than one article body." parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article) parse_article xml = do parsed_article_title <- parse_article_title xml parsed_article_byline <- parse_article_byline xml parsed_article_body <- parse_article_body xml -- let title' = fromJust parsed_article_title let title' = "title" -- let byline' = fromJust parsed_article_byline let byline' = "byline" -- let body' = fromJust parsed_article_body body' <- runX . xshow $ xml return $ Just $ Article title' byline' (body' !! 0) parse_html_article :: String -> IO (Maybe Article) parse_html_article html = do let xml = parseHtml html parse_article xml -- | In the full page, all of the article titles and bodies are -- wrapped in a div.ArticleText. parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) parse_bodies xml = xml >>> css "div.ArticleText" parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article] parse_articles xml = do bodies <- runX . xshow $ (parse_bodies xml) >>> indentDoc let article_separator = "

" let split_articles'' = split article_separator (bodies !! 0) -- The first element will contain the crap before the first . let split_articles' = tail split_articles'' -- Put the separator back, it was lost during the split. let split_articles = map (("\n" ++ article_separator) ++) split_articles' putStrLn "split articles\n\n" mapM putStrLn split_articles real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles