+++ /dev/null
-module LWN.FullPage
-where
-
-import Data.String.Utils (split, strip)
-import Data.Maybe (catMaybes, fromJust)
-import Data.Tree.NTree.TypeDefs (NTree)
-import Text.XML.HXT.Core (
- ArrowXml,
- IOSArrow,
- XmlTree,
- XNode,
- (>>>),
- (/>),
- (//>),
- getChildren,
- getText,
- hasAttrValue,
- hasName,
- none,
- processTopDown,
- runX,
- xshow,
- when
- )
-import Text.HandsomeSoup (css, parseHtml)
-
-import Epublishable
-import LWN.Article
-import XHTML
-
--- | An LWN page with more than one article on it. These require
--- different parsing and display functions than the single-article
--- pages.
-data FullPage = FullPage { headline :: String,
- articles :: [Article] }
-
-articles_xhtml :: FullPage -> String
-articles_xhtml fp = concatMap to_xhtml (articles fp)
-
-instance XHTML FullPage where
- to_xhtml fp =
- "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
- "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
- "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
- "<head>" ++
- " <meta http-equiv=\"Content-Type\"" ++
- " content=\"application/xhtml+xml; charset=utf-8\" />" ++
- " <title>" ++ (headline fp) ++ "</title>" ++
- "</head>" ++
- "<body>" ++
- "<div>" ++
- "<h1>" ++ (headline fp) ++ "</h1>" ++
- (articles_xhtml fp) ++
- "</div>" ++
- "</body>" ++
- "</html>"
-
-instance Epublishable FullPage where
- parse xml = do
- hl <- parse_headline xml
- parsed_articles <- parse_articles xml
- case parsed_articles of
- them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
- _ -> return Nothing
-
- title = headline
-
-
-parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_headline xml = do
- let element_filter = xml >>> css "div.PageHeadline h1"
- let element_text_filter = element_filter /> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one headline."
-
-parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_article_byline xml = do
- let element_filter = xml >>> css "div.FeatureByLine"
- let element_text_filter = element_filter /> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one article byline."
-
-
-parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_article_title xml = do
- let element_filter = xml >>> css "h2.SummaryHL"
- let element_text_filter = element_filter //> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one article title."
-
-
-
-is_title :: (ArrowXml a) => a XmlTree XmlTree
-is_title =
- (hasName "h2")
- >>>
- (hasAttrValue "class" (== "SummaryHL"))
-
-
-is_byline :: (ArrowXml a) => a XmlTree XmlTree
-is_byline =
- (hasName "div")
- >>>
- (hasAttrValue "class" (== "FeatureByLine"))
-
-
-is_image :: (ArrowXml a) => a XmlTree XmlTree
-is_image =
- hasName "img"
-
-
-remove_title :: (ArrowXml a) => a XmlTree XmlTree
-remove_title =
- processTopDown ((none) `when` is_title)
-
-
-remove_byline :: (ArrowXml a) => a XmlTree XmlTree
-remove_byline =
- processTopDown ((none) `when` is_byline)
-
-
-remove_images :: (ArrowXml a) => a XmlTree XmlTree
-remove_images =
- processTopDown ((none) `when` is_image)
-
-
-
-parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_article_body xml = do
- -- First, delete the article title and byline.
- let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images
- -- The only child of the body element should be a div.lwn-article
- -- since we wrapped the article's HTML in that.
- let clean_xml = clean_xml' >>> css "body" >>> getChildren
- clean_html <- runX . xshow $ clean_xml
- return $ case clean_html of
- [x] -> Just x
- [] -> Nothing
- _ -> error "Found more than one article body."
-
-parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
-parse_article xml = do
- parsed_article_title <- parse_article_title xml
- parsed_article_byline <- parse_article_byline xml
- parsed_article_body <- parse_article_body xml
- let title' = Title $ fromJust parsed_article_title
- let byline' = Byline parsed_article_byline
- let body' = BodyHtml $ fromJust parsed_article_body
- return $ Just $ Article title' byline' body'
-
-parse_html_article :: String -> IO (Maybe Article)
-parse_html_article html = do
- let xml = parseHtml $ wrap_in_body_div html
- parse_article xml
-
-
--- | In the full page, all of the article titles and bodies are
--- wrapped in a div.ArticleText.
-parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
-parse_bodies xml =
- xml >>> css "div.ArticleText"
-
-
--- Debug, print a string.
-print_article :: String -> IO ()
-print_article s = do
- putStrLn "-----------"
- putStrLn "- Article -"
- putStrLn "-----------"
- putStrLn ""
- putStrLn s
- putStrLn ""
-
-
--- Debug, print an article's body html.
-print_body :: Article -> IO ()
-print_body x =
- print_article bh
- where
- bh' = body_html x
- bh = getBodyHtml bh'
-
-
-parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
-parse_articles xml = do
- bodies <- runX . xshow $ parse_bodies xml
- let article_separator = "<h2 class=\"SummaryHL\">"
- let split_articles'' = split article_separator (concat bodies)
- -- The first element will contain the crap before the first <h2...>.
- let split_articles' = tail split_articles''
- -- Put the separator back, it was lost during the split.
- let split_articles = map (article_separator ++) split_articles'
- --_ <- mapM print_article split_articles
- real_articles <- mapM parse_html_article split_articles
- let just_articles = catMaybes real_articles
- return just_articles
-
-
-wrap_in_body_div :: String -> String
-wrap_in_body_div s =
- "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"