module LWN.FullPage
where
-import Data.String.Utils (strip)
-import Data.Maybe (fromJust)
+import Data.String.Utils (split, strip)
+import Data.Maybe (catMaybes, fromJust)
import Data.Tree.NTree.TypeDefs (NTree)
import Text.XML.HXT.Core (
+ ArrowXml,
IOSArrow,
XmlTree,
XNode,
(>>>),
(/>),
+ (//>),
+ getChildren,
getText,
+ hasAttrValue,
+ hasName,
+ none,
+ processTopDown,
runX,
- xshow
+ xshow,
+ when
)
-import Text.HandsomeSoup (css)
+import Text.HandsomeSoup (css, parseHtml)
import Epublishable
import LWN.Article
-- | An LWN page with more than one article on it. These require
-- different parsing and display functions than the single-article
-- pages.
-data FullPage = FullPage { articles :: [Article] }
+data FullPage = FullPage { headline :: String,
+ articles :: [Article] }
articles_xhtml :: FullPage -> String
-articles_xhtml fp = concatMap show (articles x)
+articles_xhtml fp = concatMap to_xhtml (articles fp)
instance XHTML FullPage where
to_xhtml fp =
"<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
"\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
- "\n<head>\n" ++
+ "<head>" ++
" <meta http-equiv=\"Content-Type\"" ++
" content=\"application/xhtml+xml; charset=utf-8\" />" ++
- " <title>" ++ (headline a) ++ "</title>\n" ++
- "</head>\n" ++
- "<body>\n" ++
- "<div>\n\n" ++
+ " <title>" ++ (headline fp) ++ "</title>" ++
+ "</head>" ++
+ "<body>" ++
+ "<div>" ++
+ "<h1>" ++ (headline fp) ++ "</h1>" ++
(articles_xhtml fp) ++
- "\n\n</div>\n" ++
- "\n</body>\n" ++
+ "</div>" ++
+ "</body>" ++
"</html>"
instance Epublishable FullPage where
parse xml = do
- articles <- parse_articles xml
- case articles of
- (x:xs)@all -> return $ Just $ FullPage all
+ hl <- parse_headline xml
+ parsed_articles <- parse_articles xml
+ case parsed_articles of
+ them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
_ -> return Nothing
- title _ = "LWN.net"
-
-
--- parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
--- parse_headline xml = do
--- let element_filter = xml >>> css "div.PageHeadline h1"
--- let element_text_filter = element_filter /> getText
--- element_text <- runX element_text_filter
--- return $ case element_text of
--- [x] -> Just $ strip x
--- [] -> Nothing
--- _ -> error "Found more than one headline."
-
--- parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
--- parse_byline xml = do
--- let element_filter = xml >>> css "div.Byline"
--- let element_text_filter = element_filter /> getText
--- element_text <- runX element_text_filter
--- return $ case element_text of
--- [x] -> Just $ strip x
--- [] -> Nothing
--- _ -> error "Found more than one byline."
-
-
--- parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
--- parse_body xml = do
--- let element_filter = xml >>> css "div.ArticleText"
--- let element_html_filter = xshow element_filter
--- element_html <- runX element_html_filter
--- return $ case element_html of
--- [x] -> Just x
--- [] -> Nothing
--- _ -> error "Found more than one article."
-
-
--- parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
--- parse_articles xml = do
--- parsed_headline <- parse_headline xml
--- parsed_byline <- parse_byline xml
--- parsed_body <- parse_body xml
--- let headline' = fromJust parsed_headline
--- let byline' = fromJust parsed_byline
--- let body' = fromJust parsed_body
--- return $ Just $ [Article headline' byline' body']
+ title = headline
+
+parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_headline xml = do
+ let element_filter = xml >>> css "div.PageHeadline h1"
+ let element_text_filter = element_filter /> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one headline."
+
+parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_article_byline xml = do
+ let element_filter = xml >>> css "div.FeatureByLine"
+ let element_text_filter = element_filter /> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one article byline."
+
+
+parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_article_title xml = do
+ let element_filter = xml >>> css "h2.SummaryHL"
+ let element_text_filter = element_filter //> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one article title."
+
+
+
+is_title :: (ArrowXml a) => a XmlTree XmlTree
+is_title =
+ (hasName "h2")
+ >>>
+ (hasAttrValue "class" (== "SummaryHL"))
+
+
+is_byline :: (ArrowXml a) => a XmlTree XmlTree
+is_byline =
+ (hasName "div")
+ >>>
+ (hasAttrValue "class" (== "FeatureByLine"))
+
+
+is_image :: (ArrowXml a) => a XmlTree XmlTree
+is_image =
+ hasName "img"
+
+
+remove_title :: (ArrowXml a) => a XmlTree XmlTree
+remove_title =
+ processTopDown ((none) `when` is_title)
+
+
+remove_byline :: (ArrowXml a) => a XmlTree XmlTree
+remove_byline =
+ processTopDown ((none) `when` is_byline)
+
+
+remove_images :: (ArrowXml a) => a XmlTree XmlTree
+remove_images =
+ processTopDown ((none) `when` is_image)
+
+
+
+parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_article_body xml = do
+ -- First, delete the article title and byline.
+ let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images
+ -- The only child of the body element should be a div.lwn-article
+ -- since we wrapped the article's HTML in that.
+ let clean_xml = clean_xml' >>> css "body" >>> getChildren
+ clean_html <- runX . xshow $ clean_xml
+ return $ case clean_html of
+ [x] -> Just x
+ [] -> Nothing
+ _ -> error "Found more than one article body."
+
+parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
+parse_article xml = do
+ parsed_article_title <- parse_article_title xml
+ parsed_article_byline <- parse_article_byline xml
+ parsed_article_body <- parse_article_body xml
+ let title' = Title $ fromJust parsed_article_title
+ let byline' = Byline parsed_article_byline
+ let body' = BodyHtml $ fromJust parsed_article_body
+ return $ Just $ Article title' byline' body'
+
+parse_html_article :: String -> IO (Maybe Article)
+parse_html_article html = do
+ let xml = parseHtml $ wrap_in_body_div html
+ parse_article xml
+
+
+-- | In the full page, all of the article titles and bodies are
+-- wrapped in a div.ArticleText.
+parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
+parse_bodies xml =
+ xml >>> css "div.ArticleText"
+
+
+-- Debug, print a string.
+print_article :: String -> IO ()
+print_article s = do
+ putStrLn "-----------"
+ putStrLn "- Article -"
+ putStrLn "-----------"
+ putStrLn ""
+ putStrLn s
+ putStrLn ""
+
+
+-- Debug, print an article's body html.
+print_body :: Article -> IO ()
+print_body x =
+ print_article bh
+ where
+ bh' = body_html x
+ bh = getBodyHtml bh'
+
+
+parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
+parse_articles xml = do
+ bodies <- runX . xshow $ parse_bodies xml
+ let article_separator = "<h2 class=\"SummaryHL\">"
+ let split_articles'' = split article_separator (concat bodies)
+ -- The first element will contain the crap before the first <h2...>.
+ let split_articles' = tail split_articles''
+ -- Put the separator back, it was lost during the split.
+ let split_articles = map (article_separator ++) split_articles'
+ --_ <- mapM print_article split_articles
+ real_articles <- mapM parse_html_article split_articles
+ let just_articles = catMaybes real_articles
+ return just_articles
+
+
+wrap_in_body_div :: String -> String
+wrap_in_body_div s =
+ "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"