X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FFullPage.hs;h=1ba7910cf329d414a6b8e6d6ab10dbceee239050;hp=9ff782c89bc0b7e60cda43a7d47379b97470083b;hb=ed2ed8abd62ba3dec7f799253de1133732f8c153;hpb=b591d5f5c3c47d253c24144beae1edf8648cd94b diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs index 9ff782c..1ba7910 100644 --- a/src/LWN/FullPage.hs +++ b/src/LWN/FullPage.hs @@ -1,6 +1,29 @@ module LWN.FullPage where +import Data.String.Utils (split, strip) +import Data.Maybe (catMaybes, fromJust) +import Data.Tree.NTree.TypeDefs (NTree) +import Text.XML.HXT.Core ( + ArrowXml, + IOSArrow, + XmlTree, + XNode, + (>>>), + (/>), + (//>), + getChildren, + getText, + hasAttrValue, + hasName, + none, + processTopDown, + runX, + xshow, + when + ) +import Text.HandsomeSoup (css, parseHtml) + import Epublishable import LWN.Article import XHTML @@ -8,16 +31,180 @@ import XHTML -- | An LWN page with more than one article on it. These require -- different parsing and display functions than the single-article -- pages. -data FullPage = FullPage { articles :: [Article] } +data FullPage = FullPage { headline :: String, + articles :: [Article] } + +articles_xhtml :: FullPage -> String +articles_xhtml fp = concatMap to_xhtml (articles fp) instance XHTML FullPage where - to_xhtml fp = "" + to_xhtml fp = + "" ++ + "" ++ + "" ++ + " " ++ + " " ++ (headline fp) ++ "" ++ + "" ++ + "" ++ + "
" ++ + "

" ++ (headline fp) ++ "

" ++ + (articles_xhtml fp) ++ + "
" ++ + "" ++ + "" instance Epublishable FullPage where parse xml = do - articles <- parse_articles xml - case articles of - (x:xs)@all -> return $ Just $ FullPage all + hl <- parse_headline xml + parsed_articles <- parse_articles xml + case parsed_articles of + them@(_:_) -> return $ Just $ FullPage (fromJust hl) them _ -> return Nothing - title _ = "LWN.net" + title = headline + + +parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_headline xml = do + let element_filter = xml >>> css "div.PageHeadline h1" + let element_text_filter = element_filter /> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one headline." + +parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_article_byline xml = do + let element_filter = xml >>> css "div.FeatureByLine" + let element_text_filter = element_filter /> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one article byline." + + +parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_article_title xml = do + let element_filter = xml >>> css "h2.SummaryHL" + let element_text_filter = element_filter //> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one article title." + + + +is_title :: (ArrowXml a) => a XmlTree XmlTree +is_title = + (hasName "h2") + >>> + (hasAttrValue "class" (== "SummaryHL")) + + +is_byline :: (ArrowXml a) => a XmlTree XmlTree +is_byline = + (hasName "div") + >>> + (hasAttrValue "class" (== "FeatureByLine")) + + +is_image :: (ArrowXml a) => a XmlTree XmlTree +is_image = + hasName "img" + + +remove_title :: (ArrowXml a) => a XmlTree XmlTree +remove_title = + processTopDown ((none) `when` is_title) + + +remove_byline :: (ArrowXml a) => a XmlTree XmlTree +remove_byline = + processTopDown ((none) `when` is_byline) + + +remove_images :: (ArrowXml a) => a XmlTree XmlTree +remove_images = + processTopDown ((none) `when` is_image) + + + +parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_article_body xml = do + -- First, delete the article title and byline. + let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images + -- The only child of the body element should be a div.lwn-article + -- since we wrapped the article's HTML in that. + let clean_xml = clean_xml' >>> css "body" >>> getChildren + clean_html <- runX . xshow $ clean_xml + return $ case clean_html of + [x] -> Just x + [] -> Nothing + _ -> error "Found more than one article body." + +parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article) +parse_article xml = do + parsed_article_title <- parse_article_title xml + parsed_article_byline <- parse_article_byline xml + parsed_article_body <- parse_article_body xml + let title' = Title $ fromJust parsed_article_title + let byline' = Byline parsed_article_byline + let body' = BodyHtml $ fromJust parsed_article_body + return $ Just $ Article title' byline' body' + +parse_html_article :: String -> IO (Maybe Article) +parse_html_article html = do + let xml = parseHtml $ wrap_in_body_div html + parse_article xml + + +-- | In the full page, all of the article titles and bodies are +-- wrapped in a div.ArticleText. +parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) +parse_bodies xml = + xml >>> css "div.ArticleText" + + +-- Debug, print a string. +print_article :: String -> IO () +print_article s = do + putStrLn "-----------" + putStrLn "- Article -" + putStrLn "-----------" + putStrLn "" + putStrLn s + putStrLn "" + + +-- Debug, print an article's body html. +print_body :: Article -> IO () +print_body x = + print_article bh + where + bh' = body_html x + bh = getBodyHtml bh' + + +parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article] +parse_articles xml = do + bodies <- runX . xshow $ parse_bodies xml + let article_separator = "

" + let split_articles'' = split article_separator (concat bodies) + -- The first element will contain the crap before the first . + let split_articles' = tail split_articles'' + -- Put the separator back, it was lost during the split. + let split_articles = map (article_separator ++) split_articles' + --_ <- mapM print_article split_articles + real_articles <- mapM parse_html_article split_articles + let just_articles = catMaybes real_articles + return just_articles + + +wrap_in_body_div :: String -> String +wrap_in_body_div s = + "
" ++ s ++ "
"