X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FArticlePage.hs;fp=src%2FLWN%2FArticlePage.hs;h=0e699a0978953ad61e9aaa7105c97025b11be3c2;hp=0000000000000000000000000000000000000000;hb=b591d5f5c3c47d253c24144beae1edf8648cd94b;hpb=8e5616a377196f0a200947173d4c78a3dca8a55f diff --git a/src/LWN/ArticlePage.hs b/src/LWN/ArticlePage.hs new file mode 100644 index 0000000..0e699a0 --- /dev/null +++ b/src/LWN/ArticlePage.hs @@ -0,0 +1,101 @@ +module LWN.ArticlePage +where + +import Data.String.Utils (strip) +import Data.Maybe (fromJust) +import Data.Tree.NTree.TypeDefs (NTree) +import Text.XML.HXT.Core ( + IOSArrow, + XmlTree, + XNode, + (>>>), + (/>), + getText, + runX, + xshow + ) +import Text.HandsomeSoup (css) + +import Epublishable +import LWN.Article +import XHTML + +-- | Defines the ArticlePage data type, containing one 'Article'. +data ArticlePage = ArticlePage { article :: Article } + + +instance XHTML ArticlePage where + to_xhtml (ArticlePage a) = + "" ++ + "" ++ + "\n\n" ++ + " " ++ + " " ++ (headline a) ++ "\n" ++ + "\n" ++ + "\n" ++ + "
\n\n" ++ + (to_xhtml a) ++ + "\n\n
\n" ++ + "\n\n" ++ + "" + + +instance Epublishable ArticlePage where + parse xml = do + articles <- parse_articles xml + case articles of + Just [x] -> return $ Just $ ArticlePage x + _ -> return Nothing + + title (ArticlePage x) = headline x + + +-- | Takes data from an LWN page and determines whether or not it's a +-- single article (as opposed to a page with multiple articles). +is_article_page :: String -> IO Bool +is_article_page _ = return True + + +parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_headline xml = do + let element_filter = xml >>> css "div.PageHeadline h1" + let element_text_filter = element_filter /> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one headline." + +parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_byline xml = do + let element_filter = xml >>> css "div.Byline" + let element_text_filter = element_filter /> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one byline." + + +parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_body xml = do + let element_filter = xml >>> css "div.ArticleText" + let element_html_filter = xshow element_filter + element_html <- runX element_html_filter + return $ case element_html of + [x] -> Just x + [] -> Nothing + _ -> error "Found more than one article." + + +parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article]) +parse_articles xml = do + parsed_headline <- parse_headline xml + parsed_byline <- parse_byline xml + parsed_body <- parse_body xml + let headline' = fromJust parsed_headline + let byline' = fromJust parsed_byline + let body' = fromJust parsed_body + return $ Just $ [Article headline' byline' body']