X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FFullPage.hs;fp=src%2FLWN%2FFullPage.hs;h=0000000000000000000000000000000000000000;hp=1ba7910cf329d414a6b8e6d6ab10dbceee239050;hb=6f0e6cbece7e1b1a3c6b43d19eb2f29088af981c;hpb=ed2ed8abd62ba3dec7f799253de1133732f8c153 diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs deleted file mode 100644 index 1ba7910..0000000 --- a/src/LWN/FullPage.hs +++ /dev/null @@ -1,210 +0,0 @@ -module LWN.FullPage -where - -import Data.String.Utils (split, strip) -import Data.Maybe (catMaybes, fromJust) -import Data.Tree.NTree.TypeDefs (NTree) -import Text.XML.HXT.Core ( - ArrowXml, - IOSArrow, - XmlTree, - XNode, - (>>>), - (/>), - (//>), - getChildren, - getText, - hasAttrValue, - hasName, - none, - processTopDown, - runX, - xshow, - when - ) -import Text.HandsomeSoup (css, parseHtml) - -import Epublishable -import LWN.Article -import XHTML - --- | An LWN page with more than one article on it. These require --- different parsing and display functions than the single-article --- pages. -data FullPage = FullPage { headline :: String, - articles :: [Article] } - -articles_xhtml :: FullPage -> String -articles_xhtml fp = concatMap to_xhtml (articles fp) - -instance XHTML FullPage where - to_xhtml fp = - "" ++ - "" ++ - "" ++ - " " ++ - " " ++ (headline fp) ++ "" ++ - "" ++ - "" ++ - "

" ++ - "

" ++ (headline fp) ++ "

" ++ - (articles_xhtml fp) ++ - "

" ++ - "" ++ - "" - -instance Epublishable FullPage where - parse xml = do - hl <- parse_headline xml - parsed_articles <- parse_articles xml - case parsed_articles of - them@(_:_) -> return $ Just $ FullPage (fromJust hl) them - _ -> return Nothing - - title = headline - - -parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -parse_headline xml = do - let element_filter = xml >>> css "div.PageHeadline h1" - let element_text_filter = element_filter /> getText - element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one headline." - -parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -parse_article_byline xml = do - let element_filter = xml >>> css "div.FeatureByLine" - let element_text_filter = element_filter /> getText - element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one article byline." - - -parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -parse_article_title xml = do - let element_filter = xml >>> css "h2.SummaryHL" - let element_text_filter = element_filter //> getText - element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one article title." - - - -is_title :: (ArrowXml a) => a XmlTree XmlTree -is_title = - (hasName "h2") - >>> - (hasAttrValue "class" (== "SummaryHL")) - - -is_byline :: (ArrowXml a) => a XmlTree XmlTree -is_byline = - (hasName "div") - >>> - (hasAttrValue "class" (== "FeatureByLine")) - - -is_image :: (ArrowXml a) => a XmlTree XmlTree -is_image = - hasName "img" - - -remove_title :: (ArrowXml a) => a XmlTree XmlTree -remove_title = - processTopDown ((none) `when` is_title) - - -remove_byline :: (ArrowXml a) => a XmlTree XmlTree -remove_byline = - processTopDown ((none) `when` is_byline) - - -remove_images :: (ArrowXml a) => a XmlTree XmlTree -remove_images = - processTopDown ((none) `when` is_image) - - - -parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -parse_article_body xml = do - -- First, delete the article title and byline. - let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images - -- The only child of the body element should be a div.lwn-article - -- since we wrapped the article's HTML in that. - let clean_xml = clean_xml' >>> css "body" >>> getChildren - clean_html <- runX . xshow $ clean_xml - return $ case clean_html of - [x] -> Just x - [] -> Nothing - _ -> error "Found more than one article body." - -parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article) -parse_article xml = do - parsed_article_title <- parse_article_title xml - parsed_article_byline <- parse_article_byline xml - parsed_article_body <- parse_article_body xml - let title' = Title $ fromJust parsed_article_title - let byline' = Byline parsed_article_byline - let body' = BodyHtml $ fromJust parsed_article_body - return $ Just $ Article title' byline' body' - -parse_html_article :: String -> IO (Maybe Article) -parse_html_article html = do - let xml = parseHtml $ wrap_in_body_div html - parse_article xml - - --- | In the full page, all of the article titles and bodies are --- wrapped in a div.ArticleText. -parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) -parse_bodies xml = - xml >>> css "div.ArticleText" - - --- Debug, print a string. -print_article :: String -> IO () -print_article s = do - putStrLn "-----------" - putStrLn "- Article -" - putStrLn "-----------" - putStrLn "" - putStrLn s - putStrLn "" - - --- Debug, print an article's body html. -print_body :: Article -> IO () -print_body x = - print_article bh - where - bh' = body_html x - bh = getBodyHtml bh' - - -parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article] -parse_articles xml = do - bodies <- runX . xshow $ parse_bodies xml - let article_separator = "

" - let split_articles'' = split article_separator (concat bodies) - -- The first element will contain the crap before the first . - let split_articles' = tail split_articles'' - -- Put the separator back, it was lost during the split. - let split_articles = map (article_separator ++) split_articles' - --_ <- mapM print_article split_articles - real_articles <- mapM parse_html_article split_articles - let just_articles = catMaybes real_articles - return just_articles - - -wrap_in_body_div :: String -> String -wrap_in_body_div s = - "
" ++ s ++ "
"