\n\n" ++ @@ -49,7 +49,7 @@ instance Epublishable ArticlePage where Just [x] -> return $ Just $ ArticlePage x _ -> return Nothing - title (ArticlePage x) = headline x + title (ArticlePage x) = LWN.Article.title x -- | Takes data from an LWN page and determines whether or not it's a @@ -95,7 +95,7 @@ parse_articles xml = do parsed_headline <- parse_headline xml parsed_byline <- parse_byline xml parsed_body <- parse_body xml - let headline' = fromJust parsed_headline - let byline' = fromJust parsed_byline - let body' = fromJust parsed_body - return $ Just $ [Article headline' byline' body'] + let title' = fromJust parsed_headline + let byline' = fromJust parsed_byline + let body' = fromJust parsed_body + return $ Just $ [Article title' byline' body'] diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs index cc89737..ac88862 100644 --- a/src/LWN/FullPage.hs +++ b/src/LWN/FullPage.hs @@ -1,8 +1,8 @@ module LWN.FullPage where -import Data.String.Utils (strip) -import Data.Maybe (fromJust) +import Data.String.Utils (split, strip) +import Data.Maybe (catMaybes, fromJust) import Data.Tree.NTree.TypeDefs (NTree) import Text.XML.HXT.Core ( IOSArrow, @@ -11,10 +11,16 @@ import Text.XML.HXT.Core ( (>>>), (/>), getText, + hasAttrValue, + hasName, + none, + processTopDown, runX, - xshow + xshow, + when ) -import Text.HandsomeSoup (css) +import Text.XML.HXT.Arrow.Edit (indentDoc) +import Text.HandsomeSoup (css, parseHtml) import Epublishable import LWN.Article @@ -23,10 +29,11 @@ import XHTML -- | An LWN page with more than one article on it. These require -- different parsing and display functions than the single-article -- pages. -data FullPage = FullPage { articles :: [Article] } +data FullPage = FullPage { headline :: String, + articles :: [Article] } articles_xhtml :: FullPage -> String -articles_xhtml fp = concatMap show (articles x) +articles_xhtml fp = concatMap to_xhtml (articles fp) instance XHTML FullPage where to_xhtml fp = @@ -36,10 +43,11 @@ instance XHTML FullPage where "\n\n" ++ " " ++ - " " ++ (headline a) ++ "\n" ++ + " " ++ (headline fp) ++ "\n" ++ "\n" ++ "\n" ++ "

\n\n" ++ + "

" ++ (headline fp) ++ "

" ++ (articles_xhtml fp) ++ "\n\n

\n" ++ "\n\n" ++ @@ -47,53 +55,104 @@ instance XHTML FullPage where instance Epublishable FullPage where parse xml = do - articles <- parse_articles xml - case articles of - (x:xs)@all -> return $ Just $ FullPage all + hl <- parse_headline xml + parsed_articles <- parse_articles xml + case parsed_articles of + them@(_:_) -> return $ Just $ FullPage (fromJust hl) them _ -> return Nothing title _ = "LWN.net" --- parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) --- parse_headline xml = do --- let element_filter = xml >>> css "div.PageHeadline h1" --- let element_text_filter = element_filter /> getText --- element_text <- runX element_text_filter --- return $ case element_text of --- [x] -> Just $ strip x --- [] -> Nothing --- _ -> error "Found more than one headline." - --- parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) --- parse_byline xml = do --- let element_filter = xml >>> css "div.Byline" --- let element_text_filter = element_filter /> getText --- element_text <- runX element_text_filter --- return $ case element_text of --- [x] -> Just $ strip x --- [] -> Nothing --- _ -> error "Found more than one byline." - - --- parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) --- parse_body xml = do --- let element_filter = xml >>> css "div.ArticleText" --- let element_html_filter = xshow element_filter --- element_html <- runX element_html_filter --- return $ case element_html of --- [x] -> Just x --- [] -> Nothing --- _ -> error "Found more than one article." - - --- parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article]) --- parse_articles xml = do --- parsed_headline <- parse_headline xml --- parsed_byline <- parse_byline xml --- parsed_body <- parse_body xml --- let headline' = fromJust parsed_headline --- let byline' = fromJust parsed_byline --- let body' = fromJust parsed_body --- return $ Just $ [Article headline' byline' body'] +parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_headline xml = do + let element_filter = xml >>> css "div.PageHeadline h1" + let element_text_filter = element_filter /> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one headline." + +parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_article_byline xml = do + let element_filter = xml >>> css "div.FeatureByLine" + let element_text_filter = element_filter /> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one article byline." + + +parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_article_title xml = do + let element_filter = xml >>> css "h2.SummaryHL a" + let element_text_filter = element_filter /> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one article title." + + +--is_title :: Integer +--is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL")) + +--is_byline :: Integer +--is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine")) + +parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_article_body xml = do + -- First, delete the article title and byline. + --let clean_xml = xml >>> processTopDown ((none) `when` is_title) >>> processTopDown ((none) `when` is_byline) + let clean_xml = xml + clean_html <- runX $ xshow clean_xml + return $ case clean_html of + [x] -> Just x + [] -> Nothing + _ -> error "Found more than one article body." + +parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article) +parse_article xml = do + parsed_article_title <- parse_article_title xml + parsed_article_byline <- parse_article_byline xml + parsed_article_body <- parse_article_body xml +-- let title' = fromJust parsed_article_title + let title' = "title" +-- let byline' = fromJust parsed_article_byline + let byline' = "byline" +-- let body' = fromJust parsed_article_body + body' <- runX . xshow $ xml + return $ Just $ Article title' byline' (body' !! 0) + +parse_html_article :: String -> IO (Maybe Article) +parse_html_article html = do + let xml = parseHtml html + parse_article xml + + +-- | In the full page, all of the article titles and bodies are +-- wrapped in a div.ArticleText. +parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) +parse_bodies xml = + xml >>> css "div.ArticleText" + + + +parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article] +parse_articles xml = do + bodies <- runX . xshow $ (parse_bodies xml) >>> indentDoc + let article_separator = "

" + let split_articles'' = split article_separator (bodies !! 0) + -- The first element will contain the crap before the first . + let split_articles' = tail split_articles'' + -- Put the separator back, it was lost during the split. + let split_articles = map (("\n" ++ article_separator) ++) split_articles' + putStrLn "split articles\n\n" + mapM putStrLn split_articles + real_articles <- mapM parse_html_article split_articles + let just_articles = catMaybes real_articles + return just_articles + diff --git a/src/Main.hs b/src/Main.hs index a810bc9..37f8967 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -7,11 +7,19 @@ import Text.HandsomeSoup (parseHtml) import Epublishable import LWN.ArticlePage +import LWN.FullPage +import XHTML main :: IO () main = do article_html <- readFile "test/fixtures/501317-article.html" ioap <- parse $ parseHtml article_html let article_page :: ArticlePage = fromJust $ ioap - epublish article_page "out.epub" + epublish article_page "single_article.epub" + + page_html <- readFile "test/fixtures/500848-page.html" + ioap_f <- parse $ parseHtml page_html + let full_page :: FullPage = fromJust $ ioap_f + --putStrLn $ to_xhtml full_page + --epublish full_page "full_page.epub" putStrLn "Done."

" ++ hl ++ "

" ++ bl ++ "

" ++ t ++ "

" ++ (headline fp) ++ "