From 6103dbc5f8d3689e32001c3fd7627f3153e40bb0 Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Sat, 23 Jun 2012 23:47:24 -0400 Subject: [PATCH] Go through a good bit of nonsense to get it successfully parsing our three test cases. --- src/LWN/Article.hs | 36 ++++++++++--- src/LWN/ArticlePage.hs | 22 ++++---- src/LWN/FullPage.hs | 114 ++++++++++++++++++++++++++++++----------- src/Main.hs | 27 +++++++--- 4 files changed, 143 insertions(+), 56 deletions(-) diff --git a/src/LWN/Article.hs b/src/LWN/Article.hs index 2820d54..70c68a5 100644 --- a/src/LWN/Article.hs +++ b/src/LWN/Article.hs @@ -3,12 +3,36 @@ where import XHTML -data Article = Article { title :: String, - byline :: String, - body_html :: String } +newtype Title = Title { getTitle :: String } +newtype Byline = Byline { getByline :: Maybe String } +newtype BodyHtml = BodyHtml { getBodyHtml :: String } + +instance Show Title where + show = getTitle + +instance Show Byline where + show (Byline (Just bl)) = bl + show (Byline Nothing ) = "" + +instance Show BodyHtml where + show = getBodyHtml + +instance XHTML Title where + to_xhtml (Title t) = "

" ++ t ++ "

" + +instance XHTML Byline where + to_xhtml (Byline (Just bl)) = "

" ++ bl ++ "

" + to_xhtml (Byline Nothing) = "" + +instance XHTML BodyHtml where + to_xhtml = getBodyHtml + +data Article = Article { title :: Title, + byline :: Byline, + body_html :: BodyHtml } instance XHTML Article where to_xhtml (Article t bl b) = - "

" ++ t ++ "

\n\n" ++ - "

" ++ bl ++ "

\n\n" ++ - b + (to_xhtml t) ++ + (to_xhtml bl) ++ + (to_xhtml b) diff --git a/src/LWN/ArticlePage.hs b/src/LWN/ArticlePage.hs index 927e241..5964013 100644 --- a/src/LWN/ArticlePage.hs +++ b/src/LWN/ArticlePage.hs @@ -29,16 +29,16 @@ instance XHTML ArticlePage where "" ++ "" ++ - "\n\n" ++ + "" ++ " " ++ - " " ++ (LWN.Article.title a) ++ "\n" ++ - "\n" ++ - "\n" ++ - "
\n\n" ++ + " " ++ (show $ LWN.Article.title a) ++ "" ++ + "" ++ + "" ++ + "
" ++ (to_xhtml a) ++ - "\n\n
\n" ++ - "\n\n" ++ + "
" ++ + "" ++ "" @@ -49,7 +49,7 @@ instance Epublishable ArticlePage where Just [x] -> return $ Just $ ArticlePage x _ -> return Nothing - title (ArticlePage x) = LWN.Article.title x + title (ArticlePage x) = show $ LWN.Article.title x -- | Takes data from an LWN page and determines whether or not it's a @@ -95,7 +95,7 @@ parse_articles xml = do parsed_headline <- parse_headline xml parsed_byline <- parse_byline xml parsed_body <- parse_body xml - let title' = fromJust parsed_headline - let byline' = fromJust parsed_byline - let body' = fromJust parsed_body + let title' = Title (fromJust parsed_headline) + let byline' = Byline parsed_byline + let body' = BodyHtml (fromJust parsed_body) return $ Just $ [Article title' byline' body'] diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs index ac88862..1ba7910 100644 --- a/src/LWN/FullPage.hs +++ b/src/LWN/FullPage.hs @@ -5,11 +5,14 @@ import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust) import Data.Tree.NTree.TypeDefs (NTree) import Text.XML.HXT.Core ( + ArrowXml, IOSArrow, XmlTree, XNode, (>>>), (/>), + (//>), + getChildren, getText, hasAttrValue, hasName, @@ -19,7 +22,6 @@ import Text.XML.HXT.Core ( xshow, when ) -import Text.XML.HXT.Arrow.Edit (indentDoc) import Text.HandsomeSoup (css, parseHtml) import Epublishable @@ -40,17 +42,17 @@ instance XHTML FullPage where "" ++ "" ++ - "\n\n" ++ + "" ++ " " ++ - " " ++ (headline fp) ++ "\n" ++ - "\n" ++ - "\n" ++ - "
\n\n" ++ + " " ++ (headline fp) ++ "" ++ + "" ++ + "" ++ + "
" ++ "

" ++ (headline fp) ++ "

" ++ (articles_xhtml fp) ++ - "\n\n
\n" ++ - "\n\n" ++ + "
" ++ + "" ++ "" instance Epublishable FullPage where @@ -61,7 +63,7 @@ instance Epublishable FullPage where them@(_:_) -> return $ Just $ FullPage (fromJust hl) them _ -> return Nothing - title _ = "LWN.net" + title = headline parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) @@ -87,8 +89,8 @@ parse_article_byline xml = do parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_title xml = do - let element_filter = xml >>> css "h2.SummaryHL a" - let element_text_filter = element_filter /> getText + let element_filter = xml >>> css "h2.SummaryHL" + let element_text_filter = element_filter //> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x @@ -96,18 +98,50 @@ parse_article_title xml = do _ -> error "Found more than one article title." ---is_title :: Integer ---is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL")) ---is_byline :: Integer ---is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine")) +is_title :: (ArrowXml a) => a XmlTree XmlTree +is_title = + (hasName "h2") + >>> + (hasAttrValue "class" (== "SummaryHL")) + + +is_byline :: (ArrowXml a) => a XmlTree XmlTree +is_byline = + (hasName "div") + >>> + (hasAttrValue "class" (== "FeatureByLine")) + + +is_image :: (ArrowXml a) => a XmlTree XmlTree +is_image = + hasName "img" + + +remove_title :: (ArrowXml a) => a XmlTree XmlTree +remove_title = + processTopDown ((none) `when` is_title) + + +remove_byline :: (ArrowXml a) => a XmlTree XmlTree +remove_byline = + processTopDown ((none) `when` is_byline) + + +remove_images :: (ArrowXml a) => a XmlTree XmlTree +remove_images = + processTopDown ((none) `when` is_image) + + parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_body xml = do -- First, delete the article title and byline. - --let clean_xml = xml >>> processTopDown ((none) `when` is_title) >>> processTopDown ((none) `when` is_byline) - let clean_xml = xml - clean_html <- runX $ xshow clean_xml + let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images + -- The only child of the body element should be a div.lwn-article + -- since we wrapped the article's HTML in that. + let clean_xml = clean_xml' >>> css "body" >>> getChildren + clean_html <- runX . xshow $ clean_xml return $ case clean_html of [x] -> Just x [] -> Nothing @@ -118,17 +152,14 @@ parse_article xml = do parsed_article_title <- parse_article_title xml parsed_article_byline <- parse_article_byline xml parsed_article_body <- parse_article_body xml --- let title' = fromJust parsed_article_title - let title' = "title" --- let byline' = fromJust parsed_article_byline - let byline' = "byline" --- let body' = fromJust parsed_article_body - body' <- runX . xshow $ xml - return $ Just $ Article title' byline' (body' !! 0) + let title' = Title $ fromJust parsed_article_title + let byline' = Byline parsed_article_byline + let body' = BodyHtml $ fromJust parsed_article_body + return $ Just $ Article title' byline' body' parse_html_article :: String -> IO (Maybe Article) parse_html_article html = do - let xml = parseHtml html + let xml = parseHtml $ wrap_in_body_div html parse_article xml @@ -139,20 +170,41 @@ parse_bodies xml = xml >>> css "div.ArticleText" +-- Debug, print a string. +print_article :: String -> IO () +print_article s = do + putStrLn "-----------" + putStrLn "- Article -" + putStrLn "-----------" + putStrLn "" + putStrLn s + putStrLn "" + + +-- Debug, print an article's body html. +print_body :: Article -> IO () +print_body x = + print_article bh + where + bh' = body_html x + bh = getBodyHtml bh' + parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article] parse_articles xml = do - bodies <- runX . xshow $ (parse_bodies xml) >>> indentDoc + bodies <- runX . xshow $ parse_bodies xml let article_separator = "

" - let split_articles'' = split article_separator (bodies !! 0) + let split_articles'' = split article_separator (concat bodies) -- The first element will contain the crap before the first . let split_articles' = tail split_articles'' -- Put the separator back, it was lost during the split. - let split_articles = map (("\n" ++ article_separator) ++) split_articles' - putStrLn "split articles\n\n" - mapM putStrLn split_articles + let split_articles = map (article_separator ++) split_articles' + --_ <- mapM print_article split_articles real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles +wrap_in_body_div :: String -> String +wrap_in_body_div s = + "
" ++ s ++ "
" diff --git a/src/Main.hs b/src/Main.hs index 37f8967..c6ab443 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -3,23 +3,34 @@ module Main where import Data.Maybe (fromJust) -import Text.HandsomeSoup (parseHtml) +import Text.XML.HXT.Core -- (SysConfigList, IOStateArrow, XmlTree, readDocument) import Epublishable import LWN.ArticlePage import LWN.FullPage -import XHTML + +my_read :: String -> IOStateArrow s b XmlTree +my_read = + readDocument [ withValidate no, + withParseHTML yes, + withInputEncoding utf8, + withWarnings no ] main :: IO () main = do - article_html <- readFile "test/fixtures/501317-article.html" - ioap <- parse $ parseHtml article_html + let article_html = my_read "test/fixtures/501317-article.html" + ioap <- parse article_html let article_page :: ArticlePage = fromJust $ ioap epublish article_page "single_article.epub" - page_html <- readFile "test/fixtures/500848-page.html" - ioap_f <- parse $ parseHtml page_html + let page_html = my_read "test/fixtures/500848-page.html" + ioap_f <- parse page_html let full_page :: FullPage = fromJust $ ioap_f - --putStrLn $ to_xhtml full_page - --epublish full_page "full_page.epub" + epublish full_page "full_page.epub" + + let bigpage_html = my_read "test/fixtures/50844-bigpage.html" + ioap_bp <- parse bigpage_html + let bigpage :: FullPage = fromJust $ ioap_bp + epublish bigpage "bigpage.epub" + putStrLn "Done." -- 2.43.2