X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FFullPage.hs;h=1ba7910cf329d414a6b8e6d6ab10dbceee239050;hp=ac8886279547e926de7e91fd12a84a33af9a3bc0;hb=6103dbc5f8d3689e32001c3fd7627f3153e40bb0;hpb=5cb0170a5ab418147e3403fb141797f2282b78f4 diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs index ac88862..1ba7910 100644 --- a/src/LWN/FullPage.hs +++ b/src/LWN/FullPage.hs @@ -5,11 +5,14 @@ import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust) import Data.Tree.NTree.TypeDefs (NTree) import Text.XML.HXT.Core ( + ArrowXml, IOSArrow, XmlTree, XNode, (>>>), (/>), + (//>), + getChildren, getText, hasAttrValue, hasName, @@ -19,7 +22,6 @@ import Text.XML.HXT.Core ( xshow, when ) -import Text.XML.HXT.Arrow.Edit (indentDoc) import Text.HandsomeSoup (css, parseHtml) import Epublishable @@ -40,17 +42,17 @@ instance XHTML FullPage where "" ++ "" ++ - "\n\n" ++ + "" ++ " " ++ - " " ++ (headline fp) ++ "\n" ++ - "\n" ++ - "\n" ++ - "

\n\n" ++ + " " ++ (headline fp) ++ "" ++ + "" ++ + "" ++ + "

" ++ "

" ++ (headline fp) ++ "

" ++ (articles_xhtml fp) ++ - "\n\n

\n" ++ - "\n\n" ++ + "

" ++ + "" ++ "" instance Epublishable FullPage where @@ -61,7 +63,7 @@ instance Epublishable FullPage where them@(_:_) -> return $ Just $ FullPage (fromJust hl) them _ -> return Nothing - title _ = "LWN.net" + title = headline parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) @@ -87,8 +89,8 @@ parse_article_byline xml = do parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_title xml = do - let element_filter = xml >>> css "h2.SummaryHL a" - let element_text_filter = element_filter /> getText + let element_filter = xml >>> css "h2.SummaryHL" + let element_text_filter = element_filter //> getText element_text <- runX element_text_filter return $ case element_text of [x] -> Just $ strip x @@ -96,18 +98,50 @@ parse_article_title xml = do _ -> error "Found more than one article title." ---is_title :: Integer ---is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL")) ---is_byline :: Integer ---is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine")) +is_title :: (ArrowXml a) => a XmlTree XmlTree +is_title = + (hasName "h2") + >>> + (hasAttrValue "class" (== "SummaryHL")) + + +is_byline :: (ArrowXml a) => a XmlTree XmlTree +is_byline = + (hasName "div") + >>> + (hasAttrValue "class" (== "FeatureByLine")) + + +is_image :: (ArrowXml a) => a XmlTree XmlTree +is_image = + hasName "img" + + +remove_title :: (ArrowXml a) => a XmlTree XmlTree +remove_title = + processTopDown ((none) `when` is_title) + + +remove_byline :: (ArrowXml a) => a XmlTree XmlTree +remove_byline = + processTopDown ((none) `when` is_byline) + + +remove_images :: (ArrowXml a) => a XmlTree XmlTree +remove_images = + processTopDown ((none) `when` is_image) + + parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) parse_article_body xml = do -- First, delete the article title and byline. - --let clean_xml = xml >>> processTopDown ((none) `when` is_title) >>> processTopDown ((none) `when` is_byline) - let clean_xml = xml - clean_html <- runX $ xshow clean_xml + let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images + -- The only child of the body element should be a div.lwn-article + -- since we wrapped the article's HTML in that. + let clean_xml = clean_xml' >>> css "body" >>> getChildren + clean_html <- runX . xshow $ clean_xml return $ case clean_html of [x] -> Just x [] -> Nothing @@ -118,17 +152,14 @@ parse_article xml = do parsed_article_title <- parse_article_title xml parsed_article_byline <- parse_article_byline xml parsed_article_body <- parse_article_body xml --- let title' = fromJust parsed_article_title - let title' = "title" --- let byline' = fromJust parsed_article_byline - let byline' = "byline" --- let body' = fromJust parsed_article_body - body' <- runX . xshow $ xml - return $ Just $ Article title' byline' (body' !! 0) + let title' = Title $ fromJust parsed_article_title + let byline' = Byline parsed_article_byline + let body' = BodyHtml $ fromJust parsed_article_body + return $ Just $ Article title' byline' body' parse_html_article :: String -> IO (Maybe Article) parse_html_article html = do - let xml = parseHtml html + let xml = parseHtml $ wrap_in_body_div html parse_article xml @@ -139,20 +170,41 @@ parse_bodies xml = xml >>> css "div.ArticleText" +-- Debug, print a string. +print_article :: String -> IO () +print_article s = do + putStrLn "-----------" + putStrLn "- Article -" + putStrLn "-----------" + putStrLn "" + putStrLn s + putStrLn "" + + +-- Debug, print an article's body html. +print_body :: Article -> IO () +print_body x = + print_article bh + where + bh' = body_html x + bh = getBodyHtml bh' + parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article] parse_articles xml = do - bodies <- runX . xshow $ (parse_bodies xml) >>> indentDoc + bodies <- runX . xshow $ parse_bodies xml let article_separator = "

" - let split_articles'' = split article_separator (bodies !! 0) + let split_articles'' = split article_separator (concat bodies) -- The first element will contain the crap before the first . let split_articles' = tail split_articles'' -- Put the separator back, it was lost during the split. - let split_articles = map (("\n" ++ article_separator) ++) split_articles' - putStrLn "split articles\n\n" - mapM putStrLn split_articles + let split_articles = map (article_separator ++) split_articles' + --_ <- mapM print_article split_articles real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles +wrap_in_body_div :: String -> String +wrap_in_body_div s = + "
" ++ s ++ "
"