import Data.Maybe (catMaybes, fromJust)
import Data.Tree.NTree.TypeDefs (NTree)
import Text.XML.HXT.Core (
+ ArrowXml,
IOSArrow,
XmlTree,
XNode,
(>>>),
(/>),
+ (//>),
+ getChildren,
getText,
hasAttrValue,
hasName,
xshow,
when
)
-import Text.XML.HXT.Arrow.Edit (indentDoc)
import Text.HandsomeSoup (css, parseHtml)
import Epublishable
"<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
"<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
"\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
- "\n<head>\n" ++
+ "<head>" ++
" <meta http-equiv=\"Content-Type\"" ++
" content=\"application/xhtml+xml; charset=utf-8\" />" ++
- " <title>" ++ (headline fp) ++ "</title>\n" ++
- "</head>\n" ++
- "<body>\n" ++
- "<div>\n\n" ++
+ " <title>" ++ (headline fp) ++ "</title>" ++
+ "</head>" ++
+ "<body>" ++
+ "<div>" ++
"<h1>" ++ (headline fp) ++ "</h1>" ++
(articles_xhtml fp) ++
- "\n\n</div>\n" ++
- "\n</body>\n" ++
+ "</div>" ++
+ "</body>" ++
"</html>"
instance Epublishable FullPage where
them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
_ -> return Nothing
- title _ = "LWN.net"
+ title = headline
parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
parse_article_title xml = do
- let element_filter = xml >>> css "h2.SummaryHL a"
- let element_text_filter = element_filter /> getText
+ let element_filter = xml >>> css "h2.SummaryHL"
+ let element_text_filter = element_filter //> getText
element_text <- runX element_text_filter
return $ case element_text of
[x] -> Just $ strip x
_ -> error "Found more than one article title."
---is_title :: Integer
---is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL"))
---is_byline :: Integer
---is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine"))
+is_title :: (ArrowXml a) => a XmlTree XmlTree
+is_title =
+ (hasName "h2")
+ >>>
+ (hasAttrValue "class" (== "SummaryHL"))
+
+
+is_byline :: (ArrowXml a) => a XmlTree XmlTree
+is_byline =
+ (hasName "div")
+ >>>
+ (hasAttrValue "class" (== "FeatureByLine"))
+
+
+is_image :: (ArrowXml a) => a XmlTree XmlTree
+is_image =
+ hasName "img"
+
+
+remove_title :: (ArrowXml a) => a XmlTree XmlTree
+remove_title =
+ processTopDown ((none) `when` is_title)
+
+
+remove_byline :: (ArrowXml a) => a XmlTree XmlTree
+remove_byline =
+ processTopDown ((none) `when` is_byline)
+
+
+remove_images :: (ArrowXml a) => a XmlTree XmlTree
+remove_images =
+ processTopDown ((none) `when` is_image)
+
+
parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
parse_article_body xml = do
-- First, delete the article title and byline.
- --let clean_xml = xml >>> processTopDown ((none) `when` is_title) >>> processTopDown ((none) `when` is_byline)
- let clean_xml = xml
- clean_html <- runX $ xshow clean_xml
+ let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images
+ -- The only child of the body element should be a div.lwn-article
+ -- since we wrapped the article's HTML in that.
+ let clean_xml = clean_xml' >>> css "body" >>> getChildren
+ clean_html <- runX . xshow $ clean_xml
return $ case clean_html of
[x] -> Just x
[] -> Nothing
parsed_article_title <- parse_article_title xml
parsed_article_byline <- parse_article_byline xml
parsed_article_body <- parse_article_body xml
--- let title' = fromJust parsed_article_title
- let title' = "title"
--- let byline' = fromJust parsed_article_byline
- let byline' = "byline"
--- let body' = fromJust parsed_article_body
- body' <- runX . xshow $ xml
- return $ Just $ Article title' byline' (body' !! 0)
+ let title' = Title $ fromJust parsed_article_title
+ let byline' = Byline parsed_article_byline
+ let body' = BodyHtml $ fromJust parsed_article_body
+ return $ Just $ Article title' byline' body'
parse_html_article :: String -> IO (Maybe Article)
parse_html_article html = do
- let xml = parseHtml html
+ let xml = parseHtml $ wrap_in_body_div html
parse_article xml
xml >>> css "div.ArticleText"
+-- Debug, print a string.
+print_article :: String -> IO ()
+print_article s = do
+ putStrLn "-----------"
+ putStrLn "- Article -"
+ putStrLn "-----------"
+ putStrLn ""
+ putStrLn s
+ putStrLn ""
+
+
+-- Debug, print an article's body html.
+print_body :: Article -> IO ()
+print_body x =
+ print_article bh
+ where
+ bh' = body_html x
+ bh = getBodyHtml bh'
+
parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
parse_articles xml = do
- bodies <- runX . xshow $ (parse_bodies xml) >>> indentDoc
+ bodies <- runX . xshow $ parse_bodies xml
let article_separator = "<h2 class=\"SummaryHL\">"
- let split_articles'' = split article_separator (bodies !! 0)
+ let split_articles'' = split article_separator (concat bodies)
-- The first element will contain the crap before the first <h2...>.
let split_articles' = tail split_articles''
-- Put the separator back, it was lost during the split.
- let split_articles = map (("\n" ++ article_separator) ++) split_articles'
- putStrLn "split articles\n\n"
- mapM putStrLn split_articles
+ let split_articles = map (article_separator ++) split_articles'
+ --_ <- mapM print_article split_articles
real_articles <- mapM parse_html_article split_articles
let just_articles = catMaybes real_articles
return just_articles
+wrap_in_body_div :: String -> String
+wrap_in_body_div s =
+ "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"