5 import Data.Time (getCurrentTime)
6 import System.IO (Handle)
7 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
8 import Data.String.Utils (split, strip)
9 import Data.Maybe (catMaybes, fromJust, isNothing)
10 import Data.Tree.NTree.TypeDefs (NTree)
11 import Test.HUnit (Assertion, assertEqual)
12 import Test.Framework (Test, testGroup)
13 import Test.Framework.Providers.HUnit (testCase)
14 import Text.XML.HXT.Core (
35 import Text.HandsomeSoup (css, parseHtml)
41 -- | An LWN page with one article on it.
42 ArticlePage { article :: Article } |
44 -- | An LWN page with more than one article on it. These require
45 -- different parsing and display functions than the single-article
47 FullPage { headline :: String,
48 articles :: [Article] }
51 instance XHTML Page where
52 to_xhtml (ArticlePage a) =
53 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
54 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
55 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
57 " <meta http-equiv=\"Content-Type\"" ++
58 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
59 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
68 to_xhtml (FullPage hl as) =
69 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
70 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
71 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
73 " <meta http-equiv=\"Content-Type\"" ++
74 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
75 " <title>" ++ hl ++ "</title>" ++
79 "<h1>" ++ hl ++ "</h1>" ++
80 (concatMap to_xhtml as) ++
87 remove_images :: (ArrowXml a) => a XmlTree XmlTree
89 processTopDown ((none) `when` is_image)
92 is_link :: (ArrowXml a) => a XmlTree XmlTree
96 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
97 replace_links_with_spans =
98 processTopDown $ (make_span >>> remove_attrs) `when` is_link
100 make_span = setElemName $ mkName "span"
101 remove_attrs = processAttrl none
103 -- | Preprocessing common to both page types.
104 preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
106 xml >>> remove_images >>> replace_links_with_spans
109 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
111 let clean_xml = preprocess xml
112 appr <- ap_parse clean_xml
113 fppr <- fp_parse clean_xml
115 if (isNothing appr) then
123 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
125 arts <- ap_parse_articles xml
127 Just [x] -> return $ Just $ ArticlePage x
131 ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
132 ap_parse_headline xml = do
133 let element_filter = xml >>> css "div.PageHeadline h1"
134 let element_text_filter = element_filter /> getText
135 element_text <- runX element_text_filter
136 return $ case element_text of
137 [x] -> Just $ strip x
139 _ -> error "Found more than one headline."
141 ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
142 ap_parse_byline xml = do
143 let element_filter = xml >>> css "div.Byline"
144 let element_text_filter = element_filter /> getText
145 element_text <- runX element_text_filter
146 return $ case element_text of
147 [x] -> Just $ strip x
149 _ -> error "Found more than one byline."
152 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
153 ap_parse_body xml = do
154 let element_filter = xml >>> css "div.ArticleText"
155 let element_html_filter = xshow element_filter
156 element_html <- runX element_html_filter
157 return $ case element_html of
160 _ -> error "Found more than one article."
163 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
164 ap_parse_articles xml = do
165 parsed_headline <- ap_parse_headline xml
166 parsed_byline <- ap_parse_byline xml
167 parsed_body <- ap_parse_body xml
168 let title' = Title (fromJust parsed_headline)
169 let byline' = Byline parsed_byline
170 let body' = BodyHtml (fromJust parsed_body)
171 return $ Just $ [Article title' byline' body']
180 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
182 hl <- fp_parse_headline xml
183 parsed_articles <- fp_parse_articles xml
184 case parsed_articles of
185 them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
191 fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
192 fp_parse_headline xml = do
193 let element_filter = xml >>> css "div.PageHeadline h1"
194 let element_text_filter = element_filter /> getText
195 element_text <- runX element_text_filter
196 return $ case element_text of
197 [x] -> Just $ strip x
199 _ -> error "Found more than one headline."
201 fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
202 fp_parse_article_byline xml = do
203 let element_filter = xml >>> css "div.FeatureByLine"
204 let element_text_filter = element_filter /> getText
205 element_text <- runX element_text_filter
206 return $ case element_text of
207 [x] -> Just $ strip x
209 _ -> error "Found more than one article byline."
212 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
213 fp_parse_article_title xml = do
214 let element_filter = xml >>> css "h2.SummaryHL"
215 let element_text_filter = element_filter //> getText
216 element_text <- runX element_text_filter
217 return $ case element_text of
218 [x] -> Just $ strip x
220 _ -> error "Found more than one article title."
224 is_title :: (ArrowXml a) => a XmlTree XmlTree
228 (hasAttrValue "class" (== "SummaryHL"))
231 is_byline :: (ArrowXml a) => a XmlTree XmlTree
235 (hasAttrValue "class" (== "FeatureByLine"))
238 is_image :: (ArrowXml a) => a XmlTree XmlTree
242 remove_title :: (ArrowXml a) => a XmlTree XmlTree
244 processTopDown ((none) `when` is_title)
247 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
249 processTopDown ((none) `when` is_byline)
253 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
254 fp_parse_article_body xml = do
255 -- First, delete the article title and byline.
256 let clean_xml' = xml >>> remove_title >>> remove_byline
257 -- The only child of the body element should be a div.lwn-article
258 -- since we wrapped the article's HTML in that.
259 let clean_xml = clean_xml' >>> css "body" >>> getChildren
260 clean_html <- runX . xshow $ clean_xml
261 return $ case clean_html of
264 _ -> error "Found more than one article body."
266 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
267 fp_parse_article xml = do
268 parsed_article_title <- fp_parse_article_title xml
269 parsed_article_byline <- fp_parse_article_byline xml
270 parsed_article_body <- fp_parse_article_body xml
271 let title' = Title $ fromJust parsed_article_title
272 let byline' = Byline parsed_article_byline
273 let body' = BodyHtml $ fromJust parsed_article_body
274 return $ Just $ Article title' byline' body'
276 parse_html_article :: String -> IO (Maybe Article)
277 parse_html_article html = do
278 let xml = parseHtml $ wrap_in_body_div html
282 -- | In the full page, all of the article titles and bodies are
283 -- wrapped in a div.ArticleText.
284 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
286 xml >>> css "div.ArticleText"
289 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
290 fp_parse_articles xml = do
291 bodies <- runX . xshow $ parse_bodies xml
292 let article_separator = "<h2 class=\"SummaryHL\">"
293 let split_articles'' = split article_separator (concat bodies)
294 -- The first element will contain the crap before the first <h2...>.
295 let split_articles' = tail split_articles''
296 -- Put the separator back, it was lost during the split.
297 let split_articles = map (article_separator ++) split_articles'
298 --_ <- mapM print_article split_articles
299 real_articles <- mapM parse_html_article split_articles
300 let just_articles = catMaybes real_articles
304 wrap_in_body_div :: String -> String
306 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
312 -- Epublishable stuff
315 title :: Page -> String
316 title (ArticlePage a) = getTitle $ LWN.Article.title a
317 title (FullPage hl _) = hl
320 metadata :: Page -> IO String
322 date <- getCurrentTime
324 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
325 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
326 "<dc:language>en-US</dc:language>\n" ++
327 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
328 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
331 epublish :: Page -> Handle -> IO ()
332 epublish obj handle = do
333 let xhtml = to_xhtml obj
335 epub <- xhtml_to_epub epmd xhtml
339 xhtml_to_epub :: String -> String -> IO B.ByteString
341 write_epub . read_html
343 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
344 write_epub = writeEPUB Nothing [] my_writer_options
345 read_html = readHtml defaultParserState
350 test_preprocess_links :: Assertion
351 test_preprocess_links = do
352 actual_xml' <- runX $ (preprocess input_xml) >>> css "body"
353 let actual_xml = actual_xml' !! 0
355 expected_xml' <- runX $ expected_xml'' >>> css "body"
356 let expected_xml = expected_xml' !! 0
359 "Links are replaced with spans"
363 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
364 input_xml = parseHtml input_html
365 expected_html = "<body><span>Hello, world!</span></body>"
366 expected_xml'' = parseHtml expected_html
370 testGroup "Page Tests" [
371 testCase "Links are replaced with spans" test_preprocess_links ]