1 {-# LANGUAGE DoAndIfThenElse #-}
6 import qualified Data.Map as Map
7 import Data.Time (getCurrentTime)
8 import System.IO (Handle, hClose, hFlush)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import Data.Maybe (catMaybes, fromJust, isNothing)
12 import Data.Tree.NTree.TypeDefs (NTree)
13 import Test.HUnit (Assertion, assertEqual)
14 import Test.Framework (Test, testGroup)
15 import Test.Framework.Providers.HUnit (testCase)
17 import Text.XML.HXT.Core (
41 import Text.HandsomeSoup (css, parseHtml)
44 import LWN.HTTP (save_image)
45 import LWN.URI (URL, try_make_absolute_url)
46 import Misc (contains)
49 -- Map absolute image URLs to local system file paths where the image
50 -- referenced by the URL is stored.
51 type ImageMap = Map.Map URL FilePath
53 -- Should be called *after* preprocessing.
54 download_images :: IOSArrow XmlTree (NTree XNode) -> IO ImageMap
55 download_images xml = do
56 image_urls <- runX $ xml >>> image_srcs
57 files <- mapM save_image image_urls
58 let pairs = zip image_urls files
59 return $ foldl my_insert empty_map pairs
61 empty_map = Map.empty :: ImageMap
63 my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap
64 my_insert dict (_, Nothing) = dict
65 my_insert dict (k, Just v) = Map.insert k v dict
69 -- | An LWN page with one article on it.
70 ArticlePage { article :: Article } |
72 -- | An LWN page with more than one article on it. These require
73 -- different parsing and display functions than the single-article
75 FullPage { headline :: String,
76 articles :: [Article] }
79 instance XHTML Page where
80 to_xhtml (ArticlePage a) =
81 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
82 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
83 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
85 " <meta http-equiv=\"Content-Type\"" ++
86 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
87 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
96 to_xhtml (FullPage hl as) =
97 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
98 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
99 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
101 " <meta http-equiv=\"Content-Type\"" ++
102 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
103 " <title>" ++ hl ++ "</title>" ++
107 "<h1>" ++ hl ++ "</h1>" ++
108 (concatMap to_xhtml as) ++
115 is_link :: (ArrowXml a) => a XmlTree XmlTree
117 isElem >>> hasName "a"
120 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
121 remove_comment_links =
122 processTopDown $ kill_comments `when` is_link
125 hasAttrValue "href" (contains "#Comments")
128 none `when` is_comment_link
130 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
131 replace_links_with_spans =
132 processTopDown $ (make_span >>> remove_attrs) `when` is_link
134 make_span = setElemName $ mkName "span"
135 remove_attrs = processAttrl none
138 -- | Preprocessing common to both page types.
139 preprocess :: (ArrowXml a) => a XmlTree XmlTree
141 make_image_srcs_absolute
145 replace_links_with_spans
148 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
149 replace_remote_img_srcs image_map =
150 processTopDown (make_srcs_local `when` is_image)
152 -- old_src -> new_src
153 change_src_func :: String -> String
154 change_src_func old_src =
155 case Map.lookup old_src image_map of
156 -- Leave it alone if we don't have the file locally
160 change_src :: (ArrowXml a) => a XmlTree XmlTree
162 changeAttrValue change_src_func
164 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
166 processAttrl $ (change_src `when` (hasName "src"))
169 parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
171 let clean_xml = xml >>> preprocess
172 image_map <- download_images clean_xml
173 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
174 appr <- ap_parse local_xml
175 fppr <- fp_parse local_xml
177 if (isNothing appr) then
184 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
185 parse_headline xml = do
186 let element_filter = xml >>> css "div.PageHeadline h1"
187 let element_text_filter = element_filter /> getText
188 element_text <- runX element_text_filter
191 [x] -> Just $ strip x
193 _ -> error "Found more than one headline."
196 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
197 parse_byline xml = do
198 let element_filter = xml >>> css "div.FeatureByLine"
199 let element_text_filter = element_filter /> getText
200 element_text <- runX element_text_filter
203 [x] -> Just $ strip x
205 _ -> error "Found more than one article byline."
211 ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
213 arts <- ap_parse_articles xml
215 [x] -> return $ Just $ ArticlePage x
219 ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
220 ap_parse_body xml = do
221 let element_filter = xml >>> css "div.ArticleText"
222 let element_html_filter = xshow element_filter
223 element_html <- runX element_html_filter
224 return $ case element_html of
227 _ -> error "Found more than one article."
230 ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
231 ap_parse_articles xml = do
232 parsed_headline <- parse_headline xml
233 parsed_byline <- parse_byline xml
234 parsed_body <- ap_parse_body xml
236 putStrLn $ fromJust parsed_headline
238 if (isNothing parsed_headline) || (isNothing parsed_body)
241 let title' = Title $ fromJust parsed_headline
242 let byline' = Byline parsed_byline
243 let body' = BodyHtml $ fromJust parsed_body
245 return $ [Article title' byline' body']
253 fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
255 hl <- parse_headline xml
256 parsed_articles <- fp_parse_articles xml
257 case parsed_articles of
259 x -> return $ Just $ FullPage (fromJust hl) x
263 fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
264 fp_parse_article_title xml = do
265 let element_filter = xml >>> css "h2.SummaryHL"
266 let element_text_filter = element_filter //> getText
267 element_text <- runX element_text_filter
268 return $ case element_text of
269 [x] -> Just $ strip x
271 _ -> error "Found more than one article title."
275 is_title :: (ArrowXml a) => a XmlTree XmlTree
279 (hasAttrValue "class" (== "SummaryHL"))
282 is_byline :: (ArrowXml a) => a XmlTree XmlTree
286 (hasAttrValue "class" (== "FeatureByLine"))
289 is_image :: (ArrowXml a) => a XmlTree XmlTree
290 is_image = isElem >>> hasName "img"
292 remove_title :: (ArrowXml a) => a XmlTree XmlTree
294 processTopDown ((none) `when` is_title)
297 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
299 processTopDown ((none) `when` is_byline)
303 fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
304 fp_parse_article_body xml = do
305 -- First, delete the article title and byline.
306 let clean_xml' = xml >>> remove_title >>> remove_byline
307 -- The only child of the body element should be a div.lwn-article
308 -- since we wrapped the article's HTML in that.
309 let clean_xml = clean_xml' >>> css "body" >>> getChildren
310 clean_html <- runX . xshow $ clean_xml
311 return $ case clean_html of
314 _ -> error "Found more than one article body."
316 fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
317 fp_parse_article xml = do
318 parsed_article_title <- fp_parse_article_title xml
319 parsed_article_byline <- parse_byline xml
320 parsed_article_body <- fp_parse_article_body xml
322 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
326 let title' = Title $ fromJust parsed_article_title
327 let byline' = Byline parsed_article_byline
328 let body' = BodyHtml $ fromJust parsed_article_body
329 return $ Just $ Article title' byline' body'
331 parse_html_article :: String -> IO (Maybe Article)
332 parse_html_article html = do
333 let xml = parseHtml $ wrap_in_body_div html
337 -- | In the full page, all of the article titles and bodies are
338 -- wrapped in one big div.ArticleText.
339 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
341 xml >>> css "div.ArticleText"
344 fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
345 fp_parse_articles xml = do
346 bodies <- runX . xshow $ parse_bodies xml
347 let article_separator = "<h2 class=\"SummaryHL\">"
348 let split_articles'' = split article_separator (concat bodies)
349 -- The first element will contain the crap before the first <h2...>.
350 let split_articles' = case split_articles'' of
351 (_:_) -> tail split_articles''
353 -- Put the separator back, it was lost during the split.
354 let split_articles = map (article_separator ++) split_articles'
355 --_ <- mapM print_article split_articles
356 real_articles <- mapM parse_html_article split_articles
357 let just_articles = catMaybes real_articles
361 -- | This makes it easy to select otherwise-random chunks of html
363 wrap_in_body_div :: String -> String
365 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
371 -- Epublishable stuff
374 title :: Page -> String
375 title (ArticlePage a) = getTitle $ LWN.Article.title a
376 title (FullPage hl _) = hl
379 metadata :: Page -> IO String
381 date <- getCurrentTime
383 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
384 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
385 "<dc:language>en-US</dc:language>\n" ++
386 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
387 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
390 epublish :: Page -> Handle -> IO ()
391 epublish obj handle = do
392 let xhtml = to_xhtml obj
394 epub <- xhtml_to_epub epmd xhtml
399 xhtml_to_epub :: String -> String -> IO B.ByteString
401 write_epub . read_html
403 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
404 write_epub = writeEPUB Nothing [] my_writer_options
405 read_html = readHtml defaultParserState
413 image_srcs :: (ArrowXml a) => a XmlTree URL
419 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
420 make_image_srcs_absolute =
421 processTopDown (make_srcs_absolute `when` is_image)
423 change_src :: (ArrowXml a) => a XmlTree XmlTree
425 changeAttrValue try_make_absolute_url
427 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
429 processAttrl $ change_src `when` hasName "src"
436 test_preprocess_links :: Assertion
437 test_preprocess_links = do
438 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
439 let actual_xml = actual_xml' !! 0
441 expected_xml' <- runX $ expected_xml'' >>> css "body"
442 let expected_xml = expected_xml' !! 0
445 "Links are replaced with spans"
449 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
450 input_xml = parseHtml input_html
451 expected_html = "<body><span>Hello, world!</span></body>"
452 expected_xml'' = parseHtml expected_html
455 test_absolve_images :: Assertion
456 test_absolve_images = do
457 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
458 let actual_xml = actual_xml' !! 0
460 expected_xml' <- runX $ expected_xml'' >>> css "body"
461 let expected_xml = expected_xml' !! 0
464 "Image srcs are made absolute"
470 "<img src=\"/images/2012/example.jpg\" />" ++
472 input_xml = parseHtml input_html
475 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
477 expected_xml'' = parseHtml expected_html
480 test_comments_removed :: Assertion
481 test_comments_removed = do
482 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
483 let actual_xml = actual_xml' !! 0
485 expected_xml' <- runX $ expected_xml'' >>> css "body"
486 let expected_xml = expected_xml' !! 0
489 "Comment links are removed"
495 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
497 input_xml = parseHtml input_html
499 expected_html = "<body><p></p></body>"
500 expected_xml'' = parseHtml expected_html
506 testGroup "Page Tests" [
507 testCase "Links are replaced with spans" test_preprocess_links,
508 testCase "Image srcs are made absolute" test_absolve_images,
509 testCase "Comment links are removed" test_comments_removed ]