1 {-# LANGUAGE DoAndIfThenElse #-}
6 import qualified Data.Map as Map (lookup)
7 import Data.Time (getCurrentTime)
8 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
9 import Data.String.Utils (split, strip)
10 import Data.Maybe (catMaybes, fromJust, isNothing)
11 import Prelude hiding (readFile)
12 import System.Directory (doesFileExist)
13 import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr)
14 import System.IO.UTF8 (readFile)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
24 import Text.XML.HXT.Core (
47 import Text.HandsomeSoup (css, parseHtml)
49 import Configuration (Cfg, password, use_account, username)
57 import LWN.URI (URL, try_make_absolute_url)
58 import LWN.XHTML (XHTML, parse_lwn, to_xhtml)
59 import Misc (contains)
62 -- | Try to parse the given article using HXT. We try a few different
63 -- methods; if none of them work, we return 'Nothing'.
64 get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree))
65 get_xml_from_article cfg article_name = do
66 my_article <- real_article_path article_name
67 is_file <- doesFileExist my_article
70 contents <- readFile my_article
71 return $ Just $ parse_lwn contents
73 -- Download the URL and try to parse it.
74 if use_account cfg then do
75 -- use_account would be false if these fromJusts would fail.
77 li_result <- log_in cj
78 (fromJust $ username cfg)
79 (fromJust $ password cfg)
83 let msg = "Failed to log in. " ++ err
85 Right response_body -> do
86 hPutStrLn stderr response_body
88 html <- get_page (Just cj) my_article
92 let msg = "Failed to retrieve page. " ++ err
95 Right h -> return $ Just $ parse_lwn h
97 html <- get_page Nothing my_article
100 let msg = "Failed to retrieve page. " ++ err
103 Right h -> return $ Just $ parse_lwn h
107 -- Should be called *after* preprocessing.
108 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
109 download_images xml = do
110 image_urls <- runX $ xml >>> image_srcs
111 download_image_urls image_urls
115 -- | An LWN page with one article on it.
116 ArticlePage { article :: Article } |
118 -- | An LWN page with more than one article on it. These require
119 -- different parsing and display functions than the single-article
121 FullPage { headline :: String,
122 articles :: [Article] }
125 instance XHTML Page where
126 to_xhtml (ArticlePage a) =
127 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
128 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
129 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
131 " <meta http-equiv=\"Content-Type\"" ++
132 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
133 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
140 to_xhtml (FullPage hl as) =
141 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
142 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
143 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
145 " <meta http-equiv=\"Content-Type\"" ++
146 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
147 " <title>" ++ hl ++ "</title>" ++
151 "<h1>" ++ hl ++ "</h1>" ++
152 (concatMap to_xhtml as) ++
159 page_from_url :: Cfg -> URL -> IO (Maybe Page)
160 page_from_url cfg url = do
161 maybe_html <- get_xml_from_article cfg url
163 Just html -> parse html
164 Nothing -> return Nothing
167 is_link :: (ArrowXml a) => a XmlTree XmlTree
169 isElem >>> hasName "a"
172 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
173 remove_comment_links =
174 processTopDown $ kill_comments `when` is_link
177 hasAttrValue "href" (contains "#Comments")
180 none `when` is_comment_link
182 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
183 replace_links_with_spans =
184 processTopDown $ (make_span >>> remove_attrs) `when` is_link
186 make_span = setElemName $ mkName "span"
187 remove_attrs = processAttrl none
190 -- | Preprocessing common to both page types.
191 preprocess :: (ArrowXml a) => a XmlTree XmlTree
193 make_image_srcs_absolute
197 replace_links_with_spans
200 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
201 replace_remote_img_srcs image_map =
202 processTopDown (make_srcs_local `when` is_image)
204 -- old_src -> new_src
205 change_src_func :: String -> String
206 change_src_func old_src =
207 case Map.lookup old_src image_map of
208 -- Leave it alone if we don't have the file locally
212 change_src :: (ArrowXml a) => a XmlTree XmlTree
214 changeAttrValue change_src_func
216 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
218 processAttrl $ (change_src `when` (hasName "src"))
221 parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
223 let clean_xml = xml >>> preprocess
224 image_map <- download_images clean_xml
225 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
226 appr <- ap_parse local_xml
227 fppr <- fp_parse local_xml
229 if (isNothing appr) then
236 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
237 parse_headline xml = do
238 let element_filter = xml >>> css "div.PageHeadline h1"
239 let element_text_filter = element_filter /> getText
240 element_text <- runX element_text_filter
243 [x] -> Just $ strip x
245 _ -> error "Found more than one headline."
248 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
249 parse_byline xml = do
250 let element_filter = xml >>> css "div.FeatureByLine"
251 let element_text_filter = element_filter /> getText
252 element_text <- runX element_text_filter
255 [x] -> Just $ strip x
257 _ -> error "Found more than one article byline."
263 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
265 arts <- ap_parse_articles xml
267 [x] -> return $ Just $ ArticlePage x
271 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
272 ap_parse_body xml = do
273 let element_filter = xml >>> css "div.ArticleText"
274 let element_html_filter = xshow element_filter
275 element_html <- runX element_html_filter
276 return $ case element_html of
279 _ -> error "Found more than one article."
282 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
283 ap_parse_articles xml = do
284 parsed_headline <- parse_headline xml
285 parsed_byline <- parse_byline xml
286 parsed_body <- ap_parse_body xml
288 putStrLn $ fromJust parsed_headline
290 if (isNothing parsed_headline) || (isNothing parsed_body)
293 let title' = Title $ fromJust parsed_headline
294 let byline' = Byline parsed_byline
295 let body' = BodyHtml $ fromJust parsed_body
297 return $ [Article title' byline' body']
305 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
307 hl <- parse_headline xml
308 parsed_articles <- fp_parse_articles xml
309 case parsed_articles of
311 x -> return $ Just $ FullPage (fromJust hl) x
315 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
316 fp_parse_article_title xml = do
317 let element_filter = xml >>> css "h2.SummaryHL"
318 let element_text_filter = element_filter //> getText
319 element_text <- runX element_text_filter
320 return $ case element_text of
321 [x] -> Just $ strip x
323 _ -> error "Found more than one article title."
327 is_title :: (ArrowXml a) => a XmlTree XmlTree
331 (hasAttrValue "class" (== "SummaryHL"))
334 is_byline :: (ArrowXml a) => a XmlTree XmlTree
338 (hasAttrValue "class" (== "FeatureByLine"))
341 is_image :: (ArrowXml a) => a XmlTree XmlTree
342 is_image = isElem >>> hasName "img"
344 remove_title :: (ArrowXml a) => a XmlTree XmlTree
346 processTopDown ((none) `when` is_title)
349 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
351 processTopDown ((none) `when` is_byline)
355 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
356 fp_parse_article_body xml = do
357 -- First, delete the article title and byline.
358 let clean_xml' = xml >>> remove_title >>> remove_byline
359 -- The only child of the body element should be a div.lwn-article
360 -- since we wrapped the article's HTML in that.
361 let clean_xml = clean_xml' >>> css "body" >>> getChildren
362 clean_html <- runX . xshow $ clean_xml
363 return $ case clean_html of
366 _ -> error "Found more than one article body."
368 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
369 fp_parse_article xml = do
370 parsed_article_title <- fp_parse_article_title xml
371 parsed_article_byline <- parse_byline xml
372 parsed_article_body <- fp_parse_article_body xml
374 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
378 let title' = Title $ fromJust parsed_article_title
379 let byline' = Byline parsed_article_byline
380 let body' = BodyHtml $ fromJust parsed_article_body
381 return $ Just $ Article title' byline' body'
383 parse_html_article :: String -> IO (Maybe Article)
384 parse_html_article html = do
385 let xml = parseHtml $ wrap_in_body_div html
389 -- | In the full page, all of the article titles and bodies are
390 -- wrapped in one big div.ArticleText.
391 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
393 xml >>> css "div.ArticleText"
396 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
397 fp_parse_articles xml = do
398 bodies <- runX . xshow $ parse_bodies xml
399 let article_separator = "<h2 class=\"SummaryHL\">"
400 let split_articles'' = split article_separator (concat bodies)
401 -- The first element will contain the crap before the first <h2...>.
402 let split_articles' = case split_articles'' of
403 (_:_) -> tail split_articles''
405 -- Put the separator back, it was lost during the split.
406 let split_articles = map (article_separator ++) split_articles'
407 --_ <- mapM print_article split_articles
408 real_articles <- mapM parse_html_article split_articles
409 let just_articles = catMaybes real_articles
413 -- | This makes it easy to select otherwise-random chunks of html
415 wrap_in_body_div :: String -> String
417 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
423 -- Epublishable stuff
426 title :: Page -> String
427 title (ArticlePage a) = getTitle $ LWN.Article.title a
428 title (FullPage hl _) = hl
431 metadata :: Page -> IO String
433 date <- getCurrentTime
435 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
436 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
437 "<dc:language>en-US</dc:language>\n" ++
438 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
439 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
442 epublish :: Page -> Handle -> IO ()
443 epublish obj handle = do
444 let xhtml = to_xhtml obj
446 epub <- xhtml_to_epub epmd xhtml
451 xhtml_to_epub :: String -> String -> IO B.ByteString
453 write_epub . read_html
455 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
456 write_epub = writeEPUB Nothing [] my_writer_options
457 read_html = readHtml defaultParserState
465 image_srcs :: (ArrowXml a) => a XmlTree URL
471 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
472 make_image_srcs_absolute =
473 processTopDown (make_srcs_absolute `when` is_image)
475 change_src :: (ArrowXml a) => a XmlTree XmlTree
477 changeAttrValue try_make_absolute_url
479 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
481 processAttrl $ change_src `when` hasName "src"
488 test_preprocess_links :: Assertion
489 test_preprocess_links = do
490 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
491 let actual_xml = actual_xml' !! 0
493 expected_xml' <- runX $ expected_xml'' >>> css "body"
494 let expected_xml = expected_xml' !! 0
497 "Links are replaced with spans"
501 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
502 input_xml = parseHtml input_html
503 expected_html = "<body><span>Hello, world!</span></body>"
504 expected_xml'' = parseHtml expected_html
507 test_absolve_images :: Assertion
508 test_absolve_images = do
509 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
510 let actual_xml = actual_xml' !! 0
512 expected_xml' <- runX $ expected_xml'' >>> css "body"
513 let expected_xml = expected_xml' !! 0
516 "Image srcs are made absolute"
522 "<img src=\"/images/2012/example.jpg\" />" ++
524 input_xml = parseHtml input_html
527 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
529 expected_xml'' = parseHtml expected_html
532 test_comments_removed :: Assertion
533 test_comments_removed = do
534 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
535 let actual_xml = actual_xml' !! 0
537 expected_xml' <- runX $ expected_xml'' >>> css "body"
538 let expected_xml = expected_xml' !! 0
541 "Comment links are removed"
547 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
549 input_xml = parseHtml input_html
551 expected_html = "<body><p></p></body>"
552 expected_xml'' = parseHtml expected_html
558 testGroup "Page Tests" [
559 testCase "Links are replaced with spans" test_preprocess_links,
560 testCase "Image srcs are made absolute" test_absolve_images,
561 testCase "Comment links are removed" test_comments_removed ]