1 {-# LANGUAGE DoAndIfThenElse #-}
6 import qualified Data.Map as Map (lookup)
7 import Data.Time (getCurrentTime)
8 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
9 import Data.String.Utils (split, strip)
10 import Data.Maybe (catMaybes, fromJust, isNothing)
11 import Prelude hiding (readFile)
12 import System.Directory (doesFileExist)
13 import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr)
14 import System.IO.UTF8 (readFile)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
24 import Text.XML.HXT.Core (
47 import Text.HandsomeSoup (css, parseHtml)
49 import Configuration (Cfg, password, use_account, username)
57 import LWN.URI (URL, try_make_absolute_url)
58 import LWN.XHTML (XHTML, parse_lwn, to_xhtml)
59 import Misc (contains)
62 -- | Try to parse the given article using HXT. We try a few different
63 -- methods; if none of them work, we return 'Nothing'.
64 get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree))
65 get_xml_from_article cfg article_name = do
66 my_article <- real_article_path article_name
67 is_file <- doesFileExist my_article
70 contents <- readFile my_article
71 return $ Just $ parse_lwn contents
73 -- Download the URL and try to parse it.
74 if use_account cfg then do
75 -- use_account would be false if these fromJusts would fail.
77 li_result <- log_in cj
78 (fromJust $ username cfg)
79 (fromJust $ password cfg)
83 let msg = "Failed to log in. " ++ err
85 Right response_body -> do
86 hPutStrLn stderr response_body
88 html <- get_page (Just cj) my_article
92 let msg = "Failed to retrieve page. " ++ err
95 Right h -> return $ Just $ parse_lwn h
97 html <- get_page Nothing my_article
100 let msg = "Failed to retrieve page. " ++ err
103 Right h -> return $ Just $ parse_lwn h
107 -- Should be called *after* preprocessing.
108 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
109 download_images xml = do
110 image_urls <- runX $ xml >>> image_srcs
111 download_image_urls image_urls
115 -- | An LWN page with one article on it.
116 ArticlePage { article :: Article } |
118 -- | An LWN page with more than one article on it. These require
119 -- different parsing and display functions than the single-article
121 FullPage { headline :: String,
122 articles :: [Article] }
125 instance XHTML Page where
126 to_xhtml (ArticlePage a) =
127 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
128 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
129 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
131 " <meta http-equiv=\"Content-Type\"" ++
132 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
133 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
142 to_xhtml (FullPage hl as) =
143 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
144 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
145 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
147 " <meta http-equiv=\"Content-Type\"" ++
148 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
149 " <title>" ++ hl ++ "</title>" ++
153 "<h1>" ++ hl ++ "</h1>" ++
154 (concatMap to_xhtml as) ++
161 page_from_url :: Cfg -> URL -> IO (Maybe Page)
162 page_from_url cfg url = do
163 maybe_html <- get_xml_from_article cfg url
165 Just html -> parse html
166 Nothing -> return Nothing
169 is_link :: (ArrowXml a) => a XmlTree XmlTree
171 isElem >>> hasName "a"
174 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
175 remove_comment_links =
176 processTopDown $ kill_comments `when` is_link
179 hasAttrValue "href" (contains "#Comments")
182 none `when` is_comment_link
184 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
185 replace_links_with_spans =
186 processTopDown $ (make_span >>> remove_attrs) `when` is_link
188 make_span = setElemName $ mkName "span"
189 remove_attrs = processAttrl none
192 -- | Preprocessing common to both page types.
193 preprocess :: (ArrowXml a) => a XmlTree XmlTree
195 make_image_srcs_absolute
199 replace_links_with_spans
202 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
203 replace_remote_img_srcs image_map =
204 processTopDown (make_srcs_local `when` is_image)
206 -- old_src -> new_src
207 change_src_func :: String -> String
208 change_src_func old_src =
209 case Map.lookup old_src image_map of
210 -- Leave it alone if we don't have the file locally
214 change_src :: (ArrowXml a) => a XmlTree XmlTree
216 changeAttrValue change_src_func
218 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
220 processAttrl $ (change_src `when` (hasName "src"))
223 parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
225 let clean_xml = xml >>> preprocess
226 image_map <- download_images clean_xml
227 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
228 appr <- ap_parse local_xml
229 fppr <- fp_parse local_xml
231 if (isNothing appr) then
238 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
239 parse_headline xml = do
240 let element_filter = xml >>> css "div.PageHeadline h1"
241 let element_text_filter = element_filter /> getText
242 element_text <- runX element_text_filter
245 [x] -> Just $ strip x
247 _ -> error "Found more than one headline."
250 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
251 parse_byline xml = do
252 let element_filter = xml >>> css "div.FeatureByLine"
253 let element_text_filter = element_filter /> getText
254 element_text <- runX element_text_filter
257 [x] -> Just $ strip x
259 _ -> error "Found more than one article byline."
265 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
267 arts <- ap_parse_articles xml
269 [x] -> return $ Just $ ArticlePage x
273 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
274 ap_parse_body xml = do
275 let element_filter = xml >>> css "div.ArticleText"
276 let element_html_filter = xshow element_filter
277 element_html <- runX element_html_filter
278 return $ case element_html of
281 _ -> error "Found more than one article."
284 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
285 ap_parse_articles xml = do
286 parsed_headline <- parse_headline xml
287 parsed_byline <- parse_byline xml
288 parsed_body <- ap_parse_body xml
290 putStrLn $ fromJust parsed_headline
292 if (isNothing parsed_headline) || (isNothing parsed_body)
295 let title' = Title $ fromJust parsed_headline
296 let byline' = Byline parsed_byline
297 let body' = BodyHtml $ fromJust parsed_body
299 return $ [Article title' byline' body']
307 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
309 hl <- parse_headline xml
310 parsed_articles <- fp_parse_articles xml
311 case parsed_articles of
313 x -> return $ Just $ FullPage (fromJust hl) x
317 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
318 fp_parse_article_title xml = do
319 let element_filter = xml >>> css "h2.SummaryHL"
320 let element_text_filter = element_filter //> getText
321 element_text <- runX element_text_filter
322 return $ case element_text of
323 [x] -> Just $ strip x
325 _ -> error "Found more than one article title."
329 is_title :: (ArrowXml a) => a XmlTree XmlTree
333 (hasAttrValue "class" (== "SummaryHL"))
336 is_byline :: (ArrowXml a) => a XmlTree XmlTree
340 (hasAttrValue "class" (== "FeatureByLine"))
343 is_image :: (ArrowXml a) => a XmlTree XmlTree
344 is_image = isElem >>> hasName "img"
346 remove_title :: (ArrowXml a) => a XmlTree XmlTree
348 processTopDown ((none) `when` is_title)
351 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
353 processTopDown ((none) `when` is_byline)
357 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
358 fp_parse_article_body xml = do
359 -- First, delete the article title and byline.
360 let clean_xml' = xml >>> remove_title >>> remove_byline
361 -- The only child of the body element should be a div.lwn-article
362 -- since we wrapped the article's HTML in that.
363 let clean_xml = clean_xml' >>> css "body" >>> getChildren
364 clean_html <- runX . xshow $ clean_xml
365 return $ case clean_html of
368 _ -> error "Found more than one article body."
370 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
371 fp_parse_article xml = do
372 parsed_article_title <- fp_parse_article_title xml
373 parsed_article_byline <- parse_byline xml
374 parsed_article_body <- fp_parse_article_body xml
376 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
380 let title' = Title $ fromJust parsed_article_title
381 let byline' = Byline parsed_article_byline
382 let body' = BodyHtml $ fromJust parsed_article_body
383 return $ Just $ Article title' byline' body'
385 parse_html_article :: String -> IO (Maybe Article)
386 parse_html_article html = do
387 let xml = parseHtml $ wrap_in_body_div html
391 -- | In the full page, all of the article titles and bodies are
392 -- wrapped in one big div.ArticleText.
393 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
395 xml >>> css "div.ArticleText"
398 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
399 fp_parse_articles xml = do
400 bodies <- runX . xshow $ parse_bodies xml
401 let article_separator = "<h2 class=\"SummaryHL\">"
402 let split_articles'' = split article_separator (concat bodies)
403 -- The first element will contain the crap before the first <h2...>.
404 let split_articles' = case split_articles'' of
405 (_:_) -> tail split_articles''
407 -- Put the separator back, it was lost during the split.
408 let split_articles = map (article_separator ++) split_articles'
409 --_ <- mapM print_article split_articles
410 real_articles <- mapM parse_html_article split_articles
411 let just_articles = catMaybes real_articles
415 -- | This makes it easy to select otherwise-random chunks of html
417 wrap_in_body_div :: String -> String
419 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
425 -- Epublishable stuff
428 title :: Page -> String
429 title (ArticlePage a) = getTitle $ LWN.Article.title a
430 title (FullPage hl _) = hl
433 metadata :: Page -> IO String
435 date <- getCurrentTime
437 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
438 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
439 "<dc:language>en-US</dc:language>\n" ++
440 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
441 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
444 epublish :: Page -> Handle -> IO ()
445 epublish obj handle = do
446 let xhtml = to_xhtml obj
448 epub <- xhtml_to_epub epmd xhtml
453 xhtml_to_epub :: String -> String -> IO B.ByteString
455 write_epub . read_html
457 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
458 write_epub = writeEPUB Nothing [] my_writer_options
459 read_html = readHtml defaultParserState
467 image_srcs :: (ArrowXml a) => a XmlTree URL
473 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
474 make_image_srcs_absolute =
475 processTopDown (make_srcs_absolute `when` is_image)
477 change_src :: (ArrowXml a) => a XmlTree XmlTree
479 changeAttrValue try_make_absolute_url
481 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
483 processAttrl $ change_src `when` hasName "src"
490 test_preprocess_links :: Assertion
491 test_preprocess_links = do
492 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
493 let actual_xml = actual_xml' !! 0
495 expected_xml' <- runX $ expected_xml'' >>> css "body"
496 let expected_xml = expected_xml' !! 0
499 "Links are replaced with spans"
503 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
504 input_xml = parseHtml input_html
505 expected_html = "<body><span>Hello, world!</span></body>"
506 expected_xml'' = parseHtml expected_html
509 test_absolve_images :: Assertion
510 test_absolve_images = do
511 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
512 let actual_xml = actual_xml' !! 0
514 expected_xml' <- runX $ expected_xml'' >>> css "body"
515 let expected_xml = expected_xml' !! 0
518 "Image srcs are made absolute"
524 "<img src=\"/images/2012/example.jpg\" />" ++
526 input_xml = parseHtml input_html
529 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
531 expected_xml'' = parseHtml expected_html
534 test_comments_removed :: Assertion
535 test_comments_removed = do
536 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
537 let actual_xml = actual_xml' !! 0
539 expected_xml' <- runX $ expected_xml'' >>> css "body"
540 let expected_xml = expected_xml' !! 0
543 "Comment links are removed"
549 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
551 input_xml = parseHtml input_html
553 expected_html = "<body><p></p></body>"
554 expected_xml'' = parseHtml expected_html
560 testGroup "Page Tests" [
561 testCase "Links are replaced with spans" test_preprocess_links,
562 testCase "Image srcs are made absolute" test_absolve_images,
563 testCase "Comment links are removed" test_comments_removed ]