1 {-# LANGUAGE DoAndIfThenElse #-}
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
25 import Text.Pandoc.Shared ( readDataFile )
26 import Text.XML.HXT.Core (
46 import Text.HandsomeSoup (css, parseHtml)
48 import Configuration (Cfg, full_stories)
64 remove_full_story_paragraphs,
74 -- | An LWN page with one article on it.
75 ArticlePage { article :: Article } |
77 -- | An LWN page with more than one article on it. These require
78 -- different parsing and display functions than the single-article
80 FullPage { headline :: String,
81 articles :: [Article] }
84 instance XHTML Page where
85 to_xhtml (ArticlePage a) =
86 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
87 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
88 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
90 " <meta http-equiv=\"Content-Type\"" ++
91 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
92 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
99 to_xhtml (FullPage hl as) =
100 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
101 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
102 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
104 " <meta http-equiv=\"Content-Type\"" ++
105 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
106 " <title>" ++ hl ++ "</title>" ++
110 "<h1>" ++ hl ++ "</h1>" ++
111 (concatMap to_xhtml as) ++
118 -- | Stolen from writeEPUB.
119 default_stylesheet :: IO String
121 -- This comes with Pandoc, I guess.
122 readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
125 construct_stylesheet :: IO String
126 construct_stylesheet = do
127 defaults <- default_stylesheet
128 -- Allow word-wrapping in <pre> elements.
129 let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
130 return $ defaults ++ my_additions
132 page_from_url :: Cfg -> URL -> IO (Maybe Page)
133 page_from_url cfg url = do
134 contents <- get_article_contents cfg url
135 case (xml_from_contents contents) of
136 Just html -> parse cfg html
137 Nothing -> return Nothing
141 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
142 insert_full_stories story_map =
143 processTopDown (article_xml `when` full_story_paragraph)
145 lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
147 case Map.lookup href story_map of
148 -- Drop the paragraph if we don't have the contents.
152 article_xml :: (ArrowXml a) => a XmlTree XmlTree
155 $< -- From HXT's Control.Arrow.ArrowList
156 (this /> full_story_link >>> getAttrValue "href")
158 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
159 replace_remote_img_srcs image_map =
160 processTopDown (make_srcs_local `when` is_image)
162 -- old_src -> new_src
163 change_src_func :: String -> String
164 change_src_func old_src =
165 case Map.lookup old_src image_map of
166 -- Leave it alone if we don't have the file locally
170 change_src :: (ArrowXml a) => a XmlTree XmlTree
172 changeAttrValue change_src_func
174 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
176 processAttrl $ (change_src `when` (hasName "src"))
181 -- Should be called *after* preprocessing.
182 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
183 download_images xml = do
184 image_urls <- runX $ xml >>> image_srcs
185 download_image_urls image_urls
189 type StoryMap = Map.Map URL Article
191 -- These come *before* preprocessing.
192 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
193 download_full_story_urls cfg story_urls = do
194 pages <- parallel $ map (page_from_url cfg) story_urls
195 let pairs = zip story_urls pages
196 return $ foldl my_insert empty_map pairs
198 empty_map = Map.empty :: StoryMap
200 my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
201 my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
202 my_insert dict (_, _) = dict
205 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
206 download_full_stories cfg xml = do
207 story_urls <- runX $ xml >>> full_story_urls
208 download_full_story_urls cfg story_urls
211 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
213 fs_xml <- if (full_stories cfg) then do
214 story_map <- download_full_stories cfg xml
215 return $ xml >>> insert_full_stories story_map
217 -- Get rid of them if we don't want them.
218 return $ xml >>> remove_full_story_paragraphs
220 let clean_xml = fs_xml >>> preprocess
221 image_map <- download_images clean_xml
222 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
224 appr <- ap_parse local_xml
225 fppr <- fp_parse local_xml
227 if (isNothing appr) then
234 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
235 parse_headline xml = do
236 let element_filter = xml >>> css "div.PageHeadline h1"
237 let element_text_filter = element_filter /> getText
238 element_text <- runX element_text_filter
241 [x] -> Just $ strip x
243 _ -> error "Found more than one headline."
246 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
247 parse_byline xml = do
248 let element_filter = xml >>> css "div.FeatureByLine"
249 let element_text_filter = element_filter /> getText
250 element_text <- runX element_text_filter
253 [x] -> Just $ strip x
255 _ -> error "Found more than one article byline."
261 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
263 arts <- ap_parse_articles xml
265 [x] -> return $ Just $ ArticlePage x
269 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
270 ap_parse_body xml = do
271 let element_filter = xml >>> css "div.ArticleText"
272 let element_html_filter = xshow element_filter
273 element_html <- runX element_html_filter
274 return $ case element_html of
277 _ -> error "Found more than one article."
280 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
281 ap_parse_articles xml = do
282 parsed_headline <- parse_headline xml
283 parsed_byline <- parse_byline xml
284 parsed_body <- ap_parse_body xml
286 if (isNothing parsed_headline) || (isNothing parsed_body)
289 let title' = Title $ fromJust parsed_headline
290 let byline' = Byline parsed_byline
291 let body' = BodyHtml $ fromJust parsed_body
293 return $ [Article title' byline' body']
301 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
303 hl <- parse_headline xml
304 parsed_articles <- fp_parse_articles xml
305 return $ case parsed_articles of
307 x -> Just $ FullPage (fromJust hl) x
311 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
312 fp_parse_article_title xml = do
313 let element_filter = xml >>> css "h2.SummaryHL"
314 let element_text_filter = element_filter //> getText
315 element_text <- runX element_text_filter
316 return $ case element_text of
317 [x] -> Just $ strip x
319 _ -> error "Found more than one article title."
324 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
325 fp_parse_article_body xml = do
326 -- First, delete the article title and byline.
327 let clean_xml' = xml >>> remove_title >>> remove_byline
328 -- The only child of the body element should be a div.lwn-article
329 -- since we wrapped the article's HTML in that.
330 let clean_xml = clean_xml' >>> css "body" >>> getChildren
331 clean_html <- runX . xshow $ clean_xml
332 return $ case clean_html of
335 _ -> error "Found more than one article body."
337 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
338 fp_parse_article xml = do
339 parsed_article_title <- fp_parse_article_title xml
340 parsed_article_byline <- parse_byline xml
341 parsed_article_body <- fp_parse_article_body xml
343 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
347 let title' = Title $ fromJust parsed_article_title
348 let byline' = Byline parsed_article_byline
349 let body' = BodyHtml $ fromJust parsed_article_body
350 return $ Just $ Article title' byline' body'
352 parse_html_article :: String -> IO (Maybe Article)
353 parse_html_article html = do
354 let xml = parseHtml $ wrap_in_body_div html
358 -- | In the full page, all of the article titles and bodies are
359 -- wrapped in one big div.ArticleText.
360 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
362 xml >>> css "div.ArticleText"
365 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
366 fp_parse_articles xml = do
367 bodies <- runX . xshow $ parse_bodies xml
368 let article_separator = "<h2 class=\"SummaryHL\">"
369 let split_articles'' = split article_separator (concat bodies)
370 -- The first element will contain the crap before the first <h2...>.
371 let split_articles' = case split_articles'' of
372 (_:_) -> tail split_articles''
374 -- Put the separator back, it was lost during the split.
375 let split_articles = map (article_separator ++) split_articles'
376 real_articles <- mapM parse_html_article split_articles
377 let just_articles = catMaybes real_articles
381 -- | This makes it easy to select otherwise-random chunks of html
383 wrap_in_body_div :: String -> String
385 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
391 -- Epublishable stuff
394 title :: Page -> String
395 title (ArticlePage a) = getTitle $ LWN.Article.title a
396 title (FullPage hl _) = hl
399 metadata :: Page -> IO String
401 date <- getCurrentTime
403 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
404 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
405 "<dc:language>en-US</dc:language>\n" ++
406 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
407 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
410 epublish :: Page -> Handle -> IO ()
411 epublish obj handle = do
412 let xhtml = to_xhtml obj
414 epub <- xhtml_to_epub epmd xhtml
419 xhtml_to_epub :: String -> String -> IO B.ByteString
420 xhtml_to_epub epmd xhtml = do
421 stylesheet <- construct_stylesheet
428 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
429 read_html = readHtml defaultParserState
436 test_preprocess_links :: Assertion
437 test_preprocess_links = do
438 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
439 let actual_xml = actual_xml' !! 0
441 expected_xml' <- runX $ expected_xml'' >>> css "body"
442 let expected_xml = expected_xml' !! 0
445 "Links are replaced with spans"
449 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
450 input_xml = parseHtml input_html
451 expected_html = "<body><span>Hello, world!</span></body>"
452 expected_xml'' = parseHtml expected_html
455 test_absolve_images :: Assertion
456 test_absolve_images = do
457 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
458 let actual_xml = actual_xml' !! 0
460 expected_xml' <- runX $ expected_xml'' >>> css "body"
461 let expected_xml = expected_xml' !! 0
464 "Image srcs are made absolute"
470 "<img src=\"/images/2012/example.jpg\" />" ++
472 input_xml = parseHtml input_html
475 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
477 expected_xml'' = parseHtml expected_html
480 test_comments_removed :: Assertion
481 test_comments_removed = do
482 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
483 let actual_xml = actual_xml' !! 0
485 expected_xml' <- runX $ expected_xml'' >>> css "body"
486 let expected_xml = expected_xml' !! 0
489 "Comment links are removed"
495 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
497 input_xml = parseHtml input_html
499 expected_html = "<body><p></p></body>"
500 expected_xml'' = parseHtml expected_html
503 test_full_story_urls_parsed :: Assertion
504 test_full_story_urls_parsed = do
505 actual <- runX $ actual'
508 "Full Story URLs are parsed"
512 expected = ["/Articles/500738/", "/Articles/501837/"]
516 "<a href=\"/Articles/500738/\">Full Story</a> ",
517 "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
519 "<a href=\"/Articles/501837/\">Full Story</a> ",
520 "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
523 xml = parseHtml full_story_html
524 actual' = xml >>> full_story_urls
528 testGroup "Page Tests" [
529 testCase "Links are replaced with spans" test_preprocess_links,
530 testCase "Image srcs are made absolute" test_absolve_images,
531 testCase "Comment links are removed" test_comments_removed,
532 testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]