]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
Remove full story paragraphs if we aren't going to use them.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
18 import Text.Pandoc (
19 defaultParserState,
20 defaultWriterOptions,
21 readHtml,
22 writeEPUB,
23 writerEPUBMetadata)
24 import Text.XML.HXT.Core (
25 ArrowXml,
26 IOSArrow,
27 XmlTree,
28 ($<),
29 (>>>),
30 (/>),
31 (//>),
32 changeAttrValue,
33 getAttrValue,
34 getChildren,
35 getText,
36 hasName,
37 processAttrl,
38 processTopDown,
39 this,
40 runX,
41 xshow,
42 when)
43 import Text.HandsomeSoup (css, parseHtml)
44
45 import Configuration (Cfg, full_stories)
46 import LWN.Article
47 import LWN.HTTP (
48 ImageMap,
49 download_image_urls,
50 get_article_contents)
51 import LWN.URI (URL)
52 import LWN.XHTML (
53 XHTML,
54 full_story_urls,
55 image_srcs,
56 full_story_link,
57 full_story_paragraph,
58 is_image,
59 preprocess,
60 remove_byline,
61 remove_full_story_paragraphs,
62 remove_title,
63 to_xhtml,
64 to_xml,
65 xml_from_contents)
66
67
68
69
70 data Page =
71 -- | An LWN page with one article on it.
72 ArticlePage { article :: Article } |
73
74 -- | An LWN page with more than one article on it. These require
75 -- different parsing and display functions than the single-article
76 -- pages.
77 FullPage { headline :: String,
78 articles :: [Article] }
79
80
81 instance XHTML Page where
82 to_xhtml (ArticlePage a) =
83 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
84 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
85 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
86 "<head>" ++
87 " <meta http-equiv=\"Content-Type\"" ++
88 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
89 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
90 "</head>" ++
91 "<body>" ++
92 (to_xhtml a) ++
93 "</body>" ++
94 "</html>"
95
96 to_xhtml (FullPage hl as) =
97 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
98 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
99 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
100 "<head>" ++
101 " <meta http-equiv=\"Content-Type\"" ++
102 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
103 " <title>" ++ hl ++ "</title>" ++
104 "</head>" ++
105 "<body>" ++
106 "<div>" ++
107 "<h1>" ++ hl ++ "</h1>" ++
108 (concatMap to_xhtml as) ++
109 "</div>" ++
110 "</body>" ++
111 "</html>"
112
113
114
115 page_from_url :: Cfg -> URL -> IO (Maybe Page)
116 page_from_url cfg url = do
117 contents <- get_article_contents cfg url
118 case (xml_from_contents contents) of
119 Just html -> parse cfg html
120 Nothing -> return Nothing
121
122
123
124 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
125 insert_full_stories story_map =
126 processTopDown (article_xml `when` full_story_paragraph)
127 where
128 lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
129 lookup_func href =
130 case Map.lookup href story_map of
131 -- Leave it alone if we don't have the full story.
132 Nothing -> this
133 Just v -> to_xml v
134
135 article_xml :: (ArrowXml a) => a XmlTree XmlTree
136 article_xml =
137 lookup_func
138 $< -- From HXT's Control.Arrow.ArrowList
139 (this /> full_story_link >>> getAttrValue "href")
140
141 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
142 replace_remote_img_srcs image_map =
143 processTopDown (make_srcs_local `when` is_image)
144 where
145 -- old_src -> new_src
146 change_src_func :: String -> String
147 change_src_func old_src =
148 case Map.lookup old_src image_map of
149 -- Leave it alone if we don't have the file locally
150 Nothing -> old_src
151 Just v -> v
152
153 change_src :: (ArrowXml a) => a XmlTree XmlTree
154 change_src =
155 changeAttrValue change_src_func
156
157 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
158 make_srcs_local =
159 processAttrl $ (change_src `when` (hasName "src"))
160
161
162
163
164 -- Should be called *after* preprocessing.
165 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
166 download_images xml = do
167 image_urls <- runX $ xml >>> image_srcs
168 download_image_urls image_urls
169
170
171
172 type StoryMap = Map.Map URL Article
173
174 -- These come *before* preprocessing.
175 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
176 download_full_story_urls cfg story_urls = do
177 pages <- parallel $ map (page_from_url cfg) story_urls
178 let pairs = zip story_urls pages
179 return $ foldl my_insert empty_map pairs
180 where
181 empty_map = Map.empty :: StoryMap
182
183 my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
184 my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
185 my_insert dict (_, _) = dict
186
187
188 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
189 download_full_stories cfg xml = do
190 story_urls <- runX $ xml >>> full_story_urls
191 download_full_story_urls cfg story_urls
192
193
194 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
195 parse cfg xml = do
196 fs_xml <- if (full_stories cfg) then do
197 story_map <- download_full_stories cfg xml
198 return $ xml >>> insert_full_stories story_map
199 else do
200 -- Get rid of them if we don't want them.
201 return $ xml >>> remove_full_story_paragraphs
202
203 let clean_xml = fs_xml >>> preprocess
204 image_map <- download_images clean_xml
205 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
206
207 appr <- ap_parse local_xml
208 fppr <- fp_parse local_xml
209 return $
210 if (isNothing appr) then
211 fppr
212 else
213 appr
214
215
216
217 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
218 parse_headline xml = do
219 let element_filter = xml >>> css "div.PageHeadline h1"
220 let element_text_filter = element_filter /> getText
221 element_text <- runX element_text_filter
222 return $
223 case element_text of
224 [x] -> Just $ strip x
225 [] -> Nothing
226 _ -> error "Found more than one headline."
227
228
229 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
230 parse_byline xml = do
231 let element_filter = xml >>> css "div.FeatureByLine"
232 let element_text_filter = element_filter /> getText
233 element_text <- runX element_text_filter
234 return $
235 case element_text of
236 [x] -> Just $ strip x
237 [] -> Nothing
238 _ -> error "Found more than one article byline."
239
240
241 --
242 -- ArticlePage Stuff
243 --
244 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
245 ap_parse xml = do
246 arts <- ap_parse_articles xml
247 case arts of
248 [x] -> return $ Just $ ArticlePage x
249 _ -> return Nothing
250
251
252 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
253 ap_parse_body xml = do
254 let element_filter = xml >>> css "div.ArticleText"
255 let element_html_filter = xshow element_filter
256 element_html <- runX element_html_filter
257 return $ case element_html of
258 [x] -> Just x
259 [] -> Nothing
260 _ -> error "Found more than one article."
261
262
263 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
264 ap_parse_articles xml = do
265 parsed_headline <- parse_headline xml
266 parsed_byline <- parse_byline xml
267 parsed_body <- ap_parse_body xml
268
269 if (isNothing parsed_headline) || (isNothing parsed_body)
270 then return []
271 else do
272 let title' = Title $ fromJust parsed_headline
273 let byline' = Byline parsed_byline
274 let body' = BodyHtml $ fromJust parsed_body
275
276 return $ [Article title' byline' body']
277
278
279
280 --
281 -- FullPage Stuff
282 --
283
284 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
285 fp_parse xml = do
286 hl <- parse_headline xml
287 parsed_articles <- fp_parse_articles xml
288 case parsed_articles of
289 [] -> return Nothing
290 x -> return $ Just $ FullPage (fromJust hl) x
291
292
293
294 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
295 fp_parse_article_title xml = do
296 let element_filter = xml >>> css "h2.SummaryHL"
297 let element_text_filter = element_filter //> getText
298 element_text <- runX element_text_filter
299 return $ case element_text of
300 [x] -> Just $ strip x
301 [] -> Nothing
302 _ -> error "Found more than one article title."
303
304
305
306
307 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
308 fp_parse_article_body xml = do
309 -- First, delete the article title and byline.
310 let clean_xml' = xml >>> remove_title >>> remove_byline
311 -- The only child of the body element should be a div.lwn-article
312 -- since we wrapped the article's HTML in that.
313 let clean_xml = clean_xml' >>> css "body" >>> getChildren
314 clean_html <- runX . xshow $ clean_xml
315 return $ case clean_html of
316 [x] -> Just x
317 [] -> Nothing
318 _ -> error "Found more than one article body."
319
320 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
321 fp_parse_article xml = do
322 parsed_article_title <- fp_parse_article_title xml
323 parsed_article_byline <- parse_byline xml
324 parsed_article_body <- fp_parse_article_body xml
325
326 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
327 then
328 return Nothing
329 else do
330 let title' = Title $ fromJust parsed_article_title
331 let byline' = Byline parsed_article_byline
332 let body' = BodyHtml $ fromJust parsed_article_body
333 return $ Just $ Article title' byline' body'
334
335 parse_html_article :: String -> IO (Maybe Article)
336 parse_html_article html = do
337 let xml = parseHtml $ wrap_in_body_div html
338 fp_parse_article xml
339
340
341 -- | In the full page, all of the article titles and bodies are
342 -- wrapped in one big div.ArticleText.
343 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
344 parse_bodies xml =
345 xml >>> css "div.ArticleText"
346
347
348 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
349 fp_parse_articles xml = do
350 bodies <- runX . xshow $ parse_bodies xml
351 let article_separator = "<h2 class=\"SummaryHL\">"
352 let split_articles'' = split article_separator (concat bodies)
353 -- The first element will contain the crap before the first <h2...>.
354 let split_articles' = case split_articles'' of
355 (_:_) -> tail split_articles''
356 [] -> []
357 -- Put the separator back, it was lost during the split.
358 let split_articles = map (article_separator ++) split_articles'
359 real_articles <- mapM parse_html_article split_articles
360 let just_articles = catMaybes real_articles
361 return just_articles
362
363
364 -- | This makes it easy to select otherwise-random chunks of html
365 -- using 'css'.
366 wrap_in_body_div :: String -> String
367 wrap_in_body_div s =
368 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
369
370
371
372
373 --
374 -- Epublishable stuff
375 --
376
377 title :: Page -> String
378 title (ArticlePage a) = getTitle $ LWN.Article.title a
379 title (FullPage hl _) = hl
380
381
382 metadata :: Page -> IO String
383 metadata obj = do
384 date <- getCurrentTime
385 return $
386 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
387 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
388 "<dc:language>en-US</dc:language>\n" ++
389 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
390 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
391
392
393 epublish :: Page -> Handle -> IO ()
394 epublish obj handle = do
395 let xhtml = to_xhtml obj
396 epmd <- metadata obj
397 epub <- xhtml_to_epub epmd xhtml
398 B.hPut handle epub
399 hFlush handle
400 hClose handle
401
402 xhtml_to_epub :: String -> String -> IO B.ByteString
403 xhtml_to_epub epmd =
404 write_epub . read_html
405 where
406 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
407 write_epub = writeEPUB Nothing [] my_writer_options
408 read_html = readHtml defaultParserState
409
410
411 --
412 -- Tests
413 --
414
415 test_preprocess_links :: Assertion
416 test_preprocess_links = do
417 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
418 let actual_xml = actual_xml' !! 0
419
420 expected_xml' <- runX $ expected_xml'' >>> css "body"
421 let expected_xml = expected_xml' !! 0
422
423 assertEqual
424 "Links are replaced with spans"
425 expected_xml
426 actual_xml
427 where
428 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
429 input_xml = parseHtml input_html
430 expected_html = "<body><span>Hello, world!</span></body>"
431 expected_xml'' = parseHtml expected_html
432
433
434 test_absolve_images :: Assertion
435 test_absolve_images = do
436 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
437 let actual_xml = actual_xml' !! 0
438
439 expected_xml' <- runX $ expected_xml'' >>> css "body"
440 let expected_xml = expected_xml' !! 0
441
442 assertEqual
443 "Image srcs are made absolute"
444 expected_xml
445 actual_xml
446 where
447 input_html =
448 "<body>" ++
449 "<img src=\"/images/2012/example.jpg\" />" ++
450 "</body>"
451 input_xml = parseHtml input_html
452 expected_html =
453 "<body>" ++
454 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
455 "</body>"
456 expected_xml'' = parseHtml expected_html
457
458
459 test_comments_removed :: Assertion
460 test_comments_removed = do
461 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
462 let actual_xml = actual_xml' !! 0
463
464 expected_xml' <- runX $ expected_xml'' >>> css "body"
465 let expected_xml = expected_xml' !! 0
466
467 assertEqual
468 "Comment links are removed"
469 expected_xml
470 actual_xml
471 where
472 input_html =
473 "<body><p>" ++
474 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
475 "</p></body>"
476 input_xml = parseHtml input_html
477
478 expected_html = "<body><p></p></body>"
479 expected_xml'' = parseHtml expected_html
480
481
482 test_full_story_urls_parsed :: Assertion
483 test_full_story_urls_parsed = do
484 actual <- runX $ actual'
485
486 assertEqual
487 "Full Story URLs are parsed"
488 expected
489 actual
490 where
491 expected = ["/Articles/500738/", "/Articles/501837/"]
492
493 full_story_html =
494 concat ["<p>",
495 "<a href=\"/Articles/500738/\">Full Story</a> ",
496 "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
497 "<p>",
498 "<a href=\"/Articles/501837/\">Full Story</a> ",
499 "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
500 "<p>"]
501
502 xml = parseHtml full_story_html
503 actual' = xml >>> full_story_urls
504
505 page_tests :: Test
506 page_tests =
507 testGroup "Page Tests" [
508 testCase "Links are replaced with spans" test_preprocess_links,
509 testCase "Image srcs are made absolute" test_absolve_images,
510 testCase "Comment links are removed" test_comments_removed,
511 testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]