]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
If we don't have the contents of a particular full story, drop its paragraph.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
18 import Text.Pandoc (
19 defaultParserState,
20 defaultWriterOptions,
21 readHtml,
22 writeEPUB,
23 writerEPUBMetadata)
24 import Text.XML.HXT.Core (
25 ArrowXml,
26 IOSArrow,
27 XmlTree,
28 ($<),
29 (>>>),
30 (/>),
31 (//>),
32 changeAttrValue,
33 getAttrValue,
34 getChildren,
35 getText,
36 hasName,
37 none,
38 processAttrl,
39 processTopDown,
40 this,
41 runX,
42 xshow,
43 when)
44 import Text.HandsomeSoup (css, parseHtml)
45
46 import Configuration (Cfg, full_stories)
47 import LWN.Article
48 import LWN.HTTP (
49 ImageMap,
50 download_image_urls,
51 get_article_contents)
52 import LWN.URI (URL)
53 import LWN.XHTML (
54 XHTML,
55 full_story_urls,
56 image_srcs,
57 full_story_link,
58 full_story_paragraph,
59 is_image,
60 preprocess,
61 remove_byline,
62 remove_full_story_paragraphs,
63 remove_title,
64 to_xhtml,
65 to_xml,
66 xml_from_contents)
67
68
69
70
71 data Page =
72 -- | An LWN page with one article on it.
73 ArticlePage { article :: Article } |
74
75 -- | An LWN page with more than one article on it. These require
76 -- different parsing and display functions than the single-article
77 -- pages.
78 FullPage { headline :: String,
79 articles :: [Article] }
80
81
82 instance XHTML Page where
83 to_xhtml (ArticlePage a) =
84 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
85 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
86 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
87 "<head>" ++
88 " <meta http-equiv=\"Content-Type\"" ++
89 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
90 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
91 "</head>" ++
92 "<body>" ++
93 (to_xhtml a) ++
94 "</body>" ++
95 "</html>"
96
97 to_xhtml (FullPage hl as) =
98 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
99 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
100 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
101 "<head>" ++
102 " <meta http-equiv=\"Content-Type\"" ++
103 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
104 " <title>" ++ hl ++ "</title>" ++
105 "</head>" ++
106 "<body>" ++
107 "<div>" ++
108 "<h1>" ++ hl ++ "</h1>" ++
109 (concatMap to_xhtml as) ++
110 "</div>" ++
111 "</body>" ++
112 "</html>"
113
114
115
116 page_from_url :: Cfg -> URL -> IO (Maybe Page)
117 page_from_url cfg url = do
118 contents <- get_article_contents cfg url
119 case (xml_from_contents contents) of
120 Just html -> parse cfg html
121 Nothing -> return Nothing
122
123
124
125 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
126 insert_full_stories story_map =
127 processTopDown (article_xml `when` full_story_paragraph)
128 where
129 lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
130 lookup_func href =
131 case Map.lookup href story_map of
132 -- Drop the paragraph if we don't have the contents.
133 Nothing -> none
134 Just v -> to_xml v
135
136 article_xml :: (ArrowXml a) => a XmlTree XmlTree
137 article_xml =
138 lookup_func
139 $< -- From HXT's Control.Arrow.ArrowList
140 (this /> full_story_link >>> getAttrValue "href")
141
142 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
143 replace_remote_img_srcs image_map =
144 processTopDown (make_srcs_local `when` is_image)
145 where
146 -- old_src -> new_src
147 change_src_func :: String -> String
148 change_src_func old_src =
149 case Map.lookup old_src image_map of
150 -- Leave it alone if we don't have the file locally
151 Nothing -> old_src
152 Just v -> v
153
154 change_src :: (ArrowXml a) => a XmlTree XmlTree
155 change_src =
156 changeAttrValue change_src_func
157
158 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
159 make_srcs_local =
160 processAttrl $ (change_src `when` (hasName "src"))
161
162
163
164
165 -- Should be called *after* preprocessing.
166 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
167 download_images xml = do
168 image_urls <- runX $ xml >>> image_srcs
169 download_image_urls image_urls
170
171
172
173 type StoryMap = Map.Map URL Article
174
175 -- These come *before* preprocessing.
176 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
177 download_full_story_urls cfg story_urls = do
178 pages <- parallel $ map (page_from_url cfg) story_urls
179 let pairs = zip story_urls pages
180 return $ foldl my_insert empty_map pairs
181 where
182 empty_map = Map.empty :: StoryMap
183
184 my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
185 my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
186 my_insert dict (_, _) = dict
187
188
189 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
190 download_full_stories cfg xml = do
191 story_urls <- runX $ xml >>> full_story_urls
192 download_full_story_urls cfg story_urls
193
194
195 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
196 parse cfg xml = do
197 fs_xml <- if (full_stories cfg) then do
198 story_map <- download_full_stories cfg xml
199 return $ xml >>> insert_full_stories story_map
200 else do
201 -- Get rid of them if we don't want them.
202 return $ xml >>> remove_full_story_paragraphs
203
204 let clean_xml = fs_xml >>> preprocess
205 image_map <- download_images clean_xml
206 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
207
208 appr <- ap_parse local_xml
209 fppr <- fp_parse local_xml
210 return $
211 if (isNothing appr) then
212 fppr
213 else
214 appr
215
216
217
218 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
219 parse_headline xml = do
220 let element_filter = xml >>> css "div.PageHeadline h1"
221 let element_text_filter = element_filter /> getText
222 element_text <- runX element_text_filter
223 return $
224 case element_text of
225 [x] -> Just $ strip x
226 [] -> Nothing
227 _ -> error "Found more than one headline."
228
229
230 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
231 parse_byline xml = do
232 let element_filter = xml >>> css "div.FeatureByLine"
233 let element_text_filter = element_filter /> getText
234 element_text <- runX element_text_filter
235 return $
236 case element_text of
237 [x] -> Just $ strip x
238 [] -> Nothing
239 _ -> error "Found more than one article byline."
240
241
242 --
243 -- ArticlePage Stuff
244 --
245 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
246 ap_parse xml = do
247 arts <- ap_parse_articles xml
248 case arts of
249 [x] -> return $ Just $ ArticlePage x
250 _ -> return Nothing
251
252
253 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
254 ap_parse_body xml = do
255 let element_filter = xml >>> css "div.ArticleText"
256 let element_html_filter = xshow element_filter
257 element_html <- runX element_html_filter
258 return $ case element_html of
259 [x] -> Just x
260 [] -> Nothing
261 _ -> error "Found more than one article."
262
263
264 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
265 ap_parse_articles xml = do
266 parsed_headline <- parse_headline xml
267 parsed_byline <- parse_byline xml
268 parsed_body <- ap_parse_body xml
269
270 if (isNothing parsed_headline) || (isNothing parsed_body)
271 then return []
272 else do
273 let title' = Title $ fromJust parsed_headline
274 let byline' = Byline parsed_byline
275 let body' = BodyHtml $ fromJust parsed_body
276
277 return $ [Article title' byline' body']
278
279
280
281 --
282 -- FullPage Stuff
283 --
284
285 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
286 fp_parse xml = do
287 hl <- parse_headline xml
288 parsed_articles <- fp_parse_articles xml
289 case parsed_articles of
290 [] -> return Nothing
291 x -> return $ Just $ FullPage (fromJust hl) x
292
293
294
295 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
296 fp_parse_article_title xml = do
297 let element_filter = xml >>> css "h2.SummaryHL"
298 let element_text_filter = element_filter //> getText
299 element_text <- runX element_text_filter
300 return $ case element_text of
301 [x] -> Just $ strip x
302 [] -> Nothing
303 _ -> error "Found more than one article title."
304
305
306
307
308 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
309 fp_parse_article_body xml = do
310 -- First, delete the article title and byline.
311 let clean_xml' = xml >>> remove_title >>> remove_byline
312 -- The only child of the body element should be a div.lwn-article
313 -- since we wrapped the article's HTML in that.
314 let clean_xml = clean_xml' >>> css "body" >>> getChildren
315 clean_html <- runX . xshow $ clean_xml
316 return $ case clean_html of
317 [x] -> Just x
318 [] -> Nothing
319 _ -> error "Found more than one article body."
320
321 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
322 fp_parse_article xml = do
323 parsed_article_title <- fp_parse_article_title xml
324 parsed_article_byline <- parse_byline xml
325 parsed_article_body <- fp_parse_article_body xml
326
327 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
328 then
329 return Nothing
330 else do
331 let title' = Title $ fromJust parsed_article_title
332 let byline' = Byline parsed_article_byline
333 let body' = BodyHtml $ fromJust parsed_article_body
334 return $ Just $ Article title' byline' body'
335
336 parse_html_article :: String -> IO (Maybe Article)
337 parse_html_article html = do
338 let xml = parseHtml $ wrap_in_body_div html
339 fp_parse_article xml
340
341
342 -- | In the full page, all of the article titles and bodies are
343 -- wrapped in one big div.ArticleText.
344 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
345 parse_bodies xml =
346 xml >>> css "div.ArticleText"
347
348
349 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
350 fp_parse_articles xml = do
351 bodies <- runX . xshow $ parse_bodies xml
352 let article_separator = "<h2 class=\"SummaryHL\">"
353 let split_articles'' = split article_separator (concat bodies)
354 -- The first element will contain the crap before the first <h2...>.
355 let split_articles' = case split_articles'' of
356 (_:_) -> tail split_articles''
357 [] -> []
358 -- Put the separator back, it was lost during the split.
359 let split_articles = map (article_separator ++) split_articles'
360 real_articles <- mapM parse_html_article split_articles
361 let just_articles = catMaybes real_articles
362 return just_articles
363
364
365 -- | This makes it easy to select otherwise-random chunks of html
366 -- using 'css'.
367 wrap_in_body_div :: String -> String
368 wrap_in_body_div s =
369 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
370
371
372
373
374 --
375 -- Epublishable stuff
376 --
377
378 title :: Page -> String
379 title (ArticlePage a) = getTitle $ LWN.Article.title a
380 title (FullPage hl _) = hl
381
382
383 metadata :: Page -> IO String
384 metadata obj = do
385 date <- getCurrentTime
386 return $
387 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
388 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
389 "<dc:language>en-US</dc:language>\n" ++
390 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
391 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
392
393
394 epublish :: Page -> Handle -> IO ()
395 epublish obj handle = do
396 let xhtml = to_xhtml obj
397 epmd <- metadata obj
398 epub <- xhtml_to_epub epmd xhtml
399 B.hPut handle epub
400 hFlush handle
401 hClose handle
402
403 xhtml_to_epub :: String -> String -> IO B.ByteString
404 xhtml_to_epub epmd =
405 write_epub . read_html
406 where
407 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
408 write_epub = writeEPUB Nothing [] my_writer_options
409 read_html = readHtml defaultParserState
410
411
412 --
413 -- Tests
414 --
415
416 test_preprocess_links :: Assertion
417 test_preprocess_links = do
418 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
419 let actual_xml = actual_xml' !! 0
420
421 expected_xml' <- runX $ expected_xml'' >>> css "body"
422 let expected_xml = expected_xml' !! 0
423
424 assertEqual
425 "Links are replaced with spans"
426 expected_xml
427 actual_xml
428 where
429 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
430 input_xml = parseHtml input_html
431 expected_html = "<body><span>Hello, world!</span></body>"
432 expected_xml'' = parseHtml expected_html
433
434
435 test_absolve_images :: Assertion
436 test_absolve_images = do
437 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
438 let actual_xml = actual_xml' !! 0
439
440 expected_xml' <- runX $ expected_xml'' >>> css "body"
441 let expected_xml = expected_xml' !! 0
442
443 assertEqual
444 "Image srcs are made absolute"
445 expected_xml
446 actual_xml
447 where
448 input_html =
449 "<body>" ++
450 "<img src=\"/images/2012/example.jpg\" />" ++
451 "</body>"
452 input_xml = parseHtml input_html
453 expected_html =
454 "<body>" ++
455 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
456 "</body>"
457 expected_xml'' = parseHtml expected_html
458
459
460 test_comments_removed :: Assertion
461 test_comments_removed = do
462 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
463 let actual_xml = actual_xml' !! 0
464
465 expected_xml' <- runX $ expected_xml'' >>> css "body"
466 let expected_xml = expected_xml' !! 0
467
468 assertEqual
469 "Comment links are removed"
470 expected_xml
471 actual_xml
472 where
473 input_html =
474 "<body><p>" ++
475 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
476 "</p></body>"
477 input_xml = parseHtml input_html
478
479 expected_html = "<body><p></p></body>"
480 expected_xml'' = parseHtml expected_html
481
482
483 test_full_story_urls_parsed :: Assertion
484 test_full_story_urls_parsed = do
485 actual <- runX $ actual'
486
487 assertEqual
488 "Full Story URLs are parsed"
489 expected
490 actual
491 where
492 expected = ["/Articles/500738/", "/Articles/501837/"]
493
494 full_story_html =
495 concat ["<p>",
496 "<a href=\"/Articles/500738/\">Full Story</a> ",
497 "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
498 "<p>",
499 "<a href=\"/Articles/501837/\">Full Story</a> ",
500 "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
501 "<p>"]
502
503 xml = parseHtml full_story_html
504 actual' = xml >>> full_story_urls
505
506 page_tests :: Test
507 page_tests =
508 testGroup "Page Tests" [
509 testCase "Links are replaced with spans" test_preprocess_links,
510 testCase "Image srcs are made absolute" test_absolve_images,
511 testCase "Comment links are removed" test_comments_removed,
512 testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]