]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/Page.hs
"Fix" the unicode URL bug.
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import Control.Concurrent.ParallelIO (parallel)
7 import qualified Data.Map as Map (lookup)
8 import Data.Time (getCurrentTime)
9 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
10 import Data.String.Utils (split, strip)
11 import qualified Data.Map as Map (Map, empty, insert)
12 import Data.Maybe (catMaybes, fromJust, isNothing)
13 import Prelude hiding (readFile)
14 import System.IO (Handle, hClose, hFlush)
15 import Test.HUnit (Assertion, assertEqual)
16 import Test.Framework (Test, testGroup)
17 import Test.Framework.Providers.HUnit (testCase)
18 import Text.Pandoc (
19 defaultParserState,
20 defaultWriterOptions,
21 readHtml,
22 writeEPUB,
23 writerEPUBMetadata,
24 writerUserDataDir)
25 import Text.Pandoc.Shared ( readDataFile )
26 import Text.XML.HXT.Core (
27 ArrowXml,
28 IOSArrow,
29 XmlTree,
30 ($<),
31 (>>>),
32 (/>),
33 (//>),
34 changeAttrValue,
35 getAttrValue,
36 getChildren,
37 getText,
38 hasName,
39 none,
40 processAttrl,
41 processTopDown,
42 this,
43 runX,
44 xshow,
45 when)
46 import Text.HandsomeSoup (css, parseHtml)
47
48 import Configuration (Cfg, full_stories)
49 import LWN.Article
50 import LWN.HTTP (
51 ImageMap,
52 download_image_urls,
53 get_article_contents)
54 import LWN.URI (URL)
55 import LWN.XHTML (
56 XHTML,
57 full_story_urls,
58 image_srcs,
59 full_story_link,
60 full_story_paragraph,
61 is_image,
62 preprocess,
63 remove_byline,
64 remove_full_story_paragraphs,
65 remove_title,
66 to_xhtml,
67 to_xml,
68 xml_from_contents)
69
70
71
72
73 data Page =
74 -- | An LWN page with one article on it.
75 ArticlePage { article :: Article } |
76
77 -- | An LWN page with more than one article on it. These require
78 -- different parsing and display functions than the single-article
79 -- pages.
80 FullPage { headline :: String,
81 articles :: [Article] }
82
83
84 instance XHTML Page where
85 to_xhtml (ArticlePage a) =
86 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
87 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
88 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
89 "<head>" ++
90 " <meta http-equiv=\"Content-Type\"" ++
91 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
92 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
93 "</head>" ++
94 "<body>" ++
95 (to_xhtml a) ++
96 "</body>" ++
97 "</html>"
98
99 to_xhtml (FullPage hl as) =
100 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
101 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
102 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
103 "<head>" ++
104 " <meta http-equiv=\"Content-Type\"" ++
105 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
106 " <title>" ++ hl ++ "</title>" ++
107 "</head>" ++
108 "<body>" ++
109 "<div>" ++
110 "<h1>" ++ hl ++ "</h1>" ++
111 (concatMap to_xhtml as) ++
112 "</div>" ++
113 "</body>" ++
114 "</html>"
115
116
117
118 -- | Stolen from writeEPUB.
119 default_stylesheet :: IO String
120 default_stylesheet =
121 -- This comes with Pandoc, I guess.
122 readDataFile (writerUserDataDir defaultWriterOptions) "epub.css"
123
124
125 construct_stylesheet :: IO String
126 construct_stylesheet = do
127 defaults <- default_stylesheet
128 -- Allow word-wrapping in <pre> elements.
129 let my_additions = "\n" ++ "pre { white-space: pre-wrap; }" ++ "\n"
130 return $ defaults ++ my_additions
131
132 page_from_url :: Cfg -> URL -> IO (Maybe Page)
133 page_from_url cfg url = do
134 contents <- get_article_contents cfg url
135 case (xml_from_contents contents) of
136 Just html -> parse cfg html
137 Nothing -> return Nothing
138
139
140
141 insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
142 insert_full_stories story_map =
143 processTopDown (article_xml `when` full_story_paragraph)
144 where
145 lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
146 lookup_func href =
147 case Map.lookup href story_map of
148 -- Drop the paragraph if we don't have the contents.
149 Nothing -> none
150 Just v -> to_xml v
151
152 article_xml :: (ArrowXml a) => a XmlTree XmlTree
153 article_xml =
154 lookup_func
155 $< -- From HXT's Control.Arrow.ArrowList
156 (this /> full_story_link >>> getAttrValue "href")
157
158 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
159 replace_remote_img_srcs image_map =
160 processTopDown (make_srcs_local `when` is_image)
161 where
162 -- old_src -> new_src
163 change_src_func :: String -> String
164 change_src_func old_src =
165 case Map.lookup old_src image_map of
166 -- If we don't have the file, empty the src. Pandoc will crash
167 -- otherwise.
168 Nothing -> ""
169 Just v -> v
170
171 change_src :: (ArrowXml a) => a XmlTree XmlTree
172 change_src =
173 changeAttrValue change_src_func
174
175 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
176 make_srcs_local =
177 processAttrl $ (change_src `when` (hasName "src"))
178
179
180
181
182 -- Should be called *after* preprocessing.
183 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
184 download_images xml = do
185 image_urls <- runX $ xml >>> image_srcs
186 download_image_urls image_urls
187
188
189
190 type StoryMap = Map.Map URL Article
191
192 -- These come *before* preprocessing.
193 download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
194 download_full_story_urls cfg story_urls = do
195 pages <- parallel $ map (page_from_url cfg) story_urls
196 let pairs = zip story_urls pages
197 return $ foldl my_insert empty_map pairs
198 where
199 empty_map = Map.empty :: StoryMap
200
201 my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
202 my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
203 my_insert dict (_, _) = dict
204
205
206 download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
207 download_full_stories cfg xml = do
208 story_urls <- runX $ xml >>> full_story_urls
209 download_full_story_urls cfg story_urls
210
211
212 parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
213 parse cfg xml = do
214 fs_xml <- if (full_stories cfg) then do
215 story_map <- download_full_stories cfg xml
216 return $ xml >>> insert_full_stories story_map
217 else do
218 -- Get rid of them if we don't want them.
219 return $ xml >>> remove_full_story_paragraphs
220
221 let clean_xml = fs_xml >>> preprocess
222 image_map <- download_images clean_xml
223 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
224
225 appr <- ap_parse local_xml
226 fppr <- fp_parse local_xml
227 return $
228 if (isNothing appr) then
229 fppr
230 else
231 appr
232
233
234
235 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
236 parse_headline xml = do
237 let element_filter = xml >>> css "div.PageHeadline h1"
238 let element_text_filter = element_filter /> getText
239 element_text <- runX element_text_filter
240 return $
241 case element_text of
242 [x] -> Just $ strip x
243 [] -> Nothing
244 _ -> error "Found more than one headline."
245
246
247 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
248 parse_byline xml = do
249 let element_filter = xml >>> css "div.FeatureByLine"
250 let element_text_filter = element_filter /> getText
251 element_text <- runX element_text_filter
252 return $
253 case element_text of
254 [x] -> Just $ strip x
255 [] -> Nothing
256 _ -> error "Found more than one article byline."
257
258
259 --
260 -- ArticlePage Stuff
261 --
262 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
263 ap_parse xml = do
264 arts <- ap_parse_articles xml
265 case arts of
266 [x] -> return $ Just $ ArticlePage x
267 _ -> return Nothing
268
269
270 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
271 ap_parse_body xml = do
272 let element_filter = xml >>> css "div.ArticleText"
273 let element_html_filter = xshow element_filter
274 element_html <- runX element_html_filter
275 return $ case element_html of
276 [x] -> Just x
277 [] -> Nothing
278 _ -> error "Found more than one article."
279
280
281 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
282 ap_parse_articles xml = do
283 parsed_headline <- parse_headline xml
284 parsed_byline <- parse_byline xml
285 parsed_body <- ap_parse_body xml
286
287 if (isNothing parsed_headline) || (isNothing parsed_body)
288 then return []
289 else do
290 let title' = Title $ fromJust parsed_headline
291 let byline' = Byline parsed_byline
292 let body' = BodyHtml $ fromJust parsed_body
293
294 return $ [Article title' byline' body']
295
296
297
298 --
299 -- FullPage Stuff
300 --
301
302 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
303 fp_parse xml = do
304 hl <- parse_headline xml
305 parsed_articles <- fp_parse_articles xml
306 return $ case parsed_articles of
307 [] -> Nothing
308 x -> Just $ FullPage (fromJust hl) x
309
310
311
312 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
313 fp_parse_article_title xml = do
314 let element_filter = xml >>> css "h2.SummaryHL"
315 let element_text_filter = element_filter //> getText
316 element_text <- runX element_text_filter
317 return $ case element_text of
318 [x] -> Just $ strip x
319 [] -> Nothing
320 _ -> error "Found more than one article title."
321
322
323
324
325 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
326 fp_parse_article_body xml = do
327 -- First, delete the article title and byline.
328 let clean_xml' = xml >>> remove_title >>> remove_byline
329 -- The only child of the body element should be a div.lwn-article
330 -- since we wrapped the article's HTML in that.
331 let clean_xml = clean_xml' >>> css "body" >>> getChildren
332 clean_html <- runX . xshow $ clean_xml
333 return $ case clean_html of
334 [x] -> Just x
335 [] -> Nothing
336 _ -> error "Found more than one article body."
337
338 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
339 fp_parse_article xml = do
340 parsed_article_title <- fp_parse_article_title xml
341 parsed_article_byline <- parse_byline xml
342 parsed_article_body <- fp_parse_article_body xml
343
344 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
345 then
346 return Nothing
347 else do
348 let title' = Title $ fromJust parsed_article_title
349 let byline' = Byline parsed_article_byline
350 let body' = BodyHtml $ fromJust parsed_article_body
351 return $ Just $ Article title' byline' body'
352
353 parse_html_article :: String -> IO (Maybe Article)
354 parse_html_article html = do
355 let xml = parseHtml $ wrap_in_body_div html
356 fp_parse_article xml
357
358
359 -- | In the full page, all of the article titles and bodies are
360 -- wrapped in one big div.ArticleText.
361 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
362 parse_bodies xml =
363 xml >>> css "div.ArticleText"
364
365
366 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
367 fp_parse_articles xml = do
368 bodies <- runX . xshow $ parse_bodies xml
369 let article_separator = "<h2 class=\"SummaryHL\">"
370 let split_articles'' = split article_separator (concat bodies)
371 -- The first element will contain the crap before the first <h2...>.
372 let split_articles' = case split_articles'' of
373 (_:_) -> tail split_articles''
374 [] -> []
375 -- Put the separator back, it was lost during the split.
376 let split_articles = map (article_separator ++) split_articles'
377 real_articles <- mapM parse_html_article split_articles
378 let just_articles = catMaybes real_articles
379 return just_articles
380
381
382 -- | This makes it easy to select otherwise-random chunks of html
383 -- using 'css'.
384 wrap_in_body_div :: String -> String
385 wrap_in_body_div s =
386 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
387
388
389
390
391 --
392 -- Epublishable stuff
393 --
394
395 title :: Page -> String
396 title (ArticlePage a) = getTitle $ LWN.Article.title a
397 title (FullPage hl _) = hl
398
399
400 metadata :: Page -> IO String
401 metadata obj = do
402 date <- getCurrentTime
403 return $
404 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
405 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
406 "<dc:language>en-US</dc:language>\n" ++
407 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
408 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
409
410
411 epublish :: Page -> Handle -> IO ()
412 epublish obj handle = do
413 let xhtml = to_xhtml obj
414 epmd <- metadata obj
415 epub <- xhtml_to_epub epmd xhtml
416 B.hPut handle epub
417 hFlush handle
418 hClose handle
419
420 xhtml_to_epub :: String -> String -> IO B.ByteString
421 xhtml_to_epub epmd xhtml = do
422 stylesheet <- construct_stylesheet
423 writeEPUB
424 (Just stylesheet)
425 []
426 my_writer_options
427 (read_html xhtml)
428 where
429 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
430 read_html = readHtml defaultParserState
431
432
433 --
434 -- Tests
435 --
436
437 test_preprocess_links :: Assertion
438 test_preprocess_links = do
439 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
440 let actual_xml = actual_xml' !! 0
441
442 expected_xml' <- runX $ expected_xml'' >>> css "body"
443 let expected_xml = expected_xml' !! 0
444
445 assertEqual
446 "Links are replaced with spans"
447 expected_xml
448 actual_xml
449 where
450 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
451 input_xml = parseHtml input_html
452 expected_html = "<body><span>Hello, world!</span></body>"
453 expected_xml'' = parseHtml expected_html
454
455
456 test_absolve_images :: Assertion
457 test_absolve_images = do
458 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
459 let actual_xml = actual_xml' !! 0
460
461 expected_xml' <- runX $ expected_xml'' >>> css "body"
462 let expected_xml = expected_xml' !! 0
463
464 assertEqual
465 "Image srcs are made absolute"
466 expected_xml
467 actual_xml
468 where
469 input_html =
470 "<body>" ++
471 "<img src=\"/images/2012/example.jpg\" />" ++
472 "</body>"
473 input_xml = parseHtml input_html
474 expected_html =
475 "<body>" ++
476 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
477 "</body>"
478 expected_xml'' = parseHtml expected_html
479
480
481 test_comments_removed :: Assertion
482 test_comments_removed = do
483 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
484 let actual_xml = actual_xml' !! 0
485
486 expected_xml' <- runX $ expected_xml'' >>> css "body"
487 let expected_xml = expected_xml' !! 0
488
489 assertEqual
490 "Comment links are removed"
491 expected_xml
492 actual_xml
493 where
494 input_html =
495 "<body><p>" ++
496 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
497 "</p></body>"
498 input_xml = parseHtml input_html
499
500 expected_html = "<body><p></p></body>"
501 expected_xml'' = parseHtml expected_html
502
503
504 test_full_story_urls_parsed :: Assertion
505 test_full_story_urls_parsed = do
506 actual <- runX $ actual'
507
508 assertEqual
509 "Full Story URLs are parsed"
510 expected
511 actual
512 where
513 expected = ["/Articles/500738/", "/Articles/501837/"]
514
515 full_story_html =
516 concat ["<p>",
517 "<a href=\"/Articles/500738/\">Full Story</a> ",
518 "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
519 "<p>",
520 "<a href=\"/Articles/501837/\">Full Story</a> ",
521 "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
522 "<p>"]
523
524 xml = parseHtml full_story_html
525 actual' = xml >>> full_story_urls
526
527 page_tests :: Test
528 page_tests =
529 testGroup "Page Tests" [
530 testCase "Links are replaced with spans" test_preprocess_links,
531 testCase "Image srcs are made absolute" test_absolve_images,
532 testCase "Comment links are removed" test_comments_removed,
533 testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]