97171c641be08a6761a1a1800002190c7f10b9ed
[dead/lwn-epub.git] / src / LWN / Page.hs
1 {-# LANGUAGE DoAndIfThenElse #-}
2
3 module LWN.Page
4 where
5
6 import qualified Data.Map as Map (lookup)
7 import Data.Time (getCurrentTime)
8 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
9 import Data.String.Utils (split, strip)
10 import Data.Maybe (catMaybes, fromJust, isNothing)
11 import Prelude hiding (readFile)
12 import System.IO (Handle, hClose, hFlush)
13 import Test.HUnit (Assertion, assertEqual)
14 import Test.Framework (Test, testGroup)
15 import Test.Framework.Providers.HUnit (testCase)
16 import Text.Pandoc (
17 defaultParserState,
18 defaultWriterOptions,
19 readHtml,
20 writeEPUB,
21 writerEPUBMetadata)
22 import Text.XML.HXT.Core (
23 ArrowXml,
24 IOSArrow,
25 XmlTree,
26 (>>>),
27 (/>),
28 (//>),
29 changeAttrValue,
30 getChildren,
31 getText,
32 hasName,
33 processAttrl,
34 processTopDown,
35 runX,
36 xshow,
37 when)
38 import Text.HandsomeSoup (css, parseHtml)
39
40 import Configuration (Cfg)
41 import LWN.Article
42 import LWN.HTTP (
43 ImageMap,
44 download_image_urls,
45 get_article_contents)
46 import LWN.URI (URL)
47 import LWN.XHTML (
48 XHTML,
49 image_srcs,
50 is_image,
51 preprocess,
52 remove_byline,
53 remove_title,
54 to_xhtml,
55 xml_from_contents)
56
57
58
59
60 data Page =
61 -- | An LWN page with one article on it.
62 ArticlePage { article :: Article } |
63
64 -- | An LWN page with more than one article on it. These require
65 -- different parsing and display functions than the single-article
66 -- pages.
67 FullPage { headline :: String,
68 articles :: [Article] }
69
70
71 instance XHTML Page where
72 to_xhtml (ArticlePage a) =
73 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
74 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
75 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
76 "<head>" ++
77 " <meta http-equiv=\"Content-Type\"" ++
78 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
79 " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
80 "</head>" ++
81 "<body>" ++
82 (to_xhtml a) ++
83 "</body>" ++
84 "</html>"
85
86 to_xhtml (FullPage hl as) =
87 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
88 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
89 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
90 "<head>" ++
91 " <meta http-equiv=\"Content-Type\"" ++
92 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
93 " <title>" ++ hl ++ "</title>" ++
94 "</head>" ++
95 "<body>" ++
96 "<div>" ++
97 "<h1>" ++ hl ++ "</h1>" ++
98 (concatMap to_xhtml as) ++
99 "</div>" ++
100 "</body>" ++
101 "</html>"
102
103
104
105 page_from_url :: Cfg -> URL -> IO (Maybe Page)
106 page_from_url cfg url = do
107 contents <- get_article_contents cfg url
108 case (xml_from_contents contents) of
109 Just html -> parse html
110 Nothing -> return Nothing
111
112
113
114 -- Should be called *after* preprocessing.
115 download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
116 download_images xml = do
117 image_urls <- runX $ xml >>> image_srcs
118 download_image_urls image_urls
119
120
121
122 replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
123 replace_remote_img_srcs image_map =
124 processTopDown (make_srcs_local `when` is_image)
125 where
126 -- old_src -> new_src
127 change_src_func :: String -> String
128 change_src_func old_src =
129 case Map.lookup old_src image_map of
130 -- Leave it alone if we don't have the file locally
131 Nothing -> old_src
132 Just v -> v
133
134 change_src :: (ArrowXml a) => a XmlTree XmlTree
135 change_src =
136 changeAttrValue change_src_func
137
138 make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree
139 make_srcs_local =
140 processAttrl $ (change_src `when` (hasName "src"))
141
142
143 parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
144 parse xml = do
145 let clean_xml = xml >>> preprocess
146 image_map <- download_images clean_xml
147 let local_xml = clean_xml >>> replace_remote_img_srcs image_map
148 appr <- ap_parse local_xml
149 fppr <- fp_parse local_xml
150 return $
151 if (isNothing appr) then
152 fppr
153 else
154 appr
155
156
157
158 parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
159 parse_headline xml = do
160 let element_filter = xml >>> css "div.PageHeadline h1"
161 let element_text_filter = element_filter /> getText
162 element_text <- runX element_text_filter
163 return $
164 case element_text of
165 [x] -> Just $ strip x
166 [] -> Nothing
167 _ -> error "Found more than one headline."
168
169
170 parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
171 parse_byline xml = do
172 let element_filter = xml >>> css "div.FeatureByLine"
173 let element_text_filter = element_filter /> getText
174 element_text <- runX element_text_filter
175 return $
176 case element_text of
177 [x] -> Just $ strip x
178 [] -> Nothing
179 _ -> error "Found more than one article byline."
180
181
182 --
183 -- ArticlePage Stuff
184 --
185 ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
186 ap_parse xml = do
187 arts <- ap_parse_articles xml
188 case arts of
189 [x] -> return $ Just $ ArticlePage x
190 _ -> return Nothing
191
192
193 ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
194 ap_parse_body xml = do
195 let element_filter = xml >>> css "div.ArticleText"
196 let element_html_filter = xshow element_filter
197 element_html <- runX element_html_filter
198 return $ case element_html of
199 [x] -> Just x
200 [] -> Nothing
201 _ -> error "Found more than one article."
202
203
204 ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
205 ap_parse_articles xml = do
206 parsed_headline <- parse_headline xml
207 parsed_byline <- parse_byline xml
208 parsed_body <- ap_parse_body xml
209
210 putStrLn $ fromJust parsed_headline
211
212 if (isNothing parsed_headline) || (isNothing parsed_body)
213 then return []
214 else do
215 let title' = Title $ fromJust parsed_headline
216 let byline' = Byline parsed_byline
217 let body' = BodyHtml $ fromJust parsed_body
218
219 return $ [Article title' byline' body']
220
221
222
223 --
224 -- FullPage Stuff
225 --
226
227 fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
228 fp_parse xml = do
229 hl <- parse_headline xml
230 parsed_articles <- fp_parse_articles xml
231 case parsed_articles of
232 [] -> return Nothing
233 x -> return $ Just $ FullPage (fromJust hl) x
234
235
236
237 fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
238 fp_parse_article_title xml = do
239 let element_filter = xml >>> css "h2.SummaryHL"
240 let element_text_filter = element_filter //> getText
241 element_text <- runX element_text_filter
242 return $ case element_text of
243 [x] -> Just $ strip x
244 [] -> Nothing
245 _ -> error "Found more than one article title."
246
247
248
249
250 fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
251 fp_parse_article_body xml = do
252 -- First, delete the article title and byline.
253 let clean_xml' = xml >>> remove_title >>> remove_byline
254 -- The only child of the body element should be a div.lwn-article
255 -- since we wrapped the article's HTML in that.
256 let clean_xml = clean_xml' >>> css "body" >>> getChildren
257 clean_html <- runX . xshow $ clean_xml
258 return $ case clean_html of
259 [x] -> Just x
260 [] -> Nothing
261 _ -> error "Found more than one article body."
262
263 fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
264 fp_parse_article xml = do
265 parsed_article_title <- fp_parse_article_title xml
266 parsed_article_byline <- parse_byline xml
267 parsed_article_body <- fp_parse_article_body xml
268
269 if (isNothing parsed_article_title) || (isNothing parsed_article_body)
270 then
271 return Nothing
272 else do
273 let title' = Title $ fromJust parsed_article_title
274 let byline' = Byline parsed_article_byline
275 let body' = BodyHtml $ fromJust parsed_article_body
276 return $ Just $ Article title' byline' body'
277
278 parse_html_article :: String -> IO (Maybe Article)
279 parse_html_article html = do
280 let xml = parseHtml $ wrap_in_body_div html
281 fp_parse_article xml
282
283
284 -- | In the full page, all of the article titles and bodies are
285 -- wrapped in one big div.ArticleText.
286 parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
287 parse_bodies xml =
288 xml >>> css "div.ArticleText"
289
290
291 fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
292 fp_parse_articles xml = do
293 bodies <- runX . xshow $ parse_bodies xml
294 let article_separator = "<h2 class=\"SummaryHL\">"
295 let split_articles'' = split article_separator (concat bodies)
296 -- The first element will contain the crap before the first <h2...>.
297 let split_articles' = case split_articles'' of
298 (_:_) -> tail split_articles''
299 [] -> []
300 -- Put the separator back, it was lost during the split.
301 let split_articles = map (article_separator ++) split_articles'
302 --_ <- mapM print_article split_articles
303 real_articles <- mapM parse_html_article split_articles
304 let just_articles = catMaybes real_articles
305 return just_articles
306
307
308 -- | This makes it easy to select otherwise-random chunks of html
309 -- using 'css'.
310 wrap_in_body_div :: String -> String
311 wrap_in_body_div s =
312 "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
313
314
315
316
317 --
318 -- Epublishable stuff
319 --
320
321 title :: Page -> String
322 title (ArticlePage a) = getTitle $ LWN.Article.title a
323 title (FullPage hl _) = hl
324
325
326 metadata :: Page -> IO String
327 metadata obj = do
328 date <- getCurrentTime
329 return $
330 "<dc:creator>http://lwn.net/</dc:creator>\n" ++
331 "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
332 "<dc:language>en-US</dc:language>\n" ++
333 "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
334 "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
335
336
337 epublish :: Page -> Handle -> IO ()
338 epublish obj handle = do
339 let xhtml = to_xhtml obj
340 epmd <- metadata obj
341 epub <- xhtml_to_epub epmd xhtml
342 B.hPut handle epub
343 hFlush handle
344 hClose handle
345
346 xhtml_to_epub :: String -> String -> IO B.ByteString
347 xhtml_to_epub epmd =
348 write_epub . read_html
349 where
350 my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
351 write_epub = writeEPUB Nothing [] my_writer_options
352 read_html = readHtml defaultParserState
353
354
355 --
356 -- Tests
357 --
358
359 test_preprocess_links :: Assertion
360 test_preprocess_links = do
361 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
362 let actual_xml = actual_xml' !! 0
363
364 expected_xml' <- runX $ expected_xml'' >>> css "body"
365 let expected_xml = expected_xml' !! 0
366
367 assertEqual
368 "Links are replaced with spans"
369 expected_xml
370 actual_xml
371 where
372 input_html = "<body><a href=\"#\">Hello, world!</a></body>"
373 input_xml = parseHtml input_html
374 expected_html = "<body><span>Hello, world!</span></body>"
375 expected_xml'' = parseHtml expected_html
376
377
378 test_absolve_images :: Assertion
379 test_absolve_images = do
380 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
381 let actual_xml = actual_xml' !! 0
382
383 expected_xml' <- runX $ expected_xml'' >>> css "body"
384 let expected_xml = expected_xml' !! 0
385
386 assertEqual
387 "Image srcs are made absolute"
388 expected_xml
389 actual_xml
390 where
391 input_html =
392 "<body>" ++
393 "<img src=\"/images/2012/example.jpg\" />" ++
394 "</body>"
395 input_xml = parseHtml input_html
396 expected_html =
397 "<body>" ++
398 "<img src=\"https://lwn.net/images/2012/example.jpg\" />" ++
399 "</body>"
400 expected_xml'' = parseHtml expected_html
401
402
403 test_comments_removed :: Assertion
404 test_comments_removed = do
405 actual_xml' <- runX $ input_xml >>> preprocess >>> css "body"
406 let actual_xml = actual_xml' !! 0
407
408 expected_xml' <- runX $ expected_xml'' >>> css "body"
409 let expected_xml = expected_xml' !! 0
410
411 assertEqual
412 "Comment links are removed"
413 expected_xml
414 actual_xml
415 where
416 input_html =
417 "<body><p>" ++
418 "<a href=\"/Articles/501490/#Comments\">Comments (6 posted)</a>" ++
419 "</p></body>"
420 input_xml = parseHtml input_html
421
422 expected_html = "<body><p></p></body>"
423 expected_xml'' = parseHtml expected_html
424
425
426
427 page_tests :: Test
428 page_tests =
429 testGroup "Page Tests" [
430 testCase "Links are replaced with spans" test_preprocess_links,
431 testCase "Image srcs are made absolute" test_absolve_images,
432 testCase "Comment links are removed" test_comments_removed ]