0e699a0978953ad61e9aaa7105c97025b11be3c2
[dead/lwn-epub.git] / src / LWN / ArticlePage.hs
1 module LWN.ArticlePage
2 where
3
4 import Data.String.Utils (strip)
5 import Data.Maybe (fromJust)
6 import Data.Tree.NTree.TypeDefs (NTree)
7 import Text.XML.HXT.Core (
8 IOSArrow,
9 XmlTree,
10 XNode,
11 (>>>),
12 (/>),
13 getText,
14 runX,
15 xshow
16 )
17 import Text.HandsomeSoup (css)
18
19 import Epublishable
20 import LWN.Article
21 import XHTML
22
23 -- | Defines the ArticlePage data type, containing one 'Article'.
24 data ArticlePage = ArticlePage { article :: Article }
25
26
27 instance XHTML ArticlePage where
28 to_xhtml (ArticlePage a) =
29 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
30 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
31 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
32 "\n<head>\n" ++
33 " <meta http-equiv=\"Content-Type\"" ++
34 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
35 " <title>" ++ (headline a) ++ "</title>\n" ++
36 "</head>\n" ++
37 "<body>\n" ++
38 "<div>\n\n" ++
39 (to_xhtml a) ++
40 "\n\n</div>\n" ++
41 "\n</body>\n" ++
42 "</html>"
43
44
45 instance Epublishable ArticlePage where
46 parse xml = do
47 articles <- parse_articles xml
48 case articles of
49 Just [x] -> return $ Just $ ArticlePage x
50 _ -> return Nothing
51
52 title (ArticlePage x) = headline x
53
54
55 -- | Takes data from an LWN page and determines whether or not it's a
56 -- single article (as opposed to a page with multiple articles).
57 is_article_page :: String -> IO Bool
58 is_article_page _ = return True
59
60
61 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
62 parse_headline xml = do
63 let element_filter = xml >>> css "div.PageHeadline h1"
64 let element_text_filter = element_filter /> getText
65 element_text <- runX element_text_filter
66 return $ case element_text of
67 [x] -> Just $ strip x
68 [] -> Nothing
69 _ -> error "Found more than one headline."
70
71 parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
72 parse_byline xml = do
73 let element_filter = xml >>> css "div.Byline"
74 let element_text_filter = element_filter /> getText
75 element_text <- runX element_text_filter
76 return $ case element_text of
77 [x] -> Just $ strip x
78 [] -> Nothing
79 _ -> error "Found more than one byline."
80
81
82 parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
83 parse_body xml = do
84 let element_filter = xml >>> css "div.ArticleText"
85 let element_html_filter = xshow element_filter
86 element_html <- runX element_html_filter
87 return $ case element_html of
88 [x] -> Just x
89 [] -> Nothing
90 _ -> error "Found more than one article."
91
92
93 parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
94 parse_articles xml = do
95 parsed_headline <- parse_headline xml
96 parsed_byline <- parse_byline xml
97 parsed_body <- parse_body xml
98 let headline' = fromJust parsed_headline
99 let byline' = fromJust parsed_byline
100 let body' = fromJust parsed_body
101 return $ Just $ [Article headline' byline' body']