Begin work on the full page code.
[dead/lwn-epub.git] / src / LWN / FullPage.hs
1 module LWN.FullPage
2 where
3
4 import Data.String.Utils (split, strip)
5 import Data.Maybe (catMaybes, fromJust)
6 import Data.Tree.NTree.TypeDefs (NTree)
7 import Text.XML.HXT.Core (
8 IOSArrow,
9 XmlTree,
10 XNode,
11 (>>>),
12 (/>),
13 getText,
14 hasAttrValue,
15 hasName,
16 none,
17 processTopDown,
18 runX,
19 xshow,
20 when
21 )
22 import Text.XML.HXT.Arrow.Edit (indentDoc)
23 import Text.HandsomeSoup (css, parseHtml)
24
25 import Epublishable
26 import LWN.Article
27 import XHTML
28
29 -- | An LWN page with more than one article on it. These require
30 -- different parsing and display functions than the single-article
31 -- pages.
32 data FullPage = FullPage { headline :: String,
33 articles :: [Article] }
34
35 articles_xhtml :: FullPage -> String
36 articles_xhtml fp = concatMap to_xhtml (articles fp)
37
38 instance XHTML FullPage where
39 to_xhtml fp =
40 "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
41 "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
42 "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
43 "\n<head>\n" ++
44 " <meta http-equiv=\"Content-Type\"" ++
45 " content=\"application/xhtml+xml; charset=utf-8\" />" ++
46 " <title>" ++ (headline fp) ++ "</title>\n" ++
47 "</head>\n" ++
48 "<body>\n" ++
49 "<div>\n\n" ++
50 "<h1>" ++ (headline fp) ++ "</h1>" ++
51 (articles_xhtml fp) ++
52 "\n\n</div>\n" ++
53 "\n</body>\n" ++
54 "</html>"
55
56 instance Epublishable FullPage where
57 parse xml = do
58 hl <- parse_headline xml
59 parsed_articles <- parse_articles xml
60 case parsed_articles of
61 them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
62 _ -> return Nothing
63
64 title _ = "LWN.net"
65
66
67 parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
68 parse_headline xml = do
69 let element_filter = xml >>> css "div.PageHeadline h1"
70 let element_text_filter = element_filter /> getText
71 element_text <- runX element_text_filter
72 return $ case element_text of
73 [x] -> Just $ strip x
74 [] -> Nothing
75 _ -> error "Found more than one headline."
76
77 parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
78 parse_article_byline xml = do
79 let element_filter = xml >>> css "div.FeatureByLine"
80 let element_text_filter = element_filter /> getText
81 element_text <- runX element_text_filter
82 return $ case element_text of
83 [x] -> Just $ strip x
84 [] -> Nothing
85 _ -> error "Found more than one article byline."
86
87
88 parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
89 parse_article_title xml = do
90 let element_filter = xml >>> css "h2.SummaryHL a"
91 let element_text_filter = element_filter /> getText
92 element_text <- runX element_text_filter
93 return $ case element_text of
94 [x] -> Just $ strip x
95 [] -> Nothing
96 _ -> error "Found more than one article title."
97
98
99 --is_title :: Integer
100 --is_title = (hasName "h2") >>> (hasAttrValue "class" (== "SummaryHL"))
101
102 --is_byline :: Integer
103 --is_byline = (hasName "div") >>> (hasAttrValue "class" (== "FeatureByLine"))
104
105 parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
106 parse_article_body xml = do
107 -- First, delete the article title and byline.
108 --let clean_xml = xml >>> processTopDown ((none) `when` is_title) >>> processTopDown ((none) `when` is_byline)
109 let clean_xml = xml
110 clean_html <- runX $ xshow clean_xml
111 return $ case clean_html of
112 [x] -> Just x
113 [] -> Nothing
114 _ -> error "Found more than one article body."
115
116 parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
117 parse_article xml = do
118 parsed_article_title <- parse_article_title xml
119 parsed_article_byline <- parse_article_byline xml
120 parsed_article_body <- parse_article_body xml
121 -- let title' = fromJust parsed_article_title
122 let title' = "title"
123 -- let byline' = fromJust parsed_article_byline
124 let byline' = "byline"
125 -- let body' = fromJust parsed_article_body
126 body' <- runX . xshow $ xml
127 return $ Just $ Article title' byline' (body' !! 0)
128
129 parse_html_article :: String -> IO (Maybe Article)
130 parse_html_article html = do
131 let xml = parseHtml html
132 parse_article xml
133
134
135 -- | In the full page, all of the article titles and bodies are
136 -- wrapped in a div.ArticleText.
137 parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
138 parse_bodies xml =
139 xml >>> css "div.ArticleText"
140
141
142
143 parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
144 parse_articles xml = do
145 bodies <- runX . xshow $ (parse_bodies xml) >>> indentDoc
146 let article_separator = "<h2 class=\"SummaryHL\">"
147 let split_articles'' = split article_separator (bodies !! 0)
148 -- The first element will contain the crap before the first <h2...>.
149 let split_articles' = tail split_articles''
150 -- Put the separator back, it was lost during the split.
151 let split_articles = map (("\n" ++ article_separator) ++) split_articles'
152 putStrLn "split articles\n\n"
153 mapM putStrLn split_articles
154 real_articles <- mapM parse_html_article split_articles
155 let just_articles = catMaybes real_articles
156 return just_articles
157
158