]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/XHTML.hs
c54bada32538c97e5e0f6d9c5fe579a37e2a4f77
[dead/lwn-epub.git] / src / LWN / XHTML.hs
1 module LWN.XHTML (
2 XHTML,
3 XML,
4 full_story_urls,
5 image_srcs,
6 full_story_link,
7 full_story_paragraph,
8 is_image,
9 parse_lwn,
10 preprocess,
11 remove_byline,
12 remove_full_story_paragraphs,
13 remove_title,
14 to_xhtml,
15 to_xml,
16 xml_from_contents)
17 where
18
19 import Text.HandsomeSoup (css)
20 import Text.XML.HXT.Core (
21 (>>>),
22 (/>),
23 ArrowXml,
24 IOStateArrow,
25 SysConfigList,
26 XmlTree,
27 changeAttrValue,
28 deep,
29 getAttrValue,
30 hasAttrValue,
31 hasName,
32 hasText,
33 ifA,
34 isElem,
35 mkName,
36 no,
37 none,
38 processAttrl,
39 processTopDown,
40 readString,
41 setElemName,
42 this,
43 when,
44 withParseHTML,
45 withValidate,
46 withWarnings,
47 yes)
48
49
50 import LWN.URI (URL, try_make_absolute_url)
51 import Misc (contains)
52
53
54 class XHTML a where
55 to_xhtml :: a -> String
56
57 class XML a where
58 to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
59
60 -- | Options used when parsing HTML.
61 my_read_opts :: SysConfigList
62 my_read_opts = [ withValidate no,
63 withParseHTML yes,
64 withWarnings no ]
65
66 -- | My version of HandsomeSoup's parseHTML.
67 parse_lwn :: String -> IOStateArrow s b XmlTree
68 parse_lwn = readString my_read_opts
69
70
71 -- | Takes the result of get_article_contents and calls parse_lwn on
72 -- the contained value.
73 xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
74 xml_from_contents =
75 fmap parse_lwn
76
77
78
79 -- | Preprocessing common to both page types.
80 preprocess :: (ArrowXml a) => a XmlTree XmlTree
81 preprocess =
82 make_image_srcs_absolute
83 >>>
84 remove_comment_links
85 >>>
86 replace_links_with_spans
87
88
89 is_link :: (ArrowXml a) => a XmlTree XmlTree
90 is_link =
91 isElem >>> hasName "a"
92
93
94 remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree
95 remove_full_story_paragraphs =
96 processTopDown $ none `when` full_story_paragraph
97
98
99 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
100 remove_comment_links =
101 processTopDown $ kill_comments `when` is_link
102 where
103 is_comment_link =
104 hasAttrValue "href" (contains "#Comments")
105
106 kill_comments =
107 none `when` is_comment_link
108
109 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
110 replace_links_with_spans =
111 processTopDown $ (make_span >>> remove_attrs) `when` is_link
112 where
113 make_span = setElemName $ mkName "span"
114 remove_attrs = processAttrl none
115
116
117 is_title :: (ArrowXml a) => a XmlTree XmlTree
118 is_title =
119 (hasName "h2")
120 >>>
121 (hasAttrValue "class" (== "SummaryHL"))
122
123
124 is_byline :: (ArrowXml a) => a XmlTree XmlTree
125 is_byline =
126 (hasName "div")
127 >>>
128 (hasAttrValue "class" (== "FeatureByLine"))
129
130
131 is_image :: (ArrowXml a) => a XmlTree XmlTree
132 is_image = isElem >>> hasName "img"
133
134 remove_title :: (ArrowXml a) => a XmlTree XmlTree
135 remove_title =
136 processTopDown ((none) `when` is_title)
137
138
139 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
140 remove_byline =
141 processTopDown ((none) `when` is_byline)
142
143
144 image_srcs :: (ArrowXml a) => a XmlTree URL
145 image_srcs =
146 css "img"
147 >>>
148 getAttrValue "src"
149
150
151 full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
152 full_story_paragraph =
153 isElem
154 >>>
155 hasName "p"
156 >>>
157 ifA
158 (this /> full_story_link)
159 this
160 none
161
162
163 -- Without regard to the parent paragraph.
164 full_story_link :: (ArrowXml a) => a XmlTree XmlTree
165 full_story_link =
166 isElem
167 >>>
168 hasName "a"
169 >>>
170 ifA
171 (this /> hasText (== "Full Story"))
172 this
173 none
174
175
176 -- | Get the hrefs of all full story links.
177 full_story_urls :: (ArrowXml a) => a XmlTree URL
178 full_story_urls =
179 deep $
180 full_story_paragraph
181 />
182 full_story_link
183 >>>
184 getAttrValue "href"
185
186
187
188 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
189 make_image_srcs_absolute =
190 processTopDown (make_srcs_absolute `when` is_image)
191 where
192 change_src :: (ArrowXml a) => a XmlTree XmlTree
193 change_src =
194 changeAttrValue try_make_absolute_url
195
196 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
197 make_srcs_absolute =
198 processAttrl $ change_src `when` hasName "src"