]> gitweb.michael.orlitzky.com - dead/lwn-epub.git/blob - src/LWN/XHTML.hs
Only fetch login cookies once.
[dead/lwn-epub.git] / src / LWN / XHTML.hs
1 module LWN.XHTML (
2 XHTML,
3 XML,
4 full_story_urls,
5 image_srcs,
6 full_story_link,
7 full_story_paragraph,
8 is_image,
9 parse_lwn,
10 preprocess,
11 remove_byline,
12 remove_title,
13 to_xhtml,
14 to_xml,
15 xml_from_contents)
16 where
17
18 import Text.HandsomeSoup (css)
19 import Text.Regex.Posix ((=~))
20 import Text.XML.HXT.Core (
21 (>>>),
22 (/>),
23 ArrowXml,
24 IOStateArrow,
25 SysConfigList,
26 XmlTree,
27 changeAttrValue,
28 deep,
29 getAttrValue,
30 hasAttrValue,
31 hasName,
32 hasText,
33 ifA,
34 isElem,
35 mkName,
36 no,
37 none,
38 processAttrl,
39 processTopDown,
40 readString,
41 setElemName,
42 this,
43 when,
44 withParseHTML,
45 withValidate,
46 withWarnings,
47 yes)
48
49
50 import LWN.URI (URL, try_make_absolute_url)
51 import Misc (contains)
52
53
54 class XHTML a where
55 to_xhtml :: a -> String
56
57 class XML a where
58 to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
59
60 -- | Options used when parsing HTML.
61 my_read_opts :: SysConfigList
62 my_read_opts = [ withValidate no,
63 withParseHTML yes,
64 withWarnings no ]
65
66 -- | My version of HandsomeSoup's parseHTML.
67 parse_lwn :: String -> IOStateArrow s b XmlTree
68 parse_lwn = readString my_read_opts
69
70
71 -- | Takes the result of get_article_contents and calls parse_lwn on
72 -- the contained value.
73 xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
74 xml_from_contents =
75 fmap parse_lwn
76
77
78
79 -- | Preprocessing common to both page types.
80 preprocess :: (ArrowXml a) => a XmlTree XmlTree
81 preprocess =
82 make_image_srcs_absolute
83 >>>
84 remove_comment_links
85 >>>
86 replace_links_with_spans
87
88
89 is_link :: (ArrowXml a) => a XmlTree XmlTree
90 is_link =
91 isElem >>> hasName "a"
92
93
94 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
95 remove_comment_links =
96 processTopDown $ kill_comments `when` is_link
97 where
98 is_comment_link =
99 hasAttrValue "href" (contains "#Comments")
100
101 kill_comments =
102 none `when` is_comment_link
103
104 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
105 replace_links_with_spans =
106 processTopDown $ (make_span >>> remove_attrs) `when` is_link
107 where
108 make_span = setElemName $ mkName "span"
109 remove_attrs = processAttrl none
110
111
112 is_title :: (ArrowXml a) => a XmlTree XmlTree
113 is_title =
114 (hasName "h2")
115 >>>
116 (hasAttrValue "class" (== "SummaryHL"))
117
118
119 is_byline :: (ArrowXml a) => a XmlTree XmlTree
120 is_byline =
121 (hasName "div")
122 >>>
123 (hasAttrValue "class" (== "FeatureByLine"))
124
125
126 is_image :: (ArrowXml a) => a XmlTree XmlTree
127 is_image = isElem >>> hasName "img"
128
129 remove_title :: (ArrowXml a) => a XmlTree XmlTree
130 remove_title =
131 processTopDown ((none) `when` is_title)
132
133
134 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
135 remove_byline =
136 processTopDown ((none) `when` is_byline)
137
138
139 image_srcs :: (ArrowXml a) => a XmlTree URL
140 image_srcs =
141 css "img"
142 >>>
143 getAttrValue "src"
144
145
146 full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
147 full_story_paragraph =
148 isElem
149 >>>
150 hasName "p"
151 >>>
152 ifA
153 (this /> full_story_link)
154 this
155 none
156
157
158 -- Without regard to the parent paragraph.
159 full_story_link :: (ArrowXml a) => a XmlTree XmlTree
160 full_story_link =
161 isElem
162 >>>
163 hasName "a"
164 >>>
165 ifA
166 (this /> hasText (=~ "Full Story"))
167 this
168 none
169
170
171 -- | Get the hrefs of all full story links.
172 full_story_urls :: (ArrowXml a) => a XmlTree URL
173 full_story_urls =
174 deep $
175 full_story_paragraph
176 />
177 full_story_link
178 >>>
179 getAttrValue "href"
180
181
182
183 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
184 make_image_srcs_absolute =
185 processTopDown (make_srcs_absolute `when` is_image)
186 where
187 change_src :: (ArrowXml a) => a XmlTree XmlTree
188 change_src =
189 changeAttrValue try_make_absolute_url
190
191 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
192 make_srcs_absolute =
193 processAttrl $ change_src `when` hasName "src"