12 remove_full_story_paragraphs,
19 import Text.HandsomeSoup (css)
20 import Text.XML.HXT.Core (
50 import LWN.URI (URL, try_make_absolute_url)
51 import Misc (contains)
55 to_xhtml :: a -> String
58 to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
60 -- | Options used when parsing HTML.
61 my_read_opts :: SysConfigList
62 my_read_opts = [ withValidate no,
66 -- | My version of HandsomeSoup's parseHTML.
67 parse_lwn :: String -> IOStateArrow s b XmlTree
68 parse_lwn = readString my_read_opts
71 -- | Takes the result of get_article_contents and calls parse_lwn on
72 -- the contained value.
73 xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
79 -- | Preprocessing common to both page types.
80 preprocess :: (ArrowXml a) => a XmlTree XmlTree
82 make_image_srcs_absolute
86 replace_links_with_spans
89 is_link :: (ArrowXml a) => a XmlTree XmlTree
91 isElem >>> hasName "a"
94 remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree
95 remove_full_story_paragraphs =
96 processTopDown $ none `when` full_story_paragraph
99 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
100 remove_comment_links =
101 processTopDown $ kill_comments `when` is_link
104 hasAttrValue "href" (contains "#Comments")
107 none `when` is_comment_link
109 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
110 replace_links_with_spans =
111 processTopDown $ (make_span >>> remove_attrs) `when` is_link
113 make_span = setElemName $ mkName "span"
114 remove_attrs = processAttrl none
117 is_title :: (ArrowXml a) => a XmlTree XmlTree
121 (hasAttrValue "class" (== "SummaryHL"))
124 is_byline :: (ArrowXml a) => a XmlTree XmlTree
128 (hasAttrValue "class" (== "FeatureByLine"))
131 is_image :: (ArrowXml a) => a XmlTree XmlTree
132 is_image = isElem >>> hasName "img"
134 remove_title :: (ArrowXml a) => a XmlTree XmlTree
136 processTopDown ((none) `when` is_title)
139 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
141 processTopDown ((none) `when` is_byline)
144 image_srcs :: (ArrowXml a) => a XmlTree URL
151 full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
152 full_story_paragraph =
158 (this /> full_story_link)
163 -- Without regard to the parent paragraph.
164 full_story_link :: (ArrowXml a) => a XmlTree XmlTree
171 (this /> hasText (== "Full Story"))
176 -- | Get the hrefs of all full story links.
177 full_story_urls :: (ArrowXml a) => a XmlTree URL
188 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
189 make_image_srcs_absolute =
190 processTopDown (make_srcs_absolute `when` is_image)
192 change_src :: (ArrowXml a) => a XmlTree XmlTree
194 changeAttrValue try_make_absolute_url
196 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
198 processAttrl $ change_src `when` hasName "src"