18 import Text.HandsomeSoup (css)
19 import Text.Regex.Posix ((=~))
20 import Text.XML.HXT.Core (
50 import LWN.URI (URL, try_make_absolute_url)
51 import Misc (contains)
55 to_xhtml :: a -> String
58 to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
60 -- | Options used when parsing HTML.
61 my_read_opts :: SysConfigList
62 my_read_opts = [ withValidate no,
66 -- | My version of HandsomeSoup's parseHTML.
67 parse_lwn :: String -> IOStateArrow s b XmlTree
68 parse_lwn = readString my_read_opts
71 -- | Takes the result of get_article_contents and calls parse_lwn on
72 -- the contained value.
73 xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
79 -- | Preprocessing common to both page types.
80 preprocess :: (ArrowXml a) => a XmlTree XmlTree
82 make_image_srcs_absolute
86 replace_links_with_spans
89 is_link :: (ArrowXml a) => a XmlTree XmlTree
91 isElem >>> hasName "a"
94 remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
95 remove_comment_links =
96 processTopDown $ kill_comments `when` is_link
99 hasAttrValue "href" (contains "#Comments")
102 none `when` is_comment_link
104 replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
105 replace_links_with_spans =
106 processTopDown $ (make_span >>> remove_attrs) `when` is_link
108 make_span = setElemName $ mkName "span"
109 remove_attrs = processAttrl none
112 is_title :: (ArrowXml a) => a XmlTree XmlTree
116 (hasAttrValue "class" (== "SummaryHL"))
119 is_byline :: (ArrowXml a) => a XmlTree XmlTree
123 (hasAttrValue "class" (== "FeatureByLine"))
126 is_image :: (ArrowXml a) => a XmlTree XmlTree
127 is_image = isElem >>> hasName "img"
129 remove_title :: (ArrowXml a) => a XmlTree XmlTree
131 processTopDown ((none) `when` is_title)
134 remove_byline :: (ArrowXml a) => a XmlTree XmlTree
136 processTopDown ((none) `when` is_byline)
139 image_srcs :: (ArrowXml a) => a XmlTree URL
146 full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
147 full_story_paragraph =
153 (this /> full_story_link)
158 -- Without regard to the parent paragraph.
159 full_story_link :: (ArrowXml a) => a XmlTree XmlTree
166 (this /> hasText (=~ "Full Story"))
171 -- | Get the hrefs of all full story links.
172 full_story_urls :: (ArrowXml a) => a XmlTree URL
183 make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
184 make_image_srcs_absolute =
185 processTopDown (make_srcs_absolute `when` is_image)
187 change_src :: (ArrowXml a) => a XmlTree XmlTree
189 changeAttrValue try_make_absolute_url
191 make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
193 processAttrl $ change_src `when` hasName "src"