module LWN.XHTML (
XHTML,
+ XML,
+ full_story_urls,
+ image_srcs,
+ full_story_link,
+ full_story_paragraph,
+ is_image,
parse_lwn,
- to_xhtml
- )
+ preprocess,
+ remove_byline,
+ remove_full_story_paragraphs,
+ remove_title,
+ to_xhtml,
+ to_xml,
+ xml_from_contents)
where
+import Data.String.Utils (replace)
+import Text.HandsomeSoup (css)
import Text.XML.HXT.Core (
+ (>>>),
+ (/>),
+ ArrowXml,
IOStateArrow,
SysConfigList,
XmlTree,
+ changeAttrValue,
+ changeText,
+ deep,
+ getAttrValue,
+ hasAttrValue,
+ hasName,
+ hasText,
+ ifA,
+ isElem,
+ mkName,
no,
+ none,
+ processAttrl,
+ processTopDown,
readString,
+ setElemName,
+ this,
+ when,
withParseHTML,
withValidate,
withWarnings,
- yes
- )
+ yes)
+
+
+import LWN.URI (URL, try_make_absolute_url)
+import Misc (contains)
+
class XHTML a where
to_xhtml :: a -> String
-
+class XML a where
+ to_xml :: (ArrowXml b) => a -> (b XmlTree XmlTree)
-- | Options used when parsing HTML.
my_read_opts :: SysConfigList
-- | My version of HandsomeSoup's parseHTML.
parse_lwn :: String -> IOStateArrow s b XmlTree
parse_lwn = readString my_read_opts
+
+
+-- | Takes the result of get_article_contents and calls parse_lwn on
+-- the contained value.
+xml_from_contents :: (Maybe String) -> Maybe (IOStateArrow s b XmlTree)
+xml_from_contents =
+ fmap parse_lwn
+
+
+
+-- | Preprocessing common to both page types.
+preprocess :: (ArrowXml a) => a XmlTree XmlTree
+preprocess =
+ make_image_srcs_absolute
+ >>>
+ remove_comment_links
+ >>>
+ replace_links_with_spans
+ >>>
+ replace_double_newline_in_pre
+
+
+is_link :: (ArrowXml a) => a XmlTree XmlTree
+is_link =
+ isElem >>> hasName "a"
+
+is_pre :: (ArrowXml a) => a XmlTree XmlTree
+is_pre =
+ isElem
+ >>>
+ hasName "pre"
+
+replace_double_newline_in_pre :: (ArrowXml a) => a XmlTree XmlTree
+replace_double_newline_in_pre =
+ processTopDown $ fix_it `when` is_pre
+ where
+ fix_it :: (ArrowXml a) => a XmlTree XmlTree
+ fix_it =
+ changeText $ replace "\n\n" "<br /><br />"
+
+remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree
+remove_full_story_paragraphs =
+ processTopDown $ none `when` full_story_paragraph
+
+
+remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree
+remove_comment_links =
+ processTopDown $ kill_comments `when` is_link
+ where
+ is_comment_link =
+ hasAttrValue "href" (contains "#Comments")
+
+ kill_comments =
+ none `when` is_comment_link
+
+replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree
+replace_links_with_spans =
+ processTopDown $ (make_span >>> remove_attrs) `when` is_link
+ where
+ make_span = setElemName $ mkName "span"
+ remove_attrs = processAttrl none
+
+
+is_title :: (ArrowXml a) => a XmlTree XmlTree
+is_title =
+ (hasName "h2")
+ >>>
+ (hasAttrValue "class" (== "SummaryHL"))
+
+
+is_byline :: (ArrowXml a) => a XmlTree XmlTree
+is_byline =
+ (hasName "div")
+ >>>
+ (hasAttrValue "class" (== "FeatureByLine"))
+
+
+is_image :: (ArrowXml a) => a XmlTree XmlTree
+is_image = isElem >>> hasName "img"
+
+remove_title :: (ArrowXml a) => a XmlTree XmlTree
+remove_title =
+ processTopDown ((none) `when` is_title)
+
+
+remove_byline :: (ArrowXml a) => a XmlTree XmlTree
+remove_byline =
+ processTopDown ((none) `when` is_byline)
+
+
+image_srcs :: (ArrowXml a) => a XmlTree URL
+image_srcs =
+ css "img"
+ >>>
+ getAttrValue "src"
+
+
+full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
+full_story_paragraph =
+ isElem
+ >>>
+ hasName "p"
+ >>>
+ ifA
+ (this /> full_story_link)
+ this
+ none
+
+
+-- Without regard to the parent paragraph.
+full_story_link :: (ArrowXml a) => a XmlTree XmlTree
+full_story_link =
+ isElem
+ >>>
+ hasName "a"
+ >>>
+ ifA
+ (this /> hasText (== "Full Story"))
+ this
+ none
+
+
+-- | Get the hrefs of all full story links.
+full_story_urls :: (ArrowXml a) => a XmlTree URL
+full_story_urls =
+ deep $
+ full_story_paragraph
+ />
+ full_story_link
+ >>>
+ getAttrValue "href"
+
+
+
+make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
+make_image_srcs_absolute =
+ processTopDown (make_srcs_absolute `when` is_image)
+ where
+ change_src :: (ArrowXml a) => a XmlTree XmlTree
+ change_src =
+ changeAttrValue try_make_absolute_url
+
+ make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
+ make_srcs_absolute =
+ processAttrl $ change_src `when` hasName "src"