X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FXHTML.hs;h=c54bada32538c97e5e0f6d9c5fe579a37e2a4f77;hp=8dfe3b2e85645c451b601a58af96107f571e589b;hb=ce53d0d1ff76cc45f1f66504976a0549999677c0;hpb=aad40cd8e1e8c84c5fc294674a7159bb40838440 diff --git a/src/LWN/XHTML.hs b/src/LWN/XHTML.hs index 8dfe3b2..c54bada 100644 --- a/src/LWN/XHTML.hs +++ b/src/LWN/XHTML.hs @@ -1,11 +1,15 @@ module LWN.XHTML ( XHTML, XML, + full_story_urls, image_srcs, + full_story_link, + full_story_paragraph, is_image, parse_lwn, preprocess, remove_byline, + remove_full_story_paragraphs, remove_title, to_xhtml, to_xml, @@ -15,14 +19,18 @@ where import Text.HandsomeSoup (css) import Text.XML.HXT.Core ( (>>>), + (/>), ArrowXml, IOStateArrow, SysConfigList, XmlTree, changeAttrValue, + deep, getAttrValue, hasAttrValue, hasName, + hasText, + ifA, isElem, mkName, no, @@ -31,12 +39,14 @@ import Text.XML.HXT.Core ( processTopDown, readString, setElemName, + this, when, withParseHTML, withValidate, withWarnings, yes) + import LWN.URI (URL, try_make_absolute_url) import Misc (contains) @@ -81,6 +91,11 @@ is_link = isElem >>> hasName "a" +remove_full_story_paragraphs :: (ArrowXml a) => a XmlTree XmlTree +remove_full_story_paragraphs = + processTopDown $ none `when` full_story_paragraph + + remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree remove_comment_links = processTopDown $ kill_comments `when` is_link @@ -132,6 +147,44 @@ image_srcs = >>> getAttrValue "src" + +full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree +full_story_paragraph = + isElem + >>> + hasName "p" + >>> + ifA + (this /> full_story_link) + this + none + + +-- Without regard to the parent paragraph. +full_story_link :: (ArrowXml a) => a XmlTree XmlTree +full_story_link = + isElem + >>> + hasName "a" + >>> + ifA + (this /> hasText (== "Full Story")) + this + none + + +-- | Get the hrefs of all full story links. +full_story_urls :: (ArrowXml a) => a XmlTree URL +full_story_urls = + deep $ + full_story_paragraph + /> + full_story_link + >>> + getAttrValue "href" + + + make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_image_srcs_absolute = processTopDown (make_srcs_absolute `when` is_image)