X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FXHTML.hs;h=a2f103fa0d83b7ae433f239036e02456eb6e08f3;hp=8dfe3b2e85645c451b601a58af96107f571e589b;hb=fc0052e451aa03675ebd9a128dfa46573b9357d7;hpb=aad40cd8e1e8c84c5fc294674a7159bb40838440 diff --git a/src/LWN/XHTML.hs b/src/LWN/XHTML.hs index 8dfe3b2..a2f103f 100644 --- a/src/LWN/XHTML.hs +++ b/src/LWN/XHTML.hs @@ -1,7 +1,10 @@ module LWN.XHTML ( XHTML, XML, + full_story_urls, image_srcs, + full_story_link, + full_story_paragraph, is_image, parse_lwn, preprocess, @@ -13,16 +16,21 @@ module LWN.XHTML ( where import Text.HandsomeSoup (css) +import Text.Regex.Posix ((=~)) import Text.XML.HXT.Core ( (>>>), + (/>), ArrowXml, IOStateArrow, SysConfigList, XmlTree, changeAttrValue, + deep, getAttrValue, hasAttrValue, hasName, + hasText, + ifA, isElem, mkName, no, @@ -31,12 +39,14 @@ import Text.XML.HXT.Core ( processTopDown, readString, setElemName, + this, when, withParseHTML, withValidate, withWarnings, yes) + import LWN.URI (URL, try_make_absolute_url) import Misc (contains) @@ -132,6 +142,44 @@ image_srcs = >>> getAttrValue "src" + +full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree +full_story_paragraph = + isElem + >>> + hasName "p" + >>> + ifA + (this /> full_story_link) + this + none + + +-- Without regard to the parent paragraph. +full_story_link :: (ArrowXml a) => a XmlTree XmlTree +full_story_link = + isElem + >>> + hasName "a" + >>> + ifA + (this /> hasText (=~ "Full Story")) + this + none + + +-- | Get the hrefs of all full story links. +full_story_urls :: (ArrowXml a) => a XmlTree URL +full_story_urls = + deep $ + full_story_paragraph + /> + full_story_link + >>> + getAttrValue "href" + + + make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_image_srcs_absolute = processTopDown (make_srcs_absolute `when` is_image)