X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=a4a56c222a8ebd1533fff5e9773efbf2dfe22acf;hp=4d61cfbc5a4f2ad089e852bd13a459f2280f1cbf;hb=006b05fc061c0ce2fa0ce8b8c7f9361b0d64bb43;hpb=6f0e6cbece7e1b1a3c6b43d19eb2f29088af981c diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 4d61cfb..a4a56c2 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -8,6 +8,9 @@ import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust, isNothing) import Data.Tree.NTree.TypeDefs (NTree) +import Test.HUnit (Assertion, assertEqual) +import Test.Framework (Test, testGroup) +import Test.Framework.Providers.HUnit (testCase) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, @@ -20,9 +23,12 @@ import Text.XML.HXT.Core ( getText, hasAttrValue, hasName, + mkName, none, + processAttrl, processTopDown, runX, + setElemName, xshow, when ) @@ -83,10 +89,21 @@ remove_images = processTopDown ((none) `when` is_image) +is_link :: (ArrowXml a) => a XmlTree XmlTree +is_link = + hasName "a" + +replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree +replace_links_with_spans = + processTopDown $ (make_span >>> remove_attrs) `when` is_link + where + make_span = setElemName $ mkName "span" + remove_attrs = processAttrl none + -- | Preprocessing common to both page types. preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) preprocess xml = - xml >>>remove_images + xml >>> remove_images >>> replace_links_with_spans parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page) @@ -100,38 +117,43 @@ parse xml = do else appr --- --- ArticlePage Stuff --- -ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page) -ap_parse xml = do - arts <- ap_parse_articles xml - case arts of - Just [x] -> return $ Just $ ArticlePage x - _ -> return Nothing -ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -ap_parse_headline xml = do +parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_headline xml = do let element_filter = xml >>> css "div.PageHeadline h1" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one headline." + return $ + case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one headline." -ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -ap_parse_byline xml = do - let element_filter = xml >>> css "div.Byline" + +parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_byline xml = do + let element_filter = xml >>> css "div.FeatureByLine" let element_text_filter = element_filter /> getText element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one byline." + return $ + case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one article byline." +-- +-- ArticlePage Stuff +-- +ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page) +ap_parse xml = do + arts <- ap_parse_articles xml + case arts of + Just [x] -> return $ Just $ ArticlePage x + _ -> return Nothing + + ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) ap_parse_body xml = do let element_filter = xml >>> css "div.ArticleText" @@ -145,8 +167,8 @@ ap_parse_body xml = do ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article]) ap_parse_articles xml = do - parsed_headline <- ap_parse_headline xml - parsed_byline <- ap_parse_byline xml + parsed_headline <- parse_headline xml + parsed_byline <- parse_byline xml parsed_body <- ap_parse_body xml let title' = Title (fromJust parsed_headline) let byline' = Byline parsed_byline @@ -162,7 +184,7 @@ ap_parse_articles xml = do fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page) fp_parse xml = do - hl <- fp_parse_headline xml + hl <- parse_headline xml parsed_articles <- fp_parse_articles xml case parsed_articles of them@(_:_) -> return $ Just $ FullPage (fromJust hl) them @@ -170,28 +192,6 @@ fp_parse xml = do - -fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -fp_parse_headline xml = do - let element_filter = xml >>> css "div.PageHeadline h1" - let element_text_filter = element_filter /> getText - element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one headline." - -fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) -fp_parse_article_byline xml = do - let element_filter = xml >>> css "div.FeatureByLine" - let element_text_filter = element_filter /> getText - element_text <- runX element_text_filter - return $ case element_text of - [x] -> Just $ strip x - [] -> Nothing - _ -> error "Found more than one article byline." - - fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) fp_parse_article_title xml = do let element_filter = xml >>> css "h2.SummaryHL" @@ -222,7 +222,6 @@ is_image :: (ArrowXml a) => a XmlTree XmlTree is_image = hasName "img" - remove_title :: (ArrowXml a) => a XmlTree XmlTree remove_title = processTopDown ((none) `when` is_title) @@ -250,7 +249,7 @@ fp_parse_article_body xml = do fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article) fp_parse_article xml = do parsed_article_title <- fp_parse_article_title xml - parsed_article_byline <- fp_parse_article_byline xml + parsed_article_byline <- parse_byline xml parsed_article_body <- fp_parse_article_body xml let title' = Title $ fromJust parsed_article_title let byline' = Byline parsed_article_byline @@ -264,7 +263,7 @@ parse_html_article html = do -- | In the full page, all of the article titles and bodies are --- wrapped in a div.ArticleText. +-- wrapped in one big div.ArticleText. parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) parse_bodies xml = xml >>> css "div.ArticleText" @@ -285,6 +284,8 @@ fp_parse_articles xml = do return just_articles +-- | This makes it easy to select otherwise-random chunks of html +-- using 'css'. wrap_in_body_div :: String -> String wrap_in_body_div s = "
" ++ s ++ "
" @@ -327,3 +328,34 @@ xhtml_to_epub epmd = my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd } write_epub = writeEPUB Nothing [] my_writer_options read_html = readHtml defaultParserState + + + + + +-- +-- Tests +-- + +test_preprocess_links :: Assertion +test_preprocess_links = do + actual_xml' <- runX $ (preprocess input_xml) >>> css "body" + let actual_xml = actual_xml' !! 0 + + expected_xml' <- runX $ expected_xml'' >>> css "body" + let expected_xml = expected_xml' !! 0 + + assertEqual + "Links are replaced with spans" + expected_xml + actual_xml + where + input_html = "Hello, world!" + input_xml = parseHtml input_html + expected_html = "Hello, world!" + expected_xml'' = parseHtml expected_html + +page_tests :: Test +page_tests = + testGroup "Page Tests" [ + testCase "Links are replaced with spans" test_preprocess_links ]