From 2bf48a15ded4501a127bdfe5b29eaf01acf6576e Mon Sep 17 00:00:00 2001
From: Michael Orlitzky
Date: Sat, 30 Jun 2012 12:14:15 -0400
Subject: [PATCH] Clean up some types now that I know what I'm doing.

---
 src/LWN/Page.hs | 38 ++++++++++++++++++++------------------
 1 file changed, 20 insertions(+), 18 deletions(-)

diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs
index bdfe8ca..2bbe21a 100644
--- a/src/LWN/Page.hs
+++ b/src/LWN/Page.hs
@@ -9,7 +9,6 @@ import System.IO (Handle, hClose, hFlush)
 import qualified Data.ByteString.Lazy as B (ByteString, hPut)
 import Data.String.Utils (split, strip)
 import Data.Maybe (catMaybes, fromJust, isNothing)
-import Data.Tree.NTree.TypeDefs (NTree)
 import Test.HUnit (Assertion, assertEqual)
 import Test.Framework (Test, testGroup)
 import Test.Framework.Providers.HUnit (testCase)
@@ -18,7 +17,6 @@ import Text.XML.HXT.Core (
   ArrowXml,
   IOSArrow,
   XmlTree,
-  XNode,
   (>>>),
   (/>),
   (//>),
@@ -50,10 +48,8 @@ import XHTML
 -- referenced by the URL is stored.
 type ImageMap = Map.Map URL FilePath
 
--- Should be called *after* preprocessing.
-download_images :: IOSArrow XmlTree (NTree XNode) -> IO ImageMap
-download_images xml = do
-  image_urls <- runX $ xml >>> image_srcs
+download_image_urls :: [URL] -> IO ImageMap
+download_image_urls image_urls = do
   files <- mapM save_image image_urls
   let pairs = zip image_urls files
   return $ foldl my_insert empty_map pairs
@@ -64,6 +60,12 @@ download_images xml = do
     my_insert dict (_, Nothing) = dict
     my_insert dict (k, Just v) = Map.insert k v dict
 
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
+download_images xml = do
+  image_urls <- runX $ xml >>> image_srcs
+  download_image_urls image_urls
+
 
 data Page =
   -- | An LWN page with one article on it.
@@ -166,7 +168,7 @@ replace_remote_img_srcs image_map =
   processAttrl $ (change_src `when` (hasName "src"))
 
 
-parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 parse xml = do
   let clean_xml = xml >>> preprocess
   image_map <- download_images clean_xml
@@ -181,7 +183,7 @@ parse xml = do
 
 
 
-parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_headline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 parse_headline xml = do
   let element_filter = xml >>> css "div.PageHeadline h1"
   let element_text_filter = element_filter /> getText
@@ -193,7 +195,7 @@ parse_headline xml = do
     _ -> error "Found more than one headline."
 
 
-parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_byline :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 parse_byline xml = do
   let element_filter = xml >>> css "div.FeatureByLine"
   let element_text_filter = element_filter /> getText
@@ -208,7 +210,7 @@ parse_byline xml = do
 --
 -- ArticlePage Stuff
 --
-ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 ap_parse xml = do
   arts <- ap_parse_articles xml
   case arts of
@@ -216,7 +218,7 @@ ap_parse xml = do
     _ -> return Nothing
 
 
-ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 ap_parse_body xml = do
   let element_filter = xml >>> css "div.ArticleText"
   let element_html_filter = xshow element_filter
@@ -227,7 +229,7 @@ ap_parse_body xml = do
     _ -> error "Found more than one article."
 
 
-ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
+ap_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 ap_parse_articles xml = do
   parsed_headline <- parse_headline xml
   parsed_byline <- parse_byline xml
@@ -250,7 +252,7 @@ ap_parse_articles xml = do
 --
 -- FullPage Stuff
 --
-fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+fp_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
 fp_parse xml = do
   hl <- parse_headline xml
   parsed_articles <- fp_parse_articles xml
@@ -260,7 +262,7 @@ fp_parse xml = do
 
 
 
-fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_article_title :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 fp_parse_article_title xml = do
   let element_filter = xml >>> css "h2.SummaryHL"
   let element_text_filter = element_filter //> getText
@@ -300,7 +302,7 @@ remove_byline =
 
 
 
-fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_article_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String)
 fp_parse_article_body xml = do
   -- First, delete the article title and byline.
   let clean_xml' = xml >>> remove_title >>> remove_byline
@@ -313,7 +315,7 @@ fp_parse_article_body xml = do
     [] -> Nothing
     _ -> error "Found more than one article body."
 
-fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
+fp_parse_article :: IOSArrow XmlTree XmlTree -> IO (Maybe Article)
 fp_parse_article xml = do
   parsed_article_title <- fp_parse_article_title xml
   parsed_article_byline <- parse_byline xml
@@ -336,12 +338,12 @@ parse_html_article html = do
 
 -- | In the full page, all of the article titles and bodies are
 -- wrapped in one big div.ArticleText.
-parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
+parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree
 parse_bodies xml =
   xml >>> css "div.ArticleText"
 
 
-fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
+fp_parse_articles :: IOSArrow XmlTree XmlTree -> IO [Article]
 fp_parse_articles xml = do
   bodies <- runX . xshow $ parse_bodies xml
   let article_separator = "
"
-- 
2.44.2