From abd072d7a4b825cdfc7aaa49ef3c7897ffad3bf2 Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Wed, 27 Jun 2012 17:00:48 -0400 Subject: [PATCH] Implement image download and replacement. --- lwn-epub.cabal | 3 + src/LWN/HTTP.hs | 42 ++++++---- src/LWN/Page.hs | 213 ++++++++++++++++++++++++++++++++++++++++-------- src/LWN/URI.hs | 10 ++- 4 files changed, 217 insertions(+), 51 deletions(-) diff --git a/lwn-epub.cabal b/lwn-epub.cabal index abe1e61..c374d8d 100644 --- a/lwn-epub.cabal +++ b/lwn-epub.cabal @@ -12,6 +12,8 @@ executable lwn-epub base == 4.5.*, bytestring == 0.9.*, cmdargs == 0.9.*, + containers == 0.*, + curl == 1.*, directory == 1.1.*, download-curl == 0.1.*, filepath == 1.3.*, @@ -22,6 +24,7 @@ executable lwn-epub network == 2.3.*, pandoc == 1.9.*, regex-posix == 0.95.*, + temporary == 1.*, test-framework == 0.6.*, test-framework-hunit == 0.2.*, time == 1.*, diff --git a/src/LWN/HTTP.hs b/src/LWN/HTTP.hs index c74248b..743a99c 100644 --- a/src/LWN/HTTP.hs +++ b/src/LWN/HTTP.hs @@ -1,7 +1,10 @@ +{-# LANGUAGE DoAndIfThenElse #-} + module LWN.HTTP where import qualified Data.ByteString as B (hPut) + import Network.Curl ( CurlCode(..), CurlOption(..), @@ -14,7 +17,7 @@ import Network.Curl ( withCurlDo ) import Network.Curl.Download (openURI) -import System.Directory (getTemporaryDirectory) +import System.Directory (doesFileExist, getTemporaryDirectory) import System.IO (hPutStrLn, stderr) import System.IO.Temp (openBinaryTempFile) @@ -58,7 +61,7 @@ get_page cookie_jar url = -- Perform the request, and get back a CurlResponse object. -- The cast is needed to specify how we would like our headers -- and body returned (Strings). - resp <- do_curl_ curl login_url curl_opts :: IO CurlResponse + resp <- do_curl_ curl url curl_opts :: IO CurlResponse -- Pull out the response code as a CurlCode. let code = respCurlCode resp @@ -113,7 +116,7 @@ log_in cookie_jar username password = post_password = password_field ++ "=" ++ password post_data :: [String] - post_data = [post_username, post_password] + post_data = [post_username, post_password, post_submit] post_opts :: [CurlOption] post_opts = @@ -135,17 +138,22 @@ log_in cookie_jar username password = -- knows that type (jpg, png, etc.) it is. save_image :: URLString -> IO (Maybe FilePath) save_image url = do - let fn = filename url - case fn of - Nothing -> return Nothing - Just file -> do - temp_dir <- getTemporaryDirectory - (out_path, out_handle) <- openBinaryTempFile temp_dir file - result <- openURI url - case result of - Left err -> do - hPutStrLn stderr ("HTTP Error: " ++ err) - return Nothing - Right bs -> do - B.hPut out_handle bs - return $ Just out_path + it_exists <- doesFileExist url + if it_exists then do + -- It's local, just use it. + return $ Just url + else do + let fn = filename url + case fn of + Nothing -> return Nothing + Just file -> do + temp_dir <- getTemporaryDirectory + (out_path, out_handle) <- openBinaryTempFile temp_dir file + result <- openURI url + case result of + Left err -> do + hPutStrLn stderr ("HTTP Error: " ++ err) + return Nothing + Right bs -> do + B.hPut out_handle bs + return $ Just out_path diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index a4a56c2..0307214 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -1,16 +1,20 @@ +{-# LANGUAGE DoAndIfThenElse #-} + module LWN.Page where -import Text.Pandoc +import qualified Data.Map as Map import Data.Time (getCurrentTime) import System.IO (Handle) import qualified Data.ByteString.Lazy as B (ByteString, hPut) +import Data.List (isInfixOf) import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust, isNothing) import Data.Tree.NTree.TypeDefs (NTree) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) +import Text.Pandoc import Text.XML.HXT.Core ( ArrowXml, IOSArrow, @@ -19,10 +23,13 @@ import Text.XML.HXT.Core ( (>>>), (/>), (//>), + changeAttrValue, + getAttrValue, getChildren, getText, hasAttrValue, hasName, + isElem, mkName, none, processAttrl, @@ -35,8 +42,29 @@ import Text.XML.HXT.Core ( import Text.HandsomeSoup (css, parseHtml) import LWN.Article +import LWN.HTTP (save_image) +import LWN.URI (URL, try_make_absolute_url) import XHTML +-- Map absolute image URLs to local system file paths where the image +-- referenced by the URL is stored. +type ImageMap = Map.Map URL FilePath + +-- Should be called *after* preprocessing. +download_images :: IOSArrow XmlTree (NTree XNode) -> IO ImageMap +download_images xml = do + image_urls <- runX $ xml >>> image_srcs + files <- mapM save_image image_urls + let pairs = zip image_urls files + return $ foldl my_insert empty_map pairs + where + empty_map = Map.empty :: ImageMap + + my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap + my_insert dict (_, Nothing) = dict + my_insert dict (k, Just v) = Map.insert k v dict + + data Page = -- | An LWN page with one article on it. ArticlePage { article :: Article } | @@ -84,14 +112,22 @@ instance XHTML Page where -remove_images :: (ArrowXml a) => a XmlTree XmlTree -remove_images = - processTopDown ((none) `when` is_image) - - is_link :: (ArrowXml a) => a XmlTree XmlTree is_link = - hasName "a" + isElem >>> hasName "a" + + +remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree +remove_comment_links = + processTopDown $ kill_comments `when` is_link + where + contains = isInfixOf + + is_comment_link = + hasAttrValue "href" (contains "#Comments") + + kill_comments = + none `when` is_comment_link replace_links_with_spans :: (ArrowXml a) => a XmlTree XmlTree replace_links_with_spans = @@ -100,21 +136,49 @@ replace_links_with_spans = make_span = setElemName $ mkName "span" remove_attrs = processAttrl none + -- | Preprocessing common to both page types. -preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode) -preprocess xml = - xml >>> remove_images >>> replace_links_with_spans +preprocess :: (ArrowXml a) => a XmlTree XmlTree +preprocess = + make_image_srcs_absolute + >>> + remove_comment_links + >>> + replace_links_with_spans + + +replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree +replace_remote_img_srcs image_map = + processTopDown (make_srcs_local `when` is_image) + where + -- old_src -> new_src + change_src_func :: String -> String + change_src_func old_src = + case Map.lookup old_src image_map of + -- Leave it alone if we don't have the file locally + Nothing -> old_src + Just v -> v + + change_src :: (ArrowXml a) => a XmlTree XmlTree + change_src = + changeAttrValue change_src_func + + make_srcs_local :: (ArrowXml a) => a XmlTree XmlTree + make_srcs_local = + processAttrl $ (change_src `when` (hasName "src")) parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page) parse xml = do - let clean_xml = preprocess xml - appr <- ap_parse clean_xml - fppr <- fp_parse clean_xml + let clean_xml = xml >>> preprocess + image_map <- download_images clean_xml + let local_xml = clean_xml >>> replace_remote_img_srcs image_map + appr <- ap_parse local_xml + fppr <- fp_parse local_xml return $ if (isNothing appr) then fppr - else + else appr @@ -148,9 +212,9 @@ parse_byline xml = do -- ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page) ap_parse xml = do - arts <- ap_parse_articles xml + arts <- ap_parse_articles xml case arts of - Just [x] -> return $ Just $ ArticlePage x + [x] -> return $ Just $ ArticlePage x _ -> return Nothing @@ -165,16 +229,22 @@ ap_parse_body xml = do _ -> error "Found more than one article." -ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article]) +ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article] ap_parse_articles xml = do parsed_headline <- parse_headline xml parsed_byline <- parse_byline xml parsed_body <- ap_parse_body xml - let title' = Title (fromJust parsed_headline) - let byline' = Byline parsed_byline - let body' = BodyHtml (fromJust parsed_body) - return $ Just $ [Article title' byline' body'] + putStrLn $ fromJust parsed_headline + + if (isNothing parsed_headline) || (isNothing parsed_body) + then return [] + else do + let title' = Title $ fromJust parsed_headline + let byline' = Byline parsed_byline + let body' = BodyHtml $ fromJust parsed_body + + return $ [Article title' byline' body'] @@ -187,8 +257,8 @@ fp_parse xml = do hl <- parse_headline xml parsed_articles <- fp_parse_articles xml case parsed_articles of - them@(_:_) -> return $ Just $ FullPage (fromJust hl) them - _ -> return Nothing + [] -> return Nothing + x -> return $ Just $ FullPage (fromJust hl) x @@ -219,8 +289,7 @@ is_byline = is_image :: (ArrowXml a) => a XmlTree XmlTree -is_image = - hasName "img" +is_image = isElem >>> hasName "img" remove_title :: (ArrowXml a) => a XmlTree XmlTree remove_title = @@ -251,10 +320,15 @@ fp_parse_article xml = do parsed_article_title <- fp_parse_article_title xml parsed_article_byline <- parse_byline xml parsed_article_body <- fp_parse_article_body xml - let title' = Title $ fromJust parsed_article_title - let byline' = Byline parsed_article_byline - let body' = BodyHtml $ fromJust parsed_article_body - return $ Just $ Article title' byline' body' + + if (isNothing parsed_article_title) || (isNothing parsed_article_body) + then + return Nothing + else do + let title' = Title $ fromJust parsed_article_title + let byline' = Byline parsed_article_byline + let body' = BodyHtml $ fromJust parsed_article_body + return $ Just $ Article title' byline' body' parse_html_article :: String -> IO (Maybe Article) parse_html_article html = do @@ -275,7 +349,9 @@ fp_parse_articles xml = do let article_separator = "

" let split_articles'' = split article_separator (concat bodies) -- The first element will contain the crap before the first . - let split_articles' = tail split_articles'' + let split_articles' = case split_articles'' of + (_:_) -> tail split_articles'' + [] -> [] -- Put the separator back, it was lost during the split. let split_articles = map (article_separator ++) split_articles' --_ <- mapM print_article split_articles @@ -331,6 +407,27 @@ xhtml_to_epub epmd = +-- +-- Misc +-- + +image_srcs :: (ArrowXml a) => a XmlTree URL +image_srcs = + css "img" + >>> + getAttrValue "src" + +make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree +make_image_srcs_absolute = + processTopDown (make_srcs_absolute `when` is_image) + where + change_src :: (ArrowXml a) => a XmlTree XmlTree + change_src = + changeAttrValue try_make_absolute_url + + make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree + make_srcs_absolute = + processAttrl $ change_src `when` hasName "src" -- @@ -339,7 +436,7 @@ xhtml_to_epub epmd = test_preprocess_links :: Assertion test_preprocess_links = do - actual_xml' <- runX $ (preprocess input_xml) >>> css "body" + actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" let actual_xml = actual_xml' !! 0 expected_xml' <- runX $ expected_xml'' >>> css "body" @@ -355,7 +452,59 @@ test_preprocess_links = do expected_html = "Hello, world!" expected_xml'' = parseHtml expected_html + +test_absolve_images :: Assertion +test_absolve_images = do + actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" + let actual_xml = actual_xml' !! 0 + + expected_xml' <- runX $ expected_xml'' >>> css "body" + let expected_xml = expected_xml' !! 0 + + assertEqual + "Image srcs are made absolute" + expected_xml + actual_xml + where + input_html = + "" ++ + "" ++ + "" + input_xml = parseHtml input_html + expected_html = + "" ++ + "" ++ + "" + expected_xml'' = parseHtml expected_html + + +test_comments_removed :: Assertion +test_comments_removed = do + actual_xml' <- runX $ input_xml >>> preprocess >>> css "body" + let actual_xml = actual_xml' !! 0 + + expected_xml' <- runX $ expected_xml'' >>> css "body" + let expected_xml = expected_xml' !! 0 + + assertEqual + "Comment links are removed" + expected_xml + actual_xml + where + input_html = + "

" ++ + "Comments (6 posted)" ++ + "

" + input_xml = parseHtml input_html + + expected_html = "

" + expected_xml'' = parseHtml expected_html + + + page_tests :: Test page_tests = testGroup "Page Tests" [ - testCase "Links are replaced with spans" test_preprocess_links ] + testCase "Links are replaced with spans" test_preprocess_links, + testCase "Image srcs are made absolute" test_absolve_images, + testCase "Comment links are removed" test_comments_removed ] diff --git a/src/LWN/URI.hs b/src/LWN/URI.hs index 5e61eb6..9e7c7d9 100644 --- a/src/LWN/URI.hs +++ b/src/LWN/URI.hs @@ -112,7 +112,7 @@ filename url = let reverse_components = reverse components in case reverse_components of [] -> Nothing - (x:xs) -> Just x + (x:_) -> Just x where parse_result = parseURIReference url @@ -144,7 +144,13 @@ make_absolute_url relative_url = where parse_result = parseURIReference relative_url - +-- | Like 'make_absolute_url', except returns its input instead of +-- 'Nothing' if the absolution fails. +try_make_absolute_url :: URL -> URL +try_make_absolute_url url = + case make_absolute_url url of + Nothing -> url + Just abs_url -> abs_url -- | A List of LWN URLs to use during testing. lwn_urls :: [URL] -- 2.44.2