From fc0052e451aa03675ebd9a128dfa46573b9357d7 Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Sun, 8 Jul 2012 23:24:34 -0400 Subject: [PATCH] Only fetch login cookies once. Implement the full story downloading (still buggy). Reorganize many of the XML functions. --- src/Configuration.hs | 46 ++++++++++--------- src/LWN/Article.hs | 2 + src/LWN/HTTP.hs | 65 +++++++++++++-------------- src/LWN/Page.hs | 102 +++++++++++++++++++++++++++++++++++++------ src/LWN/XHTML.hs | 48 ++++++++++++++++++++ src/Main.hs | 14 +++++- 6 files changed, 211 insertions(+), 66 deletions(-) diff --git a/src/Configuration.hs b/src/Configuration.hs index 4898c08..478d465 100644 --- a/src/Configuration.hs +++ b/src/Configuration.hs @@ -28,37 +28,43 @@ import ExitCodes -- | Contains all of our configurable options. data Cfg = Cfg { - article :: String, - output :: FilePath, - password :: Maybe String, - username :: Maybe String } + article :: String, + cookie_jar :: Maybe FilePath, + output :: FilePath, + password :: Maybe String, + username :: Maybe String } instance Monoid Cfg where mempty = Cfg { article = "", + cookie_jar = Nothing, output = "", password = Nothing, username = Nothing } mappend c1 c2 = - let article' = (if null article1 then article2 else article1) - output' = (if null output1 then output2 else output1) - password' = password1 `mplus` password2 - username' = username1 `mplus` username2 + let article' = (if null article1 then article2 else article1) + cookie_jar' = cookie_jar1 `mplus` cookie_jar2 + output' = (if null output1 then output2 else output1) + password' = password1 `mplus` password2 + username' = username1 `mplus` username2 in - Cfg { article = article', - output = output', - password = password', - username = username' } + Cfg { article = article', + cookie_jar = cookie_jar', + output = output', + password = password', + username = username' } where - article1 = article c1 - article2 = article c2 - output1 = output c1 - output2 = output c2 - password1 = password c1 - password2 = password c2 - username1 = username c1 - username2 = username c2 + article1 = article c1 + article2 = article c2 + cookie_jar1 = cookie_jar c1 + cookie_jar2 = cookie_jar c2 + output1 = output c1 + output2 = output c2 + password1 = password c1 + password2 = password c2 + username1 = username c1 + username2 = username c2 use_account :: Cfg -> Bool diff --git a/src/LWN/Article.hs b/src/LWN/Article.hs index ac3d456..47e3651 100644 --- a/src/LWN/Article.hs +++ b/src/LWN/Article.hs @@ -98,12 +98,14 @@ real_article_path path = do where abs_current = try_make_absolute_url ("/" ++ path) abs_article = try_make_absolute_url ("Articles/" ++ path) + abs_full_article = try_make_absolute_url path check_cases :: String check_cases | is_lwn_url path = make_https path | isPrefixOf "current" path = abs_current | path =~ "^[0-9]+$" = abs_article + | path =~ "^/Articles/[0-9]+/?$" = abs_full_article | otherwise = path -- Give up diff --git a/src/LWN/HTTP.hs b/src/LWN/HTTP.hs index 7da6ed1..63079fd 100644 --- a/src/LWN/HTTP.hs +++ b/src/LWN/HTTP.hs @@ -5,7 +5,7 @@ where import qualified Data.ByteString as B (hPut) import qualified Data.Map as Map (Map, empty, insert) -import Data.Maybe (fromJust) +import Data.Maybe (fromJust, isNothing) import Network.Curl ( CurlCode(..), CurlOption(..), @@ -22,7 +22,7 @@ import System.IO (hClose, hPutStrLn, stderr) import qualified System.IO.UTF8 as Utf8 (readFile) import System.IO.Temp (openBinaryTempFile, openTempFile) -import qualified Configuration as C (Cfg, password, use_account, username) +import qualified Configuration as C (Cfg(..)) import LWN.Article (real_article_path) import LWN.URI (URL, filename) @@ -188,6 +188,29 @@ download_image_urls image_urls = do my_insert dict (k, Just v) = Map.insert k v dict + + + +get_login_cookie :: C.Cfg -> IO C.Cfg +get_login_cookie cfg + | isNothing (C.username cfg) = return cfg + | isNothing (C.password cfg) = return cfg + | otherwise = do + let uname = fromJust $ C.username cfg + let pword = fromJust $ C.password cfg + cj <- make_cookie_jar + li_result <- log_in cj uname pword + + case li_result of + Left err -> do + let msg = "Failed to log in. " ++ err + hPutStrLn stderr msg + Right response_body -> do + hPutStrLn stderr response_body + + return $ cfg { C.cookie_jar = Just cj } + + -- | Try to parse the given article using HXT. We try a few different -- methods; if none of them work, we return 'Nothing'. get_article_contents :: C.Cfg -> URL -> IO (Maybe String) @@ -200,33 +223,11 @@ get_article_contents cfg article_name = do return $ Just $ contents False -> do -- Download the URL and try to parse it. - if C.use_account cfg then do - -- use_account would be false if these fromJusts would fail. - cj <- make_cookie_jar - li_result <- log_in cj - (fromJust $ C.username cfg) - (fromJust $ C.password cfg) - - case li_result of - Left err -> do - let msg = "Failed to log in. " ++ err - hPutStrLn stderr msg - Right response_body -> do - hPutStrLn stderr response_body - - html <- get_page (Just cj) my_article - - case html of - Left err -> do - let msg = "Failed to retrieve page. " ++ err - hPutStrLn stderr msg - return Nothing - Right h -> return $ Just h - else do - html <- get_page Nothing my_article - case html of - Left err -> do - let msg = "Failed to retrieve page. " ++ err - hPutStrLn stderr msg - return Nothing - Right h -> return $ Just h + html <- get_page (C.cookie_jar cfg) my_article + + case html of + Left err -> do + let msg = "Failed to retrieve page. " ++ err + hPutStrLn stderr msg + return Nothing + Right h -> return $ Just h diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 97171c6..f0ada7c 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -7,6 +7,7 @@ import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) +import qualified Data.Map as Map (Map, empty, insert) import Data.Maybe (catMaybes, fromJust, isNothing) import Prelude hiding (readFile) import System.IO (Handle, hClose, hFlush) @@ -23,15 +24,18 @@ import Text.XML.HXT.Core ( ArrowXml, IOSArrow, XmlTree, + ($<), (>>>), (/>), (//>), changeAttrValue, + getAttrValue, getChildren, getText, hasName, processAttrl, processTopDown, + this, runX, xshow, when) @@ -46,12 +50,16 @@ import LWN.HTTP ( import LWN.URI (URL) import LWN.XHTML ( XHTML, + full_story_urls, image_srcs, + full_story_link, + full_story_paragraph, is_image, preprocess, remove_byline, remove_title, to_xhtml, + to_xml, xml_from_contents) @@ -106,19 +114,28 @@ page_from_url :: Cfg -> URL -> IO (Maybe Page) page_from_url cfg url = do contents <- get_article_contents cfg url case (xml_from_contents contents) of - Just html -> parse html + Just html -> parse cfg html Nothing -> return Nothing --- Should be called *after* preprocessing. -download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap -download_images xml = do - image_urls <- runX $ xml >>> image_srcs - download_image_urls image_urls - - - +insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree +insert_full_stories story_map = + processTopDown (article_xml `when` full_story_paragraph) + where + lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree + lookup_func href = + case Map.lookup href story_map of + -- Leave it alone if we don't have the full story. + Nothing -> this + Just v -> to_xml v + + article_xml :: (ArrowXml a) => a XmlTree XmlTree + article_xml = + lookup_func + $< + (this /> full_story_link >>> getAttrValue "href") + replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree replace_remote_img_srcs image_map = processTopDown (make_srcs_local `when` is_image) @@ -140,11 +157,47 @@ replace_remote_img_srcs image_map = processAttrl $ (change_src `when` (hasName "src")) -parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) -parse xml = do - let clean_xml = xml >>> preprocess + + +-- Should be called *after* preprocessing. +download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap +download_images xml = do + image_urls <- runX $ xml >>> image_srcs + download_image_urls image_urls + + + +type StoryMap = Map.Map URL Article + +-- These come *before* preprocessing. +download_full_story_urls :: Cfg -> [URL] -> IO StoryMap +download_full_story_urls cfg story_urls = do + pages <- mapM (page_from_url cfg) story_urls + let pairs = zip story_urls pages + return $ foldl my_insert empty_map pairs + where + empty_map = Map.empty :: StoryMap + + my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap + my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict + my_insert dict (_, _) = dict + + +download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap +download_full_stories cfg xml = do + story_urls <- runX $ xml >>> full_story_urls + download_full_story_urls cfg story_urls + + +parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page) +parse cfg xml = do + story_map <- download_full_stories cfg xml + let fs_xml = xml >>> insert_full_stories story_map + + let clean_xml = fs_xml >>> preprocess image_map <- download_images clean_xml let local_xml = clean_xml >>> replace_remote_img_srcs image_map + appr <- ap_parse local_xml fppr <- fp_parse local_xml return $ @@ -423,10 +476,33 @@ test_comments_removed = do expected_xml'' = parseHtml expected_html +test_full_story_urls_parsed :: Assertion +test_full_story_urls_parsed = do + actual <- runX $ actual' + + assertEqual + "Full Story URLs are parsed" + expected + actual + where + expected = ["/Articles/500738/", "/Articles/501837/"] + + full_story_html = + concat ["

", + "Full Story ", + "(comments: 49)", + "

", + "Full Story ", + "(comments: none)", + "

"] + + xml = parseHtml full_story_html + actual' = xml >>> full_story_urls page_tests :: Test page_tests = testGroup "Page Tests" [ testCase "Links are replaced with spans" test_preprocess_links, testCase "Image srcs are made absolute" test_absolve_images, - testCase "Comment links are removed" test_comments_removed ] + testCase "Comment links are removed" test_comments_removed, + testCase "Full Story URLs are parsed" test_full_story_urls_parsed ] diff --git a/src/LWN/XHTML.hs b/src/LWN/XHTML.hs index 8dfe3b2..a2f103f 100644 --- a/src/LWN/XHTML.hs +++ b/src/LWN/XHTML.hs @@ -1,7 +1,10 @@ module LWN.XHTML ( XHTML, XML, + full_story_urls, image_srcs, + full_story_link, + full_story_paragraph, is_image, parse_lwn, preprocess, @@ -13,16 +16,21 @@ module LWN.XHTML ( where import Text.HandsomeSoup (css) +import Text.Regex.Posix ((=~)) import Text.XML.HXT.Core ( (>>>), + (/>), ArrowXml, IOStateArrow, SysConfigList, XmlTree, changeAttrValue, + deep, getAttrValue, hasAttrValue, hasName, + hasText, + ifA, isElem, mkName, no, @@ -31,12 +39,14 @@ import Text.XML.HXT.Core ( processTopDown, readString, setElemName, + this, when, withParseHTML, withValidate, withWarnings, yes) + import LWN.URI (URL, try_make_absolute_url) import Misc (contains) @@ -132,6 +142,44 @@ image_srcs = >>> getAttrValue "src" + +full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree +full_story_paragraph = + isElem + >>> + hasName "p" + >>> + ifA + (this /> full_story_link) + this + none + + +-- Without regard to the parent paragraph. +full_story_link :: (ArrowXml a) => a XmlTree XmlTree +full_story_link = + isElem + >>> + hasName "a" + >>> + ifA + (this /> hasText (=~ "Full Story")) + this + none + + +-- | Get the hrefs of all full story links. +full_story_urls :: (ArrowXml a) => a XmlTree URL +full_story_urls = + deep $ + full_story_paragraph + /> + full_story_link + >>> + getAttrValue "href" + + + make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree make_image_srcs_absolute = processTopDown (make_srcs_absolute `when` is_image) diff --git a/src/Main.hs b/src/Main.hs index 88ba39c..a1230d6 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -2,6 +2,7 @@ module Main where +import System.Directory (doesFileExist) import System.IO ( Handle, IOMode (WriteMode), @@ -10,6 +11,8 @@ import System.IO ( import CommandLine (show_help) import Configuration (Cfg(..), get_cfg) +import LWN.Article (real_article_path) +import LWN.HTTP (get_login_cookie) import LWN.Page (epublish, page_from_url) @@ -24,9 +27,18 @@ get_output_handle path = openBinaryFile path WriteMode +argument_is_file :: Cfg -> IO Bool +argument_is_file cfg = do + path <- real_article_path (article cfg) + doesFileExist path + main :: IO () main = do - cfg <- get_cfg + cfg' <- get_cfg + aif <- argument_is_file cfg' + cfg <- case aif of + False -> get_login_cookie cfg' + True -> return cfg' page <- page_from_url cfg (article cfg) case page of Just p -> do -- 2.43.2