Implement the full story downloading (still buggy).
Reorganize many of the XML functions.
-- | Contains all of our configurable options.
data Cfg = Cfg {
- article :: String,
- output :: FilePath,
- password :: Maybe String,
- username :: Maybe String }
+ article :: String,
+ cookie_jar :: Maybe FilePath,
+ output :: FilePath,
+ password :: Maybe String,
+ username :: Maybe String }
instance Monoid Cfg where
mempty = Cfg { article = "",
+ cookie_jar = Nothing,
output = "",
password = Nothing,
username = Nothing }
mappend c1 c2 =
- let article' = (if null article1 then article2 else article1)
- output' = (if null output1 then output2 else output1)
- password' = password1 `mplus` password2
- username' = username1 `mplus` username2
+ let article' = (if null article1 then article2 else article1)
+ cookie_jar' = cookie_jar1 `mplus` cookie_jar2
+ output' = (if null output1 then output2 else output1)
+ password' = password1 `mplus` password2
+ username' = username1 `mplus` username2
in
- Cfg { article = article',
- output = output',
- password = password',
- username = username' }
+ Cfg { article = article',
+ cookie_jar = cookie_jar',
+ output = output',
+ password = password',
+ username = username' }
where
- article1 = article c1
- article2 = article c2
- output1 = output c1
- output2 = output c2
- password1 = password c1
- password2 = password c2
- username1 = username c1
- username2 = username c2
+ article1 = article c1
+ article2 = article c2
+ cookie_jar1 = cookie_jar c1
+ cookie_jar2 = cookie_jar c2
+ output1 = output c1
+ output2 = output c2
+ password1 = password c1
+ password2 = password c2
+ username1 = username c1
+ username2 = username c2
use_account :: Cfg -> Bool
where
abs_current = try_make_absolute_url ("/" ++ path)
abs_article = try_make_absolute_url ("Articles/" ++ path)
+ abs_full_article = try_make_absolute_url path
check_cases :: String
check_cases
| is_lwn_url path = make_https path
| isPrefixOf "current" path = abs_current
| path =~ "^[0-9]+$" = abs_article
+ | path =~ "^/Articles/[0-9]+/?$" = abs_full_article
| otherwise = path -- Give up
import qualified Data.ByteString as B (hPut)
import qualified Data.Map as Map (Map, empty, insert)
-import Data.Maybe (fromJust)
+import Data.Maybe (fromJust, isNothing)
import Network.Curl (
CurlCode(..),
CurlOption(..),
import qualified System.IO.UTF8 as Utf8 (readFile)
import System.IO.Temp (openBinaryTempFile, openTempFile)
-import qualified Configuration as C (Cfg, password, use_account, username)
+import qualified Configuration as C (Cfg(..))
import LWN.Article (real_article_path)
import LWN.URI (URL, filename)
my_insert dict (k, Just v) = Map.insert k v dict
+
+
+
+get_login_cookie :: C.Cfg -> IO C.Cfg
+get_login_cookie cfg
+ | isNothing (C.username cfg) = return cfg
+ | isNothing (C.password cfg) = return cfg
+ | otherwise = do
+ let uname = fromJust $ C.username cfg
+ let pword = fromJust $ C.password cfg
+ cj <- make_cookie_jar
+ li_result <- log_in cj uname pword
+
+ case li_result of
+ Left err -> do
+ let msg = "Failed to log in. " ++ err
+ hPutStrLn stderr msg
+ Right response_body -> do
+ hPutStrLn stderr response_body
+
+ return $ cfg { C.cookie_jar = Just cj }
+
+
-- | Try to parse the given article using HXT. We try a few different
-- methods; if none of them work, we return 'Nothing'.
get_article_contents :: C.Cfg -> URL -> IO (Maybe String)
return $ Just $ contents
False -> do
-- Download the URL and try to parse it.
- if C.use_account cfg then do
- -- use_account would be false if these fromJusts would fail.
- cj <- make_cookie_jar
- li_result <- log_in cj
- (fromJust $ C.username cfg)
- (fromJust $ C.password cfg)
-
- case li_result of
- Left err -> do
- let msg = "Failed to log in. " ++ err
- hPutStrLn stderr msg
- Right response_body -> do
- hPutStrLn stderr response_body
-
- html <- get_page (Just cj) my_article
-
- case html of
- Left err -> do
- let msg = "Failed to retrieve page. " ++ err
- hPutStrLn stderr msg
- return Nothing
- Right h -> return $ Just h
- else do
- html <- get_page Nothing my_article
- case html of
- Left err -> do
- let msg = "Failed to retrieve page. " ++ err
- hPutStrLn stderr msg
- return Nothing
- Right h -> return $ Just h
+ html <- get_page (C.cookie_jar cfg) my_article
+
+ case html of
+ Left err -> do
+ let msg = "Failed to retrieve page. " ++ err
+ hPutStrLn stderr msg
+ return Nothing
+ Right h -> return $ Just h
import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
import Data.String.Utils (split, strip)
+import qualified Data.Map as Map (Map, empty, insert)
import Data.Maybe (catMaybes, fromJust, isNothing)
import Prelude hiding (readFile)
import System.IO (Handle, hClose, hFlush)
ArrowXml,
IOSArrow,
XmlTree,
+ ($<),
(>>>),
(/>),
(//>),
changeAttrValue,
+ getAttrValue,
getChildren,
getText,
hasName,
processAttrl,
processTopDown,
+ this,
runX,
xshow,
when)
import LWN.URI (URL)
import LWN.XHTML (
XHTML,
+ full_story_urls,
image_srcs,
+ full_story_link,
+ full_story_paragraph,
is_image,
preprocess,
remove_byline,
remove_title,
to_xhtml,
+ to_xml,
xml_from_contents)
page_from_url cfg url = do
contents <- get_article_contents cfg url
case (xml_from_contents contents) of
- Just html -> parse html
+ Just html -> parse cfg html
Nothing -> return Nothing
--- Should be called *after* preprocessing.
-download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
-download_images xml = do
- image_urls <- runX $ xml >>> image_srcs
- download_image_urls image_urls
-
-
-
+insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
+insert_full_stories story_map =
+ processTopDown (article_xml `when` full_story_paragraph)
+ where
+ lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
+ lookup_func href =
+ case Map.lookup href story_map of
+ -- Leave it alone if we don't have the full story.
+ Nothing -> this
+ Just v -> to_xml v
+
+ article_xml :: (ArrowXml a) => a XmlTree XmlTree
+ article_xml =
+ lookup_func
+ $<
+ (this /> full_story_link >>> getAttrValue "href")
+
replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
replace_remote_img_srcs image_map =
processTopDown (make_srcs_local `when` is_image)
processAttrl $ (change_src `when` (hasName "src"))
-parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
-parse xml = do
- let clean_xml = xml >>> preprocess
+
+
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
+download_images xml = do
+ image_urls <- runX $ xml >>> image_srcs
+ download_image_urls image_urls
+
+
+
+type StoryMap = Map.Map URL Article
+
+-- These come *before* preprocessing.
+download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
+download_full_story_urls cfg story_urls = do
+ pages <- mapM (page_from_url cfg) story_urls
+ let pairs = zip story_urls pages
+ return $ foldl my_insert empty_map pairs
+ where
+ empty_map = Map.empty :: StoryMap
+
+ my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
+ my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
+ my_insert dict (_, _) = dict
+
+
+download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
+download_full_stories cfg xml = do
+ story_urls <- runX $ xml >>> full_story_urls
+ download_full_story_urls cfg story_urls
+
+
+parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
+parse cfg xml = do
+ story_map <- download_full_stories cfg xml
+ let fs_xml = xml >>> insert_full_stories story_map
+
+ let clean_xml = fs_xml >>> preprocess
image_map <- download_images clean_xml
let local_xml = clean_xml >>> replace_remote_img_srcs image_map
+
appr <- ap_parse local_xml
fppr <- fp_parse local_xml
return $
expected_xml'' = parseHtml expected_html
+test_full_story_urls_parsed :: Assertion
+test_full_story_urls_parsed = do
+ actual <- runX $ actual'
+
+ assertEqual
+ "Full Story URLs are parsed"
+ expected
+ actual
+ where
+ expected = ["/Articles/500738/", "/Articles/501837/"]
+
+ full_story_html =
+ concat ["<p>",
+ "<a href=\"/Articles/500738/\">Full Story</a> ",
+ "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
+ "<p>",
+ "<a href=\"/Articles/501837/\">Full Story</a> ",
+ "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
+ "<p>"]
+
+ xml = parseHtml full_story_html
+ actual' = xml >>> full_story_urls
page_tests :: Test
page_tests =
testGroup "Page Tests" [
testCase "Links are replaced with spans" test_preprocess_links,
testCase "Image srcs are made absolute" test_absolve_images,
- testCase "Comment links are removed" test_comments_removed ]
+ testCase "Comment links are removed" test_comments_removed,
+ testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]
module LWN.XHTML (
XHTML,
XML,
+ full_story_urls,
image_srcs,
+ full_story_link,
+ full_story_paragraph,
is_image,
parse_lwn,
preprocess,
where
import Text.HandsomeSoup (css)
+import Text.Regex.Posix ((=~))
import Text.XML.HXT.Core (
(>>>),
+ (/>),
ArrowXml,
IOStateArrow,
SysConfigList,
XmlTree,
changeAttrValue,
+ deep,
getAttrValue,
hasAttrValue,
hasName,
+ hasText,
+ ifA,
isElem,
mkName,
no,
processTopDown,
readString,
setElemName,
+ this,
when,
withParseHTML,
withValidate,
withWarnings,
yes)
+
import LWN.URI (URL, try_make_absolute_url)
import Misc (contains)
>>>
getAttrValue "src"
+
+full_story_paragraph :: (ArrowXml a) => a XmlTree XmlTree
+full_story_paragraph =
+ isElem
+ >>>
+ hasName "p"
+ >>>
+ ifA
+ (this /> full_story_link)
+ this
+ none
+
+
+-- Without regard to the parent paragraph.
+full_story_link :: (ArrowXml a) => a XmlTree XmlTree
+full_story_link =
+ isElem
+ >>>
+ hasName "a"
+ >>>
+ ifA
+ (this /> hasText (=~ "Full Story"))
+ this
+ none
+
+
+-- | Get the hrefs of all full story links.
+full_story_urls :: (ArrowXml a) => a XmlTree URL
+full_story_urls =
+ deep $
+ full_story_paragraph
+ />
+ full_story_link
+ >>>
+ getAttrValue "href"
+
+
+
make_image_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree
make_image_srcs_absolute =
processTopDown (make_srcs_absolute `when` is_image)
module Main
where
+import System.Directory (doesFileExist)
import System.IO (
Handle,
IOMode (WriteMode),
import CommandLine (show_help)
import Configuration (Cfg(..), get_cfg)
+import LWN.Article (real_article_path)
+import LWN.HTTP (get_login_cookie)
import LWN.Page (epublish, page_from_url)
openBinaryFile path WriteMode
+argument_is_file :: Cfg -> IO Bool
+argument_is_file cfg = do
+ path <- real_article_path (article cfg)
+ doesFileExist path
+
main :: IO ()
main = do
- cfg <- get_cfg
+ cfg' <- get_cfg
+ aif <- argument_is_file cfg'
+ cfg <- case aif of
+ False -> get_login_cookie cfg'
+ True -> return cfg'
page <- page_from_url cfg (article cfg)
case page of
Just p -> do