X-Git-Url: http://gitweb.michael.orlitzky.com/?p=dead%2Flwn-epub.git;a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=3027eaef6c83e76067a744acf98be3a8b6b8579f;hp=6a097cbf7714d8929a37708ddd92a29c8982bf21;hb=2953924e2016393a1ffb9e2e82b4c90c8c57dfd3;hpb=f3321e2ce7d7645ad562dc8f6620bfd561edc75d diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 6a097cb..3027eae 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -5,10 +5,13 @@ where import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) -import System.IO (Handle, hClose, hFlush) import qualified Data.ByteString.Lazy as B (ByteString, hPut) import Data.String.Utils (split, strip) import Data.Maybe (catMaybes, fromJust, isNothing) +import Prelude hiding (readFile) +import System.Directory (doesFileExist) +import System.IO (Handle, hClose, hFlush, hPutStrLn, stderr) +import System.IO.UTF8 (readFile) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) @@ -21,6 +24,7 @@ import Text.Pandoc ( import Text.XML.HXT.Core ( ArrowXml, IOSArrow, + IOStateArrow, XmlTree, (>>>), (/>), @@ -42,12 +46,64 @@ import Text.XML.HXT.Core ( when) import Text.HandsomeSoup (css, parseHtml) +import Configuration (Cfg, password, use_account, username) import LWN.Article -import LWN.HTTP (ImageMap, download_image_urls) +import LWN.HTTP ( + ImageMap, + download_image_urls, + get_page, + log_in, + make_cookie_jar) import LWN.URI (URL, try_make_absolute_url) -import LWN.XHTML (XHTML, to_xhtml) +import LWN.XHTML (XHTML, parse_lwn, to_xhtml) import Misc (contains) + +-- | Try to parse the given article using HXT. We try a few different +-- methods; if none of them work, we return 'Nothing'. +get_xml_from_article :: Cfg -> URL -> IO (Maybe (IOStateArrow s b XmlTree)) +get_xml_from_article cfg article_name = do + my_article <- real_article_path article_name + is_file <- doesFileExist my_article + case is_file of + True -> do + contents <- readFile my_article + return $ Just $ parse_lwn contents + False -> do + -- Download the URL and try to parse it. + if use_account cfg then do + -- use_account would be false if these fromJusts would fail. + cj <- make_cookie_jar + li_result <- log_in cj + (fromJust $ username cfg) + (fromJust $ password cfg) + + case li_result of + Left err -> do + let msg = "Failed to log in. " ++ err + hPutStrLn stderr msg + Right response_body -> do + hPutStrLn stderr response_body + + html <- get_page (Just cj) my_article + + case html of + Left err -> do + let msg = "Failed to retrieve page. " ++ err + hPutStrLn stderr msg + return Nothing + Right h -> return $ Just $ parse_lwn h + else do + html <- get_page Nothing my_article + case html of + Left err -> do + let msg = "Failed to retrieve page. " ++ err + hPutStrLn stderr msg + return Nothing + Right h -> return $ Just $ parse_lwn h + + + -- Should be called *after* preprocessing. download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap download_images xml = do @@ -77,9 +133,7 @@ instance XHTML Page where " " ++ (show $ LWN.Article.title a) ++ "" ++ "" ++ "" ++ - "
" ++ (to_xhtml a) ++ - "
" ++ "" ++ "" @@ -102,6 +156,14 @@ instance XHTML Page where +page_from_url :: Cfg -> URL -> IO (Maybe Page) +page_from_url cfg url = do + maybe_html <- get_xml_from_article cfg url + case maybe_html of + Just html -> parse html + Nothing -> return Nothing + + is_link :: (ArrowXml a) => a XmlTree XmlTree is_link = isElem >>> hasName "a" @@ -110,7 +172,7 @@ is_link = remove_comment_links :: (ArrowXml a) => a XmlTree XmlTree remove_comment_links = processTopDown $ kill_comments `when` is_link - where + where is_comment_link = hasAttrValue "href" (contains "#Comments") @@ -166,7 +228,7 @@ parse xml = do return $ if (isNothing appr) then fppr - else + else appr @@ -200,12 +262,12 @@ parse_byline xml = do -- ap_parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page) ap_parse xml = do - arts <- ap_parse_articles xml + arts <- ap_parse_articles xml case arts of [x] -> return $ Just $ ArticlePage x _ -> return Nothing - + ap_parse_body :: IOSArrow XmlTree XmlTree -> IO (Maybe String) ap_parse_body xml = do let element_filter = xml >>> css "div.ArticleText" @@ -227,11 +289,11 @@ ap_parse_articles xml = do if (isNothing parsed_headline) || (isNothing parsed_body) then return [] - else do + else do let title' = Title $ fromJust parsed_headline let byline' = Byline parsed_byline let body' = BodyHtml $ fromJust parsed_body - + return $ [Article title' byline' body'] @@ -323,7 +385,7 @@ parse_html_article html = do let xml = parseHtml $ wrap_in_body_div html fp_parse_article xml - + -- | In the full page, all of the article titles and bodies are -- wrapped in one big div.ArticleText. parse_bodies :: IOSArrow XmlTree XmlTree -> IOSArrow XmlTree XmlTree @@ -415,7 +477,7 @@ make_image_srcs_absolute = changeAttrValue try_make_absolute_url make_srcs_absolute :: (ArrowXml a) => a XmlTree XmlTree - make_srcs_absolute = + make_srcs_absolute = processAttrl $ change_src `when` hasName "src"