From f3321e2ce7d7645ad562dc8f6620bfd561edc75d Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Sat, 30 Jun 2012 12:37:00 -0400 Subject: [PATCH] Source reorganization and cleanup. --- src/LWN/Article.hs | 90 +++++++++++++++++++++++++++++++++++++- src/LWN/HTTP.hs | 30 ++++++++++--- src/LWN/Page.hs | 32 +++++--------- src/LWN/XHTML.hs | 33 ++++++++++++++ src/Main.hs | 107 ++++----------------------------------------- src/XHTML.hs | 5 --- test/TestSuite.hs | 4 +- 7 files changed, 166 insertions(+), 135 deletions(-) create mode 100644 src/LWN/XHTML.hs delete mode 100644 src/XHTML.hs diff --git a/src/LWN/Article.hs b/src/LWN/Article.hs index 70c68a5..9d4c858 100644 --- a/src/LWN/Article.hs +++ b/src/LWN/Article.hs @@ -1,7 +1,27 @@ -module LWN.Article +module LWN.Article ( + Article(..), + Byline(..), + Title(..), + BodyHtml(..), + article_tests, + real_article_path + ) where -import XHTML +import Data.List (isPrefixOf) +import System.Directory (doesFileExist) +import Test.HUnit (Assertion, assertEqual) +import Test.Framework (Test, testGroup) +import Test.Framework.Providers.HUnit (testCase) +import Text.Regex.Posix ((=~)) + +import LWN.URI ( + add_trailing_slash, + is_lwn_url, + try_make_absolute_url, + make_https) + +import LWN.XHTML (XHTML, to_xhtml) newtype Title = Title { getTitle :: String } newtype Byline = Byline { getByline :: Maybe String } @@ -36,3 +56,69 @@ instance XHTML Article where (to_xhtml t) ++ (to_xhtml bl) ++ (to_xhtml b) + + + +-- | Convert the given article to either a URL or a filesystem +-- path. If the given article exists on the filesystem, we assume +-- it's a file. Otherwise, we check to see if it's a URL. Failing +-- that, we try to construct a URL from what we're given and do our +-- best. +real_article_path :: String -> IO String +real_article_path path = do + is_file <- doesFileExist path + return $ if is_file then path else add_trailing_slash check_cases + where + abs_current = try_make_absolute_url ("/" ++ path) + abs_article = try_make_absolute_url ("Articles/" ++ path) + + check_cases :: String + check_cases + | is_lwn_url path = make_https path + | isPrefixOf "current" path = abs_current + | path =~ "^[0-9]+$" = abs_article + | otherwise = path -- Give up + + + +test_current_article_path :: Assertion +test_current_article_path = do + let expected = "https://lwn.net/current/" + actual <- real_article_path "current" + assertEqual "Current article path constructed" expected actual + +test_current_bigpage_article_path :: Assertion +test_current_bigpage_article_path = do + let expected = "https://lwn.net/current/bigpage" + actual <- real_article_path "current/bigpage" + assertEqual "Current bigpage article path constructed" expected actual + +test_numbered_article_path :: Assertion +test_numbered_article_path = do + let expected = "https://lwn.net/Articles/69/" + actual <- real_article_path "69" -- I'm twelve + assertEqual "Numbered article path constructed" expected actual + + +test_full_article_path :: Assertion +test_full_article_path = do + let expected = "https://lwn.net/Articles/502979/" + actual <- real_article_path "https://lwn.net/Articles/502979/" + assertEqual "Full article path left alone" expected actual + +test_non_https_article_path :: Assertion +test_non_https_article_path = do + let expected = "https://lwn.net/Articles/502979/" + actual <- real_article_path "http://lwn.net/Articles/502979/" + assertEqual "Non-https URL made https" expected actual + +article_tests :: Test +article_tests = + testGroup "Article Tests" [ + testCase "Current article path constructed" test_current_article_path, + testCase + "Current bigpage article path constructed" + test_current_bigpage_article_path, + testCase "Numbered article path constructed" test_numbered_article_path, + testCase "Full article path left alone" test_full_article_path, + testCase "Non-https URL made https" test_non_https_article_path ] diff --git a/src/LWN/HTTP.hs b/src/LWN/HTTP.hs index 71058dc..a8a1980 100644 --- a/src/LWN/HTTP.hs +++ b/src/LWN/HTTP.hs @@ -4,12 +4,11 @@ module LWN.HTTP where import qualified Data.ByteString as B (hPut) - +import qualified Data.Map as Map (Map, empty, insert) import Network.Curl ( CurlCode(..), CurlOption(..), CurlResponse, - URLString, do_curl_, initialize, respBody, @@ -21,9 +20,9 @@ import System.Directory (doesFileExist, getTemporaryDirectory) import System.IO (hClose, hPutStrLn, stderr) import System.IO.Temp (openBinaryTempFile, openTempFile) -import LWN.URI (filename) +import LWN.URI (URL, filename) -login_url :: URLString +login_url :: URL login_url = "https://lwn.net/login" username_field :: String @@ -59,7 +58,7 @@ make_cookie_jar = do hClose out_handle -- We just want to create it for now. return out_path -get_page :: Maybe FilePath -> URLString -> IO (Either String String) +get_page :: Maybe FilePath -> URL -> IO (Either String String) get_page cookie_file url = withCurlDo $ do -- Create a curl instance. @@ -143,7 +142,7 @@ log_in cookie_jar username password = -- We need to be able to parse the filename out of the URL -- so that when we stick our image in the document, the reader -- knows that type (jpg, png, etc.) it is. -save_image :: URLString -> IO (Maybe FilePath) +save_image :: URL -> IO (Maybe FilePath) save_image url = do it_exists <- doesFileExist url if it_exists then do @@ -164,3 +163,22 @@ save_image url = do Right bs -> do B.hPut out_handle bs return $ Just out_path + + + + +-- | Map absolute image URLs to local system file paths where the +-- image referenced by the URL is stored. +type ImageMap = Map.Map URL FilePath + +download_image_urls :: [URL] -> IO ImageMap +download_image_urls image_urls = do + files <- mapM save_image image_urls + let pairs = zip image_urls files + return $ foldl my_insert empty_map pairs + where + empty_map = Map.empty :: ImageMap + + my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap + my_insert dict (_, Nothing) = dict + my_insert dict (k, Just v) = Map.insert k v dict diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index 2bbe21a..6a097cb 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -3,7 +3,7 @@ module LWN.Page where -import qualified Data.Map as Map +import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) import System.IO (Handle, hClose, hFlush) import qualified Data.ByteString.Lazy as B (ByteString, hPut) @@ -12,7 +12,12 @@ import Data.Maybe (catMaybes, fromJust, isNothing) import Test.HUnit (Assertion, assertEqual) import Test.Framework (Test, testGroup) import Test.Framework.Providers.HUnit (testCase) -import Text.Pandoc +import Text.Pandoc ( + defaultParserState, + defaultWriterOptions, + readHtml, + writeEPUB, + writerEPUBMetadata) import Text.XML.HXT.Core ( ArrowXml, IOSArrow, @@ -34,31 +39,14 @@ import Text.XML.HXT.Core ( runX, setElemName, xshow, - when - ) + when) import Text.HandsomeSoup (css, parseHtml) import LWN.Article -import LWN.HTTP (save_image) +import LWN.HTTP (ImageMap, download_image_urls) import LWN.URI (URL, try_make_absolute_url) +import LWN.XHTML (XHTML, to_xhtml) import Misc (contains) -import XHTML - --- Map absolute image URLs to local system file paths where the image --- referenced by the URL is stored. -type ImageMap = Map.Map URL FilePath - -download_image_urls :: [URL] -> IO ImageMap -download_image_urls image_urls = do - files <- mapM save_image image_urls - let pairs = zip image_urls files - return $ foldl my_insert empty_map pairs - where - empty_map = Map.empty :: ImageMap - - my_insert :: ImageMap -> (URL, Maybe FilePath) -> ImageMap - my_insert dict (_, Nothing) = dict - my_insert dict (k, Just v) = Map.insert k v dict -- Should be called *after* preprocessing. download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap diff --git a/src/LWN/XHTML.hs b/src/LWN/XHTML.hs new file mode 100644 index 0000000..caa7403 --- /dev/null +++ b/src/LWN/XHTML.hs @@ -0,0 +1,33 @@ +module LWN.XHTML ( + XHTML, + parse_lwn, + to_xhtml + ) +where + +import Text.XML.HXT.Core ( + IOStateArrow, + SysConfigList, + XmlTree, + no, + readString, + withParseHTML, + withValidate, + withWarnings, + yes + ) + +class XHTML a where + to_xhtml :: a -> String + + + +-- | Options used when parsing HTML. +my_read_opts :: SysConfigList +my_read_opts = [ withValidate no, + withParseHTML yes, + withWarnings no ] + +-- | My version of HandsomeSoup's parseHTML. +parse_lwn :: String -> IOStateArrow s b XmlTree +parse_lwn = readString my_read_opts diff --git a/src/Main.hs b/src/Main.hs index 74971fd..5ce4026 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -2,7 +2,6 @@ module Main where -import Data.List (isPrefixOf) import Data.Maybe (fromJust) import Prelude hiding (readFile) import System.Directory (doesFileExist) @@ -12,44 +11,19 @@ import System.IO ( hPutStrLn, openBinaryFile, stderr, - stdout - ) + stdout) import System.IO.UTF8 (readFile) -import Test.HUnit (Assertion, assertEqual) -import Test.Framework (Test, testGroup) -import Test.Framework.Providers.HUnit (testCase) -import Text.Regex.Posix ((=~)) import Text.XML.HXT.Core ( IOStateArrow, - SysConfigList, - XmlTree, - no, - readString, - withParseHTML, - withValidate, - withWarnings, - yes - ) + XmlTree) + import CommandLine (show_help) import Configuration (Cfg(..), get_cfg, use_account) +import LWN.Article (real_article_path) import LWN.HTTP (get_page, log_in, make_cookie_jar) import LWN.Page (epublish, parse) -import LWN.URI ( - add_trailing_slash, - is_lwn_url, - try_make_absolute_url, - make_https) - - - -my_read_opts :: SysConfigList -my_read_opts = [ withValidate no, - withParseHTML yes, - withWarnings no ] +import LWN.XHTML (parse_lwn) --- | My version of HandsomeSoup's parseHTML. -my_read :: String -> IOStateArrow s b XmlTree -my_read = readString my_read_opts -- | Try to parse the given article using HXT. We try a few different @@ -61,7 +35,7 @@ get_xml_from_article cfg = do case is_file of True -> do contents <- readFile my_article - return $ Just $ my_read contents + return $ Just $ parse_lwn contents False -> do -- Download the URL and try to parse it. if use_account cfg then do @@ -85,7 +59,7 @@ get_xml_from_article cfg = do let msg = "Failed to retrieve page. " ++ err hPutStrLn stderr msg return Nothing - Right h -> return $ Just $ my_read h + Right h -> return $ Just $ parse_lwn h else do html <- get_page Nothing my_article case html of @@ -93,7 +67,8 @@ get_xml_from_article cfg = do let msg = "Failed to retrieve page. " ++ err hPutStrLn stderr msg return Nothing - Right h -> return $ Just $ my_read h + Right h -> return $ Just $ parse_lwn h + -- | If we're given an empty path, return a handle to -- 'stdout'. Otherwise, open the given file and return a read/write @@ -106,27 +81,6 @@ get_output_handle path = openBinaryFile path WriteMode - --- | Convert the given article to either a URL or a filesystem --- path. If the given article exists on the filesystem, we assume --- it's a file. Otherwise, we check to see if it's a URL. Failing --- that, we try to construct a URL from what we're given and do our --- best. -real_article_path :: String -> IO String -real_article_path path = do - is_file <- doesFileExist path - return $ if is_file then path else add_trailing_slash check_cases - where - abs_current = try_make_absolute_url ("/" ++ path) - abs_article = try_make_absolute_url ("Articles/" ++ path) - - check_cases :: String - check_cases - | is_lwn_url path = make_https path - | isPrefixOf "current" path = abs_current - | path =~ "^[0-9]+$" = abs_article - | otherwise = path -- Give up - main :: IO () main = do cfg <- get_cfg @@ -145,46 +99,3 @@ main = do Nothing -> do _ <- show_help return () - - -test_current_article_path :: Assertion -test_current_article_path = do - let expected = "https://lwn.net/current/" - actual <- real_article_path "current" - assertEqual "Current article path constructed" expected actual - -test_current_bigpage_article_path :: Assertion -test_current_bigpage_article_path = do - let expected = "https://lwn.net/current/bigpage" - actual <- real_article_path "current/bigpage" - assertEqual "Current bigpage article path constructed" expected actual - -test_numbered_article_path :: Assertion -test_numbered_article_path = do - let expected = "https://lwn.net/Articles/69/" - actual <- real_article_path "69" -- I'm twelve - assertEqual "Numbered article path constructed" expected actual - - -test_full_article_path :: Assertion -test_full_article_path = do - let expected = "https://lwn.net/Articles/502979/" - actual <- real_article_path "https://lwn.net/Articles/502979/" - assertEqual "Full article path left alone" expected actual - -test_non_https_article_path :: Assertion -test_non_https_article_path = do - let expected = "https://lwn.net/Articles/502979/" - actual <- real_article_path "http://lwn.net/Articles/502979/" - assertEqual "Non-https URL made https" expected actual - -main_tests :: Test -main_tests = - testGroup "Main Tests" [ - testCase "Current article path constructed" test_current_article_path, - testCase - "Current bigpage article path constructed" - test_current_bigpage_article_path, - testCase "Numbered article path constructed" test_numbered_article_path, - testCase "Full article path left alone" test_full_article_path, - testCase "Non-https URL made https" test_non_https_article_path ] diff --git a/src/XHTML.hs b/src/XHTML.hs deleted file mode 100644 index 6c4421f..0000000 --- a/src/XHTML.hs +++ /dev/null @@ -1,5 +0,0 @@ -module XHTML -where - -class XHTML a where - to_xhtml :: a -> String diff --git a/test/TestSuite.hs b/test/TestSuite.hs index ce648dc..adbb203 100644 --- a/test/TestSuite.hs +++ b/test/TestSuite.hs @@ -15,7 +15,7 @@ import Test.Framework.Runners.Options import Test.Framework.Providers.API (TestName) import Test.HUnit -import Main (main_tests) +import LWN.Article (article_tests) import LWN.Page (page_tests) import LWN.URI (uri_tests) @@ -23,6 +23,6 @@ main :: IO () main = defaultMain tests tests :: [Test.Framework.Test] -tests = [ main_tests, +tests = [ article_tests, page_tests, uri_tests ] -- 2.44.2