Refactor the Epublishable class into the new Page type.
Remove images in a preprocessing step, before parsing, rather than while parsing a full page.
Add an output file option (defaults to stdout).
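
For example (assuming the executable is invoked as lwn-epub, and using the
flag name that cmdargs derives from the new `output` field):

    lwn-epub --output=article.epub ARTICLE    # write the EPUB to a file
    lwn-epub ARTICLE                          # write the EPUB to stdout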
argPos,
cmdArgsApply,
cmdArgsMode,
+ def,
details,
+ help,
program,
typ,
+ typFile,
summary
)
data Args =
- Args { article :: String }
+ Args { output :: FilePath,
+ article :: String }
deriving (Show, Data, Typeable)
lwn_epub_summary =
program_name ++ "-" ++ (showVersion version)
+output_help :: String
+output_help = "Output file, defaults to stdout"
+
arg_spec :: Mode (CmdArgs Args)
arg_spec = cmdArgsMode $
- Args { article = "" &= argPos 0 &= typ "ARTICLE" }
+ Args {
+ output = def &= typFile &= help output_help,
+ article = def &= argPos 0 &= typ "ARTICLE"
+ }
&= program program_name
&= summary lwn_epub_summary
&= details [description]
is_missing_arg_error s =
startswith "Requires at least" s
+
parse_args :: IO (CmdArgs Args)
parse_args = do
x <- getArgs
Right result -> return result
Left err ->
if (is_missing_arg_error err) then
+ -- Disregard the error message, show help instead.
withArgs ["--help"] parse_args
else do
hPutStrLn stderr err
exitWith (ExitFailure exit_args_parse_failed)
- -- Disregard the error message, show help instead.
+-- | Really get the command-line arguments. This calls 'parse_args'
+-- first to replace the default "wrong number of arguments" error,
+-- and then runs 'cmdArgsApply' on the result to do what the
+-- 'cmdArgs' function usually does.
apply_args :: IO Args
-apply_args = do
- x <- parse_args
- cmdArgsApply x
+apply_args =
+ parse_args >>= cmdArgsApply
+++ /dev/null
-module Epublishable
-where
-
-import Text.Pandoc
-import qualified Data.ByteString.Lazy as B (ByteString, writeFile)
-import Data.Time (getCurrentTime)
-import Data.Tree.NTree.TypeDefs (NTree)
-import System.FilePath (normalise)
-import Text.XML.HXT.Core (IOSArrow, XNode, XmlTree)
-
-import XHTML
-
-
-class (XHTML a) => Epublishable a where
- parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe a)
-
- title :: a -> String
-
- metadata :: a -> IO String
- metadata obj = do
- date <- getCurrentTime
- return $
- "<dc:creator>http://lwn.net/</dc:creator>\n" ++
- "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
- "<dc:language>en-US</dc:language>\n" ++
- "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
- "<dc:title>" ++ (title obj) ++ "</dc:title>\n"
-
- epublish :: a -> FilePath -> IO ()
- epublish obj path = do
- let xhtml = to_xhtml obj
- epmd <- metadata obj
- epub <- xhtml_to_epub epmd xhtml
- let normalized_path = normalise path
- B.writeFile normalized_path epub
-
-xhtml_to_epub :: String -> String -> IO B.ByteString
-xhtml_to_epub epmd =
- write_epub . read_html
- where
- my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
- write_epub = writeEPUB Nothing [] my_writer_options
- read_html = readHtml defaultParserState
+++ /dev/null
-module LWN.ArticlePage
-where
-
-import Data.String.Utils (strip)
-import Data.Maybe (fromJust)
-import Data.Tree.NTree.TypeDefs (NTree)
-import Text.XML.HXT.Core (
- IOSArrow,
- XmlTree,
- XNode,
- (>>>),
- (/>),
- getText,
- runX,
- xshow
- )
-import Text.HandsomeSoup (css)
-
-import Epublishable
-import LWN.Article
-import XHTML
-
--- | Defines the ArticlePage data type, containing one 'Article'.
-data ArticlePage = ArticlePage { article :: Article }
-
-
-instance XHTML ArticlePage where
- to_xhtml (ArticlePage a) =
- "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
- "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
- "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
- "<head>" ++
- " <meta http-equiv=\"Content-Type\"" ++
- " content=\"application/xhtml+xml; charset=utf-8\" />" ++
- " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
- "</head>" ++
- "<body>" ++
- "<div>" ++
- (to_xhtml a) ++
- "</div>" ++
- "</body>" ++
- "</html>"
-
-
-instance Epublishable ArticlePage where
- parse xml = do
- articles <- parse_articles xml
- case articles of
- Just [x] -> return $ Just $ ArticlePage x
- _ -> return Nothing
-
- title (ArticlePage x) = show $ LWN.Article.title x
-
-
--- | Takes data from an LWN page and determines whether or not it's a
--- single article (as opposed to a page with multiple articles).
-is_article_page :: String -> IO Bool
-is_article_page _ = return True
-
-
-parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_headline xml = do
- let element_filter = xml >>> css "div.PageHeadline h1"
- let element_text_filter = element_filter /> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one headline."
-
-parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_byline xml = do
- let element_filter = xml >>> css "div.Byline"
- let element_text_filter = element_filter /> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one byline."
-
-
-parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_body xml = do
- let element_filter = xml >>> css "div.ArticleText"
- let element_html_filter = xshow element_filter
- element_html <- runX element_html_filter
- return $ case element_html of
- [x] -> Just x
- [] -> Nothing
- _ -> error "Found more than one article."
-
-
-parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
-parse_articles xml = do
- parsed_headline <- parse_headline xml
- parsed_byline <- parse_byline xml
- parsed_body <- parse_body xml
- let title' = Title (fromJust parsed_headline)
- let byline' = Byline parsed_byline
- let body' = BodyHtml (fromJust parsed_body)
- return $ Just $ [Article title' byline' body']
+++ /dev/null
-module LWN.FullPage
-where
-
-import Data.String.Utils (split, strip)
-import Data.Maybe (catMaybes, fromJust)
-import Data.Tree.NTree.TypeDefs (NTree)
-import Text.XML.HXT.Core (
- ArrowXml,
- IOSArrow,
- XmlTree,
- XNode,
- (>>>),
- (/>),
- (//>),
- getChildren,
- getText,
- hasAttrValue,
- hasName,
- none,
- processTopDown,
- runX,
- xshow,
- when
- )
-import Text.HandsomeSoup (css, parseHtml)
-
-import Epublishable
-import LWN.Article
-import XHTML
-
--- | An LWN page with more than one article on it. These require
--- different parsing and display functions than the single-article
--- pages.
-data FullPage = FullPage { headline :: String,
- articles :: [Article] }
-
-articles_xhtml :: FullPage -> String
-articles_xhtml fp = concatMap to_xhtml (articles fp)
-
-instance XHTML FullPage where
- to_xhtml fp =
- "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
- "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
- "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
- "<head>" ++
- " <meta http-equiv=\"Content-Type\"" ++
- " content=\"application/xhtml+xml; charset=utf-8\" />" ++
- " <title>" ++ (headline fp) ++ "</title>" ++
- "</head>" ++
- "<body>" ++
- "<div>" ++
- "<h1>" ++ (headline fp) ++ "</h1>" ++
- (articles_xhtml fp) ++
- "</div>" ++
- "</body>" ++
- "</html>"
-
-instance Epublishable FullPage where
- parse xml = do
- hl <- parse_headline xml
- parsed_articles <- parse_articles xml
- case parsed_articles of
- them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
- _ -> return Nothing
-
- title = headline
-
-
-parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_headline xml = do
- let element_filter = xml >>> css "div.PageHeadline h1"
- let element_text_filter = element_filter /> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one headline."
-
-parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_article_byline xml = do
- let element_filter = xml >>> css "div.FeatureByLine"
- let element_text_filter = element_filter /> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one article byline."
-
-
-parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_article_title xml = do
- let element_filter = xml >>> css "h2.SummaryHL"
- let element_text_filter = element_filter //> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one article title."
-
-
-
-is_title :: (ArrowXml a) => a XmlTree XmlTree
-is_title =
- (hasName "h2")
- >>>
- (hasAttrValue "class" (== "SummaryHL"))
-
-
-is_byline :: (ArrowXml a) => a XmlTree XmlTree
-is_byline =
- (hasName "div")
- >>>
- (hasAttrValue "class" (== "FeatureByLine"))
-
-
-is_image :: (ArrowXml a) => a XmlTree XmlTree
-is_image =
- hasName "img"
-
-
-remove_title :: (ArrowXml a) => a XmlTree XmlTree
-remove_title =
- processTopDown ((none) `when` is_title)
-
-
-remove_byline :: (ArrowXml a) => a XmlTree XmlTree
-remove_byline =
- processTopDown ((none) `when` is_byline)
-
-
-remove_images :: (ArrowXml a) => a XmlTree XmlTree
-remove_images =
- processTopDown ((none) `when` is_image)
-
-
-
-parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-parse_article_body xml = do
- -- First, delete the article title and byline.
- let clean_xml' = xml >>> remove_title >>> remove_byline >>> remove_images
- -- The only child of the body element should be a div.lwn-article
- -- since we wrapped the article's HTML in that.
- let clean_xml = clean_xml' >>> css "body" >>> getChildren
- clean_html <- runX . xshow $ clean_xml
- return $ case clean_html of
- [x] -> Just x
- [] -> Nothing
- _ -> error "Found more than one article body."
-
-parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
-parse_article xml = do
- parsed_article_title <- parse_article_title xml
- parsed_article_byline <- parse_article_byline xml
- parsed_article_body <- parse_article_body xml
- let title' = Title $ fromJust parsed_article_title
- let byline' = Byline parsed_article_byline
- let body' = BodyHtml $ fromJust parsed_article_body
- return $ Just $ Article title' byline' body'
-
-parse_html_article :: String -> IO (Maybe Article)
-parse_html_article html = do
- let xml = parseHtml $ wrap_in_body_div html
- parse_article xml
-
-
--- | In the full page, all of the article titles and bodies are
--- wrapped in a div.ArticleText.
-parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
-parse_bodies xml =
- xml >>> css "div.ArticleText"
-
-
--- Debug, print a string.
-print_article :: String -> IO ()
-print_article s = do
- putStrLn "-----------"
- putStrLn "- Article -"
- putStrLn "-----------"
- putStrLn ""
- putStrLn s
- putStrLn ""
-
-
--- Debug, print an article's body html.
-print_body :: Article -> IO ()
-print_body x =
- print_article bh
- where
- bh' = body_html x
- bh = getBodyHtml bh'
-
-
-parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
-parse_articles xml = do
- bodies <- runX . xshow $ parse_bodies xml
- let article_separator = "<h2 class=\"SummaryHL\">"
- let split_articles'' = split article_separator (concat bodies)
- -- The first element will contain the crap before the first <h2...>.
- let split_articles' = tail split_articles''
- -- Put the separator back, it was lost during the split.
- let split_articles = map (article_separator ++) split_articles'
- --_ <- mapM print_article split_articles
- real_articles <- mapM parse_html_article split_articles
- let just_articles = catMaybes real_articles
- return just_articles
-
-
-wrap_in_body_div :: String -> String
-wrap_in_body_div s =
- "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
--- /dev/null
+module LWN.Page
+where
+
+import Text.Pandoc
+import Data.Time (getCurrentTime)
+import System.IO (Handle, hFlush)
+import qualified Data.ByteString.Lazy as B (ByteString, hPut)
+import Data.String.Utils (split, strip)
+import Data.Maybe (catMaybes, fromJust, isNothing)
+import Data.Tree.NTree.TypeDefs (NTree)
+import Text.XML.HXT.Core (
+ ArrowXml,
+ IOSArrow,
+ XmlTree,
+ XNode,
+ (>>>),
+ (/>),
+ (//>),
+ getChildren,
+ getText,
+ hasAttrValue,
+ hasName,
+ none,
+ processTopDown,
+ runX,
+ xshow,
+ when
+ )
+import Text.HandsomeSoup (css, parseHtml)
+
+import LWN.Article
+import XHTML
+
+data Page =
+ -- | An LWN page with one article on it.
+ ArticlePage { article :: Article } |
+
+ -- | An LWN page with more than one article on it. These require
+ -- different parsing and display functions than the single-article
+ -- pages.
+ FullPage { headline :: String,
+ articles :: [Article] }
+
+
+instance XHTML Page where
+ to_xhtml (ArticlePage a) =
+ "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
+ "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
+ "<head>" ++
+ " <meta http-equiv=\"Content-Type\"" ++
+ " content=\"application/xhtml+xml; charset=utf-8\" />" ++
+ " <title>" ++ (show $ LWN.Article.title a) ++ "</title>" ++
+ "</head>" ++
+ "<body>" ++
+ "<div>" ++
+ (to_xhtml a) ++
+ "</div>" ++
+ "</body>" ++
+ "</html>"
+
+ to_xhtml (FullPage hl as) =
+ "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
+ "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
+ "<head>" ++
+ " <meta http-equiv=\"Content-Type\"" ++
+ " content=\"application/xhtml+xml; charset=utf-8\" />" ++
+ " <title>" ++ hl ++ "</title>" ++
+ "</head>" ++
+ "<body>" ++
+ "<div>" ++
+ "<h1>" ++ hl ++ "</h1>" ++
+ (concatMap to_xhtml as) ++
+ "</div>" ++
+ "</body>" ++
+ "</html>"
+
+
+
+remove_images :: (ArrowXml a) => a XmlTree XmlTree
+remove_images =
+ processTopDown ((none) `when` is_image)
+
+
+-- | Preprocessing common to both page types.
+preprocess :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
+preprocess xml =
+ xml >>> remove_images
+
+
+parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+parse xml = do
+ let clean_xml = preprocess xml
+ appr <- ap_parse clean_xml
+ fppr <- fp_parse clean_xml
+ return $
+ if (isNothing appr) then
+ fppr
+ else
+ appr
+
+--
+-- ArticlePage Stuff
+--
+ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+ap_parse xml = do
+ arts <- ap_parse_articles xml
+ case arts of
+ Just [x] -> return $ Just $ ArticlePage x
+ _ -> return Nothing
+
+
+ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+ap_parse_headline xml = do
+ let element_filter = xml >>> css "div.PageHeadline h1"
+ let element_text_filter = element_filter /> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one headline."
+
+ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+ap_parse_byline xml = do
+ let element_filter = xml >>> css "div.Byline"
+ let element_text_filter = element_filter /> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one byline."
+
+
+ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+ap_parse_body xml = do
+ let element_filter = xml >>> css "div.ArticleText"
+ let element_html_filter = xshow element_filter
+ element_html <- runX element_html_filter
+ return $ case element_html of
+ [x] -> Just x
+ [] -> Nothing
+ _ -> error "Found more than one article."
+
+
+ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
+ap_parse_articles xml = do
+ parsed_headline <- ap_parse_headline xml
+ parsed_byline <- ap_parse_byline xml
+ parsed_body <- ap_parse_body xml
+ let title' = Title (fromJust parsed_headline)
+ let byline' = Byline parsed_byline
+ let body' = BodyHtml (fromJust parsed_body)
+ return $ Just $ [Article title' byline' body']
+
+
+
+
+--
+-- FullPage Stuff
+--
+
+fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+fp_parse xml = do
+ hl <- fp_parse_headline xml
+ parsed_articles <- fp_parse_articles xml
+ case parsed_articles of
+ them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
+ _ -> return Nothing
+
+
+
+
+fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_headline xml = do
+ let element_filter = xml >>> css "div.PageHeadline h1"
+ let element_text_filter = element_filter /> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one headline."
+
+fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_article_byline xml = do
+ let element_filter = xml >>> css "div.FeatureByLine"
+ let element_text_filter = element_filter /> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one article byline."
+
+
+fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_article_title xml = do
+ let element_filter = xml >>> css "h2.SummaryHL"
+ let element_text_filter = element_filter //> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one article title."
+
+
+
+is_title :: (ArrowXml a) => a XmlTree XmlTree
+is_title =
+ (hasName "h2")
+ >>>
+ (hasAttrValue "class" (== "SummaryHL"))
+
+
+is_byline :: (ArrowXml a) => a XmlTree XmlTree
+is_byline =
+ (hasName "div")
+ >>>
+ (hasAttrValue "class" (== "FeatureByLine"))
+
+
+is_image :: (ArrowXml a) => a XmlTree XmlTree
+is_image =
+ hasName "img"
+
+
+remove_title :: (ArrowXml a) => a XmlTree XmlTree
+remove_title =
+ processTopDown ((none) `when` is_title)
+
+
+remove_byline :: (ArrowXml a) => a XmlTree XmlTree
+remove_byline =
+ processTopDown ((none) `when` is_byline)
+
+
+
+fp_parse_article_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+fp_parse_article_body xml = do
+ -- First, delete the article title and byline.
+ let clean_xml' = xml >>> remove_title >>> remove_byline
+ -- The only child of the body element should be a div.lwn-article
+ -- since we wrapped the article's HTML in that.
+ let clean_xml = clean_xml' >>> css "body" >>> getChildren
+ clean_html <- runX . xshow $ clean_xml
+ return $ case clean_html of
+ [x] -> Just x
+ [] -> Nothing
+ _ -> error "Found more than one article body."
+
+fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
+fp_parse_article xml = do
+ parsed_article_title <- fp_parse_article_title xml
+ parsed_article_byline <- fp_parse_article_byline xml
+ parsed_article_body <- fp_parse_article_body xml
+ let title' = Title $ fromJust parsed_article_title
+ let byline' = Byline parsed_article_byline
+ let body' = BodyHtml $ fromJust parsed_article_body
+ return $ Just $ Article title' byline' body'
+
+parse_html_article :: String -> IO (Maybe Article)
+parse_html_article html = do
+ let xml = parseHtml $ wrap_in_body_div html
+ fp_parse_article xml
+
+
+-- | In the full page, all of the article titles and bodies are
+-- wrapped in a div.ArticleText.
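+--
+-- A rough sketch of the markup this module expects (illustrative only,
+-- inferred from the CSS selectors used here):
+--
+-- > <div class="ArticleText">
+-- >   <h2 class="SummaryHL">First article title</h2>
+-- >   <div class="FeatureByLine">Author</div>
+-- >   <p>Article body ...</p>
+-- >   <h2 class="SummaryHL">Second article title</h2>
+-- >   ...
+-- > </div>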
+parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
+parse_bodies xml =
+ xml >>> css "div.ArticleText"
+
+
+fp_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO [Article]
+fp_parse_articles xml = do
+ bodies <- runX . xshow $ parse_bodies xml
+ let article_separator = "<h2 class=\"SummaryHL\">"
+ let split_articles'' = split article_separator (concat bodies)
+ -- The first element will contain the crap before the first <h2...>.
+ let split_articles' = tail split_articles''
+ -- Put the separator back, it was lost during the split.
+ let split_articles = map (article_separator ++) split_articles'
+ --_ <- mapM print_article split_articles
+ real_articles <- mapM parse_html_article split_articles
+ let just_articles = catMaybes real_articles
+ return just_articles
+
+
+wrap_in_body_div :: String -> String
+wrap_in_body_div s =
+ "<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
+
+
+
+
+--
+-- Epublishable stuff
+--
+
+title :: Page -> String
+title (ArticlePage a) = getTitle $ LWN.Article.title a
+title (FullPage hl _) = hl
+
+
+metadata :: Page -> IO String
+metadata obj = do
+ date <- getCurrentTime
+ return $
+ "<dc:creator>http://lwn.net/</dc:creator>\n" ++
+ "<dc:date>" ++ (show date) ++ "</dc:date>\n" ++
+ "<dc:language>en-US</dc:language>\n" ++
+ "<dc:rights>Copyright Eklektix, Inc.</dc:rights>\n" ++
+ "<dc:title>" ++ (LWN.Page.title obj) ++ "</dc:title>\n"
+
+
+epublish :: Page -> Handle -> IO ()
+epublish obj handle = do
+ let xhtml = to_xhtml obj
+ epmd <- metadata obj
+ epub <- xhtml_to_epub epmd xhtml
+ B.hPut handle epub
+ -- Flush explicitly so a file handle isn't left holding buffered,
+ -- unwritten data when the program exits.
+ hFlush handle
+
+
+xhtml_to_epub :: String -> String -> IO B.ByteString
+xhtml_to_epub epmd =
+ write_epub . read_html
+ where
+ my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd }
+ write_epub = writeEPUB Nothing [] my_writer_options
+ read_html = readHtml defaultParserState
module Main
where
-import Data.Maybe (fromJust)
-import System.Console.CmdArgs (cmdArgsRun)
-
+import System.IO (
+ Handle,
+ IOMode (WriteMode),
+ openBinaryFile,
+ stdout
+ )
import Text.XML.HXT.Core
-
import CommandLine (Args(..), apply_args)
-import Epublishable
-import LWN.ArticlePage
-import LWN.FullPage
+import LWN.Page
my_read :: String -> IOStateArrow s b XmlTree
my_read =
withInputEncoding utf8,
withWarnings no ]
+-- | If we're given an empty path, return a handle to
+-- 'stdout'. Otherwise, open the given file in binary mode and return
+-- a write handle to it.
+get_output_handle :: FilePath -> IO Handle
+get_output_handle path =
+ if (null path) then
+ return stdout
+ else
+ openBinaryFile path WriteMode
+
+
+-- | Convert the given article to either a URL or a filesystem
+-- path. If the given article exists on the filesystem, we assume
+-- it's a file. Otherwise, we check to see if it's a URL. Failing
+-- that, we try to construct a URL from what we're given and do our
+-- best. (For now, this is a stub that simply returns its argument
+-- unchanged.)
+real_article_path :: String -> IO String
+real_article_path = return . id
+
main :: IO ()
main = do
Args{..} <- apply_args
- print article
-
- -- let article_html = my_read "test/fixtures/501317-article.html"
- -- ioap <- parse article_html
- -- let article_page :: ArticlePage = fromJust $ ioap
- -- epublish article_page "single_article.epub"
-
- -- let page_html = my_read "test/fixtures/500848-page.html"
- -- ioap_f <- parse page_html
- -- let full_page :: FullPage = fromJust $ ioap_f
- -- epublish full_page "full_page.epub"
+ output_handle <- get_output_handle output
+ input_path <- real_article_path article
+ let html = my_read input_path
+ result <- parse html
- -- let bigpage_html = my_read "test/fixtures/50844-bigpage.html"
- -- ioap_bp <- parse bigpage_html
- -- let bigpage :: FullPage = fromJust $ ioap_bp
- -- epublish bigpage "bigpage.epub"
+ case result of
+ Just stuff -> epublish stuff output_handle
+ Nothing -> return ()
- -- putStrLn "Done."
+ putStrLn "Done."