From d5bab2c486d30e54d3e94cc3bdbb230f2cd1f3f3 Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Sat, 23 Jun 2012 14:26:27 -0400 Subject: [PATCH] Switch from epub to pandoc for epub creation. --- lwn-epub.cabal | 4 +-- src/Epublishable.hs | 46 ++++++++++++++------------ src/LWN/FullPage.hs | 78 ++++++++++++++++++++++++++++++++++++++++++++- src/Main.hs | 5 ++- 4 files changed, 107 insertions(+), 26 deletions(-) diff --git a/lwn-epub.cabal b/lwn-epub.cabal index 8afbe2c..755845b 100644 --- a/lwn-epub.cabal +++ b/lwn-epub.cabal @@ -12,12 +12,12 @@ executable lwn-epub base == 4.5.*, bytestring == 0.9.*, directory == 1.1.*, - epub == 0.0.6, filepath == 1.3.*, HandsomeSoup == 0.3.*, hxt == 9.*, MissingH == 1.1.*, - old-time + pandoc == 1.9.*, + time == 1.* main-is: Main.hs diff --git a/src/Epublishable.hs b/src/Epublishable.hs index 01d27f9..3ca9b68 100644 --- a/src/Epublishable.hs +++ b/src/Epublishable.hs @@ -1,14 +1,13 @@ module Epublishable where -import Codec.EBook -import qualified Data.ByteString.Lazy as B (writeFile) -import Data.List (foldl') +import Text.Pandoc +import qualified Data.ByteString.Lazy as B (ByteString, writeFile) +import Data.Time (getCurrentTime) import Data.Tree.NTree.TypeDefs (NTree) import System.FilePath (normalise) import Text.XML.HXT.Core (IOSArrow, XNode, XmlTree) -import Misc (string_to_bytestring) import XHTML @@ -17,21 +16,28 @@ class (XHTML a) => Epublishable a where title :: a -> String - epublish :: a -> FilePath -> Integer -> IO () - epublish obj path time = do - let book_name = title obj - let book = - emptyBook { - bookID = "http://lwn.net/" ++ book_name, - bookAuthor = "LWN Copyright Eklektix, Inc.", - bookTitle = book_name - } + metadata :: a -> IO String + metadata obj = do + date <- getCurrentTime + return $ + "http://lwn.net/\n" ++ + "" ++ (show date) ++ "\n" ++ + "en-US\n" ++ + "Copyright Eklektix, Inc.\n" ++ + "" ++ (title obj) ++ "\n" + + epublish :: a -> FilePath -> IO () + epublish obj path = do let xhtml = to_xhtml obj - bs_xhtml <- string_to_bytestring xhtml - let iid = "iid-1" + epmd <- metadata obj + epub <- xhtml_to_epub epmd xhtml let normalized_path = normalise path - let metadata = Just (ChapterMetadata book_name) - let bi = BookItem iid normalized_path bs_xhtml opsMediatype metadata - let bookFull = foldl' addItem2Book book [bi] - let outdata = book2Bin bookFull time - B.writeFile normalized_path outdata + B.writeFile normalized_path epub + +xhtml_to_epub :: String -> String -> IO B.ByteString +xhtml_to_epub epmd = + write_epub . read_html + where + my_writer_options = defaultWriterOptions { writerEPUBMetadata = epmd } + write_epub = writeEPUB Nothing [] my_writer_options + read_html = readHtml defaultParserState diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs index 9ff782c..cc89737 100644 --- a/src/LWN/FullPage.hs +++ b/src/LWN/FullPage.hs @@ -1,6 +1,21 @@ module LWN.FullPage where +import Data.String.Utils (strip) +import Data.Maybe (fromJust) +import Data.Tree.NTree.TypeDefs (NTree) +import Text.XML.HXT.Core ( + IOSArrow, + XmlTree, + XNode, + (>>>), + (/>), + getText, + runX, + xshow + ) +import Text.HandsomeSoup (css) + import Epublishable import LWN.Article import XHTML @@ -10,8 +25,25 @@ import XHTML -- pages. data FullPage = FullPage { articles :: [Article] } +articles_xhtml :: FullPage -> String +articles_xhtml fp = concatMap show (articles x) + instance XHTML FullPage where - to_xhtml fp = "" + to_xhtml fp = + "" ++ + "" ++ + "\n\n" ++ + " " ++ + " " ++ (headline a) ++ "\n" ++ + "\n" ++ + "\n" ++ + "
\n\n" ++ + (articles_xhtml fp) ++ + "\n\n
\n" ++ + "\n\n" ++ + "" instance Epublishable FullPage where parse xml = do @@ -21,3 +53,47 @@ instance Epublishable FullPage where _ -> return Nothing title _ = "LWN.net" + + +-- parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +-- parse_headline xml = do +-- let element_filter = xml >>> css "div.PageHeadline h1" +-- let element_text_filter = element_filter /> getText +-- element_text <- runX element_text_filter +-- return $ case element_text of +-- [x] -> Just $ strip x +-- [] -> Nothing +-- _ -> error "Found more than one headline." + +-- parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +-- parse_byline xml = do +-- let element_filter = xml >>> css "div.Byline" +-- let element_text_filter = element_filter /> getText +-- element_text <- runX element_text_filter +-- return $ case element_text of +-- [x] -> Just $ strip x +-- [] -> Nothing +-- _ -> error "Found more than one byline." + + +-- parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +-- parse_body xml = do +-- let element_filter = xml >>> css "div.ArticleText" +-- let element_html_filter = xshow element_filter +-- element_html <- runX element_html_filter +-- return $ case element_html of +-- [x] -> Just x +-- [] -> Nothing +-- _ -> error "Found more than one article." + + +-- parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article]) +-- parse_articles xml = do +-- parsed_headline <- parse_headline xml +-- parsed_byline <- parse_byline xml +-- parsed_body <- parse_body xml +-- let headline' = fromJust parsed_headline +-- let byline' = fromJust parsed_byline +-- let body' = fromJust parsed_body +-- return $ Just $ [Article headline' byline' body'] + diff --git a/src/Main.hs b/src/Main.hs index a3728d2..a810bc9 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -1,9 +1,9 @@ +{-# LANGUAGE ScopedTypeVariables #-} module Main where import Data.Maybe (fromJust) import Text.HandsomeSoup (parseHtml) -import System.Time (ClockTime( TOD ), getClockTime) import Epublishable import LWN.ArticlePage @@ -13,6 +13,5 @@ main = do article_html <- readFile "test/fixtures/501317-article.html" ioap <- parse $ parseHtml article_html let article_page :: ArticlePage = fromJust $ ioap - (TOD t _) <- getClockTime - epublish article_page "out.epub" t + epublish article_page "out.epub" putStrLn "Done." -- 2.44.2