executable lwn-epub
build-depends:
- base == 4.5.*,
- tagsoup == 0.12.*
+ base == 4.5.*,
+ bytestring == 0.9.*,
+ directory == 1.1.*,
+ epub == 0.0.6,
+ filepath == 1.3.*,
+ HandsomeSoup == 0.3.*,
+ hxt == 9.*,
+ MissingH == 1.1.*,
+ old-time
main-is:
Main.hs
--- /dev/null
+module Epublishable
+where
+
+import Codec.EBook
+import qualified Data.ByteString.Lazy as B (writeFile)
+import Data.List (foldl')
+import Data.Tree.NTree.TypeDefs (NTree)
+import System.FilePath (normalise)
+import Text.XML.HXT.Core (IOSArrow, XNode, XmlTree)
+
+import Misc (string_to_bytestring)
+import XHTML
+
+
+class (XHTML a) => Epublishable a where
+ parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe a)
+
+ title :: a -> String
+
+ epublish :: a -> FilePath -> Integer -> IO ()
+ epublish obj path time = do
+ let book_name = title obj
+ let book =
+ emptyBook {
+ bookID = "http://lwn.net/" ++ book_name,
+ bookAuthor = "LWN <http://lwn.net> Copyright Eklektix, Inc.",
+ bookTitle = book_name
+ }
+ let xhtml = to_xhtml obj
+ bs_xhtml <- string_to_bytestring xhtml
+ let iid = "iid-1"
+ let normalized_path = normalise path
+ let metadata = Just (ChapterMetadata book_name)
+ let bi = BookItem iid normalized_path bs_xhtml opsMediatype metadata
+ let bookFull = foldl' addItem2Book book [bi]
+ let outdata = book2Bin bookFull time
+ B.writeFile normalized_path outdata
--- /dev/null
+module LWN.Article
+where
+
+import XHTML
+
+data Article = Article { headline :: String,
+ byline :: String,
+ body_html :: String }
+
+instance XHTML Article where
+ to_xhtml (Article hl bl b) =
+ "<h1>" ++ hl ++ "</h1>\n\n" ++
+ "<h2>" ++ bl ++ "</h2>\n\n" ++
+ b
--- /dev/null
+module LWN.ArticlePage
+where
+
+import Data.String.Utils (strip)
+import Data.Maybe (fromJust)
+import Data.Tree.NTree.TypeDefs (NTree)
+import Text.XML.HXT.Core (
+ IOSArrow,
+ XmlTree,
+ XNode,
+ (>>>),
+ (/>),
+ getText,
+ runX,
+ xshow
+ )
+import Text.HandsomeSoup (css)
+
+import Epublishable
+import LWN.Article
+import XHTML
+
+-- | Defines the ArticlePage data type, containing one 'Article'.
+data ArticlePage = ArticlePage { article :: Article }
+
+
+instance XHTML ArticlePage where
+ to_xhtml (ArticlePage a) =
+ "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
+ "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
+ "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
+ "\n<head>\n" ++
+ " <meta http-equiv=\"Content-Type\"" ++
+ " content=\"application/xhtml+xml; charset=utf-8\" />" ++
+ " <title>" ++ (headline a) ++ "</title>\n" ++
+ "</head>\n" ++
+ "<body>\n" ++
+ "<div>\n\n" ++
+ (to_xhtml a) ++
+ "\n\n</div>\n" ++
+ "\n</body>\n" ++
+ "</html>"
+
+
+instance Epublishable ArticlePage where
+ parse xml = do
+ articles <- parse_articles xml
+ case articles of
+ Just [x] -> return $ Just $ ArticlePage x
+ _ -> return Nothing
+
+ title (ArticlePage x) = headline x
+
+
+-- | Takes data from an LWN page and determines whether or not it's a
+-- single article (as opposed to a page with multiple articles).
+is_article_page :: String -> IO Bool
+is_article_page _ = return True
+
+
+parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_headline xml = do
+ let element_filter = xml >>> css "div.PageHeadline h1"
+ let element_text_filter = element_filter /> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one headline."
+
+parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_byline xml = do
+ let element_filter = xml >>> css "div.Byline"
+ let element_text_filter = element_filter /> getText
+ element_text <- runX element_text_filter
+ return $ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one byline."
+
+
+parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_body xml = do
+ let element_filter = xml >>> css "div.ArticleText"
+ let element_html_filter = xshow element_filter
+ element_html <- runX element_html_filter
+ return $ case element_html of
+ [x] -> Just x
+ [] -> Nothing
+ _ -> error "Found more than one article."
+
+
+parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
+parse_articles xml = do
+ parsed_headline <- parse_headline xml
+ parsed_byline <- parse_byline xml
+ parsed_body <- parse_body xml
+ let headline' = fromJust parsed_headline
+ let byline' = fromJust parsed_byline
+ let body' = fromJust parsed_body
+ return $ Just $ [Article headline' byline' body']
--- /dev/null
+module LWN.FullPage
+where
+
+import Epublishable
+import LWN.Article
+import XHTML
+
+-- | An LWN page with more than one article on it. These require
+-- different parsing and display functions than the single-article
+-- pages.
+data FullPage = FullPage { articles :: [Article] }
+
+instance XHTML FullPage where
+ to_xhtml fp = ""
+
+instance Epublishable FullPage where
+ parse xml = do
+ articles <- parse_articles xml
+ case articles of
+ (x:xs)@all -> return $ Just $ FullPage all
+ _ -> return Nothing
+
+ title _ = "LWN.net"
module Main
where
-data Article = Article { headline :: String,
- byline :: String,
- body_html :: String }
- deriving (Eq, Show)
+import Data.Maybe (fromJust)
+import Text.HandsomeSoup (parseHtml)
+import System.Time (ClockTime( TOD ), getClockTime)
-parse_article :: String -> String
-parse_article _ = ""
+import Epublishable
+import LWN.ArticlePage
main :: IO ()
main = do
- putStrLn "Hello, world."
+ article_html <- readFile "test/fixtures/501317-article.html"
+ ioap <- parse $ parseHtml article_html
+ let article_page :: ArticlePage = fromJust $ ioap
+ (TOD t _) <- getClockTime
+ epublish article_page "out.epub" t
+ putStrLn "Done."
--- /dev/null
+module Misc
+where
+
+import qualified Data.ByteString.Lazy as B (ByteString, readFile)
+import System.Directory (getTemporaryDirectory, removeFile)
+import System.IO (hClose, hPutStr, hSetEncoding, openTempFile, utf8)
+
+-- | Run a 'String' through the filesystem to convert it to a
+-- 'ByteString' in the stupidest way possible.
+string_to_bytestring :: String -> IO B.ByteString
+string_to_bytestring s = do
+ dir <- getTemporaryDirectory
+ (path, h) <- openTempFile dir "nu1Uideehe"
+ hSetEncoding h utf8
+ hPutStr h s
+ hClose h
+ result <- B.readFile path
+ removeFile path
+ return result
--- /dev/null
+module XHTML
+where
+
+class XHTML a where
+ to_xhtml :: a -> String