From b591d5f5c3c47d253c24144beae1edf8648cd94b Mon Sep 17 00:00:00 2001 From: Michael Orlitzky Date: Sat, 23 Jun 2012 01:01:02 -0400 Subject: [PATCH] Add a first draft using HXT, HandsomeSoup, and the Haskell epub library. --- lwn-epub.cabal | 11 ++++- src/Epublishable.hs | 37 +++++++++++++++ src/LWN/Article.hs | 14 ++++++ src/LWN/ArticlePage.hs | 101 +++++++++++++++++++++++++++++++++++++++++ src/LWN/FullPage.hs | 23 ++++++++++ src/Main.hs | 18 +++++--- src/Misc.hs | 19 ++++++++ src/XHTML.hs | 5 ++ 8 files changed, 219 insertions(+), 9 deletions(-) create mode 100644 src/Epublishable.hs create mode 100644 src/LWN/Article.hs create mode 100644 src/LWN/ArticlePage.hs create mode 100644 src/LWN/FullPage.hs create mode 100644 src/Misc.hs create mode 100644 src/XHTML.hs diff --git a/lwn-epub.cabal b/lwn-epub.cabal index 3742244..8afbe2c 100644 --- a/lwn-epub.cabal +++ b/lwn-epub.cabal @@ -9,8 +9,15 @@ build-type: Simple executable lwn-epub build-depends: - base == 4.5.*, - tagsoup == 0.12.* + base == 4.5.*, + bytestring == 0.9.*, + directory == 1.1.*, + epub == 0.0.6, + filepath == 1.3.*, + HandsomeSoup == 0.3.*, + hxt == 9.*, + MissingH == 1.1.*, + old-time main-is: Main.hs diff --git a/src/Epublishable.hs b/src/Epublishable.hs new file mode 100644 index 0000000..01d27f9 --- /dev/null +++ b/src/Epublishable.hs @@ -0,0 +1,37 @@ +module Epublishable +where + +import Codec.EBook +import qualified Data.ByteString.Lazy as B (writeFile) +import Data.List (foldl') +import Data.Tree.NTree.TypeDefs (NTree) +import System.FilePath (normalise) +import Text.XML.HXT.Core (IOSArrow, XNode, XmlTree) + +import Misc (string_to_bytestring) +import XHTML + + +class (XHTML a) => Epublishable a where + parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe a) + + title :: a -> String + + epublish :: a -> FilePath -> Integer -> IO () + epublish obj path time = do + let book_name = title obj + let book = + emptyBook { + bookID = "http://lwn.net/" ++ book_name, + bookAuthor = "LWN Copyright Eklektix, Inc.", + bookTitle = book_name + } + let xhtml = to_xhtml obj + bs_xhtml <- string_to_bytestring xhtml + let iid = "iid-1" + let normalized_path = normalise path + let metadata = Just (ChapterMetadata book_name) + let bi = BookItem iid normalized_path bs_xhtml opsMediatype metadata + let bookFull = foldl' addItem2Book book [bi] + let outdata = book2Bin bookFull time + B.writeFile normalized_path outdata diff --git a/src/LWN/Article.hs b/src/LWN/Article.hs new file mode 100644 index 0000000..2da1735 --- /dev/null +++ b/src/LWN/Article.hs @@ -0,0 +1,14 @@ +module LWN.Article +where + +import XHTML + +data Article = Article { headline :: String, + byline :: String, + body_html :: String } + +instance XHTML Article where + to_xhtml (Article hl bl b) = + "

" ++ hl ++ "

\n\n" ++ + "

" ++ bl ++ "

\n\n" ++ + b diff --git a/src/LWN/ArticlePage.hs b/src/LWN/ArticlePage.hs new file mode 100644 index 0000000..0e699a0 --- /dev/null +++ b/src/LWN/ArticlePage.hs @@ -0,0 +1,101 @@ +module LWN.ArticlePage +where + +import Data.String.Utils (strip) +import Data.Maybe (fromJust) +import Data.Tree.NTree.TypeDefs (NTree) +import Text.XML.HXT.Core ( + IOSArrow, + XmlTree, + XNode, + (>>>), + (/>), + getText, + runX, + xshow + ) +import Text.HandsomeSoup (css) + +import Epublishable +import LWN.Article +import XHTML + +-- | Defines the ArticlePage data type, containing one 'Article'. +data ArticlePage = ArticlePage { article :: Article } + + +instance XHTML ArticlePage where + to_xhtml (ArticlePage a) = + "" ++ + "" ++ + "\n\n" ++ + " " ++ + " " ++ (headline a) ++ "\n" ++ + "\n" ++ + "\n" ++ + "

\n\n" ++ + (to_xhtml a) ++ + "\n\n

\n" ++ + "\n\n" ++ + "" + + +instance Epublishable ArticlePage where + parse xml = do + articles <- parse_articles xml + case articles of + Just [x] -> return $ Just $ ArticlePage x + _ -> return Nothing + + title (ArticlePage x) = headline x + + +-- | Takes data from an LWN page and determines whether or not it's a +-- single article (as opposed to a page with multiple articles). +is_article_page :: String -> IO Bool +is_article_page _ = return True + + +parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_headline xml = do + let element_filter = xml >>> css "div.PageHeadline h1" + let element_text_filter = element_filter /> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one headline." + +parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_byline xml = do + let element_filter = xml >>> css "div.Byline" + let element_text_filter = element_filter /> getText + element_text <- runX element_text_filter + return $ case element_text of + [x] -> Just $ strip x + [] -> Nothing + _ -> error "Found more than one byline." + + +parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String) +parse_body xml = do + let element_filter = xml >>> css "div.ArticleText" + let element_html_filter = xshow element_filter + element_html <- runX element_html_filter + return $ case element_html of + [x] -> Just x + [] -> Nothing + _ -> error "Found more than one article." + + +parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article]) +parse_articles xml = do + parsed_headline <- parse_headline xml + parsed_byline <- parse_byline xml + parsed_body <- parse_body xml + let headline' = fromJust parsed_headline + let byline' = fromJust parsed_byline + let body' = fromJust parsed_body + return $ Just $ [Article headline' byline' body'] diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs new file mode 100644 index 0000000..9ff782c --- /dev/null +++ b/src/LWN/FullPage.hs @@ -0,0 +1,23 @@ +module LWN.FullPage +where + +import Epublishable +import LWN.Article +import XHTML + +-- | An LWN page with more than one article on it. These require +-- different parsing and display functions than the single-article +-- pages. +data FullPage = FullPage { articles :: [Article] } + +instance XHTML FullPage where + to_xhtml fp = "" + +instance Epublishable FullPage where + parse xml = do + articles <- parse_articles xml + case articles of + (x:xs)@all -> return $ Just $ FullPage all + _ -> return Nothing + + title _ = "LWN.net" diff --git a/src/Main.hs b/src/Main.hs index e91654d..a3728d2 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -1,14 +1,18 @@ module Main where -data Article = Article { headline :: String, - byline :: String, - body_html :: String } - deriving (Eq, Show) +import Data.Maybe (fromJust) +import Text.HandsomeSoup (parseHtml) +import System.Time (ClockTime( TOD ), getClockTime) -parse_article :: String -> String -parse_article _ = "" +import Epublishable +import LWN.ArticlePage main :: IO () main = do - putStrLn "Hello, world." + article_html <- readFile "test/fixtures/501317-article.html" + ioap <- parse $ parseHtml article_html + let article_page :: ArticlePage = fromJust $ ioap + (TOD t _) <- getClockTime + epublish article_page "out.epub" t + putStrLn "Done." diff --git a/src/Misc.hs b/src/Misc.hs new file mode 100644 index 0000000..bec8917 --- /dev/null +++ b/src/Misc.hs @@ -0,0 +1,19 @@ +module Misc +where + +import qualified Data.ByteString.Lazy as B (ByteString, readFile) +import System.Directory (getTemporaryDirectory, removeFile) +import System.IO (hClose, hPutStr, hSetEncoding, openTempFile, utf8) + +-- | Run a 'String' through the filesystem to convert it to a +-- 'ByteString' in the stupidest way possible. +string_to_bytestring :: String -> IO B.ByteString +string_to_bytestring s = do + dir <- getTemporaryDirectory + (path, h) <- openTempFile dir "nu1Uideehe" + hSetEncoding h utf8 + hPutStr h s + hClose h + result <- B.readFile path + removeFile path + return result diff --git a/src/XHTML.hs b/src/XHTML.hs new file mode 100644 index 0000000..6c4421f --- /dev/null +++ b/src/XHTML.hs @@ -0,0 +1,5 @@ +module XHTML +where + +class XHTML a where + to_xhtml :: a -> String -- 2.43.2