Add a first draft using HXT, HandsomeSoup, and the Haskell epub library.

author Michael Orlitzky <michael@orlitzky.com>

Sat, 23 Jun 2012 05:01:02 +0000 (01:01 -0400)

committer Michael Orlitzky <michael@orlitzky.com>

Sat, 23 Jun 2012 05:01:02 +0000 (01:01 -0400)
author Michael Orlitzky <michael@orlitzky.com>
Sat, 23 Jun 2012 05:01:02 +0000 (01:01 -0400)
committer Michael Orlitzky <michael@orlitzky.com>
Sat, 23 Jun 2012 05:01:02 +0000 (01:01 -0400)
diff --git a/lwn-epub.cabal b/lwn-epub.cabal

index 37422447b0c4a5875ff92110f159bded2eb10dcf..8afbe2c752cc40d6fcf1f192b2d9e70677f37e28 100644 (file)
--- a/lwn-epub.cabal
+++ b/lwn-epub.cabal
@@ -9,8 +9,15 @@ build-type:     Simple
  
  executable lwn-epub
    build-depends:
-    base                        == 4.5.*,
-    tagsoup                     == 0.12.*
+    base                    == 4.5.*,
+    bytestring              == 0.9.*,
+    directory               == 1.1.*,
+    epub                    == 0.0.6,
+    filepath                == 1.3.*,
+    HandsomeSoup            == 0.3.*,
+    hxt                     == 9.*,
+    MissingH                == 1.1.*,
+    old-time
  
    main-is:
      Main.hs
diff --git a/src/Epublishable.hs b/src/Epublishable.hs

new file mode 100644 (file)

index 0000000..01d27f9
--- /dev/null
+++ b/src/Epublishable.hs
@@ -0,0 +1,37 @@
+module Epublishable
+where
+
+import Codec.EBook
+import qualified Data.ByteString.Lazy as B (writeFile)
+import Data.List (foldl')
+import Data.Tree.NTree.TypeDefs (NTree)
+import System.FilePath (normalise)
+import Text.XML.HXT.Core (IOSArrow, XNode, XmlTree)
+
+import Misc (string_to_bytestring)
+import XHTML
+
+
+class (XHTML a) => Epublishable a where
+  parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe a)
+
+  title :: a -> String
+
+  epublish :: a -> FilePath -> Integer -> IO ()
+  epublish obj path time = do
+    let book_name = title obj
+    let book =
+          emptyBook { 
+            bookID = "http://lwn.net/" ++ book_name,
+            bookAuthor = "LWN <http://lwn.net> Copyright Eklektix, Inc.",
+            bookTitle = book_name
+          }
+    let xhtml = to_xhtml obj
+    bs_xhtml <- string_to_bytestring xhtml
+    let iid = "iid-1"
+    let normalized_path = normalise path
+    let metadata = Just (ChapterMetadata book_name)
+    let bi = BookItem iid normalized_path bs_xhtml opsMediatype metadata
+    let bookFull = foldl' addItem2Book book [bi]
+    let outdata = book2Bin bookFull time
+    B.writeFile normalized_path outdata
diff --git a/src/LWN/Article.hs b/src/LWN/Article.hs

new file mode 100644 (file)

index 0000000..2da1735
--- /dev/null
+++ b/src/LWN/Article.hs
@@ -0,0 +1,14 @@
+module LWN.Article
+where
+
+import XHTML
+
+data Article = Article { headline  :: String,
+                         byline    :: String,
+                         body_html :: String }
+
+instance XHTML Article where
+  to_xhtml (Article hl bl b) =
+    "<h1>" ++ hl ++ "</h1>\n\n" ++
+    "<h2>" ++ bl ++ "</h2>\n\n" ++
+    b
diff --git a/src/LWN/ArticlePage.hs b/src/LWN/ArticlePage.hs

new file mode 100644 (file)

index 0000000..0e699a0
--- /dev/null
+++ b/src/LWN/ArticlePage.hs
@@ -0,0 +1,101 @@
+module LWN.ArticlePage
+where
+
+import Data.String.Utils (strip)
+import Data.Maybe (fromJust)
+import Data.Tree.NTree.TypeDefs (NTree)
+import Text.XML.HXT.Core (
+  IOSArrow,
+  XmlTree,
+  XNode,
+  (>>>),
+  (/>),
+  getText,
+  runX,
+  xshow
+  )
+import Text.HandsomeSoup (css)
+
+import Epublishable
+import LWN.Article
+import XHTML
+
+-- | Defines the ArticlePage data type, containing one 'Article'.
+data ArticlePage = ArticlePage { article :: Article }
+
+
+instance XHTML ArticlePage where
+  to_xhtml (ArticlePage a) =
+    "<?xml version=\"1.0\" encoding=\"utf-8\" ?>" ++
+    "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.1//EN\"" ++
+    "\"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd\">" ++
+    "\n<head>\n" ++
+    "  <meta http-equiv=\"Content-Type\"" ++
+    " content=\"application/xhtml+xml; charset=utf-8\" />" ++
+    "  <title>" ++ (headline a) ++ "</title>\n" ++
+    "</head>\n" ++
+    "<body>\n" ++
+    "<div>\n\n" ++
+    (to_xhtml a) ++
+    "\n\n</div>\n" ++
+    "\n</body>\n" ++
+    "</html>"
+
+
+instance Epublishable ArticlePage where
+  parse xml = do
+    articles <- parse_articles xml
+    case articles of
+      Just [x] -> return $ Just $ ArticlePage x
+      _   -> return Nothing
+
+  title (ArticlePage x) = headline x
+
+
+-- | Takes data from an LWN page and determines whether or not it's a
+--   single article (as opposed to a page with multiple articles).
+is_article_page :: String -> IO Bool
+is_article_page _ = return True
+
+
+parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_headline xml = do
+  let element_filter = xml >>> css "div.PageHeadline h1"
+  let element_text_filter = element_filter /> getText
+  element_text <- runX element_text_filter
+  return $ case element_text of
+            [x] -> Just $ strip x
+            []  -> Nothing
+            _   -> error "Found more than one headline."
+
+parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_byline xml = do
+  let element_filter = xml >>> css "div.Byline"
+  let element_text_filter = element_filter /> getText
+  element_text <- runX element_text_filter
+  return $ case element_text of
+            [x] -> Just $ strip x
+            []  -> Nothing
+            _   -> error "Found more than one byline."
+
+
+parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_body xml = do
+  let element_filter = xml >>> css "div.ArticleText"
+  let element_html_filter = xshow element_filter
+  element_html <- runX element_html_filter
+  return $ case element_html of
+            [x] -> Just x
+            []  -> Nothing
+            _   -> error "Found more than one article."
+
+
+parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
+parse_articles xml = do
+  parsed_headline <- parse_headline xml
+  parsed_byline   <- parse_byline xml
+  parsed_body     <- parse_body xml
+  let headline' = fromJust parsed_headline
+  let byline'   = fromJust parsed_byline
+  let body'     = fromJust parsed_body
+  return $ Just $ [Article headline' byline' body']
diff --git a/src/LWN/FullPage.hs b/src/LWN/FullPage.hs

new file mode 100644 (file)

index 0000000..9ff782c
--- /dev/null
+++ b/src/LWN/FullPage.hs
@@ -0,0 +1,23 @@
+module LWN.FullPage
+where
+
+import Epublishable
+import LWN.Article
+import XHTML
+
+-- | An LWN page with more than one article on it. These require
+--   different parsing and display functions than the single-article
+--   pages.
+data FullPage = FullPage { articles :: [Article] }
+
+instance XHTML FullPage where
+  to_xhtml fp = ""
+
+instance Epublishable FullPage where
+  parse xml = do
+    articles <- parse_articles xml
+    case articles of
+      (x:xs)@all -> return $ Just $ FullPage all
+      _          -> return Nothing
+
+  title _ = "LWN.net"
diff --git a/src/Main.hs b/src/Main.hs

index e91654da85a6446d32a02afed636a106dd514a7e..a3728d289004783a09045ab739ea8a30edbd22b7 100644 (file)
--- a/src/Main.hs
+++ b/src/Main.hs
@@ -1,14 +1,18 @@
  module Main
  where
  
-data Article = Article { headline  :: String,
-                         byline    :: String,
-                         body_html :: String }
-             deriving (Eq, Show)
+import Data.Maybe (fromJust)
+import Text.HandsomeSoup (parseHtml)
+import System.Time (ClockTime( TOD ), getClockTime)
  
-parse_article :: String -> String
-parse_article _ = ""
+import Epublishable
+import LWN.ArticlePage
  
  main :: IO ()
  main = do
-  putStrLn "Hello, world."
+  article_html <- readFile "test/fixtures/501317-article.html"
+  ioap <- parse $ parseHtml article_html
+  let article_page :: ArticlePage = fromJust $ ioap
+  (TOD t _) <- getClockTime 
+  epublish article_page "out.epub" t
+  putStrLn "Done."
diff --git a/src/Misc.hs b/src/Misc.hs

new file mode 100644 (file)

index 0000000..bec8917
--- /dev/null
+++ b/src/Misc.hs
@@ -0,0 +1,19 @@
+module Misc
+where
+
+import qualified Data.ByteString.Lazy as B (ByteString, readFile)
+import System.Directory (getTemporaryDirectory, removeFile)
+import System.IO (hClose, hPutStr, hSetEncoding, openTempFile, utf8)
+
+-- | Run a 'String' through the filesystem to convert it to a
+--   'ByteString' in the stupidest way possible.
+string_to_bytestring :: String -> IO B.ByteString
+string_to_bytestring s = do
+  dir <- getTemporaryDirectory
+  (path, h) <- openTempFile dir "nu1Uideehe"
+  hSetEncoding h utf8
+  hPutStr h s
+  hClose h
+  result <- B.readFile path
+  removeFile path
+  return result
diff --git a/src/XHTML.hs b/src/XHTML.hs

new file mode 100644 (file)

index 0000000..6c4421f
--- /dev/null
+++ b/src/XHTML.hs
@@ -0,0 +1,5 @@
+module XHTML
+where
+
+class XHTML a where
+  to_xhtml :: a -> String
author	Michael Orlitzky <michael@orlitzky.com>
	Sat, 23 Jun 2012 05:01:02 +0000 (01:01 -0400)
committer	Michael Orlitzky <michael@orlitzky.com>
	Sat, 23 Jun 2012 05:01:02 +0000 (01:01 -0400)
lwn-epub.cabal		patch \| blob \| history
src/Epublishable.hs	[new file with mode: 0644]	patch \| blob
src/LWN/Article.hs	[new file with mode: 0644]	patch \| blob
src/LWN/ArticlePage.hs	[new file with mode: 0644]	patch \| blob
src/LWN/FullPage.hs	[new file with mode: 0644]	patch \| blob
src/Main.hs		patch \| blob \| history
src/Misc.hs	[new file with mode: 0644]	patch \| blob
src/XHTML.hs	[new file with mode: 0644]	patch \| blob