module LWN.Page
where
+import Control.Concurrent.ParallelIO (parallel)
import qualified Data.Map as Map (lookup)
import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
import Data.String.Utils (split, strip)
+import qualified Data.Map as Map (Map, empty, insert)
import Data.Maybe (catMaybes, fromJust, isNothing)
import Prelude hiding (readFile)
import System.IO (Handle, hClose, hFlush)
ArrowXml,
IOSArrow,
XmlTree,
+ ($<),
(>>>),
(/>),
(//>),
changeAttrValue,
+ getAttrValue,
getChildren,
getText,
hasName,
processAttrl,
processTopDown,
+ this,
runX,
xshow,
when)
import Text.HandsomeSoup (css, parseHtml)
-import Configuration (Cfg)
+import Configuration (Cfg, full_stories)
import LWN.Article
import LWN.HTTP (
ImageMap,
import LWN.URI (URL)
import LWN.XHTML (
XHTML,
+ full_story_urls,
image_srcs,
+ full_story_link,
+ full_story_paragraph,
is_image,
preprocess,
remove_byline,
remove_title,
to_xhtml,
+ to_xml,
xml_from_contents)
page_from_url cfg url = do
contents <- get_article_contents cfg url
case (xml_from_contents contents) of
- Just html -> parse html
+ Just html -> parse cfg html
Nothing -> return Nothing
--- Should be called *after* preprocessing.
-download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
-download_images xml = do
- image_urls <- runX $ xml >>> image_srcs
- download_image_urls image_urls
-
-
-
+insert_full_stories :: (ArrowXml a) => StoryMap -> a XmlTree XmlTree
+insert_full_stories story_map =
+ processTopDown (article_xml `when` full_story_paragraph)
+ where
+ lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
+ lookup_func href =
+ case Map.lookup href story_map of
+ -- Leave it alone if we don't have the full story.
+ Nothing -> this
+ Just v -> to_xml v
+
+ article_xml :: (ArrowXml a) => a XmlTree XmlTree
+ article_xml =
+ lookup_func
+ $<
+ (this /> full_story_link >>> getAttrValue "href")
+
replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
replace_remote_img_srcs image_map =
processTopDown (make_srcs_local `when` is_image)
processAttrl $ (change_src `when` (hasName "src"))
-parse :: IOSArrow XmlTree XmlTree -> IO (Maybe Page)
-parse xml = do
- let clean_xml = xml >>> preprocess
+
+
+-- Should be called *after* preprocessing.
+download_images :: IOSArrow XmlTree XmlTree -> IO ImageMap
+download_images xml = do
+ image_urls <- runX $ xml >>> image_srcs
+ download_image_urls image_urls
+
+
+
+type StoryMap = Map.Map URL Article
+
+-- These come *before* preprocessing.
+download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
+download_full_story_urls cfg story_urls = do
+ pages <- parallel $ map (page_from_url cfg) story_urls
+ let pairs = zip story_urls pages
+ return $ foldl my_insert empty_map pairs
+ where
+ empty_map = Map.empty :: StoryMap
+
+ my_insert :: StoryMap -> (URL, Maybe Page) -> StoryMap
+ my_insert dict (k, Just (ArticlePage v)) = Map.insert k v dict
+ my_insert dict (_, _) = dict
+
+
+download_full_stories :: Cfg -> IOSArrow XmlTree XmlTree -> IO StoryMap
+download_full_stories cfg xml = do
+ story_urls <- runX $ xml >>> full_story_urls
+ download_full_story_urls cfg story_urls
+
+
+parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
+parse cfg xml = do
+ story_map <- download_full_stories cfg xml
+ let fs_xml = if (full_stories cfg) then
+ xml >>> insert_full_stories story_map
+ else
+ xml
+
+ let clean_xml = fs_xml >>> preprocess
image_map <- download_images clean_xml
let local_xml = clean_xml >>> replace_remote_img_srcs image_map
+
appr <- ap_parse local_xml
fppr <- fp_parse local_xml
return $
parsed_byline <- parse_byline xml
parsed_body <- ap_parse_body xml
- putStrLn $ fromJust parsed_headline
-
if (isNothing parsed_headline) || (isNothing parsed_body)
then return []
else do
[] -> []
-- Put the separator back, it was lost during the split.
let split_articles = map (article_separator ++) split_articles'
- --_ <- mapM print_article split_articles
real_articles <- mapM parse_html_article split_articles
let just_articles = catMaybes real_articles
return just_articles
expected_xml'' = parseHtml expected_html
+test_full_story_urls_parsed :: Assertion
+test_full_story_urls_parsed = do
+ actual <- runX $ actual'
+
+ assertEqual
+ "Full Story URLs are parsed"
+ expected
+ actual
+ where
+ expected = ["/Articles/500738/", "/Articles/501837/"]
+
+ full_story_html =
+ concat ["<p>",
+ "<a href=\"/Articles/500738/\">Full Story</a> ",
+ "(<a href=\"/Articles/500738/#Comments\">comments: 49</a>)",
+ "<p>",
+ "<a href=\"/Articles/501837/\">Full Story</a> ",
+ "(<a href=\"/Articles/501837/#Comments\">comments: none</a>)",
+ "<p>"]
+
+ xml = parseHtml full_story_html
+ actual' = xml >>> full_story_urls
page_tests :: Test
page_tests =
testGroup "Page Tests" [
testCase "Links are replaced with spans" test_preprocess_links,
testCase "Image srcs are made absolute" test_absolve_images,
- testCase "Comment links are removed" test_comments_removed ]
+ testCase "Comment links are removed" test_comments_removed,
+ testCase "Full Story URLs are parsed" test_full_story_urls_parsed ]