X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2FLWN%2FPage.hs;h=30c54b550a530777eb9ccfd818b16785c46cc0ef;hb=da4d3ce666bd8fc7a0011ecf495714bb55f0127d;hp=f0ada7cd13cfff0707904272f93fa2b4b354f9e3;hpb=fc0052e451aa03675ebd9a128dfa46573b9357d7;p=dead%2Flwn-epub.git diff --git a/src/LWN/Page.hs b/src/LWN/Page.hs index f0ada7c..30c54b5 100644 --- a/src/LWN/Page.hs +++ b/src/LWN/Page.hs @@ -3,6 +3,7 @@ module LWN.Page where +import Control.Concurrent.ParallelIO (parallel) import qualified Data.Map as Map (lookup) import Data.Time (getCurrentTime) import qualified Data.ByteString.Lazy as B (ByteString, hPut) @@ -33,6 +34,7 @@ import Text.XML.HXT.Core ( getChildren, getText, hasName, + none, processAttrl, processTopDown, this, @@ -41,7 +43,7 @@ import Text.XML.HXT.Core ( when) import Text.HandsomeSoup (css, parseHtml) -import Configuration (Cfg) +import Configuration (Cfg, full_stories) import LWN.Article import LWN.HTTP ( ImageMap, @@ -57,6 +59,7 @@ import LWN.XHTML ( is_image, preprocess, remove_byline, + remove_full_story_paragraphs, remove_title, to_xhtml, to_xml, @@ -126,14 +129,14 @@ insert_full_stories story_map = lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree lookup_func href = case Map.lookup href story_map of - -- Leave it alone if we don't have the full story. - Nothing -> this + -- Drop the paragraph if we don't have the contents. + Nothing -> none Just v -> to_xml v article_xml :: (ArrowXml a) => a XmlTree XmlTree article_xml = lookup_func - $< + $< -- From HXT's Control.Arrow.ArrowList (this /> full_story_link >>> getAttrValue "href") replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree @@ -172,7 +175,7 @@ type StoryMap = Map.Map URL Article -- These come *before* preprocessing. download_full_story_urls :: Cfg -> [URL] -> IO StoryMap download_full_story_urls cfg story_urls = do - pages <- mapM (page_from_url cfg) story_urls + pages <- parallel $ map (page_from_url cfg) story_urls let pairs = zip story_urls pages return $ foldl my_insert empty_map pairs where @@ -191,8 +194,12 @@ download_full_stories cfg xml = do parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page) parse cfg xml = do - story_map <- download_full_stories cfg xml - let fs_xml = xml >>> insert_full_stories story_map + fs_xml <- if (full_stories cfg) then do + story_map <- download_full_stories cfg xml + return $ xml >>> insert_full_stories story_map + else do + -- Get rid of them if we don't want them. + return $ xml >>> remove_full_story_paragraphs let clean_xml = fs_xml >>> preprocess image_map <- download_images clean_xml @@ -260,8 +267,6 @@ ap_parse_articles xml = do parsed_byline <- parse_byline xml parsed_body <- ap_parse_body xml - putStrLn $ fromJust parsed_headline - if (isNothing parsed_headline) || (isNothing parsed_body) then return [] else do @@ -352,7 +357,6 @@ fp_parse_articles xml = do [] -> [] -- Put the separator back, it was lost during the split. let split_articles = map (article_separator ++) split_articles' - --_ <- mapM print_article split_articles real_articles <- mapM parse_html_article split_articles let just_articles = catMaybes real_articles return just_articles