module LWN.Page
where
+import Control.Concurrent.ParallelIO (parallel)
import qualified Data.Map as Map (lookup)
import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
when)
import Text.HandsomeSoup (css, parseHtml)
-import Configuration (Cfg)
+import Configuration (Cfg, full_stories)
import LWN.Article
import LWN.HTTP (
ImageMap,
-- These come *before* preprocessing.
download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
download_full_story_urls cfg story_urls = do
- pages <- mapM (page_from_url cfg) story_urls
+ pages <- parallel $ map (page_from_url cfg) story_urls
let pairs = zip story_urls pages
return $ foldl my_insert empty_map pairs
where
parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
parse cfg xml = do
story_map <- download_full_stories cfg xml
- let fs_xml = xml >>> insert_full_stories story_map
+ let fs_xml = if (full_stories cfg) then
+ xml >>> insert_full_stories story_map
+ else
+ xml
let clean_xml = fs_xml >>> preprocess
image_map <- download_images clean_xml
parsed_byline <- parse_byline xml
parsed_body <- ap_parse_body xml
- putStrLn $ fromJust parsed_headline
-
if (isNothing parsed_headline) || (isNothing parsed_body)
then return []
else do
[] -> []
-- Put the separator back, it was lost during the split.
let split_articles = map (article_separator ++) split_articles'
- --_ <- mapM print_article split_articles
real_articles <- mapM parse_html_article split_articles
let just_articles = catMaybes real_articles
return just_articles