module LWN.Page
where
+import Control.Concurrent.ParallelIO (parallel)
import qualified Data.Map as Map (lookup)
import Data.Time (getCurrentTime)
import qualified Data.ByteString.Lazy as B (ByteString, hPut)
getChildren,
getText,
hasName,
+ none,
processAttrl,
processTopDown,
this,
when)
import Text.HandsomeSoup (css, parseHtml)
-import Configuration (Cfg)
+import Configuration (Cfg, full_stories)
import LWN.Article
import LWN.HTTP (
ImageMap,
is_image,
preprocess,
remove_byline,
+ remove_full_story_paragraphs,
remove_title,
to_xhtml,
to_xml,
lookup_func :: (ArrowXml a) => URL -> a XmlTree XmlTree
lookup_func href =
case Map.lookup href story_map of
- -- Leave it alone if we don't have the full story.
- Nothing -> this
+ -- Drop the paragraph if we don't have the contents.
+ Nothing -> none
Just v -> to_xml v
article_xml :: (ArrowXml a) => a XmlTree XmlTree
article_xml =
lookup_func
- $<
+ $< -- From HXT's Control.Arrow.ArrowList
(this /> full_story_link >>> getAttrValue "href")
replace_remote_img_srcs :: (ArrowXml a) => ImageMap -> a XmlTree XmlTree
-- These come *before* preprocessing.
download_full_story_urls :: Cfg -> [URL] -> IO StoryMap
download_full_story_urls cfg story_urls = do
- pages <- mapM (page_from_url cfg) story_urls
+ pages <- parallel $ map (page_from_url cfg) story_urls
let pairs = zip story_urls pages
return $ foldl my_insert empty_map pairs
where
parse :: Cfg -> IOSArrow XmlTree XmlTree -> IO (Maybe Page)
parse cfg xml = do
- story_map <- download_full_stories cfg xml
- let fs_xml = xml >>> insert_full_stories story_map
+ fs_xml <- if (full_stories cfg) then do
+ story_map <- download_full_stories cfg xml
+ return $ xml >>> insert_full_stories story_map
+ else do
+ -- Get rid of them if we don't want them.
+ return $ xml >>> remove_full_story_paragraphs
let clean_xml = fs_xml >>> preprocess
image_map <- download_images clean_xml
parsed_byline <- parse_byline xml
parsed_body <- ap_parse_body xml
- putStrLn $ fromJust parsed_headline
-
if (isNothing parsed_headline) || (isNothing parsed_body)
then return []
else do
[] -> []
-- Put the separator back, it was lost during the split.
let split_articles = map (article_separator ++) split_articles'
- --_ <- mapM print_article split_articles
real_articles <- mapM parse_html_article split_articles
let just_articles = catMaybes real_articles
return just_articles