else
appr
---
--- ArticlePage Stuff
---
-ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
-ap_parse xml = do
- arts <- ap_parse_articles xml
- case arts of
- Just [x] -> return $ Just $ ArticlePage x
- _ -> return Nothing
-ap_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-ap_parse_headline xml = do
+parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_headline xml = do
let element_filter = xml >>> css "div.PageHeadline h1"
let element_text_filter = element_filter /> getText
element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one headline."
+ return $
+ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one headline."
-ap_parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-ap_parse_byline xml = do
- let element_filter = xml >>> css "div.Byline"
+
+parse_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
+parse_byline xml = do
+ let element_filter = xml >>> css "div.FeatureByLine"
let element_text_filter = element_filter /> getText
element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one byline."
+ return $
+ case element_text of
+ [x] -> Just $ strip x
+ [] -> Nothing
+ _ -> error "Found more than one article byline."
+--
+-- ArticlePage Stuff
+--
+ap_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
+ap_parse xml = do
+ arts <- ap_parse_articles xml
+ case arts of
+ Just [x] -> return $ Just $ ArticlePage x
+ _ -> return Nothing
+
+
ap_parse_body :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
ap_parse_body xml = do
let element_filter = xml >>> css "div.ArticleText"
ap_parse_articles :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe [Article])
ap_parse_articles xml = do
- parsed_headline <- ap_parse_headline xml
- parsed_byline <- ap_parse_byline xml
+ parsed_headline <- parse_headline xml
+ parsed_byline <- parse_byline xml
parsed_body <- ap_parse_body xml
let title' = Title (fromJust parsed_headline)
let byline' = Byline parsed_byline
fp_parse :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Page)
fp_parse xml = do
- hl <- fp_parse_headline xml
+ hl <- parse_headline xml
parsed_articles <- fp_parse_articles xml
case parsed_articles of
them@(_:_) -> return $ Just $ FullPage (fromJust hl) them
-
-fp_parse_headline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-fp_parse_headline xml = do
- let element_filter = xml >>> css "div.PageHeadline h1"
- let element_text_filter = element_filter /> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one headline."
-
-fp_parse_article_byline :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
-fp_parse_article_byline xml = do
- let element_filter = xml >>> css "div.FeatureByLine"
- let element_text_filter = element_filter /> getText
- element_text <- runX element_text_filter
- return $ case element_text of
- [x] -> Just $ strip x
- [] -> Nothing
- _ -> error "Found more than one article byline."
-
-
fp_parse_article_title :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe String)
fp_parse_article_title xml = do
let element_filter = xml >>> css "h2.SummaryHL"
fp_parse_article :: IOSArrow XmlTree (NTree XNode) -> IO (Maybe Article)
fp_parse_article xml = do
parsed_article_title <- fp_parse_article_title xml
- parsed_article_byline <- fp_parse_article_byline xml
+ parsed_article_byline <- parse_byline xml
parsed_article_body <- fp_parse_article_body xml
let title' = Title $ fromJust parsed_article_title
let byline' = Byline parsed_article_byline
-- | In the full page, all of the article titles and bodies are
--- wrapped in a div.ArticleText.
+-- wrapped in one big div.ArticleText.
parse_bodies :: IOSArrow XmlTree (NTree XNode) -> IOSArrow XmlTree (NTree XNode)
parse_bodies xml =
xml >>> css "div.ArticleText"
return just_articles
+-- | This makes it easy to select otherwise-random chunks of html
+-- using 'css'.
wrap_in_body_div :: String -> String
wrap_in_body_div s =
"<body><div class=\"lwn-article\">" ++ s ++ "</div></body>"
+
+--
+-- Tests
+--
+
test_preprocess_links :: Assertion
test_preprocess_links = do
actual_xml' <- runX $ (preprocess input_xml) >>> css "body"