X-Git-Url: http://gitweb.michael.orlitzky.com/?a=blobdiff_plain;f=src%2FTSN%2FXML%2FNews.hs;h=513bf5c5764a1839c0140fff8344dbad08af28fe;hb=b6ba5347fa75f33f63b2eb358d5ab421392dd52e;hp=50584be229e89389377e49bbbcb24e19b99a0e29;hpb=02a9363651a96a2469567cb1fb303a9a3cf33dce;p=dead%2Fhtsn-import.git diff --git a/src/TSN/XML/News.hs b/src/TSN/XML/News.hs index 50584be..513bf5c 100644 --- a/src/TSN/XML/News.hs +++ b/src/TSN/XML/News.hs @@ -11,6 +11,7 @@ -- module TSN.XML.News ( dtd, + has_only_single_sms, pickle_message, -- * Tests news_tests, @@ -45,6 +46,16 @@ import Test.Tasty ( TestTree, testGroup ) import Test.Tasty.HUnit ( (@?=), testCase ) import Text.XML.HXT.Core ( PU, + XmlTree, + (/>), + (>>>), + addNav, + descendantAxis, + filterAxis, + followingSiblingAxis, + hasName, + remNav, + runLA, xp13Tuple, xpAttr, xpElem, @@ -53,7 +64,6 @@ import Text.XML.HXT.Core ( xpOption, xpPair, xpText, - xpTriple, xpWrap ) -- Local imports. @@ -63,13 +73,14 @@ import TSN.Codegen ( import TSN.Database ( insert_or_select ) import TSN.DbImport ( DbImport(..), ImportResult(..), run_dbmigrate ) import TSN.Picklers ( xp_time_stamp ) -import TSN.Location ( Location(..) ) +import TSN.Location ( Location(..), pickle_location ) import TSN.XmlImport ( XmlImport(..) ) import Xml ( FromXml(..), ToDb(..), pickle_unpickle, unpickleable, + unsafe_read_document, unsafe_unpickle ) @@ -108,7 +119,7 @@ data Message = xml_sport :: String, xml_url :: Maybe String, xml_teams :: [NewsTeam], - xml_locations :: [NewsLocationXml], + xml_locations :: [Location], xml_sms :: String, xml_editor :: Maybe String, xml_text :: Maybe String, -- Text and continue seem to show up in pairs, @@ -190,38 +201,6 @@ data News_NewsTeam = News_NewsTeam (DefaultKey NewsTeam) --- * NewsLocationXml - --- | The XML type for locations as they show up in the news. The --- associated database type comes from "TSN.Location". --- -data NewsLocationXml = - NewsLocationXml { - xml_city :: Maybe String, - xml_state :: Maybe String, - xml_country :: String } - deriving (Eq, Show) - - -instance ToDb NewsLocationXml where - -- | The database analogue of a NewsLocationXml is a Location. - type Db NewsLocationXml = Location - - -instance FromXml NewsLocationXml where - -- | To convert from the XML representation to the database one, we - -- don't have to do anything. Just copy the fields. - -- - from_xml NewsLocationXml{..} = - Location xml_city xml_state xml_country - - --- | Allow us to import the XML representation directly into the --- database, without having to perform the conversion manually first. --- -instance XmlImport NewsLocationXml - - -- * News_Location -- | Mapping between 'News' records and 'Location' records in the @@ -235,8 +214,37 @@ data News_Location = News_Location + +-- | Some newsxml documents contain two \ elements in a row, +-- violating the DTD. The second one has always been empty, but it's +-- irrelevant: we can't parse these, and would like to detect them +-- in order to report the fact that the busted document is +-- unsupported. -- --- Database code +-- This function detects whether two \ elements appear in a +-- row, as siblings. +-- +has_only_single_sms :: XmlTree -> Bool +has_only_single_sms xmltree = + case elements of + [] -> True + _ -> False + where + parse :: XmlTree -> [XmlTree] + parse = runLA $ hasName "/" + /> hasName "message" + >>> addNav + >>> descendantAxis + >>> filterAxis (hasName "SMS") + >>> followingSiblingAxis + >>> remNav + >>> hasName "SMS" + + elements = parse xmltree + + +-- +-- * Database code -- -- | Define 'dbmigrate' and 'dbimport' for 'Message's. The import is @@ -267,7 +275,7 @@ instance DbImport Message where mapM_ insert_ news_news_teams -- Do all of that over again for the Locations. - loc_ids <- mapM insert_xml_or_select (xml_locations message) + loc_ids <- mapM insert_or_select (xml_locations message) let news_news_locations = map (News_Location news_id) loc_ids mapM_ insert_ news_news_locations @@ -283,7 +291,7 @@ mkPersist defaultCodegenConfig [groundhog| constructors: - name: NewsTeam uniques: - - name: unique_news_team + - name: unique_news_teams type: constraint fields: [team_name] @@ -374,20 +382,6 @@ pickle_msg_id = to_tuple m = (db_msg_id m, db_event_id m) --- | Convert a 'NewsLocationXml' to/from XML. --- -pickle_location :: PU NewsLocationXml -pickle_location = - xpElem "location" $ - xpWrap (from_tuple, to_tuple) $ - xpTriple (xpOption (xpElem "city" xpText)) - (xpOption (xpElem "state" xpText)) - (xpElem "country" xpText) - where - from_tuple = - uncurryN NewsLocationXml - to_tuple l = (xml_city l, xml_state l, xml_country l) - -- | Convert a 'Message' to/from XML. -- @@ -454,7 +448,8 @@ news_tests = [ test_news_fields_have_correct_names, test_on_delete_cascade, test_pickle_of_unpickle_is_identity, - test_unpickle_succeeds ] + test_unpickle_succeeds, + test_sms_detected_correctly ] -- | Make sure our codegen is producing the correct database names. @@ -553,3 +548,22 @@ test_on_delete_cascade = testGroup "cascading delete tests" count_e <- countAll e return $ count_a + count_b + count_c + count_d + count_e actual @?= expected + + +-- | We want to make sure the single-SMS documents and the multi-SMS +-- documents are identified correctly. +-- +test_sms_detected_correctly :: TestTree +test_sms_detected_correctly = + testGroup "newsxml SMS count determined correctly" + [ check "test/xml/newsxml.xml" + "single SMS detected correctly" + True, + check "test/xml/newsxml-multiple-sms.xml" + "multiple SMS detected correctly" + False ] + where + check path desc expected = testCase desc $ do + xmltree <- unsafe_read_document path + let actual = has_only_single_sms xmltree + actual @?= expected