From: Michael Orlitzky Date: Sat, 5 Jul 2014 23:00:31 +0000 (-0400) Subject: Add a function to TSN.XML.Weather to detect the unsupported second type. X-Git-Tag: 0.0.6~28 X-Git-Url: https://gitweb.michael.orlitzky.com/?a=commitdiff_plain;h=f60eab7bde994afb9b2f727e56b0a635413bdd3b;p=dead%2Fhtsn-import.git Add a function to TSN.XML.Weather to detect the unsupported second type. Fail with ImportUnsupported instead of an error when we encounter the second type of weatherxml. Update the man page with a mention of the unsupported weatherxml handling. Remove a TODO. Add tests for the unsupported weather document and its underlying function. --- diff --git a/doc/TODO b/doc/TODO index 9413031..f9d38e6 100644 --- a/doc/TODO +++ b/doc/TODO @@ -4,10 +4,7 @@ 2. Write a test for test/xml/Odds_XML-long-import.xml once it no longer takes 10 minutes to import (Postgres only?). -3. Return ImportUnsupported for the second type of weatherxml (see man - page). - -4. We have DTDs but no sample XML for the following SportInfo types, +3. We have DTDs but no sample XML for the following SportInfo types, which have therefore been left unimplmented for now: * Cbask_Indv_No_Avg_XML.dtd @@ -27,7 +24,7 @@ * NFLYardsXML.dtd * NFL_PuntingLeaders_XML.dtd -5. The following DTD types were handled (in some form) by the old +4. The following DTD types were handled (in some form) by the old FeedGrabber. They are not yet handled by htsn-import (some may not be valid): @@ -66,11 +63,11 @@ * WNBA_Individual_Stats_XML * WNBATeamScheduleXML -6. Consolidate all of the make_game_time functions which take a +5. Consolidate all of the make_game_time functions which take a date/time and produce a combined time. -7. Move the News/Scores Locations into TSN.Locations. +6. Move the News/Scores Locations into TSN.Locations. -8. Regenerate the News/Scores dbschema diagrams. +7. Regenerate the News/Scores dbschema diagrams. -9. Re-test import of News/Scores samples. +8. Re-test import of News/Scores samples. 
diff --git a/doc/man1/htsn-import.1 b/doc/man1/htsn-import.1 index 5e3d5ac..47ca0d1 100644 --- a/doc/man1/htsn-import.1 +++ b/doc/man1/htsn-import.1 @@ -300,7 +300,13 @@ There appear to be two types of weather documents; the first has listing elements contained within forecast elements, and the second has forecast elements contained within listing elements. While it would be possible to parse both, it would greatly complicate things. The first form is more common, so that's all we support for now. An example is provided as -doc/xml-samples/weird-weatherxml.xml. +test/xml/weatherxml-type2.xml. + +We are however able to identify the second type. When one is +encountered, an informational message (that it is unsupported) will be +printed. If the \fI\-\-remove\fR flag is used, the file will be +deleted. This prevents documents that we know we can't import from +building up. .SH DEPLOYMENT .P diff --git a/src/Main.hs b/src/Main.hs index fc6d14b..0fe6096 100644 --- a/src/Main.hs +++ b/src/Main.hs @@ -65,7 +65,7 @@ import qualified TSN.XML.ScheduleChanges as ScheduleChanges ( pickle_message ) import qualified TSN.XML.Scores as Scores ( dtd, pickle_message ) import qualified TSN.XML.SportInfo as SportInfo ( dtds, parse_xml ) -import qualified TSN.XML.Weather as Weather ( dtd, pickle_message ) +import qualified TSN.XML.Weather as Weather ( dtd, is_type1, pickle_message ) import Xml ( DtdName(..), parse_opts ) @@ -203,7 +203,14 @@ import_file cfg path = do | dtd == Scores.dtd = go Scores.pickle_message -- SportInfo and GameInfo appear last in the guards - | dtd == Weather.dtd = go Weather.pickle_message + | dtd == Weather.dtd = + if Weather.is_type1 xml + then go Weather.pickle_message + else do + -- We want these to "succeed" so that they're deleted. + -- We already know we can't parse them. 
+ let msg = "Unsupported weatherxml.dtd type (" ++ path ++ ")" + return $ ImportUnsupported msg | dtd `elem` GameInfo.dtds = do let either_m = GameInfo.parse_xml dtd xml diff --git a/src/TSN/XML/Weather.hs b/src/TSN/XML/Weather.hs index c2eee4a..351df24 100644 --- a/src/TSN/XML/Weather.hs +++ b/src/TSN/XML/Weather.hs @@ -11,6 +11,7 @@ -- module TSN.XML.Weather ( dtd, + is_type1, pickle_message, -- * Tests weather_tests, @@ -42,6 +43,12 @@ import Test.Tasty ( TestTree, testGroup ) import Test.Tasty.HUnit ( (@?=), testCase ) import Text.XML.HXT.Core ( PU, + XmlTree, + (/>), + hasName, + readDocument, + runLA, + runX, xp8Tuple, xp9Tuple, xpAttr, @@ -65,6 +72,7 @@ import Xml ( FromXml(..), FromXmlFk(..), ToDb(..), + parse_opts, pickle_unpickle, unpickleable, unsafe_unpickle ) @@ -384,6 +392,33 @@ mkPersist tsn_codegen_config [groundhog| |] + +-- | There are two different types of documents that claim to be +-- \"weatherxml.dtd\". The first, more common type has listings +-- within forecasts. The second type has forecasts within +-- listings. Clearly we can't parse both of these using the same +-- parser! +-- +-- For now we're simply punting on the issue and refusing to parse +-- the second type. This will check the given @xmltree@ to see if +-- there are any forecasts contained within listings. If there are, +-- then it's the second type that we don't know what to do with. 
+-- +is_type1 :: XmlTree -> Bool +is_type1 xmltree = + case elements of + [] -> True + _ -> False + where + parse :: XmlTree -> [XmlTree] + parse = runLA $ hasName "/" + /> hasName "message" + /> hasName "listing" + /> hasName "forecast" + + elements = parse xmltree + + instance DbImport Message where dbmigrate _ = run_dbmigrate $ do @@ -553,7 +588,8 @@ weather_tests = "Weather tests" [ test_on_delete_cascade, test_pickle_of_unpickle_is_identity, - test_unpickle_succeeds ] + test_unpickle_succeeds, + test_types_detected_correctly ] -- | If we unpickle something and then pickle it, we should wind up @@ -619,3 +655,26 @@ test_on_delete_cascade = testGroup "cascading delete tests" return $ count_a + count_b + count_c + count_d let expected = 0 actual @?= expected + + +test_types_detected_correctly :: TestTree +test_types_detected_correctly = + testGroup "weatherxml types detected correctly" $ + [ check "test/xml/weatherxml.xml" + "first type detected correctly" + True, + check "test/xml/weatherxml-detailed.xml" + "first type detected correctly (detailed)" + True, + check "test/xml/weatherxml-type2.xml" + "second type detected correctly" + False ] + where + unsafe_get_xmltree :: String -> IO XmlTree + unsafe_get_xmltree path = + fmap head $ runX $ readDocument parse_opts path + + check path desc expected = testCase desc $ do + xmltree <- unsafe_get_xmltree path + let actual = is_type1 xmltree + actual @?= expected diff --git a/test/shell/import-duplicates.test b/test/shell/import-duplicates.test index 247a996..9f46f2e 100644 --- a/test/shell/import-duplicates.test +++ b/test/shell/import-duplicates.test @@ -12,10 +12,11 @@ rm -f shelltest.sqlite3 >>>= 0 # We note the number of XML files that we have. There's one extra -# Heartbeat.xml that doesn't really count. +# Heartbeat.xml that doesn't really count, and a weatherxml that +# isn't really supposed to import. 
find ./test/xml -maxdepth 1 -name '*.xml' | wc -l >>> -22 +23 >>>= 0 # Run the imports again; we should get complaints about the duplicate diff --git a/test/shell/weatherxml-type2-unsupported.test b/test/shell/weatherxml-type2-unsupported.test new file mode 100644 index 0000000..d68eb6f --- /dev/null +++ b/test/shell/weatherxml-type2-unsupported.test @@ -0,0 +1,11 @@ +# +# The second type (see the man page) of weatherxml is unsupported, but +# we don't want to consider it an error when we encounter one, because +# we want to delete it. +# +# The real output contains escape characters so we use a regexp to get +# a pretty good idea of what it says. + +./dist/build/htsn-import/htsn-import test/xml/weatherxml-type2.xml +>>> /Unsupported weatherxml\.dtd type \(test\/xml\/weatherxml-type2.xml\)/ +>>>= 0 diff --git a/doc/xml-samples/weird-weatherxml.xml b/test/xml/weatherxml-type2.xml similarity index 100% rename from doc/xml-samples/weird-weatherxml.xml rename to test/xml/weatherxml-type2.xml