]> gitweb.michael.orlitzky.com - dead/htsn-import.git/blob - src/TSN/XML/News.hs
f401d41cb2dbb67d139395a6089c36f09ac9a23d
[dead/htsn-import.git] / src / TSN / XML / News.hs
1 {-# LANGUAGE DeriveDataTypeable #-}
2 {-# LANGUAGE FlexibleInstances #-}
3 {-# LANGUAGE GADTs #-}
4 {-# LANGUAGE QuasiQuotes #-}
5 {-# LANGUAGE RecordWildCards #-}
6 {-# LANGUAGE TemplateHaskell #-}
7 {-# LANGUAGE TypeFamilies #-}
8
9 -- | Parse TSN XML for the DTD \"newsxml.dtd\". Each document contains
10 -- a root element \<message\> that contains an entire news item.
11 --
12 module TSN.XML.News (
13 dtd,
14 has_only_single_sms,
15 pickle_message,
16 -- * Tests
17 news_tests,
18 -- * WARNING: these are private but exported to silence warnings
19 News_LocationConstructor(..),
20 News_NewsTeamConstructor(..),
21 NewsConstructor(..),
22 NewsTeamConstructor(..) )
23 where
24
25 -- System imports.
26 import Data.Data ( Data, constrFields, dataTypeConstrs, dataTypeOf )
27 import Data.Time.Clock ( UTCTime )
28 import Data.List.Utils ( join, split )
29 import Data.Tuple.Curry ( uncurryN )
30 import Data.Typeable ( Typeable )
31 import Database.Groundhog (
32 countAll,
33 deleteAll,
34 insert_,
35 migrate,
36 runMigration,
37 silentMigrationLogger )
38 import Database.Groundhog.Core ( DefaultKey )
39 import Database.Groundhog.Generic ( runDbConn )
40 import Database.Groundhog.Sqlite ( withSqliteConn )
41 import Database.Groundhog.TH (
42 defaultCodegenConfig,
43 groundhog,
44 mkPersist )
45 import Test.Tasty ( TestTree, testGroup )
46 import Test.Tasty.HUnit ( (@?=), testCase )
47 import Text.XML.HXT.Core (
48 PU,
49 XmlTree,
50 (/>),
51 (>>>),
52 addNav,
53 descendantAxis,
54 filterAxis,
55 followingSiblingAxis,
56 hasName,
57 remNav,
58 runLA,
59 xp13Tuple,
60 xpAttr,
61 xpElem,
62 xpInt,
63 xpList,
64 xpOption,
65 xpPair,
66 xpText,
67 xpWrap )
68
69 -- Local imports.
70 import TSN.Codegen (
71 tsn_codegen_config,
72 tsn_db_field_namer ) -- Used in a test
73 import TSN.Database ( insert_or_select )
74 import TSN.DbImport ( DbImport(..), ImportResult(..), run_dbmigrate )
75 import TSN.Picklers ( xp_time_stamp )
76 import TSN.Location ( Location(..), pickle_location )
77 import TSN.XmlImport ( XmlImport(..) )
78 import Xml (
79 FromXml(..),
80 ToDb(..),
81 pickle_unpickle,
82 unpickleable,
83 unsafe_read_document,
84 unsafe_unpickle )
85
86
87 -- | The DTD to which this module corresponds. Used to invoke dbimport.
88 --
89 dtd :: String
90 dtd = "newsxml.dtd"
91
92
93 --
94 -- DB/XML Data types
95 --
96
97 -- * News/Message
98
99 -- | The msg_id child of \<message\> contains an event_id attribute; we
100 -- embed it into the 'News' type. We (pointlessly) use the \"db_\"
101 -- prefix here so that the two names don't collide on \"id\" when
102 -- Groundhog is creating its fields using our field namer.
103 --
104 data MsgId =
105 MsgId {
106 db_msg_id :: Int,
107 db_event_id :: Maybe Int }
108 deriving (Data, Eq, Show, Typeable)
109
110
111 -- | The XML representation of a news item (\<message\>).
112 --
113 data Message =
114 Message {
115 xml_xml_file_id :: Int,
116 xml_heading :: String,
117 xml_mid :: MsgId,
118 xml_category :: String,
119 xml_sport :: String,
120 xml_url :: Maybe String,
121 xml_teams :: [NewsTeam],
122 xml_locations :: [Location],
123 xml_sms :: Maybe String,
124 xml_editor :: Maybe String,
125 xml_text :: Maybe String, -- Text and continue seem to show up in pairs,
126 xml_continue :: Maybe String, -- either both present or both missing.
127 xml_time_stamp :: UTCTime }
128 deriving (Eq, Show)
129
130
131 -- | The database representation of a news item. We drop several
132 -- uninteresting fields from 'Message', and omit the list fields which
133 -- will be represented as join tables.
134 --
135 data News =
136 News {
137 db_xml_file_id :: Int,
138 db_mid :: MsgId,
139 db_sport :: String,
140 db_url :: Maybe String,
141 db_sms :: Maybe String,
142 db_editor :: Maybe String,
143 db_text :: Maybe String,
144 db_continue :: Maybe String,
145 db_time_stamp :: UTCTime }
146 deriving (Data, Eq, Show, Typeable)
147
148
149
150 instance ToDb Message where
151 -- | The database representation of 'Message' is 'News'.
152 type Db Message = News
153
154 -- | Convert the XML representation 'Message' to the database
155 -- representation 'News'.
156 --
157 instance FromXml Message where
158 -- | We use a record wildcard so GHC doesn't complain that we never
159 -- used the field names.
160 --
161 from_xml Message{..} = News { db_xml_file_id = xml_xml_file_id,
162 db_mid = xml_mid,
163 db_sport = xml_sport,
164 db_url = xml_url,
165 db_sms = xml_sms,
166 db_editor = xml_editor,
167 db_text = xml_text,
168 db_continue = xml_continue,
169 db_time_stamp = xml_time_stamp }
170
171 -- | This lets us insert the XML representation 'Message' directly.
172 --
173 instance XmlImport Message
174
175
176 -- * NewsTeam
177
178 -- | The database/XML type for teams as they show up in the news. We
179 -- can't reuse the representation from "TSN.Team" because they
180 -- require a team id. We wouldn't want to make the team ID optional
181 -- and then insert a team with no id, only to find the same team
182 -- later with an id and be unable to update the record. (We could
183 -- add the update logic, but it would be more trouble than it's
184 -- worth.)
185 --
186 data NewsTeam =
187 NewsTeam { team_name :: String }
188 deriving (Eq, Show)
189
190
191
192 -- * News_NewsTeam
193
194 -- | Mapping between News records and NewsTeam records in the
195 -- database. We don't name the fields because we don't use the names
196 -- explicitly; that means we have to give them nice database names
197 -- via groundhog.
198 --
199 data News_NewsTeam = News_NewsTeam
200 (DefaultKey News)
201 (DefaultKey NewsTeam)
202
203
204 -- * News_Location
205
206 -- | Mapping between 'News' records and 'Location' records in the
207 -- database. We don't name the fields because we don't use the names
208 -- explicitly; that means we have to give them nice database names
209 -- via groundhog.
210 --
211 data News_Location = News_Location
212 (DefaultKey News)
213 (DefaultKey Location)
214
215
216
217
218 -- | Some newsxml documents contain two \<SMS\> elements in a row,
219 -- violating the DTD. The second one has always been empty, but it's
220 -- irrelevant: we can't parse these, and would like to detect them
221 -- in order to report the fact that the busted document is
222 -- unsupported.
223 --
224 -- This function detects whether two \<SMS\> elements appear in a
225 -- row, as siblings.
226 --
227 has_only_single_sms :: XmlTree -> Bool
228 has_only_single_sms xmltree =
229 case elements of
230 [] -> True
231 _ -> False
232 where
233 parse :: XmlTree -> [XmlTree]
234 parse = runLA $ hasName "/"
235 /> hasName "message"
236 >>> addNav
237 >>> descendantAxis
238 >>> filterAxis (hasName "SMS")
239 >>> followingSiblingAxis
240 >>> remNav
241 >>> hasName "SMS"
242
243 elements = parse xmltree
244
245
246 --
247 -- * Database code
248 --
249
250 -- | Define 'dbmigrate' and 'dbimport' for 'Message's. The import is
251 -- slightly non-generic because of our 'News_NewsTeam' and
252 -- 'News_Location' join tables.
253 --
254 instance DbImport Message where
255 dbmigrate _ =
256 run_dbmigrate $ do
257 migrate (undefined :: Location)
258 migrate (undefined :: News)
259 migrate (undefined :: NewsTeam)
260 migrate (undefined :: News_NewsTeam)
261 migrate (undefined :: News_Location)
262
263 dbimport message = do
264 -- Insert the message and acquire its primary key (unique ID)
265 news_id <- insert_xml message
266
267 -- Now insert the teams. We use insert_or_select because we know
268 -- that most teams will already exist, and we want to get back the
269 -- id for the existing team when there's a collision.
270 nt_ids <- mapM insert_or_select (xml_teams message)
271
272 -- Now that the teams have been inserted, create
273 -- news__news_team records mapping beween the two.
274 let news_news_teams = map (News_NewsTeam news_id) nt_ids
275 mapM_ insert_ news_news_teams
276
277 -- Do all of that over again for the Locations.
278 loc_ids <- mapM insert_or_select (xml_locations message)
279 let news_news_locations = map (News_Location news_id) loc_ids
280 mapM_ insert_ news_news_locations
281
282 return ImportSucceeded
283
284
285 -- These types don't have special XML representations or field name
286 -- collisions so we use the defaultCodegenConfig and give their
287 -- fields nice simple names.
288 mkPersist defaultCodegenConfig [groundhog|
289 - entity: NewsTeam
290 dbName: news_teams
291 constructors:
292 - name: NewsTeam
293 uniques:
294 - name: unique_news_teams
295 type: constraint
296 fields: [team_name]
297
298 |]
299
300
301 -- These types have fields with e.g. db_ and xml_ prefixes, so we
302 -- use our own codegen to peel those off before naming the columns.
303 mkPersist tsn_codegen_config [groundhog|
304 - entity: News
305 constructors:
306 - name: News
307 uniques:
308 - name: unique_news
309 type: constraint
310 # Prevent multiple imports of the same message.
311 fields: [db_xml_file_id]
312 fields:
313 - name: db_mid
314 embeddedType:
315 - {name: msg_id, dbName: msg_id}
316 - {name: event_id, dbName: event_id}
317
318 - embedded: MsgId
319 fields:
320 - name: db_msg_id
321 dbName: msg_id
322 - name: db_event_id
323 dbName: event_id
324
325 - entity: News_NewsTeam
326 dbName: news__news_teams
327 constructors:
328 - name: News_NewsTeam
329 fields:
330 - name: news_NewsTeam0 # Default created by mkNormalFieldName
331 dbName: news_id
332 reference:
333 onDelete: cascade
334 - name: news_NewsTeam1 # Default created by mkNormalFieldName
335 dbName: news_teams_id
336 reference:
337 onDelete: cascade
338
339 - entity: News_Location
340 dbName: news__locations
341 constructors:
342 - name: News_Location
343 fields:
344 - name: news_Location0 # Default created by mkNormalFieldName
345 dbName: news_id
346 reference:
347 onDelete: cascade
348 - name: news_Location1 # Default created by mkNormalFieldName
349 dbName: locations_id
350 reference:
351 onDelete: cascade
352 |]
353
354
355 --
356 -- XML Picklers
357 --
358
359 -- | Convert a 'NewsTeam' to/from XML.
360 --
361 pickle_news_team :: PU NewsTeam
362 pickle_news_team =
363 xpElem "team" $
364 xpWrap (from_string, to_string) xpText
365 where
366 to_string :: NewsTeam -> String
367 to_string = team_name
368
369 from_string :: String -> NewsTeam
370 from_string = NewsTeam
371
372
373 -- | Convert a 'MsgId' to/from XML.
374 --
375 pickle_msg_id :: PU MsgId
376 pickle_msg_id =
377 xpElem "msg_id" $
378 xpWrap (from_tuple, to_tuple) $
379 xpPair xpInt (xpAttr "EventId" (xpOption xpInt))
380 where
381 from_tuple = uncurryN MsgId
382 to_tuple m = (db_msg_id m, db_event_id m)
383
384
385
386 -- | Convert a 'Message' to/from XML.
387 --
388 pickle_message :: PU Message
389 pickle_message =
390 xpElem "message" $
391 xpWrap (from_tuple, to_tuple) $
392 xp13Tuple (xpElem "XML_File_ID" xpInt)
393 (xpElem "heading" xpText)
394 pickle_msg_id
395 (xpElem "category" xpText)
396 (xpElem "sport" xpText)
397 (xpElem "url" $ xpOption xpText)
398 (xpList pickle_news_team)
399 (xpList pickle_location)
400 (xpElem "SMS" $ xpOption xpText)
401 (xpOption (xpElem "Editor" xpText))
402 (xpOption (xpElem "text" xpText))
403 pickle_continue
404 (xpElem "time_stamp" xp_time_stamp)
405 where
406 from_tuple = uncurryN Message
407 to_tuple m = (xml_xml_file_id m, -- Verbose,
408 xml_heading m, -- but
409 xml_mid m, -- eliminates
410 xml_category m, -- GHC
411 xml_sport m, -- warnings
412 xml_url m, -- .
413 xml_teams m, -- .
414 xml_locations m, -- .
415 xml_sms m,
416 xml_editor m,
417 xml_text m,
418 xml_continue m,
419 xml_time_stamp m)
420
421 -- | We combine all of the \<continue\> elements into one 'String'
422 -- while unpickling and do the reverse while pickling.
423 --
424 pickle_continue :: PU (Maybe String)
425 pickle_continue =
426 xpOption $
427 xpWrap (to_string, from_string) $
428 xpElem "continue" $
429 xpList (xpElem "P" xpText)
430 where
431 from_string :: String -> [String]
432 from_string = split "\n"
433
434 to_string :: [String] -> String
435 to_string = join "\n"
436
437
438 --
439 -- Tasty Tests
440 --
441
442 -- | A list of all tests for this module.
443 --
444 news_tests :: TestTree
445 news_tests =
446 testGroup
447 "News tests"
448 [ test_news_fields_have_correct_names,
449 test_on_delete_cascade,
450 test_pickle_of_unpickle_is_identity,
451 test_unpickle_succeeds,
452 test_sms_detected_correctly ]
453
454
455 -- | Make sure our codegen is producing the correct database names.
456 --
457 test_news_fields_have_correct_names :: TestTree
458 test_news_fields_have_correct_names =
459 testCase "news fields get correct database names" $
460 mapM_ check (zip actual expected)
461 where
462 -- This is cool, it uses the (derived) Data instance of
463 -- News.News to get its constructor names.
464 field_names :: [String]
465 field_names =
466 constrFields . head $ dataTypeConstrs $ dataTypeOf (undefined :: News)
467
468 expected :: [String]
469 expected =
470 map (\x -> tsn_db_field_namer "herp" "derp" 8675309 x 90210) field_names
471
472 actual :: [String]
473 actual = ["xml_file_id",
474 "mid",
475 "sport",
476 "url",
477 "sms",
478 "editor",
479 "text",
480 "continue"]
481
482 check (x,y) = (x @?= y)
483
484
485 -- | If we unpickle something and then pickle it, we should wind up
486 -- with the same thing we started with. WARNING: success of this
487 -- test does not mean that unpickling succeeded.
488 --
489 test_pickle_of_unpickle_is_identity :: TestTree
490 test_pickle_of_unpickle_is_identity = testGroup "pickle-unpickle tests"
491 [ check "pickle composed with unpickle is the identity"
492 "test/xml/newsxml.xml",
493
494 check "pickle composed with unpickle is the identity (with Editor)"
495 "test/xml/newsxml-with-editor.xml",
496
497 check "pickle composed with unpickle is the identity (empty SMS)"
498 "test/xml/newsxml-empty-sms.xml" ]
499 where
500 check desc path = testCase desc $ do
501 (expected, actual) <- pickle_unpickle pickle_message path
502 actual @?= expected
503
504
505 -- | Make sure we can actually unpickle these things.
506 --
507 test_unpickle_succeeds :: TestTree
508 test_unpickle_succeeds = testGroup "unpickle tests"
509 [ check "unpickling succeeds"
510 "test/xml/newsxml.xml",
511
512 check "unpickling succeeds (with Editor)"
513 "test/xml/newsxml-with-editor.xml",
514
515 check "unpickling succeeds (empty SMS)"
516 "test/xml/newsxml-empty-sms.xml" ]
517 where
518 check desc path = testCase desc $ do
519 actual <- unpickleable path pickle_message
520 let expected = True
521 actual @?= expected
522
523
524 -- | Make sure everything gets deleted when we delete the top-level
525 -- record.
526 --
527 test_on_delete_cascade :: TestTree
528 test_on_delete_cascade = testGroup "cascading delete tests"
529 [ check "deleting news deletes its children"
530 "test/xml/newsxml.xml"
531 4 -- 2 news_teams and 2 news_locations that should remain.
532 ,
533 check "deleting news deletes its children (empty SMS)"
534 "test/xml/newsxml-empty-sms.xml"
535 4 -- 2 news_teams and 2 news_locations
536 ]
537 where
538 check desc path expected = testCase desc $ do
539 news <- unsafe_unpickle path pickle_message
540 let a = undefined :: Location
541 let b = undefined :: News
542 let c = undefined :: NewsTeam
543 let d = undefined :: News_NewsTeam
544 let e = undefined :: News_Location
545 actual <- withSqliteConn ":memory:" $ runDbConn $ do
546 runMigration silentMigrationLogger $ do
547 migrate a
548 migrate b
549 migrate c
550 migrate d
551 migrate e
552 _ <- dbimport news
553 deleteAll b
554 count_a <- countAll a
555 count_b <- countAll b
556 count_c <- countAll c
557 count_d <- countAll d
558 count_e <- countAll e
559 return $ count_a + count_b + count_c + count_d + count_e
560 actual @?= expected
561
562
563 -- | We want to make sure the single-SMS documents and the multi-SMS
564 -- documents are identified correctly.
565 --
566 test_sms_detected_correctly :: TestTree
567 test_sms_detected_correctly =
568 testGroup "newsxml SMS count determined correctly"
569 [ check "test/xml/newsxml.xml"
570 "single SMS detected correctly"
571 True,
572 check "test/xml/newsxml-multiple-sms.xml"
573 "multiple SMS detected correctly"
574 False ]
575 where
576 check path desc expected = testCase desc $ do
577 xmltree <- unsafe_read_document path
578 let actual = has_only_single_sms xmltree
579 actual @?= expected