bin/djbdns-logparse.py

   1 #!/usr/bin/python3
   2 r"""
   3 Convert tinydns and dnscache logs to human-readable form
   4 """
   5
   6 import re, typing
   7 from struct import pack
   8
   9 ## Regular expressions for matching tinydns/dnscache log lines. We
  10 ## compile these once here rather than within the corresponding
  11 ## matching functions, because the latter get executed repeatedly.
  12
  13 # This first pattern is used to match the timestamp format that the
  14 # tai64nlocal program produces. It appears in both dnscache and
  15 # tinydns lines, after they've been piped through tai64nlocal, of
  16 # course.
  17 timestamp_pat = r'[\d-]+ [\d:\.]+'
  18
  19 # The regex to match dnscache log lines.
  20 dnscache_log_re = re.compile(fr'({timestamp_pat}) (\w+)(.*)')
  21
  22 # The "hex4" pattern matches a string of four hexadecimal digits. This
  23 # is used, for example, by tinydns to encode the query type
  24 # identifier.
  25 hex4_pat = r'[0-9a-f]{4}'
  26
  27 # The IP pattern matches a string of either 8 or 32 hexadecimal
  28 # characters, which correspond to IPv4 and IPv6 addresses,
  29 # respectively, in tinydns logs.
  30 ip_pat = r'[0-9a-f]{8,32}'
  31
  32 # The regex to match tinydns log lines.
  33 tinydns_log_re = re.compile(
  34     rf'({timestamp_pat}) ({ip_pat}):({hex4_pat}):({hex4_pat}) ([\+\-IC/]) ({hex4_pat}) (.*)'
  35 )
  36
  37 # A dictionary mapping query type identifiers, in decimal, to their
  38 # friendly names for tinydns. Reference:
  39 #
  40 #   https://en.wikipedia.org/wiki/List_of_DNS_record_types
  41 #
  42 # Note that mapping here is non-exhaustive, and that tinydns will
  43 # log responses for record types that it does not know about.
  44 query_type = {
  45       1: "a",
  46       2: "ns",
  47       5: "cname",
  48       6: "soa",
  49      12: "ptr",
  50      13: "hinfo",
  51      15: "mx",
  52      16: "txt",
  53      17: "rp",
  54      24: "sig",
  55      25: "key",
  56      28: "aaaa",
  57      33: "srv",
  58      35: "naptr",
  59      38: "a6",
  60      48: "dnskey",
  61      52: "tlsa",
  62      65: "https",
  63     252: "axfr",
  64     255: "any",
  65     257: "caa"
  66 }
  67
  68 # tinydns can drop a query for one of three reasons; this dictionary
  69 # maps the symbol that gets logged in each case to a human-readable
  70 # reason. We include the "+" case here, indicating that the query was
  71 # NOT dropped, to avoid a special case later on when we're formatting
  72 # the human-readable output.
  73 query_drop_reason = {
  74     "+": None,
  75     "-": "no authority",
  76     "I": "invalid query",
  77     "C": "invalid class",
  78     "/": "couldn't parse"
  79 }
  80
  81
  82 def convert_ip(ip : str) -> str:
  83     r"""
  84     Convert a hex string representing an IP address to
  85     human-readable form.
  86
  87     Parameters
  88     ----------
  89
  90     ip : str
  91         The hexadecimal representation of either an IPv4 or an IPv6
  92         address.
  93
  94     Returns
  95     -------
  96
  97     The usual decimal dotted-quad representation is returned for an
  98     IPv4 address. IPv6 addresses are returned almost as-is, but with
  99     colons inserted in the appropriate places, between every four
 100     characters.
 101
 102     Examples
 103     --------
 104
 105         >>> convert_ip("7f000001")
 106         '127.0.0.1'
 107         >>> convert_ip("00000000000000000000ffff7f000001")
 108         '0000:0000:0000:0000:0000:ffff:7f00:0001'
 109     """
 110     if len(ip) == 8:
 111         # IPv4, eg. "7f000001" -> "7f 00 00 01" -> "127.0.0.1"
 112         return ".".join(map(str, pack(">L", int(ip, 16))))
 113     elif len(ip) == 32:
 114         # IPv6 is actually simpler -- it's just a string-slicing operation.
 115         return ":".join([ip[(4*i) : (4*i+4)] for i in range(8)])
 116
 117
 118 def decode_client(words : list, i : int):
 119     r"""
 120     Helper function to decode the client field in a dnscache log
 121     entry.
 122
 123     There are two possible formats for the client field,
 124
 125       1. clientip:clientport, used by tcpopen/tcpclose entries,
 126       2. clientip:clientport:id, used by "query" entries.
 127
 128     We convert each part from hex to decimal, and in the second
 129     format, separate the packet id from the client information.
 130
 131     Parameters
 132     ----------
 133
 134     words : list
 135         The ``words`` list (a list of fields) from
 136         :func:`handle_dnscache_log`.
 137
 138     i : int
 139         The index of the client field within ``words``
 140
 141     Returns
 142     -------
 143
 144     Nothing; the ``i``th entry in the ``words`` list is modified
 145     in-place.
 146
 147     Examples
 148     --------
 149
 150         >>> words = ["foo", "bar", "7f000001:9253", "quux"]
 151         >>> decode_client(words, 2)
 152         >>> words
 153         ['foo', 'bar', '127.0.0.1:37459', 'quux']
 154
 155         >>> words = ["foo", "7f000001:a3db:4fb9", "bar", "quux"]
 156         >>> decode_client(words, 1)
 157         >>> words
 158         ['foo', '127.0.0.1:41947 (id 20409)', 'bar', 'quux']
 159
 160     """
 161     chunks = words[i].split(":")
 162
 163     ip = convert_ip(chunks[0])
 164     port = int(chunks[1], 16)
 165     words[i] = f"{ip}:{port}"
 166
 167     if len(chunks) == 3:
 168         # For a "query" entry's clientip:clientport:id field.
 169         id = int(chunks[2], 16)
 170         words[i] += f" (id {id})"
 171
 172 def decode_ip(words : list, i : int):
 173     r"""
 174     Helper function to decode the ip field in a dnscache log
 175     entry.
 176
 177     A single "serverip" field is present in the lame, nodata,
 178     nxdomain, and rr entry types. We convert it from hex to decimal.
 179
 180     Parameters
 181     ----------
 182
 183     words : list
 184         The ``words`` list (a list of fields) from
 185         :func:`handle_dnscache_log`.
 186
 187     i : int
 188         The index of the ip field within ``words``
 189
 190     Returns
 191     -------
 192
 193     Nothing; the ``i``th entry in the ``words`` list is modified
 194     in-place.
 195
 196     Examples
 197     --------
 198
 199         >>> words = ["foo", "bar", "7f000001", "quux"]
 200         >>> decode_ip(words, 2)
 201         >>> words
 202         ['foo', 'bar', '127.0.0.1', 'quux']
 203
 204         >>> words = ["foo", "00000000000000000000ffff7f000001", "bar", "quux"]
 205         >>> decode_ip(words, 1)
 206         >>> words
 207         ['foo', '0000:0000:0000:0000:0000:ffff:7f00:0001', 'bar', 'quux']
 208     """
 209     words[i] = convert_ip(words[i])
 210
 211 def decode_ttl(words : list, i : int):
 212     r"""
 213     Helper function to decode the ttl field in a dnscache log
 214     entry.
 215
 216     A single "ttl" field is present in the nodata, nxdomain, and
 217     rr entry types. We prefix it with "TTL=" so that its meaning
 218     is clear in the human-readable logs.
 219
 220     Parameters
 221     ----------
 222
 223     words : list
 224         The ``words`` list (a list of fields) from
 225         :func:`handle_dnscache_log`.
 226
 227     i : int
 228         The index of the ttl field within ``words``
 229
 230     Returns
 231     -------
 232
 233     Nothing; the ``i``th entry in the ``words`` list is modified
 234     in-place.
 235
 236     Examples
 237     --------
 238
 239         >>> words = ["c0a80101", "20865", "1", "www.example.com.", "5db8d822"]
 240         >>> decode_ttl(words, 1)
 241         >>> words
 242         ['c0a80101', 'TTL=20865', '1', 'www.example.com.', '5db8d822']
 243
 244     """
 245     words[i] = f"TTL={words[i]}"
 246
 247 def decode_serial(words : list, i : int):
 248     r"""
 249     Helper function to decode the serial field in a dnscache log
 250     entry.
 251
 252     A single "serial" field is present in the drop and query entry
 253     types. It's already in decimal; we simply prefix it with a hash.
 254
 255     Parameters
 256     ----------
 257
 258     words : list
 259         The ``words`` list (a list of fields) from
 260         :func:`handle_dnscache_log`.
 261
 262     i : int
 263         The index of the serial field within ``words``
 264
 265     Returns
 266     -------
 267
 268     Nothing; the ``i``th entry in the ``words`` list is modified
 269     in-place.
 270
 271     Examples
 272     --------
 273
 274         >>> words = ["1", "7f000001:a3db:4fb9", "1", "www.example.com."]
 275         >>> decode_serial(words, 0)
 276         >>> words
 277         ['#1', '7f000001:a3db:4fb9', '1', 'www.example.com.']
 278
 279     """
 280     words[i] = f"#{words[i]}"
 281
 282 def decode_type(words : list, i : int):
 283     qt = words[i]
 284     words[i] = query_type.get(int(qt), qt)
 285
 286 def handle_dnscache_log(line : str) -> typing.Optional[str]:
 287     r"""
 288     Handle a single log line if it matches the ``dnscache_log_re`` regex.
 289
 290     Parameters
 291     ----------
 292
 293     line : string
 294         The log line that might match ``dnscache_log_re``.
 295
 296     Returns
 297     -------
 298
 299     Either the human-readable string if the log line was handled (that
 300     is, if it was really a dnscache log line), or ``None`` if it was
 301     not.
 302
 303     Examples
 304     --------
 305
 306         >>> line = "2022-09-15 18:37:33.863805500 query 1 7f000001:a3db:4fb9 1 www.example.com."
 307         >>> handle_dnscache_log(line)
 308         '2022-09-15 18:37:33.863805500 query #1 127.0.0.1:41947 (id 20409) a www.example.com.'
 309
 310         >>> line = "2022-09-15 18:37:33.863874500 tx 0 1 www.example.com. . c0a80101"
 311         >>> handle_dnscache_log(line)
 312         '2022-09-15 18:37:33.863874500 tx g=0 a www.example.com. . 192.168.1.1'
 313
 314         >>> line = "2022-09-15 18:37:33.878529500 rr c0a80101 20865 1 www.example.com. 5db8d822"
 315         >>> handle_dnscache_log(line)
 316         '2022-09-15 18:37:33.878529500 rr 192.168.1.1 TTL=20865 a www.example.com. 93.184.216.34'
 317
 318         >>> line = "2022-09-15 18:37:33.878532500 stats 1 43 1 0"
 319         >>> handle_dnscache_log(line)
 320         '2022-09-15 18:37:33.878532500 stats count=1 motion=43 udp-active=1 tcp-active=0'
 321
 322         >>> line = "2022-09-15 18:37:33.878602500 sent 1 49"
 323         >>> handle_dnscache_log(line)
 324         '2022-09-15 18:37:33.878602500 sent #1 49'
 325
 326         >>> line = "this line is nonsense"
 327         >>> handle_dnscache_log(line)
 328
 329     """
 330     match = dnscache_log_re.match(line)
 331     if not match:
 332         return None
 333
 334     (timestamp, event, data) = match.groups()
 335
 336     words = data.split()
 337     if event == "cached":
 338         if words[0] not in ("cname", "ns", "nxdomain"):
 339             decode_type(words, 0)
 340
 341     elif event == "drop":
 342         decode_serial(words, 0)
 343
 344     elif event == "lame":
 345         decode_ip(words, 0)
 346
 347     elif event == "nodata":
 348         decode_ip(words, 0)
 349         decode_ttl(words, 1)
 350         decode_type(words, 2)
 351
 352     elif event == "nxdomain":
 353         decode_ip(words, 0)
 354         decode_ttl(words, 1)
 355
 356     elif event == "query":
 357         decode_serial(words, 0)
 358         decode_client(words, 1)
 359         decode_type(words, 2)
 360
 361     elif event == "rr":
 362         decode_ip(words, 0)
 363         decode_ttl(words, 1)
 364         if words[2] not in ("cname", "mx", "ns", "ptr", "soa"):
 365             decode_type(words, 2)
 366             if words[2] == "a":         # decode answer to an A query
 367                 decode_ip(words, 4)
 368             if words[2] == "txt":       # text record
 369                 response = words[4]
 370                 if response.endswith("..."):
 371                     ellipsis = "..."
 372                     response = response[0:-3]
 373                 else:
 374                     ellipsis = ""
 375                 length = int(response[0:2], 16)
 376                 chars = []
 377                 for i in range(1, len(response)//2):
 378                     chars.append(chr(int(response[2*i : (2*i)+2], 16)))
 379                 txt = "".join(chars)
 380                 words[4] = f"{length}:\"{txt}{ellipsis}\""
 381
 382     elif event == "sent":
 383         decode_serial(words, 0)
 384
 385     elif event == "stats":
 386         words[0] = f"count={words[0]}"
 387         words[1] = f"motion={words[1]}"
 388         words[2] = f"udp-active={words[2]}"
 389         words[3] = f"tcp-active={words[3]}"
 390
 391     elif event == "tx":
 392         words[0] = f"g={words[0]}"
 393         decode_type(words, 1)
 394         # words[2] = name
 395         # words[3] = control (domain for which these servers are believed
 396         #            to be authoritative)
 397         for i in range(4, len(words)):
 398             decode_ip(words, i)
 399
 400     elif event in ("tcpopen", "tcpclose"):
 401         decode_client(words, 0)
 402
 403     # Reconstitute "data" (i.e. everything after the timestamp and the
 404     # event) from "words", which was originally obtained by splitting
 405     # "data".
 406     data = " ".join(words)
 407     return f"{timestamp} {event} {data}"
 408
 409
 410
 411 def handle_tinydns_log(line : str) -> typing.Optional[str]:
 412     r"""
 413     Handle a single log line if it matches the ``tinydns_log_re`` regex.
 414
 415     Parameters
 416     ----------
 417
 418     line : string
 419         The log line that might match ``tinydns_log_re``.
 420
 421     Returns
 422     -------
 423
 424     Either the human-readable string if the log line was handled (that
 425     is, if it was really a tinydns log line), or ``None`` if it was
 426     not.
 427
 428     Examples
 429     --------
 430
 431         >>> line = "2022-09-14 21:04:40.206516500 7f000001:9d61:be69 - 0001 www.example.com"
 432         >>> handle_tinydns_log(line)
 433         '2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com'
 434
 435         >>> line = "this line is nonsense"
 436         >>> handle_tinydns_log(line)
 437
 438     """
 439     match = tinydns_log_re.match(line)
 440     if not match:
 441         return None
 442
 443     (timestamp, ip, port, id, code, type, name) = match.groups()
 444     ip = convert_ip(ip)
 445     port = int(port, 16)
 446     id = int(id, 16)
 447
 448     # Convert the "type" field to a human-readable record type name
 449     # using the query_type dictionary. If the right name isn't present
 450     # in the dictionary, we use the (decimal) type id instead.
 451     type = int(type, 16)                # "001c" -> 28
 452     type = query_type.get(type, type)   # 28 -> "aaaa"
 453
 454     line_tpl = "{timestamp} "
 455
 456     reason = query_drop_reason[code]
 457     if code == "+":
 458         line_tpl += "sent response to {ip}:{port} (id {id}): {type} {name}"
 459     else:
 460         line_tpl += "dropped query ({reason}) from {ip}:{port}"
 461         if code != "/":
 462             # If the query can actually be parsed, the log line is a
 463             # bit more informative than it would have been otherwise.
 464             line_tpl += " (id {id}): {type} {name}"
 465
 466     return line_tpl.format(timestamp=timestamp,
 467                            reason=reason,
 468                            ip=ip,
 469                            port=port,
 470                            id=id,
 471                            type=type,
 472                            name=name)
 473
 474
 475 def parse_logfile(file : typing.TextIO):
 476     r"""
 477     Process a single log ``file``.
 478
 479     Parameters
 480     ----------
 481
 482     file : typing.TextIO
 483         An open log file, or stdin.
 484
 485     Examples
 486     --------
 487
 488         >>> line = "@4000000063227a320c4f3114 7f000001:9d61:be69 - 0001 www.example.com\n"
 489         >>> from tempfile import NamedTemporaryFile
 490         >>> with NamedTemporaryFile(mode="w", delete=False) as f:
 491         ...     _ = f.write(line)
 492         >>> f = open(f.name, 'r')
 493         >>> parse_logfile(f)
 494         2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com
 495         >>> f.close()
 496         >>> from os import remove
 497         >>> remove(f.name)
 498
 499     """
 500     # Open a pipe to tai64nlocal. We'll write lines of our input file
 501     # (the log file) to it, and read back the same lines but with
 502     # friendly timestamps in them.
 503     from subprocess import Popen, PIPE
 504     tai = Popen(["tai64nlocal"], stdin=PIPE, stdout=PIPE, text=True, bufsize=0)
 505
 506     for line in file:
 507         tai.stdin.write(line)
 508         line = tai.stdout.readline()
 509
 510         friendly_line = handle_tinydns_log(line)
 511         if not friendly_line:
 512             friendly_line = handle_dnscache_log(line)
 513             if not friendly_line:
 514                 friendly_line = line
 515
 516         print(friendly_line)
 517
 518 def main():
 519     r"""
 520     The entry point to the program.
 521
 522     This function is responsible only for parsing any command-line
 523     arguments, and then calling :func`parse_logfile` on them.
 524     """
 525     # Create an argument parser using the file's docsctring as its
 526     # description.
 527     from argparse import ArgumentParser, FileType
 528     parser = ArgumentParser(description = __doc__)
 529
 530     # Parse zero or more positional arguments into a list of
 531     # "logfiles". If none are given, read from stdin instead.
 532     from sys import stdin
 533     parser.add_argument("logfiles",
 534                         metavar="LOGFILE",
 535                         type=FileType("r"),
 536                         nargs="*",
 537                         default=[stdin],
 538                         help="djbdns logfile to process (default: stdin)")
 539
 540     # Warning: argparse automatically opens its file arguments here,
 541     # and they only get closed when the program terminates. There's no
 542     # real benefit to closing them one-at-a-time after calling
 543     # parse_logfile(), because the "scarce" resource of open file
 544     # descriptors gets consumed immediately, before any processing has
 545     # happened. In other words, if you're going to run out of file
 546     # descriptors, it's going to happen right now.
 547     #
 548     # So anyway, don't run this on several million logfiles.
 549     args = parser.parse_args()
 550     for f in args.logfiles:
 551         parse_logfile(f)
 552
 553
 554 if __name__ == "__main__":
 555     main()