bin/djbdns-logparse.py

   1 #!/usr/bin/python3
   2 """
   3 Convert tinydns and dnscache logs to human-readable form
   4 """
   5
   6 import re, typing
   7 from struct import pack
   8
   9 ## Regular expressions for matching tinydns/dnscache log lines. We
  10 ## compile these once here rather than within the corresponding
  11 ## matching functions, because the latter get executed repeatedly.
  12
  13 # This first pattern is used to match the timestamp format that the
  14 # tai64nlocal program produces. It appears in both dnscache and
  15 # tinydns lines, after they've been piped through tai64nlocal, of
  16 # course.
  17 timestamp_pat = r'[\d-]+ [\d:\.]+'
  18
  19 # The regex to match dnscache log lines.
  20 dnscache_log_re = re.compile(fr'({timestamp_pat}) (\w+)(.*)')
  21
  22 # The "hex4" pattern matches a string of four hexadecimal digits. This
  23 # is used, for example, by tinydns to encode the query type
  24 # identifier.
  25 hex4_pat = r'[0-9a-f]{4}'
  26
  27 # The IP pattern matches a string of either 8 or 32 hexadecimal
  28 # characters, which correspond to IPv4 and IPv6 addresses,
  29 # respectively, in tinydns logs.
  30 ip_pat = r'[0-9a-f]{8,32}'
  31
  32 # The regex to match tinydns log lines.
  33 tinydns_log_re = re.compile(
  34     rf'({timestamp_pat}) ({ip_pat}):({hex4_pat}):({hex4_pat}) ([\+\-IC/]) ({hex4_pat}) (.*)'
  35 )
  36
  37 # A dictionary mapping query type identifiers, in decimal, to their
  38 # friendly names for tinydns. Reference:
  39 #
  40 #   https://en.wikipedia.org/wiki/List_of_DNS_record_types
  41 #
  42 # Note that mapping here is non-exhaustive, and that tinydns will
  43 # log responses for record types that it does not know about.
  44 query_type = {
  45       1: "a",
  46       2: "ns",
  47       5: "cname",
  48       6: "soa",
  49      12: "ptr",
  50      13: "hinfo",
  51      15: "mx",
  52      16: "txt",
  53      17: "rp",
  54      24: "sig",
  55      25: "key",
  56      28: "aaaa",
  57      33: "srv",
  58      35: "naptr",
  59      38: "a6",
  60      48: "dnskey",
  61      52: "tlsa",
  62      65: "https",
  63     252: "axfr",
  64     255: "any",
  65     257: "caa"
  66 }
  67
  68 # tinydns can drop a query for one of three reasons; this dictionary
  69 # maps the symbol that gets logged in each case to a human-readable
  70 # reason. We include the "+" case here, indicating that the query was
  71 # NOT dropped, to avoid a special case later on when we're formatting
  72 # the human-readable output.
  73 query_drop_reason = {
  74     "+": None,
  75     "-": "no authority",
  76     "I": "invalid query",
  77     "C": "invalid class",
  78     "/": "couldn't parse"
  79 }
  80
  81
  82 def convert_ip(ip : str) -> str:
  83     """
  84     Convert a hex string representing an IP address to
  85     human-readable form.
  86
  87     Parameters
  88     ----------
  89
  90     ip : str
  91         The hexadecimal representation of either an IPv4 or an IPv6
  92         address.
  93
  94     Returns
  95     -------
  96
  97     The usual decimal dotted-quad representation is returned for an
  98     IPv4 address. IPv6 addresses are returned almost as-is, but with
  99     colons inserted in the appropriate places, between every four
 100     characters.
 101
 102     Examples
 103     --------
 104
 105         >>> convert_ip("7f000001")
 106         '127.0.0.1'
 107         >>> convert_ip("00000000000000000000ffff7f000001")
 108         '0000:0000:0000:0000:0000:ffff:7f00:0001'
 109     """
 110     if len(ip) == 8:
 111         # IPv4, eg. "7f000001" -> "7f 00 00 01" -> "127.0.0.1"
 112         return ".".join(map(str, pack(">L", int(ip, 16))))
 113     elif len(ip) == 32:
 114         # IPv6 is actually simpler -- it's just a string-slicing operation.
 115         return ":".join([ip[(4*i) : (4*i+4)] for i in range(8)])
 116
 117
 118 def decode_client(words : list, i : int):
 119     r"""
 120     Helper function to decode the client field in a dnscache log
 121     entry.
 122
 123     There are two possible formats for the client field,
 124
 125       1. clientip:clientport, used by tcpopen/tcpclose entries,
 126       2. clientip:clientport:id, used by "query" entries.
 127
 128     Parameters
 129     ----------
 130
 131     words : list
 132         The ``words`` list (a list of fields) from
 133         :func:`handle_dnscache_log`.
 134
 135     i : int
 136         The index of the client field within ``words``
 137
 138     Returns
 139     -------
 140
 141     Nothing; the ``i``th entry in the ``words`` list is modified
 142     in-place.
 143
 144     Examples
 145     --------
 146
 147         >>> words = ["foo", "bar", "7f000001:9253", "quux"]
 148         >>> decode_client(words, 2)
 149         >>> words
 150         ['foo', 'bar', '127.0.0.1:37459', 'quux']
 151
 152         >>> words = ["foo", "7f000001:a3db:4fb9", "bar", "quux"]
 153         >>> decode_client(words, 1)
 154         >>> words
 155         ['foo', '127.0.0.1:41947 (id 20409)', 'bar', 'quux']
 156
 157     """
 158     chunks = words[i].split(":")
 159
 160     ip = convert_ip(chunks[0])
 161     port = int(chunks[1], 16)
 162     words[i] = f"{ip}:{port}"
 163
 164     if len(chunks) == 3:
 165         # For a "query" entry's clientip:clientport:id field.
 166         id = int(chunks[2], 16)
 167         words[i] += f" (id {id})"
 168
 169 def decode_ip(words, i):
 170     words[i] = convert_ip(words[i])
 171
 172 def decode_ttl(words, i):
 173     words[i] = f"TTL={words[i]}"
 174
 175 def decode_serial(words, i):
 176     serial = int(words[i])
 177     words[i] = f"#{serial}"
 178
 179 def decode_type(words, i):
 180     qt = words[i]
 181     words[i] = query_type.get(int(qt), qt)
 182
 183 def handle_dnscache_log(line) -> typing.Optional[str]:
 184     """
 185     Handle a single log line if it matches the ``dnscache_log_re`` regex.
 186
 187     Parameters
 188     ----------
 189
 190     line : string
 191         The log line that might match ``dnscache_log_re``.
 192
 193     Returns
 194     -------
 195
 196     Either the human-readable string if the log line was handled (that
 197     is, if it was really a dnscache log line), or ``None`` if it was
 198     not.
 199
 200     Examples
 201     --------
 202
 203         >>> line = "2022-09-15 18:37:33.863805500 query 1 7f000001:a3db:4fb9 1 www.example.com."
 204         >>> handle_dnscache_log(line)
 205         '2022-09-15 18:37:33.863805500 query #1 127.0.0.1:41947 (id 20409) a www.example.com.'
 206
 207         >>> line = "2022-09-15 18:37:33.863874500 tx 0 1 www.example.com. . c0a80101"
 208         >>> handle_dnscache_log(line)
 209         '2022-09-15 18:37:33.863874500 tx g=0 a www.example.com. . 192.168.1.1'
 210
 211         >>> line = "2022-09-15 18:37:33.878529500 rr c0a80101 20865 1 www.example.com. 5db8d822"
 212         >>> handle_dnscache_log(line)
 213         '2022-09-15 18:37:33.878529500 rr 192.168.1.1 TTL=20865 a www.example.com. 93.184.216.34'
 214
 215         >>> line = "2022-09-15 18:37:33.878532500 stats 1 43 1 0"
 216         >>> handle_dnscache_log(line)
 217         '2022-09-15 18:37:33.878532500 stats count=1 motion=43 udp-active=1 tcp-active=0'
 218
 219         >>> line = "2022-09-15 18:37:33.878602500 sent 1 49"
 220         >>> handle_dnscache_log(line)
 221         '2022-09-15 18:37:33.878602500 sent #1 49'
 222
 223         >>> line = "this line is nonsense"
 224         >>> handle_dnscache_log(line)
 225
 226     """
 227     match = dnscache_log_re.match(line)
 228     if not match:
 229         return None
 230
 231     (timestamp, event, data) = match.groups()
 232
 233     words = data.split()
 234     if event == "cached":
 235         if words[0] not in ("cname", "ns", "nxdomain"):
 236             decode_type(words, 0)
 237
 238     elif event == "drop":
 239         decode_serial(words, 0)
 240
 241     elif event == "lame":
 242         decode_ip(words, 0)
 243
 244     elif event == "nodata":
 245         decode_ip(words, 0)
 246         decode_ttl(words, 1)
 247         decode_type(words, 2)
 248
 249     elif event == "nxdomain":
 250         decode_ip(words, 0)
 251         decode_ttl(words, 1)
 252
 253     elif event == "query":
 254         decode_serial(words, 0)
 255         decode_client(words, 1)
 256         decode_type(words, 2)
 257
 258     elif event == "rr":
 259         decode_ip(words, 0)
 260         decode_ttl(words, 1)
 261         if words[2] not in ("cname", "mx", "ns", "ptr", "soa"):
 262             decode_type(words, 2)
 263             if words[2] == "a":         # decode answer to an A query
 264                 decode_ip(words, 4)
 265             if words[2] == "txt":       # text record
 266                 response = words[4]
 267                 if response.endswith("..."):
 268                     ellipsis = "..."
 269                     response = response[0:-3]
 270                 else:
 271                     ellipsis = ""
 272                 length = int(response[0:2], 16)
 273                 chars = []
 274                 for i in range(1, len(response)//2):
 275                     chars.append(chr(int(response[2*i : (2*i)+2], 16)))
 276                 txt = "".join(chars)
 277                 words[4] = f"{length}:\"{txt}{ellipsis}\""
 278
 279     elif event == "sent":
 280         decode_serial(words, 0)
 281
 282     elif event == "stats":
 283         words[0] = f"count={words[0]}"
 284         words[1] = f"motion={words[1]}"
 285         words[2] = f"udp-active={words[2]}"
 286         words[3] = f"tcp-active={words[3]}"
 287
 288     elif event == "tx":
 289         words[0] = f"g={words[0]}"
 290         decode_type(words, 1)
 291         # words[2] = name
 292         # words[3] = control (domain for which these servers are believed
 293         #            to be authoritative)
 294         for i in range(4, len(words)):
 295             decode_ip(words, i)
 296
 297     elif event in ("tcpopen", "tcpclose"):
 298         decode_client(words, 0)
 299
 300     # Reconstitute "data" (i.e. everything after the timestamp and the
 301     # event) from "words", which was originally obtained by splitting
 302     # "data".
 303     data = " ".join(words)
 304     return f"{timestamp} {event} {data}"
 305
 306
 307
 308 def handle_tinydns_log(line : str) -> typing.Optional[str]:
 309     """
 310     Handle a single log line if it matches the ``tinydns_log_re`` regex.
 311
 312     Parameters
 313     ----------
 314
 315     line : string
 316         The log line that might match ``tinydns_log_re``.
 317
 318     Returns
 319     -------
 320
 321     Either the human-readable string if the log line was handled (that
 322     is, if it was really a tinydns log line), or ``None`` if it was
 323     not.
 324
 325     Examples
 326     --------
 327
 328         >>> line = "2022-09-14 21:04:40.206516500 7f000001:9d61:be69 - 0001 www.example.com"
 329         >>> handle_tinydns_log(line)
 330         '2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com'
 331
 332         >>> line = "this line is nonsense"
 333         >>> handle_tinydns_log(line)
 334
 335     """
 336     match = tinydns_log_re.match(line)
 337     if not match:
 338         return None
 339
 340     (timestamp, ip, port, id, code, type, name) = match.groups()
 341     ip = convert_ip(ip)
 342     port = int(port, 16)
 343     id = int(id, 16)
 344
 345     # Convert the "type" field to a human-readable record type name
 346     # using the query_type dictionary. If the right name isn't present
 347     # in the dictionary, we use the (decimal) type id instead.
 348     type = int(type, 16)                # "001c" -> 28
 349     type = query_type.get(type, type)   # 28 -> "aaaa"
 350
 351     line_tpl = "{timestamp} "
 352
 353     reason = query_drop_reason[code]
 354     if code == "+":
 355         line_tpl += "sent response to {ip}:{port} (id {id}): {type} {name}"
 356     else:
 357         line_tpl += "dropped query ({reason}) from {ip}:{port}"
 358         if code != "/":
 359             # If the query can actually be parsed, the log line is a
 360             # bit more informative than it would have been otherwise.
 361             line_tpl += " (id {id}): {type} {name}"
 362
 363     return line_tpl.format(timestamp=timestamp,
 364                            reason=reason,
 365                            ip=ip,
 366                            port=port,
 367                            id=id,
 368                            type=type,
 369                            name=name)
 370
 371
 372 def parse_logfile(file : typing.TextIO):
 373     r"""
 374     Process a single log ``file``.
 375
 376     Parameters
 377     ----------
 378
 379     file : typing.TextIO
 380         An open log file, or stdin.
 381
 382     Examples
 383     --------
 384
 385         >>> line = "@4000000063227a320c4f3114 7f000001:9d61:be69 - 0001 www.example.com\n"
 386         >>> from tempfile import NamedTemporaryFile
 387         >>> with NamedTemporaryFile(mode="w", delete=False) as f:
 388         ...     _ = f.write(line)
 389         >>> f = open(f.name, 'r')
 390         >>> parse_logfile(f)
 391         2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com
 392         >>> f.close()
 393         >>> from os import remove
 394         >>> remove(f.name)
 395
 396     """
 397     # Open a pipe to tai64nlocal. We'll write lines of our input file
 398     # (the log file) to it, and read back the same lines but with
 399     # friendly timestamps in them.
 400     from subprocess import Popen, PIPE
 401     tai = Popen(["tai64nlocal"], stdin=PIPE, stdout=PIPE, text=True, bufsize=0)
 402
 403     for line in file:
 404         tai.stdin.write(line)
 405         line = tai.stdout.readline()
 406
 407         friendly_line = handle_tinydns_log(line)
 408         if not friendly_line:
 409             friendly_line = handle_dnscache_log(line)
 410             if not friendly_line:
 411                 friendly_line = line
 412
 413         print(friendly_line)
 414
 415 def main():
 416     r"""
 417     The entry point to the program.
 418
 419     This function is responsible only for parsing any command-line
 420     arguments, and then calling :func`parse_logfile` on them.
 421     """
 422     # Create an argument parser using the file's docsctring as its
 423     # description.
 424     from argparse import ArgumentParser, FileType
 425     parser = ArgumentParser(description = __doc__)
 426
 427     # Parse zero or more positional arguments into a list of
 428     # "logfiles". If none are given, read from stdin instead.
 429     from sys import stdin
 430     parser.add_argument("logfiles",
 431                         metavar="LOGFILE",
 432                         type=FileType("r"),
 433                         nargs="*",
 434                         default=[stdin],
 435                         help="djbdns logfile to process (default: stdin)")
 436
 437     # Warning: argparse automatically opens its file arguments here,
 438     # and they only get closed when the program terminates. There's no
 439     # real benefit to closing them one-at-a-time after calling
 440     # parse_logfile(), because the "scarce" resource of open file
 441     # descriptors gets consumed immediately, before any processing has
 442     # happened. In other words, if you're going to run out of file
 443     # descriptors, it's going to happen right now.
 444     #
 445     # So anyway, don't run this on several million logfiles.
 446     args = parser.parse_args()
 447     for f in args.logfiles:
 448         parse_logfile(f)
 449
 450
# Run the program only when this file is executed as a script, and
# not when it is imported as a module.
if __name__ == "__main__":
    main()