bin/djbdns-logparse.py

   1 #!/usr/bin/python3
   2 """
   3 Convert tinydns and dnscache logs to human-readable form
   4 """
   5
   6 import re, typing
   7 from struct import pack
   8 from time import strftime, gmtime
   9 from subprocess import Popen, PIPE
  10
  11
  12 ## Regular expressions for matching tinydns/dnscache log lines. We
  13 ## compile these once here rather than within the corresponding
  14 ## matching functions, because the latter get executed repeatedly.
  15
  16 # This first pattern is used to match the timestamp format that the
  17 # tai64nlocal program produces. It appears in both dnscache and
  18 # tinydns lines, after they've been piped through tai64nlocal, of
  19 # course.
  20 timestamp_pat = r'[\d-]+ [\d:\.]+'
  21
  22 # The regex to match dnscache log lines.
  23 dnscache_log_re = re.compile(fr'({timestamp_pat}) (\w+)(.*)')
  24
  25 # The "hex4" pattern matches a string of four hexadecimal digits. This
  26 # is used, for example, by tinydns to encode the query type
  27 # identifier.
  28 hex4_pat = r'[0-9a-f]{4}'
  29
  30 # The IP pattern matches a string of either 8 or 32 hexadecimal
  31 # characters, which correspond to IPv4 and IPv6 addresses,
  32 # respectively, in tinydns logs.
  33 ip_pat = r'[0-9a-f]{8,32}'
  34
  35 # The regex to match tinydns log lines.
  36 tinydns_log_re = re.compile(
  37     rf'({timestamp_pat}) ({ip_pat}):({hex4_pat}):({hex4_pat}) ([\+\-IC/]) ({hex4_pat}) (.*)'
  38 )
  39
  40 # A dictionary mapping query type identifiers, in decimal, to their
  41 # friendly names for tinydns. Reference:
  42 #
  43 #   https://en.wikipedia.org/wiki/List_of_DNS_record_types
  44 #
  45 # Note that mapping here is non-exhaustive, and that tinydns will
  46 # log responses for record types that it does not know about.
  47 query_type = {
  48       1: "a",
  49       2: "ns",
  50       5: "cname",
  51       6: "soa",
  52      12: "ptr",
  53      13: "hinfo",
  54      15: "mx",
  55      16: "txt",
  56      17: "rp",
  57      24: "sig",
  58      25: "key",
  59      28: "aaaa",
  60      33: "srv",
  61      35: "naptr",
  62      38: "a6",
  63      48: "dnskey",
  64      52: "tlsa",
  65      65: "https",
  66     252: "axfr",
  67     255: "any",
  68     257: "caa"
  69 }
  70
  71 # tinydns can drop a query for one of three reasons; this dictionary
  72 # maps the symbol that gets logged in each case to a human-readable
  73 # reason. We include the "+" case here, indicating that the query was
  74 # NOT dropped, to avoid a special case later on when we're formatting
  75 # the human-readable output.
  76 query_drop_reason = {
  77     "+": None,
  78     "-": "no authority",
  79     "I": "invalid query",
  80     "C": "invalid class",
  81     "/": "couldn't parse"
  82 }
  83
  84
  85 def convert_ip(ip : str):
  86     """
  87     Convert a hex string representing an IP address to conventional
  88     human-readable form, ie. dotted-quad decimal for IPv4, and
  89     8 colon-separated hex shorts for IPv6.
  90
  91     Examples
  92     --------
  93
  94         >>> convert_ip("7f000001")
  95         '127.0.0.1'
  96         >>> convert_ip("00000000000000000000ffff7f000001")
  97         '0000:0000:0000:0000:0000:ffff:7f00:0001'
  98
  99     """
 100     if len(ip) == 8:
 101         # IPv4, eg. "7f000001" -> "7f 00 00 01" -> "127.0.0.1"
 102         return "%d.%d.%d.%d" % tuple(pack(">L", int(ip, 16)))
 103     elif len(ip) == 32:
 104         # IPv6 is actually simpler -- it's just a string-slicing operation.
 105         return ":".join([ip[(4*i) : (4*i+4)] for i in range(8)])
 106
 107
 108 def decode_client(words, i):
 109     chunks = words[i].split(":")
 110     if len(chunks) == 2:                # ip:port
 111         words[i] = "%s:%d" % (convert_ip(chunks[0]), int(chunks[1], 16))
 112     elif len(chunks) == 3:
 113         words[i] = "%s:%d (id %d)" % (convert_ip(chunks[0]),
 114                                       int(chunks[1], 16),
 115                                       int(chunks[2], 16))
 116
 117 def decode_ip(words, i):
 118     words[i] = convert_ip(words[i])
 119
 120 def decode_ttl(words, i):
 121     words[i] = "TTL=%s" % words[i]
 122
 123 def decode_serial(words, i):
 124     serial = int(words[i])
 125     words[i] = "#%d" % serial
 126
 127 def decode_type(words, i):
 128     qt = words[i]
 129     words[i] = query_type.get(int(qt), qt)
 130
 131 def handle_dnscache_log(line, match):
 132     (timestamp, event, data) = match.groups()
 133
 134     words = data.split()
 135     if event == "cached":
 136         if words[0] not in ("cname", "ns", "nxdomain"):
 137             decode_type(words, 0)
 138
 139     elif event == "drop":
 140         decode_serial(words, 0)
 141
 142     elif event == "lame":
 143         decode_ip(words, 0)
 144
 145     elif event == "nodata":
 146         decode_ip(words, 0)
 147         decode_ttl(words, 1)
 148         decode_type(words, 2)
 149
 150     elif event == "nxdomain":
 151         decode_ip(words, 0)
 152         decode_ttl(words, 1)
 153
 154     elif event == "query":
 155         decode_serial(words, 0)
 156         decode_client(words, 1)
 157         decode_type(words, 2)
 158
 159     elif event == "rr":
 160         decode_ip(words, 0)
 161         decode_ttl(words, 1)
 162         if words[2] not in ("cname", "mx", "ns", "ptr", "soa"):
 163             decode_type(words, 2)
 164             if words[2] == "a":         # decode answer to an A query
 165                 decode_ip(words, 4)
 166             if words[2] == "txt":       # text record
 167                 response = words[4]
 168                 if response.endswith("..."):
 169                     ellipsis = "..."
 170                     response = response[0:-3]
 171                 else:
 172                     ellipsis = ""
 173                 length = int(response[0:2], 16)
 174                 chars = []
 175                 for i in range(1, len(response)/2):
 176                     chars.append(chr(int(response[2*i : (2*i)+2], 16)))
 177                 words[4] = "%d:\"%s%s\"" % (length, "".join(chars), ellipsis)
 178
 179     elif event == "sent":
 180         decode_serial(words, 0)
 181
 182     elif event == "stats":
 183         words[0] = "count=%s" % words[0]
 184         words[1] = "motion=%s" % words[1]
 185         words[2] = "udp-active=%s" % words[2]
 186         words[3] = "tcp-active=%s" % words[3]
 187
 188     elif event == "tx":
 189         words[0] = "g=%s" % words[0]
 190         decode_type(words, 1)
 191         # words[2] = name
 192         # words[3] = control (domain for which these servers are believed
 193         #            to be authoritative)
 194         for i in range(4, len(words)):
 195             decode_ip(words, i)
 196
 197     elif event in ("tcpopen", "tcpclose"):
 198         decode_client(words, 0)
 199
 200     print(timestamp, event, " ".join(words))
 201
 202
 203 def handle_tinydns_log(line : str, match: re.Match):
 204     """
 205     Handle a line that matched the ``tinydns_log_re`` regex.
 206
 207     Parameters
 208     ----------
 209
 210     line : string
 211         The tinydns log line that matched ``tinydns_log_re``.
 212
 213     match : re.Match
 214         The match object that was returned when ``line`` was
 215         tested against ``tinydns_log_re``.
 216
 217     Examples
 218     --------
 219
 220         >>> line = "2022-09-14 21:04:40.206516500 7f000001:9d61:be69 - 0001 www.example.com"
 221         >>> match = tinydns_log_re.match(line)
 222         >>> handle_tinydns_log(line, match)
 223         2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com
 224
 225     """
 226     (timestamp, ip, port, id, code, type, name) = match.groups()
 227     ip = convert_ip(ip)
 228     port = int(port, 16)
 229     id = int(id, 16)
 230
 231     # Convert the "type" field to a human-readable record type name
 232     # using the query_type dictionary. If the right name isn't present
 233     # in the dictionary, we use the (decimal) type id instead.
 234     type = int(type, 16)                # "001c" -> 28
 235     type = query_type.get(type, type)   # 28 -> "aaaa"
 236
 237     print(timestamp, end=' ')
 238
 239     reason = query_drop_reason[code]
 240     if code == "+":
 241         line_tpl = "sent response to {ip}:{port} (id {id}): {type} {name}"
 242     else:
 243         line_tpl = "dropped query ({reason}) from {ip}:{port}"
 244         if code != "/":
 245             # If the query can actually be parsed, the log line is a
 246             # bit more informative than it would have been otherwise.
 247             line_tpl += " (id {id}): {type} {name}"
 248
 249     print(line_tpl.format(reason=reason,
 250                           ip=ip,
 251                           port=port,
 252                           id=id,
 253                           type=type,
 254                           name=name))
 255
 256
 257 def parse_logfile(file : typing.TextIO):
 258     r"""
 259     Process a single log ``file``.
 260
 261     Parameters
 262     ----------
 263
 264     file : typing.TextIO
 265         An open log file, or stdin.
 266
 267     Examples
 268     --------
 269
 270         >>> line = "@4000000063227a320c4f3114 7f000001:9d61:be69 - 0001 www.example.com\n"
 271         >>> from tempfile import NamedTemporaryFile
 272         >>> with NamedTemporaryFile(mode="w", delete=False) as f:
 273         ...     _ = f.write(line)
 274         >>> f = open(f.name, 'r')
 275         >>> parse_logfile(f)
 276         2022-09-14 21:04:40.206516500 dropped query (no authority) from 127.0.0.1:40289 (id 48745): a www.example.com
 277         >>> f.close()
 278         >>> from os import remove
 279         >>> remove(f.name)
 280
 281     """
 282     # Open pipe to tai64nlocal: we will write lines of our input (the
 283     # raw log file) to it, and read log lines with readable timestamps
 284     # from it.
 285     tai = Popen(["tai64nlocal"], stdin=PIPE, stdout=PIPE, text=True, bufsize=0)
 286
 287     for line in file:
 288         tai.stdin.write(line)
 289         line = tai.stdout.readline()
 290
 291         match = tinydns_log_re.match(line)
 292         if match:
 293             handle_tinydns_log(line, match)
 294             continue
 295
 296         match = dnscache_log_re.match(line)
 297         if match:
 298             handle_dnscache_log(line, match)
 299             continue
 300
 301         print(line)
 302
 303 def main():
 304     # Create an argument parser using the file's docsctring as its
 305     # description.
 306     from argparse import ArgumentParser, FileType
 307     parser = ArgumentParser(description = __doc__)
 308
 309     # Parse zero or more positional arguments into a list of
 310     # "logfiles". If none are given, read from stdin instead.
 311     from sys import stdin
 312     parser.add_argument("logfiles",
 313                         metavar="LOGFILE",
 314                         type=FileType("r"),
 315                         nargs="*",
 316                         default=[stdin],
 317                         help="djbdns logfile to process (default: stdin)")
 318
 319     # Warning: argparse automatically opens its file arguments here,
 320     # and they only get closed when the program terminates. There's no
 321     # real benefit to closing them one-at-a-time after calling
 322     # parse_logfile(), because the "scarce" resource of open file
 323     # descriptors gets consumed immediately, before any processing has
 324     # happened. In other words, if you're going to run out of file
 325     # descriptors, it's going to happen right now.
 326     #
 327     # So anyway, don't run this on several million logfiles.
 328     args = parser.parse_args()
 329     for f in args.logfiles:
 330         parse_logfile(f)
 331
 332
 333 if __name__ == "__main__":
 334     main()