parser/parse_packet_frequency.py

   1 #!/usr/bin/python
   2
   3 """
   4 Script that takes a file (output by wireshark/tshark, in JSON format) and analyzes
   5 the traffic frequency of a certain device at a certain time.
   6 """
   7
   8 import sys
   9 import json
  10 import numpy as np
  11 from collections import defaultdict
  12 from dateutil import parser
  13 from datetime import datetime
  14 from decimal import *
  15
# Keys used to reach the protocol layers of a packet in the JSON input:
#   packet -> "_source" -> "layers"
JSON_KEY_SOURCE = "_source"
JSON_KEY_LAYERS = "layers"

# Keys of the Ethernet and frame layers, and of the fields read from them
JSON_KEY_ETH = "eth"
JSON_KEY_ETH_DST = "eth.dst"
JSON_KEY_ETH_SRC = "eth.src"
JSON_KEY_FRAME = "frame"
JSON_KEY_FRAME_TIME = "frame.time"
# Column headers written as a comment line at the top of each saved table
TABLE_HEADER_X = "Timestamp (hh:mm:ss)"
TABLE_HEADER_Y = "Packet frequency (pps)"
# Suffixes appended to the device name and output file name per direction
INCOMING_APPENDIX = "_incoming"
OUTGOING_APPENDIX = "_outgoing"
# Extension appended to the output file name
FILE_APPENDIX = ".dat"

# Window size passed to moving_average() when USE_MOVING_AVERAGE is set
WINDOW_SIZE = 5
# Flags selecting how save_to_file() post-processes the counts.
#   USE_MOVING_AVERAGE is checked first; USE_BINNING only takes effect
#   when USE_MOVING_AVERAGE is False. With both False, raw counts are written.
USE_MOVING_AVERAGE = False
USE_BINNING = True
# Bin geometry in seconds: a bin collects timestamps up to RANGE seconds to
#   the left and RANGE seconds to the right of its time index, and
#   consecutive time indices are TOTAL_RANGE seconds apart.
TOTAL_RANGE = 60 # Keep in sync: TOTAL_RANGE = 2 x RANGE
RANGE = 30
  37
  38
  39 def moving_average(array, window=3):
  40     """ Calculate moving average
  41         Args:
  42             array: array of numbers
  43             window: window of moving average (default = 3)
  44         Adapted from:
  45             https://stackoverflow.com/questions/14313510/how-to-calculate-moving-average-using-numpy
  46     """
  47     # Check if window > len(array)
  48     if window > len(array):
  49         window = len(array)
  50     # Calculate cumulative sum of each array element
  51     retarr = np.cumsum(array, dtype=float)
  52     # Adjust cumulative sum of each array element
  53     #   based on window size
  54     retarr[window:] = retarr[window:] - retarr[:-window]
  55     # Pad the first array elements with zeroes
  56     retarr[:window - 1] = np.zeros(window - 1)
  57     # Calculate moving average starting from the element
  58     #   at window size, e.g. element 4 for window=5
  59     retarr[window - 1:] = retarr[window - 1:] / window
  60     return retarr
  61
  62 def hms_to_seconds(t):
  63     """ Calculate hms to seconds
  64         Args:
  65             t = time in hh:mm:ss string
  66         Adapted from:
  67             https://stackoverflow.com/questions/10742296/python-time-conversion-hms-to-seconds
  68     """
  69     h, m, s = [int(i) for i in t.split(':')]
  70     return 3600*h + 60*m + s
  71
  72 def seconds_to_hms(t):
  73     """ Calculate seconds to hms
  74         Args:
  75             t = time in seconds
  76         Adapted from:
  77             https://stackoverflow.com/questions/10742296/python-time-conversion-hms-to-seconds
  78     """
  79     h = t / 3600
  80     m = (t - (h * 3600)) / 60
  81     s = t - (h * 3600) - (m * 60)
  82     hh = str(h)
  83     if len(hh) is 1:
  84         hh = "0" + hh
  85     mm = str(m)
  86     if len(mm) is 1:
  87         mm = "0" + mm
  88     ss = str(s)
  89     if len(ss) is 1:
  90         ss = "0" + ss
  91     return hh + ":" + mm + ":" + ss
  92
  93 def save_to_file(tblheader, dictionary, filenameout):
  94     """ Show summary of statistics of PCAP file
  95         Args:
  96             tblheader: header for the saved table
  97             dictionary: dictionary to be saved
  98             filename_out: file name to save
  99     """
 100     # Appending, not overwriting!
 101     f = open(filenameout, 'a')
 102     # Write the table header
 103     f.write("# " + tblheader + "\n")
 104     f.write("# " + TABLE_HEADER_X + " " + TABLE_HEADER_Y + "\n")
 105     # Write "0 0" if dictionary is empty
 106     if not dictionary:
 107         f.write("0 0")
 108         f.close()
 109         print "Writing zeroes to file: ", filenameout
 110         return
 111
 112     if USE_MOVING_AVERAGE:
 113         # Use moving average if this flag is true
 114         sortedarr = []
 115         for key in sorted(dictionary):
 116             sortedarr.append(dictionary[key])
 117         valarr = moving_average(sortedarr, WINDOW_SIZE)
 118         #print vallist
 119         # Iterate over dictionary and write (key, value) pairs
 120         ind = 0
 121         for key in sorted(dictionary):
 122             # Space separated
 123             f.write(str(key) + " " + str(valarr[ind]) + "\n")
 124             ind += 1
 125     elif USE_BINNING:
 126         sortedlist = []
 127         # Iterate over dictionary and write (key, value) pairs
 128         ind = 0
 129         first = 0
 130         last = 0
 131         for key in sorted(dictionary):
 132             sortedlist.append(key)
 133             print "Key: ", key, " - Value: ", dictionary[key], " - Ind: ", ind
 134             ind += 1
 135         first = hms_to_seconds(sortedlist[0])
 136         #print "First: ", key
 137         last = hms_to_seconds(sortedlist[ind-1])
 138         #print "Last: ", key
 139         resultdict = dict()
 140         # Put new binning keys
 141         time_ind = first
 142         ind = 0
 143         while time_ind < last:
 144             # Initialize with the first key in the list
 145             curr_key = sortedlist[ind]
 146             curr_key_secs = hms_to_seconds(curr_key)
 147             # Initialize with 0 first
 148             resultdict[time_ind] = 0
 149             # Check if this is still within RANGE - bin the value if it is
 150             while time_ind - RANGE <= curr_key_secs and curr_key_secs <= time_ind + RANGE:
 151                 resultdict[time_ind] += dictionary[curr_key]
 152                 print "Time index: ", seconds_to_hms(time_ind), " Value: ", resultdict[time_ind]
 153                 ind += 1
 154                 if ind > len(dictionary)-1:
 155                     break
 156                 # Initialize with the key in the list
 157                 curr_key = sortedlist[ind]
 158                 curr_key_secs = hms_to_seconds(curr_key)
 159             # Increment time index
 160             time_ind += TOTAL_RANGE
 161         # Now write to file after binning
 162         for key in sorted(resultdict):
 163             # Space separated
 164             f.write(seconds_to_hms(key) + " " + str(resultdict[key]) + "\n")
 165             #print seconds_to_hms(key) + " " + str(resultdict[key])
 166     else:
 167         # Iterate over dictionary and write (key, value) pairs
 168         for key in sorted(dictionary):
 169             # Space separated
 170             f.write(str(key) + " " + str(dictionary[key]) + "\n")
 171     f.close()
 172     print "Writing output to file: ", filenameout
 173
 174
 175 def main():
 176     """ Main function
 177     """
 178     if len(sys.argv) < 5:
 179         print "Usage: python", sys.argv[0], "<input_file> <output_file> <device_name> <mac_address>"
 180         return
 181     # Parse the file for the specified MAC address
 182     timefreq_incoming = parse_json(sys.argv[1], sys.argv[4], True)
 183     #timefreq_outgoing = parse_json(sys.argv[1], sys.argv[4], False)
 184     # Write statistics into file
 185     print "====================================================================="
 186     print "==> Analyzing incoming traffic ..."
 187     save_to_file(sys.argv[3] + INCOMING_APPENDIX, timefreq_incoming, sys.argv[2] + INCOMING_APPENDIX + FILE_APPENDIX)
 188     print "====================================================================="
 189     #print "==> Analyzing outgoing traffic ..."
 190     #save_to_file(sys.argv[3] + OUTGOING_APPENDIX, timefreq_outgoing, sys.argv[2] + OUTGOING_APPENDIX + FILE_APPENDIX)
 191     #print "====================================================================="
 192     #for time in time_freq.keys():
 193     #for key in sorted(time_freq):
 194     #    print key, " => ", time_freq[key]
 195     #print "====================================================================="
 196
 197
 198 # Convert JSON file containing DNS traffic to a map in which a hostname points to its set of associated IPs.
 199 def parse_json(filepath, macaddress, incomingoutgoing):
 200     """ Show summary of statistics of PCAP file
 201         Args:
 202             filepath: path of the read file
 203             macaddress: MAC address of a device to analyze
 204             incomingoutgoing: boolean to define whether we collect incoming or outgoing traffic
 205                               True = incoming, False = outgoing
 206     """
 207     # Maps timestamps to frequencies of packets
 208     timefreq = dict()
 209     with open(filepath) as jf:
 210         # Read JSON.
 211         # data becomes reference to root JSON object (or in our case json array)
 212         data = json.load(jf)
 213         # Loop through json objects in data
 214         # Each entry is a pcap entry (request/response (packet) and associated metadata)
 215         for p in data:
 216             # p is a JSON object, not an index
 217             layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS]
 218             # Get timestamp
 219             frame = layers.get(JSON_KEY_FRAME, None)
 220             datetime = frame.get(JSON_KEY_FRAME_TIME, None)
 221             # Get into the Ethernet address part
 222             eth = layers.get(JSON_KEY_ETH, None)
 223             # Skip any non DNS traffic
 224             if eth is None:
 225                 print "[ WARNING: Packet has no ethernet address! ]"
 226                 continue
 227             # Get source and destination MAC addresses
 228             src = eth.get(JSON_KEY_ETH_SRC, None)
 229             dst = eth.get(JSON_KEY_ETH_DST, None)
 230             # Get just the time part
 231             datetimeobj = parser.parse(datetime)
 232             # Remove the microsecond part
 233             timestr = str(datetimeobj.time())[:8]
 234             print str(timestr) + " - src:" + str(src) + " - dest:" + str(dst)
 235             # Get and count the traffic for the specified MAC address
 236             if incomingoutgoing:
 237                 if dst == macaddress:
 238                     # Check if timestamp already exists in the map
 239                     # If yes, then just increment the frequency value...
 240                     if timestr in timefreq:
 241                         timefreq[timestr] = timefreq[timestr] + 1
 242                     else: # If not, then put the value one there
 243                         timefreq[timestr] = 1
 244             else:
 245                 if src == macaddress:
 246                     # Check if timestamp already exists in the map
 247                     # If yes, then just increment the frequency value...
 248                     if timestr in timefreq:
 249                         timefreq[timestr] = timefreq[timestr] + 1
 250                     else: # If not, then put the value one there
 251                         timefreq[timestr] = 1
 252
 253     return timefreq
 254
 255
# Run the analysis only when this file is executed as a script,
#   not when it is imported as a module
if __name__ == '__main__':
    main()
 258