Deleting unwanted scripts and checking in relevant ones.
[pingpong.git] / parser / parse_packet_frequency.py
index 9520298a28aed1ae925655656380561220732108..d6ab9d4143cfda36b95659d07a1ee9b9b00a3656 100644 (file)
@@ -7,8 +7,11 @@ the traffic frequency of a certain device at a certain time.
 
 import sys
 import json
+import numpy as np
 from collections import defaultdict
 from dateutil import parser
+from datetime import datetime
+from decimal import *
 
 JSON_KEY_SOURCE = "_source"
 JSON_KEY_LAYERS = "layers"
@@ -18,33 +21,183 @@ JSON_KEY_ETH_DST = "eth.dst"
 JSON_KEY_ETH_SRC = "eth.src"
 JSON_KEY_FRAME = "frame"
 JSON_KEY_FRAME_TIME = "frame.time"
+JSON_KEY_FRAME_LEN = "frame.len"
 TABLE_HEADER_X = "Timestamp (hh:mm:ss)"
-TABLE_HEADER_Y = "Packet frequency (pps)"
+TABLE_HEADER_Y = "Packet frequency"
+INCOMING_APPENDIX = "_incoming"
+OUTGOING_APPENDIX = "_outgoing"
+FILE_APPENDIX = ".dat"
 
+# Window size (in samples) for the moving average filter
+WINDOW_SIZE = 5
+# Flags that select the smoothing method used in save_to_file()
+USE_MOVING_AVERAGE = False
+USE_BINNING = True
+# Binning range in seconds: TOTAL_RANGE = 2 x RANGE,
+#   i.e. RANGE seconds to the left and RANGE seconds to the right of each bin center
+#TOTAL_RANGE = 60
+#RANGE = 30
+TOTAL_RANGE = 20
+RANGE = 10
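+# Example: with USE_MOVING_AVERAGE = False and USE_BINNING = True as above,
+#   save_to_file() writes one summed value per TOTAL_RANGE-second bin
+#   instead of one value per second.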
 
-def save_to_file(tbl_header, dictionary, filename_out):
+def moving_average(array, window=3):
+    """ Calculate moving average
+        Args:
+            array: array of numbers
+            window: window of moving average (default = 3)
+        Adapted from: 
+            https://stackoverflow.com/questions/14313510/how-to-calculate-moving-average-using-numpy
+    """
+    # Check if window > len(array)
+    if window > len(array):
+        window = len(array)
+    # Calculate cumulative sum of each array element
+    retarr = np.cumsum(array, dtype=float)
+    # Adjust cumulative sum of each array element
+    #   based on window size
+    retarr[window:] = retarr[window:] - retarr[:-window]
+    # Pad the first array elements with zeroes
+    retarr[:window - 1] = np.zeros(window - 1)
+    # Calculate moving average starting from the element
+    #   at window size, e.g. element 4 for window=5
+    retarr[window - 1:] = retarr[window - 1:] / window
+    return retarr
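+    # Illustrative check (hand-computed):
+    #   moving_average([1, 2, 3, 4, 5], window=3) -> [0.0, 0.0, 2.0, 3.0, 4.0]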
+
+def hms_to_seconds(t):
+    """ Convert a hh:mm:ss time string into seconds
+        Args:
+            t: time as a hh:mm:ss string
+        Adapted from:
+            https://stackoverflow.com/questions/10742296/python-time-conversion-hms-to-seconds
+    """
+    h, m, s = [int(i) for i in t.split(':')]
+    return 3600*h + 60*m + s
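+    # e.g. hms_to_seconds("01:02:03") -> 3600 + 120 + 3 = 3723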
+    
+def seconds_to_hms(t):
+    """ Convert seconds into a hh:mm:ss time string
+        Args:
+            t: time in seconds
+        Adapted from:
+            https://stackoverflow.com/questions/10742296/python-time-conversion-hms-to-seconds
+    """
+    h = t / 3600
+    m = (t - (h * 3600)) / 60
+    s = t - (h * 3600) - (m * 60)
+    # Zero-pad each field to two digits
+    return "%02d:%02d:%02d" % (h, m, s)
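+    # e.g. seconds_to_hms(3723) -> "01:02:03"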
+    
+def include_timestamps_zero_packets(timelen):
+    """ Include every second that has zero packets (no packets/transmission)
+        Args:
+            timelen: dictionary that maps timestamps to total packet length
+    """
+    sortedkeylist = sorted(timelen)
+    first = sortedkeylist[0]
+    last = sortedkeylist[-1]
+    # Calculate the number of seconds between the first and last packets
+    first_seconds = hms_to_seconds(first)
+    last_seconds = hms_to_seconds(last)
+    seconds = last_seconds - first_seconds
+    # Walk from the first to the last second and fill in
+    #   every timestamp that has no packets with zero
+    counter = 0
+    while counter < seconds:
+        timestamp = seconds_to_hms(first_seconds + counter)
+        if timestamp not in timelen:
+            timelen[timestamp] = 0
+        counter += 1
+    return timelen
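+    # Illustrative example (assumed input):
+    #   include_timestamps_zero_packets({"10:00:00": 100, "10:00:03": 50})
+    #   -> {"10:00:00": 100, "10:00:01": 0, "10:00:02": 0, "10:00:03": 50}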
+    
+
+def save_to_file(tblheader, dictionary, filenameout):
     """ Show summary of statistics of PCAP file
         Args:
-            tbl_header: header for the saved table
+            tblheader: header for the saved table
             dictionary: dictionary to be saved
-            filename_out: file name to save
+            filenameout: file name to save
     """
     # Appending, not overwriting!
-    f = open(filename_out, 'a')
+    f = open(filenameout, 'a')
     # Write the table header
-    f.write("# " + TABLE_HEADER_X + " " + TABLE_HEADER_Y + "\n");
-    # Iterate over dictionary and write (key, value) pairs
-    for key in sorted(dictionary):
-        # Comma separated
-        #f.write(str(key) + ", " + str(dictionary[key]) + "\n")
-        # Space separated
-        f.write(str(key) + " " + str(dictionary[key]) + "\n")
+    f.write("# " + tblheader + "\n")
+    f.write("# " + TABLE_HEADER_X + " " + TABLE_HEADER_Y + "\n")
     # Write "0 0" if dictionary is empty
     if not dictionary:
-        f.write("0 0");
+        f.write("0 0")
+        f.close()
+        print "Writing zeroes to file: ", filenameout
+        return
+
+    if USE_MOVING_AVERAGE:
+        # Use moving average if this flag is true
+        sortedarr = []
+        for key in sorted(dictionary):
+            sortedarr.append(dictionary[key])
+        valarr = moving_average(sortedarr, WINDOW_SIZE)
+        # Iterate over dictionary and write (key, value) pairs
+        ind = 0
+        for key in sorted(dictionary):
+            # Space separated
+            f.write(str(key) + " " + str(valarr[ind]) + "\n")
+            ind += 1
 
+    elif USE_BINNING:
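+        # Illustrative effect of the binning below: with RANGE = 10, the
+        #   per-second byte totals are summed into one value per ~20-second
+        #   window, so a steady 100 bytes each second becomes roughly 2000
+        #   bytes per bin.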
+        # Build a sorted list of timestamps and log each (key, value) pair
+        sortedlist = []
+        ind = 0
+        for key in sorted(dictionary):
+            sortedlist.append(key)
+            print "Key: ", key, " - Value: ", dictionary[key], " - Ind: ", ind
+            ind += 1
+        first = hms_to_seconds(sortedlist[0])
+        last = hms_to_seconds(sortedlist[ind-1])
+        resultdict = dict()
+        # Walk the timestamps and accumulate values into bins
+        time_ind = first
+        ind = 0
+        while time_ind < last:
+            # Start from the current key in the sorted list
+            curr_key = sortedlist[ind]
+            curr_key_secs = hms_to_seconds(curr_key)
+            # Initialize with 0 first
+            resultdict[time_ind] = 0
+            # Check if this is still within RANGE - bin the value if it is
+            while time_ind - RANGE <= curr_key_secs and curr_key_secs <= time_ind + RANGE:
+                resultdict[time_ind] += dictionary[curr_key]
+                print "Time index: ", seconds_to_hms(time_ind), " Value: ", resultdict[time_ind]
+                ind += 1
+                if ind > len(dictionary)-1:
+                    break
+                # Advance to the next key in the list
+                curr_key = sortedlist[ind]
+                curr_key_secs = hms_to_seconds(curr_key)
+            # Increment time index
+            time_ind += TOTAL_RANGE
+        # Now write to file after binning
+        for key in sorted(resultdict):
+            # Space separated
+            f.write(seconds_to_hms(key) + " " + str(resultdict[key]) + "\n")
+            #print seconds_to_hms(key) + " " + str(resultdict[key])
+
+    else:
+        # Iterate over dictionary and write (key, value) pairs
+        for key in sorted(dictionary):
+            # Space separated
+            f.write(str(key) + " " + str(dictionary[key]) + "\n")
     f.close()
-    print "Writing output to file: ", filename_out
+    print "Writing output to file: ", filenameout
 
 
 def main():
@@ -54,26 +207,42 @@ def main():
         print "Usage: python", sys.argv[0], "<input_file> <output_file> <device_name> <mac_address>"
         return
     # Parse the file for the specified MAC address
-    time_freq = parse_json(sys.argv[1], sys.argv[4])
-    # Write statistics into file
-    save_to_file(sys.argv[3], time_freq, sys.argv[2])
     print "====================================================================="
-    #for time in time_freq.keys():
-    for key in sorted(time_freq):
-        print key, " => ", time_freq[key]
+    print "Analyzing file: ", sys.argv[1]
+    timelen_incoming = parse_json(sys.argv[1], sys.argv[4], True)
+    if len(timelen_incoming) > 0:
+        timelen_incoming = include_timestamps_zero_packets(timelen_incoming)
+        print "==> Printing incoming traffic ..."
+        save_to_file(sys.argv[3] + INCOMING_APPENDIX, timelen_incoming, sys.argv[2] + INCOMING_APPENDIX + FILE_APPENDIX)
+    else:
+        print "No incoming traffic to this MAC address!"
     print "====================================================================="
+    timelen_outgoing = parse_json(sys.argv[1], sys.argv[4], False)
+    if len(timelen_outgoing) > 0:
+        timelen_outgoing = include_timestamps_zero_packets(timelen_outgoing)
+        print "==> Printing outgoing traffic ..."
+        save_to_file(sys.argv[3] + OUTGOING_APPENDIX, timelen_outgoing, sys.argv[2] + OUTGOING_APPENDIX + FILE_APPENDIX)
+    else:
+        print "No outgoing traffic from this MAC address!"
+    print "====================================================================="
 
 
-# Convert JSON file containing DNS traffic to a map in which a hostname points to its set of associated IPs.
+# Parse a JSON-exported PCAP file and map each timestamp to the total packet length seen for a given MAC address.
-def parse_json(file_path, mac_address):
+def parse_json(filepath, macaddress, incomingoutgoing):
     """ Show summary of statistics of PCAP file
         Args:
-            file_path: path of the read file
-            mac_address: MAC address of a device to analyze
+            filepath: path of the read file
+            macaddress: MAC address of a device to analyze
+            incomingoutgoing: boolean to define whether we collect incoming or outgoing traffic
+                              True = incoming, False = outgoing
     """
-    # Maps timestamps to frequencies of packets
-    time_freq = dict()
-    with open(file_path) as jf:
+    # Maps timestamps to lengths of packets
+    timelen = dict()
+    with open(filepath) as jf:
         # Read JSON.
         # data becomes reference to root JSON object (or in our case json array)
         data = json.load(jf)
@@ -84,7 +253,9 @@ def parse_json(file_path, mac_address):
             layers = p[JSON_KEY_SOURCE][JSON_KEY_LAYERS]
             # Get timestamp
             frame = layers.get(JSON_KEY_FRAME, None)
-            date_time = frame.get(JSON_KEY_FRAME_TIME, None)
+            datetimestr = frame.get(JSON_KEY_FRAME_TIME, None)
+            # Get frame length
+            length = frame.get(JSON_KEY_FRAME_LEN, None)
             # Get into the Ethernet address part
             eth = layers.get(JSON_KEY_ETH, None)
             # Skip any non DNS traffic
@@ -95,19 +266,30 @@ def parse_json(file_path, mac_address):
             src = eth.get(JSON_KEY_ETH_SRC, None)
             dst = eth.get(JSON_KEY_ETH_DST, None)
             # Get just the time part
-            date_time_obj = parser.parse(date_time)
+            datetimeobj = parser.parse(datetimestr)
             # Remove the microsecond part
-            time_str = str(date_time_obj.time())[:8]
-            print str(time_str) + " - src:" + str(src) + " - dest:" + str(dst)
+            timestr = str(datetimeobj.time())[:8]
+            #print str(timestr) + " - src:" + str(src) + " - dest:" + str(dst) + " - length: ", length
             # Get and count the traffic for the specified MAC address
-            if src == mac_address or dst == mac_address:
-                # Check if timestamp already exists in the map
-                # If yes, then just increment the frequency value...
-                if time_str in time_freq:
-                    time_freq[time_str] = time_freq[time_str] + 1
-                else: # If not, then put the value one there
-                    time_freq[time_str] = 1
-    return time_freq
+            # Accumulate the frame length for the requested traffic direction
+            if incomingoutgoing:
+                matched = (dst == macaddress)
+            else:
+                matched = (src == macaddress)
+            if matched:
+                # Check if the timestamp already exists in the map
+                # If yes, then add this frame's length to the running total...
+                if timestr in timelen:
+                    timelen[timestr] = timelen[timestr] + int(length)
+                else: # If not, then start the total with this frame's length
+                    timelen[timestr] = int(length)
+
+    return timelen
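+    # Illustrative shape of the returned map (assumed capture contents):
+    #   {"10:00:00": 1514, "10:00:05": 66}  -- total bytes in each second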
+
 
 if __name__ == '__main__':
     main()